Import pbbam_1.0.6+dfsg.orig.tar.xz

author Andreas Tille <tille@debian.org>

Fri, 20 Dec 2019 10:27:03 +0000 (10:27 +0000)

committer Andreas Tille <tille@debian.org>

Fri, 20 Dec 2019 10:27:03 +0000 (10:27 +0000)
author Andreas Tille <tille@debian.org>
Fri, 20 Dec 2019 10:27:03 +0000 (10:27 +0000)
committer Andreas Tille <tille@debian.org>
Fri, 20 Dec 2019 10:27:03 +0000 (10:27 +0000)
diff --git a/.clang-format b/.clang-format

new file mode 100644 (file)

index 0000000..1519f35
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,13 @@
+BasedOnStyle:  Google
+BreakBeforeBraces: Mozilla
+
+AllowShortLoopsOnASingleLine: false
+AccessModifierOffset: -4
+BreakConstructorInitializersBeforeComma: true
+ColumnLimit: 100
+IndentWidth: 4
+PointerAlignment: Left
+TabWidth: 4
+
+ReflowComments: false  # protect ASCII art in comments
+KeepEmptyLinesAtTheStartOfBlocks: true
diff --git a/.gitignore b/.gitignore

new file mode 100644 (file)

index 0000000..c26bb9a
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,9 @@
+.DS_Store
+/build*
+
+# cram
+*.pyc
+
+# Meson WrapDB stuff
+/subprojects/*
+!/subprojects/*.wrap
diff --git a/.travis.yml b/.travis.yml

new file mode 100644 (file)

index 0000000..33b861e
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,61 @@
+language: cpp
+compiler:
+  - gcc
+
+before_install:
+
+  # Travis's default installs of gcc, boost, & cmake currently lag behind the minimums we need.
+  # So we need to manually set them up. 
+  #
+  #  - gcc 4.8 (current default on Travis is 4.7, which is no good for C++11 work)
+  #  - boost 1.55
+  #  - cmake 3.x
+  
+  # add external repos
+  - sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test           # gcc
+  - sudo add-apt-repository -y ppa:boost-latest/ppa                  # boost
+  - sudo add-apt-repository -y ppa:george-edison55/precise-backports # cmake
+  
+  # remove existing cmake install
+  - sudo apt-get remove -qq cmake cmake-data
+  - sudo apt-get autoremove -qq
+  
+  # update apt 
+  - sudo apt-get update -y -qq
+
+  # install
+  - sudo apt-get install -y -qq g++-4.8 boost1.55 cmake-data cmake 
+  
+  # make sure we're using new gcc tools
+  - sudo update-alternatives --install /usr/bin/g++  g++  /usr/bin/g++-4.8  90
+  - sudo update-alternatives --install /usr/bin/gcc  gcc  /usr/bin/gcc-4.8  90 
+  - sudo update-alternatives --install /usr/bin/gcov gcov /usr/bin/gcov-4.8 90
+
+  # prep zlib
+  - sudo apt-get install -y -qq zlib1g-dev
+
+  # prep GoogleTest 
+  - sudo apt-get install -y -qq libgtest-dev
+
+before_script:
+  # run cmake
+  - mkdir build 
+  - cd build
+  - cmake .. -DGTEST_SRC_DIR=/usr/src/gtest -DCMAKE_BUILD_TYPE=Debug
+    
+script:
+  # build & test
+  - make -j 3
+  - make test
+
+branches:
+  only:
+    - master
+    
+notifications:
+  recipients:
+    - dbarnett@pacb.com
+  email:
+    on_success: change
+    on_failure: always 
+   
diff --git a/CHANGELOG.md b/CHANGELOG.md

new file mode 100644 (file)

index 0000000..86c13c0
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,621 @@
+# PacBio::BAM - change log\r
+\r
+All notable changes to this project will be documented in this file.\r
+This project adheres to [Semantic Versioning](http://semver.org/).\r
+\r
+## Active\r
+\r
+### Added\r
+ - CCSRecord API to work with the minimally required data for CCS\r
+ - TextFileReader & TextFileWriter for generic line files (plain text or gzipped)\r
+ - BedReader & BedWriter for BED format support\r
+\r
+## [1.0.6] - 2019-06-14\r
+\r
+### Added\r
+ - IFastaWriter & IFastqWriter abstract base classes\r
+\r
+## [1.0.5] - 2019-06-11\r
+\r
+### Changed\r
+ - BAM tag lookup improvements under the hood.\r
+\r
+## [1.0.4] - 2019-06-07\r
+\r
+### Added\r
+ - General-purpose BgzipWriter\r
+ - BgzipFastaWriter and BgzipFastqWriter\r
+ - Read-only view to read indices passing a PbiFilter\r
+ - IPD field to SimpleRead\r
+\r
+## [1.0.3] - 2019-05-20\r
+\r
+### Added\r
+ - IndexedFastqReader for random access to FASTQ subregions\r
+\r
+### Fixed\r
+ - MappedSimpleRead clipping: on disjoint aligned/requested regions and on requests larger\r
+   than available sequence.\r
+\r
+## [1.0.2] - 2019-05-10\r
+\r
+### Added\r
+ - Range-for iteration on FastaReader & FastqReader\r
+\r
+## [1.0.1] - 2019-05-09\r
+\r
+### Added\r
+ - SimpleRead & MappedSimpleRead for htslib-free processing.\r
+\r
+### Fixed\r
+ - Incorrect type displayed in SAM output (pure-text) for floating-point values.\r
+\r
+## [1.0.0] - 2019-04-22\r
+\r
+### Changed\r
+ - C++14 is now a *hard* minimum.\r
+\r
+### Removed\r
+ - Headers emulating C++14 features for C++11.\r
+\r
+### Fixed\r
+ - Inconsistent whitelist/blacklist filters in DataSet XML.\r
+\r
+## [0.25.0] - 2019-04-11\r
+\r
+### Changed\r
+ - Requires C++14 at minimum.\r
+\r
+### Fixed\r
+ - Reading BioSample(s) elements from DataSet XML.\r
+\r
+## [0.24.0] - 2019-04-05\r
+\r
+### Added\r
+ - Built-in support for dataset elements: BioSample(s) & DNABarcode(s).\r
+ - BaiIndexCache for reusing data from *.bai file(s).\r
+ - Support in GenomicIntervalQuery for new BaiIndexCache.\r
+\r
+## [0.23.1] - 2019-03-21\r
+\r
+### Added\r
+ - Streamable BamReader (via stdin).\r
+ - Enabled range-for on BamReader, compatible with the other *Query inputs.\r
+\r
+## [0.23.0] - 2019-03-11\r
+\r
+### Added\r
+ - PbiIndexCache and FastaCache for reusing file data\r
+ - BaiIndexedBamReader and GenomicIntervalQuery can be constructed without\r
+   initial interval.\r
+\r
+## [0.22.0] - 2019-02-11\r
+\r
+### Fixed\r
+ - Handles zero-length reads for stitching ZMW reads.\r
+ - Clipping to query on reverse-strand aligned reads.\r
+ - Removed UB in dataset API.\r
+\r
+### Added\r
+ - "exciseFlankingInserts" option for clipping reads w.r.t reference.\r
+\r
+## [0.21.0] - 2018-12-21\r
+\r
+### Added\r
+ - New local context flags: ADAPTER_BEFORE_BAD and ADAPTER_AFTER_BAD.\r
+\r
+### Changed\r
+ - Current PacBioBAM spec now 3.0.7.\r
+\r
+### Removed\r
+ - CMake has been removed completely.\r
+\r
+## [0.20.0] - 2018-10-03\r
+\r
+### Added\r
+ - Support for (optionally) barcode-labeled read group IDs.\r
+\r
+## [0.19.0] - 2018-09-11\r
+\r
+### Added\r
+ - TranscriptAlignmentSet to XML support\r
+\r
+## [0.17.0] - 2018-03-18\r
+\r
+### Added\r
+- CompressionLevel/NumThreads parameter implementation to PbiBuilder.\r
+- Dataset ctor to PbiFileQuery.\r
+- TranscriptSet to XML support.\r
+- Auto-enabled "permissive CIGAR mode" for pbbamify tool.\r
+- IndexedBamWriter, for more efficient writing of BAM & PBI simultaneously.\r
+\r
+## [0.16.0] - 2018-01-17\r
+\r
+### Removed\r
+- Removed the PbiIndex class and its "lookup data"-related helpers. These were\r
+never as useful as initially intended. PbiRawData and its related classes are the\r
+recommended interface for working with PBI index data.\r
+\r
+## [0.15.0] - 2018-01-12\r
+\r
+### Added\r
+- Support for long CIGARs (>64K operations).\r
+\r
+## [0.14.0] - 2017-12-12\r
+\r
+### Added\r
+- Support for newer style QNAMEs. Recent versions of htslib (1.4+) have started\r
+adding extra null terminators to make the subsequent CIGAR section 32-bit aligned.\r
+\r
+### Changed\r
+- Requirements for htslib version used. Must now be htslib v1.4+.\r
+\r
+## [0.13.2] - 2017-09-25\r
+\r
+### Added\r
+- Backward compatibility for C++11 (std::make_unique which is 11/14 agnostic).\r
+\r
+## [0.13.1] - 2017-09-25\r
+\r
+### Added\r
+- Support for "pe" tag in stitched, virtual reads.\r
+\r
+## [0.13.0] - 2017-09-25\r
+\r
+### Changed\r
+- Ran clang-tidy (modernize) over codebase to clean up legacy coding styles.\r
+\r
+## [0.12.2] - 2017-09-22\r
+\r
+### Added\r
+- HasPulseExclusion() to BamRecord (& derived types).\r
+\r
+## [0.12.1] - 2017-09-21\r
+\r
+### Added\r
+- Pulse exclusion base feature to read group.\r
+\r
+## [0.12.0] - 2017-09-19\r
+\r
+### Added\r
+- NumReads() for PBI filter-based queries. This allows fetching of the number\r
+of reads that pass the filter, without needing to iterate over the entire\r
+file(s).\r
+\r
+## [0.11.0] - 2017-09-15\r
+\r
+### Added\r
+- Support for internal tag: pulse exclusion reason ("pe"). New methods on\r
+BamRecord, and new enum PulseExclusionReason.\r
+\r
+### Changed\r
+- Default PacBioBAM format version now 3.0.5\r
+\r
+## [0.10.2] - 2017-09-14\r
+\r
+### Changed\r
+- Explicitly trim all whitespace from FASTA input.\r
+\r
+## [0.10.1] - 2017-09-11\r
+\r
+### Changed\r
+- Frames, add mutex to avoid race condition in InitIpdDownsampling(void)\r
+\r
+## [0.10.0] - 2017-09-08\r
+\r
+### Changed\r
+- PbiBuilder backend for generating PBI index files "on-the-fly" along with\r
+writing BAM files. The previous implementation's memory usage scaled linearly\r
+with the number of reads, sometimes reaching huge numbers (several gigs or more).\r
+The new implementation's memory usage remains constant for any number of reads,\r
+without any runtime hit on files/architectures tested.\r
+\r
+### Removed\r
+- PbiBuilder::Result(). Returned an intermediate snapshot of the index under\r
+construction. This method isn't usable with the new PbiBuilder backend and was\r
+really only useful for initial debugging/testing. It is no longer used in the\r
+test framework and is unlikely to be used by client code either. Dropping this\r
+method from the API, and thus bumping the version number.\r
+\r
+## [0.9.0] - 2017-08-07\r
+\r
+### Removed\r
+- Bundled htslib. Now using 'stock' htslib (v1.3.1+).\r
+- Built-in SWIG wrappers.\r
+\r
+## [0.8.0] - 2017-07-24\r
+\r
+### Added\r
+- Default DataSet 'Version' attribute if none already present (currently 4.0.0)\r
+- Added whitelist support for filtering ZMWs via DataSetXML.\r
+- Added iterable query over FASTA files & ReferenceSet datasets.\r
+- Added DataSet::AllFiles to access primary resources AND their child files (indices,\r
+scraps, etc).\r
+\r
+### Fixed\r
+- Bug in the build system preventing clean rebuilds.\r
+\r
+### Removed\r
+- Dropped the bundled, PacBio-forked version of htslib. Now using stock htslib (v1.3.1+).\r
+\r
+## [0.7.4] - 2016-11-18\r
+\r
+### Changed\r
+- Compatibility for merging BAM files no longer requires exact match of PacBioBAM\r
+version number (header @HD:pb tag). As long as both files meet the minimum\r
+supported version number, the merge is allowed.\r
+\r
+## [0.7.3] - 2016-11-11\r
+\r
+### Added\r
+- Support for S/P2-C2 chemistry and forthcoming 4.0 basecaller\r
+\r
+## [0.7.2] - 2016-11-10\r
+\r
+### Removed\r
+- SAM header version equality check for merging BAM files. PacBioBAM version\r
+number carries more meaning for PacBio data and thus will be the basis of\r
+ensuring compatible merging.\r
+\r
+## [0.7.1] - 2016-11-09\r
+\r
+### Added\r
+- (Unindexed) FASTA reader & FastaSequence data structure.\r
+- Missing unit tests for internal BAM tag access.\r
+- Chemistry data for basecaller v3.3.\r
+- Missing parsers for filtering barcode quality ("bq"), barcode forward ("bcf"),\r
+and barcode reverse ("bcr") from DataSetXML.\r
+- Integrated htslib into project.\r
+\r
+### Fixed\r
+- Reverse complement on padding base.\r
+\r
+## [0.7.0] - 2016-09-26\r
+\r
+### Added\r
+- Clipping for CCS records\r
+\r
+### Fixed\r
+- Cached position data leaking across records while iterating.\r
+- Rolled back default pulse behavior in internal BAM API, to be backward-\r
+compatible with existing client code (for now at least). v0.6.0 introduced\r
+returning basecalled positions ONLY by default, rather than return ALL\r
+pulses.\r
+- Fixed crash when attempting to read from empty BAM/PBI files using the\r
+PbiFilter-enabled APIs.\r
+\r
+## [0.6.0] - 2016-09-13\r
+\r
+### Added\r
+- BamWriter writes to a BAM file with the target name plus a ".tmp" suffix. On\r
+successful completion (i.e. normal BamWriter destruction, not triggered by a\r
+thrown exception) the file is renamed to the actual requested filename.\r
+- PBI file creation follows the same temporary naming convention.\r
+- Support for barcode pair (forward, reverse) in DataSetXML filter.\r
+- Validation API & 'auto-validate' compile-time switch.\r
+- Added support for a batched QNAME whitelist filter in DataSet XML. Uses (new)\r
+Property name 'qname_file', with the value being the filepath containing the\r
+whitelist.\r
+- Exposed MD5 hashing to API.\r
+- Ability to remove base features from a ReadGroupInfo object.\r
+- Can construct an aggregate PbiRawData index object from a DataSet: essentially\r
+concatenates all PBI data within the dataset.\r
+- New SamWriter class to create SAM-formatted output of PacBio BAM data.\r
+- Extended APIs for accessing "internal BAM" data, including PulseBehavior\r
+switch for selecting between all pulses & basecalls only.\r
+\r
+### Fixed\r
+- Improper 'clip to reference' product for BamRecord in some cases.\r
+- Improper behavior in tag accessors (e.g. BamRecord::IPD()) on reverse strand-\r
+aligned reads (bug 31339).\r
+- Improper basecaller version parsing in ReadGroupInfo.\r
+\r
+### Changed\r
+- RecordType::POLYMERASE renamed to RecordType::ZMW to reflect changes in\r
+PacBio BAM spec v3.0.4\r
+- Refactored the 'virtual' reader classes - to match the new nomenclature,\r
+and to combine the virtual reader & composite readers behind a shared\r
+interface. The old class names still exist, as typedefs to the new ones,\r
+and the interfaces are completely source-compatible - so as not to break\r
+existing code. However, the old classes should be considered deprecated and\r
+the new ones preferred. Below is the mapping of old -> new:\r
+\r
+   VirtualPolymeraseBamRecord        ->  VirtualZmwBamRecord\r
+   VirtualPolymeraseReader           ->  ZmwReadStitcher\r
+   VirtualPolymeraseCompositeReader  ->  ZmwReadStitcher\r
+   ZmwWhitelistVirtualReader         ->  WhitelistedZmwReadStitcher\r
+\r
+\r
+## [0.5.0] - 2016-02-22\r
+\r
+### Added\r
+- Platform model tag added to read group as RG::PM\r
+- New scrap zmw type sz\r
+- pbmerge accepts DataSetXML as input - using top-level resource BAMs as input,\r
+applying filters, and generating a merged BAM. Also added FOFN support, instead\r
+of listing out BAMs as command line args.\r
+- PbiLocalContextFilter to allow filtering on subread local context.\r
+- PbiBuilder: multithreading & zlib compression-level tuning for PBI output\r
+\r
+### Fixed\r
+- Fixed mishandling of relative BAM filenames in the filename constructor for\r
+DataSet (e.g. DataSet ds("../data.bam")).\r
+\r
+## [0.4.5] - 2016-01-14\r
+\r
+### Changed\r
+- PbiFilterQuery (and any other PBI-backed query, e.g. ZmwQuery ) now throws if\r
+PBI file(s) missing instead of returning empty result.\r
+- GenomicIntervalQuery now throws if BAI file(s) missing instead of returning\r
+empty result.\r
+- BamFile will throw if file is truncated (e.g. missing the EOF block). Disable\r
+by defining PBBAM_NO_CHECK_EOF .\r
+\r
+## [0.4.4] - 2016-01-07\r
+\r
+### Added\r
+- bam2sam command line utility. The primary benefit is removing the dependency\r
+on samtools during tests, but also provides users a functioning BAM -> SAM\r
+converter in the absence of samtools.\r
+- pbmerge command line utility. Allows merging N BAM files into one, optionally\r
+creating the PBI file alongside.\r
+- Added BamRecord::Pkmean2 & Pkmid2, 2D equivalent of Pkmean/Pkmid, for internal\r
+BAMs.\r
+\r
+### Removed\r
+- samtools dependency\r
+\r
+## [0.4.3] - 2015-12-22\r
+\r
+### Added\r
+- Compile using ccache by default, if available. Can be manually disabled using\r
+-DPacBioBAM_use_ccache=OFF with cmake.\r
+- pbindexdump: command-line utility that converts PBI file data into human-\r
+readable formats. (JSON by default).\r
+\r
+### Changed\r
+- CMake option PacBioBAM_build_pbindex is being deprecated. Use\r
+PacBioBAM_build_tools instead.\r
+\r
+## [0.4.2] - 2015-12-22\r
+\r
+### Changed\r
+- BamFile::PacBioIndexExists & StandardIndexExists no longer check timestamps.\r
+Copying/moving files around can yield timestamps that are not helpful (no longer\r
+guaranteed that the .pbi will be "newer" than the .bam, even though no content\r
+changed). Added methods (e.g. bool BamFile::PacBioIndexIsNewer()) to do that\r
+lookup if needed, but it is no longer done automatically.\r
+\r
+## [0.4.1] - 2015-12-18\r
+\r
+### Added\r
+- BamRecord::HasNumPasses\r
+\r
+### Changed\r
+- VirtualPolymeraseBamRecord::VirtualRegionsTable(type) returns an empty vector\r
+of regions if none are associated with the requested type, instead of throwing.\r
+\r
+## [0.4.0] - 2015-12-15\r
+\r
+### Changed\r
+- Redesigned PbiFilter interface and backend. Previous implementation did not\r
+scale well as intermediate results were far too unwieldy. This redesign provides\r
+speedups of orders of magnitude in many cases.\r
+\r
+## [0.3.2] - 2015-12-10\r
+\r
+### Added\r
+- Support for ReadGroupInfo sequencing chemistry data.\r
+InvalidSequencingChemistryException thrown if an unsupported combination is\r
+encountered.\r
+- VirtualPolymeraseCompositeReader - for re-stitching records, across multiple\r
+resources (e.g. from DataSetXML). Reader respects DataSet filter criteria.\r
+\r
+## [0.3.1] - 2015-10-30\r
+\r
+### Added\r
+- ZmwWhitelistVirtualReader: similar to VirtualPolymeraseReader but restricts\r
+iteration to a whitelist of ZMW hole numbers, leveraging PBI index data for\r
+random-access.\r
+\r
+### Fixed\r
+- Fixed error in PBI construction, in which entire file sections (e.g.\r
+BarcodeData or MappedData) were being dropped when any one record lacked data.\r
+Correct behavior is to allow file section omission if all records lack that\r
+data type.\r
+\r
+## [0.3.0] - 2015-10-29\r
+\r
+### Fixed\r
+- Improper reporting of current offset from multi-threaded BamWriter. This had\r
+the effect of creating broken PBIs that were written alongside the BAM. Added a\r
+flush step, which incurs a performance hit, but restores correctness.\r
+\r
+## [0.2.4] - 2015-10-26\r
+\r
+### Fixed\r
+- Empty PbiFilter now returns all records, instead of filtering away all records.\r
+\r
+## [0.2.3] - 2015-10-26\r
+\r
+### Added/Fixed\r
+- Syncing DataSetXML across APIs. Primary changes include output of Version\r
+attribute ("3.0.1") on appropriate elements, as well as resolution of namespace\r
+issues.\r
+\r
+## [0.2.2] - 2015-10-22\r
+\r
+### Added\r
+- Added BAI bin calculation to BamWriter::Write, to ensure maximal compatibility\r
+with downstream tools (e.g. 'samtools index'). A new BinCalculationMode enum\r
+flag in BamWriter constructor controls whether this behavior is enabled [default]\r
+or not.\r
+\r
+## [0.2.1] - 2015-10-19\r
+\r
+### Added\r
+- Exposed the following classes to public API:\r
+  - BamReader\r
+  - BaiIndexedBamReader\r
+  - PbiIndexedBamReader\r
+  - GenomicIntervalCompositeBamReader\r
+  - PbiFilterCompositeBamReader\r
+\r
+## [0.2.0] - 2015-10-09\r
+\r
+### Changed\r
+- BAM spec v3.0.1 compliance. Previous (beta) versions of the BAM spec are not\r
+supported and will cause an exception to be thrown if encountered.\r
+- PBI lookup interface & backend, see PbiIndex.h & PbiLookupData.h for details.\r
+\r
+### Added\r
+- BamFile::PacBioIndexExists() & BamFile::StandardIndexExists() - query the\r
+existence of index files without auto-building them if they are missing, as in\r
+BamFile::Ensure*IndexExists().\r
+- GenomicInterval now accepts an htslib/samtools-style REGION string in the\r
+constructor: GenomicInterval("chr1:1000-2000"). Please note though, that pbbam\r
+uses 0-based coordinates throughout, whereas samtools expects 1-based. The above\r
+string is equivalent to "chr1:1001-2000" in samtools.\r
+- Built-in PBI filters. See PbiFilter.h & PbiFilterTypes.h for built-in filters\r
+and constructing composite filters. These can be used in conjunction with the\r
+new PbiFilterQuery, which takes a generic PbiFilter and applies that to a\r
+DataSet for iteration.\r
+- New built-in queries: BarcodeQuery, ReadAccuracyQuery, SubreadLengthQuery.\r
+These leverage the new filter API to construct a PbiFilter and apply to a\r
+DataSet.\r
+- Built-in BamRecord comparators that are STL-compatible. See Compare.h for full\r
+list. This allows for statements like the following, which sorts records by ZMW\r
+number:\r
+``` c++\r
+    vector<BamRecord> data;\r
+    std::sort(data.begin(), data.end(), Compare::Zmw());\r
+```\r
+- "exciseSoftClips" option to BamRecord::CigarData()\r
+\r
+## [0.1.0] - 2015-07-17\r
+\r
+### Changed\r
+- BAM spec v3.0b7 compliance\r
+ - Removal of 'M' as allowed CIGAR operation. Attempt to use such a CIGAR op\r
+ will throw an exception.\r
+ - Addition of IPD/PulseWidth codec version info in header\r
+\r
+### Added\r
+- Auto-generation of UTC timestamp for DataSet objects\r
+- PbiBuilder - allows generation of PBI index data alongside generation or\r
+modification of BAM record data. This obviates the need to wait for a completed\r
+BAM, then go through the zlib decompression, etc.\r
+- Added DataSet::FromXml(string xml) to create DataSets from "raw" XML string,\r
+rather than building up using DataSet API or loading from existing file.\r
+- "pbindex" command line tool to generate ".pbi" files from BAM data. The\r
+executable is built by default, but can be disabled using the cmake option\r
+"-DPacBioBAM_build_pbindex=OFF".\r
+\r
+### Fixed\r
+- PBI construction failing on CCS reads\r
+\r
+## [0.0.8] - 2015-07-02\r
+\r
+### Changed\r
+- Build system refactoring.\r
+\r
+## [0.0.7] - 2015-07-02\r
+\r
+### Added\r
+- PBI index lookup API. Not so much intended for client use directly, but will\r
+enable construction of higher-level semantic queries: grouping by, filtering,\r
+etc.\r
+- DataSet & PBI-aware queries (e.g. ZmwGroupQuery). More PBI-enabled queries to\r
+follow.\r
+- More flexibility in tag access. Samtools has a habit of performing a\r
+"shrink-to-fit" when it handles integer-valued tag data. Thus we cannot\r
+**guarantee** the binary type that our API will have to process. Safe\r
+conversions are allowed on integer-like data only. Under- or overflows in\r
+casting will trigger an exception. All other tag data types must be asked for\r
+explicitly, or else an exception will be raised, as before.\r
+- BamHeader::DeepCopy - allows creation of editable header data, without\r
+overwriting all shared instances\r
+\r
+### Fixed\r
+- XSD compliance for DataSet APIs.\r
+\r
+### Changed\r
+- The functionality provided by ZmwQuery (group by hole number), is now\r
+available using the ZmwGroupQuery object. The new ZmwQuery returns a single-\r
+record iterator (a la EntireFileQuery), but limited to a whitelist of requested\r
+hole numbers.\r
+\r
+### Removed\r
+- XSD non-compliant classes (e.g. ExternalDataReference)\r
+\r
+## [0.0.6] - 2015-06-07\r
+\r
+### Added\r
+\r
+- Accessor methods for pulse bam support:\r
+ - LabelQV()\r
+ - AltLabelQV()\r
+ - LabelTag()\r
+ - AltLabelTag()\r
+ - Pkmean()\r
+ - Pkmid()\r
+ - PrePulseFrames() only RC, no clipping\r
+ - PulseCallWidth() only RC, no clipping\r
+ - PulseCall() case-sensitive RC, no clipping\r
+ - IPDRaw() to avoid up and downscaling for stitching\r
+- BamRecord::ParseTagName and BamRecord::ParseTagString to convert a two\r
+  character tag string to a TagName enum and back. Allows a switch over tags.\r
+- VirtualPolymeraseReader to create VirtualPolymeraseBamRecord from a\r
+  subreads|hqregion+scraps.bam\r
+- VirtualRegion represents annotations of the polymerase reads, for adapters,\r
+  barcodes, lqregions, and hqregions.\r
+- ReadGroupInfo operator==\r
+\r
+### Fixed\r
+\r
+- Reimplemented QueryStart(int), QueryEnd(int), UpdateName(void),\r
+  ReadGroup(ReadGroupInfo&), ReadGroupId(std::string&);\r
+\r
+## [0.0.5] - 2015-05-29\r
+\r
+### Added\r
+\r
+- DataSet support. This includes XML I/O, basic dataset query/manipulation, and\r
+multi-BAM-file queries. New classes are located in <pbbam/dataset/>. DataSet-\r
+capable queries currently reside in the PacBio::BAM::staging namespace. These\r
+will be ported over to the main namespace once the support is stabilized and\r
+works seamlessly with either a single BamFile or DataSet object as input. (bug\r
+25941)\r
+- PBI support. This includes read/write raw data & building from a BamFile. The\r
+lookup API for random-access queries is under development, but the raw data is\r
+available - for creating PBI files & generating summary statistics. (bug 26025)\r
+- C# SWIG bindings, alongside existing Python and R wrappers.\r
+- LocalContextFlags support in BamRecord (bug 26623)\r
+\r
+### Fixed\r
+\r
+- BamRecord[Impl] map quality now initialized with 255 (missing) value, instead\r
+of 0. (bug 26228)\r
+- ReadGroupId calculation. (bug 25940)\r
+\r
+## [0.0.4] - 2015-04-22\r
+\r
+### Added\r
+\r
+- This changelog. Hope it helps.\r
+- Hook to set verbosity of underlying htslib warnings.\r
+- Grouped queries. (bug 26361)\r
+\r
+### Changed\r
+\r
+- Now using exceptions instead of return codes, output parameters, etc.\r
+- Removed "messy" shared_ptrs across interface (see especially BamHeader). These\r
+are now taken care of within the API, not exposed to client code.\r
+\r
+### Removed\r
+\r
+- BamReader\r
+\r
+### Fixed\r
+\r
+- ASCII tag output. (bug 26381)\r
diff --git a/INSTALL.md b/INSTALL.md

new file mode 100644 (file)

index 0000000..86dddda
--- /dev/null
+++ b/INSTALL.md
@@ -0,0 +1,3 @@
+# PacBio::BAM - building & integrating\r
+\r
+Detailed build instructions can be found [here](http://pbbam.readthedocs.org/en/latest/getting_started.html).\r
diff --git a/LICENSE.txt b/LICENSE.txt

new file mode 100644 (file)

index 0000000..fc6affb
--- /dev/null
+++ b/LICENSE.txt
@@ -0,0 +1,34 @@
+Copyright (c) 2014-2018, Pacific Biosciences of California, Inc.
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted (subject to the limitations in the
+disclaimer below) provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following
+   disclaimer in the documentation and/or other materials provided
+   with the distribution.
+
+ * Neither the name of Pacific Biosciences nor the names of its
+   contributors may be used to endorse or promote products derived
+   from this software without specific prior written permission.
+
+NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGE.
diff --git a/README.md b/README.md

new file mode 100644 (file)

index 0000000..c9db996
--- /dev/null
+++ b/README.md
@@ -0,0 +1,62 @@
+# pbbam
+
+[![Build Status](https://travis-ci.org/PacificBiosciences/pbbam.svg?branch=master)](https://travis-ci.org/PacificBiosciences/pbbam) [![Documentation Status](https://readthedocs.org/projects/pbbam/badge/?version=latest)](http://pbbam.readthedocs.org/en/latest/?badge=latest)
+
+As of the 3.0 release of SMRTanalysis, PacBio is embracing the industry standard BAM
+format for (both aligned and unaligned) basecall data files. We have also formulated
+a BAM companion file format (bam.pbi) enabling fast access to a richer set of per-read
+information as well as compatibility for software built around the legacy cmp.h5 format.
+
+The **pbbam** software package provides components to create, query, & edit PacBio BAM
+files and associated indices. These components include a core C++ library, bindings for
+additional languages, and command-line utilities.
+
+### Note:
+
+This library is **not** intended to be used as a general-purpose BAM utility - all input & output BAMs must adhere to the [PacBio BAM format specification](https://github.com/PacificBiosciences/PacBioFileFormats/blob/3.0/BAM.rst). Non-PacBio BAMs will cause exceptions to be thrown.
+
+##  Documentation
+
+  - [Documentation Home](http://pbbam.readthedocs.org/en/latest/index.html)
+    - [Getting Started](http://pbbam.readthedocs.org/en/latest/getting_started.html)
+    - [C++ API Reference](http://pbbam.readthedocs.org/en/latest/api_reference.html)
+
+  - [Changelog](https://github.com/PacificBiosciences/pbbam/blob/master/CHANGELOG.md)
+
+## FAQ
+
+### [Help! I am getting "unsupported sequencing chemistry combination"!](#chemistry-bundle)
+
+**pbbam** validates all BAM files, and as part of this validation, it checks whether the
+`BindingKit` and `SequencingKit` variables in every ReadGroup of the provided BAM file are
+known. As part of ongoing chemistry developments, we might need to introduce new part numbers
+to identify novel reagents and/or SMRT Cells. You are unlikely to encounter such issues
+when using SMRT Link, as it has an integrated auto-updater that will periodically check and
+install new chemistries automatically. All PacBio tools being used without a proper SMRT Link
+installation will require manual intervention to download new chemistries:
+
+  ```sh
+  cd <some persistent dir>
+  export SMRT_CHEMISTRY_BUNDLE_DIR="${PWD}"
+
+  wget https://raw.githubusercontent.com/PacificBiosciences/pbcore/develop/pbcore/chemistry/resources/mapping.xml -O chemistry.xml
+  ```
+
+This will cause **pbbam** to try to load the out-of-band `chemistry.xml` from
+`SMRT_CHEMISTRY_BUNDLE_DIR` and should allow you to use somewhat older software
+with somewhat newer BAMs. **Note:** this only allows **pbbam**'s internal validation
+to pass; it will not automatically make other chemistry-dependent software work
+with newer chemistries. For instance, Arrow's backend ([Unanimity](https://github.com/PacificBiosciences/unanimity))
+is parametrized on chemistry too, and it will fail should a completely new chemistry
+be introduced. See Unanimity's FAQ on how to employ `SMRT_CHEMISTRY_BUNDLE_DIR`
+to load models for new chemistries.
+
+
+## License
+
+ - [PacBio open source license](https://github.com/PacificBiosciences/pbbam/blob/master/LICENSE.txt)
+
+DISCLAIMER
+----------
+THIS WEBSITE AND CONTENT AND ALL SITE-RELATED SERVICES, INCLUDING ANY DATA, ARE PROVIDED "AS IS," WITH ALL FAULTS, WITH NO REPRESENTATIONS OR WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, ANY WARRANTIES OF MERCHANTABILITY, SATISFACTORY QUALITY, NON-INFRINGEMENT OR FITNESS FOR A PARTICULAR PURPOSE. YOU ASSUME TOTAL RESPONSIBILITY AND RISK FOR YOUR USE OF THIS SITE, ALL SITE-RELATED SERVICES, AND ANY THIRD PARTY WEBSITES OR APPLICATIONS. NO ORAL OR WRITTEN INFORMATION OR ADVICE SHALL CREATE A WARRANTY OF ANY KIND. ANY REFERENCES TO SPECIFIC PRODUCTS OR SERVICES ON THE WEBSITES DO NOT CONSTITUTE OR IMPLY A RECOMMENDATION OR ENDORSEMENT BY PACIFIC BIOSCIENCES.
+
diff --git a/bamboo_build.sh b/bamboo_build.sh

new file mode 100755 (executable)

index 0000000..7e01cf6
--- /dev/null
+++ b/bamboo_build.sh
@@ -0,0 +1,54 @@
+#!/usr/bin/env bash
+set -vex  # -v: echo script lines as read, -e: exit on the first failing command, -x: trace expanded commands
+
+################
+# DEPENDENCIES #
+################
+
+## Load modules
+type module >& /dev/null || . /mnt/software/Modules/current/init/bash  # source the Modules init script only when 'module' is not already defined
+
+module purge  # start from a clean module environment before loading the toolchain
+
+module load meson
+module load ninja
+
+module load zlib
+module load htslib
+module load samtools
+
+module load boost
+
+module load cram
+
+
+export BUILD_NUMBER="0"  # default; replaced below for develop/master builds
+case "${bamboo_planRepository_branchName}" in  # branch-specific settings apply to develop and master only
+  develop|master)
+    _install_image_default="${INSTALL_IMAGE:-false}"  # INSTALL_IMAGE is honored only on these branches
+    _create_artifact_default="${CREATE_ARTIFACT:-false}"  # CREATE_ARTIFACT is honored only on these branches
+
+    export PREFIX_ARG="/mnt/software/p/pbbam/${bamboo_planRepository_branchName}"  # NOTE(review): presumably the install prefix used by the sourced CI scripts - confirm
+    export BUILD_NUMBER="${bamboo_globalBuildNumber:-0}"  # use the bamboo global build number when set, else 0
+    ;;
+esac
+
+export _install_image="${_install_image_default:-false}"  # falls back to false when the default above is unset or empty
+export _create_artifact="${_create_artifact_default:-false}"  # falls back to false when the default above is unset or empty
+
+
+BOOST_ROOT="${BOOST_ROOT%/include}"  # strip a trailing "/include" from BOOST_ROOT, if present
+# unset these variables to have meson discover all
+# boost-dependent variables from BOOST_ROOT alone
+unset BOOST_INCLUDEDIR
+unset BOOST_LIBRARYDIR
+
+# in order to make shared libraries consumable
+# by conda and other package managers
+export LDFLAGS=${LDFLAGS:-"-fuse-ld=gold -static-libstdc++ -static-libgcc"}  # a caller-provided LDFLAGS wins; otherwise use these defaults
+
+source scripts/ci/setup.sh  # each stage is sourced, so it runs in this shell and sees the variables set above
+source scripts/ci/build.sh
+source scripts/ci/test.sh
+source scripts/ci/install.sh
+source scripts/ci/artifact.sh
diff --git a/docs/Doxyfile.in b/docs/Doxyfile.in

new file mode 100644 (file)

index 0000000..90f6f63
--- /dev/null
+++ b/docs/Doxyfile.in
@@ -0,0 +1,1602 @@
+# Doxyfile 1.6.3
+
+# This file describes the settings to be used by the documentation system
+# doxygen (www.doxygen.org) for a project
+#
+# All text after a hash (#) is considered a comment and will be ignored
+# The format is:
+#       TAG = value [value, ...]
+# For lists items can also be appended using:
+#       TAG += value [value, ...]
+# Values that contain spaces should be placed between quotes (" ")
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+
+# This tag specifies the encoding used for all characters in the config file 
+# that follow. The default is UTF-8 which is also the encoding used for all 
+# text before the first occurrence of this tag. Doxygen uses libiconv (or the 
+# iconv built into libc) for the transcoding. See 
+# http://www.gnu.org/software/libiconv for the list of possible encodings.
+
+DOXYFILE_ENCODING      = UTF-8
+
+# The PROJECT_NAME tag is a single word (or a sequence of words surrounded 
+# by quotes) that should identify the project.
+
+PROJECT_NAME           = @PacBioBAM_NAME@
+
+# The PROJECT_NUMBER tag can be used to enter a project or revision number. 
+# This could be handy for archiving the generated documentation or 
+# if some version control system is used.
+
+PROJECT_NUMBER         = @PacBioBAM_VERSION@
+
+# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) 
+# base path where the generated documentation will be put. 
+# If a relative path is entered, it will be relative to the location 
+# where doxygen was started. If left blank the current directory will be used.
+
+OUTPUT_DIRECTORY       = @PacBioBAM_DocsDir@
+
+# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create 
+# 4096 sub-directories (in 2 levels) under the output directory of each output 
+# format and will distribute the generated files over these directories. 
+# Enabling this option can be useful when feeding doxygen a huge amount of 
+# source files, where putting all generated files in the same directory would 
+# otherwise cause performance problems for the file system.
+
+CREATE_SUBDIRS         = NO
+
+# The OUTPUT_LANGUAGE tag is used to specify the language in which all 
+# documentation generated by doxygen is written. Doxygen will use this 
+# information to generate all constant output in the proper language. 
+# The default language is English, other supported languages are: 
+# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional, 
+# Croatian, Czech, Danish, Dutch, Esperanto, Farsi, Finnish, French, German, 
+# Greek, Hungarian, Italian, Japanese, Japanese-en (Japanese with English 
+# messages), Korean, Korean-en, Lithuanian, Norwegian, Macedonian, Persian, 
+# Polish, Portuguese, Romanian, Russian, Serbian, Serbian-Cyrilic, Slovak, 
+# Slovene, Spanish, Swedish, Ukrainian, and Vietnamese.
+
+OUTPUT_LANGUAGE        = English
+
+# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will 
+# include brief member descriptions after the members that are listed in 
+# the file and class documentation (similar to JavaDoc). 
+# Set to NO to disable this.
+
+BRIEF_MEMBER_DESC      = YES
+
+# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend 
+# the brief description of a member or function before the detailed description. 
+# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the 
+# brief descriptions will be completely suppressed.
+
+REPEAT_BRIEF           = YES
+
+# This tag implements a quasi-intelligent brief description abbreviator 
+# that is used to form the text in various listings. Each string 
+# in this list, if found as the leading text of the brief description, will be 
+# stripped from the text and the result after processing the whole list, is 
+# used as the annotated text. Otherwise, the brief description is used as-is. 
+# If left blank, the following values are used ("$name" is automatically 
+# replaced with the name of the entity): "The $name class" "The $name widget" 
+# "The $name file" "is" "provides" "specifies" "contains" 
+# "represents" "a" "an" "the"
+
+ABBREVIATE_BRIEF       = "The $name class" \
+                         "The $name widget" \
+                         "The $name file" \
+                         is \
+                         provides \
+                         specifies \
+                         contains \
+                         represents \
+                         a \
+                         an \
+                         the
+
+# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then 
+# Doxygen will generate a detailed section even if there is only a brief 
+# description.
+
+ALWAYS_DETAILED_SEC    = NO
+
+# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all 
+# inherited members of a class in the documentation of that class as if those 
+# members were ordinary class members. Constructors, destructors and assignment 
+# operators of the base classes will not be shown.
+
+INLINE_INHERITED_MEMB  = NO
+
+# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full 
+# path before file names in the file list and in the header files. If set 
+# to NO the shortest path that makes the file name unique will be used.
+
+FULL_PATH_NAMES        = YES
+
+# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag 
+# can be used to strip a user-defined part of the path. Stripping is 
+# only done if one of the specified strings matches the left-hand part of 
+# the path. The tag can be used to show relative paths in the file list. 
+# If left blank the directory from which doxygen is run is used as the 
+# path to strip.
+
+STRIP_FROM_PATH        = 
+
+# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of 
+# the path mentioned in the documentation of a class, which tells 
+# the reader which header file to include in order to use a class. 
+# If left blank only the name of the header file containing the class 
+# definition is used. Otherwise one should specify the include paths that 
+# are normally passed to the compiler using the -I flag.
+
+STRIP_FROM_INC_PATH    = @PacBioBAM_IncludeDir@
+
+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter 
+# (but less readable) file names. This can be useful if your file system 
+# doesn't support long names like on DOS, Mac, or CD-ROM.
+
+SHORT_NAMES            = NO
+
+# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen 
+# will interpret the first line (until the first dot) of a JavaDoc-style 
+# comment as the brief description. If set to NO, the JavaDoc 
+# comments will behave just like regular Qt-style comments 
+# (thus requiring an explicit @brief command for a brief description.)
+
+JAVADOC_AUTOBRIEF      = NO
+
+# If the QT_AUTOBRIEF tag is set to YES then Doxygen will 
+# interpret the first line (until the first dot) of a Qt-style 
+# comment as the brief description. If set to NO, the comments 
+# will behave just like regular Qt-style comments (thus requiring 
+# an explicit \brief command for a brief description.)
+
+QT_AUTOBRIEF           = NO
+
+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen 
+# treat a multi-line C++ special comment block (i.e. a block of //! or /// 
+# comments) as a brief description. This used to be the default behaviour. 
+# The new default is to treat a multi-line C++ comment block as a detailed 
+# description. Set this tag to YES if you prefer the old behaviour instead.
+
+MULTILINE_CPP_IS_BRIEF = NO
+
+# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented 
+# member inherits the documentation from any documented member that it 
+# re-implements.
+
+INHERIT_DOCS           = YES
+
+# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce 
+# a new page for each member. If set to NO, the documentation of a member will 
+# be part of the file/class/namespace that contains it.
+
+SEPARATE_MEMBER_PAGES  = NO
+
+# The TAB_SIZE tag can be used to set the number of spaces in a tab. 
+# Doxygen uses this value to replace tabs by spaces in code fragments.
+
+TAB_SIZE               = 1
+
+# This tag can be used to specify a number of aliases that acts 
+# as commands in the documentation. An alias has the form "name=value". 
+# For example adding "sideeffect=\par Side Effects:\n" will allow you to 
+# put the command \sideeffect (or @sideeffect) in the documentation, which 
+# will result in a user-defined paragraph with heading "Side Effects:". 
+# You can put \n's in the value part of an alias to insert newlines.
+
+ALIASES                =
+
+#samSpecURL=http://samtools.sourceforge.net/SAM1.pdf
+
+# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C 
+# sources only. Doxygen will then generate output that is more tailored for C. 
+# For instance, some of the names that are used will be different. The list 
+# of all members will be omitted, etc.
+
+OPTIMIZE_OUTPUT_FOR_C  = NO
+
+# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java 
+# sources only. Doxygen will then generate output that is more tailored for 
+# Java. For instance, namespaces will be presented as packages, qualified 
+# scopes will look different, etc.
+
+OPTIMIZE_OUTPUT_JAVA   = NO
+
+# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran 
+# sources only. Doxygen will then generate output that is more tailored for 
+# Fortran.
+
+OPTIMIZE_FOR_FORTRAN   = NO
+
+# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL 
+# sources. Doxygen will then generate output that is tailored for 
+# VHDL.
+
+OPTIMIZE_OUTPUT_VHDL   = NO
+
+# Doxygen selects the parser to use depending on the extension of the files it parses. 
+# With this tag you can assign which parser to use for a given extension. 
+# Doxygen has a built-in mapping, but you can override or extend it using this tag. 
+# The format is ext=language, where ext is a file extension, and language is one of 
+# the parsers supported by doxygen: IDL, Java, Javascript, C#, C, C++, D, PHP, 
+# Objective-C, Python, Fortran, VHDL, C, C++. For instance to make doxygen treat 
+# .inc files as Fortran files (default is PHP), and .f files as C (default is Fortran), 
+# use: inc=Fortran f=C. Note that for custom extensions you also need to set
+# FILE_PATTERNS otherwise the files are not read by doxygen.
+
+EXTENSION_MAPPING      = 
+
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want 
+# to include (a tag file for) the STL sources as input, then you should 
+# set this tag to YES in order to let doxygen match functions declarations and 
+# definitions whose arguments contain STL classes (e.g. func(std::string); v.s. 
+# func(std::string) {}). This also makes the inheritance and collaboration 
+# diagrams that involve STL classes more complete and accurate.
+
+BUILTIN_STL_SUPPORT    = YES
+
+# If you use Microsoft's C++/CLI language, you should set this option to YES to 
+# enable parsing support.
+
+CPP_CLI_SUPPORT        = NO
+
+# Set the SIP_SUPPORT tag to YES if your project consists of sip sources only. 
+# Doxygen will parse them like normal C++ but will assume all classes use public 
+# instead of private inheritance when no explicit protection keyword is present.
+
+SIP_SUPPORT            = NO
+
+# For Microsoft's IDL there are propget and propput attributes to indicate getter 
+# and setter methods for a property. Setting this option to YES (the default) 
+# will make doxygen replace the get and set methods by a property in the 
+# documentation. This will only work if the methods are indeed getting or 
+# setting a simple type. If this is not the case, or you want to show the 
+# methods anyway, you should set this option to NO.
+
+IDL_PROPERTY_SUPPORT   = YES
+
+# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC 
+# tag is set to YES, then doxygen will reuse the documentation of the first 
+# member in the group (if any) for the other members of the group. By default 
+# all members of a group must be documented explicitly.
+
+DISTRIBUTE_GROUP_DOC   = NO
+
+# Set the SUBGROUPING tag to YES (the default) to allow class member groups of 
+# the same type (for instance a group of public functions) to be put as a 
+# subgroup of that type (e.g. under the Public Functions section). Set it to 
+# NO to prevent subgrouping. Alternatively, this can be done per class using 
+# the \nosubgrouping command.
+
+SUBGROUPING            = YES
+
+# When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum 
+# is documented as struct, union, or enum with the name of the typedef. So 
+# typedef struct TypeS {} TypeT, will appear in the documentation as a struct 
+# with name TypeT. When disabled the typedef will appear as a member of a file, 
+# namespace, or class. And the struct will be named TypeS. This can typically 
+# be useful for C code in case the coding convention dictates that all compound 
+# types are typedef'ed and only the typedef is referenced, never the tag name.
+
+TYPEDEF_HIDES_STRUCT   = NO
+
+# The SYMBOL_CACHE_SIZE determines the size of the internal cache used to 
+# determine which symbols to keep in memory and which to flush to disk. 
+# When the cache is full, less often used symbols will be written to disk. 
+# For small to medium size projects (<1000 input files) the default value is 
+# probably good enough. For larger projects a too small cache size can cause 
+# doxygen to be busy swapping symbols to and from disk most of the time 
+# causing a significant performance penalty. 
+# If the system has enough physical memory increasing the cache will improve the 
+# performance by keeping more symbols in memory. Note that the value works on 
+# a logarithmic scale so increasing the size by one will roughly double the 
+# memory usage. The cache size is given by this formula: 
+# 2^(16+SYMBOL_CACHE_SIZE). The valid range is 0..9, the default is 0, 
+# corresponding to a cache size of 2^16 = 65536 symbols
+
+SYMBOL_CACHE_SIZE      = 0
+
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+
+# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in 
+# documentation are documented, even if no documentation was available. 
+# Private class members and static file members will be hidden unless 
+# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES
+
+EXTRACT_ALL            = NO
+
+# If the EXTRACT_PRIVATE tag is set to YES all private members of a class 
+# will be included in the documentation.
+
+EXTRACT_PRIVATE        = NO
+
+# If the EXTRACT_STATIC tag is set to YES all static members of a file 
+# will be included in the documentation.
+
+EXTRACT_STATIC         = YES
+
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) 
+# defined locally in source files will be included in the documentation. 
+# If set to NO only classes defined in header files are included.
+
+EXTRACT_LOCAL_CLASSES  = NO
+
+# This flag is only useful for Objective-C code. When set to YES local 
+# methods, which are defined in the implementation section but not in 
+# the interface are included in the documentation. 
+# If set to NO (the default) only methods in the interface are included.
+
+EXTRACT_LOCAL_METHODS  = NO
+
+# If this flag is set to YES, the members of anonymous namespaces will be 
+# extracted and appear in the documentation as a namespace called 
+# 'anonymous_namespace{file}', where file will be replaced with the base 
+# name of the file that contains the anonymous namespace. By default 
+# anonymous namespaces are hidden.
+
+EXTRACT_ANON_NSPACES   = NO
+
+# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all 
+# undocumented members of documented classes, files or namespaces. 
+# If set to NO (the default) these members will be included in the 
+# various overviews, but no documentation section is generated. 
+# This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_MEMBERS     = NO
+
+# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all 
+# undocumented classes that are normally visible in the class hierarchy. 
+# If set to NO (the default) these classes will be included in the various 
+# overviews. This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_CLASSES     = NO
+
+# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all 
+# friend (class|struct|union) declarations. 
+# If set to NO (the default) these declarations will be included in the 
+# documentation.
+
+HIDE_FRIEND_COMPOUNDS  = NO
+
+# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any 
+# documentation blocks found inside the body of a function. 
+# If set to NO (the default) these blocks will be appended to the 
+# function's detailed documentation block.
+
+HIDE_IN_BODY_DOCS      = NO
+
+# The INTERNAL_DOCS tag determines if documentation 
+# that is typed after a \internal command is included. If the tag is set 
+# to NO (the default) then the documentation will be excluded. 
+# Set it to YES to include the internal documentation.
+
+INTERNAL_DOCS          = NO
+
+# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate 
+# file names in lower-case letters. If set to YES upper-case letters are also 
+# allowed. This is useful if you have classes or files whose names only differ 
+# in case and if your file system supports case sensitive file names. Windows 
+# and Mac users are advised to set this option to NO.
+
+CASE_SENSE_NAMES       = NO
+
+# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen 
+# will show members with their full class and namespace scopes in the 
+# documentation. If set to YES the scope will be hidden.
+
+HIDE_SCOPE_NAMES       = NO
+
+# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen 
+# will put a list of the files that are included by a file in the documentation 
+# of that file.
+
+SHOW_INCLUDE_FILES     = YES
+
+# If the FORCE_LOCAL_INCLUDES tag is set to YES then Doxygen 
+# will list include files with double quotes in the documentation 
+# rather than with sharp brackets.
+
+FORCE_LOCAL_INCLUDES   = NO
+
+# If the INLINE_INFO tag is set to YES (the default) then a tag [inline] 
+# is inserted in the documentation for inline members.
+
+INLINE_INFO            = YES
+
+# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen 
+# will sort the (detailed) documentation of file and class members 
+# alphabetically by member name. If set to NO the members will appear in 
+# declaration order.
+
+SORT_MEMBER_DOCS       = YES
+
+# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the 
+# brief documentation of file, namespace and class members alphabetically 
+# by member name. If set to NO (the default) the members will appear in 
+# declaration order.
+
+SORT_BRIEF_DOCS        = NO
+
+# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen
+# will sort the (brief and detailed) documentation of class members so that
+# constructors and destructors are listed first. If set to NO (the default)
+# the constructors will appear in the respective orders defined by
+# SORT_MEMBER_DOCS and SORT_BRIEF_DOCS.
+# This tag will be ignored for brief docs if SORT_BRIEF_DOCS is set to NO
+# and ignored for detailed docs if SORT_MEMBER_DOCS is set to NO.
+
+SORT_MEMBERS_CTORS_1ST = NO
+
+# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the 
+# hierarchy of group names into alphabetical order. If set to NO (the default) 
+# the group names will appear in their defined order.
+
+SORT_GROUP_NAMES       = NO
+
+# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be 
+# sorted by fully-qualified names, including namespaces. If set to 
+# NO (the default), the class list will be sorted only by class name, 
+# not including the namespace part. 
+# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. 
+# Note: This option applies only to the class list, not to the 
+# alphabetical list.
+
+SORT_BY_SCOPE_NAME     = NO
+
+# The GENERATE_TODOLIST tag can be used to enable (YES) or 
+# disable (NO) the todo list. This list is created by putting \todo 
+# commands in the documentation.
+
+GENERATE_TODOLIST      = YES
+
+# The GENERATE_TESTLIST tag can be used to enable (YES) or 
+# disable (NO) the test list. This list is created by putting \test 
+# commands in the documentation.
+
+GENERATE_TESTLIST      = YES
+
+# The GENERATE_BUGLIST tag can be used to enable (YES) or 
+# disable (NO) the bug list. This list is created by putting \bug 
+# commands in the documentation.
+
+GENERATE_BUGLIST       = YES
+
+# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or 
+# disable (NO) the deprecated list. This list is created by putting 
+# \deprecated commands in the documentation.
+
+GENERATE_DEPRECATEDLIST= YES
+
+# The ENABLED_SECTIONS tag can be used to enable conditional 
+# documentation sections, marked by \if sectionname ... \endif.
+
+ENABLED_SECTIONS       = 
+
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines 
+# the initial value of a variable or define consists of for it to appear in 
+# the documentation. If the initializer consists of more lines than specified 
+# here it will be hidden. Use a value of 0 to hide initializers completely. 
+# The appearance of the initializer of individual variables and defines in the 
+# documentation can be controlled using \showinitializer or \hideinitializer 
+# command in the documentation regardless of this setting.
+
+MAX_INITIALIZER_LINES  = 30
+
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated 
+# at the bottom of the documentation of classes and structs. If set to YES the 
+# list will mention the files that were used to generate the documentation.
+
+SHOW_USED_FILES        = YES
+
+# If the sources in your project are distributed over multiple directories 
+# then setting the SHOW_DIRECTORIES tag to YES will show the directory hierarchy 
+# in the documentation. The default is NO.
+
+SHOW_DIRECTORIES       = YES
+
+# Set the SHOW_FILES tag to NO to disable the generation of the Files page. 
+# This will remove the Files entry from the Quick Index and from the 
+# Folder Tree View (if specified). The default is YES.
+
+SHOW_FILES             = YES
+
+# Set the SHOW_NAMESPACES tag to NO to disable the generation of the 
+# Namespaces page.  This will remove the Namespaces entry from the Quick Index 
+# and from the Folder Tree View (if specified). The default is YES.
+
+SHOW_NAMESPACES        = YES
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that 
+# doxygen should invoke to get the current version for each file (typically from 
+# the version control system). Doxygen will invoke the program by executing (via 
+# popen()) the command <command> <input-file>, where <command> is the value of 
+# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file 
+# provided by doxygen. Whatever the program writes to standard output 
+# is used as the file version. See the manual for examples.
+
+FILE_VERSION_FILTER    = 
+
+# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed by 
+# doxygen. The layout file controls the global structure of the generated output files 
+# in an output format independent way. To create the layout file that represents 
+# doxygen's defaults, run doxygen with the -l option. You can optionally specify a 
+# file name after the option, if omitted DoxygenLayout.xml will be used as the name 
+# of the layout file.
+
+LAYOUT_FILE            = 
+
+#---------------------------------------------------------------------------
+# configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+
+# The QUIET tag can be used to turn on/off the messages that are generated 
+# by doxygen. Possible values are YES and NO. If left blank NO is used.
+
+QUIET                  = NO
+
+# The WARNINGS tag can be used to turn on/off the warning messages that are 
+# generated by doxygen. Possible values are YES and NO. If left blank 
+# NO is used.
+
+WARNINGS               = YES
+
+# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings 
+# for undocumented members. If EXTRACT_ALL is set to YES then this flag will 
+# automatically be disabled.
+
+WARN_IF_UNDOCUMENTED   = YES
+
+# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for 
+# potential errors in the documentation, such as not documenting some 
+# parameters in a documented function, or documenting parameters that 
+# don't exist or using markup commands wrongly.
+
+WARN_IF_DOC_ERROR      = YES
+
+# The WARN_NO_PARAMDOC option can be enabled to get warnings for 
+# functions that are documented, but have no documentation for their parameters 
+# or return value. If set to NO (the default) doxygen will only warn about 
+# wrong or incomplete parameter documentation, but not about the absence of 
+# documentation.
+
+WARN_NO_PARAMDOC       = NO
+
+# The WARN_FORMAT tag determines the format of the warning messages that 
+# doxygen can produce. The string should contain the $file, $line, and $text 
+# tags, which will be replaced by the file and line number from which the 
+# warning originated and the warning text. Optionally the format may contain 
+# $version, which will be replaced by the version of the file (if it could 
+# be obtained via FILE_VERSION_FILTER)
+
+WARN_FORMAT            = "$file:$line: $text"
+
+# The WARN_LOGFILE tag can be used to specify a file to which warning 
+# and error messages should be written. If left blank the output is written 
+# to stderr.
+
+WARN_LOGFILE           = 
+
+#---------------------------------------------------------------------------
+# configuration options related to the input files
+#---------------------------------------------------------------------------
+
+# The INPUT tag can be used to specify the files and/or directories that contain 
+# documented source files. You may enter file names like "myfile.cpp" or 
+# directories like "/usr/src/myproject". Separate the files or directories 
+# with spaces.
+
+INPUT                  = @PacBioBAM_IncludeDir@
+
+# This tag can be used to specify the character encoding of the source files 
+# that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is 
+# also the default input encoding. Doxygen uses libiconv (or the iconv built 
+# into libc) for the transcoding. See http://www.gnu.org/software/libiconv for 
+# the list of possible encodings.
+
+INPUT_ENCODING         = UTF-8
+
+# If the value of the INPUT tag contains directories, you can use the 
+# FILE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp 
+# and *.h) to filter out the source-files in the directories. If left 
+# blank the following patterns are tested: 
+# *.c *.cc *.cxx *.cpp *.c++ *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh *.hxx 
+# *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.py *.f90
+
+FILE_PATTERNS          = *.c \
+                         *.cc \
+                         *.cxx \
+                         *.cpp \
+                         *.c++ \
+                         *.d \
+                         *.java \
+                         *.ii \
+                         *.ixx \
+                         *.ipp \
+                         *.i++ \
+                         *.inl \
+                         *.h \
+                         *.hh \
+                         *.hxx \
+                         *.hpp \
+                         *.h++ \
+                         *.idl \
+                         *.odl \
+                         *.cs \
+                         *.php \
+                         *.php3 \
+                         *.inc \
+                         *.m \
+                         *.mm \
+                         *.dox \
+                         *.py \
+                         *.f90 \
+                         *.f \
+                         *.vhd \
+                         *.vhdl
+
+# The RECURSIVE tag can be used to specify whether or not subdirectories 
+# should be searched for input files as well. Possible values are YES and NO. 
+# If left blank NO is used.
+
+RECURSIVE              = YES
+
+# The EXCLUDE tag can be used to specify files and/or directories that should 
+# be excluded from the INPUT source files. This way you can easily exclude a 
+# subdirectory from a directory tree whose root is specified with the INPUT tag.
+
+EXCLUDE                = @PacBioBAM_IncludeDir@/pbbam/internal 
+
+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or 
+# directories that are symbolic links (a Unix filesystem feature) are excluded 
+# from the input.
+
+EXCLUDE_SYMLINKS       = NO
+
+# If the value of the INPUT tag contains directories, you can use the 
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude 
+# certain files from those directories. Note that the wildcards are matched 
+# against the file with absolute path, so to exclude all test directories 
+# for example use the pattern */test/*
+
+EXCLUDE_PATTERNS       = 
+
+# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names 
+# (namespaces, classes, functions, etc.) that should be excluded from the 
+# output. The symbol name can be a fully qualified name, a word, or if the 
+# wildcard * is used, a substring. Examples: ANamespace, AClass, 
+# AClass::ANamespace, ANamespace::*Test
+
+EXCLUDE_SYMBOLS        = pugi, PacBio::BAM::internal
+
+# The EXAMPLE_PATH tag can be used to specify one or more files or 
+# directories that contain example code fragments that are included (see 
+# the \include command).
+
+EXAMPLE_PATH           = examples 
+
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the 
+# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp 
+# and *.h) to filter out the source-files in the directories. If left 
+# blank all files are included.
+
+EXAMPLE_PATTERNS       = *
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be 
+# searched for input files to be used with the \include or \dontinclude 
+# commands irrespective of the value of the RECURSIVE tag. 
+# Possible values are YES and NO. If left blank NO is used.
+
+EXAMPLE_RECURSIVE      = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or 
+# directories that contain images that are included in the documentation (see 
+# the \image command).
+
+IMAGE_PATH             = 
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should 
+# invoke to filter for each input file. Doxygen will invoke the filter program 
+# by executing (via popen()) the command <filter> <input-file>, where <filter> 
+# is the value of the INPUT_FILTER tag, and <input-file> is the name of an 
+# input file. Doxygen will then use the output that the filter program writes 
+# to standard output.  If FILTER_PATTERNS is specified, this tag will be 
+# ignored.
+
+INPUT_FILTER           = 
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern 
+# basis.  Doxygen will compare the file name with each pattern and apply the 
+# filter if there is a match.  The filters are a list of the form: 
+# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further 
+# info on how filters are used. If FILTER_PATTERNS is empty, INPUT_FILTER 
+# is applied to all files.
+
+FILTER_PATTERNS        = 
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using 
+# INPUT_FILTER) will be used to filter the input files when producing source 
+# files to browse (i.e. when SOURCE_BROWSER is set to YES).
+
+FILTER_SOURCE_FILES    = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will 
+# be generated. Documented entities will be cross-referenced with these sources. 
+# Note: To get rid of all source code in the generated output, make sure also 
+# VERBATIM_HEADERS is set to NO.
+
+SOURCE_BROWSER         = NO
+
+# Setting the INLINE_SOURCES tag to YES will include the body 
+# of functions and classes directly in the documentation.
+
+INLINE_SOURCES         = NO
+
+# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct 
+# doxygen to hide any special comment blocks from generated source code 
+# fragments. Normal C and C++ comments will always remain visible.
+
+STRIP_CODE_COMMENTS    = YES
+
+# If the REFERENCED_BY_RELATION tag is set to YES 
+# then for each documented function all documented 
+# functions referencing it will be listed.
+
+REFERENCED_BY_RELATION = NO
+
+# If the REFERENCES_RELATION tag is set to YES 
+# then for each documented function all documented entities 
+# called/used by that function will be listed.
+
+REFERENCES_RELATION    = NO
+
+# If the REFERENCES_LINK_SOURCE tag is set to YES (the default) 
+# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from 
+# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will 
+# link to the source code.  Otherwise they will link to the documentation.
+
+REFERENCES_LINK_SOURCE = YES
+
+# If the USE_HTAGS tag is set to YES then the references to source code 
+# will point to the HTML generated by the htags(1) tool instead of doxygen 
+# built-in source browser. The htags tool is part of GNU's global source 
+# tagging system (see http://www.gnu.org/software/global/global.html). You 
+# will need version 4.8.6 or higher.
+
+USE_HTAGS              = NO
+
+# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen 
+# will generate a verbatim copy of the header file for each class for 
+# which an include is specified. Set to NO to disable this.
+
+VERBATIM_HEADERS       = YES
+
+#---------------------------------------------------------------------------
+# configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+
+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index 
+# of all compounds will be generated. Enable this if the project 
+# contains a lot of classes, structs, unions or interfaces.
+
+ALPHABETICAL_INDEX     = NO
+
+# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then 
+# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns 
+# in which this list will be split (can be a number in the range [1..20])
+
+COLS_IN_ALPHA_INDEX    = 5
+
+# In case all classes in a project start with a common prefix, all 
+# classes will be put under the same header in the alphabetical index. 
+# The IGNORE_PREFIX tag can be used to specify one or more prefixes that 
+# should be ignored while generating the index headers.
+
+IGNORE_PREFIX          = 
+
+#---------------------------------------------------------------------------
+# configuration options related to the HTML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_HTML tag is set to YES (the default) Doxygen will 
+# generate HTML output.
+
+GENERATE_HTML          = YES
+
+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. 
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be 
+# put in front of it. If left blank `html' will be used as the default path.
+
+HTML_OUTPUT            = html
+
+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for 
+# each generated HTML page (for example: .htm,.php,.asp). If it is left blank 
+# doxygen will generate files with .html extension.
+
+HTML_FILE_EXTENSION    = .html
+
+# The HTML_HEADER tag can be used to specify a personal HTML header for 
+# each generated HTML page. If it is left blank doxygen will generate a 
+# standard header.
+
+HTML_HEADER            = 
+
+# The HTML_FOOTER tag can be used to specify a personal HTML footer for 
+# each generated HTML page. If it is left blank doxygen will generate a 
+# standard footer.
+
+HTML_FOOTER            = 
+
+# The HTML_STYLESHEET tag can be used to specify a user-defined cascading 
+# style sheet that is used by each HTML page. It can be used to 
+# fine-tune the look of the HTML output. If the tag is left blank doxygen 
+# will generate a default style sheet. Note that doxygen will try to copy 
+# the style sheet file to the HTML output directory, so don't put your own 
+# stylesheet in the HTML output directory as well, or it will be erased!
+
+HTML_STYLESHEET        = 
+
+# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML 
+# page will contain the date and time when the page was generated. Setting 
+# this to NO can help when comparing the output of multiple runs.
+
+HTML_TIMESTAMP         = YES
+
+# If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes, 
+# files or namespaces will be aligned in HTML using tables. If set to 
+# NO a bullet list will be used.
+
+HTML_ALIGN_MEMBERS     = YES
+
+# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML 
+# documentation will contain sections that can be hidden and shown after the 
+# page has loaded. For this to work a browser that supports 
+# JavaScript and DHTML is required (for instance Mozilla 1.0+, Firefox, 
+# Netscape 6.0+, Internet explorer 5.0+, Konqueror, or Safari).
+
+HTML_DYNAMIC_SECTIONS  = YES
+
+# If the GENERATE_DOCSET tag is set to YES, additional index files 
+# will be generated that can be used as input for Apple's Xcode 3 
+# integrated development environment, introduced with OSX 10.5 (Leopard). 
+# To create a documentation set, doxygen will generate a Makefile in the 
+# HTML output directory. Running make will produce the docset in that 
+# directory and running "make install" will install the docset in 
+# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find 
+# it at startup. 
+# See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html for more information.
+
+GENERATE_DOCSET        = NO
+
+# When GENERATE_DOCSET tag is set to YES, this tag determines the name of the 
+# feed. A documentation feed provides an umbrella under which multiple 
+# documentation sets from a single provider (such as a company or product suite) 
+# can be grouped.
+
+DOCSET_FEEDNAME        = "Doxygen generated docs"
+
+# When GENERATE_DOCSET tag is set to YES, this tag specifies a string that 
+# should uniquely identify the documentation set bundle. This should be a 
+# reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen 
+# will append .docset to the name.
+
+DOCSET_BUNDLE_ID       = org.doxygen.Project
+
+# If the GENERATE_HTMLHELP tag is set to YES, additional index files 
+# will be generated that can be used as input for tools like the 
+# Microsoft HTML help workshop to generate a compiled HTML help file (.chm) 
+# of the generated HTML documentation.
+
+GENERATE_HTMLHELP      = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can 
+# be used to specify the file name of the resulting .chm file. You 
+# can add a path in front of the file if the result should not be 
+# written to the html output directory.
+
+CHM_FILE               = 
+
+# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can 
+# be used to specify the location (absolute path including file name) of 
+# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run 
+# the HTML help compiler on the generated index.hhp.
+
+HHC_LOCATION           = 
+
+# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag 
+# controls if a separate .chi index file is generated (YES) or that 
+# it should be included in the master .chm file (NO).
+
+GENERATE_CHI           = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_INDEX_ENCODING 
+# is used to encode HtmlHelp index (hhk), content (hhc) and project file 
+# content.
+
+CHM_INDEX_ENCODING     = 
+
+# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag 
+# controls whether a binary table of contents is generated (YES) or a 
+# normal table of contents (NO) in the .chm file.
+
+BINARY_TOC             = NO
+
+# The TOC_EXPAND flag can be set to YES to add extra items for group members 
+# to the contents of the HTML help documentation and to the tree view.
+
+TOC_EXPAND             = NO
+
+# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and QHP_VIRTUAL_FOLDER 
+# are set, an additional index file will be generated that can be used as input for 
+# Qt's qhelpgenerator to generate a Qt Compressed Help (.qch) of the generated 
+# HTML documentation.
+
+GENERATE_QHP           = NO
+
+# If the QHG_LOCATION tag is specified, the QCH_FILE tag can 
+# be used to specify the file name of the resulting .qch file. 
+# The path specified is relative to the HTML output folder.
+
+QCH_FILE               = 
+
+# The QHP_NAMESPACE tag specifies the namespace to use when generating 
+# Qt Help Project output. For more information please see 
+# http://doc.trolltech.com/qthelpproject.html#namespace
+
+QHP_NAMESPACE          = org.doxygen.Project
+
+# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating 
+# Qt Help Project output. For more information please see 
+# http://doc.trolltech.com/qthelpproject.html#virtual-folders
+
+QHP_VIRTUAL_FOLDER     = doc
+
+# If QHP_CUST_FILTER_NAME is set, it specifies the name of a custom filter to add. 
+# For more information please see 
+# http://doc.trolltech.com/qthelpproject.html#custom-filters
+
+QHP_CUST_FILTER_NAME   = 
+
+# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the custom filter to add. For more information please see 
+# <a href="http://doc.trolltech.com/qthelpproject.html#custom-filters">Qt Help Project / Custom Filters</a>.
+
+QHP_CUST_FILTER_ATTRS  = 
+
+# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this project's 
+# filter section matches. 
+# <a href="http://doc.trolltech.com/qthelpproject.html#filter-attributes">Qt Help Project / Filter Attributes</a>.
+
+QHP_SECT_FILTER_ATTRS  = 
+
+# If the GENERATE_QHP tag is set to YES, the QHG_LOCATION tag can 
+# be used to specify the location of Qt's qhelpgenerator. 
+# If non-empty doxygen will try to run qhelpgenerator on the generated 
+# .qhp file.
+
+QHG_LOCATION           = 
+
+# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files  
+# will be generated, which together with the HTML files, form an Eclipse help  
+# plugin. To install this plugin and make it available under the help contents 
+# menu in Eclipse, the contents of the directory containing the HTML and XML 
+# files needs to be copied into the plugins directory of eclipse. The name of 
+# the directory within the plugins directory should be the same as 
+# the ECLIPSE_DOC_ID value. After copying Eclipse needs to be restarted before
+# the help appears.
+
+GENERATE_ECLIPSEHELP   = NO
+
+# A unique identifier for the eclipse help plugin. When installing the plugin 
+# the directory name containing the HTML and XML files should also have 
+# this name.
+
+ECLIPSE_DOC_ID         = org.doxygen.Project
+
+# The DISABLE_INDEX tag can be used to turn on/off the condensed index at 
+# top of each HTML page. The value NO (the default) enables the index and 
+# the value YES disables it.
+
+DISABLE_INDEX          = NO
+
+# This tag can be used to set the number of enum values (range [1..20]) 
+# that doxygen will group on one line in the generated HTML documentation.
+
+ENUM_VALUES_PER_LINE   = 4
+
+# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index 
+# structure should be generated to display hierarchical information. 
+# If the tag value is set to YES, a side panel will be generated 
+# containing a tree-like index structure (just like the one that 
+# is generated for HTML Help). For this to work a browser that supports 
+# JavaScript, DHTML, CSS and frames is required (i.e. any modern browser). 
+# Windows users are probably better off using the HTML help feature.
+
+GENERATE_TREEVIEW      = NO
+
+# By enabling USE_INLINE_TREES, doxygen will generate the Groups, Directories, 
+# and Class Hierarchy pages using a tree view instead of an ordered list.
+
+USE_INLINE_TREES       = NO
+
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be 
+# used to set the initial width (in pixels) of the frame in which the tree 
+# is shown.
+
+TREEVIEW_WIDTH         = 250
+
+# Use this tag to change the font size of Latex formulas included 
+# as images in the HTML documentation. The default is 10. Note that 
+# when you change the font size after a successful doxygen run you need 
+# to manually remove any form_*.png images from the HTML output directory 
+# to force them to be regenerated.
+
+FORMULA_FONTSIZE       = 10
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box
+# for the HTML output. The underlying search engine uses javascript 
+# and DHTML and should work on any modern browser. Note that when using
+# HTML help (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets
+# (GENERATE_DOCSET) there is already a search function so this one should 
+# typically be disabled. For large projects the javascript based search engine 
+# can be slow, then enabling SERVER_BASED_SEARCH may provide a better solution.
+
+SEARCHENGINE           = YES
+
+# When the SERVER_BASED_SEARCH tag is enabled the search engine will be
+# implemented using a PHP enabled web server instead of at the web client
+# using Javascript. Doxygen will generate the search PHP script and index 
+# file to put on the web server. The advantage of the server
+# based approach is that it scales better to large projects and allows
+# full text search. The disadvantage is that it is more difficult to set up 
+# and does not have live searching capabilities.
+
+SERVER_BASED_SEARCH    = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will 
+# generate Latex output.
+
+GENERATE_LATEX         = NO
+
+# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. 
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be 
+# put in front of it. If left blank `latex' will be used as the default path.
+
+LATEX_OUTPUT           = latex
+
+# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be 
+# invoked. If left blank `latex' will be used as the default command name. 
+# Note that when enabling USE_PDFLATEX this option is only used for 
+# generating bitmaps for formulas in the HTML output, but not in the 
+# Makefile that is written to the output directory.
+
+LATEX_CMD_NAME         = latex
+
+# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to 
+# generate index for LaTeX. If left blank `makeindex' will be used as the 
+# default command name.
+
+MAKEINDEX_CMD_NAME     = makeindex
+
+# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact 
+# LaTeX documents. This may be useful for small projects and may help to 
+# save some trees in general.
+
+COMPACT_LATEX          = NO
+
+# The PAPER_TYPE tag can be used to set the paper type that is used 
+# by the printer. Possible values are: a4, a4wide, letter, legal and 
+# executive. If left blank a4wide will be used.
+
+PAPER_TYPE             = a4wide
+
+# The EXTRA_PACKAGES tag can be used to specify one or more names of LaTeX 
+# packages that should be included in the LaTeX output.
+
+EXTRA_PACKAGES         = 
+
+# The LATEX_HEADER tag can be used to specify a personal LaTeX header for 
+# the generated latex document. The header should contain everything until 
+# the first chapter. If it is left blank doxygen will generate a 
+# standard header. Notice: only use this tag if you know what you are doing!
+
+LATEX_HEADER           = 
+
+# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated 
+# is prepared for conversion to pdf (using ps2pdf). The pdf file will 
+# contain links (just like the HTML output) instead of page references. 
+# This makes the output suitable for online browsing using a pdf viewer.
+
+PDF_HYPERLINKS         = YES
+
+# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of 
+# plain latex in the generated Makefile. Set this option to YES to get a 
+# higher quality PDF documentation.
+
+USE_PDFLATEX           = YES
+
+# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode 
+# command to the generated LaTeX files. This will instruct LaTeX to keep 
+# running if errors occur, instead of asking the user for help. 
+# This option is also used when generating formulas in HTML.
+
+LATEX_BATCHMODE        = NO
+
+# If LATEX_HIDE_INDICES is set to YES then doxygen will not 
+# include the index chapters (such as File Index, Compound Index, etc.) 
+# in the output.
+
+LATEX_HIDE_INDICES     = NO
+
+# If LATEX_SOURCE_CODE is set to YES then doxygen will include
+# source code with syntax highlighting in the LaTeX output.
+# Note that which sources are shown also depends on other settings
+# such as SOURCE_BROWSER.
+
+LATEX_SOURCE_CODE      = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the RTF output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output. 
+# The RTF output is optimized for Word 97 and may not look very pretty with 
+# other RTF readers or editors.
+
+GENERATE_RTF           = NO
+
+# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. 
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be 
+# put in front of it. If left blank `rtf' will be used as the default path.
+
+RTF_OUTPUT             = rtf
+
+# If the COMPACT_RTF tag is set to YES Doxygen generates more compact 
+# RTF documents. This may be useful for small projects and may help to 
+# save some trees in general.
+
+COMPACT_RTF            = NO
+
+# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated 
+# will contain hyperlink fields. The RTF file will 
+# contain links (just like the HTML output) instead of page references. 
+# This makes the output suitable for online browsing using WORD or other 
+# programs which support those fields. 
+# Note: wordpad (write) and others do not support links.
+
+RTF_HYPERLINKS         = NO
+
+# Load stylesheet definitions from file. Syntax is similar to doxygen's 
+# config file, i.e. a series of assignments. You only have to provide 
+# replacements, missing definitions are set to their default value.
+
+RTF_STYLESHEET_FILE    = 
+
+# Set optional variables used in the generation of an rtf document. 
+# Syntax is similar to doxygen's config file.
+
+RTF_EXTENSIONS_FILE    = 
+
+#---------------------------------------------------------------------------
+# configuration options related to the man page output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_MAN tag is set to YES (the default) Doxygen will 
+# generate man pages.
+
+GENERATE_MAN           = NO
+
+# The MAN_OUTPUT tag is used to specify where the man pages will be put. 
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be 
+# put in front of it. If left blank `man' will be used as the default path.
+
+MAN_OUTPUT             = man
+
+# The MAN_EXTENSION tag determines the extension that is added to 
+# the generated man pages (default is the subroutine's section .3)
+
+MAN_EXTENSION          = .3
+
+# If the MAN_LINKS tag is set to YES and Doxygen generates man output, 
+# then it will generate one additional man file for each entity 
+# documented in the real man page(s). These additional files 
+# only source the real man page, but without them the man command 
+# would be unable to find the correct page. The default is NO.
+
+MAN_LINKS              = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the XML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_XML tag is set to YES Doxygen will 
+# generate an XML file that captures the structure of 
+# the code including all documentation.
+
+GENERATE_XML           = YES
+
+# The XML_OUTPUT tag is used to specify where the XML pages will be put. 
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be 
+# put in front of it. If left blank `xml' will be used as the default path.
+
+XML_OUTPUT             = xml
+
+# The XML_SCHEMA tag can be used to specify an XML schema, 
+# which can be used by a validating XML parser to check the 
+# syntax of the XML files.
+
+XML_SCHEMA             = 
+
+# The XML_DTD tag can be used to specify an XML DTD, 
+# which can be used by a validating XML parser to check the 
+# syntax of the XML files.
+
+XML_DTD                = 
+
+# If the XML_PROGRAMLISTING tag is set to YES Doxygen will 
+# dump the program listings (including syntax highlighting 
+# and cross-referencing information) to the XML output. Note that 
+# enabling this will significantly increase the size of the XML output.
+
+XML_PROGRAMLISTING     = YES
+
+#---------------------------------------------------------------------------
+# configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will 
+# generate an AutoGen Definitions (see autogen.sf.net) file 
+# that captures the structure of the code including all 
+# documentation. Note that this feature is still experimental 
+# and incomplete at the moment.
+
+GENERATE_AUTOGEN_DEF   = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the Perl module output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_PERLMOD tag is set to YES Doxygen will 
+# generate a Perl module file that captures the structure of 
+# the code including all documentation. Note that this 
+# feature is still experimental and incomplete at the 
+# moment.
+
+GENERATE_PERLMOD       = NO
+
+# If the PERLMOD_LATEX tag is set to YES Doxygen will generate 
+# the necessary Makefile rules, Perl scripts and LaTeX code to be able 
+# to generate PDF and DVI output from the Perl module output.
+
+PERLMOD_LATEX          = NO
+
+# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be 
+# nicely formatted so it can be parsed by a human reader.  This is useful 
+# if you want to understand what is going on.  On the other hand, if this 
+# tag is set to NO the size of the Perl module output will be much smaller 
+# and Perl will parse it just the same.
+
+PERLMOD_PRETTY         = YES
+
+# The names of the make variables in the generated doxyrules.make file 
+# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. 
+# This is useful so different doxyrules.make files included by the same 
+# Makefile don't overwrite each other's variables.
+
+PERLMOD_MAKEVAR_PREFIX = 
+
+#---------------------------------------------------------------------------
+# Configuration options related to the preprocessor
+#---------------------------------------------------------------------------
+
+# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will 
+# evaluate all C-preprocessor directives found in the sources and include 
+# files.
+
+ENABLE_PREPROCESSING   = YES
+
+# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro 
+# names in the source code. If set to NO (the default) only conditional 
+# compilation will be performed. Macro expansion can be done in a controlled 
+# way by setting EXPAND_ONLY_PREDEF to YES.
+
+MACRO_EXPANSION        = NO
+
+# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES 
+# then the macro expansion is limited to the macros specified with the 
+# PREDEFINED and EXPAND_AS_DEFINED tags.
+
+EXPAND_ONLY_PREDEF     = NO
+
+# If the SEARCH_INCLUDES tag is set to YES (the default) the include files 
+# in the INCLUDE_PATH (see below) will be searched if a #include is found.
+
+SEARCH_INCLUDES        = YES
+
+# The INCLUDE_PATH tag can be used to specify one or more directories that 
+# contain include files that are not input files but should be processed by 
+# the preprocessor.
+
+INCLUDE_PATH           = 
+
+# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard 
+# patterns (like *.h and *.hpp) to filter out the header-files in the 
+# directories. If left blank, the patterns specified with FILE_PATTERNS will 
+# be used.
+
+INCLUDE_FILE_PATTERNS  = 
+
+# The PREDEFINED tag can be used to specify one or more macro names that 
+# are defined before the preprocessor is started (similar to the -D option of 
+# gcc). The argument of the tag is a list of macros of the form: name 
+# or name=definition (no spaces). If the definition and the = are 
+# omitted =1 is assumed. To prevent a macro definition from being 
+# undefined via #undef or recursively expanded use the := operator 
+# instead of the = operator.
+
+PREDEFINED             = 
+
+# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then 
+# this tag can be used to specify a list of macro names that should be expanded. 
+# The macro definition that is found in the sources will be used. 
+# Use the PREDEFINED tag if you want to use a different macro definition.
+
+EXPAND_AS_DEFINED      = 
+
+# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then 
+# doxygen's preprocessor will remove all function-like macros that are alone 
+# on a line, have an all uppercase name, and do not end with a semicolon. Such 
+# function macros are typically used for boiler-plate code, and will confuse 
+# the parser if not removed.
+
+SKIP_FUNCTION_MACROS   = YES
+
+#---------------------------------------------------------------------------
+# Configuration::additions related to external references
+#---------------------------------------------------------------------------
+
+# The TAGFILES option can be used to specify one or more tagfiles. 
+# Optionally an initial location of the external documentation 
+# can be added for each tagfile. The format of a tag file without 
+# this location is as follows: 
+#   TAGFILES = file1 file2 ... 
+# Adding location for the tag files is done as follows: 
+#   TAGFILES = file1=loc1 "file2 = loc2" ... 
+# where "loc1" and "loc2" can be relative or absolute paths or 
+# URLs. If a location is present for each tag, the installdox tool 
+# does not have to be run to correct the links. 
+# Note that each tag file must have a unique name 
+# (where the name does NOT include the path) 
+# If a tag file is not located in the directory in which doxygen 
+# is run, you must also specify the path to the tagfile here.
+
+TAGFILES               = 
+
+# When a file name is specified after GENERATE_TAGFILE, doxygen will create 
+# a tag file that is based on the input files it reads.
+
+GENERATE_TAGFILE       = 
+
+# If the ALLEXTERNALS tag is set to YES all external classes will be listed 
+# in the class index. If set to NO only the inherited external classes 
+# will be listed.
+
+ALLEXTERNALS           = NO
+
+# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed 
+# in the modules index. If set to NO, only the current project's groups will 
+# be listed.
+
+EXTERNAL_GROUPS        = NO
+
+# The PERL_PATH should be the absolute path and name of the perl script 
+# interpreter (i.e. the result of `which perl').
+
+PERL_PATH              = /usr/bin/perl
+
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool
+#---------------------------------------------------------------------------
+
+# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will 
+# generate an inheritance diagram (in HTML, RTF and LaTeX) for classes with base 
+# or super classes. Setting the tag to NO turns the diagrams off. Note that 
+# this option is superseded by the HAVE_DOT option below. This is only a 
+# fallback. It is recommended to install and use dot, since it yields more 
+# powerful graphs.
+
+CLASS_DIAGRAMS         = NO
+
+# You can define message sequence charts within doxygen comments using the \msc 
+# command. Doxygen will then run the mscgen tool (see 
+# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the 
+# documentation. The MSCGEN_PATH tag allows you to specify the directory where 
+# the mscgen tool resides. If left empty the tool is assumed to be found in the 
+# default search path.
+
+MSCGEN_PATH            = 
+
+# If set to YES, the inheritance and collaboration graphs will hide 
+# inheritance and usage relations if the target is undocumented 
+# or is not a class.
+
+HIDE_UNDOC_RELATIONS   = YES
+
+# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is 
+# available from the path. This tool is part of Graphviz, a graph visualization 
+# toolkit from AT&T and Lucent Bell Labs. The other options in this section 
+# have no effect if this option is set to NO (the default)
+
+HAVE_DOT               = NO
+
+# By default doxygen will write a font called FreeSans.ttf to the output 
+# directory and reference it in all dot files that doxygen generates. This 
+# font does not include all possible unicode characters however, so when you need 
+# these (or just want a differently looking font) you can specify the font name 
+# using DOT_FONTNAME. You need to make sure dot is able to find the font, 
+# which can be done by putting it in a standard location or by setting the 
+# DOTFONTPATH environment variable or by setting DOT_FONTPATH to the directory 
+# containing the font.
+
+DOT_FONTNAME           = FreeSans
+
+# The DOT_FONTSIZE tag can be used to set the size of the font of dot graphs. 
+# The default size is 10pt.
+
+DOT_FONTSIZE           = 10
+
+# By default doxygen will tell dot to use the output directory to look for the 
+# FreeSans.ttf font (which doxygen will put there itself). If you specify a 
+# different font using DOT_FONTNAME you can set the path where dot 
+# can find it using this tag.
+
+DOT_FONTPATH           = 
+
+# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen 
+# will generate a graph for each documented class showing the direct and 
+# indirect inheritance relations. Setting this tag to YES will force the 
+# CLASS_DIAGRAMS tag to NO.
+
+CLASS_GRAPH            = YES
+
+# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen 
+# will generate a graph for each documented class showing the direct and 
+# indirect implementation dependencies (inheritance, containment, and 
+# class references variables) of the class with other documented classes.
+
+COLLABORATION_GRAPH    = YES
+
+# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen 
+# will generate a graph for groups, showing the direct group dependencies.
+
+GROUP_GRAPHS           = YES
+
+# If the UML_LOOK tag is set to YES doxygen will generate inheritance and 
+# collaboration diagrams in a style similar to the OMG's Unified Modeling 
+# Language.
+
+UML_LOOK               = NO
+
+# If set to YES, the inheritance and collaboration graphs will show the 
+# relations between templates and their instances.
+
+TEMPLATE_RELATIONS     = NO
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT 
+# tags are set to YES then doxygen will generate a graph for each documented 
+# file showing the direct and indirect include dependencies of the file with 
+# other documented files.
+
+INCLUDE_GRAPH          = YES
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and 
+# HAVE_DOT tags are set to YES then doxygen will generate a graph for each 
+# documented header file showing the documented files that directly or 
+# indirectly include this file.
+
+INCLUDED_BY_GRAPH      = YES
+
+# If the CALL_GRAPH and HAVE_DOT options are set to YES then 
+# doxygen will generate a call dependency graph for every global function 
+# or class method. Note that enabling this option will significantly increase 
+# the time of a run. So in most cases it will be better to enable call graphs 
+# for selected functions only using the \callgraph command.
+
+CALL_GRAPH             = NO
+
+# If the CALLER_GRAPH and HAVE_DOT tags are set to YES then 
+# doxygen will generate a caller dependency graph for every global function 
+# or class method. Note that enabling this option will significantly increase 
+# the time of a run. So in most cases it will be better to enable caller 
+# graphs for selected functions only using the \callergraph command.
+
+CALLER_GRAPH           = NO
+
+# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen 
+# will generate a graphical hierarchy of all classes instead of a textual one.
+
+GRAPHICAL_HIERARCHY    = YES
+
+# If the DIRECTORY_GRAPH, SHOW_DIRECTORIES and HAVE_DOT tags are set to YES 
+# then doxygen will show the dependencies a directory has on other directories 
+# in a graphical way. The dependency relations are determined by the #include 
+# relations between the files in the directories.
+
+DIRECTORY_GRAPH        = YES
+
+# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images 
+# generated by dot. Possible values are png, jpg, or gif 
+# If left blank png will be used.
+
+DOT_IMAGE_FORMAT       = png
+
+# The tag DOT_PATH can be used to specify the path where the dot tool can be 
+# found. If left blank, it is assumed the dot tool can be found in the path.
+
+DOT_PATH               = 
+
+# The DOTFILE_DIRS tag can be used to specify one or more directories that 
+# contain dot files that are included in the documentation (see the 
+# \dotfile command).
+
+DOTFILE_DIRS           = 
+
+# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of 
+# nodes that will be shown in the graph. If the number of nodes in a graph 
+# becomes larger than this value, doxygen will truncate the graph, which is 
+# visualized by representing a node as a red box. Note that if the 
+# number of direct children of the root node in a graph is already larger than 
+# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note 
+# that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
+
+DOT_GRAPH_MAX_NODES    = 50
+
+# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the 
+# graphs generated by dot. A depth value of 3 means that only nodes reachable 
+# from the root by following a path via at most 3 edges will be shown. Nodes 
+# that lie further from the root node will be omitted. Note that setting this 
+# option to 1 or 2 may greatly reduce the computation time needed for large 
+# code bases. Also note that the size of a graph can be further restricted by 
+# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
+
+MAX_DOT_GRAPH_DEPTH    = 0
+
+# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent 
+# background. This is disabled by default, because dot on Windows does not 
+# seem to support this out of the box. Warning: Depending on the platform used, 
+# enabling this option may lead to badly anti-aliased labels on the edges of 
+# a graph (i.e. they become hard to read).
+
+DOT_TRANSPARENT        = NO
+
+# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output 
+# files in one run (i.e. multiple -o and -T options on the command line). This 
+# makes dot run faster, but since only newer versions of dot (>1.8.10) 
+# support this, this feature is disabled by default.
+
+DOT_MULTI_TARGETS      = NO
+
+# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will 
+# generate a legend page explaining the meaning of the various boxes and 
+# arrows in the dot generated graphs.
+
+GENERATE_LEGEND        = YES
+
+# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will 
+# remove the intermediate dot files that are used to generate 
+# the various graphs.
+
+DOT_CLEANUP            = YES
diff --git a/docs/Makefile b/docs/Makefile

new file mode 100644 (file)

index 0000000..14e0fb1
--- /dev/null
+++ b/docs/Makefile
@@ -0,0 +1,168 @@
+# Makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS    =
+SPHINXBUILD   = sphinx-build
+PAPER         =
+BUILDDIR      = build
+SOURCEDIR        = source
+
+# Internal variables.
+PAPEROPT_a4     = -D latex_paper_size=a4
+PAPEROPT_letter = -D latex_paper_size=letter
+ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) $(SOURCEDIR) 
+# the i18n builder cannot share the environment and doctrees with the others
+I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) $(SOURCEDIR)
+
+.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man texinfo info changes linkcheck doctest gettext fig basefig
+
+help:
+       @echo "Please use \`make <target>' where <target> is one of"
+       @echo "  html       to make standalone HTML files"
+       @echo "  dirhtml    to make HTML files named index.html in directories"
+       @echo "  singlehtml to make a single large HTML file"
+       @echo "  pickle     to make pickle files"
+       @echo "  json       to make JSON files"
+       @echo "  htmlhelp   to make HTML files and a HTML help project"
+       @echo "  qthelp     to make HTML files and a qthelp project"
+       @echo "  devhelp    to make HTML files and a Devhelp project"
+       @echo "  epub       to make an epub"
+       @echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
+       @echo "  latexpdf   to make LaTeX files and run them through pdflatex"
+       @echo "  text       to make text files"
+       @echo "  man        to make manual pages"
+       @echo "  texinfo    to make Texinfo files"
+       @echo "  info       to make Texinfo files and run them through makeinfo"
+       @echo "  gettext    to make PO message catalogs"
+       @echo "  changes    to make an overview of all changed/added/deprecated items"
+       @echo "  linkcheck  to check all external links for integrity"
+       @echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
+
+clean:
+       -rm -rf $(BUILDDIR)/*
+
+html: basefig MANY_CLUSTER.png
+       $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
+       @echo
+       @echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
+
+dirhtml:
+       $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
+       @echo
+       @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
+
+singlehtml:
+       $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
+       @echo
+       @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
+
+pickle:
+       $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
+       @echo
+       @echo "Build finished; now you can process the pickle files."
+
+json:
+       $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
+       @echo
+       @echo "Build finished; now you can process the JSON files."
+
+htmlhelp:
+       $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
+       @echo
+       @echo "Build finished; now you can run HTML Help Workshop with the" \
+             ".hhp project file in $(BUILDDIR)/htmlhelp."
+
+qthelp:
+       $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
+       @echo
+       @echo "Build finished; now you can run "qcollectiongenerator" with the" \
+             ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
+       @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/pbtoolkits.qhcp"
+       @echo "To view the help file:"
+       @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/pbtoolkits.qhc"
+
+devhelp:
+       $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
+       @echo
+       @echo "Build finished."
+       @echo "To view the help file:"
+       @echo "# mkdir -p $$HOME/.local/share/devhelp/pbtoolkits"
+       @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/pbtoolkits"
+       @echo "# devhelp"
+
+epub:
+       $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
+       @echo
+       @echo "Build finished. The epub file is in $(BUILDDIR)/epub."
+
+latex:
+       $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+       @echo
+       @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
+       @echo "Run \`make' in that directory to run these through (pdf)latex" \
+             "(use \`make latexpdf' here to do that automatically)."
+
+latexpdf:
+       $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+       @echo "Running LaTeX files through pdflatex..."
+       $(MAKE) -C $(BUILDDIR)/latex all-pdf
+       @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
+
+text:
+       $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
+       @echo
+       @echo "Build finished. The text files are in $(BUILDDIR)/text."
+
+man:
+       $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
+       @echo
+       @echo "Build finished. The manual pages are in $(BUILDDIR)/man."
+
+texinfo:
+       $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+       @echo
+       @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
+       @echo "Run \`make' in that directory to run these through makeinfo" \
+             "(use \`make info' here to do that automatically)."
+
+info:
+       $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+       @echo "Running Texinfo files through makeinfo..."
+       $(MAKE) -C $(BUILDDIR)/texinfo info
+       @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
+
+gettext:
+       $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
+       @echo
+       @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
+
+changes:
+       $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
+       @echo
+       @echo "The overview file is in $(BUILDDIR)/changes."
+
+linkcheck:
+       $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
+       @echo
+       @echo "Link check complete; look for any errors in the above output " \
+             "or in $(BUILDDIR)/linkcheck/output.txt."
+
+doctest:
+       $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
+       @echo "Testing of doctests in the sources finished, look at the " \
+             "results in $(BUILDDIR)/doctest/output.txt."
+
+basefig:
+       dot -Tpng $(SOURCEDIR)/dependencies.dot > $(SOURCEDIR)/$@
+       grep -v "\"pbsmrtpipe\" ->" $(SOURCEDIR)/dependencies.dot  \
+               | grep -v "> \"pbcore\"" \
+               | sed 's/All/Sparse/' > $(SOURCEDIR)/sparse_dependencies.dot  
+       dot -Tpng $(SOURCEDIR)/sparse_dependencies.dot \
+               > $(SOURCEDIR)/sparse_dependencies.png
+
+%.png: basefig
+       grep -v $* $(SOURCEDIR)/sparse_dependencies.dot | \
+       grep -v \? | sed 's/Sparse dependencies/Module bundles/' | \
+       dot -Tpng > $(SOURCEDIR)/$@
+
diff --git a/docs/examples/code/BarcodeQuery.txt b/docs/examples/code/BarcodeQuery.txt

new file mode 100644 (file)

index 0000000..3fe8fce
--- /dev/null
+++ b/docs/examples/code/BarcodeQuery.txt
@@ -0,0 +1,17 @@
+// using C++11 range-based for loop
+BarcodeQuery query(42, dataset);
+for (const BamRecord& r : query) {
+    assert(r.HasBarcodes());
+    assert(r.BarcodeForward() == 42 || r.barcodeReverse() == 42);
+}
+
+// OR
+
+// using iterators directly
+BarcodeQuery query(42, dataset);
+auto iter = query.cbegin();
+auto end  = query.cend();
+for (; iter != end; ++iter) {
+    assert(iter->HasBarcodes());
+    assert(iter->BarcodeForward() == 42 || iter->barcodeReverse() == 42);
+} 
diff --git a/docs/examples/code/Compare.txt b/docs/examples/code/Compare.txt

new file mode 100644 (file)

index 0000000..deecd8d
--- /dev/null
+++ b/docs/examples/code/Compare.txt
@@ -0,0 +1,3 @@
+// sort on increasing ZMW hole number
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::Zmw());
diff --git a/docs/examples/code/Compare_AlignedEnd.txt b/docs/examples/code/Compare_AlignedEnd.txt

new file mode 100644 (file)

index 0000000..d34ed67
--- /dev/null
+++ b/docs/examples/code/Compare_AlignedEnd.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::AlignedEnd());
diff --git a/docs/examples/code/Compare_AlignedStart.txt b/docs/examples/code/Compare_AlignedStart.txt

new file mode 100644 (file)

index 0000000..68de3e2
--- /dev/null
+++ b/docs/examples/code/Compare_AlignedStart.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::AlignedStart());
diff --git a/docs/examples/code/Compare_AlignedStrand.txt b/docs/examples/code/Compare_AlignedStrand.txt

new file mode 100644 (file)

index 0000000..6c22cdc
--- /dev/null
+++ b/docs/examples/code/Compare_AlignedStrand.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::AlignedStrand());
diff --git a/docs/examples/code/Compare_BarcodeForward.txt b/docs/examples/code/Compare_BarcodeForward.txt

new file mode 100644 (file)

index 0000000..1967341
--- /dev/null
+++ b/docs/examples/code/Compare_BarcodeForward.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::BarcodeForward());
diff --git a/docs/examples/code/Compare_BarcodeQuality.txt b/docs/examples/code/Compare_BarcodeQuality.txt

new file mode 100644 (file)

index 0000000..144f483
--- /dev/null
+++ b/docs/examples/code/Compare_BarcodeQuality.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::BarcodeQuality());
diff --git a/docs/examples/code/Compare_BarcodeReverse.txt b/docs/examples/code/Compare_BarcodeReverse.txt

new file mode 100644 (file)

index 0000000..9d3b245
--- /dev/null
+++ b/docs/examples/code/Compare_BarcodeReverse.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::BarcodeReverse());
diff --git a/docs/examples/code/Compare_FullName.txt b/docs/examples/code/Compare_FullName.txt

new file mode 100644 (file)

index 0000000..4b392b9
--- /dev/null
+++ b/docs/examples/code/Compare_FullName.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::FullName());
diff --git a/docs/examples/code/Compare_LocalContextFlag.txt b/docs/examples/code/Compare_LocalContextFlag.txt

new file mode 100644 (file)

index 0000000..aeab944
--- /dev/null
+++ b/docs/examples/code/Compare_LocalContextFlag.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::LocalContextFlag());
diff --git a/docs/examples/code/Compare_MapQuality.txt b/docs/examples/code/Compare_MapQuality.txt

new file mode 100644 (file)

index 0000000..fe22821
--- /dev/null
+++ b/docs/examples/code/Compare_MapQuality.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::MapQuality());
diff --git a/docs/examples/code/Compare_MovieName.txt b/docs/examples/code/Compare_MovieName.txt

new file mode 100644 (file)

index 0000000..cddcb64
--- /dev/null
+++ b/docs/examples/code/Compare_MovieName.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::MovieName());
diff --git a/docs/examples/code/Compare_NumDeletedBases.txt b/docs/examples/code/Compare_NumDeletedBases.txt

new file mode 100644 (file)

index 0000000..aa6dd4b
--- /dev/null
+++ b/docs/examples/code/Compare_NumDeletedBases.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::NumDeletedBases());
diff --git a/docs/examples/code/Compare_NumInsertedBases.txt b/docs/examples/code/Compare_NumInsertedBases.txt

new file mode 100644 (file)

index 0000000..917d87f
--- /dev/null
+++ b/docs/examples/code/Compare_NumInsertedBases.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::NumInsertedBases());
diff --git a/docs/examples/code/Compare_NumMatches.txt b/docs/examples/code/Compare_NumMatches.txt

new file mode 100644 (file)

index 0000000..47e3081
--- /dev/null
+++ b/docs/examples/code/Compare_NumMatches.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::NumMatches());
diff --git a/docs/examples/code/Compare_NumMismatches.txt b/docs/examples/code/Compare_NumMismatches.txt

new file mode 100644 (file)

index 0000000..12affb1
--- /dev/null
+++ b/docs/examples/code/Compare_NumMismatches.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::NumMismatches());
diff --git a/docs/examples/code/Compare_QueryEnd.txt b/docs/examples/code/Compare_QueryEnd.txt

new file mode 100644 (file)

index 0000000..d664d28
--- /dev/null
+++ b/docs/examples/code/Compare_QueryEnd.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::QueryEnd());
diff --git a/docs/examples/code/Compare_QueryStart.txt b/docs/examples/code/Compare_QueryStart.txt

new file mode 100644 (file)

index 0000000..12f6244
--- /dev/null
+++ b/docs/examples/code/Compare_QueryStart.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::QueryStart());
diff --git a/docs/examples/code/Compare_ReadAccuracy.txt b/docs/examples/code/Compare_ReadAccuracy.txt

new file mode 100644 (file)

index 0000000..9454309
--- /dev/null
+++ b/docs/examples/code/Compare_ReadAccuracy.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::ReadAccuracy());
diff --git a/docs/examples/code/Compare_ReadGroupId.txt b/docs/examples/code/Compare_ReadGroupId.txt

new file mode 100644 (file)

index 0000000..dab3497
--- /dev/null
+++ b/docs/examples/code/Compare_ReadGroupId.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::ReadGroupId());
diff --git a/docs/examples/code/Compare_ReadGroupNumericId.txt b/docs/examples/code/Compare_ReadGroupNumericId.txt

new file mode 100644 (file)

index 0000000..5ad8f9d
--- /dev/null
+++ b/docs/examples/code/Compare_ReadGroupNumericId.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::ReadGroupNumericId());
diff --git a/docs/examples/code/Compare_ReferenceEnd.txt b/docs/examples/code/Compare_ReferenceEnd.txt

new file mode 100644 (file)

index 0000000..ed42d05
--- /dev/null
+++ b/docs/examples/code/Compare_ReferenceEnd.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::ReferenceEnd());
diff --git a/docs/examples/code/Compare_ReferenceId.txt b/docs/examples/code/Compare_ReferenceId.txt

new file mode 100644 (file)

index 0000000..5628427
--- /dev/null
+++ b/docs/examples/code/Compare_ReferenceId.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::ReferenceId());
diff --git a/docs/examples/code/Compare_ReferenceName.txt b/docs/examples/code/Compare_ReferenceName.txt

new file mode 100644 (file)

index 0000000..1f76e7e
--- /dev/null
+++ b/docs/examples/code/Compare_ReferenceName.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::ReferenceName());
diff --git a/docs/examples/code/Compare_ReferenceStart.txt b/docs/examples/code/Compare_ReferenceStart.txt

new file mode 100644 (file)

index 0000000..0ccaf36
--- /dev/null
+++ b/docs/examples/code/Compare_ReferenceStart.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::ReferenceStart());
diff --git a/docs/examples/code/Compare_TypeFromOperator.txt b/docs/examples/code/Compare_TypeFromOperator.txt

new file mode 100644 (file)

index 0000000..afb0848
--- /dev/null
+++ b/docs/examples/code/Compare_TypeFromOperator.txt
@@ -0,0 +1,2 @@
+Compare::Type type = Compare::TypeFromOperator("!=");
+assert(type == Compare::NOT_EQUAL);
diff --git a/docs/examples/code/Compare_TypeToName.txt b/docs/examples/code/Compare_TypeToName.txt

new file mode 100644 (file)

index 0000000..c44e1cb
--- /dev/null
+++ b/docs/examples/code/Compare_TypeToName.txt
@@ -0,0 +1,2 @@
+string name = Compare::TypeToName(Compare::LESS_THAN);  // enum value -> printable name
+assert(name == "Compare::LESS_THAN");
diff --git a/docs/examples/code/Compare_Zmw.txt b/docs/examples/code/Compare_Zmw.txt

new file mode 100644 (file)

index 0000000..b02c426
--- /dev/null
+++ b/docs/examples/code/Compare_Zmw.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::Zmw());
diff --git a/docs/examples/code/EntireFileQuery.txt b/docs/examples/code/EntireFileQuery.txt

new file mode 100644 (file)

index 0000000..d3fcc2c
--- /dev/null
+++ b/docs/examples/code/EntireFileQuery.txt
@@ -0,0 +1,15 @@
+// using C++11 range-based for loop
+EntireFileQuery query(dataset);
+for (const BamRecord& record : query) {
+    // ... do stuff ...
+}
+
+// OR
+
+// using iterators
+EntireFileQuery query(dataset);
+auto iter = query.cbegin();
+auto end  = query.cend();
+for (; iter != end; ++iter) {
+    // ... do stuff ...
+}  
diff --git a/docs/examples/code/EntireFileQuery_BamFilename.txt b/docs/examples/code/EntireFileQuery_BamFilename.txt

new file mode 100644 (file)

index 0000000..484db61
--- /dev/null
+++ b/docs/examples/code/EntireFileQuery_BamFilename.txt
@@ -0,0 +1,4 @@
+EntireFileQuery query("foo.bam");
+for (const BamRecord& record : query) {
+    // do stuff
+}
diff --git a/docs/examples/code/EntireFileQuery_NonConst.txt b/docs/examples/code/EntireFileQuery_NonConst.txt

new file mode 100644 (file)

index 0000000..a0a092e
--- /dev/null
+++ b/docs/examples/code/EntireFileQuery_NonConst.txt
@@ -0,0 +1,4 @@
+EntireFileQuery query("foo.bam");
+for (BamRecord& record : query) {
+    // ok to modify 'record' here
+} 
diff --git a/docs/examples/code/GenomicIntervalQuery.txt b/docs/examples/code/GenomicIntervalQuery.txt

new file mode 100644 (file)

index 0000000..651f254
--- /dev/null
+++ b/docs/examples/code/GenomicIntervalQuery.txt
@@ -0,0 +1,16 @@
+// using C++11 range-based for loop
+GenomicIntervalQuery query(GenomicInterval("chr1:1000-2000"), dataset);
+for (const BamRecord& record : query) {
+    // ... do stuff ...
+}
+
+// OR
+
+// using iterators directly
+GenomicIntervalQuery query(GenomicInterval("chr1:1000-2000"), dataset);
+auto iter = query.cbegin();
+auto end  = query.cend();
+for (; iter != end; ++iter) {
+    // ... do stuff ...
+}
+
diff --git a/docs/examples/code/GenomicIntervalQuery_Reuse.txt b/docs/examples/code/GenomicIntervalQuery_Reuse.txt

new file mode 100644 (file)

index 0000000..339ae95
--- /dev/null
+++ b/docs/examples/code/GenomicIntervalQuery_Reuse.txt
@@ -0,0 +1,8 @@
+DataSet ds("data.xml");
+GenomicIntervalQuery query(GenomicInterval(), ds);  // one query object, reused below
+for (const GenomicInterval& interval : intervals) {
+    query.Interval(interval);  // set the interval before iterating the same query again
+    for (const BamRecord& record : query) {
+        // do stuff
+    }
+}
+\ No newline at end of file
diff --git a/docs/examples/code/PbiAlignedEndFilter.txt b/docs/examples/code/PbiAlignedEndFilter.txt

new file mode 100644 (file)

index 0000000..bac1a46
--- /dev/null
+++ b/docs/examples/code/PbiAlignedEndFilter.txt
@@ -0,0 +1,4 @@
+PbiFilterQuery query(PbiAlignedEndFilter{3000, Compare::GREATER_THAN});
+for (const BamRecord& record : query) {
+    assert(record.AlignedEnd() > 3000);
+}
diff --git a/docs/examples/code/PbiAlignedLengthFilter.txt b/docs/examples/code/PbiAlignedLengthFilter.txt

new file mode 100644 (file)

index 0000000..38dc3ff
--- /dev/null
+++ b/docs/examples/code/PbiAlignedLengthFilter.txt
@@ -0,0 +1,4 @@
+PbiFilterQuery query(PbiAlignedLengthFilter{1000, Compare::GREATER_THAN});
+for (const BamRecord& record : query) {
+    assert((record.AlignedEnd() - record.AlignedStart()) > 1000);
+}
diff --git a/docs/examples/code/PbiAlignedStartFilter.txt b/docs/examples/code/PbiAlignedStartFilter.txt

new file mode 100644 (file)

index 0000000..b78bb2c
--- /dev/null
+++ b/docs/examples/code/PbiAlignedStartFilter.txt
@@ -0,0 +1,4 @@
+PbiFilterQuery query(PbiAlignedStartFilter{3000, Compare::GREATER_THAN});
+for (const BamRecord& record : query) {
+    assert(record.AlignedStart() > 3000);
+}
diff --git a/docs/examples/code/PbiAlignedStrandFilter.txt b/docs/examples/code/PbiAlignedStrandFilter.txt

new file mode 100644 (file)

index 0000000..9f9a885
--- /dev/null
+++ b/docs/examples/code/PbiAlignedStrandFilter.txt
@@ -0,0 +1,5 @@
+PbiFilterQuery query(PbiAlignedStrandFilter{Strand::FORWARD});
+for (const BamRecord& record : query) {
+    assert(record.AlignedStrand() == Strand::FORWARD);
+}
+
diff --git a/docs/examples/code/PbiBarcodeFilter.txt b/docs/examples/code/PbiBarcodeFilter.txt

new file mode 100644 (file)

index 0000000..c7ce5cb
--- /dev/null
+++ b/docs/examples/code/PbiBarcodeFilter.txt
@@ -0,0 +1,17 @@
+// single value
+PbiFilter filter{ PbiBarcodeFilter{17} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    const auto barcodes = record.Barcodes();
+    assert(barcodes.first == 17 || barcodes.second == 17);
+}
+
+// whitelist
+vector<int16_t> whitelist = { 50, 100 };
+PbiFilter filter{ PbiBarcodeFilter{whitelist} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    const auto barcodes = record.Barcodes();
+    assert(barcodes.first == 50  || barcodes.second == 50 ||
+           barcodes.first == 100 || barcodes.second == 100);
+}
diff --git a/docs/examples/code/PbiBarcodeForwardFilter.txt b/docs/examples/code/PbiBarcodeForwardFilter.txt

new file mode 100644 (file)

index 0000000..a6c12fd
--- /dev/null
+++ b/docs/examples/code/PbiBarcodeForwardFilter.txt
@@ -0,0 +1,15 @@
+// single value
+PbiFilter filter{ PbiBarcodeForwardFilter{50} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.BarcodeForward() == 50);
+}
+
+// whitelist
+vector<int16_t> whitelist = { 50, 100 };
+PbiFilter filter{ PbiBarcodeForwardFilter{whitelist} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.BarcodeForward() == 50 || record.BarcodeForward() == 100);
+}
+
diff --git a/docs/examples/code/PbiBarcodeQualityFilter.txt b/docs/examples/code/PbiBarcodeQualityFilter.txt

new file mode 100644 (file)

index 0000000..34311d0
--- /dev/null
+++ b/docs/examples/code/PbiBarcodeQualityFilter.txt
@@ -0,0 +1,5 @@
+PbiFilter filter{ PbiBarcodeQualityFilter{42, Compare::GREATER_THAN_EQUAL} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.BarcodeQuality() >= 42);
+}
diff --git a/docs/examples/code/PbiBarcodeReverseFilter.txt b/docs/examples/code/PbiBarcodeReverseFilter.txt

new file mode 100644 (file)

index 0000000..24134f8
--- /dev/null
+++ b/docs/examples/code/PbiBarcodeReverseFilter.txt
@@ -0,0 +1,15 @@
+// single value
+PbiFilter filter{ PbiBarcodeReverseFilter{50} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.BarcodeReverse() == 50);
+}
+
+// whitelist
+vector<int16_t> whitelist = { 50, 100 };
+PbiFilter filter{ PbiBarcodeReverseFilter{whitelist} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.BarcodeReverse() == 50 || record.BarcodeReverse() == 100);
+}
+
diff --git a/docs/examples/code/PbiBarcodesFilter.txt b/docs/examples/code/PbiBarcodesFilter.txt

new file mode 100644 (file)

index 0000000..a655c57
--- /dev/null
+++ b/docs/examples/code/PbiBarcodesFilter.txt
@@ -0,0 +1,6 @@
+PbiFilter filter{ PbiBarcodesFilter{17, 18} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.BarcodeForward() == 17 && 
+           record.BarcodeReverse() == 18);
+}
diff --git a/docs/examples/code/PbiBuilder_WithReader.txt b/docs/examples/code/PbiBuilder_WithReader.txt

new file mode 100644 (file)

index 0000000..e2748c2
--- /dev/null
+++ b/docs/examples/code/PbiBuilder_WithReader.txt
@@ -0,0 +1,30 @@
+// To simply create a PBI file from BAM, the following is the easiest method:
+//
+#include <pbbam/BamFile.h>
+#include <pbbam/PbiFile.h>
+
+BamFile bamFile("data.bam");
+PbiFile::CreateFrom(bamFile);
+
+
+// However if you need to perform additional operations while reading the BAM file, 
+// you can do something like the following:
+//
+{
+    BamFile bamFile("data.bam");
+    PbiBuilder builder(bamFile.PacBioIndexFilename(), 
+                       bamFile.Header().Sequences().size());
+    BamReader reader(bamFile);
+    BamRecord b;
+    int64_t offset = reader.VirtualTell(); // first record's vOffset
+    while (reader.GetNext(b)) {
+
+        // store PBI record entry & get next record's vOffset
+        builder.AddRecord(b, offset);
+        offset = reader.VirtualTell();
+   
+        // ... additional stuff as needed ...
+    }
+
+} // <-- PBI data will only be written here, as PbiBuilder goes out of scope
+
diff --git a/docs/examples/code/PbiBuilder_WithWriter.txt b/docs/examples/code/PbiBuilder_WithWriter.txt

new file mode 100644 (file)

index 0000000..0c7d6d1
--- /dev/null
+++ b/docs/examples/code/PbiBuilder_WithWriter.txt
@@ -0,0 +1,12 @@
+BamWriter writer(...);
+PbiBuilder pbiBuilder(...);
+int64_t vOffset;
+BamRecord record;
+while (...) {
+
+    // ... populate record data ...
+
+    // write record to BAM and add PBI entry
+    writer.Write(record, &vOffset);
+    pbiBuilder.AddRecord(record, vOffset);
+}
diff --git a/docs/examples/code/PbiFilterQuery.txt b/docs/examples/code/PbiFilterQuery.txt

new file mode 100644 (file)

index 0000000..4914eab
--- /dev/null
+++ b/docs/examples/code/PbiFilterQuery.txt
@@ -0,0 +1,22 @@
+// setup filter
+PbiFilter filter;
+filter.Add(PbiZmwFilter(42));
+filter.Add(PbiReadAccuracyFilter(0.9, Compare::GREATER_THAN_EQUAL));
+
+// using C++11 range-based for loop
+PbiFilterQuery query(filter, dataset);
+for (const BamRecord& r : query) {
+    assert(r.HoleNumber() == 42);
+    assert(r.ReadAccuracy() >= 0.9);
+}
+
+// OR
+
+// using iterators directly
+PbiFilterQuery query(filter, dataset);
+auto iter = query.cbegin();
+auto end  = query.cend();
+for (; iter != end; ++iter) {
+    assert(iter->HoleNumber() == 42);
+    assert(iter->ReadAccuracy() >= 0.9);
+} 
diff --git a/docs/examples/code/PbiFilter_Composition.txt b/docs/examples/code/PbiFilter_Composition.txt

new file mode 100644 (file)

index 0000000..22cc6ff
--- /dev/null
+++ b/docs/examples/code/PbiFilter_Composition.txt
@@ -0,0 +1,8 @@
+// (f1 && f2) || f3
+
+PbiFilter f1;
+PbiFilter f2;
+PbiFilter intersect_f1_f2 = PbiFilter::Intersection(f1, f2);
+
+PbiFilter f3;
+PbiFilter final = PbiFilter::Union(intersect_f1_f2, f3);
diff --git a/docs/examples/code/PbiFilter_CustomFilter.txt b/docs/examples/code/PbiFilter_CustomFilter.txt

new file mode 100644 (file)

index 0000000..f9cdd21
--- /dev/null
+++ b/docs/examples/code/PbiFilter_CustomFilter.txt
@@ -0,0 +1,21 @@
+struct MyCustomFilter
+{
+    bool Accepts(const PbiRawData& index, const size_t row) const
+    {
+        // Look up data for record at the provided row. Do any calculations
+        // necessary, then return whether that record passes your 
+        // filter criteria. 
+        
+        return true;
+    }
+};
+
+// use in composite filters
+PbiFilter f;
+f.Add(PbiMovieNameFilter("foo"));
+f.Add(MyCustomFilter());
+
+// pass directly to PbiFilterQuery
+PbiFilterQuery query(MyCustomFilter(), "foo.bam");
+for (const BamRecord& record : query)
+    // ... do stuff ...
diff --git a/docs/examples/code/PbiFilter_Interface.txt b/docs/examples/code/PbiFilter_Interface.txt

new file mode 100644 (file)

index 0000000..0fea900
--- /dev/null
+++ b/docs/examples/code/PbiFilter_Interface.txt
@@ -0,0 +1 @@
+bool Accepts(const PbiRawData& index, const size_t row) const;
diff --git a/docs/examples/code/PbiIdentityFilter.txt b/docs/examples/code/PbiIdentityFilter.txt

new file mode 100644 (file)

index 0000000..6fcb8d0
--- /dev/null
+++ b/docs/examples/code/PbiIdentityFilter.txt
@@ -0,0 +1,6 @@
+// single value
+PbiFilter filter{ PbiIdentityFilter{ 0.5, Compare::GREATER_THAN_EQUAL } };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    // ... at least 50% of record was aligned ...
+}
diff --git a/docs/examples/code/PbiLocalContextFilter.txt b/docs/examples/code/PbiLocalContextFilter.txt

new file mode 100644 (file)

index 0000000..0aaa3eb
--- /dev/null
+++ b/docs/examples/code/PbiLocalContextFilter.txt
@@ -0,0 +1,22 @@
+
+// --------------------
+// has adapter_before
+// --------------------
+
+PbiFilter filter{ PbiLocalContextFilter{LocalContextFlags::ADAPTER_BEFORE, Compare::CONTAINS} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    const bool hasAdapterBefore = (record.LocalContextFlags() & LocalContextFlags::ADAPTER_BEFORE) != 0;
+    assert(hasAdapterBefore);
+}
+
+// ----------------------------------
+// has any adapters, barcodes, etc.
+// ----------------------------------
+
+PbiFilter filter{ PbiLocalContextFilter{LocalContextFlags::NO_LOCAL_CONTEXT, Compare::NOT_EQUAL} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    const bool hasContext = (record.LocalContextFlags() != LocalContextFlags::NO_LOCAL_CONTEXT);
+    assert(hasContext);
+}
diff --git a/docs/examples/code/PbiMapQualityFilter.txt b/docs/examples/code/PbiMapQualityFilter.txt

new file mode 100644 (file)

index 0000000..67fb5dc
--- /dev/null
+++ b/docs/examples/code/PbiMapQualityFilter.txt
@@ -0,0 +1,5 @@
+PbiFilter filter{ PbiMapQualityFilter{75, Compare::GREATER_THAN_EQUAL} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.MapQuality() >= 75);
+} 
diff --git a/docs/examples/code/PbiMovieNameFilter.txt b/docs/examples/code/PbiMovieNameFilter.txt

new file mode 100644 (file)

index 0000000..dd124e2
--- /dev/null
+++ b/docs/examples/code/PbiMovieNameFilter.txt
@@ -0,0 +1,14 @@
+// single value
+PbiFilter filter{ PbiMovieNameFilter{ "foo" } };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.MovieName() == "foo");
+}
+
+// whitelist
+vector<string> whitelist = { "foo", "bar" };
+PbiFilter filter{ PbiMovieNameFilter{whitelist} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.MovieName() == "foo" || record.MovieName() == "bar");
+}
diff --git a/docs/examples/code/PbiNumDeletedBasesFilter.txt b/docs/examples/code/PbiNumDeletedBasesFilter.txt

new file mode 100644 (file)

index 0000000..e1e3d1f
--- /dev/null
+++ b/docs/examples/code/PbiNumDeletedBasesFilter.txt
@@ -0,0 +1,6 @@
+PbiFilter filter{ PbiNumDeletedBasesFilter{50, Compare::LESS_THAN} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.NumDeletedBases() < 50);
+}
+
diff --git a/docs/examples/code/PbiNumInsertedBasesFilter.txt b/docs/examples/code/PbiNumInsertedBasesFilter.txt

new file mode 100644 (file)

index 0000000..ab385e4
--- /dev/null
+++ b/docs/examples/code/PbiNumInsertedBasesFilter.txt
@@ -0,0 +1,6 @@
+PbiFilter filter{ PbiNumInsertedBasesFilter{50, Compare::LESS_THAN} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.NumInsertedBases() < 50);
+}
+
diff --git a/docs/examples/code/PbiNumMatchesFilter.txt b/docs/examples/code/PbiNumMatchesFilter.txt

new file mode 100644 (file)

index 0000000..4e1b97d
--- /dev/null
+++ b/docs/examples/code/PbiNumMatchesFilter.txt
@@ -0,0 +1,6 @@
+PbiFilter filter{ PbiNumMatchesFilter{2000, Compare::GREATER_THAN_EQUAL} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.NumMatches() >= 2000);
+}
+
diff --git a/docs/examples/code/PbiNumMismatchesFilter.txt b/docs/examples/code/PbiNumMismatchesFilter.txt

new file mode 100644 (file)

index 0000000..690e4a1
--- /dev/null
+++ b/docs/examples/code/PbiNumMismatchesFilter.txt
@@ -0,0 +1,6 @@
+PbiFilter filter{ PbiNumMismatchesFilter{500, Compare::LESS_THAN} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.NumMismatches() < 500);
+}
+
diff --git a/docs/examples/code/PbiQueryEndFilter.txt b/docs/examples/code/PbiQueryEndFilter.txt

new file mode 100644 (file)

index 0000000..f85166b
--- /dev/null
+++ b/docs/examples/code/PbiQueryEndFilter.txt
@@ -0,0 +1,5 @@
+PbiFilter filter{ PbiQueryEndFilter{3000, Compare::GREATER_THAN} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.QueryEnd() > 3000);
+} 
diff --git a/docs/examples/code/PbiQueryLengthFilter.txt b/docs/examples/code/PbiQueryLengthFilter.txt

new file mode 100644 (file)

index 0000000..123412a
--- /dev/null
+++ b/docs/examples/code/PbiQueryLengthFilter.txt
@@ -0,0 +1,5 @@
+PbiFilter filter{ PbiQueryLengthFilter{2000, Compare::GREATER_THAN} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert( (record.QueryEnd() - record.QueryStart()) > 2000 );
+}
diff --git a/docs/examples/code/PbiQueryNameFilter.txt b/docs/examples/code/PbiQueryNameFilter.txt

new file mode 100644 (file)

index 0000000..f1e51c7
--- /dev/null
+++ b/docs/examples/code/PbiQueryNameFilter.txt
@@ -0,0 +1,15 @@
+// single value
+PbiFilter filter{ PbiQueryNameFilter{ "movie_1/42/100_200" } };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.FullName() == "movie_1/42/100_200");
+}
+
+// whitelist
+vector<string> whitelist = { "movie_1/42/100_200", "movie_3/24/300_500" };
+PbiFilter filter{ PbiQueryNameFilter{whitelist} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.FullName() == "movie_1/42/100_200" || 
+           record.FullName() == "movie_3/24/300_500");
+}
diff --git a/docs/examples/code/PbiQueryStartFilter.txt b/docs/examples/code/PbiQueryStartFilter.txt

new file mode 100644 (file)

index 0000000..56353df
--- /dev/null
+++ b/docs/examples/code/PbiQueryStartFilter.txt
@@ -0,0 +1,5 @@
+PbiFilter filter{ PbiQueryStartFilter{3000, Compare::GREATER_THAN} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.QueryStart() > 3000);
+} 
diff --git a/docs/examples/code/PbiReadAccuracyFilter.txt b/docs/examples/code/PbiReadAccuracyFilter.txt

new file mode 100644 (file)

index 0000000..dd2df32
--- /dev/null
+++ b/docs/examples/code/PbiReadAccuracyFilter.txt
@@ -0,0 +1,5 @@
+PbiFilter filter{ PbiReadAccuracyFilter{0.8, Compare::GREATER_THAN_EQUAL} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.ReadAccuracy() >= 0.8);
+}
diff --git a/docs/examples/code/PbiReadGroupFilter.txt b/docs/examples/code/PbiReadGroupFilter.txt

new file mode 100644 (file)

index 0000000..9af096d
--- /dev/null
+++ b/docs/examples/code/PbiReadGroupFilter.txt
@@ -0,0 +1,64 @@
+// -------------------------
+// numeric ID
+// -------------------------
+
+// single value
+PbiFilter filter{ PbiReadGroupFilter{ 2458765 } };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.ReadGroupNumericId() == 2458765);
+}
+
+// whitelist
+vector<int32_t> whitelist = { 2458765, -32143 };
+PbiFilter filter{ PbiReadGroupFilter{whitelist} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.ReadGroupNumericId() == 2458765 ||
+           record.ReadGroupNumericId() == -32143);
+}
+
+// -------------------------
+// printable ID
+// -------------------------
+
+// single value 
+PbiFilter filter{ PbiReadGroupFilter{ "12B33F00" } };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.ReadGroupId() == "12B33F00");
+}
+
+// whitelist
+vector<string> whitelist = { "12B33F00", "123ABC77" };
+PbiFilter filter{ PbiReadGroupFilter{whitelist} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.ReadGroupId() == "12B33F00" ||
+           record.ReadGroupId() == "123ABC77");
+}
+
+
+// -------------------------
+// read group 
+// -------------------------
+
+BamFile file("foo.bam");
+BamHeader header = file.Header();
+assert(header.ReadGroups().size() > 1);
+
+// single value 
+PbiFilter filter{ PbiReadGroupFilter{ header.ReadGroups()[0] } };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.ReadGroup() == header.ReadGroups()[0]);
+}
+
+// whitelist
+vector<ReadGroupInfo> whitelist = { header.ReadGroups()[0], header.ReadGroups()[1] };
+PbiFilter filter{ PbiReadGroupFilter{whitelist} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.ReadGroup() == header.ReadGroups()[0] ||
+           record.ReadGroup() == header.ReadGroups()[1]);
+}
diff --git a/docs/examples/code/PbiReferenceEndFilter.txt b/docs/examples/code/PbiReferenceEndFilter.txt

new file mode 100644 (file)

index 0000000..ce005c6
--- /dev/null
+++ b/docs/examples/code/PbiReferenceEndFilter.txt
@@ -0,0 +1,5 @@
+PbiFilter filter{ PbiReferenceEndFilter{ 2000 } };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.ReferenceEnd() == 2000);
+}
diff --git a/docs/examples/code/PbiReferenceIdFilter.txt b/docs/examples/code/PbiReferenceIdFilter.txt

new file mode 100644 (file)

index 0000000..d963d28
--- /dev/null
+++ b/docs/examples/code/PbiReferenceIdFilter.txt
@@ -0,0 +1,16 @@
+// single value
+PbiFilter filter{ PbiReferenceIdFilter{ 4 } };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.ReferenceId() == 4);
+}
+
+// whitelist
+vector<int32_t> whitelist = { 0, 1 };
+PbiFilter filter{ PbiReferenceIdFilter{whitelist} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.ReferenceId() == 0 || 
+           record.ReferenceId() == 1);
+}
+
diff --git a/docs/examples/code/PbiReferenceNameFilter.txt b/docs/examples/code/PbiReferenceNameFilter.txt

new file mode 100644 (file)

index 0000000..c86b14a
--- /dev/null
+++ b/docs/examples/code/PbiReferenceNameFilter.txt
@@ -0,0 +1,15 @@
+// single value
+PbiFilter filter{ PbiReferenceNameFilter{ "chr1" } };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.ReferenceName() == "chr1");
+}
+
+// whitelist
+vector<string> whitelist = { "chr1", "chr5" };
+PbiFilter filter{ PbiReferenceNameFilter{whitelist} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.ReferenceName() == "chr1" ||
+           record.ReferenceName() == "chr5");
+}
diff --git a/docs/examples/code/PbiReferenceStartFilter.txt b/docs/examples/code/PbiReferenceStartFilter.txt

new file mode 100644 (file)

index 0000000..d3ffdbb
--- /dev/null
+++ b/docs/examples/code/PbiReferenceStartFilter.txt
@@ -0,0 +1,5 @@
+PbiFilter filter{ PbiReferenceStartFilter{ 2000 } };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.ReferenceStart() == 2000);
+}
diff --git a/docs/examples/code/PbiZmwFilter.txt b/docs/examples/code/PbiZmwFilter.txt

new file mode 100644 (file)

index 0000000..c63a804
--- /dev/null
+++ b/docs/examples/code/PbiZmwFilter.txt
@@ -0,0 +1,16 @@
+// single value
+PbiFilter filter{ PbiZmwFilter{ 4000 } };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.HoleNumber() == 4000);
+}
+
+// whitelist
+vector<int32_t> whitelist = { 4000, 8000 };
+PbiFilter filter{ PbiZmwFilter{whitelist} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.HoleNumber() == 4000 || 
+           record.HoleNumber() == 8000);
+}
+
diff --git a/docs/examples/code/ReadAccuracyQuery.txt b/docs/examples/code/ReadAccuracyQuery.txt

new file mode 100644 (file)

index 0000000..5b0404f
--- /dev/null
+++ b/docs/examples/code/ReadAccuracyQuery.txt
@@ -0,0 +1,15 @@
+// using C++11 range-based for loop
+ReadAccuracyQuery query(0.9, Compare::GREATER_THAN_EQUAL, dataset);
+for (const BamRecord& r : query) {
+    assert(r.ReadAccuracy() >= 0.9);
+}
+
+// OR
+
+// using iterators directly
+ReadAccuracyQuery query(0.9, Compare::GREATER_THAN_EQUAL, dataset);
+auto iter = query.cbegin();
+auto end  = query.cend();
+for (; iter != end; ++iter) {
+    assert(iter->ReadAccuracy() >= 0.9);
+} 
diff --git a/docs/examples/code/SubreadLengthQuery.txt b/docs/examples/code/SubreadLengthQuery.txt

new file mode 100644 (file)

index 0000000..466a1d9
--- /dev/null
+++ b/docs/examples/code/SubreadLengthQuery.txt
@@ -0,0 +1,15 @@
+// using C++11 range-based for loop
+SubreadLengthQuery query(500, Compare::GREATER_THAN_EQUAL, dataset);
+for (const BamRecord& r : query) {
+    assert((r.QueryEnd() - r.QueryStart()) >= 500);  
+}
+
+// OR
+
+// using iterators directly
+SubreadLengthQuery query(500, Compare::GREATER_THAN_EQUAL, dataset);
+auto iter = query.cbegin();
+auto end  = query.cend();
+for (; iter != end; ++iter) {
+    assert((iter->QueryEnd() - iter->QueryStart()) >= 500);
+} 
diff --git a/docs/examples/code/Tag_AsciiCtor.txt b/docs/examples/code/Tag_AsciiCtor.txt

new file mode 100644 (file)

index 0000000..057d22f
--- /dev/null
+++ b/docs/examples/code/Tag_AsciiCtor.txt
@@ -0,0 +1,10 @@
+// One-step construction
+// 
+// This is useful in situations that require a const Tag.
+//
+const auto t = Tag('A', TagModifier::ASCII_CHAR);
+
+// or two-step construction
+auto t = Tag('A');
+t.Modifier(TagModifier::ASCII_CHAR);
+
diff --git a/docs/examples/code/WhitelistedZmwReadStitcher.txt b/docs/examples/code/WhitelistedZmwReadStitcher.txt

new file mode 100644 (file)

index 0000000..a94c27b
--- /dev/null
+++ b/docs/examples/code/WhitelistedZmwReadStitcher.txt
@@ -0,0 +1,6 @@
+vector<int32_t> zmws = { ... };
+WhitelistedZmwReadStitcher reader(zmws, "primary.bam", "scraps.bam");
+while(reader.HasNext()) {
+    auto virtualRecord = reader.Next();
+    // ... do stuff ...
+}
diff --git a/docs/examples/code/ZmwGroupQuery.txt b/docs/examples/code/ZmwGroupQuery.txt

new file mode 100644 (file)

index 0000000..1d728ac
--- /dev/null
+++ b/docs/examples/code/ZmwGroupQuery.txt
@@ -0,0 +1,23 @@
+bool allHoleNumbersEqual(const vector<BamRecord>& group) 
+{
+    if (group.empty()) 
+        return true;
+    const auto firstHoleNumber = group[0].HoleNumber();
+    for (size_t i = 1; i < group.size(); ++i) {
+       if (group[i].HoleNumber() != firstHoleNumber)
+           return false;
+    }
+    return true;
+}
+
+vector<int32_t> whitelist = { 50, 100 };
+ZmwGroupQuery query(whitelist, dataset);
+for(const vector<BamRecord>& group : query) {
+
+    assert(allHoleNumbersEqual(group));
+
+    for (const BamRecord& record : group) {
+        assert(record.HoleNumber() == 50 ||
+               record.HoleNumber() == 100);
+    }
+}
diff --git a/docs/examples/code/ZmwQuery.txt b/docs/examples/code/ZmwQuery.txt

new file mode 100644 (file)

index 0000000..59c22c4
--- /dev/null
+++ b/docs/examples/code/ZmwQuery.txt
@@ -0,0 +1,6 @@
+vector<int32_t> whitelist = { 50, 100 };
+ZmwQuery query(whitelist, dataset);
+for (const BamRecord& record : query) {
+    assert(record.HoleNumber() == 50 ||
+           record.HoleNumber() == 100);
+}
diff --git a/docs/examples/plaintext/AlignmentPrinterOutput.txt b/docs/examples/plaintext/AlignmentPrinterOutput.txt

new file mode 100644 (file)

index 0000000..21d948b
--- /dev/null
+++ b/docs/examples/plaintext/AlignmentPrinterOutput.txt
@@ -0,0 +1,13 @@
+Read        : singleInsertion2
+Reference   : lambda_NEB3011
+
+Read-length : 49
+Concordance : 0.96
+
+5210 : GGCTGCAGTGTACAGCGGTCAGGAGGCC-ATTGATGCCGG : 5249
+       |||||||| ||||||||||||||||||| |||||||||||
+   0 : GGCTGCAG-GTACAGCGGTCAGGAGGCCAATTGATGCCGG :   39
+
+5249 : ACTGGCTGAT : 5259
+       ||||||||||
+  39 : ACTGGCTGAT :   49
diff --git a/docs/examples/plaintext/PbiFilter_DataSetXmlFilters.txt b/docs/examples/plaintext/PbiFilter_DataSetXmlFilters.txt

new file mode 100644 (file)

index 0000000..5b5e8c2
--- /dev/null
+++ b/docs/examples/plaintext/PbiFilter_DataSetXmlFilters.txt
@@ -0,0 +1,14 @@
+<Filters>
+  <Filter>
+    <Properties>
+      <Property />  # A
+      <Property />  # B
+    </Properties>
+  </Filter>
+  <Filter>
+    <Properties>
+      <Property />  # C
+      <Property />  # D
+    </Properties> 
+  </Filter>
+</Filters>
diff --git a/docs/meson.build b/docs/meson.build

new file mode 100644 (file)

index 0000000..cffad5c
--- /dev/null
+++ b/docs/meson.build
@@ -0,0 +1,24 @@
+#################
+# documentation #
+#################
+
+doxygen = find_program('doxygen', required : true)
+
+pbbam_doxygen_config = configuration_data()
+pbbam_doxygen_config.set('PacBioBAM_NAME', meson.project_name())
+pbbam_doxygen_config.set('PacBioBAM_VERSION', meson.project_version())
+pbbam_doxygen_config.set('PacBioBAM_DocsDir', '.')
+pbbam_doxygen_config.set('PacBioBAM_IncludeDir', join_paths([meson.current_source_dir(), '../include']))
+
+doxyfile = configure_file(
+  input : 'Doxyfile.in',
+  output : 'Doxyfile',
+  configuration : pbbam_doxygen_config,
+  install : false)
+
+custom_target('docs',
+  input : doxyfile,
+  output : 'docs',
+  command : [doxygen, doxyfile],
+  build_by_default : true,
+  install : false)
diff --git a/docs/source/api/Accuracy.rst b/docs/source/api/Accuracy.rst

new file mode 100644 (file)

index 0000000..f88b722
--- /dev/null
+++ b/docs/source/api/Accuracy.rst
@@ -0,0 +1,11 @@
+Accuracy
+========
+
+.. code-block:: cpp
+
+   #include <pbbam/Accuracy.h>
+
+.. doxygenclass:: PacBio::BAM::Accuracy
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/AlignmentPrinter.rst b/docs/source/api/AlignmentPrinter.rst

new file mode 100644 (file)

index 0000000..ef0b191
--- /dev/null
+++ b/docs/source/api/AlignmentPrinter.rst
@@ -0,0 +1,11 @@
+AlignmentPrinter
+================
+
+.. code-block:: cpp
+
+   #include <pbbam/AlignmentPrinter.h>
+
+.. doxygenclass:: PacBio::BAM::AlignmentPrinter 
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/AlignmentSet.rst b/docs/source/api/AlignmentSet.rst

new file mode 100644 (file)

index 0000000..1817962
--- /dev/null
+++ b/docs/source/api/AlignmentSet.rst
@@ -0,0 +1,11 @@
+AlignmentSet
+============
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetTypes.h>
+
+.. doxygenclass:: PacBio::BAM::AlignmentSet 
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/BaiIndexedBamReader.rst b/docs/source/api/BaiIndexedBamReader.rst

new file mode 100644 (file)

index 0000000..aab136f
--- /dev/null
+++ b/docs/source/api/BaiIndexedBamReader.rst
@@ -0,0 +1,11 @@
+BaiIndexedBamReader
+===================
+
+.. code-block:: cpp
+
+   #include <pbbam/BaiIndexedBamReader.h>
+
+.. doxygenclass:: PacBio::BAM::BaiIndexedBamReader
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/BamFile.rst b/docs/source/api/BamFile.rst

new file mode 100644 (file)

index 0000000..c7e48fb
--- /dev/null
+++ b/docs/source/api/BamFile.rst
@@ -0,0 +1,11 @@
+BamFile
+=======
+
+.. code-block:: cpp
+
+   #include <pbbam/BamFile.h>
+
+.. doxygenclass:: PacBio::BAM::BamFile
+   :members:
+   :protected-members:
+   :undoc-members:
diff --git a/docs/source/api/BamHeader.rst b/docs/source/api/BamHeader.rst

new file mode 100644 (file)

index 0000000..6cf06af
--- /dev/null
+++ b/docs/source/api/BamHeader.rst
@@ -0,0 +1,11 @@
+BamHeader
+=========
+
+.. code-block:: cpp
+
+   #include <pbbam/BamHeader.h>
+
+.. doxygenclass:: PacBio::BAM::BamHeader
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/BamReader.rst b/docs/source/api/BamReader.rst

new file mode 100644 (file)

index 0000000..e0b6f3c
--- /dev/null
+++ b/docs/source/api/BamReader.rst
@@ -0,0 +1,11 @@
+BamReader
+=========
+
+.. code-block:: cpp
+
+   #include <pbbam/BamReader.h>
+
+.. doxygenclass:: PacBio::BAM::BamReader
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/BamRecord.rst b/docs/source/api/BamRecord.rst

new file mode 100644 (file)

index 0000000..a749775
--- /dev/null
+++ b/docs/source/api/BamRecord.rst
@@ -0,0 +1,17 @@
+BamRecord
+=========
+
+.. code-block:: cpp
+
+   #include <pbbam/BamRecord.h>
+
+.. doxygenenum:: PacBio::BAM::ClipType
+
+.. doxygenenum:: PacBio::BAM::RecordType
+
+.. doxygenenum:: PacBio::BAM::FrameEncodingType
+
+.. doxygenclass:: PacBio::BAM::BamRecord
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/BamRecordBuilder.rst b/docs/source/api/BamRecordBuilder.rst

new file mode 100644 (file)

index 0000000..ce477b4
--- /dev/null
+++ b/docs/source/api/BamRecordBuilder.rst
@@ -0,0 +1,11 @@
+BamRecordBuilder
+================
+
+.. code-block:: cpp
+
+   #include <pbbam/BamRecordBuilder.h>
+
+.. doxygenclass:: PacBio::BAM::BamRecordBuilder
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/BamRecordImpl.rst b/docs/source/api/BamRecordImpl.rst

new file mode 100644 (file)

index 0000000..92b6759
--- /dev/null
+++ b/docs/source/api/BamRecordImpl.rst
@@ -0,0 +1,11 @@
+BamRecordImpl
+=============
+
+.. code-block:: cpp
+
+   #include <pbbam/BamRecordImpl.h>
+
+.. doxygenclass:: PacBio::BAM::BamRecordImpl
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/BamRecordView.rst b/docs/source/api/BamRecordView.rst

new file mode 100644 (file)

index 0000000..2bc8fc4
--- /dev/null
+++ b/docs/source/api/BamRecordView.rst
@@ -0,0 +1,11 @@
+BamRecordView
+=============
+
+.. code-block:: cpp
+
+   #include <pbbam/BamRecord.h>
+
+.. doxygenclass:: PacBio::BAM::BamRecordView
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/BamTagCodec.rst b/docs/source/api/BamTagCodec.rst

new file mode 100644 (file)

index 0000000..9307421
--- /dev/null
+++ b/docs/source/api/BamTagCodec.rst
@@ -0,0 +1,11 @@
+BamTagCodec
+===========
+
+.. code-block:: cpp
+
+   #include <pbbam/BamTagCodec.h>
+
+.. doxygenclass:: PacBio::BAM::BamTagCodec
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/BamWriter.rst b/docs/source/api/BamWriter.rst

new file mode 100644 (file)

index 0000000..2e2951b
--- /dev/null
+++ b/docs/source/api/BamWriter.rst
@@ -0,0 +1,11 @@
+BamWriter
+=========
+
+.. code-block:: cpp
+
+   #include <pbbam/BamWriter.h>
+
+.. doxygenclass:: PacBio::BAM::BamWriter
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/BarcodeQuery.rst b/docs/source/api/BarcodeQuery.rst

new file mode 100644 (file)

index 0000000..5836059
--- /dev/null
+++ b/docs/source/api/BarcodeQuery.rst
@@ -0,0 +1,11 @@
+BarcodeQuery
+============
+
+.. code-block:: cpp
+
+   #include <pbbam/BarcodeQuery.h>
+
+.. doxygenclass:: PacBio::BAM::BarcodeQuery
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/BarcodeSet.rst b/docs/source/api/BarcodeSet.rst

new file mode 100644 (file)

index 0000000..a7ee056
--- /dev/null
+++ b/docs/source/api/BarcodeSet.rst
@@ -0,0 +1,11 @@
+BarcodeSet
+==========
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetTypes.h>
+
+.. doxygenclass:: PacBio::BAM::BarcodeSet 
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/Cigar.rst b/docs/source/api/Cigar.rst

new file mode 100644 (file)

index 0000000..cea30d5
--- /dev/null
+++ b/docs/source/api/Cigar.rst
@@ -0,0 +1,11 @@
+Cigar
+=====
+
+.. code-block:: cpp
+
+   #include <pbbam/Cigar.h>
+
+.. doxygenclass:: PacBio::BAM::Cigar
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/CigarOperation.rst b/docs/source/api/CigarOperation.rst

new file mode 100644 (file)

index 0000000..856400a
--- /dev/null
+++ b/docs/source/api/CigarOperation.rst
@@ -0,0 +1,13 @@
+CigarOperation
+==============
+
+.. code-block:: cpp
+
+   #include <pbbam/CigarOperation.h>
+   
+.. doxygenenum:: PacBio::BAM::CigarOperationType   
+
+.. doxygenclass:: PacBio::BAM::CigarOperation
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/Compare.rst b/docs/source/api/Compare.rst

new file mode 100644 (file)

index 0000000..bb28a7e
--- /dev/null
+++ b/docs/source/api/Compare.rst
@@ -0,0 +1,8 @@
+Compare
+=======
+
+.. code-block:: cpp
+
+   #include <pbbam/Compare.h>
+
+.. doxygenfile:: Compare.h
+\ No newline at end of file
diff --git a/docs/source/api/Config.rst b/docs/source/api/Config.rst

new file mode 100644 (file)

index 0000000..c4be9e4
--- /dev/null
+++ b/docs/source/api/Config.rst
@@ -0,0 +1,8 @@
+Config
+=======
+
+.. code-block:: cpp
+
+   #include <pbbam/Config.h>
+
+.. doxygenfile:: Config.h
+\ No newline at end of file
diff --git a/docs/source/api/ConsensusAlignmentSet.rst b/docs/source/api/ConsensusAlignmentSet.rst

new file mode 100644 (file)

index 0000000..bc5a7e5
--- /dev/null
+++ b/docs/source/api/ConsensusAlignmentSet.rst
@@ -0,0 +1,11 @@
+ConsensusAlignmentSet
+=====================
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetTypes.h>
+
+.. doxygenclass:: PacBio::BAM::ConsensusAlignmentSet 
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/ConsensusReadSet.rst b/docs/source/api/ConsensusReadSet.rst

new file mode 100644 (file)

index 0000000..846698d
--- /dev/null
+++ b/docs/source/api/ConsensusReadSet.rst
@@ -0,0 +1,11 @@
+ConsensusReadSet
+================
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetTypes.h>
+
+.. doxygenclass:: PacBio::BAM::ConsensusReadSet 
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/ContigSet.rst b/docs/source/api/ContigSet.rst

new file mode 100644 (file)

index 0000000..96bb20b
--- /dev/null
+++ b/docs/source/api/ContigSet.rst
@@ -0,0 +1,11 @@
+ContigSet
+=========
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetTypes.h>
+
+.. doxygenclass:: PacBio::BAM::ContigSet 
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/DataSet.rst b/docs/source/api/DataSet.rst

new file mode 100644 (file)

index 0000000..8b3f0db
--- /dev/null
+++ b/docs/source/api/DataSet.rst
@@ -0,0 +1,11 @@
+DataSet
+=======
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSet.h>
+
+.. doxygenclass:: PacBio::BAM::DataSet 
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/DataSetBase.rst b/docs/source/api/DataSetBase.rst

new file mode 100644 (file)

index 0000000..f23fbb5
--- /dev/null
+++ b/docs/source/api/DataSetBase.rst
@@ -0,0 +1,11 @@
+DataSetBase
+===========
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetTypes.h>
+
+.. doxygenclass:: PacBio::BAM::DataSetBase
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/DataSetMetadata.rst b/docs/source/api/DataSetMetadata.rst

new file mode 100644 (file)

index 0000000..eea260d
--- /dev/null
+++ b/docs/source/api/DataSetMetadata.rst
@@ -0,0 +1,11 @@
+DataSetMetadata
+===============
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetTypes.h>
+
+.. doxygenclass:: PacBio::BAM::DataSetMetadata
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/EntireFileQuery.rst b/docs/source/api/EntireFileQuery.rst

new file mode 100644 (file)

index 0000000..4e7b86b
--- /dev/null
+++ b/docs/source/api/EntireFileQuery.rst
@@ -0,0 +1,11 @@
+EntireFileQuery
+===============
+
+.. code-block:: cpp
+
+   #include <pbbam/EntireFileQuery.h>
+
+.. doxygenclass:: PacBio::BAM::EntireFileQuery
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/ExtensionElement.rst b/docs/source/api/ExtensionElement.rst

new file mode 100644 (file)

index 0000000..980303e
--- /dev/null
+++ b/docs/source/api/ExtensionElement.rst
@@ -0,0 +1,11 @@
+ExtensionElement
+================
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetTypes.h>
+
+.. doxygenclass:: PacBio::BAM::ExtensionElement
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/Extensions.rst b/docs/source/api/Extensions.rst

new file mode 100644 (file)

index 0000000..6704807
--- /dev/null
+++ b/docs/source/api/Extensions.rst
@@ -0,0 +1,11 @@
+Extensions
+================
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetTypes.h>
+
+.. doxygenclass:: PacBio::BAM::Extensions
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/ExternalResource.rst b/docs/source/api/ExternalResource.rst

new file mode 100644 (file)

index 0000000..03ab0d3
--- /dev/null
+++ b/docs/source/api/ExternalResource.rst
@@ -0,0 +1,11 @@
+ExternalResource
+================
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetTypes.h>
+
+.. doxygenclass:: PacBio::BAM::ExternalResource
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/ExternalResources.rst b/docs/source/api/ExternalResources.rst

new file mode 100644 (file)

index 0000000..bd72ea4
--- /dev/null
+++ b/docs/source/api/ExternalResources.rst
@@ -0,0 +1,11 @@
+ExternalResources
+=================
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetTypes.h>
+
+.. doxygenclass:: PacBio::BAM::ExternalResources
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/FileIndex.rst b/docs/source/api/FileIndex.rst

new file mode 100644 (file)

index 0000000..c117214
--- /dev/null
+++ b/docs/source/api/FileIndex.rst
@@ -0,0 +1,11 @@
+FileIndex
+=========
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetTypes.h>
+
+.. doxygenclass:: PacBio::BAM::FileIndex
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/FileIndices.rst b/docs/source/api/FileIndices.rst

new file mode 100644 (file)

index 0000000..b25720c
--- /dev/null
+++ b/docs/source/api/FileIndices.rst
@@ -0,0 +1,11 @@
+FileIndices
+===========
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetTypes.h>
+
+.. doxygenclass:: PacBio::BAM::FileIndices
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/Filter.rst b/docs/source/api/Filter.rst

new file mode 100644 (file)

index 0000000..6faa8aa
--- /dev/null
+++ b/docs/source/api/Filter.rst
@@ -0,0 +1,11 @@
+Filter
+======
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetTypes.h>
+
+.. doxygenclass:: PacBio::BAM::Filter
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/Filters.rst b/docs/source/api/Filters.rst

new file mode 100644 (file)

index 0000000..7ea1620
--- /dev/null
+++ b/docs/source/api/Filters.rst
@@ -0,0 +1,11 @@
+Filters
+=======
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetTypes.h>
+
+.. doxygenclass:: PacBio::BAM::Filters
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/Frames.rst b/docs/source/api/Frames.rst

new file mode 100644 (file)

index 0000000..cf260f2
--- /dev/null
+++ b/docs/source/api/Frames.rst
@@ -0,0 +1,11 @@
+Frames
+======
+
+.. code-block:: cpp
+
+   #include <pbbam/Frames.h>
+
+.. doxygenclass:: PacBio::BAM::Frames
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/GenomicInterval.rst b/docs/source/api/GenomicInterval.rst

new file mode 100644 (file)

index 0000000..811b83a
--- /dev/null
+++ b/docs/source/api/GenomicInterval.rst
@@ -0,0 +1,11 @@
+GenomicInterval
+===============
+
+.. code-block:: cpp
+
+   #include <pbbam/GenomicInterval.h>
+
+.. doxygenclass:: PacBio::BAM::GenomicInterval
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/GenomicIntervalCompositeBamReader.rst b/docs/source/api/GenomicIntervalCompositeBamReader.rst

new file mode 100644 (file)

index 0000000..f658621
--- /dev/null
+++ b/docs/source/api/GenomicIntervalCompositeBamReader.rst
@@ -0,0 +1,11 @@
+GenomicIntervalCompositeBamReader
+=================================
+
+.. code-block:: cpp
+
+   #include <pbbam/CompositeBamReader.h>
+
+.. doxygenclass:: PacBio::BAM::GenomicIntervalCompositeBamReader
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/GenomicIntervalQuery.rst b/docs/source/api/GenomicIntervalQuery.rst

new file mode 100644 (file)

index 0000000..7bae558
--- /dev/null
+++ b/docs/source/api/GenomicIntervalQuery.rst
@@ -0,0 +1,11 @@
+GenomicIntervalQuery
+====================
+
+.. code-block:: cpp
+
+   #include <pbbam/GenomicIntervalQuery.h>
+
+.. doxygenclass:: PacBio::BAM::GenomicIntervalQuery
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/HdfSubreadSet.rst b/docs/source/api/HdfSubreadSet.rst

new file mode 100644 (file)

index 0000000..88bf008
--- /dev/null
+++ b/docs/source/api/HdfSubreadSet.rst
@@ -0,0 +1,11 @@
+HdfSubreadSet
+=============
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetTypes.h>
+
+.. doxygenclass:: PacBio::BAM::HdfSubreadSet 
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/IndexResultBlock.rst b/docs/source/api/IndexResultBlock.rst

new file mode 100644 (file)

index 0000000..fac804a
--- /dev/null
+++ b/docs/source/api/IndexResultBlock.rst
@@ -0,0 +1,17 @@
+IndexResultBlock
+================
+
+.. code-block:: cpp
+
+   #include <pbbam/PbiBasicTypes.h>
+
+.. doxygenstruct:: PacBio::BAM::IndexResultBlock
+   :members:
+   :protected-members:
+   :undoc-members:
+   
+.. doxygentypedef:: PacBio::BAM::IndexResultBlocks
+
+.. doxygentypedef:: PacBio::BAM::IndexList
+   
+.. doxygentypedef:: PacBio::BAM::IndexRange
+\ No newline at end of file
diff --git a/docs/source/api/IndexedFastaReader.rst b/docs/source/api/IndexedFastaReader.rst

new file mode 100644 (file)

index 0000000..7c46064
--- /dev/null
+++ b/docs/source/api/IndexedFastaReader.rst
@@ -0,0 +1,11 @@
+IndexedFastaReader
+==================
+
+.. code-block:: cpp
+
+   #include <pbbam/IndexedFastaReader.h>
+
+.. doxygenclass:: PacBio::BAM::IndexedFastaReader
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/Interval.rst b/docs/source/api/Interval.rst

new file mode 100644 (file)

index 0000000..f506a19
--- /dev/null
+++ b/docs/source/api/Interval.rst
@@ -0,0 +1,11 @@
+Interval
+========
+
+.. code-block:: cpp
+
+   #include <pbbam/Interval.h>
+
+.. doxygenclass:: PacBio::BAM::Interval
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/InvalidSequencingChemistryException.rst b/docs/source/api/InvalidSequencingChemistryException.rst

new file mode 100644 (file)

index 0000000..d521ecc
--- /dev/null
+++ b/docs/source/api/InvalidSequencingChemistryException.rst
@@ -0,0 +1,11 @@
+InvalidSequencingChemistryException
+===================================
+
+.. code-block:: cpp
+
+   #include <pbbam/exception/InvalidSequencingChemistryException.h>
+
+.. doxygenclass:: PacBio::BAM::InvalidSequencingChemistryException
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/LocalContextFlags.rst b/docs/source/api/LocalContextFlags.rst

new file mode 100644 (file)

index 0000000..8cd63be
--- /dev/null
+++ b/docs/source/api/LocalContextFlags.rst
@@ -0,0 +1,8 @@
+LocalContextFlags
+=================
+
+.. code-block:: cpp
+
+   #include <pbbam/LocalContextFlags.h>
+
+.. doxygenenum:: PacBio::BAM::LocalContextFlags
diff --git a/docs/source/api/NamespaceInfo.rst b/docs/source/api/NamespaceInfo.rst

new file mode 100644 (file)

index 0000000..c7613ec
--- /dev/null
+++ b/docs/source/api/NamespaceInfo.rst
@@ -0,0 +1,11 @@
+NamespaceInfo
+=============
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetXsd.h>
+
+.. doxygenclass:: PacBio::BAM::NamespaceInfo
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/NamespaceRegistry.rst b/docs/source/api/NamespaceRegistry.rst

new file mode 100644 (file)

index 0000000..2f8f9a7
--- /dev/null
+++ b/docs/source/api/NamespaceRegistry.rst
@@ -0,0 +1,11 @@
+NamespaceRegistry
+=================
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetXsd.h>
+
+.. doxygenclass:: PacBio::BAM::NamespaceRegistry
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/Orientation.rst b/docs/source/api/Orientation.rst

new file mode 100644 (file)

index 0000000..e9bbc42
--- /dev/null
+++ b/docs/source/api/Orientation.rst
@@ -0,0 +1,8 @@
+Orientation
+===========
+
+.. code-block:: cpp
+
+   #include <pbbam/Orientation.h>
+
+.. doxygenenum:: PacBio::BAM::Orientation
diff --git a/docs/source/api/ParentTool.rst b/docs/source/api/ParentTool.rst

new file mode 100644 (file)

index 0000000..e2ffa1b
--- /dev/null
+++ b/docs/source/api/ParentTool.rst
@@ -0,0 +1,11 @@
+ParentTool
+==========
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetTypes.h>
+
+.. doxygenclass:: PacBio::BAM::ParentTool
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/PbiBuilder.rst b/docs/source/api/PbiBuilder.rst

new file mode 100644 (file)

index 0000000..d795d0f
--- /dev/null
+++ b/docs/source/api/PbiBuilder.rst
@@ -0,0 +1,11 @@
+PbiBuilder
+==========
+
+.. code-block:: cpp
+
+   #include <pbbam/PbiBuilder.h>
+
+.. doxygenclass:: PacBio::BAM::PbiBuilder
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/PbiFile.rst b/docs/source/api/PbiFile.rst

new file mode 100644 (file)

index 0000000..5a8b85a
--- /dev/null
+++ b/docs/source/api/PbiFile.rst
@@ -0,0 +1,14 @@
+PbiFile
+=======
+
+.. code-block:: cpp
+
+   #include <pbbam/PbiFile.h>
+
+.. doxygenenum:: PacBio::BAM::PbiFile::Section
+
+.. doxygentypedef:: PacBio::BAM::PbiFile::Sections
+
+.. doxygenenum:: PacBio::BAM::PbiFile::VersionEnum
+
+.. doxygenfunction:: PacBio::BAM::PbiFile::CreateFrom
diff --git a/docs/source/api/PbiFilter.rst b/docs/source/api/PbiFilter.rst

new file mode 100644 (file)

index 0000000..261498b
--- /dev/null
+++ b/docs/source/api/PbiFilter.rst
@@ -0,0 +1,11 @@
+PbiFilter
+=========
+
+.. code-block:: cpp
+
+   #include <pbbam/PbiFilter.h>
+
+.. doxygenclass:: PacBio::BAM::PbiFilter
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/PbiFilterCompositeBamReader.rst b/docs/source/api/PbiFilterCompositeBamReader.rst

new file mode 100644 (file)

index 0000000..7a69df3
--- /dev/null
+++ b/docs/source/api/PbiFilterCompositeBamReader.rst
@@ -0,0 +1,11 @@
+PbiFilterCompositeBamReader
+===========================
+
+.. code-block:: cpp
+
+   #include <pbbam/CompositeBamReader.h>
+
+.. doxygenclass:: PacBio::BAM::PbiFilterCompositeBamReader
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/PbiFilterQuery.rst b/docs/source/api/PbiFilterQuery.rst

new file mode 100644 (file)

index 0000000..75bbc12
--- /dev/null
+++ b/docs/source/api/PbiFilterQuery.rst
@@ -0,0 +1,11 @@
+PbiFilterQuery
+==============
+
+.. code-block:: cpp
+
+   #include <pbbam/PbiFilterQuery.h>
+
+.. doxygenclass:: PacBio::BAM::PbiFilterQuery
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/PbiFilterTypes.rst b/docs/source/api/PbiFilterTypes.rst

new file mode 100644 (file)

index 0000000..052389b
--- /dev/null
+++ b/docs/source/api/PbiFilterTypes.rst
@@ -0,0 +1,8 @@
+PbiFilterTypes
+==============
+
+.. code-block:: cpp
+
+   #include <pbbam/PbiFilterTypes.h>
+
+.. doxygenfile:: PbiFilterTypes.h
+\ No newline at end of file
diff --git a/docs/source/api/PbiIndexedBamReader.rst b/docs/source/api/PbiIndexedBamReader.rst

new file mode 100644 (file)

index 0000000..5450c8a
--- /dev/null
+++ b/docs/source/api/PbiIndexedBamReader.rst
@@ -0,0 +1,11 @@
+PbiIndexedBamReader
+===================
+
+.. code-block:: cpp
+
+   #include <pbbam/PbiIndexedBamReader.h>
+
+.. doxygenclass:: PacBio::BAM::PbiIndexedBamReader
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/PbiRawBarcodeData.rst b/docs/source/api/PbiRawBarcodeData.rst

new file mode 100644 (file)

index 0000000..c72ebfb
--- /dev/null
+++ b/docs/source/api/PbiRawBarcodeData.rst
@@ -0,0 +1,11 @@
+PbiRawBarcodeData
+=================
+
+.. code-block:: cpp
+
+   #include <pbbam/PbiRawData.h>
+
+.. doxygenclass:: PacBio::BAM::PbiRawBarcodeData
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/PbiRawBasicData.rst b/docs/source/api/PbiRawBasicData.rst

new file mode 100644 (file)

index 0000000..2282387
--- /dev/null
+++ b/docs/source/api/PbiRawBasicData.rst
@@ -0,0 +1,11 @@
+PbiRawBasicData
+===============
+
+.. code-block:: cpp
+
+   #include <pbbam/PbiRawData.h>
+
+.. doxygenclass:: PacBio::BAM::PbiRawBasicData
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/PbiRawData.rst b/docs/source/api/PbiRawData.rst

new file mode 100644 (file)

index 0000000..1a974e8
--- /dev/null
+++ b/docs/source/api/PbiRawData.rst
@@ -0,0 +1,11 @@
+PbiRawData
+==========
+
+.. code-block:: cpp
+
+   #include <pbbam/PbiRawData.h>
+
+.. doxygenclass:: PacBio::BAM::PbiRawData
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/PbiRawMappedData.rst b/docs/source/api/PbiRawMappedData.rst

new file mode 100644 (file)

index 0000000..42e1de1
--- /dev/null
+++ b/docs/source/api/PbiRawMappedData.rst
@@ -0,0 +1,11 @@
+PbiRawMappedData
+================
+
+.. code-block:: cpp
+
+   #include <pbbam/PbiRawData.h>
+
+.. doxygenclass:: PacBio::BAM::PbiRawMappedData
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/PbiRawReferenceData.rst b/docs/source/api/PbiRawReferenceData.rst

new file mode 100644 (file)

index 0000000..460cde4
--- /dev/null
+++ b/docs/source/api/PbiRawReferenceData.rst
@@ -0,0 +1,11 @@
+PbiRawReferenceData
+===================
+
+.. code-block:: cpp
+
+   #include <pbbam/PbiRawData.h>
+
+.. doxygenclass:: PacBio::BAM::PbiRawReferenceData
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/PbiReferenceEntry.rst b/docs/source/api/PbiReferenceEntry.rst

new file mode 100644 (file)

index 0000000..472e586
--- /dev/null
+++ b/docs/source/api/PbiReferenceEntry.rst
@@ -0,0 +1,11 @@
+PbiReferenceEntry
+=================
+
+.. code-block:: cpp
+
+   #include <pbbam/PbiRawData.h>
+
+.. doxygenclass:: PacBio::BAM::PbiReferenceEntry
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/Position.rst b/docs/source/api/Position.rst

new file mode 100644 (file)

index 0000000..3c945f2
--- /dev/null
+++ b/docs/source/api/Position.rst
@@ -0,0 +1,10 @@
+Position
+========
+
+.. code-block:: cpp
+
+   #include <pbbam/Position.h>
+
+.. doxygentypedef:: PacBio::BAM::Position
+
+.. doxygenvariable:: PacBio::BAM::UnmappedPosition
+\ No newline at end of file
diff --git a/docs/source/api/ProgramInfo.rst b/docs/source/api/ProgramInfo.rst

new file mode 100644 (file)

index 0000000..b58c93a
--- /dev/null
+++ b/docs/source/api/ProgramInfo.rst
@@ -0,0 +1,11 @@
+ProgramInfo
+===========
+
+.. code-block:: cpp
+
+   #include <pbbam/ProgramInfo.h>
+
+.. doxygenclass:: PacBio::BAM::ProgramInfo
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/QNameQuery.rst b/docs/source/api/QNameQuery.rst

new file mode 100644 (file)

index 0000000..b549436
--- /dev/null
+++ b/docs/source/api/QNameQuery.rst
@@ -0,0 +1,11 @@
+QNameQuery
+==========
+
+.. code-block:: cpp
+
+   #include <pbbam/QNameQuery.h>
+
+.. doxygenclass:: PacBio::BAM::QNameQuery
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/QualityValue.rst b/docs/source/api/QualityValue.rst

new file mode 100644 (file)

index 0000000..3520c5a
--- /dev/null
+++ b/docs/source/api/QualityValue.rst
@@ -0,0 +1,11 @@
+QualityValue
+============
+
+.. code-block:: cpp
+
+   #include <pbbam/QualityValue.h>
+
+.. doxygenclass:: PacBio::BAM::QualityValue
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/QualityValues.rst b/docs/source/api/QualityValues.rst

new file mode 100644 (file)

index 0000000..8f6dfa5
--- /dev/null
+++ b/docs/source/api/QualityValues.rst
@@ -0,0 +1,11 @@
+QualityValues
+=============
+
+.. code-block:: cpp
+
+   #include <pbbam/QualityValues.h>
+
+.. doxygenclass:: PacBio::BAM::QualityValues
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/ReadAccuracyQuery.rst b/docs/source/api/ReadAccuracyQuery.rst

new file mode 100644 (file)

index 0000000..abfd1e6
--- /dev/null
+++ b/docs/source/api/ReadAccuracyQuery.rst
@@ -0,0 +1,11 @@
+ReadAccuracyQuery
+=================
+
+.. code-block:: cpp
+
+   #include <pbbam/ReadAccuracyQuery.h>
+
+.. doxygenclass:: PacBio::BAM::ReadAccuracyQuery
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/ReadGroupInfo.rst b/docs/source/api/ReadGroupInfo.rst

new file mode 100644 (file)

index 0000000..7fb4f69
--- /dev/null
+++ b/docs/source/api/ReadGroupInfo.rst
@@ -0,0 +1,21 @@
+ReadGroupInfo
+=============
+
+.. code-block:: cpp
+
+   #include <pbbam/ReadGroupInfo.h>
+
+.. doxygenenum:: PacBio::BAM::BaseFeature
+
+.. doxygenenum:: PacBio::BAM::FrameCodec
+
+.. doxygenenum:: PacBio::BAM::BarcodeModeType
+
+.. doxygenenum:: PacBio::BAM::BarcodeQualityType
+
+.. doxygenclass:: PacBio::BAM::ReadGroupInfo
+   :members:
+   :protected-members:
+   :undoc-members:
+   
+.. doxygenfunction:: PacBio::BAM::MakeReadGroupId
+\ No newline at end of file
diff --git a/docs/source/api/ReferenceSet.rst b/docs/source/api/ReferenceSet.rst

new file mode 100644 (file)

index 0000000..22e4703
--- /dev/null
+++ b/docs/source/api/ReferenceSet.rst
@@ -0,0 +1,11 @@
+ReferenceSet
+============
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetTypes.h>
+
+.. doxygenclass:: PacBio::BAM::ReferenceSet 
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/SamTagCodec.rst b/docs/source/api/SamTagCodec.rst

new file mode 100644 (file)

index 0000000..4f8d65d
--- /dev/null
+++ b/docs/source/api/SamTagCodec.rst
@@ -0,0 +1,11 @@
+SamTagCodec
+===========
+
+.. code-block:: cpp
+
+   #include <pbbam/SamTagCodec.h>
+
+.. doxygenclass:: PacBio::BAM::SamTagCodec
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/SequenceInfo.rst b/docs/source/api/SequenceInfo.rst

new file mode 100644 (file)

index 0000000..393d5bb
--- /dev/null
+++ b/docs/source/api/SequenceInfo.rst
@@ -0,0 +1,11 @@
+SequenceInfo
+============
+
+.. code-block:: cpp
+
+   #include <pbbam/SequenceInfo.h>
+
+.. doxygenclass:: PacBio::BAM::SequenceInfo
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/SequentialCompositeBamReader.rst b/docs/source/api/SequentialCompositeBamReader.rst

new file mode 100644 (file)

index 0000000..31ed3b1
--- /dev/null
+++ b/docs/source/api/SequentialCompositeBamReader.rst
@@ -0,0 +1,11 @@
+SequentialCompositeBamReader
+============================
+
+.. code-block:: cpp
+
+   #include <pbbam/CompositeBamReader.h>
+
+.. doxygenclass:: PacBio::BAM::SequentialCompositeBamReader
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/Strand.rst b/docs/source/api/Strand.rst

new file mode 100644 (file)

index 0000000..4978f72
--- /dev/null
+++ b/docs/source/api/Strand.rst
@@ -0,0 +1,8 @@
+Strand
+======
+
+.. code-block:: cpp
+
+   #include <pbbam/Strand.h>
+
+.. doxygenenum:: PacBio::BAM::Strand 
diff --git a/docs/source/api/SubDataSets.rst b/docs/source/api/SubDataSets.rst

new file mode 100644 (file)

index 0000000..d179065
--- /dev/null
+++ b/docs/source/api/SubDataSets.rst
@@ -0,0 +1,11 @@
+SubDataSets
+===========
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetTypes.h>
+
+.. doxygenclass:: PacBio::BAM::SubDataSets
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/SubreadLengthQuery.rst b/docs/source/api/SubreadLengthQuery.rst

new file mode 100644 (file)

index 0000000..23000b3
--- /dev/null
+++ b/docs/source/api/SubreadLengthQuery.rst
@@ -0,0 +1,11 @@
+SubreadLengthQuery
+==================
+
+.. code-block:: cpp
+
+   #include <pbbam/SubreadLengthQuery.h>
+
+.. doxygenclass:: PacBio::BAM::SubreadLengthQuery
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/SubreadSet.rst b/docs/source/api/SubreadSet.rst

new file mode 100644 (file)

index 0000000..bfc3c13
--- /dev/null
+++ b/docs/source/api/SubreadSet.rst
@@ -0,0 +1,11 @@
+SubreadSet
+==========
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetTypes.h>
+
+.. doxygenclass:: PacBio::BAM::SubreadSet 
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/Tag.rst b/docs/source/api/Tag.rst

new file mode 100644 (file)

index 0000000..50b85c7
--- /dev/null
+++ b/docs/source/api/Tag.rst
@@ -0,0 +1,15 @@
+Tag
+===
+
+.. code-block:: cpp
+
+   #include <pbbam/Tag.h>
+
+.. doxygenenum:: PacBio::BAM::TagDataType
+
+.. doxygenenum:: PacBio::BAM::TagModifier
+
+.. doxygenclass:: PacBio::BAM::Tag
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/TagCollection.rst b/docs/source/api/TagCollection.rst

new file mode 100644 (file)

index 0000000..1314b13
--- /dev/null
+++ b/docs/source/api/TagCollection.rst
@@ -0,0 +1,11 @@
+TagCollection
+=============
+
+.. code-block:: cpp
+
+   #include <pbbam/TagCollection.h>
+
+.. doxygenclass:: PacBio::BAM::TagCollection
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/VirtualPolymeraseBamRecord.rst b/docs/source/api/VirtualPolymeraseBamRecord.rst

new file mode 100644 (file)

index 0000000..06d5531
--- /dev/null
+++ b/docs/source/api/VirtualPolymeraseBamRecord.rst
@@ -0,0 +1,11 @@
+VirtualPolymeraseBamRecord
+==========================
+
+.. code-block:: cpp
+
+   #include <pbbam/virtual/VirtualPolymeraseBamRecord.h>
+
+.. doxygenclass:: PacBio::BAM::VirtualPolymeraseBamRecord
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/VirtualPolymeraseCompositeReader.rst b/docs/source/api/VirtualPolymeraseCompositeReader.rst

new file mode 100644 (file)

index 0000000..e6cab4e
--- /dev/null
+++ b/docs/source/api/VirtualPolymeraseCompositeReader.rst
@@ -0,0 +1,11 @@
+VirtualPolymeraseCompositeReader
+================================
+
+.. code-block:: cpp
+
+   #include <pbbam/virtual/VirtualPolymeraseCompositeReader.h>
+
+.. doxygenclass:: PacBio::BAM::VirtualPolymeraseCompositeReader
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/VirtualPolymeraseReader.rst b/docs/source/api/VirtualPolymeraseReader.rst

new file mode 100644 (file)

index 0000000..14a46e8
--- /dev/null
+++ b/docs/source/api/VirtualPolymeraseReader.rst
@@ -0,0 +1,11 @@
+VirtualPolymeraseReader
+=======================
+
+.. code-block:: cpp
+
+   #include <pbbam/virtual/VirtualPolymeraseReader.h>
+
+.. doxygenclass:: PacBio::BAM::VirtualPolymeraseReader
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/VirtualRegion.rst b/docs/source/api/VirtualRegion.rst

new file mode 100644 (file)

index 0000000..7a09846
--- /dev/null
+++ b/docs/source/api/VirtualRegion.rst
@@ -0,0 +1,11 @@
+VirtualRegion
+=============
+
+.. code-block:: cpp
+
+   #include <pbbam/virtual/VirtualRegion.h>
+
+.. doxygenclass:: PacBio::BAM::VirtualRegion
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/VirtualRegionType.rst b/docs/source/api/VirtualRegionType.rst

new file mode 100644 (file)

index 0000000..4279200
--- /dev/null
+++ b/docs/source/api/VirtualRegionType.rst
@@ -0,0 +1,8 @@
+VirtualRegionType
+=================
+
+.. code-block:: cpp
+
+   #include <pbbam/virtual/VirtualRegionType.h>
+
+.. doxygenenum:: PacBio::BAM::VirtualRegionType
diff --git a/docs/source/api/VirtualRegionTypeMap.rst b/docs/source/api/VirtualRegionTypeMap.rst

new file mode 100644 (file)

index 0000000..eebe637
--- /dev/null
+++ b/docs/source/api/VirtualRegionTypeMap.rst
@@ -0,0 +1,11 @@
+VirtualRegionTypeMap
+====================
+
+.. code-block:: cpp
+
+   #include <pbbam/virtual/VirtualRegionTypeMap.h>
+
+.. doxygenclass:: PacBio::BAM::VirtualRegionTypeMap
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/ZmwGroupQuery.rst b/docs/source/api/ZmwGroupQuery.rst

new file mode 100644 (file)

index 0000000..01fc18a
--- /dev/null
+++ b/docs/source/api/ZmwGroupQuery.rst
@@ -0,0 +1,11 @@
+ZmwGroupQuery
+=============
+
+.. code-block:: cpp
+
+   #include <pbbam/ZmwGroupQuery.h>
+
+.. doxygenclass:: PacBio::BAM::ZmwGroupQuery
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/ZmwQuery.rst b/docs/source/api/ZmwQuery.rst

new file mode 100644 (file)

index 0000000..375fcb0
--- /dev/null
+++ b/docs/source/api/ZmwQuery.rst
@@ -0,0 +1,11 @@
+ZmwQuery
+========
+
+.. code-block:: cpp
+
+   #include <pbbam/ZmwQuery.h>
+
+.. doxygenclass:: PacBio::BAM::ZmwQuery
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/ZmwWhitelistVirtualReader.rst b/docs/source/api/ZmwWhitelistVirtualReader.rst

new file mode 100644 (file)

index 0000000..95d2d1a
--- /dev/null
+++ b/docs/source/api/ZmwWhitelistVirtualReader.rst
@@ -0,0 +1,11 @@
+ZmwWhitelistVirtualReader
+=========================
+
+.. code-block:: cpp
+
+   #include <pbbam/virtual/ZmwWhitelistVirtualReader.h>
+
+.. doxygenclass:: PacBio::BAM::ZmwWhitelistVirtualReader
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api_reference.rst b/docs/source/api_reference.rst

new file mode 100644 (file)

index 0000000..354c0de
--- /dev/null
+++ b/docs/source/api_reference.rst
@@ -0,0 +1,12 @@
+.. _api_reference:
+
+C++ API Reference
+=================
+
+Watch this space for more recipes & how-tos. 
+
+.. toctree::
+   :maxdepth: 1
+   :glob:
+
+   api/*
diff --git a/docs/source/commandline_utilities.rst b/docs/source/commandline_utilities.rst

new file mode 100644 (file)

index 0000000..7f1bdaf
--- /dev/null
+++ b/docs/source/commandline_utilities.rst
@@ -0,0 +1,15 @@
+.. _command_line:
+
+Command Line Utilities
+======================
+
+In addition to the main library and wrappers, pbbam also provides a few basic
+utilities for working with PacBio BAM files and their indices (".pbi" files).
+
+.. toctree::
+   :maxdepth: 1
+
+   tools/bam2sam
+   tools/pbindex
+   tools/pbindexdump
+   tools/pbmerge
diff --git a/docs/source/conf.py b/docs/source/conf.py

new file mode 100755 (executable)

index 0000000..8fb7646
--- /dev/null
+++ b/docs/source/conf.py
@@ -0,0 +1,332 @@
+# -*- coding: utf-8 -*-
+#
+# pbbam documentation build configuration file, created by
+# sphinx-quickstart on Fri Dec  4 10:08:52 2015.
+#
+# This file is execfile()d with the current directory set to its
+# containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+import sys
+import os
+import shlex
+import re
+import subprocess
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#sys.path.insert(0, os.path.abspath('.'))
+
+# Read the Docs does not run CMake, so the Doxyfile that a normal build generates
+# from Doxyfile.in must be created here (subbing actual values) before Doxygen
+# runs, per http://breathe.readthedocs.org/en/latest/readthedocs.html
+read_the_docs_build = os.environ.get('READTHEDOCS', None) == 'True'
+if read_the_docs_build:
+
+    # fetch directory info (this file runs with the cwd set to its containing dir)
+    this_dir = os.path.abspath(os.getcwd())
+    docs_dir = os.path.abspath(os.path.join(this_dir, '..'))
+    root_dir = os.path.abspath(os.path.join(docs_dir, '..'))
+    include_dir = os.path.abspath(os.path.join(root_dir, 'include'))
+
+    # get project version from the project() line of CMakeLists.txt ('' if it has no VERSION)
+    version = ''
+    with open(os.path.join(root_dir, 'CMakeLists.txt'), 'r') as cmakeFile:
+        for line in cmakeFile:
+            if line.startswith('project'):
+                match = re.search(r'VERSION\s*([\d.]+)', line)
+                version = match.group(1) if match else version
+                break
+
+    # read Doxyfile.in, replace markers with real values, and write Doxyfile
+    # (str.replace: markers are literals, and paths must not be parsed as regex templates)
+    with open(os.path.join(docs_dir, 'Doxyfile.in'), 'r') as inDoxyfile:
+        configOut = inDoxyfile.read()
+    for marker, value in (('@PacBioBAM_NAME@', 'pbbam'), ('@PacBioBAM_VERSION@', version),
+                          ('@PacBioBAM_DocsDir@', docs_dir), ('@PacBioBAM_IncludeDir@', include_dir)):
+        configOut = configOut.replace(marker, value)
+    with open(os.path.join(docs_dir, 'Doxyfile'), 'w') as outDoxyfile:
+        # file.write() works on Python 2 and 3, unlike the old "print >>file" statement
+        outDoxyfile.write(configOut + '\n')
+
+    # now run Doxygen from the docs directory, where the generated Doxyfile lives
+    subprocess.call(['doxygen'], cwd=docs_dir)
+
+# -- General configuration ------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = ['breathe']
+#extensions = [
+#    'sphinx.ext.autodoc',
+ #   'sphinx.ext.coverage',
+ #   'breathe',
+#]
+
+# Set up Breathe extension variables
+breathe_projects = { 'pbbam' : os.path.join(os.getcwd(), '..', 'xml') + os.path.sep }
+breathe_default_project = 'pbbam'
+breathe_default_members = ('members', 'undoc-members')
+breathe_implementation_filename_extensions = [ '.cpp', '.inl' ]
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix(es) of source filenames.
+# You can specify multiple suffix as a list of string:
+# source_suffix = ['.rst', '.md']
+source_suffix = '.rst'
+
+# The encoding of source files.
+#source_encoding = 'utf-8-sig'
+
+# The master toctree document.
+master_doc = 'index'
+
+# General information about the project.
+project = u'pbbam'
+copyright = u'2015, Derek Barnett'
+author = u'Derek Barnett'
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+#
+# The short X.Y version.
+version = '1.0.6'
+# The full version, including alpha/beta/rc tags.
+release = '1.0.6'
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#
+# This is also used if you do content translation via gettext catalogs.
+# Usually you set "language" from the command line for these cases.
+language = None
+
+# There are two options for replacing |today|: either, you set today to some
+# non-false value, then it is used:
+#today = ''
+# Else, today_fmt is used as the format for a strftime call.
+#today_fmt = '%B %d, %Y'
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+exclude_patterns = []
+
+# The reST default role (used for this markup: `text`) to use for all
+# documents.
+#default_role = None
+
+# If true, '()' will be appended to :func: etc. cross-reference text.
+#add_function_parentheses = True
+
+# If true, the current module name will be prepended to all description
+# unit titles (such as .. function::).
+#add_module_names = True
+
+# If true, sectionauthor and moduleauthor directives will be shown in the
+# output. They are ignored by default.
+#show_authors = False
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'sphinx'
+
+# A list of ignored prefixes for module index sorting.
+#modindex_common_prefix = []
+
+# If true, keep warnings as "system message" paragraphs in the built documents.
+#keep_warnings = False
+
+# If true, `todo` and `todoList` produce output, else they produce nothing.
+todo_include_todos = False
+
+
+# -- Options for HTML output ----------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+html_theme = 'pacbio-theme'
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further.  For a list of options available for each theme, see the
+# documentation.
+#html_theme_options = {}
+
+# Add any paths that contain custom themes here, relative to this directory.
+html_theme_path = ['.']
+
+# The name for this set of Sphinx documents.  If None, it defaults to
+# "<project> v<release> documentation".
+#html_title = None
+
+# A shorter title for the navigation bar.  Default is the same as html_title.
+#html_short_title = None
+
+# The name of an image file (relative to this directory) to place at the top
+# of the sidebar.
+#html_logo = None
+
+# The name of an image file (within the static path) to use as favicon of the
+# docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
+# pixels large.
+#html_favicon = None
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+# Add any extra paths that contain custom files (such as robots.txt or
+# .htaccess) here, relative to this directory. These files are copied
+# directly to the root of the documentation.
+#html_extra_path = []
+
+# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
+# using the given strftime format.
+#html_last_updated_fmt = '%b %d, %Y'
+
+# If true, SmartyPants will be used to convert quotes and dashes to
+# typographically correct entities.
+#html_use_smartypants = True
+
+# Custom sidebar templates, maps document names to template names.
+#html_sidebars = {}
+
+# Additional templates that should be rendered to pages, maps page names to
+# template names.
+#html_additional_pages = {}
+
+# If false, no module index is generated.
+#html_domain_indices = True
+
+# If false, no index is generated.
+#html_use_index = True
+
+# If true, the index is split into individual pages for each letter.
+#html_split_index = False
+
+# If true, links to the reST sources are added to the pages.
+#html_show_sourcelink = True
+
+# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
+#html_show_sphinx = True
+
+# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
+#html_show_copyright = True
+
+# If true, an OpenSearch description file will be output, and all pages will
+# contain a <link> tag referring to it.  The value of this option must be the
+# base URL from which the finished HTML is served.
+#html_use_opensearch = ''
+
+# This is the file name suffix for HTML files (e.g. ".xhtml").
+#html_file_suffix = None
+
+# Language to be used for generating the HTML full-text search index.
+# Sphinx supports the following languages:
+#   'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja'
+#   'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr'
+#html_search_language = 'en'
+
+# A dictionary with options for the search language support, empty by default.
+# Now only 'ja' uses this config value
+#html_search_options = {'type': 'default'}
+
+# The name of a javascript file (relative to the configuration directory) that
+# implements a search results scorer. If empty, the default will be used.
+#html_search_scorer = 'scorer.js'
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'pbbamdoc'
+
+# -- Options for LaTeX output ---------------------------------------------
+
+latex_elements = {
+# The paper size ('letterpaper' or 'a4paper').
+#'papersize': 'letterpaper',
+
+# The font size ('10pt', '11pt' or '12pt').
+#'pointsize': '10pt',
+
+# Additional stuff for the LaTeX preamble.
+#'preamble': '',
+
+# Latex figure (float) alignment
+#'figure_align': 'htbp',
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title,
+#  author, documentclass [howto, manual, or own class]).
+latex_documents = [
+  (master_doc, 'pbbam.tex', u'pbbam Documentation',
+   u'Derek Barnett', 'manual'),
+]
+
+# The name of an image file (relative to this directory) to place at the top of
+# the title page.
+#latex_logo = None
+
+# For "manual" documents, if this is true, then toplevel headings are parts,
+# not chapters.
+#latex_use_parts = False
+
+# If true, show page references after internal links.
+#latex_show_pagerefs = False
+
+# If true, show URL addresses after external links.
+#latex_show_urls = False
+
+# Documents to append as an appendix to all manuals.
+#latex_appendices = []
+
+# If false, no module index is generated.
+#latex_domain_indices = True
+
+
+# -- Options for manual page output ---------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [
+    (master_doc, 'pbbam', u'pbbam Documentation',
+     [author], 1)
+]
+
+# If true, show URL addresses after external links.
+#man_show_urls = False
+
+
+# -- Options for Texinfo output -------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+#  dir menu entry, description, category)
+texinfo_documents = [
+  (master_doc, 'pbbam', u'pbbam Documentation',
+   author, 'pbbam', 'One line description of project.',
+   'Miscellaneous'),
+]
+
+# Documents to append as an appendix to all manuals.
+#texinfo_appendices = []
+
+# If false, no module index is generated.
+#texinfo_domain_indices = True
+
+# How to display URL addresses: 'footnote', 'no', or 'inline'.
+#texinfo_show_urls = 'footnote'
+
+# If true, do not generate a @detailmenu in the "Top" node's menu.
+#texinfo_no_detailmenu = False
diff --git a/docs/source/getting_started.rst b/docs/source/getting_started.rst

new file mode 100644 (file)

index 0000000..a69a987
--- /dev/null
+++ b/docs/source/getting_started.rst
@@ -0,0 +1,105 @@
+
+.. _getting_started:
+
+Getting Started
+===============
+
+.. _getting_started-requirements:
+
+Requirements
+------------
+
+These components will almost certainly already be on your system. 
+ 
+* `gcc`_ (4.8+) OR `clang`_ (v3.1+)
+* pthreads
+* zlib
+
+Double-check your compiler version, to be sure it is compatible.
+
+.. code-block:: console
+
+   $ g++ -v    
+   $ clang -v  
+
+Additional requirements:
+
+* `Boost`_ (1.55+)
+* `Meson`_ (0.48+) and `Ninja`_ (Ninja is only required when using Meson, optional for CMake)
+* `Google Test`_
+* `htslib`_ (1.4+)
+
+For building API documentation locally:
+
+* `Doxygen`_
+
+For maximal convenience, install htslib and Google Test in the same parent directory where you plan to install pbbam.
+
+.. _Boost: http://www.boost.org/
+.. _clang: http://clang.llvm.org/
+.. _Meson: https://mesonbuild.com
+.. _Ninja: https://ninja-build.org/
+.. _Doxygen: http://www.stack.nl/~dimitri/doxygen/
+.. _gcc: https://gcc.gnu.org/
+.. _Google Test: https://github.com/google/googletest
+.. _htslib: https://github.com/samtools/htslib.git 
+
+.. _getting_started-build:
+
+Clone & Build
+-------------
+
+.. note::
+
+   The following steps are for building the C++ library and command-line utilities. 
+   If you are integrating pbbam into a C#, Python, or R project, take a look at the 
+   instructions for :ref:`additional languages <swig_bindings>`.
+
+The basic steps for obtaining pbbam and building it from source are as follows:
+
+Build and install htslib, per the project's instructions (or on OSX "brew install htslib").
+
+Clone
+^^^^^
+
+You should first clone the repository:
+
+.. code-block:: console
+
+   $ git clone https://github.com/PacificBiosciences/pbbam.git
+   $ cd pbbam
+
+Building with Meson
+^^^^^^^^^^^^^^^^^^^
+
+Building with Meson is generally faster and more versatile. Meson strictly requires building out of source:
+
+.. code-block:: console
+
+   $ mkdir build
+   $ cd build
+   $ meson --prefix /my/install/prefix -Dtests=true ..
+   $ ninja
+
+where ninja will by default utilize a number of threads for compilation equal to the number of logical
+cores on your system. Here ``-Dtests=true`` enables pulling in dependencies for testing. In
+order to run the test suite, run:
+
+.. code-block:: console
+
+   $ ninja test
+
+If you wish to install pbbam, run:
+
+.. code-block:: console
+
+   $ ninja install
+
+and ninja will install pbbam to ``/my/install/prefix``.
+
+Integrate
+---------
+
+If you built and installed pbbam, pkg-config files will be available to be consumed by projects
+wishing to utilize pbbam. Autoconf, CMake, Waf, SCons and Meson all have means to determine
+dependency information from pkg-config files.
diff --git a/docs/source/index.rst b/docs/source/index.rst

new file mode 100644 (file)

index 0000000..426c3c5
--- /dev/null
+++ b/docs/source/index.rst
@@ -0,0 +1,33 @@
+.. pbbam documentation master file, created by
+   sphinx-quickstart on Fri Dec  4 10:08:52 2015.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
+
+.. _home:
+
+pbbam documentation
+===================
+
+As of the 3.0 release of SMRTanalysis, PacBio is embracing the industry standard BAM 
+format for (both aligned and unaligned) basecall data files. We have also formulated 
+a BAM companion file format (bam.pbi) enabling fast access to a richer set of per-read 
+information as well as compatibility for software built around the legacy cmp.h5 format.
+
+The **pbbam** software package provides components to create, query, & edit PacBio BAM
+files and associated indices. These components include a core C++ library, bindings for 
+additional languages, and command-line utilities.
+
+.. toctree::
+   :maxdepth: 1
+
+   getting_started
+   api_reference
+   swig_bindings
+   commandline_utilities
+
+
+Search:
+
+* :ref:`genindex`
+* :ref:`search`
+
diff --git a/docs/source/pacbio-theme/static/headerGradient.jpg b/docs/source/pacbio-theme/static/headerGradient.jpg

new file mode 100644 (file)

index 0000000..883f147

Binary files /dev/null and b/docs/source/pacbio-theme/static/headerGradient.jpg differ
diff --git a/docs/source/pacbio-theme/static/pacbio.css b/docs/source/pacbio-theme/static/pacbio.css

new file mode 100644 (file)

index 0000000..b4ab87f
--- /dev/null
+++ b/docs/source/pacbio-theme/static/pacbio.css
@@ -0,0 +1,238 @@
+/**
+ * Sphinx stylesheet -- default theme
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+ 
+@import url("basic.css");
+ 
+/* -- page layout ----------------------------------------------------------- */
+ 
+body {
+    font-family: Arial, sans-serif;
+    font-size: 100%;
+    background-color: #555;
+    color: #555;
+    margin: 0;
+    padding: 0;
+    min-width: 500px;
+    max-width: 956px;
+    margin: 0 auto;
+}
+
+div.documentwrapper {
+    float: left;
+    width: 100%;
+}
+
+div.bodywrapper {
+    margin: 0 0 0 230px;
+}
+
+hr{
+    border: 1px solid #B1B4B6;
+    
+}
+ 
+div.document {
+    background-color: #eee;
+}
+ 
+div.body {
+    background-color: #ffffff;
+    color: #3E4349;
+    padding: 30px 30px 30px 30px;
+    font-size: 0.8em;
+}
+ 
+div.footer {
+    color: #555;
+       background-color: #fff;
+    padding: 13px 0;
+    text-align: center;
+    font-size: 75%;
+
+}
+div.footer a {
+    color: #444;
+    text-decoration: underline;
+}
+ 
+div.related {
+    background: #fff url(headerGradient.jpg);
+    line-height: 80px;
+    color: #fff;
+    font-size: 0.80em;
+    height: 79px;
+    z-index: -1;
+}
+
+div.related ul {
+    background: url(pacbioLogo.png) 10px no-repeat;
+    padding: 0 0 0 200px;
+}
+ 
+div.related a {
+    color: #E2F3CC;
+}
+ 
+div.sphinxsidebar {
+    font-size: 0.75em;
+    line-height: 1.5em;
+}
+
+div.sphinxsidebarwrapper{
+    padding: 20px 0;
+}
+ 
+div.sphinxsidebar h3,
+div.sphinxsidebar h4 {
+    font-family: Arial, sans-serif;
+    color: #222;
+    font-size: 1.2em;
+    font-weight: bold;
+    margin: 0;
+    padding: 5px 10px 0 10px;
+}
+
+div.sphinxsidebar h4{
+    font-size: 1.1em;
+}
+ 
+div.sphinxsidebar h3 a {
+    color: #444;
+}
+ 
+ 
+div.sphinxsidebar p {
+    color: #888;
+    padding: 0px 20px;
+       margin-top: 5px;
+}
+ 
+div.sphinxsidebar p.topless {
+}
+ 
+div.sphinxsidebar ul {
+    margin: 5px 20px 10px 20px;
+    padding: 0;
+    color: #000;
+}
+ 
+div.sphinxsidebar a {
+    color: #444;
+}
+ 
+div.sphinxsidebar input {
+    border: 1px solid #ccc;
+    font-family: sans-serif;
+    font-size: 1em;
+}
+
+div.sphinxsidebar input[type=text]{
+    margin-left: 20px;
+}
+ 
+/* -- body styles ----------------------------------------------------------- */
+ 
+a {
+    color: #005B81;
+    text-decoration: none;
+}
+ 
+a:hover {
+    color: #E32E00;
+    text-decoration: underline;
+}
+ 
+div.body h1,
+div.body h2,
+div.body h3,
+div.body h4,
+div.body h5,
+div.body h6 {
+    font-family: Arial, sans-serif;
+    font-weight: bold;
+    color: #264868;
+    margin: 30px 0px 10px 0px;
+    padding: 5px 0 5px 0px;
+}
+ 
+div.body h1 { border-top: 20px solid white; margin-top: 0; font-size: 180%; font-weight: normal; }
+div.body h2 { font-size: 125%; }
+div.body h3 { font-size: 110%; }
+div.body h4 { font-size: 100%; }
+div.body h5 { font-size: 100%; }
+div.body h6 { font-size: 100%; }
+ 
+a.headerlink {
+    color: #c60f0f;
+    font-size: 0.8em;
+    padding: 0 4px 0 4px;
+    text-decoration: none;
+}
+ 
+a.headerlink:hover {
+    background-color: #c60f0f;
+    color: white;
+}
+ 
+div.body p, div.body dd, div.body li {
+    line-height: 1.5em;
+    font-size: 1em;
+}
+ 
+div.admonition p.admonition-title + p {
+    display: inline;
+}
+
+div.highlight{
+    background-color: white;
+}
+
+div.note {
+    background-color: #eee;
+    border: 1px solid #ccc;
+}
+ 
+div.seealso {
+    background-color: #ffc;
+    border: 1px solid #ff6;
+}
+ 
+div.topic {
+    background-color: #eee;
+}
+ 
+div.warning {
+    background-color: #ffe4e4;
+    border: 1px solid #f66;
+}
+ 
+p.admonition-title {
+    display: inline;
+}
+ 
+p.admonition-title:after {
+    content: ":";
+}
+ 
+pre {
+    padding: 10px;
+    background-color: White;
+    color: #222;
+    line-height: 1.2em;
+    border: 1px solid #C6C9CB;
+    font-size: 1.2em;
+    margin: 1.5em 0 1.5em 0;
+    -webkit-box-shadow: 1px 1px 1px #d8d8d8;
+    -moz-box-shadow: 1px 1px 1px #d8d8d8;
+}
+ 
+tt {
+    background-color: #ecf0f3;
+    color: #222;
+    padding: 1px 2px;
+    font-size: 1.2em;
+    font-family: monospace;
+}
+
diff --git a/docs/source/pacbio-theme/static/pacbioLogo.png b/docs/source/pacbio-theme/static/pacbioLogo.png

new file mode 100644 (file)

index 0000000..b2e4887

Binary files /dev/null and b/docs/source/pacbio-theme/static/pacbioLogo.png differ
diff --git a/docs/source/pacbio-theme/static/pygments.css b/docs/source/pacbio-theme/static/pygments.css

new file mode 100644 (file)

index 0000000..4588cde
--- /dev/null
+++ b/docs/source/pacbio-theme/static/pygments.css
@@ -0,0 +1,55 @@
+.c { color: #999988; font-style: italic } /* Comment */
+.k { font-weight: bold } /* Keyword */
+.o { font-weight: bold } /* Operator */
+.cm { color: #999988; font-style: italic } /* Comment.Multiline */
+.cp { color: #999999; font-weight: bold } /* Comment.preproc */
+.c1 { color: #999988; font-style: italic } /* Comment.Single */
+.gd { color: #000000; background-color: #ffdddd } /* Generic.Deleted */
+.ge { font-style: italic } /* Generic.Emph */
+.gr { color: #aa0000 } /* Generic.Error */
+.gh { color: #999999 } /* Generic.Heading */
+.gi { color: #000000; background-color: #ddffdd } /* Generic.Inserted */
+.go { color: #111 } /* Generic.Output */
+.gp { color: #555555 } /* Generic.Prompt */
+.gs { font-weight: bold } /* Generic.Strong */
+.gu { color: #aaaaaa } /* Generic.Subheading */
+.gt { color: #aa0000 } /* Generic.Traceback */
+.kc { font-weight: bold } /* Keyword.Constant */
+.kd { font-weight: bold } /* Keyword.Declaration */
+.kp { font-weight: bold } /* Keyword.Pseudo */
+.kr { font-weight: bold } /* Keyword.Reserved */
+.kt { color: #445588; font-weight: bold } /* Keyword.Type */
+.m { color: #009999 } /* Literal.Number */
+.s { color: #bb8844 } /* Literal.String */
+.na { color: #008080 } /* Name.Attribute */
+.nb { color: #999999 } /* Name.Builtin */
+.nc { color: #445588; font-weight: bold } /* Name.Class */
+.no { color: #ff99ff } /* Name.Constant */
+.ni { color: #800080 } /* Name.Entity */
+.ne { color: #990000; font-weight: bold } /* Name.Exception */
+.nf { color: #990000; font-weight: bold } /* Name.Function */
+.nn { color: #555555 } /* Name.Namespace */
+.nt { color: #000080 } /* Name.Tag */
+.nv { color: purple } /* Name.Variable */
+.ow { font-weight: bold } /* Operator.Word */
+.mf { color: #009999 } /* Literal.Number.Float */
+.mh { color: #009999 } /* Literal.Number.Hex */
+.mi { color: #009999 } /* Literal.Number.Integer */
+.mo { color: #009999 } /* Literal.Number.Oct */
+.sb { color: #bb8844 } /* Literal.String.Backtick */
+.sc { color: #bb8844 } /* Literal.String.Char */
+.sd { color: #bb8844 } /* Literal.String.Doc */
+.s2 { color: #bb8844 } /* Literal.String.Double */
+.se { color: #bb8844 } /* Literal.String.Escape */
+.sh { color: #bb8844 } /* Literal.String.Heredoc */
+.si { color: #bb8844 } /* Literal.String.Interpol */
+.sx { color: #bb8844 } /* Literal.String.Other */
+.sr { color: #808000 } /* Literal.String.Regex */
+.s1 { color: #bb8844 } /* Literal.String.Single */
+.ss { color: #bb8844 } /* Literal.String.Symbol */
+.bp { color: #999999 } /* Name.Builtin.Pseudo */
+.vc { color: #ff99ff } /* Name.Variable.Class */
+.vg { color: #ff99ff } /* Name.Variable.Global */
+.vi { color: #ff99ff } /* Name.Variable.Instance */
+.il { color: #009999 } /* Literal.Number.Integer.Long */
+
diff --git a/docs/source/pacbio-theme/theme.conf b/docs/source/pacbio-theme/theme.conf

new file mode 100644 (file)

index 0000000..dd24a1a
--- /dev/null
+++ b/docs/source/pacbio-theme/theme.conf
@@ -0,0 +1,4 @@
+[theme]
+inherit = default 
+stylesheet = pacbio.css
+pygments_style = tango
diff --git a/docs/source/requirements.txt b/docs/source/requirements.txt

new file mode 100644 (file)

index 0000000..cd6467e
--- /dev/null
+++ b/docs/source/requirements.txt
@@ -0,0 +1 @@
+breathe
diff --git a/docs/source/tools/bam2sam.rst b/docs/source/tools/bam2sam.rst

new file mode 100644 (file)

index 0000000..4577686
--- /dev/null
+++ b/docs/source/tools/bam2sam.rst
@@ -0,0 +1,21 @@
+.. _bam2sam:
+
+bam2sam
+=======
+
+::
+
+  Usage: bam2sam [options] [input]
+
+  bam2sam converts a BAM file to SAM. It is essentially a stripped-down 'samtools
+  view', mostly useful for testing/debugging without requiring samtools. Input BAM
+  file is read from a file or stdin, and SAM output is written to stdout.
+
+  Options:
+    -h, --help            show this help message and exit
+    --version             show program's version number and exit
+
+  Options:
+    input               Input BAM file. If not provided, stdin will be used as input.
+    --no-header         Omit header from output.
+    --header-only       Print only the header (no records).
diff --git a/docs/source/tools/pbindex.rst b/docs/source/tools/pbindex.rst

new file mode 100644 (file)

index 0000000..e7c491f
--- /dev/null
+++ b/docs/source/tools/pbindex.rst
@@ -0,0 +1,18 @@
+.. _pbindex:
+
+pbindex
+=======
+
+::
+
+  Usage: pbindex <input>
+
+  pbindex creates an index file that enables random-access to PacBio-specific data
+  in BAM files. Generated index filename will be the same as input BAM plus .pbi suffix.
+
+  Options:
+    -h, --help            show this help message and exit
+    --version             show program's version number and exit
+
+  Input/Output:
+    input                 Input BAM file
diff --git a/docs/source/tools/pbindexdump.rst b/docs/source/tools/pbindexdump.rst

new file mode 100644 (file)

index 0000000..6829064
--- /dev/null
+++ b/docs/source/tools/pbindexdump.rst
@@ -0,0 +1,233 @@
+.. _pbindexdump:
+
+pbindexdump
+===========
+
+::
+
+  Usage: pbindexdump [options] [input]
+
+  pbindexdump prints a human-readable view of PBI data to stdout.
+
+  Options:
+    -h, --help            show this help message and exit
+    --version             show program's version number and exit
+
+  Input/Output:
+    input               Input PBI file. If not provided, stdin will be used as input.
+    --format=STRING     Output format, one of:
+                            json, cpp
+
+                        json: pretty-printed JSON [default]
+
+                        cpp: copy/paste-able C++ code that can be used to
+                        construct the equivalent PacBio::BAM::PbiRawData object
+
+  JSON Formatting:
+    --json-indent-level=INT
+                        JSON indent level [4]
+    --json-raw          Prints fields in a manner that more closely reflects the
+                        PBI file format - presenting data as per-field columns,
+                        not per-record objects.
+
+JSON Output Schemas
+-------------------
+
+Normal JSON:
+
+.. code-block:: JSON
+
+    {
+      "type": "object",
+      "properties": {
+        "fileSections": {
+          "type": "array",
+          "items": { "type": "string" }
+        },
+        "numReads": { "type": "integer" },
+        "reads": {
+          "type": "array",
+          "items": {
+            "type": "object",
+            "properties": {
+              "aEnd": { "type": "integer" },
+              "aStart": { "type": "integer" },
+              "bcForward": { "type": "integer" },
+              "bcQuality": { "type": "integer" },
+              "bcReverse": { "type": "integer" },
+              "contextFlag": { "type": "integer" },
+              "fileOffset": { "type": "integer" },
+              "holeNumber": { "type": "integer" },
+              "mapQuality": { "type": "integer" },
+              "nM": { "type": "integer" },
+              "nMM": { "type": "integer" },
+              "qEnd": { "type": "integer" },
+              "qStart": { "type": "integer" },
+              "readQuality": { "type": "number" },
+              "reverseStrand": { "type": "integer" },
+              "rgId": { "type": "integer" },
+              "tEnd": { "type": "integer" },
+              "tId": { "type": "integer" },
+              "tStart": { "type": "integer" }
+            },
+            "required": [
+              "contextFlag",
+              "fileOffset",
+              "holeNumber",
+              "qEnd",
+              "qStart",
+              "readQuality",
+              "rgId"
+            ]
+          }
+        },
+        "references": {
+          "type": "array",
+          "items": {
+            "type": "object",
+            "properties": {
+              "beginRow": { "type": "integer" },
+              "endRow": { "type": "integer" },
+              "tId": { "type": "integer" }
+            },
+            "required" : [ "beginRow", "endRow","tId" ]
+          }
+        },
+        "version": { "type": "string" }
+      },
+      "required": [
+        "fileSections",
+        "numReads",
+        "reads",
+        "version"
+      ]
+    }
+
+"Raw" JSON:
+
+.. code-block:: JSON
+
+    {
+      "type": "object",
+      "properties": {
+        "barcodeData" : {
+          "type" : "object",
+          "properties": {
+            "bcForward" : {
+              "type": "array",
+              "items" : { "type": "integer" }
+            },
+            "bcQuality" : {
+              "type": "array",
+              "items" : { "type": "integer" }
+            },
+            "bcReverse" : {
+              "type": "array",
+              "items" : { "type": "integer" }
+            }
+          }
+        },
+        "basicData" : {
+          "type" : "object",
+          "properties": {
+            "contextFlag" : {
+              "type": "array",
+              "items" : { "type": "integer" }
+            },
+            "fileOffset" : {
+              "type": "array",
+              "items" : { "type": "integer" }
+            },
+            "holeNumber" : {
+              "type": "array",
+              "items" : { "type": "integer" }
+            },
+            "qEnd" : {
+              "type": "array",
+              "items" : { "type": "integer" }
+            },
+            "qStart" : {
+              "type": "array",
+              "items" : { "type": "integer" }
+            },
+            "readQuality" : {
+              "type": "array",
+              "items" : { "type": "number" }
+            },
+            "rgId" : {
+              "type": "array",
+              "items" : { "type": "integer" }
+            }
+          }
+        },
+        "fileSections": {
+          "type": "array",
+          "items": { "type": "string" }
+        },
+        "mappedData" : {
+          "type" : "object",
+          "properties": {
+            "aEnd" : {
+              "type": "array",
+              "items" : { "type": "integer" }
+            },
+            "aStart" : {
+              "type": "array",
+              "items" : { "type": "integer" }
+            },
+            "mapQuality" : {
+              "type": "array",
+              "items" : { "type": "integer" }
+            },
+            "nM" : {
+              "type": "array",
+              "items" : { "type": "integer" }
+            },
+            "nMM" : {
+              "type": "array",
+              "items" : { "type": "integer" }
+            },
+            "readQuality" : {
+              "type": "array",
+              "items" : { "type": "number" }
+            },
+            "reverseStrand" : {
+              "type": "array",
+              "items" : { "type": "integer" }
+            },
+            "tEnd" : {
+              "type": "array",
+              "items" : { "type": "integer" }
+            },
+            "tId" : {
+              "type": "array",
+              "items" : { "type": "integer" }
+            },
+            "tStart" : {
+              "type": "array",
+              "items" : { "type": "integer" }
+            }
+          }
+        },
+        "numReads": { "type": "integer" },
+        "references": {
+          "type": "array",
+          "items": {
+            "type": "object",
+            "properties": {
+              "beginRow": { "type": "integer" },
+              "endRow": { "type": "integer" },
+              "tId": { "type": "integer" }
+            },
+            "required" : [ "beginRow", "endRow","tId" ]
+          }
+        },
+        "version" : { "type": "string" }
+      },
+      "required": [
+        "fileSections",
+        "numReads",
+        "basicData",
+        "version"
+      ]
+    }
diff --git a/docs/source/tools/pbmerge.rst b/docs/source/tools/pbmerge.rst

new file mode 100644 (file)

index 0000000..937ec56
--- /dev/null
+++ b/docs/source/tools/pbmerge.rst
@@ -0,0 +1,30 @@
+.. _pbmerge:
+
+pbmerge
+=======
+
+::
+
+  Usage: pbmerge [options] [-o <out.bam>] <INPUT>
+
+  pbmerge merges PacBio BAM files. If the input is DataSetXML, any filters will be
+  applied. If no output filename is specified, new BAM will be written to stdout.
+
+  Options:
+  -h, --help            show this help message and exit
+  --version             show program's version number and exit
+
+  Input/Output:
+    -o output           Output BAM filename.
+    --no-pbi            Set this option to skip PBI index file creation. PBI
+                        creation is automatically skipped if no output filename
+                        is provided.
+    INPUT               Input may be one of:
+                            DataSetXML, list of BAM files, or FOFN
+
+                            fofn: pbmerge -o merged.bam bams.fofn
+
+                            bams: pbmerge -o merged.bam 1.bam 2.bam 3.bam
+
+                            xml:  pbmerge -o merged.bam foo.subreadset.xml
+
diff --git a/docs/specs/pbbam.rst b/docs/specs/pbbam.rst

new file mode 100644 (file)

index 0000000..993cae2
--- /dev/null
+++ b/docs/specs/pbbam.rst
@@ -0,0 +1,630 @@
+=================================================================
+**pbbam Software Design & Functional Specification**
+=================================================================
+| *Version 0.1*
+| *Pacific Biosciences Engineering Group*
+| *Jan 29, 2016*
+
+1. Revision History
+===================
+
++-------------+---------------+--------------------+---------------------------+
+| **Date**    | **Revision**  | **Author(s)**      | **Comments**              |
++=============+===============+====================+===========================+
+| 01-29-2016  | 0.1           | Derek Barnett      | Initial draft created     |
+|             |               |                    |                           |
++-------------+---------------+--------------------+---------------------------+
+
+2. Introduction
+===============
+
+2.1. Document Specification Identifier
+--------------------------------------
+
++-----------------------------------+------------------------------------------+
+| **Document Specification Prefix** | **Description**                          |
++===================================+==========================================+
+| FS\_SA\_PBBAM\_                   | Functional spec for pbbam                |
++-----------------------------------+------------------------------------------+
+
+2.2. Purpose
+------------
+
+This document is intended to describe the requirements and interface of the pbbam
+library, which provides functionality for creating, querying, and editing PacBio
+BAM files and associated file formats.
+
+2.3. Scope of Document
+----------------------
+
+This document covers the expected usage of the pbbam library, as well as any
+desired or required performance characteristics with respect to quality or speed.
+
+This document does not provide installation instructions or API documentation.
+
+2.4. Glossary of Terms
+----------------------
+
+The table below specifies only terms specific to this document, and skips
+acronyms/terms that are specified in `Pacific Biosciences Software Glossary`_.
+
+.. _Pacific Biosciences Software Glossary: http://smrtanalysis-docs/pb_sw_glossary.html
+
++------------------+-----------------------------------------------------------+
+| **Acronym/Term** | **Description**                                           |
++==================+===========================================================+
+| API              | Application Programming Interface - a set of routines,    |
+|                  | protocols, and tools for building software applications.  |
+|                  | In this document, this will consist of one or more        |
+|                  | cooperating libraries that specify data structures,       |
+|                  | methods, etc. for use within a target programming         |
+|                  | language.                                                 |
++------------------+-----------------------------------------------------------+
+| Client           | An application that uses the library.                     |
++------------------+-----------------------------------------------------------+
+| I/O              | Input/output of data.                                     |
++------------------+-----------------------------------------------------------+
+
+2.5. References
+---------------
+
++-------------+------------------------------+--------------------------------------+
+| **Ref No.** | **Document Name, Link**      | **Description**                      |
++=============+==============================+======================================+
+| (1)         | `BAM format`_                | General SAM/BAM specification        |
++-------------+------------------------------+--------------------------------------+
+| (2)         | `PacBio BAM`_                | PacBio BAM specification             |
++-------------+------------------------------+--------------------------------------+
+| (3)         | `PacBio BAM index`_          | PacBio BAM index specification       |
++-------------+------------------------------+--------------------------------------+
+| (4)         | `DataSet XML`_               | PacBio DataSet XML specification     |
++-------------+------------------------------+--------------------------------------+
+| (5)         | `Software Style Guide`_      | PacBio coding standards              |
++-------------+------------------------------+--------------------------------------+
+| (6)         | `SMRT Analysis`_             | General SMRT Analysis infrastructure |
++-------------+------------------------------+--------------------------------------+
+
+.. _BAM format: https://samtools.github.io/hts-specs/SAMv1.pdf
+.. _PacBio BAM: http://pacbiofileformats.readthedocs.org/en/3.0/BAM.html
+.. _PacBio BAM index: http://pacbiofileformats.readthedocs.org/en/3.0/PacBioBamIndex.html
+.. _DataSet XML: https://github.com/PacificBiosciences/PacBioFileFormats/blob/3.0/DataSet.rst
+.. _Software Style Guide: http://smrtanalysis-docs/_downloads/PBISoftwareStyleGuide.doc
+.. _SMRT Analysis: http://smrtanalysis-docs/smrt_docs.html
+
+3. Software Overview
+====================
+
+3.1. Product Description
+------------------------
+
+As of the 3.0 release of SMRTanalysis, PacBio is embracing the industry standard
+`BAM format`_ (1) for (both aligned and unaligned) basecall data files. We have
+also formulated a BAM companion file format (.bam.pbi) enabling fast access to a
+richer set of per-read information as well as compatibility for software built
+around the legacy cmp.h5 format.
+
+The pbbam library provides components to create, query, & transform PacBio BAM
+data: sequence files and their associated indices. This includes a core C++
+library as well as bindings for additional programming languages.
+
+3.2. Product Functional Capabilities
+------------------------------------
+
+The library must be able to read and write BAM files that conform to the
+`PacBio BAM`_ specification (2). BAM records must be editable e.g. adding
+alignment information. Random access must be supported, whether by genomic
+region or by filtering record features. To this end, the library will be able to
+read, write, and create associated index files - both the standard BAM index
+(.bai) and the `PacBio BAM index`_ (.pbi) (3). In addition to working with
+individual files, datasets of related BAM files will be supported. These are
+described in a `DataSet XML`_ document. (4)
+
+3.3. User Characteristics
+-------------------------
+
++---------------------+--------------------------------------------------------+
+| **User Class/Role** | **User Knowledge and Skill Levels**                    |
++=====================+========================================================+
+| Developer           | Competence in one or more programming languages        |
+|                     | supported (C++, R, Python, C#). No knowledge of        |
+|                     | molecular biology wet lab techniques required.         |
++---------------------+--------------------------------------------------------+
+
+3.4. User Operations and Practices
+----------------------------------
+
+Developer users will interact with the software by incorporating the library
+into a client application.
+
+3.5. Operating Environment
+--------------------------
+
+The software is intended to be run in a Linux or OSX environment, with ideally 4
+or more cores.
+
+3.6. Design and Implementation Constraints
+------------------------------------------
+
+Currently there are no constraints outside the operating environment and speed
+requirements. In particular, as the library will be used for writing the BAM
+files coming off a Sequel instrument, it should be able to keep pace.
+
+3.7. Assumptions and Dependencies
+---------------------------------
+
+Input routines for the library will expect to receive files that conform to the
+`PacBio BAM`_ (2) or `DataSet XML`_ (4) specifications.
+
+The pbbam library depends on Boost, zlib, and htslib libraries.
+
+3.8. Other Software
+-------------------
+
+Output PacBio BAMs will be compatible with the `PacBio BAM`_ specification (2)
+and thus compatible with the general `BAM format`_ specification (1). This
+ensures that a wide variety of downstream tools can interact with data files.
+
+The software uses `Meson`_ as its build system.
+
+The core C++ API relies on the following 3rd party components:
+
+* `zlib`_
+* `htslib`_
+* `Boost`_ (header-only modules)
+
+Wrapper APIs for additional languages (Python, R, C#) are generated by `SWIG`_.
+
+API documentation is generated via `Doxygen`_.
+
+.. _Meson: https://mesonbuild.com
+.. _zlib: http://www.zlib.net/
+.. _htslib: https://github.com/samtools/htslib
+.. _Boost: http://www.boost.org/
+.. _SWIG: http://www.swig.org/
+.. _Doxygen: http://www.stack.nl/~dimitri/doxygen/
+
+4. External Interfaces
+======================
+
+4.1. User Interfaces
+--------------------
+
+N/A
+
+4.2. Software Interfaces
+------------------------
+
+pbbam will require the following software:
+
+* `htslib`_ & `zlib`_ - provides low-level handling of compressed BAM data
+* `Boost`_ - provides utility classes
+
+Incoming data from upstream components will be compliant with
+PacBio BAM format - see `PacBio BAM`_ specification (2) for more detail.
+
+4.3. Hardware Interfaces
+------------------------
+
+N/A
+
+4.4. Communications Interfaces
+------------------------------
+
+N/A
+
+5. Functional Requirements
+==========================
+
+5.1. Query BAM data by genomic region
+-----------------------------------------
+
+5.1.1. Description
+~~~~~~~~~~~~~~~~~~
+
+pbbam shall allow client applications to query data, limited to some genomic
+region of interest.
+
+5.1.2. Inputs
+~~~~~~~~~~~~~
+
+* BAM file(s) or DataSet XML
+* a standard index (.bai) for each source BAM file
+* genomic interval (e.g. "chr1:1000-2000")
+
+5.1.3. Processing
+~~~~~~~~~~~~~~~~~
+
+Obtain an `htslib`_ "iterator" object for a given file and region. This will be
+wrapped by pbbam to hide the low-level nature of this type, as well as handling
+memory lifetime.
+
+5.1.4. Outputs
+~~~~~~~~~~~~~~
+
+Iterator providing access to individual BAM records from the input data sources,
+which are aligned to the requested genomic interval.
+
+For example:
+
+.. code:: c++
+
+    GenomicIntervalQuery query(interval, dataset);
+    for (const BamRecord& record : query) {
+        // ... do stuff ...
+    }
+
+
+5.1.5. Regulatory Compliance
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+N/A
+
+5.2. Query BAM data by filter criteria
+-----------------------------------------
+
+5.2.1. Description
+~~~~~~~~~~~~~~~~~~
+
+pbbam shall allow client applications to query data, limited to some filter
+criteria (e.g. only reads from ZMW hole number 200 with a read quality of >0.5).
+
+5.2.2. Inputs
+~~~~~~~~~~~~~
+
+* BAM file(s) or DataSet XML
+* a `PacBio BAM index`_ (.pbi) for each source BAM file
+* filters supported by data contained in the PBI
+
+5.2.3. Processing
+~~~~~~~~~~~~~~~~~
+
+Query PBI file(s) for records that match the provided filter criteria. Merge
+contiguous runs of records into record blocks, to minimize seeks. Advancing the
+iterator either reads the next record from the current block or seeks to the next
+block and fetches the next record.
+
+5.2.4. Outputs
+~~~~~~~~~~~~~~
+
+Iterator providing access to individual BAM records from the input data sources,
+which satisfy the requested filter criteria.
+
+For example:
+
+.. code:: c++
+
+    PbiFilterQuery query(filter, dataset);
+    for (const BamRecord& record : query) {
+        // ... do stuff ...
+    }
+
+5.2.5. Regulatory Compliance
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+N/A
+
+5.3. Write PacBio BAM data
+------------------------------------------
+
+5.3.1. Description
+~~~~~~~~~~~~~~~~~~
+
+pbbam shall be able to write `PacBio BAM`_ files conforming to the specification.
+
+5.3.2. Inputs
+~~~~~~~~~~~~~
+
+* filename
+* header information
+* BAM records
+
+5.3.3. Processing
+~~~~~~~~~~~~~~~~~
+
+Create file handle for the provided filename, output initial header information.
+As records are passed in, write to file. Upon completion, flush any buffers and
+close file handle.
+
+Multithreading, provided by `htslib`_, will be utilized where possible to speed
+up the compression process - often the main bottleneck of BAM throughput.
+
+5.3.4. Outputs
+~~~~~~~~~~~~~~
+
+BAM file conforming to the `PacBio BAM`_ specification.
+
+5.3.5. Regulatory Compliance
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+N/A
+
+5.4. Create PacBio BAM index file
+------------------------------------------
+
+5.4.1. Description
+~~~~~~~~~~~~~~~~~~
+
+Much of PacBio BAM data processing relies on the presence of a `PacBio BAM index`_
+file. pbbam shall be able to generate this file type for a `PacBio BAM`_ file.
+
+5.4.2. Inputs
+~~~~~~~~~~~~~
+
+`PacBio BAM`_ file
+
+5.4.3. Processing
+~~~~~~~~~~~~~~~~~
+
+Read through the input BAM records, storing the values relevant to a PBI index.
+At end of file, write the index contents to a file and close.
+
+5.4.4. Outputs
+~~~~~~~~~~~~~~
+
+`PacBio BAM index`_ file
+
+5.4.5. Regulatory Compliance
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+N/A
+
+6. Non-Functional Requirements
+==============================
+
+6.1. Performance Requirements
+-----------------------------
+
+Since pbbam will be used to write all BAM files coming off a Sequel device, the
+library must keep pace with data generation requirements.
+
+**TODO: come back to this with hard numbers.**
+
+6.2. Safety Requirements
+------------------------
+
+N/A
+
+6.3. Security Requirements
+--------------------------
+
+N/A
+
+6.4. Quality Attributes
+-----------------------
+
+6.4.1. Availability
+~~~~~~~~~~~~~~~~~~~
+
+N/A
+
+6.4.2. Integrity
+~~~~~~~~~~~~~~~~
+
+Inputs and outputs shall adhere to the PacBio BAM or DataSet XML specifications.
+Files that do not meet this requirement will raise exceptions and will not be
+accepted.
+
+6.4.3. Interoperability
+~~~~~~~~~~~~~~~~~~~~~~~
+
+Inputs and outputs shall adhere to the PacBio BAM or DataSet XML specifications.
+
+6.4.4. Reliability
+~~~~~~~~~~~~~~~~~~
+
+The developed software shall meet the overall product reliability requirements.
+
+6.4.5. Robustness
+~~~~~~~~~~~~~~~~~
+
+pbbam will raise exceptions upon encountering failure cases, allowing client
+applications to recover or report the error to a UI.
+
+6.4.6. Usability
+~~~~~~~~~~~~~~~~
+
+pbbam shall have comprehensive API documentation, available both on- and offline.
+Further documentation will be provided for installation, API usage tips, etc.
+
+Raised exceptions shall carry as much information as possible so that client
+applications can respond with appropriate actions or display useful messages.
+
+6.4.7. Maintainability
+~~~~~~~~~~~~~~~~~~~~~~
+
+The source code of the software covered in this functional specification shall
+adhere to the PacBio `Software Style Guide`_ (5) work instruction, to guarantee
+high quality of code that facilitates maintainability.
+
+6.4.8. Customizability
+~~~~~~~~~~~~~~~~~~~~~~
+
+N/A
+
+6.5. Business Rules
+-------------------
+
+N/A
+
+6.6. Installation and Upgrade
+-----------------------------
+
+Installation and Upgrade of this software will be handled as part of the SMRT
+Analysis subsystem. See `SMRT Analysis`_ (6) specifications for more detail.
+
+Additionally, the library may be built independently, either from internal
+version control (Perforce) or from the public-facing Github repository. In
+either case, `Meson`_ is used to drive the build process.
+
+6.7. Administration
+-------------------
+
+N/A
+
+6.8. User Documentation
+-----------------------
+
+pbbam shall have comprehensive API documentation, available both on- and offline.
+Further documentation will be provided for installation, API usage tips, etc.
+
+The "offline" API documentation may be built directly from the source code, using
+`Doxygen`_. Online documentation will be generated via a continuous integration
+server, thus ensuring it is always pointing to the current codebase.
+
+7. High Level Design
+====================
+
+7.1. Top Level Context
+----------------------
+
+The pbbam library is intended to be linked in with client applications,
+providing programmatic access to data files.
+
+7.2. Use Cases
+--------------
+
+Primary use cases for pbbam include:
+
+* BAM file creation
+* BAM file query - iterable access to various subsets of data
+
+8. Detailed Design
+==================
+
+8.1. Structural Representation
+------------------------------
+
+ *image(s) here*
+
+8.2. Behavioral Representation
+------------------------------
+
+This section provides behavioral (dynamic) representation of how the
+elements of the system realize the required use cases.
+
+Describe how the significant subsystems and classes interact with each
+other to realize the architecturally significant use cases.
+
+Provide a link to a file containing Sequence Diagram or Activity Diagram, when applicable.
+The link may be provided with use of 'image' directive.
+
+Sequence Diagram shows one use case scenario, executed by class model,
+with sequence of operations over period of time (time increased from top
+to bottom). It shows interactions between objects, but does not show
+relationships between them.
+
+Activity Diagram is a virtual representation of the sequential flow and
+control logic of a set of related activities or actions. It is a type of
+flowchart, frequently called Swim Lane Diagram, because activities of
+each entity are presented within its swim lane.
+
+Note: You may use http://wsd tool to auto-generate a sequence diagram from
+a descriptive text file, save the diagram to the wsd site, get link to the image,
+and add this link to the document with use of 'image' directive.
+
+8.3. Information Storage
+------------------------
+
+pbbam software requires no persistent storage outside of availability of input
+and output during analysis.
+
+8.4. Technology Overview
+------------------------
+
+pbbam is implemented in C++11 and should perform as designed on any UNIX-like
+operating system (Linux distributions, Apple OSX, etc.).
+
+8.5. SOUP Components
+--------------------
+
+pbbam utilizes Meson for its build system. The C++ library uses the following
+3rd-party software components: Boost, htslib and zlib.
+
+8.6. Deployment and Configuration
+---------------------------------
+
+Please refer to `SMRT Analysis`_ (6) documentation.
+
+9. Automated Tests
+==================
+
+9.1. Unit Testing
+-----------------
+
+The library shall have unit tests for all classes & components.
+
+9.2. Performance Testing
+------------------------
+
+Unit tests may evaluate performance requirements as desired.
+
+9.3. Regression Testing
+-----------------------
+
+As its role is primarily in data I/O, pbbam has no "scientific quality/validity"
+metrics that would indicate a regression. Instead, passing its unit tests and
+end-to-end tests will indicate that a regression has not been introduced.
+
+These tests will be run after each check-in and nightly.
+
+10. Requirements Traceability Matrices
+======================================
+
+This section provides traces from requirements specified in PRD/DIR documents to the
+requirements covered in this functional specification, and from these
+functional requirements to corresponding Test Cases/Procedures.
+
+10.1. HPQC Functional Specifications
+------------------------------------
+
++-------------+---------------------------+---------------------------------------------------+-------------+------------+-----------+-------------------------------------------+
+| **PBI_ID**  | **Name**                  | **Description**                                   | **Comment** | **Metric** | **Owner** | **PRD/DIR Path**                          |
++=============+===========================+===================================================+=============+============+===========+===========================================+
+| 5.1         | Query BAM data by         | pbbam shall allow client applications to query    |             |            | dbarnett  |                                           |
+|             | genomic region            | data, limited to some genomic region of interest. |             |            |           |                                           |
+|             |                           |                                                   |             |            |           |                                           |
++-------------+---------------------------+---------------------------------------------------+-------------+------------+-----------+-------------------------------------------+
+| 5.2         | Query BAM data by         | pbbam shall allow client applications to query    |             |            | dbarnett  |                                           |
+|             | filter criteria           | data, limited to some filter criteria (e.g. only  |             |            |           |                                           |
+|             |                           | reads from ZMW hole number 200 with a read        |             |            |           |                                           |
+|             |                           | quality of >0.5).                                 |             |            |           |                                           |
+|             |                           |                                                   |             |            |           |                                           |
++-------------+---------------------------+---------------------------------------------------+-------------+------------+-----------+-------------------------------------------+
+| 5.3         | Write PacBio BAM data     | pbbam shall be able to write files conforming to  |             |            | dbarnett  |                                           |
+|             |                           | the `PacBio BAM`_ specification.                  |             |            |           |                                           |
+|             |                           |                                                   |             |            |           |                                           |
++-------------+---------------------------+---------------------------------------------------+-------------+------------+-----------+-------------------------------------------+
+| 5.4         | Create PacBio BAM index   | Much of PacBio BAM data processing relies on the  |             |            | dbarnett  |                                           |
+|             | file                      | presence of a `PacBio BAM index`_ file. pbbam     |             |            |           |                                           |
+|             |                           | shall be able to generate this file type for a    |             |            |           |                                           |
+|             |                           | `PacBio BAM`_ file.                               |             |            |           |                                           |
+|             |                           |                                                   |             |            |           |                                           |
++-------------+---------------------------+---------------------------------------------------+-------------+------------+-----------+-------------------------------------------+
+
+10.2. Automated Tests Coverage
+------------------------------
+
++-------------+---------------------------+----------------------------------------------------+------------------------------------------------------------------+
+| **FS Item** | **FS Item Title**         | **Use Case Description**                           | **Test Case Name/ID**                                            |
++=============+===========================+====================================================+==================================================================+
+| 5.1         | Query BAM data by         | pbbam shall allow client applications to query     | TODO                                                             |
+|             | genomic region            | data, limited to some genomic region of interest.  |                                                                  |
+|             |                           |                                                    |                                                                  |
++-------------+---------------------------+----------------------------------------------------+------------------------------------------------------------------+
+| 5.2         | Query BAM data by         | pbbam shall allow client applications to query     | TODO                                                             |
+|             | filter criteria           | data, limited to some filter criteria (e.g. only   |                                                                  |
+|             |                           | reads from ZMW hole number 200 with a read         |                                                                  |
+|             |                           | quality of >0.5).                                  |                                                                  |
+|             |                           |                                                    |                                                                  |
++-------------+---------------------------+----------------------------------------------------+------------------------------------------------------------------+
+| 5.3         | Write PacBio BAM data     | pbbam shall be able to write files conforming to   | TODO                                                             |
+|             |                           | the `PacBio BAM`_ specification.                   |                                                                  |
+|             |                           |                                                    |                                                                  |
++-------------+---------------------------+----------------------------------------------------+------------------------------------------------------------------+
+| 5.4         | Create PacBio BAM index   | Much of PacBio BAM data processing relies on the   | TODO                                                             |
+|             | file                      | presence of a `PacBio BAM index`_ file. pbbam      |                                                                  |
+|             |                           | shall be able to generate this file type for a     |                                                                  |
+|             |                           | `PacBio BAM`_ file.                                |                                                                  |
+|             |                           |                                                    |                                                                  |
++-------------+---------------------------+----------------------------------------------------+------------------------------------------------------------------+
+
diff --git a/docs/specs/pbbam_structure.png b/docs/specs/pbbam_structure.png

new file mode 100755 (executable)

index 0000000..40f50cf

Binary files /dev/null and b/docs/specs/pbbam_structure.png differ
diff --git a/docs/specs/pbbam_updated_release3_2.rst b/docs/specs/pbbam_updated_release3_2.rst

new file mode 100755 (executable)

index 0000000..72d9b76
--- /dev/null
+++ b/docs/specs/pbbam_updated_release3_2.rst
@@ -0,0 +1,618 @@
+=============================================================
+**Pbbam Core API Software Design & Functional Specification**
+=============================================================
+| *Version 0.2*
+| *Pacific Biosciences Engineering Group*
+| *Oct 17, 2016*
+
+1. Revision History
+===================
+
++-------------+---------------+--------------------+---------------------------------+
+| **Date**    | **Revision**  | **Author(s)**      | **Comments**                    |
++=============+===============+====================+=================================+
+| 01-29-2016  | 0.1           | Derek Barnett      | Initial draft created           |
+|             |               |                    |                                 |
++-------------+---------------+--------------------+---------------------------------+
+| 10-17-2016  | 0.2           | Derek Barnett      | Added behavioral representation |
+|             |               |                    | and structural representation   |
+|             |               |                    | diagram                         |
++-------------+---------------+--------------------+---------------------------------+
+
+2. Introduction
+===============
+
+2.1. Document Specification Identifier
+--------------------------------------
+
++-----------------------------------+------------------------------------------+
+| **Document Specification Prefix** | **Description**                          |
++===================================+==========================================+
+| FS\_SA\_PBBAM\_                   | Functional spec for pbbam                |
++-----------------------------------+------------------------------------------+
+
+2.2. Purpose
+------------
+
+This document is intended to describe the requirements and interface of the pbbam
+library, which provides functionality for creating, querying, and editing PacBio
+BAM files and associated file formats.
+
+2.3. Scope of Document
+----------------------
+
+This document covers the expected usage of the pbbam library, as well as any
+desired or required performance characteristics with respect to quality or speed.
+
+This document does not provide installation instructions or API documentation.
+
+2.4. Glossary of Terms
+----------------------
+
+The table below specifies only terms specific to this document, and skips
+acronyms/terms that are specified in `Pacific Biosciences Software Glossary`_.
+
+.. _Pacific Biosciences Software Glossary: http://smrtanalysis-docs/pb_sw_glossary.html
+
++------------------+-----------------------------------------------------------+
+| **Acronym/Term** | **Description**                                           |
++==================+===========================================================+
+| API              | Application Programming Interface - a set of routines,    |
+|                  | protocols, and tools for building software applications.  |
+|                  | In this document, this will consist of one or more        |
+|                  | cooperating libraries that specify data structures,       |
+|                  | methods, etc. for use within a target programming         |
+|                  | language.                                                 |
++------------------+-----------------------------------------------------------+
+| Client           | An application that uses the library.                     |
++------------------+-----------------------------------------------------------+
+| I/O              | Input/output of data.                                     |
++------------------+-----------------------------------------------------------+
+
+2.5. References
+---------------
+
++-------------+------------------------------+--------------------------------------+
+| **Ref No.** | **Document Name, Link**      | **Description**                      |
++=============+==============================+======================================+
+| (1)         | `BAM format`_                | General SAM/BAM specification        |
++-------------+------------------------------+--------------------------------------+
+| (2)         | `PacBio BAM`_                | PacBio BAM specification             |
++-------------+------------------------------+--------------------------------------+
+| (3)         | `PacBio BAM index`_          | PacBio BAM index specification       |
++-------------+------------------------------+--------------------------------------+
+| (4)         | `DataSet XML`_               | PacBio DataSet XML specification     |
++-------------+------------------------------+--------------------------------------+
+| (5)         | `Software Style Guide`_      | PacBio coding standards              |
++-------------+------------------------------+--------------------------------------+
+| (6)         | `SMRT Analysis`_             | General SMRT Analysis infrastructure |
++-------------+------------------------------+--------------------------------------+
+
+.. _BAM format: https://samtools.github.io/hts-specs/SAMv1.pdf
+.. _PacBio BAM: http://pacbiofileformats.readthedocs.org/en/3.0/BAM.html
+.. _PacBio BAM index: http://pacbiofileformats.readthedocs.org/en/3.0/PacBioBamIndex.html
+.. _DataSet XML: https://github.com/PacificBiosciences/PacBioFileFormats/blob/3.0/DataSet.rst
+.. _Software Style Guide: http://smrtanalysis-docs/_downloads/PBISoftwareStyleGuide.doc
+.. _SMRT Analysis: http://smrtanalysis-docs/smrt_docs.html
+
+3. Software Overview
+====================
+
+3.1. Software Module Description
+--------------------------------
+
+As of the 3.0 release of SMRT Analysis, PacBio is embracing the industry standard
+`BAM format`_ (1) for (both aligned and unaligned) basecall data files. We have
+also formulated a BAM companion file format (.bam.pbi) enabling fast access to a
+richer set of per-read information as well as compatibility for software built
+around the legacy cmp.h5 format.
+
+The pbbam library provides components to create, query, & transform PacBio BAM
+data: sequence files and their associated indices. This includes a core C++
+library as well as bindings for additional programming languages.
+
+3.2. Software Module Functional Capabilities
+--------------------------------------------
+
+The library must be able to read and write BAM files that conform to the
+`PacBio BAM`_ specification (2). BAM records must be editable e.g. adding
+alignment information. Random access must be supported, whether by genomic
+region or by filtering record features. To this end, the library will be able to
+read, write, and create associated index files - both the standard BAM index
+(.bai) and the `PacBio BAM index`_ (.pbi) (3). In addition to working with
+individual files, datasets of related BAM files will be supported. These are
+described in a `DataSet XML`_ document. (4)
+
+3.3. User Characteristics
+-------------------------
+
++---------------------+--------------------------------------------------------+
+| **User Class/Role** | **User Knowledge and Skill Levels**                    |
++=====================+========================================================+
+| Developer           | Competence in one or more programming languages        |
+|                     | supported (C++, R, Python, C#). No knowledge of        |
+|                     | molecular biology wet lab techniques required.         |
++---------------------+--------------------------------------------------------+
+
+3.4. User Operations and Practices
+----------------------------------
+
+Developer users will interact with the software by incorporating the library
+into a client application.
+
+3.5. Operating Environment
+--------------------------
+
+The software is intended to be run in a Linux or OSX environment, with ideally 4
+or more cores.
+
+3.6. General Constraints
+------------------------
+
+Currently there are no constraints outside the operating environment and speed
+requirements. In particular, as the library will be used for writing the BAM
+files coming off a Sequel instrument, it should be able to keep pace.
+
+3.7. Assumptions and Dependencies
+---------------------------------
+
+Input routines for the library will expect to receive files that conform to the
+`PacBio BAM`_ (2) or `DataSet XML`_ (4) specifications.
+
+The pbbam library depends on Boost, zlib, and htslib libraries.
+
+3.8. Other Software
+-------------------
+
+Output PacBio BAMs will be compatible with the `PacBio BAM`_ specification (2)
+and thus compatible with the general `BAM format`_ specification (1). This
+ensures that a wide variety of downstream tools can interact with data files.
+
+The software uses `CMake`_ as its build system.
+
+The core C++ API relies on the following 3rd party components:
+
+* `zlib`_
+* `htslib`_
+* `Boost`_ (header-only modules)
+
+Wrapper APIs for additional languages (Python, R, C#) are generated by `SWIG`_.
+
+API documentation is generated via `Doxygen`_.
+
+.. _CMake: https://cmake.org/
+.. _zlib: http://www.zlib.net/
+.. _htslib: https://github.com/samtools/htslib
+.. _Boost: http://www.boost.org/
+.. _SWIG: http://www.swig.org/
+.. _Doxygen: http://www.stack.nl/~dimitri/doxygen/
+
+4. External Interfaces
+======================
+
+4.1. User Interfaces
+--------------------
+
+N/A
+
+4.2. Software Interfaces
+------------------------
+
+pbbam will require the following software:
+
+* `htslib`_ & `zlib`_ - provides low-level handling of compressed BAM data
+* `Boost`_ - provides utility classes
+
+Incoming data from upstream components will be compliant with
+PacBio BAM format - see `PacBio BAM`_ specification (2) for more detail.
+
+4.3. Hardware Interfaces
+------------------------
+
+N/A
+
+4.4. Communications Interfaces
+------------------------------
+
+N/A
+
+5. Functional Requirements
+==========================
+
+5.1. Query BAM data by genomic region
+-------------------------------------
+
+5.1.1. Description
+~~~~~~~~~~~~~~~~~~
+
+pbbam shall allow client applications to query data, limited to some genomic
+region of interest.
+
+5.1.2. Inputs
+~~~~~~~~~~~~~
+
+* BAM file(s) or DataSet XML
+* a standard index (.bai) for each source BAM file
+* genomic interval (e.g. "chr1:1000-2000")
+
+5.1.3. Processing
+~~~~~~~~~~~~~~~~~
+
+Obtain an `htslib`_ "iterator" object for a given file and region. This will be
+wrapped by pbbam to hide the low-level nature of this type, as well as handling
+memory lifetime.
+
+5.1.4. Outputs
+~~~~~~~~~~~~~~
+
+Iterator providing access to individual BAM records from the input data sources,
+which are aligned to the requested genomic interval.
+
+For example:
+
+.. code:: c++
+
+    GenomicIntervalQuery query(interval, dataset);
+    for (const BamRecord& record : query) {
+        // ... use record data ...
+    }
+
+
+5.2. Query BAM data by filter criteria
+--------------------------------------
+
+5.2.1. Description
+~~~~~~~~~~~~~~~~~~
+
+pbbam shall allow client applications to query data, limited to some filter
+criteria (e.g. only reads from ZMW hole number 200 with a read quality of >0.5).
+
+5.2.2. Inputs
+~~~~~~~~~~~~~
+
+* BAM file(s) or DataSet XML
+* a `PacBio BAM index`_ (.pbi) for each source BAM file
+* filters supported by data contained in the PBI
+
+5.2.3. Processing
+~~~~~~~~~~~~~~~~~
+
+Query PBI file(s) for records that match the provided filter criteria. Merge
+contiguous runs of records into record blocks, to minimize seeks. Advancing the
+iterator either reads the next read from the current block or seeks to the next
+block and fetches the next record.
+
+5.2.4. Outputs
+~~~~~~~~~~~~~~
+
+Iterator providing access to individual BAM records from the input data sources,
+which satisfy the requested filter criteria.
+
+For example:
+
+.. code:: c++
+
+    PbiFilterQuery query(filter, dataset);
+    for (const BamRecord& record : query) {
+        // ... do stuff ...
+    }
+
+
+5.3. Write PacBio BAM data
+--------------------------
+
+5.3.1. Description
+~~~~~~~~~~~~~~~~~~
+
+pbbam shall be able to write `PacBio BAM`_ files conforming to the specification.
+
+5.3.2. Inputs
+~~~~~~~~~~~~~
+
+* filename
+* header information
+* BAM records
+
+5.3.3. Processing
+~~~~~~~~~~~~~~~~~
+
+Create file handle for the provided filename, output initial header information.
+As records are passed in, write to file. Upon completion, flush any buffers and
+close file handle.
+
+Multithreading, provided by `htslib`_, will be utilized where possible to speed
+up the compression process - often the main bottleneck of BAM throughput.
+
+5.3.4. Outputs
+~~~~~~~~~~~~~~
+
+BAM file conforming to the `PacBio BAM`_ specification.
+
+5.4. Create PacBio BAM index file
+---------------------------------
+
+5.4.1. Description
+~~~~~~~~~~~~~~~~~~
+
+Much of PacBio BAM data processing relies on the presence of a `PacBio BAM index`_
+file. pbbam shall be able to generate this file type for a `PacBio BAM`_ file.
+
+5.4.2. Inputs
+~~~~~~~~~~~~~
+
+`PacBio BAM`_ file
+
+5.4.3. Processing
+~~~~~~~~~~~~~~~~~
+
+Read through the input BAM records, storing the values relevant to a PBI index.
+At end of file, write the index contents to a file and close.
+
+5.4.4. Outputs
+~~~~~~~~~~~~~~
+
+`PacBio BAM index`_ file
+
+6. Non-Functional Requirements
+==============================
+
+6.1. Performance Requirements
+-----------------------------
+
+Since pbbam will be used to write all BAM files coming off a Sequel instrument, the
+library must keep pace with data generation requirements.
+
+6.2. Safety Requirements
+------------------------
+
+N/A
+
+6.3. Security Requirements
+--------------------------
+
+N/A
+
+6.4. Quality Attributes
+-----------------------
+
+6.4.1. Availability
+~~~~~~~~~~~~~~~~~~~
+
+The developed software shall meet the overall product availability requirements.
+
+6.4.2. Data Integrity
+~~~~~~~~~~~~~~~~~~~~~
+
+Inputs and outputs shall adhere to the PacBio BAM or DataSet XML specifications.
+Files that do not meet this requirement will raise exceptions and will not be
+accepted.
+
+6.4.3. Interoperability
+~~~~~~~~~~~~~~~~~~~~~~~
+
+Inputs and outputs shall adhere to the PacBio BAM or DataSet XML specifications.
+
+6.4.4. Reliability
+~~~~~~~~~~~~~~~~~~
+
+The developed software shall meet the overall product reliability requirements.
+
+6.4.5. Robustness
+~~~~~~~~~~~~~~~~~
+
+pbbam will raise exceptions upon encountering failure cases, allowing client
+applications to recover or report the error to a UI.
+
+6.4.6. Usability
+~~~~~~~~~~~~~~~~
+
+pbbam shall have comprehensive API documentation, available both on- and offline.
+Further documentation will be provided for installation, API usage tips, etc.
+
+Raised exceptions shall carry as much information as possible so that client
+applications can respond with appropriate actions or display useful messages.
+
+6.4.7. Maintainability
+~~~~~~~~~~~~~~~~~~~~~~
+
+The source code of the software covered in this functional specification shall
+adhere to the PacBio `Software Style Guide`_ (5) work instruction, to guarantee
+high quality of code that facilitates maintainability.
+
+6.4.8. Customizability
+~~~~~~~~~~~~~~~~~~~~~~
+
+N/A
+
+6.4.9. Compatibility
+~~~~~~~~~~~~~~~~~~~~
+
+pbbam shall support backward compatibility of the API and BAM format versions
+in order not to break existing clients.
+
+6.5. Business Rules
+-------------------
+
+N/A
+
+6.6. Compliance Requirements
+----------------------------
+
+N/A
+
+6.7. Alarms and Error Handling
+------------------------------
+
+Raised exceptions shall carry as much information as possible so that client
+applications can respond with appropriate actions or display useful messages.
+
+6.8. Persistence Requirements
+-----------------------------
+
+pbbam software requires no persistent storage outside of availability of input
+and output during analysis.
+
+6.9. Installation and Upgrade
+-----------------------------
+
+Installation and Upgrade of this software will be handled as part of the SMRT
+Analysis subsystem. See `SMRT Analysis`_ (6) specifications for more detail.
+
+Additionally, the library may be built independently, either from internal
+version control (Perforce) or from the public-facing Github repository. In
+either case, `CMake`_ is used to drive the build process.
+
+6.10. Administration and Maintenance
+------------------------------------
+
+N/A
+
+6.11. User Documentation
+------------------------
+
+pbbam shall have comprehensive API documentation, available both on- and offline.
+Further documentation will be provided for installation, API usage tips, etc.
+
+The "offline" API documentation may be built directly from the source code, using
+`Doxygen`_. Online documentation will be generated via a continuous integration
+server, thus ensuring it is always pointing to the current codebase.
+
+7. High Level Design
+====================
+
+7.1. Top Level Context
+----------------------
+
+The pbbam library is intended to be linked in with client applications,
+providing programmatic access to data files.
+
+7.2. Use Cases
+--------------
+
+Primary use cases for pbbam include:
+
+* BAM file creation
+* BAM file query - iterable access to various subsets of data
+
+8. Detailed Design
+==================
+
+8.1. Structural Representation
+------------------------------
+
+.. image:: ./pbbam_structure.png
+
+8.2. Behavioral Representation
+------------------------------
+
+The typical access pattern involves a client query against BAM data, optionally
+described in DataSet XML. The query may involve some number of filter criteria.
+
+pbbam queries the associated index files (\*.pbi) to pre-determine which records
+pass filtering criteria and where they reside on disk. The client code is given
+an iterable object, such that each iteration of the main access loop returns a
+valid BAM record for analysis, modification, etc.
+
+8.3. Information Storage
+------------------------
+
+pbbam software requires no persistent storage outside of availability of input
+and output during analysis.
+
+8.4. Technology Overview
+------------------------
+
+pbbam is implemented in C++11 and should perform as designed on any UNIX-like
+operating system (Linux distributions, Apple OSX, etc.).
+
+8.5. SOUP Components
+--------------------
+
+pbbam utilizes CMake for its build system. The C++ library uses the following
+3rd-party software components: `Boost`_, `htslib`_, & `zlib`_. Wrappers for additional
+languages are generated using SWIG.
+
+8.6. Deployment and Configuration
+---------------------------------
+
+Please refer to `SMRT Analysis`_ (6) documentation.
+
+9. Automated Tests
+==================
+
+9.1. Unit Testing
+-----------------
+
+The library shall have unit tests for all classes & components.
+
+9.2. Performance Testing
+------------------------
+
+Unit tests may evaluate performance requirements as desired.
+
+9.3. Regression Testing
+-----------------------
+
+As its role is primarily in data I/O, pbbam has no "scientific quality/validity"
+metrics that would indicate a regression. Instead, passing its unit tests and
+end-to-end tests will indicate that a regression has not been introduced.
+
+These tests will be run after each check-in and nightly.
+
+10. Requirements Traceability Matrices
+======================================
+
+This section provides traces from requirements specified in PRD/DIR documents to the
+requirements covered in this functional specification, and from these
+functional requirements to corresponding Test Cases/Procedures.
+
+10.1. HPQC Functional Specifications
+------------------------------------
+
++-------------+---------------------------+---------------------------------------------------+-------------+------------+-----------+--------------------------------------------------+
+| **PBI_ID**  | **Name**                  | **Description**                                   | **Comment** | **Metric** | **Owner** | **PRD/DIR Path**                                 |
++=============+===========================+===================================================+=============+============+===========+==================================================+
+| 5.1         | Query BAM data by         | pbbam shall allow client applications to query    |             | Yes        | dbarnett  | \\DIR\\Functionality\\Software\\Common\\APIs\\   |
+|             | genomic region            | data, limited to some genomic region of interest. |             |            |           | Software shall provide an API to allow 3rd       |
+|             |                           |                                                   |             |            |           | party software to extract all run information    |
+|             |                           |                                                   |             |            |           | including summary reports and locations          |
++-------------+---------------------------+---------------------------------------------------+-------------+------------+-----------+--------------------------------------------------+
+| 5.2         | Query BAM data by         | pbbam shall allow client applications to query    |             | Yes        | dbarnett  | \\DIR\\Functionality\\Software\\Common\\APIs\\   |
+|             | filter criteria           | data, limited to some filter criteria (e.g. only  |             |            |           | Software shall provide an API to allow 3rd       |
+|             |                           | reads from ZMW hole number 200 with a read        |             |            |           | party software to extract all run information    |
+|             |                           | quality of >0.5).                                 |             |            |           | including summary reports and locations          |
++-------------+---------------------------+---------------------------------------------------+-------------+------------+-----------+--------------------------------------------------+
+| 5.3         | Write PacBio BAM data     | pbbam shall be able to write files conforming to  |             | Yes        | dbarnett  | \\DIR\\Functionality\\Software\\PostProcessing\\ |
+|             |                           | the `PacBio BAM`_ specification.                  |             |            |           | Software shall provide base files including      |
+|             |                           |                                                   |             |            |           | kinetic information in industry standard format  |
+|             |                           |                                                   |             |            |           | such as SAM/BAM using current specifications     |
++-------------+---------------------------+---------------------------------------------------+-------------+------------+-----------+--------------------------------------------------+
+| 5.4         | Create PacBio BAM index   | Much of PacBio BAM data processing relies on the  |             | Yes        | dbarnett  | \\DIR\\Functionality\\Software\\PostProcessing\\ |
+|             | file                      | presence of a `PacBio BAM index`_ file. pbbam     |             |            |           | Software shall provide base files including      |
+|             |                           | shall be able to generate this file type for a    |             |            |           | kinetic information in industry standard format  |
+|             |                           | `PacBio BAM`_ file.                               |             |            |           | such as SAM/BAM using current specifications     |
++-------------+---------------------------+---------------------------------------------------+-------------+------------+-----------+--------------------------------------------------+
+
+10.2. Automated Tests Coverage
+------------------------------
+
++-------------+---------------------------+----------------------------------------------------+------------------------------------------------------------------+
+| **FS Item** | **FS Item Title**         | **Use Case Description**                           | **Test Case Name/ID**                                            |
++=============+===========================+====================================================+==================================================================+
+| 5.1         | Query BAM data by         | pbbam shall allow client applications to query     | See section 9.1. Unit Testing.                                   |
+|             | genomic region            | data, limited to some genomic region of interest.  |                                                                  |
++-------------+---------------------------+----------------------------------------------------+------------------------------------------------------------------+
+| 5.2         | Query BAM data by         | pbbam shall allow client applications to query     | See section 9.1. Unit Testing.                                   |
+|             | filter criteria           | data, limited to some filter criteria (e.g. only   |                                                                  |
+|             |                           | reads from ZMW hole number 200 with a read         |                                                                  |
+|             |                           | quality of >0.5).                                  |                                                                  |
++-------------+---------------------------+----------------------------------------------------+------------------------------------------------------------------+
+| 5.3         | Write PacBio BAM data     | pbbam shall be able to write files conforming to   | See section 9.1. Unit Testing.                                   |
+|             |                           | the `PacBio BAM`_ specification.                   |                                                                  |
++-------------+---------------------------+----------------------------------------------------+------------------------------------------------------------------+
+| 5.4         | Create PacBio BAM index   | Much of PacBio BAM data processing relies on the   | See section 9.1. Unit Testing.                                   |
+|             | file                      | presence of a `PacBio BAM index`_ file. pbbam      |                                                                  |
+|             |                           | shall be able to generate this file type for a     |                                                                  |
+|             |                           | `PacBio BAM`_ file.                                |                                                                  |
++-------------+---------------------------+----------------------------------------------------+------------------------------------------------------------------+
+
diff --git a/include/meson.build b/include/meson.build

new file mode 100644 (file)

index 0000000..35fd9ed
--- /dev/null
+++ b/include/meson.build
@@ -0,0 +1,165 @@
+###########
+# headers #
+###########
+
+if not meson.is_subproject()  # install headers only when pbbam is the top-level project
+  install_headers(  # headers that live directly in pbbam/
+    files([
+      'pbbam/Accuracy.h',
+      'pbbam/AlignmentPrinter.h',
+      'pbbam/BaiIndexCache.h',
+      'pbbam/BaiIndexedBamReader.h',
+      'pbbam/BamFile.h',
+      'pbbam/BamFileMerger.h',
+      'pbbam/BamHeader.h',
+      'pbbam/BamReader.h',
+      'pbbam/BamRecord.h',
+      'pbbam/BamRecordBuilder.h',
+      'pbbam/BamRecordImpl.h',
+      'pbbam/BamRecordTag.h',
+      'pbbam/BamRecordView.h',
+      'pbbam/BamTagCodec.h',
+      'pbbam/BamWriter.h',
+      'pbbam/BarcodeQuery.h',
+      'pbbam/BgzipFastaWriter.h',
+      'pbbam/BgzipFastqWriter.h',
+      'pbbam/BgzipWriter.h',
+      'pbbam/Cigar.h',
+      'pbbam/CigarOperation.h',
+      'pbbam/ClipType.h',
+      'pbbam/Compare.h',
+      'pbbam/CompositeBamReader.h',
+      'pbbam/CompositeFastaReader.h',
+      'pbbam/Config.h',
+      'pbbam/DataSet.h',
+      'pbbam/DataSetTypes.h',
+      'pbbam/DataSetXsd.h',
+      'pbbam/EntireFileQuery.h',
+      'pbbam/FaiIndex.h',
+      'pbbam/FastaCache.h',
+      'pbbam/FastaReader.h',
+      'pbbam/FastaSequence.h',
+      'pbbam/FastaSequenceQuery.h',
+      'pbbam/FastaWriter.h',
+      'pbbam/FastqReader.h',
+      'pbbam/FastqSequence.h',
+      'pbbam/FastqWriter.h',
+      'pbbam/FormatUtils.h',
+      'pbbam/FrameEncodingType.h',
+      'pbbam/Frames.h',
+      'pbbam/GenomicInterval.h',
+      'pbbam/GenomicIntervalQuery.h',
+      'pbbam/IFastaWriter.h',
+      'pbbam/IFastqWriter.h',
+      'pbbam/IndexedBamWriter.h',
+      'pbbam/IndexedFastaReader.h',
+      'pbbam/IndexedFastqReader.h',
+      'pbbam/Interval.h',
+      'pbbam/IRecordWriter.h',
+      'pbbam/LocalContextFlags.h',
+      'pbbam/MD5.h',
+      'pbbam/MoveAppend.h',
+      'pbbam/Orientation.h',
+      'pbbam/PbiBasicTypes.h',
+      'pbbam/PbiBuilder.h',
+      'pbbam/PbiFile.h',
+      'pbbam/PbiFilter.h',
+      'pbbam/PbiFilterQuery.h',
+      'pbbam/PbiFilterTypes.h',
+      'pbbam/PbiIndexedBamReader.h',
+      'pbbam/PbiRawData.h',
+      'pbbam/Position.h',
+      'pbbam/ProgramInfo.h',
+      'pbbam/PulseBehavior.h',
+      'pbbam/PulseExclusionReason.h',
+      'pbbam/QNameQuery.h',
+      'pbbam/QualityValue.h',
+      'pbbam/QualityValues.h',
+      'pbbam/ReadAccuracyQuery.h',
+      'pbbam/ReadGroupInfo.h',
+      'pbbam/RecordType.h',
+      'pbbam/SamTagCodec.h',
+      'pbbam/SamWriter.h',
+      'pbbam/SequenceInfo.h',
+      'pbbam/SNR.h',
+      'pbbam/Strand.h',
+      'pbbam/StringUtilities.h',
+      'pbbam/SubreadLengthQuery.h',
+      'pbbam/Tag.h',
+      'pbbam/TagCollection.h',
+      'pbbam/TextFileReader.h',
+      'pbbam/TextFileWriter.h',
+      'pbbam/Unused.h',
+      'pbbam/Validator.h',
+      'pbbam/ZmwGroupQuery.h',
+      'pbbam/ZmwQuery.h',
+      'pbbam/ZmwType.h',
+      'pbbam/ZmwTypeMap.h']),
+    subdir : 'pbbam')  # subdir is relative to the header install directory (see Meson install_headers())
+
+  install_headers(  # each remaining group mirrors one source subdirectory into the matching install subdir
+    files([
+      'pbbam/bed/BedReader.h',
+      'pbbam/bed/BedWriter.h']),
+    subdir : 'pbbam/bed')
+
+  install_headers(
+    files([
+      'pbbam/ccs/CCSHeader.h',
+      'pbbam/ccs/CCSPbiBuilder.h',
+      'pbbam/ccs/CCSRecord.h',
+      'pbbam/ccs/CCSRecordFormat.h',
+      'pbbam/ccs/CCSRecordReader.h',
+      'pbbam/ccs/CCSRecordWriter.h']),
+    subdir : 'pbbam/ccs')
+
+  install_headers(
+    files([
+      'pbbam/exception/BundleChemistryMappingException.h',
+      'pbbam/exception/InvalidSequencingChemistryException.h',
+      'pbbam/exception/ValidationException.h']),
+    subdir : 'pbbam/exception')
+
+  install_headers(
+    files([
+      'pbbam/internal/Compare.inl',
+      'pbbam/internal/CompositeBamReader.inl',
+      'pbbam/internal/DataSetBaseTypes.h',
+      'pbbam/internal/DataSetElement.h',
+      'pbbam/internal/DataSetElement.inl',
+      'pbbam/internal/PbiBasicTypes.inl',
+      'pbbam/internal/PbiFilter.inl',
+      'pbbam/internal/PbiFilterTypes.inl',
+      'pbbam/internal/QueryBase.h',
+      'pbbam/internal/QueryBase.inl']),
+    subdir : 'pbbam/internal')
+
+  install_headers(
+    files([
+      'pbbam/vcf/VcfVariant.h',
+      'pbbam/vcf/VcfFile.h',
+      'pbbam/vcf/VcfFormat.h',
+      'pbbam/vcf/VcfHeader.h',
+      'pbbam/vcf/VcfHeaderTypes.h',
+      'pbbam/vcf/VcfReader.h',
+      'pbbam/vcf/VcfSort.h',
+      'pbbam/vcf/VcfQuery.h',
+      'pbbam/vcf/VcfWriter.h']),
+    subdir : 'pbbam/vcf')
+
+  install_headers(
+    files([
+      'pbbam/virtual/VirtualPolymeraseBamRecord.h',
+      'pbbam/virtual/VirtualPolymeraseCompositeReader.h',
+      'pbbam/virtual/VirtualPolymeraseReader.h',
+      'pbbam/virtual/VirtualRegion.h',
+      'pbbam/virtual/VirtualRegionType.h',
+      'pbbam/virtual/VirtualRegionTypeMap.h',
+      'pbbam/virtual/VirtualZmwBamRecord.h',
+      'pbbam/virtual/WhitelistedZmwReadStitcher.h',
+      'pbbam/virtual/ZmwReadStitcher.h',
+      'pbbam/virtual/ZmwWhitelistVirtualReader.h']),
+    subdir : 'pbbam/virtual')
+endif
+
+pbbam_include_directories = include_directories('.')  # defined outside the 'if', so it is set for subproject builds as well
diff --git a/include/pbbam/Accuracy.h b/include/pbbam/Accuracy.h

new file mode 100644 (file)

index 0000000..f5bf7b8
--- /dev/null
+++ b/include/pbbam/Accuracy.h
@@ -0,0 +1,22 @@
+// File Description
+/// \file Accuracy.h
+/// \brief Defines the Accuracy class.
+//
+// Author: Derek Barnett
+
+#ifndef ACCURACY_H
+#define ACCURACY_H
+
+#include "pbbam/Config.h"
+
+#include <pbcopper/data/Accuracy.h>
+
+namespace PacBio {
+namespace BAM {
+
+using Accuracy PBBAM_DEPRECATED = Data::Accuracy;
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // ACCURACY_H
diff --git a/include/pbbam/AlignmentPrinter.h b/include/pbbam/AlignmentPrinter.h

new file mode 100644 (file)

index 0000000..264b193
--- /dev/null
+++ b/include/pbbam/AlignmentPrinter.h
@@ -0,0 +1,71 @@
+// File Description
+/// \file AlignmentPrinter.h
+/// \brief Defines the AlignmentPrinter class.
+//
+// Author: Armin Töpfer
+
+#ifndef ALIGNMENTPRINTER_H
+#define ALIGNMENTPRINTER_H
+
+#include "pbbam/Config.h"
+
+#include <memory>
+#include <string>
+
+#include "pbbam/BamRecord.h"
+#include "pbbam/IndexedFastaReader.h"
+#include "pbbam/Orientation.h"
+
+namespace PacBio {
+namespace BAM {
+
+class BamRecord;
+
+/// \brief The AlignmentPrinter class "pretty-prints" an alignment with respect
+///        to its associated reference sequence.
+///
+/// Example output:
+/// \verbinclude plaintext/AlignmentPrinterOutput.txt
+///
+class AlignmentPrinter
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// Constructs the alignment printer with an associated FASTA file reader.
+    ///
+    /// \param[in] ifr FASTA reader
+    ///
+    /// \throws std::runtime_error if FASTA file cannot be opened for reading.
+    ///
+    AlignmentPrinter(const IndexedFastaReader& ifr);
+
+    /// \}
+
+public:
+    /// \name Printing
+    /// \{
+
+    /// Pretty-prints an aligned BamRecord to std::string.
+    ///
+    /// \note The current implementation includes ANSI escape sequences for
+    ///       coloring terminal output. Future versions of this method will
+    ///       likely make this optional.
+    ///
+    /// \returns formatted string containing the alignment and summary
+    ///          information
+    ///
+    std::string Print(const BamRecord& record,
+                      const Orientation orientation = Orientation::GENOMIC);
+
+    /// \}
+
+private:
+    std::unique_ptr<IndexedFastaReader> ifr_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // ALIGNMENTPRINTER_H
diff --git a/include/pbbam/BaiIndexCache.h b/include/pbbam/BaiIndexCache.h

new file mode 100644 (file)

index 0000000..65efda5
--- /dev/null
+++ b/include/pbbam/BaiIndexCache.h
@@ -0,0 +1,60 @@
+// File Description
+/// \file BaiIndexCache.h
+/// \brief Defines the BaiIndexCache class.
+//
+// Author: Derek Barnett
+
+#ifndef BAIINDEXCACHE_H
+#define BAIINDEXCACHE_H
+
+#include "pbbam/Config.h"
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include <htslib/hts.h>
+
+#include "pbbam/Position.h"
+
+namespace PacBio {
+namespace BAM {
+
+class BamFile;
+class DataSet;
+
+///
+/// \brief Caches contents of *.bai file for re-use by multiple readers.
+///
+class BaiIndexCacheData
+{
+public:
+    explicit BaiIndexCacheData(const BamFile& bamFile);
+    explicit BaiIndexCacheData(const std::string& bamFilename);
+
+    ~BaiIndexCacheData();
+
+    /// \note This is very much an internal method and should not be considered
+    ///       public API. Exposed here only because of implementation details
+    ///       (definition of htslib-related custom deleters) and may be removed.
+    ///
+    /// \note Does not own the returned pointer; caller is responsible.
+    ///
+    hts_itr_t* IteratorForInterval(const int32_t refId, const Position start,
+                                   const Position stop) const;
+
+private:
+    struct BaiIndexCacheDataPrivate;
+    std::unique_ptr<BaiIndexCacheDataPrivate> d_;
+};
+
+using BaiIndexCache = std::shared_ptr<std::vector<std::shared_ptr<BaiIndexCacheData>>>;
+
+BaiIndexCache MakeBaiIndexCache(const DataSet& dataset);
+BaiIndexCache MakeBaiIndexCache(const std::vector<BamFile>& bamFiles);
+BaiIndexCache MakeBaiIndexCache(const BamFile& bamFile);
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // BAIINDEXCACHE_H
diff --git a/include/pbbam/BaiIndexedBamReader.h b/include/pbbam/BaiIndexedBamReader.h

new file mode 100644 (file)

index 0000000..8745d85
--- /dev/null
+++ b/include/pbbam/BaiIndexedBamReader.h
@@ -0,0 +1,115 @@
+// File Description
+/// \file BaiIndexedBamReader.h
+/// \brief Defines the BaiIndexedBamReader class.
+//
+// Author: Derek Barnett
+
+#ifndef BAIINDEXEDBAMREADER_H
+#define BAIINDEXEDBAMREADER_H
+
+#include "pbbam/Config.h"
+
+#include <memory>
+
+#include <htslib/sam.h>
+
+#include "pbbam/BaiIndexCache.h"
+#include "pbbam/BamFile.h"
+#include "pbbam/BamReader.h"
+#include "pbbam/GenomicInterval.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The BaiIndexedBamReader class provides read-only iteration over %BAM
+///        records, bounded by a particular genomic interval.
+///
+/// The SAM/BAM standard index (*.bai) is used to allow random-access operations.
+///
+class PBBAM_EXPORT BaiIndexedBamReader : public BamReader
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Constructs %BAM reader, that can be queried on genomic interval.
+    ///
+    /// \param filename input %BAM filename
+    ///
+    /// \throws std::runtime_error if either file (*.bam or *.bai) fails to open
+    ///         for reading, or if the interval is invalid
+    ///
+    explicit BaiIndexedBamReader(std::string filename);
+    BaiIndexedBamReader(std::string filename, const std::shared_ptr<BaiIndexCacheData>& index);
+
+    /// \brief Constructs %BAM reader, that can be queried on genomic interval.
+    ///
+    /// \param[in] bamFile   input BamFile object
+    ///
+    /// \throws std::runtime_error if either file (*.bam or *.bai) fails to open
+    ///         for reading, or if the interval is invalid
+    ///
+    explicit BaiIndexedBamReader(BamFile bamFile);
+    BaiIndexedBamReader(BamFile bamFile, const std::shared_ptr<BaiIndexCacheData>& index);
+
+    /// \brief Constructs %BAM reader, bounded by a genomic interval.
+    ///
+    /// All reads that overlap the interval will be available.
+    ///
+    /// \param[in] interval iteration will be bounded by this GenomicInterval.
+    /// \param[in] filename input %BAM filename
+    ///
+    /// \throws std::runtime_error if either file (*.bam or *.bai) fails to open
+    ///         for reading, or if the interval is invalid
+    ///
+    BaiIndexedBamReader(const GenomicInterval& interval, std::string filename);
+    BaiIndexedBamReader(const GenomicInterval& interval, std::string filename,
+                        const std::shared_ptr<BaiIndexCacheData>& index);
+
+    /// \brief Constructs %BAM reader, bounded by a genomic interval.
+    ///
+    /// All reads that overlap the interval will be available.
+    ///
+    /// \param[in] interval     iteration will be bounded by this GenomicInterval.
+    /// \param[in] bamFile      input BamFile object
+    ///
+    /// \throws std::runtime_error if either file (*.bam or *.bai) fails to open
+    ///         for reading, or if the interval is invalid
+    ///
+    BaiIndexedBamReader(const GenomicInterval& interval, BamFile bamFile);
+    BaiIndexedBamReader(const GenomicInterval& interval, BamFile bamFile,
+                        const std::shared_ptr<BaiIndexCacheData>& index);
+
+    /// \}
+
+public:
+    /// \name Random-Access
+    /// \{
+
+    /// \returns the underlying BamFile
+    const BamFile& File() const;
+
+    /// \returns the current GenomicInterval in use by this reader
+    const GenomicInterval& Interval() const;
+
+    /// \brief Sets a new genomic interval on the reader.
+    ///
+    /// \param[in] interval
+    /// \returns reference to this reader
+    ///
+    BaiIndexedBamReader& Interval(const GenomicInterval& interval);
+
+    /// \}
+
+protected:
+    int ReadRawData(BGZF* bgzf, bam1_t* b) override;
+
+private:
+    class BaiIndexedBamReaderPrivate;
+    std::unique_ptr<BaiIndexedBamReaderPrivate> d_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // BAIINDEXEDBAMREADER_H
diff --git a/include/pbbam/BamFile.h b/include/pbbam/BamFile.h

new file mode 100644 (file)

index 0000000..4748f2f
--- /dev/null
+++ b/include/pbbam/BamFile.h
@@ -0,0 +1,183 @@
+// File Description
+/// \file BamFile.h
+/// \brief Defines the BamFile class.
+//
+// Author: Derek Barnett
+
+#ifndef BAMFILE_H
+#define BAMFILE_H
+
+#include "pbbam/Config.h"
+
+#include <cstdint>
+#include <string>
+
+#include "pbbam/BamHeader.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The BamFile class represents a %BAM file.
+///
+/// It provides access to header metadata and methods for finding/creating
+/// associated index files.
+///
+class PBBAM_EXPORT BamFile
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Creates a BamFile object on the provided \p filename &
+    ///        loads header information.
+    ///
+    /// \param[in] filename %BAM filename
+    /// \throws std::exception on failure to open %BAM file for reading
+    ///
+    BamFile(std::string filename);
+
+    BamFile(const BamFile& other);
+    BamFile(BamFile&& other) noexcept;
+    BamFile& operator=(const BamFile& other);
+    BamFile& operator=(BamFile&& other) noexcept;
+    ~BamFile();
+
+    /// \}
+
+public:
+    /// \name Index & Filename Methods
+    /// \{
+
+    /// \brief Creates a ".pbi" file for this %BAM file.
+    ///
+    /// \note Existing index file will be overwritten. Use
+    ///       EnsurePacBioIndexExists() if this is not desired.
+    ///
+    /// \throws if PBI file could not be properly created and/or
+    ///         written to disk
+    ///
+    void CreatePacBioIndex() const;
+
+    /// \brief Creates a ".bai" file for this %BAM file.
+    ///
+    /// \note Existing index file will be overwritten. Use
+    ///       EnsureStandardIndexExists() if this is not desired.
+    ///
+    /// \throws if BAI file could not be properly created (e.g. this
+    ///         %BAM is not coordinate-sorted) or could not be written to disk
+    ///
+    void CreateStandardIndex() const;
+
+    /// \brief Creates a ".pbi" file if one does not exist or is older than its
+    ///        %BAM file.
+    ///
+    /// Equivalent to:
+    /// \code{.cpp}
+    ///    if (!file.PacBioIndexExists())
+    ///        file.CreatePacBioIndex();
+    /// \endcode
+    ///
+    /// \note As of v0.4.02+, no timestamp check is performed. Previously, an
+    ///       additional timestamp check was performed as well.
+    ///
+    /// \throws if PBI file could not be properly created and/or
+    ///         written to disk
+    ///
+    void EnsurePacBioIndexExists() const;
+
+    /// \brief Creates a ".bai" file if one does not exist or is older than its
+    ///        %BAM file.
+    ///
+    /// Equivalent to:
+    /// \code{.cpp}
+    ///    if (!file.StandardIndexExists())
+    ///        file.CreateStandardIndex();
+    /// \endcode
+    ///
+    /// \note As of v0.4.2, no timestamp check is performed.
+    ///
+    /// \throws if BAI file could not be properly created (e.g. this
+    ///         %BAM is not coordinate-sorted) or could not be written to disk
+    ///
+    void EnsureStandardIndexExists() const;
+
+    /// \returns %BAM filename
+    const std::string& Filename() const;
+
+    /// \returns true if %BAM file has EOF marker (empty BGZF block).
+    /// \note TODO(review): document the result for streamed input (filename: "-").
+    bool HasEOF() const;
+
+    /// \returns true if ".pbi" exists and is newer than this %BAM file.
+    bool PacBioIndexExists() const;
+
+    /// \returns filename of %PacBio index file (".pbi")
+    /// \note No guarantee is made on the existence of this file.
+    ///       This method simply returns the expected filename.
+    std::string PacBioIndexFilename() const;
+
+    /// \returns true if ".pbi" has a more recent timestamp than this file
+    bool PacBioIndexIsNewer() const;
+
+    /// \returns true if ".bai" exists
+    bool StandardIndexExists() const;
+
+    /// \note No guarantee is made on the existence of this file.
+    ///       This method simply returns the expected filename.
+    std::string StandardIndexFilename() const;
+
+    /// \returns true if ".bai" has a more recent timestamp than this file
+    bool StandardIndexIsNewer() const;
+
+    /// \}
+
+public:
+    /// \name File Header Data
+    /// \{
+
+    /// \returns true if header metadata has this reference name
+    bool HasReference(const std::string& name) const;
+
+    /// \returns const reference to BamHeader containing the file's metadata
+    const BamHeader& Header() const;
+
+    /// \returns true if file is a %PacBio %BAM file (i.e. has non-empty version
+    ///          associated with header "pb" tag)
+    bool IsPacBioBAM() const;
+
+    /// \returns ID for reference \p name (can be used for e.g.
+    ///          GenomicIntervalQuery), or -1 if not found
+    int ReferenceId(const std::string& name) const;
+
+    /// \return name of reference matching \p id, empty string if not found
+    std::string ReferenceName(const int id) const;
+
+    /// \returns length of requested reference \p name. 0 if not found
+    uint32_t ReferenceLength(const std::string& name) const;
+
+    /// \returns length of requested reference \p id. 0 if not found
+    uint32_t ReferenceLength(const int id) const;
+
+    /// \}
+
+public:
+    /// \name Additional Attributes
+    /// \{
+
+    /// \returns virtual offset of first alignment. Intended mostly for internal
+    ///          use. Note that this is a BGZF \b virtual offset, not a
+    ///          'normal' file position.
+    ///
+    int64_t FirstAlignmentOffset() const;
+
+    /// \}
+
+private:
+    class BamFilePrivate;
+    std::unique_ptr<BamFilePrivate> d_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // BAMFILE_H
diff --git a/include/pbbam/BamFileMerger.h b/include/pbbam/BamFileMerger.h

new file mode 100644 (file)

index 0000000..e19821a
--- /dev/null
+++ b/include/pbbam/BamFileMerger.h
@@ -0,0 +1,61 @@
+// Author: Derek Barnett
+
+#ifndef BAMFILEMERGER_H
+#define BAMFILEMERGER_H
+
+#include "pbbam/Config.h"
+
+#include <string>
+#include <vector>
+
+#include <pbbam/ProgramInfo.h>
+
+namespace PacBio {
+namespace BAM {
+
+class DataSet;
+class IRecordWriter;
+
+class BamFileMerger
+{
+public:
+    /// \brief Runs merger on BAM files.
+    ///
+    /// When this function exits, a merged BAM (and optional PBI) will have been
+    /// written and closed.
+    ///
+    /// \param[in] bamFilenames      input filenames
+    /// \param[in] outputFilename    resulting BAM output
+    /// \param[in] createPbi         if true, creates a PBI alongside output BAM
+    /// \param[in] pgInfo            lets client applications add their \@PG entry to merged header
+    ///
+    /// \throws std::runtime_error if any errors are encountered while reading or writing
+    ///
+    static void Merge(const std::vector<std::string>& bamFilenames,
+                      const std::string& outputFilename, bool createPbi = true,
+                      const ProgramInfo& pgInfo = ProgramInfo{});
+
+    /// \brief Runs merger on a dataset, applying any supplied filters.
+    ///
+    /// When this function exits, a merged BAM (and optional PBI) will have been
+    /// written and closed.
+    ///
+    /// \param[in] dataset              provides input filenames & filters
+    /// \param[in] outputFilename       resulting BAM output
+    /// \param[in] createPbi            if true, creates a PBI alongside output BAM
+    /// \param[in] pgInfo            lets client applications add their \@PG entry to merged header
+    ///
+    /// \throws std::runtime_error if any errors are encountered while reading or writing
+    ///
+    static void Merge(const PacBio::BAM::DataSet& dataset, const std::string& outputFilename,
+                      bool createPbi = true, const ProgramInfo& pgInfo = ProgramInfo{});
+
+    static void Merge(const std::vector<std::string>& bamFilenames, IRecordWriter& writer);
+
+    static void Merge(const PacBio::BAM::DataSet& dataset, IRecordWriter& writer);
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // BAMFILEMERGER_H
diff --git a/include/pbbam/BamHeader.h b/include/pbbam/BamHeader.h

new file mode 100644 (file)

index 0000000..4e96783
--- /dev/null
+++ b/include/pbbam/BamHeader.h
@@ -0,0 +1,384 @@
+// File Description
+/// \file BamHeader.h
+/// \brief Defines the BamHeader class.
+//
+// Author: Derek Barnett
+
+#ifndef BAMHEADER_H
+#define BAMHEADER_H
+
+#include "pbbam/Config.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include "pbbam/ProgramInfo.h"
+#include "pbbam/ReadGroupInfo.h"
+#include "pbbam/SequenceInfo.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The BamHeader class represents the header section of the %BAM file.
+///
+/// It provides metadata about the file including file version, reference
+/// sequences, read groups, comments, etc.
+///
+/// A BamHeader may be fetched from a BamFile to view an existing file's
+/// metadata. Or one may be created/edited for use with writing to a new file
+/// (via BamWriter).
+///
+/// \note A particular BamHeader is likely to be re-used in lots of places
+///       throughout the library, for read-only purposes. For this reason, even
+///       though a BamHeader may be returned by value, it is essentially a thin
+///       wrapper for a shared-pointer to the actual data. This means, though,
+///       that if you need to edit an existing BamHeader for use with a
+///       BamWriter, please consider using BamHeader::DeepCopy. Otherwise any
+///       modifications will affect all BamHeaders that are sharing its
+///       underlying data.
+///
+class PBBAM_EXPORT BamHeader
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    ///
+    /// \brief Creates a BamHeader from SAM-formatted text
+    /// \param samHeaderText
+    ///
+    BamHeader(const std::string& samHeaderText);
+
+    BamHeader();
+
+    /// \brief Detaches underlying data from the shared-pointer, returning an
+    ///        independent copy of the header contents.
+    ///
+    /// This ensures that any modifications to the newly returned BamHeader do
+    /// not affect other BamHeader objects that were sharing its underlying data.
+    ///
+    BamHeader DeepCopy() const;
+
+    /// \}
+
+public:
+    /// \name Operators
+    /// \{
+
+    /// \brief Merges another header with this one.
+    ///
+    /// Headers must be compatible for merging. This means that their Version,
+    /// SortOrder, PacBioBamVersion (and in the case of aligned BAM data,
+    /// Sequences) must all match. If not, an exception will be thrown.
+    ///
+    /// \param[in] other  header to merge with this one
+    /// \returns reference to this header
+    ///
+    /// \throws std::runtime_error if the headers are not compatible
+    ///
+    BamHeader& operator+=(const BamHeader& other);
+
+    /// \brief Creates a new, merged header.
+    ///
+    /// Headers must be compatible for merging. This means that their Version,
+    /// SortOrder, PacBioBamVersion (and in the case of aligned BAM data,
+    /// Sequences) must all match. If not, an exception will be thrown.
+    ///
+    /// Both original headers (this header and \p other) will not be modified.
+    ///
+    /// \param[in] other  header to merge with this one
+    /// \returns merged header
+    ///
+    /// \throws std::runtime_error if the headers are not compatible
+    ///
+    BamHeader operator+(const BamHeader& other) const;
+
+    /// \}
+
+public:
+    /// \name General Attributes
+    /// \{
+
+    /// \returns the %PacBio %BAM version number (\@HD:pb)
+    ///
+    /// \note This is different from the SAM/BAM version number
+    /// \sa BamHeader::Version.
+    ///
+    std::string PacBioBamVersion() const;
+
+    /// \returns the sort order used
+    ///
+    /// Valid values: "unknown", "unsorted", "queryname", or "coordinate"
+    ///
+    std::string SortOrder() const;
+
+    /// \returns the SAM/BAM version number (\@HD:VN)
+    ///
+    /// \note This is different from the %PacBio %BAM version number
+    /// \sa BamHeader::PacBioBamVersion
+    ///
+    std::string Version() const;
+
+    /// \}
+
+public:
+    /// \name Read Groups
+    /// \{
+
+    /// \returns true if the header contains a read group with \p id (\@RG:ID)
+    bool HasReadGroup(const std::string& id) const;
+
+    /// \returns a ReadGroupInfo object representing the read group matching
+    ///          \p id (\@RG:ID)
+    /// \throws std::runtime_error if \p id is unknown
+    ///
+    ReadGroupInfo ReadGroup(const std::string& id) const;
+
+    /// \returns vector of read group IDs listed in this header
+    std::vector<std::string> ReadGroupIds() const;
+
+    /// \returns vector of ReadGroupInfo objects, representing all read groups
+    ///          listed in this header
+    ///
+    std::vector<ReadGroupInfo> ReadGroups() const;
+
+    /// \}
+
+public:
+    /// \name Sequences
+    /// \{
+
+    /// \returns true if header contains a sequence with \p name (\@SQ:SN)
+    bool HasSequence(const std::string& name) const;
+
+    /// \returns number of sequences (\@SQ entries) stored in this header
+    size_t NumSequences() const;
+
+    /// \returns numeric ID for sequence matching \p name (\@SQ:SN)
+    ///
+    /// This is the numeric ID used elsewhere throughout the API.
+    ///
+    /// \throws std::runtime_error if \p name is unknown
+    /// \sa BamReader::ReferenceId, PbiReferenceIdFilter,
+    ///     PbiRawMappedData::tId_
+    ///
+    int32_t SequenceId(const std::string& name) const;
+
+    /// \returns the length of the sequence (\@SQ:LN, e.g. chromosome length) at
+    ///          index \p id
+    ///
+    /// \sa SequenceInfo::Length, BamHeader::SequenceId
+    ///
+    std::string SequenceLength(const int32_t id) const;
+
+    /// \returns the name of the sequence (\@SQ:SN) at index \p id
+    ///
+    /// \sa SequenceInfo::Name, BamHeader::SequenceId
+    ///
+    std::string SequenceName(const int32_t id) const;
+
+    /// \returns vector of sequence names (\@SQ:SN) stored in this header
+    ///
+    /// Position in the vector is equivalent to SequenceId.
+    ///
+    std::vector<std::string> SequenceNames() const;
+
+    /// \returns SequenceInfo object at index \p id
+    ///
+    /// \throws std::out_of_range if \p id is an invalid or unknown index
+    /// \sa BamHeader::SequenceId
+    ///
+    SequenceInfo Sequence(const int32_t id) const;
+
+    /// \returns SequenceInfo for the sequence matching \p name
+    SequenceInfo Sequence(const std::string& name) const;
+
+    /// \returns vector of SequenceInfo objects representing the sequences
+    ///          (\@SQ entries) stored in this header
+    ///
+    std::vector<SequenceInfo> Sequences() const;
+
+    /// \}
+
+public:
+    /// \name Programs
+    /// \{
+
+    /// \returns true if this header contains a program entry with ID (\@PG:ID)
+    ///          matching \p id
+    ///
+    bool HasProgram(const std::string& id) const;
+
+    /// \returns ProgramInfo object for the program entry matching \p id
+    /// \throws std::runtime_error if \p id is unknown
+    ///
+    ProgramInfo Program(const std::string& id) const;
+
+    /// \returns vector of program IDs (\@PG:ID)
+    std::vector<std::string> ProgramIds() const;
+
+    /// \returns vector of ProgramInfo objects representing program entries
+    ///          (\@PG) stored in this header
+    ///
+    std::vector<ProgramInfo> Programs() const;
+
+    /// \}
+
+public:
+    /// \name Comments
+    /// \{
+
+    /// \returns vector of comment (\@CO) strings
+    std::vector<std::string> Comments() const;
+
+    /// \}
+
+public:
+    /// \name Conversion Methods
+    /// \{
+
+    /// \returns SAM-header-formatted string representing this header's data
+    std::string ToSam() const;
+
+    /// \}
+
+public:
+    /// \name General Attributes
+    /// \{
+
+    /// \brief Sets this header's PacBioBAM version number (\@HD:pb).
+    ///
+    /// \returns reference to this object
+    /// \throws std::runtime_error if version number cannot be parsed or
+    ///         is less than the minimum version allowed.
+    ///
+    BamHeader& PacBioBamVersion(const std::string& version);
+
+    /// \brief Sets this header's sort order label (\@HD:SO).
+    ///
+    /// Valid values: "unknown", "unsorted", "queryname", or "coordinate"
+    ///
+    /// \returns reference to this object
+    ///
+    BamHeader& SortOrder(std::string order);
+
+    /// \brief Sets this header's SAM/BAM version number (\@HD:VN).
+    ///
+    /// \returns reference to this object
+    ///
+    BamHeader& Version(std::string version);
+
+    /// \}
+
+public:
+    /// \name Read Groups
+    /// \{
+
+    /// \brief Appends a read group entry (\@RG) to this header.
+    ///
+    /// \returns reference to this object
+    ///
+    BamHeader& AddReadGroup(ReadGroupInfo readGroup);
+
+    /// \brief Removes all read group entries from this header.
+    ///
+    /// \returns reference to this object
+    ///
+    BamHeader& ClearReadGroups();
+
+    /// \brief Replaces this header's list of read group entries with those in
+    ///        \p readGroups.
+    ///
+    /// \returns reference to this object
+    ///
+    BamHeader& ReadGroups(std::vector<ReadGroupInfo> readGroups);
+
+    /// \}
+
+public:
+    /// \name Sequences
+    /// \{
+
+    /// \brief Appends a sequence entry (\@SQ) to this header.
+    ///
+    /// \returns reference to this object
+    ///
+    BamHeader& AddSequence(SequenceInfo sequence);
+
+    /// \brief Removes all sequence entries from this header.
+    ///
+    /// \returns reference to this object
+    ///
+    BamHeader& ClearSequences();
+
+    /// \brief Replaces this header's list of sequence entries with those in
+    ///       \p sequences.
+    ///
+    /// \returns reference to this object
+    ///
+    BamHeader& Sequences(std::vector<SequenceInfo> sequences);
+
+    /// \}
+
+public:
+    /// \name Programs
+    /// \{
+
+    /// \brief Appends a program entry (\@PG) to this header.
+    ///
+    /// \returns reference to this object
+    ///
+    BamHeader& AddProgram(ProgramInfo pg);
+
+    /// \brief Removes all program entries from this header.
+    ///
+    /// \returns reference to this object
+    ///
+    BamHeader& ClearPrograms();
+
+    /// \brief Replaces this header's list of program entries with those in
+    ///        \p programs.
+    ///
+    /// \returns reference to this object
+    ///
+    BamHeader& Programs(std::vector<ProgramInfo> programs);
+
+    /// \}
+
+public:
+    /// \name Comments
+    /// \{
+
+    /// \brief Appends a comment (\@CO) to this header.
+    ///
+    /// \returns reference to this object
+    ///
+    BamHeader& AddComment(std::string comment);
+
+    /// \brief Removes all comments from this header.
+    ///
+    /// \returns reference to this object
+    ///
+    BamHeader& ClearComments();
+
+    /// \brief Replaces this header's list of comments with those in \p comments.
+    ///
+    /// \returns reference to this object
+    ///
+    BamHeader& Comments(std::vector<std::string> comments);
+
+    /// \}
+
+private:
+    class BamHeaderPrivate;
+    std::shared_ptr<BamHeaderPrivate> d_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // BAMHEADER_H
diff --git a/include/pbbam/BamReader.h b/include/pbbam/BamReader.h

new file mode 100644 (file)

index 0000000..5afc696
--- /dev/null
+++ b/include/pbbam/BamReader.h
@@ -0,0 +1,152 @@
+// File Description
+/// \file BamReader.h
+/// \brief Defines the BamReader class.
+//
+// Author: Derek Barnett
+
+#ifndef BAMREADER_H
+#define BAMREADER_H
+
+#include "pbbam/Config.h"
+
+#include <cstdint>
+#include <memory>
+#include <string>
+
+#include <htslib/sam.h>
+
+#include "pbbam/BamFile.h"
+#include "pbbam/BamHeader.h"
+#include "pbbam/BamRecord.h"
+#include "pbbam/GenomicInterval.h"
+#include "pbbam/internal/QueryBase.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The BamReader class provides basic read-access to a %BAM file.
+///
+/// The base-class implementation provides a sequential read-through of BAM
+/// records. Derived classes may implement other access schemes (e.g. genomic
+/// region, PBI-enabled record filtering).
+///
+class PBBAM_EXPORT BamReader : public internal::IQuery
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Opens BAM for streaming from stdin
+    BamReader();
+
+    /// \brief Opens BAM file for reading.
+    ///
+    /// \param[in] fn %BAM filename
+    /// \throws std::runtime_error if failed to open
+    ///
+    explicit BamReader(std::string fn);
+
+    /// \brief Opens BAM file for reading.
+    ///
+    /// \param[in] bamFile BamFile object
+    /// \throws std::runtime_error if failed to open
+    ///
+    explicit BamReader(BamFile bamFile);
+
+    virtual ~BamReader();
+
+    /// \}
+
+public:
+    /// \name BAM File Attributes
+    /// \{
+
+    /// \returns %BAM filename
+    const std::string& Filename() const;
+
+    /// \returns BamHeader object from %BAM header contents
+    const BamHeader& Header() const;
+
+    /// \}
+
+public:
+    /// \name BAM File I/O
+    /// \{
+
+    /// \brief Fetches the "next" %BAM record.
+    ///
+    /// Default implementation will read records until EOF. Derived readers may
+    /// use additional criteria to decide which record is "next" and when
+    /// reading is done.
+    ///
+    /// \param[out] record  next BamRecord object. Should not be used if method
+    ///                     returns false.
+    ///
+    /// \returns true if record was read successfully. Returns false if EOF (or
+    ///          end of iterator in derived readers). False is not an error,
+    ///          it indicates "end of data".
+    ///
+    /// \throws std::runtime_error if failed to read from file (e.g. possible
+    ///         truncated or corrupted file).
+    ///
+    bool GetNext(BamRecord& record);
+
+    /// \brief Seeks to virtual offset in %BAM.
+    ///
+    /// \note This is \b NOT a normal file offset, but the virtual offset used
+    ///       in %BAM indexing.
+    ///
+    /// \throws std::runtime_error if failed to seek
+    ///
+    void VirtualSeek(int64_t virtualOffset);
+
+    /// \returns current (virtual) file position.
+    ///
+    /// \note This is \b NOT a normal file offset, but the virtual offset used
+    ///       in %BAM indexing.
+    ///
+    int64_t VirtualTell() const;
+
+    /// \}
+
+protected:
+    /// \name BAM File I/O
+    /// \{
+
+    /// \brief Helper method for access to underlying BGZF stream pointer.
+    ///
+    /// Useful for derived readers' contact points with htslib methods.
+    ///
+    /// \returns BGZF stream pointer
+    ///
+    BGZF* Bgzf() const;
+
+    /// \brief Performs the actual raw read of the next record from the BAM
+    ///        file.
+    ///
+    /// Default implementation will read records, sequentially, until EOF.
+    /// Derived readers may use additional criteria to decide which record is
+    ///  "next" and when reading is done.
+    ///
+    /// Return value should be equivalent to htslib's bam_read1():
+    ///     >= 0 : normal
+    ///       -1 : EOF (not an error)
+    ///     < -1 : error
+    ///
+    /// \param[in]  bgzf BGZF stream pointer
+    /// \param[out] b    %BAM record pointer
+    /// \returns integer status code, see description
+    ///
+    virtual int ReadRawData(BGZF* bgzf, bam1_t* b);
+
+    /// \}
+
+private:
+    class BamReaderPrivate;
+    std::unique_ptr<BamReaderPrivate> d_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // BAMREADER_H
diff --git a/include/pbbam/BamRecord.h b/include/pbbam/BamRecord.h

new file mode 100644 (file)

index 0000000..c0dc6be
--- /dev/null
+++ b/include/pbbam/BamRecord.h
@@ -0,0 +1,1314 @@
+// File Description
+/// \file BamRecord.h
+/// \brief Defines the BamRecord class.
+//
+// Author: Derek Barnett
+
+#ifndef BAMRECORD_H
+#define BAMRECORD_H
+
+#include "pbbam/Config.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include <pbcopper/data/MappedRead.h>
+#include <pbcopper/data/Read.h>
+
+#include "pbbam/Accuracy.h"
+#include "pbbam/BamHeader.h"
+#include "pbbam/BamRecordImpl.h"
+#include "pbbam/ClipType.h"
+#include "pbbam/FrameEncodingType.h"
+#include "pbbam/Frames.h"
+#include "pbbam/LocalContextFlags.h"
+#include "pbbam/Orientation.h"
+#include "pbbam/PulseBehavior.h"
+#include "pbbam/PulseExclusionReason.h"
+#include "pbbam/QualityValues.h"
+#include "pbbam/ReadGroupInfo.h"
+#include "pbbam/RecordType.h"
+#include "pbbam/Strand.h"
+#include "pbbam/ZmwType.h"
+#include "pbbam/virtual/VirtualRegionType.h"
+
+namespace PacBio {
+namespace BAM {
+
+class Pulse2BaseCache;
+
+/// \brief The BamRecord class represents a %PacBio %BAM record.
+///
+/// %PacBio %BAM records are extensions of normal SAM/BAM records. Thus in
+/// addition to normal fields like bases, qualities, mapping coordinates, etc.,
+/// tags are used extensively to annotate records with additional
+/// PacBio-specific data.
+///
+/// Mapping and clipping APIs are provided as well to ensure that such
+/// operations "trickle down" to all data fields properly.
+///
+/// \sa https://samtools.github.io/hts-specs/SAMv1.pdf
+///     for more information on standard %BAM data, and
+///     https://github.com/PacificBiosciences/PacBioFileFormats/blob/3.0/BAM.rst
+///     for more information on %PacBio %BAM fields.
+///
+class PBBAM_EXPORT BamRecord
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    BamRecord();
+    BamRecord(BamHeader header);
+    BamRecord(BamRecordImpl impl);
+    BamRecord(const BamRecord& other);
+    BamRecord(BamRecord&& other) noexcept;
+    BamRecord& operator=(const BamRecord& other);
+    BamRecord& operator=(BamRecord&& other) noexcept;
+    virtual ~BamRecord();
+
+    /// \}
+
+public:
+    /// \name General Data
+    /// \{
+
+    /// \returns this record's full name
+    /// \sa BamRecordImpl::Name
+    ///
+    std::string FullName() const;
+
+    /// \returns this record's associated BamHeader (returned by value)
+    BamHeader Header() const;
+
+    /// \returns ZMW hole number
+    /// \throws if missing zm tag & record name does not contain hole number
+    ///
+    int32_t HoleNumber() const;
+
+    /// \returns this record's LocalContextFlags
+    PacBio::BAM::LocalContextFlags LocalContextFlags() const;
+
+    /// \returns this record's movie name
+    std::string MovieName() const;
+
+    /// \returns "number of complete passes of the insert"
+    int32_t NumPasses() const;
+
+    /// \returns the record's query end position, or Sequence().length() if not
+    ///          stored
+    /// \note QueryEnd is in polymerase read coordinates, NOT genomic
+    ///       coordinates.
+    ///
+    Position QueryEnd() const;
+
+    /// \returns the number of frames from start of movie to the last base of read
+    ///
+    int32_t QueryEndFrameNumber() const;
+
+    /// \returns the record's query start position, or 0 if not stored
+    ///
+    /// \note QueryStart is in polymerase read coordinates, NOT genomic
+    ///       coordinates.
+    ///
+    Position QueryStart() const;
+
+    /// \returns the number of frames from start of movie to the first base of read
+    ///
+    int32_t QueryStartFrameNumber() const;
+
+    /// \returns this record's expected read accuracy [0, 1000]
+    Accuracy ReadAccuracy() const;
+
+    /// \returns ReadGroupInfo object for this record
+    ReadGroupInfo ReadGroup() const;
+
+    /// \returns string ID of this record's read group
+    ///
+    /// This method should be preferred over ReadGroupBaseId() in most cases,
+    /// e.g. mapping between header info.
+    ///
+    /// For "ID:12345678":
+    ///     b.ReadGroupId()     -> "12345678"
+    ///     b.ReadGroupBaseId() -> "12345678"
+    ///
+    /// For "ID:12345678/0--0":
+    ///     b.ReadGroupId()     -> "12345678/0--0"
+    ///     b.ReadGroupBaseId() -> "12345678"
+    ///
+    /// \sa BamRecord::ReadGroupBaseId
+    /// \sa ReadGroupInfo::Id
+    /// \sa ReadGroupInfo::BaseId
+    ///
+    std::string ReadGroupId() const;
+
+    /// \returns string base ID (stripped of optional barcode labels)
+    ///
+    /// ReadGroupId() should be preferred over this method in most cases. This
+    /// is intended for use with hash-string or integers directly.
+    ///
+    /// For "ID:12345678":
+    ///     b.ReadGroupId()     -> "12345678"
+    ///     b.ReadGroupBaseId() -> "12345678"
+    ///
+    /// For "ID:12345678/0--0":
+    ///     b.ReadGroupId()     -> "12345678/0--0"
+    ///     b.ReadGroupBaseId() -> "12345678"
+    ///
+    /// \sa BamRecord::ReadGroupId
+    /// \sa ReadGroupInfo::Id
+    /// \sa ReadGroupInfo::BaseId
+    ///
+    std::string ReadGroupBaseId() const;
+
+    /// \returns integer value for this record's read group ID
+    int32_t ReadGroupNumericId() const;
+
+    /// \returns this scrap record's scrap region type
+    VirtualRegionType ScrapRegionType() const;
+
+    /// \returns this scrap record's scrap ZMW type
+    ZmwType ScrapZmwType() const;
+
+    /// \returns this record's average signal-to-noise for each of A, C, G,
+    ///          and T
+    ///
+    std::vector<float> SignalToNoise() const;
+
+    /// \returns this record's type
+    /// \sa RecordType
+    RecordType Type() const;
+
+    /// \}
+
+public:
+    /// \name Mapping Data
+    /// \{
+
+    /// \returns the record's aligned end position
+    ///
+    /// \note AlignedEnd is in polymerase read coordinates, NOT genomic
+    ///       coordinates.
+    ///
+    Position AlignedEnd() const;
+
+    /// \returns the record's aligned start position
+    ///
+    /// \note AlignedStart is in polymerase read coordinates, NOT genomic
+    ///       coordinates.
+    ///
+    Position AlignedStart() const;
+
+    /// \returns the record's strand as a Strand enum value
+    Strand AlignedStrand() const;
+
+    /// \returns the record's CIGAR data as a Cigar object
+    ///
+    /// \param[in] exciseAllClips   if true, remove all clipping operations
+    ///                             (hard & soft) [default:false]
+    ///
+    Cigar CigarData(bool exciseAllClips = false) const;
+
+    /// \returns true if this record was mapped by aligner
+    bool IsMapped() const;
+
+    /// \returns this record's mapping quality. A value of 255 indicates
+    ///          "unknown"
+    ///
+    uint8_t MapQuality() const;
+
+    /// \returns the number of deleted bases (relative to reference)
+    size_t NumDeletedBases() const;
+
+    /// \returns the number of inserted bases (relative to reference)
+    size_t NumInsertedBases() const;
+
+    /// \returns the number of matching bases (sum of '=' CIGAR op lengths)
+    size_t NumMatches() const;
+
+    /// \returns a pair containing NumMatches (first) and NumMismatches
+    ///         (second)
+    ///
+    std::pair<size_t, size_t> NumMatchesAndMismatches() const;
+
+    /// \returns the number of mismatching bases (sum of 'X' CIGAR op lengths)
+    size_t NumMismatches() const;
+
+    /// \returns this record's reference ID, or -1 if unmapped.
+    ///
+    /// \note This is only a valid identifier within this %BAM file
+    ///
+    int32_t ReferenceId() const;
+
+    /// \returns this record's reference name.
+    ///
+    /// \throws an exception if unmapped record.
+    ///
+    std::string ReferenceName() const;
+
+    /// \returns the record's reference end position, or UnmappedPosition if
+    ///          unmapped
+    ///
+    /// \note ReferenceEnd is in reference coordinates, NOT polymerase read
+    ///       coordinates.
+    ///
+    Position ReferenceEnd() const;
+
+    /// \returns the record's reference start position, or UnmappedPosition if
+    ///          unmapped
+    ///
+    /// \note ReferenceStart is in reference coordinates, NOT polymerase read
+    ///       coordinates.
+    ///
+    Position ReferenceStart() const;
+
+    /// \}
+
+public:
+    /// \name Barcode Data
+    /// \{
+
+    /// \returns forward barcode id
+    ///
+    /// \throws std::runtime_error if barcode data is absent or malformed.
+    /// \sa HasBarcodes
+    ///
+    int16_t BarcodeForward() const;
+
+    /// \returns barcode call confidence (Phred-scaled posterior probability
+    ///          of correct barcode call)
+    ///
+    /// \sa HasBarcodeQuality
+    ///
+    uint8_t BarcodeQuality() const;
+
+    /// \returns reverse barcode id
+    ///
+    /// \throws std::runtime_error if barcode data is absent or malformed.
+    /// \sa HasBarcodes
+    ///
+    int16_t BarcodeReverse() const;
+
+    /// \returns the forward and reverse barcode ids
+    ///
+    /// \throws std::runtime_error if barcode data is absent or malformed.
+    /// \sa HasBarcodes
+    ///
+    std::pair<int16_t, int16_t> Barcodes() const;
+
+    /// \}
+
+public:
+    /// \name Auxiliary Data Queries
+    /// \{
+
+    /// \returns true if this record has AltLabelQV data
+    bool HasAltLabelQV() const;
+
+    /// \returns true if this record has AltLabelTag data
+    bool HasAltLabelTag() const;
+
+    /// \returns true if this record has Barcode data
+    bool HasBarcodes() const;
+
+    /// \returns true if this record has BarcodeQuality data
+    bool HasBarcodeQuality() const;
+
+    /// \returns true if this record has DeletionQV data
+    bool HasDeletionQV() const;
+
+    /// \returns true if this record has DeletionTag data
+    bool HasDeletionTag() const;
+
+    /// \returns true if this record has a HoleNumber
+    bool HasHoleNumber() const;
+
+    /// \returns true if this record has InsertionQV data
+    bool HasInsertionQV() const;
+
+    /// \returns true if this record has IPD data
+    bool HasIPD() const;
+
+    /// \returns true if this record has LabelQV data
+    bool HasLabelQV() const;
+
+    /// \returns true if this record has LocalContextFlags (absent in CCS)
+    bool HasLocalContextFlags() const;
+
+    /// \returns true if this record has MergeQV data
+    bool HasMergeQV() const;
+
+    /// \returns true if this record has NumPasses data
+    bool HasNumPasses() const;
+
+    /// \returns true if this record has Pkmean data
+    bool HasPkmean() const;
+
+    /// \returns true if this record has Pkmid data
+    bool HasPkmid() const;
+
+    /// \returns true if this record has Pkmean2 data
+    bool HasPkmean2() const;
+
+    /// \returns true if this record has Pkmid2 data
+    bool HasPkmid2() const;
+
+    /// \returns true if this record has PreBaseFrames aka IPD data
+    bool HasPreBaseFrames() const;
+
+    /// \returns true if this record has PrePulseFrames data
+    bool HasPrePulseFrames() const;
+
+    /// \returns true if this record has PulseCall data
+    bool HasPulseCall() const;
+
+    /// \returns true if this record has PulseCallWidth data
+    bool HasPulseCallWidth() const;
+
+    /// \returns true if this record has PulseExclusion data
+    bool HasPulseExclusion() const;
+
+    /// \returns true if this record has PulseMergeQV data
+    bool HasPulseMergeQV() const;
+
+    /// \returns true if this record has PulseWidth data
+    bool HasPulseWidth() const;
+
+    /// \returns true if this record has ReadAccuracyTag data
+    bool HasReadAccuracy() const;
+
+    /// \returns true if this record has QueryEnd data
+    bool HasQueryEnd() const;
+
+    /// \returns true if this record has QueryEndFrameNumber data
+    bool HasQueryEndFrameNumber() const;
+
+    /// \returns true if this record has QueryStart data
+    bool HasQueryStart() const;
+
+    /// \returns true if this record has QueryStartFrameNumber data
+    bool HasQueryStartFrameNumber() const;
+
+    /// \returns true if this record has ScrapRegionType data (only in SCRAP)
+    bool HasScrapRegionType() const;
+
+    /// \returns true if this record has scrap ZMW type data (only in SCRAP)
+    bool HasScrapZmwType() const;
+
+    /// \returns true if this record has signal-to-noise data (absent in
+    ///          POLYMERASE)
+    ///
+    bool HasSignalToNoise() const;
+
+    /// \returns true if this record has StartFrame data
+    bool HasStartFrame() const;
+
+    /// \returns true if this record has SubstitutionQV data
+    bool HasSubstitutionQV() const;
+
+    /// \returns true if this record has SubstitutionTag data
+    bool HasSubstitutionTag() const;
+
+    /// \}
+
+public:
+    /// \name Sequence & Tag Data
+    /// \{
+
+    /// \brief Fetches this record's AltLabelTag values ("pt" tag).
+    ///
+    /// \note If \p aligned is true, and gaps/padding need to be inserted, the
+    ///       new gap chars will be '-' and padding chars will be '*'.
+    ///
+    /// \param[in] orientation      Orientation of output.
+    ///
+    /// \returns AltLabelTags string
+    ///
+    std::string AltLabelTag(Orientation orientation = Orientation::NATIVE, bool aligned = false,
+                            bool exciseSoftClips = false,
+                            PulseBehavior pulseBehavior = PulseBehavior::ALL) const;
+
+    /// \brief Fetches this record's DeletionTag values ("dt" tag).
+    ///
+    /// \note If \p aligned is true, and gaps/padding need to be inserted, the
+    ///       new gap chars will be '-' and padding chars will be '*'.
+    ///
+    /// \param[in] orientation      Orientation of output.
+    /// \param[in] aligned          if true, gaps/padding will be inserted, per
+    ///                             Cigar info.
+    /// \param[in] exciseSoftClips  if true, any soft-clipped positions will be
+    ///                             removed from query ends
+    ///
+    /// \returns DeletionTag string
+    ///
+    std::string DeletionTag(Orientation orientation = Orientation::NATIVE, bool aligned = false,
+                            bool exciseSoftClips = false) const;
+
+    /// \brief Fetches this record's DNA sequence (SEQ field).
+    ///
+    /// \note If \p aligned is true, and gaps/padding need to be inserted, the
+    ///       new gap chars will be '-' and padding chars will be '*'.
+    ///
+    /// \param[in] orientation      Orientation of output.
+    /// \param[in] aligned          if true, gaps/padding will be inserted, per
+    ///                             Cigar info.
+    /// \param[in] exciseSoftClips  if true, any soft-clipped positions will be
+    ///                             removed from query ends
+    ///
+    /// \returns sequence string
+    ///
+    std::string Sequence(const Orientation orientation = Orientation::NATIVE, bool aligned = false,
+                         bool exciseSoftClips = false) const;
+
+    /// \brief Fetches this record's SubstitutionTag values ("st" tag).
+    ///
+    /// \note If \p aligned is true, and gaps/padding need to be inserted, the
+    ///       new gap chars will be '-' and padding chars will be '*'.
+    ///
+    /// \param[in] orientation      Orientation of output.
+    /// \param[in] aligned          if true, gaps/padding will be inserted, per
+    ///                             Cigar info.
+    /// \param[in] exciseSoftClips  if true, any soft-clipped positions will be
+    ///                             removed from query ends
+    ///
+    /// \returns SubstitutionTags string
+    ///
+    std::string SubstitutionTag(Orientation orientation = Orientation::NATIVE, bool aligned = false,
+                                bool exciseSoftClips = false) const;
+
+    /// \}
+
+public:
+    /// \name Quality Data
+    /// \{
+
+    /// \brief Fetches this record's AltLabelQV values ("pv" tag).
+    ///
+    /// \note If \p aligned is true, and gaps/padding need to be inserted, the
+    ///       new QVs will have a value of 0.
+    ///
+    /// \param[in] orientation     Orientation of output.
+    ///
+    /// \returns AltLabelQV as QualityValues object
+    ///
+    QualityValues AltLabelQV(Orientation orientation = Orientation::NATIVE, bool aligned = false,
+                             bool exciseSoftClips = false,
+                             PulseBehavior pulseBehavior = PulseBehavior::ALL) const;
+
+    /// \brief Fetches this record's DeletionQV values ("dq" tag).
+    ///
+    /// \note If \p aligned is true, and gaps/padding need to be inserted, the
+    ///       new QVs will have a value of 0.
+    ///
+    /// \param[in] orientation      Orientation of output.
+    /// \param[in] aligned          if true, gaps/padding will be inserted, per
+    ///                             Cigar info.
+    /// \param[in] exciseSoftClips  if true, any soft-clipped positions will be
+    ///                             removed from query ends
+    ///
+    /// \returns DeletionQV as QualityValues object
+    ///
+    QualityValues DeletionQV(Orientation orientation = Orientation::NATIVE, bool aligned = false,
+                             bool exciseSoftClips = false) const;
+
+    /// \brief Fetches this record's InsertionQV values ("iq" tag).
+    ///
+    /// \note If \p aligned is true, and gaps/padding need to be inserted, the
+    ///       new QVs will have a value of 0.
+    ///
+    /// \param[in] orientation      Orientation of output.
+    /// \param[in] aligned          if true, gaps/padding will be inserted, per
+    ///                             Cigar info.
+    /// \param[in] exciseSoftClips  if true, any soft-clipped positions will be
+    ///                             removed from query ends
+    ///
+    /// \returns InsertionQVs as QualityValues object
+    ///
+    QualityValues InsertionQV(Orientation orientation = Orientation::NATIVE, bool aligned = false,
+                              bool exciseSoftClips = false) const;
+
+    /// \brief Fetches this record's LabelQV values ("pq" tag).
+    ///
+    /// \note If \p aligned is true, and gaps/padding need to be inserted, the
+    ///       new QVs will have a value of 0.
+    ///
+    /// \param[in] orientation     Orientation of output.
+    ///
+    /// \returns LabelQV as QualityValues object
+    ///
+    QualityValues LabelQV(Orientation orientation = Orientation::NATIVE, bool aligned = false,
+                          bool exciseSoftClips = false,
+                          PulseBehavior pulseBehavior = PulseBehavior::ALL) const;
+
+    /// \brief Fetches this record's MergeQV values ("mq" tag).
+    ///
+    /// \note If \p aligned is true, and gaps/padding need to be inserted, the
+    ///       new QVs will have a value of 0.
+    ///
+    /// \param[in] orientation      Orientation of output.
+    /// \param[in] aligned          if true, gaps/padding will be inserted, per
+    ///                             Cigar info.
+    /// \param[in] exciseSoftClips  if true, any soft-clipped positions will be
+    ///                             removed from query ends
+    ///
+    /// \returns MergeQV as QualityValues object
+    ///
+    QualityValues MergeQV(Orientation orientation = Orientation::NATIVE, bool aligned = false,
+                          bool exciseSoftClips = false) const;
+
+    /// \brief Fetches this record's %BAM quality values (QUAL field).
+    ///
+    /// \note If \p aligned is true, and gaps/padding need to be inserted, the
+    ///       new QVs will have a value of 0.
+    ///
+    /// \param[in] orientation      Orientation of output.
+    /// \param[in] aligned          if true, gaps/padding will be inserted, per
+    ///                             Cigar info.
+    /// \param[in] exciseSoftClips  if true, any soft-clipped positions will be
+    ///                             removed from query ends
+    ///
+    /// \returns %BAM qualities as QualityValues object
+    ///
+    QualityValues Qualities(Orientation orientation = Orientation::NATIVE, bool aligned = false,
+                            bool exciseSoftClips = false) const;
+
+    /// \brief Fetches this record's SubstitutionQV values ("sq" tag).
+    ///
+    /// \note If \p aligned is true, and gaps/padding need to be inserted, the
+    ///       new QVs will have a value of 0.
+    ///
+    /// \param[in] orientation      Orientation of output.
+    /// \param[in] aligned          if true, gaps/padding will be inserted, per
+    ///                             Cigar info.
+    /// \param[in] exciseSoftClips  if true, any soft-clipped positions will be
+    ///                             removed from query ends
+    ///
+    /// \returns SubstitutionQV as QualityValues object
+    ///
+    QualityValues SubstitutionQV(Orientation orientation = Orientation::NATIVE,
+                                 bool aligned = false, bool exciseSoftClips = false) const;
+
+    /// \}
+
+public:
+    /// \name Pulse Data
+    /// \{
+
+    /// \brief Fetches this record's IPD values ("ip" tag).
+    ///
+    /// \note If \p aligned is true, and gaps/padding need to be inserted, the
+    ///       new frames will have a value of 0.
+    ///
+    /// \param[in] orientation      Orientation of output.
+    /// \param[in] aligned          if true, gaps/padding will be inserted, per
+    ///                             Cigar info.
+    /// \param[in] exciseSoftClips  if true, any soft-clipped positions will be
+    ///                             removed from query ends
+    ///
+    /// \returns IPD as Frames object
+    ///
+    Frames IPD(Orientation orientation = Orientation::NATIVE, bool aligned = false,
+               bool exciseSoftClips = false) const;
+
+    /// \brief Fetches this record's IPD values ("ip" tag), but does not upscale.
+    ///
+    /// \param[in] orientation     Orientation of output.
+    /// \returns IPD as Frames object
+    ///
+    Frames IPDRaw(Orientation orientation = Orientation::NATIVE) const;
+
+    /// \brief Fetches this record's Pkmean values ("pa" tag).
+    ///
+    /// \param[in] orientation     Orientation of output.
+    /// \returns Pkmean as vector<float> object
+    ///
+    std::vector<float> Pkmean(Orientation orientation = Orientation::NATIVE, bool aligned = false,
+                              bool exciseSoftClips = false,
+                              PulseBehavior pulseBehavior = PulseBehavior::ALL) const;
+
+    /// \brief Fetches this record's Pkmid values ("pm" tag).
+    ///
+    /// \param[in] orientation     Orientation of output.
+    /// \returns Pkmid as vector<float> object
+    ///
+    std::vector<float> Pkmid(Orientation orientation = Orientation::NATIVE, bool aligned = false,
+                             bool exciseSoftClips = false,
+                             PulseBehavior pulseBehavior = PulseBehavior::ALL) const;
+
+    /// \brief Fetches this record's Pkmean2 values ("pi" tag).
+    ///
+    /// \param[in] orientation     Orientation of output.
+    /// \returns Pkmean2 as vector<float> object
+    ///
+    std::vector<float> Pkmean2(Orientation orientation = Orientation::NATIVE, bool aligned = false,
+                               bool exciseSoftClips = false,
+                               PulseBehavior pulseBehavior = PulseBehavior::ALL) const;
+
+    /// \brief Fetches this record's Pkmid2 values ("ps" tag).
+    ///
+    /// \param[in] orientation     Orientation of output.
+    /// \returns Pkmid2 as vector<float> object
+    ///
+    std::vector<float> Pkmid2(Orientation orientation = Orientation::NATIVE, bool aligned = false,
+                              bool exciseSoftClips = false,
+                              PulseBehavior pulseBehavior = PulseBehavior::ALL) const;
+
+    /// \brief Fetches this record's PreBaseFrames aka IPD values ("ip" tag).
+    ///
+    /// \note If \p aligned is true, and gaps/padding need to be inserted, the
+    ///       new frames will have a value of 0.
+    ///
+    /// \param[in] orientation      Orientation of output.
+    /// \param[in] aligned          if true, gaps/padding will be inserted, per
+    ///                             Cigar info.
+    /// \param[in] exciseSoftClips  if true, any soft-clipped positions will be
+    ///                             removed from query ends
+    ///
+    /// \returns IPD as Frames object
+    ///
+    Frames PreBaseFrames(Orientation orientation = Orientation::NATIVE, bool aligned = false,
+                         bool exciseSoftClips = false) const;
+
+    /// \brief Fetches this record's PrePulseFrames values ("pd" tag).
+    ///
+    /// \param[in] orientation     Orientation of output.
+    /// \returns PrePulseFrames as Frames object
+    ///
+    Frames PrePulseFrames(Orientation orientation = Orientation::NATIVE, bool aligned = false,
+                          bool exciseSoftClips = false,
+                          PulseBehavior pulseBehavior = PulseBehavior::ALL) const;
+
+    /// \brief Fetches this record's PulseCall values ("pc" tag).
+    ///
+    /// \param[in] orientation     Orientation of output.
+    /// \returns PulseCalls string
+    ///
+    std::string PulseCall(Orientation orientation = Orientation::NATIVE, bool aligned = false,
+                          bool exciseSoftClips = false,
+                          PulseBehavior pulseBehavior = PulseBehavior::ALL) const;
+
+    /// \brief Fetches this record's PulseCallWidth values ("px" tag).
+    ///
+    /// \param[in] orientation     Orientation of output.
+    /// \returns PulseCallWidth as Frames object
+    ///
+    Frames PulseCallWidth(Orientation orientation = Orientation::NATIVE, bool aligned = false,
+                          bool exciseSoftClips = false,
+                          PulseBehavior pulseBehavior = PulseBehavior::ALL) const;
+
+    /// \brief Fetches this record's PulseExclusionReason values ("pe" tag).
+    ///
+    /// \returns vector of pulse exclusion reason values
+    ///
+    std::vector<PacBio::BAM::PulseExclusionReason> PulseExclusionReason(
+        Orientation orientation = Orientation::NATIVE, bool aligned = false,
+        bool exciseSoftClips = false, PulseBehavior pulseBehavior = PulseBehavior::ALL) const;
+
+    /// \brief Fetches this record's PulseMergeQV values ("pg" tag).
+    ///
+    /// \param[in] orientation     Orientation of output.
+    /// \returns PulseMergeQV as QualityValues object
+    ///
+    QualityValues PulseMergeQV(Orientation orientation = Orientation::NATIVE, bool aligned = false,
+                               bool exciseSoftClips = false,
+                               PulseBehavior pulseBehavior = PulseBehavior::ALL) const;
+
+    /// \brief Fetches this record's PulseWidth values ("pw" tag).
+    ///
+    /// \note If \p aligned is true, and gaps/padding need to be inserted, the
+    ///       new frames will have a value of 0.
+    ///
+    /// \param[in] orientation      Orientation of output.
+    /// \param[in] aligned          if true, gaps/padding will be inserted, per
+    ///                             Cigar info.
+    /// \param[in] exciseSoftClips  if true, any soft-clipped positions will be
+    ///                             removed from query ends
+    ///
+    /// \returns PulseWidths as Frames object
+    ///
+    Frames PulseWidth(Orientation orientation = Orientation::NATIVE, bool aligned = false,
+                      bool exciseSoftClips = false) const;
+
+    /// \brief Fetches this record's PulseWidth values ("pw" tag), but does not
+    ///        upscale.
+    ///
+    /// \param[in] orientation     Orientation of output.
+    /// \returns PulseWidth as Frames object
+    ///
+    Frames PulseWidthRaw(Orientation orientation = Orientation::NATIVE, bool aligned = false,
+                         bool exciseSoftClips = false) const;
+
+    /// \brief Fetches this record's StartFrame values ("sf" tag).
+    ///
+    /// \param[in] orientation     Orientation of output
+    ///
+    /// \returns StartFrame as uint32_t vector
+    ///
+    std::vector<uint32_t> StartFrame(Orientation orientation = Orientation::NATIVE,
+                                     bool aligned = false, bool exciseSoftClips = false,
+                                     PulseBehavior pulseBehavior = PulseBehavior::ALL) const;
+
+    /// \}
+
+public:
+    /// \name Low-Level Access & Operations
+    /// \{
+
+    /// \warning This method should be considered temporary and avoided as much
+    ///          as possible. Direct access to the internal object is likely to
+    ///          disappear as BamRecord interface matures.
+    ///
+    /// \returns const reference to underlying BamRecordImpl object
+    ///
+    const BamRecordImpl& Impl() const;
+
+    /// \warning This method should be considered temporary and avoided as much
+    ///          as possible. Direct access to the internal object is likely to
+    ///          disappear as BamRecord interface matures.
+    ///
+    /// \returns reference to underlying BamRecordImpl object
+    ///
+    BamRecordImpl& Impl();
+
+    /// \}
+
+public:
+    /// \name General Data
+    /// \{
+
+    /// \brief Sets this record's ZMW hole number.
+    ///
+    /// \param[in] holeNumber
+    /// \returns reference to this record
+    ///
+    BamRecord& HoleNumber(const int32_t holeNumber);
+
+    /// \brief Sets this record's local context flags
+    ///
+    /// \param[in] flags
+    /// \returns reference to this record
+    ///
+    BamRecord& LocalContextFlags(const PacBio::BAM::LocalContextFlags flags);
+
+    /// \brief Sets this record's "number of complete passes of the insert".
+    ///
+    /// \param[in] numPasses
+    /// \returns reference to this record
+    ///
+    BamRecord& NumPasses(const int32_t numPasses);
+
+    /// \brief Sets this record's query end position.
+    ///
+    /// \note Changing this will modify the name of non-CCS records.
+    ///
+    /// \param[in] pos
+    /// \returns reference to this record
+    ///
+    BamRecord& QueryEnd(const PacBio::BAM::Position pos);
+
+    /// \brief Sets this record's query end frame number
+    ///
+    /// \param[in] frameNumber
+    /// \returns reference to this record
+    ///
+    BamRecord& QueryEndFrameNumber(const int32_t frameNumber);
+
+    /// \brief Sets this record's query start position.
+    ///
+    /// \note Changing this will modify the name of non-CCS records.
+    ///
+    /// \param[in] pos
+    /// \returns reference to this record
+    ///
+    BamRecord& QueryStart(const PacBio::BAM::Position pos);
+
+    /// \brief Sets this record's query start frame number
+    ///
+    /// \param[in] frameNumber
+    /// \returns reference to this record
+    ///
+    BamRecord& QueryStartFrameNumber(const int32_t frameNumber);
+
+    /// \brief Sets this record's expected read accuracy [0, 1000]
+    ///
+    /// \param[in] accuracy
+    /// \returns reference to this record
+    ///
+    BamRecord& ReadAccuracy(const Accuracy& accuracy);
+
+    /// \brief Attaches this record to the provided read group, changing the
+    ///        record name & 'RG' tag.
+    ///
+    /// \param[in] rg
+    /// \returns reference to this record
+    ///
+    BamRecord& ReadGroup(const ReadGroupInfo& rg);
+
+    /// \brief Attaches this record to the provided read group, changing the
+    ///        record name & 'RG' tag.
+    ///
+    /// \param[in] id
+    /// \returns reference to this record
+    ///
+    BamRecord& ReadGroupId(const std::string& id);
+
+    /// \brief Sets this scrap record's ScrapRegionType
+    ///
+    /// \param[in] type
+    /// \returns reference to this record
+    ///
+    BamRecord& ScrapRegionType(const VirtualRegionType type);
+
+    /// \brief Sets this scrap record's ScrapRegionType
+    ///
+    /// \param[in] type character equivalent of VirtualRegionType
+    /// \returns reference to this record
+    ///
+    BamRecord& ScrapRegionType(const char type);
+
+    /// \brief Sets this scrap record's ScrapZmwType
+    ///
+    /// \param[in] type
+    /// \returns reference to this record
+    ///
+    BamRecord& ScrapZmwType(const ZmwType type);
+
+    /// \brief Sets this scrap record's ScrapZmwType
+    ///
+    /// \param[in] type character equivalent of ZmwType
+    /// \returns reference to this record
+    ///
+    BamRecord& ScrapZmwType(const char type);
+
+    /// \brief Sets this record's average signal-to-noise in each of A, C, G,
+    ///        and T
+    ///
+    /// \param[in] snr average signal-to-noise of A, C, G, and T (in this order)
+    /// \returns reference to this record
+    ///
+    BamRecord& SignalToNoise(const std::vector<float>& snr);
+
+    /// \}
+
+public:
+    /// \name Barcode Data
+    /// \{
+
+    /// \brief Sets this record's barcode IDs ('bc' tag)
+    ///
+    /// \param[in] barcodeIds
+    /// \returns reference to this record
+    ///
+    BamRecord& Barcodes(const std::pair<int16_t, int16_t>& barcodeIds);
+
+    /// \brief Sets this record's barcode quality ('bq' tag)
+    ///
+    /// \param[in] quality Phred-scaled confidence call
+    /// \returns reference to this record
+    ///
+    BamRecord& BarcodeQuality(const uint8_t quality);
+
+    /// \}
+
+public:
+    /// \name Sequence & Tag Data
+    /// \{
+
+    /// \brief Sets this record's AltLabelTag values ("at" tag).
+    ///
+    /// \param[in] tags
+    /// \returns reference to this record
+    ///
+    BamRecord& AltLabelTag(const std::string& tags);
+
+    /// \brief Sets this record's DeletionTag values ("dt" tag).
+    ///
+    /// \param[in] tags
+    /// \returns reference to this record
+    ///
+    BamRecord& DeletionTag(const std::string& tags);
+
+    /// \brief Sets this record's SubstitutionTag values ("st" tag).
+    ///
+    /// \param[in] tags
+    /// \returns reference to this record
+    ///
+    BamRecord& SubstitutionTag(const std::string& tags);
+
+    /// \}
+
+public:
+    /// \name Quality Data
+    /// \{
+
+    /// \brief Sets this record's AltLabelQV values ("pv" tag).
+    ///
+    /// \param[in] altLabelQVs
+    /// \returns reference to this record
+    ///
+    BamRecord& AltLabelQV(const QualityValues& altLabelQVs);
+
+    /// \brief Sets this record's DeletionQV values ("dq" tag).
+    ///
+    /// \param[in] deletionQVs
+    /// \returns reference to this record
+    ///
+    BamRecord& DeletionQV(const QualityValues& deletionQVs);
+
+    /// \brief Sets this record's InsertionQV values ("iq" tag).
+    ///
+    /// \param[in] insertionQVs
+    /// \returns reference to this record
+    ///
+    BamRecord& InsertionQV(const QualityValues& insertionQVs);
+
+    /// \brief Sets this record's LabelQV values ("pq" tag).
+    ///
+    /// \param[in] labelQVs
+    /// \returns reference to this record
+    ///
+    BamRecord& LabelQV(const QualityValues& labelQVs);
+
+    /// \brief Sets this record's MergeQV values ("mq" tag).
+    ///
+    /// \param[in] mergeQVs
+    /// \returns reference to this record
+    ///
+    BamRecord& MergeQV(const QualityValues& mergeQVs);
+
+    /// \brief Sets this record's SubstitutionQV values ("sq" tag).
+    ///
+    /// \param[in] substitutionQVs
+    /// \returns reference to this record
+    ///
+    BamRecord& SubstitutionQV(const QualityValues& substitutionQVs);
+
+    /// \}
+
+public:
+    /// \name Pulse Data
+    /// \{
+
+    /// \brief Sets this record's IPD values ("ip" tag).
+    ///
+    /// \param[in] frames
+    /// \param[in] encoding specify how to encode the data (8-bit lossy, or
+    ///                     16-bit lossless)
+    /// \returns reference to this record
+    ///
+    BamRecord& IPD(const Frames& frames, const FrameEncodingType encoding);
+
+    /// \brief Sets this record's Pkmean values ("pm" tag).
+    ///
+    /// \param[in] photons
+    /// \returns reference to this record
+    ///
+    BamRecord& Pkmean(const std::vector<float>& photons);
+
+    /// \brief Sets this record's Pkmean values ("pm" tag).
+    ///
+    /// \param[in] encodedPhotons
+    /// \returns reference to this record
+    ///
+    BamRecord& Pkmean(const std::vector<uint16_t>& encodedPhotons);
+
+    /// \brief Sets this record's Pkmid values ("pa" tag).
+    ///
+    /// \param[in] photons
+    /// \returns reference to this record
+    ///
+    BamRecord& Pkmid(const std::vector<float>& photons);
+
+    /// \brief Sets this record's Pkmid values ("pa" tag).
+    ///
+    /// \param[in] encodedPhotons
+    /// \returns reference to this record
+    ///
+    BamRecord& Pkmid(const std::vector<uint16_t>& encodedPhotons);
+
+    /// \brief Sets this record's Pkmean2 values ("ps" tag).
+    ///
+    /// \param[in] photons
+    /// \returns reference to this record
+    ///
+    BamRecord& Pkmean2(const std::vector<float>& photons);
+
+    /// \brief Sets this record's Pkmean2 values ("ps" tag).
+    ///
+    /// \param[in] encodedPhotons
+    /// \returns reference to this record
+    ///
+    BamRecord& Pkmean2(const std::vector<uint16_t>& encodedPhotons);
+
+    /// \brief Sets this record's Pkmid2 values ("pi" tag).
+    ///
+    /// \param[in] photons
+    /// \returns reference to this record
+    ///
+    BamRecord& Pkmid2(const std::vector<float>& photons);
+
+    /// \brief Sets this record's Pkmid2 values ("pi" tag).
+    ///
+    /// \param[in] encodedPhotons
+    /// \returns reference to this record
+    ///
+    BamRecord& Pkmid2(const std::vector<uint16_t>& encodedPhotons);
+
+    /// \brief Sets this record's PreBaseFrames aka IPD values ("ip" tag).
+    ///
+    /// \param[in] frames
+    /// \param[in] encoding specify how to encode the data (8-bit lossy, or
+    ///                     16-bit lossless)
+    /// \returns reference to this record
+    ///
+    BamRecord& PreBaseFrames(const Frames& frames, const FrameEncodingType encoding);
+
+    /// \brief Sets this record's PrePulseFrames values ("pd" tag).
+    ///
+    /// \param[in] frames
+    /// \param[in] encoding specify how to encode the data (8-bit lossy, or
+    ///                     16-bit lossless)
+    /// \returns reference to this record
+    ///
+    BamRecord& PrePulseFrames(const Frames& frames, const FrameEncodingType encoding);
+
+    /// \brief Sets this record's PulseCall values ("pc" tag).
+    ///
+    /// \param[in] tags
+    /// \returns reference to this record
+    ///
+    BamRecord& PulseCall(const std::string& tags);
+
+    /// \brief Sets this record's PulseCallWidth values ("px" tag).
+    ///
+    /// \param[in] frames
+    /// \param[in] encoding specify how to encode the data (8-bit lossy, or
+    ///                     16-bit lossless)
+    /// \returns reference to this record
+    ///
+    BamRecord& PulseCallWidth(const Frames& frames, const FrameEncodingType encoding);
+
+    ///
+    /// \brief Sets this record's PulseExclusionReason values ("pe" tag).
+    /// \param[in] reasons
+    /// \return reference to this record
+    ///
+    BamRecord& PulseExclusionReason(const std::vector<PacBio::BAM::PulseExclusionReason>& reasons);
+
+    /// \brief Sets this record's PulseMergeQV values ("pg" tag).
+    ///
+    /// \param[in] pulseMergeQVs
+    /// \returns reference to this record
+    ///
+    BamRecord& PulseMergeQV(const QualityValues& pulseMergeQVs);
+
+    /// \brief Sets this record's PulseWidth values ("pw" tag).
+    ///
+    /// \param[in] frames
+    /// \param[in] encoding specify how to encode the data (8-bit lossy, or
+    ///                     16-bit lossless)
+    /// \returns reference to this record
+    ///
+    BamRecord& PulseWidth(const Frames& frames, const FrameEncodingType encoding);
+
+    /// \brief Sets this record's StartFrame values ("sf" tag).
+    ///
+    /// \param[in] startFrame
+    /// \returns reference to this record
+    ///
+    BamRecord& StartFrame(const std::vector<uint32_t>& startFrame);
+
+    /// \}
+
+public:
+    /// \name Low-Level Access & Operations
+    /// \{
+
+    /// \brief Resets cached aligned start/end.
+    ///
+    /// \note This method should not be needed in most client code. It exists
+    ///       primarily as a hook for internal reading loops (queries, index
+    ///       build, etc.) It's essentially a workaround and will likely be
+    ///       removed from the API.
+    ///
+    void ResetCachedPositions() const;
+
+    /// \brief Resets cached aligned start/end.
+    ///
+    /// \note This method should not be needed in most client code. It exists
+    ///       primarily as a hook for internal reading loops (queries, index
+    ///       build, etc.) It's essentially a workaround and will likely be
+    ///       removed from the API.
+    ///
+    void ResetCachedPositions();
+
+    /// \brief Updates the record's name (BamRecord::FullName) to reflect
+    ///        modifications to name components (movie name, ZMW hole number,
+    ///        etc.)
+    ///
+    void UpdateName();
+
+    /// \}
+
+public:
+    /// \name Pulse Data
+    /// \{
+
+    static const float photonFactor;
+
+    static std::vector<uint16_t> EncodePhotons(const std::vector<float>& data);
+
+    /// \}
+
+public:
+    /// \name [Mapped]Read conversion
+    /// \{
+
+    ///
+    /// \return Data::Read representation of this record
+    ///
+    Data::Read ToRead() const;
+
+    ///
+    /// \return Data::MappedRead representation of this record
+    ///
+    /// \throws std::runtime_error if record is unmapped
+    ///
+    Data::MappedRead ToMappedRead() const;
+
+    /// \}
+
+public:
+    /// \name Clipping & Mapping
+    /// \{
+
+    /// Creates a copied record from input, with clipping applied
+    static BamRecord Clipped(const BamRecord& input, const ClipType clipType,
+                             const PacBio::BAM::Position start, const PacBio::BAM::Position end,
+                             const bool exciseFlankingInserts = false);
+
+    /// Creates a copied record from input, with mapping applied
+    static BamRecord Mapped(const BamRecord& input, const int32_t referenceId,
+                            const Position refStart, const Strand strand, const Cigar& cigar,
+                            const uint8_t mappingQuality);
+
+    /// Applies clipping to this record
+    BamRecord& Clip(const ClipType clipType, const PacBio::BAM::Position start,
+                    const PacBio::BAM::Position end, const bool exciseFlankingInserts = false);
+
+    /// Creates a copied record from this one, with clipping applied
+    BamRecord Clipped(const ClipType clipType, const PacBio::BAM::Position start,
+                      const PacBio::BAM::Position end,
+                      const bool exciseFlankingInserts = false) const;
+
+    /// Applies mapping to this record
+    BamRecord& Map(const int32_t referenceId, const Position refStart, const Strand strand,
+                   const Cigar& cigar, const uint8_t mappingQuality);
+
+    /// Creates a copied record from this one, with mapping applied
+    BamRecord Mapped(const int32_t referenceId, const Position refStart, const Strand strand,
+                     const Cigar& cigar, const uint8_t mappingQuality) const;
+    /// \}
+
+private:
+    BamRecordImpl impl_;
+
+public:
+    /// public & mutable so that queries can directly set the header info,
+    /// even on a record that is const from client code's perspective
+    mutable BamHeader header_;
+
+private:
+    /// \internal
+    /// cached positions (mutable to allow lazy-calc in const methods)
+    mutable Position alignedStart_;
+    mutable Position alignedEnd_;
+
+private:
+    /// \internal
+    /// pulse to bam mapping cache
+    mutable std::unique_ptr<Pulse2BaseCache> p2bCache_;
+
+public:
+    /// clips the PacBio tags to a specified length
+    void ClipTags(const size_t clipPos, const size_t clipLength);
+
+private:
+    ///\internal
+    /// clipping methods
+
+    void ClipFields(const size_t clipPos, const size_t clipLength);
+
+    BamRecord& ClipToQuery(const PacBio::BAM::Position start, const PacBio::BAM::Position end);
+    BamRecord& ClipToReference(const PacBio::BAM::Position start, const PacBio::BAM::Position end,
+                               const bool exciseFlankingInserts);
+
+private:
+    ///\internal
+    /// raw tag data fetching
+
+    // sequence tags
+    std::string FetchBasesRaw(const BamRecordTag tag) const;
+    std::string FetchBases(const BamRecordTag tag,
+                           const Orientation orientation = Orientation::NATIVE,
+                           const bool aligned = false, const bool exciseSoftClips = false,
+                           const PulseBehavior pulseBehavior = PulseBehavior::ALL) const;
+
+    // frame tags
+    Frames FetchFramesRaw(const BamRecordTag tag) const;
+    Frames FetchFrames(const BamRecordTag tag, const Orientation orientation = Orientation::NATIVE,
+                       const bool aligned = false, const bool exciseSoftClips = false,
+                       const PulseBehavior pulseBehavior = PulseBehavior::ALL) const;
+
+    // pulse tags
+    std::vector<float> FetchPhotonsRaw(const BamRecordTag tag) const;
+    std::vector<float> FetchPhotons(const BamRecordTag tag,
+                                    const Orientation orientation = Orientation::NATIVE,
+                                    const bool aligned = false, const bool exciseSoftClips = false,
+                                    const PulseBehavior pulseBehavior = PulseBehavior::ALL) const;
+
+    // QV tags
+    QualityValues FetchQualitiesRaw(const BamRecordTag tag) const;
+    QualityValues FetchQualities(const BamRecordTag tag,
+                                 const Orientation orientation = Orientation::NATIVE,
+                                 const bool aligned = false, const bool exciseSoftClips = false,
+                                 const PulseBehavior pulseBehavior = PulseBehavior::ALL) const;
+
+    // UInt tags (e.g. start frame)
+    //
+    // TODO (DB): clean this up w.r.t FetchUInt8s
+    //
+    std::vector<uint32_t> FetchUInt32sRaw(const BamRecordTag tag) const;
+    std::vector<uint32_t> FetchUInt32s(
+        const BamRecordTag tag, const Orientation orientation = Orientation::NATIVE,
+        const bool aligned = false, const bool exciseSoftClips = false,
+        const PulseBehavior pulseBehavior = PulseBehavior::ALL) const;
+
+    // UInt tags (e.g. pulse exclusion)
+    //
+    // TODO (DB): clean this up w.r.t FetchUInt32s
+    //
+    std::vector<uint8_t> FetchUInt8sRaw(const BamRecordTag tag) const;
+    std::vector<uint8_t> FetchUInt8s(const BamRecordTag tag,
+                                     const Orientation orientation = Orientation::NATIVE,
+                                     const bool aligned = false, const bool exciseSoftClips = false,
+                                     const PulseBehavior pulseBehavior = PulseBehavior::ALL) const;
+
+private:
+    ///\internal
+    /// marked const to allow calling from const methods
+    /// but updates our mutable cached values
+    void CalculateAlignedPositions() const;
+    void CalculatePulse2BaseCache() const;
+
+    friend class BamRecordMemory;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // BAMRECORD_H
diff --git a/include/pbbam/BamRecordBuilder.h b/include/pbbam/BamRecordBuilder.h

new file mode 100644 (file)

index 0000000..b1c1080
--- /dev/null
+++ b/include/pbbam/BamRecordBuilder.h
@@ -0,0 +1,239 @@
+// File Description
+/// \file BamRecordBuilder.h
+/// \brief Defines the BamRecordBuilder class.
+//
+// Author: Derek Barnett
+
+#ifndef BAMRECORDBUILDER_H
+#define BAMRECORDBUILDER_H
+
+#include "pbbam/Config.h"
+
+#include <cstdint>
+#include <string>
+
+#include "pbbam/BamHeader.h"
+#include "pbbam/BamRecord.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The BamRecordBuilder class provides a helper utility for building
+///        BamRecords.
+///
+/// This class provides a mechanism for building up %BAM data and
+/// lazy-encoding/constructing the actual BamRecord. Currently, the methods here
+/// really only support filling in the low-level SAM/BAM-style fields, not so
+/// much the PacBio-specific fields.
+///
+class PBBAM_EXPORT BamRecordBuilder
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Creates an empty %BAM record builder.
+    BamRecordBuilder();
+
+    /// \brief Creates an empty %BAM record builder, with header info to apply
+    ///        to built records.
+    ///
+    /// \param[in] header   BamHeader object
+    ///
+    explicit BamRecordBuilder(BamHeader header);
+
+    /// \brief Creates record builder with initial record data.
+    ///
+    /// \param[in] prototype    data from this record will be used to seed the
+    ///                         builder
+    /// \note Not declared explicit: a BamRecord converts implicitly to a builder.
+    BamRecordBuilder(const BamRecord& prototype);
+
+    /// \}
+
+public:
+    /// \name Record-Building
+    /// \{
+
+    /// \brief Builds a BamRecord from current builder attributes.
+    ///
+    /// \returns newly-built BamRecord object
+    ///
+    BamRecord Build() const;
+
+    /// \brief Replaces an existing BamRecord's data with current builder
+    ///        attributes.
+    ///
+    /// \param[out] record resulting record
+    /// \returns true if successful
+    ///
+    bool BuildInPlace(BamRecord& record) const;
+
+    /// \brief Resets builder attributes to default values.
+    ///
+    void Reset();
+
+    /// \brief Resets builder attributes with \p prototype's data.
+    ///
+    /// \param[in] prototype
+    ///
+    void Reset(BamRecord prototype);
+
+    /// \}
+
+public:
+    /// \name Core Attribute Setup
+    /// \{
+
+    /// \brief Sets the record's (BAI) index bin ID.
+    ///
+    /// \param[in] bin BAI index bin ID.
+    /// \returns reference to this builder
+    ///
+    BamRecordBuilder& Bin(const uint32_t bin);
+
+    /// \brief Sets this record's alignment flag, using a raw integer.
+    ///
+    /// \param[in] flag raw alignment flag
+    /// \returns reference to this builder
+    ///
+    BamRecordBuilder& Flag(const uint32_t flag);
+
+    /// \brief Sets this record's insert size.
+    ///
+    /// \param[in] iSize insert size
+    /// \returns reference to this builder
+    ///
+    BamRecordBuilder& InsertSize(const int32_t iSize);
+
+    /// \brief Sets this record's map quality.
+    ///
+    /// \param[in] mapQual mapping quality - value of 255 indicates "unknown"
+    /// \returns reference to this builder
+    ///
+    BamRecordBuilder& MapQuality(const uint8_t mapQual);
+
+    /// \brief Sets this record's mate's mapped position.
+    ///
+    /// \param[in] pos mapped position. A value of -1 indicates unmapped.
+    /// \returns reference to this builder
+    ///
+    BamRecordBuilder& MatePosition(const int32_t pos);
+
+    /// \brief Sets this record's mate's mapped reference ID
+    ///
+    /// \param[in] id reference ID. A value of -1 indicates unmapped.
+    /// \returns reference to this builder
+    ///
+    BamRecordBuilder& MateReferenceId(const int32_t id);
+
+    /// \brief Sets this record's mapped position.
+    ///
+    /// \param[in] pos mapped position. A value of -1 indicates unmapped.
+    /// \returns reference to this builder
+    ///
+    BamRecordBuilder& Position(const int32_t pos);
+
+    /// \brief Sets this record's mapped reference ID
+    ///
+    /// \param[in] id reference ID. A value of -1 indicates unmapped.
+    /// \returns reference to this builder
+    ///
+    BamRecordBuilder& ReferenceId(const int32_t id);
+
+    /// \}
+
+public:
+    /// \name Alignment Flag Setup
+    /// \{
+
+    /// \brief Sets whether this record is a PCR/optical duplicate
+    BamRecordBuilder& SetDuplicate(bool ok);
+
+    /// \brief Sets whether this record failed quality controls
+    BamRecordBuilder& SetFailedQC(bool ok);
+
+    /// \brief Sets whether this record is the first mate of a pair.
+    BamRecordBuilder& SetFirstMate(bool ok);
+
+    /// \brief Sets whether this record was aligned.
+    BamRecordBuilder& SetMapped(bool ok);
+
+    /// \brief Sets whether this record's mate was aligned.
+    BamRecordBuilder& SetMateMapped(bool ok);
+
+    /// \brief Sets whether this record's mate mapped to reverse strand.
+    BamRecordBuilder& SetMateReverseStrand(bool ok);
+
+    /// \brief Sets whether this record came from paired-end sequencing.
+    BamRecordBuilder& SetPaired(bool ok);
+
+    /// \brief Sets whether this record is a read's primary alignment.
+    BamRecordBuilder& SetPrimaryAlignment(bool ok);
+
+    /// \brief Sets whether this record & its mate were properly mapped, per the
+    ///        aligner.
+    ///
+    BamRecordBuilder& SetProperPair(bool ok);
+
+    /// \brief Sets whether this record mapped to reverse strand.
+    BamRecordBuilder& SetReverseStrand(bool ok);
+
+    /// \brief Sets whether this record is the second mate of a pair.
+    BamRecordBuilder& SetSecondMate(bool ok);
+
+    /// \brief Sets whether this record is a supplementary alignment.
+    BamRecordBuilder& SetSupplementaryAlignment(bool ok);
+
+    /// \}
+
+public:
+    /// \name Variable-Length Data Setup
+    /// \{
+
+    /// \brief Sets the record's CIGAR data.
+    ///
+    /// \returns reference to this builder
+    ///
+    BamRecordBuilder& Cigar(PacBio::BAM::Cigar cigar);
+
+    /// \brief Sets the record's name.
+    ///
+    /// \returns reference to this builder
+    ///
+    BamRecordBuilder& Name(std::string name);
+
+    /// \brief Sets the record's qualities.
+    ///
+    /// \returns reference to this builder
+    ///
+    BamRecordBuilder& Qualities(std::string qualities);
+
+    /// \brief Sets the record's sequence.
+    ///
+    /// \returns reference to this builder
+    ///
+    BamRecordBuilder& Sequence(std::string sequence);
+
+    /// \brief Sets the record's tags.
+    ///
+    /// \returns reference to this builder
+    ///
+    BamRecordBuilder& Tags(TagCollection tags);
+
+    /// \}
+
+private:
+    BamHeader header_;
+    bam1_core_t core_;
+    std::string name_;
+    std::string sequence_;
+    std::string qualities_;
+    PacBio::BAM::Cigar cigar_;
+    TagCollection tags_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // BAMRECORDBUILDER_H
diff --git a/include/pbbam/BamRecordImpl.h b/include/pbbam/BamRecordImpl.h

new file mode 100644 (file)

index 0000000..81a8a03
--- /dev/null
+++ b/include/pbbam/BamRecordImpl.h
@@ -0,0 +1,594 @@
+// File Description
+/// \file BamRecordImpl.h
+/// \brief Defines the BamRecordImpl class.
+//
+// Author: Derek Barnett
+
+#ifndef BAMRECORDIMPL_H
+#define BAMRECORDIMPL_H
+
+#include "pbbam/Config.h"
+
+#include <htslib/sam.h>
+#include <cstddef>
+#include <cstdint>
+#include <map>
+#include <memory>
+#include <string>
+#include <unordered_map>
+
+#include "pbbam/BamRecordTag.h"
+#include "pbbam/Cigar.h"
+#include "pbbam/Position.h"
+#include "pbbam/QualityValues.h"
+#include "pbbam/TagCollection.h"
+
+namespace PacBio {
+namespace BAM {
+
+class SamWriter;
+
+/// \brief The BamRecordImpl class holds all data necessary for creating,
+///        querying or editing a generic %BAM record.
+///
+/// For PacBio-specific extensions and convenience methods, see BamRecord.
+///
+/// \note This class is mostly an internal implementation detail and will
+///       likely be removed from the public API in the future. Please use
+///       BamRecord as much as possible.
+///
+class PBBAM_EXPORT BamRecordImpl
+{
+public:
+    // clang-format off
+    /// These flags describe the alignment status of the record.
+    enum AlignmentFlag
+    {
+        PAIRED              = 0x0001,   ///< Record comes from paired-end sequencing
+        PROPER_PAIR         = 0x0002,   ///< Each mate of a pair was properly aligned ("proper" as determined by aligner)
+        UNMAPPED            = 0x0004,   ///< Record was not mapped by aligner
+        MATE_UNMAPPED       = 0x0008,   ///< Record's mate was not mapped by aligner
+        REVERSE_STRAND      = 0x0010,   ///< Record was aligned to reverse strand (Sequence() is reverse-complemented)
+        MATE_REVERSE_STRAND = 0x0020,   ///< Record's mate was aligned to reverse strand (mate's Sequence() is reverse-complemented)
+        MATE_1              = 0x0040,   ///< Record is first mate of pair
+        MATE_2              = 0x0080,   ///< Record is second mate of pair
+        SECONDARY           = 0x0100,   ///< Record is a secondary alignment
+        FAILED_QC           = 0x0200,   ///< Record failed quality controls
+        DUPLICATE           = 0x0400,   ///< Record is a PCR/optical duplicate
+        SUPPLEMENTARY       = 0x0800    ///< Record is a supplementary alignment
+    };
+    // clang-format on
+
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    BamRecordImpl();
+    BamRecordImpl(const BamRecordImpl& other);
+    BamRecordImpl(BamRecordImpl&& other) noexcept = default;
+    BamRecordImpl& operator=(const BamRecordImpl& other);
+    BamRecordImpl& operator=(BamRecordImpl&& other) noexcept = default;
+    virtual ~BamRecordImpl() = default;
+
+    /// \}
+
+public:
+    /// \name Core Data
+    /// \{
+
+    /// \returns this record's assigned (BAI) index bin ID.
+    uint32_t Bin() const;
+
+    /// \returns this record's alignment flag, in raw integer form.
+    uint32_t Flag() const;
+
+    /// \returns this record's insert size
+    int32_t InsertSize() const;
+
+    /// \returns this record's mapping quality. A value of 255 indicates "unknown"
+    uint8_t MapQuality() const;
+
+    /// \returns this record's mate's mapped position, or -1 if unmapped
+    PacBio::BAM::Position MatePosition() const;
+
+    /// \returns this record's mate's mapped reference ID, or -1 if unmapped
+    int32_t MateReferenceId() const;
+
+    /// \returns this record's mapped position, or -1 if unmapped
+    PacBio::BAM::Position Position() const;
+
+    /// \returns this record's mapped reference ID, or -1 if unmapped
+    int32_t ReferenceId() const;
+
+    /// Sets the record's (BAI) index bin ID.
+    ///
+    /// \param[in] bin BAI index bin ID.
+    /// \returns reference to this record
+    ///
+    BamRecordImpl& Bin(uint32_t bin);
+
+    /// Sets this record's alignment flag, using a raw integer.
+    ///
+    /// \param[in] flag raw alignment flag
+    /// \returns reference to this record
+    ///
+    BamRecordImpl& Flag(uint32_t flag);
+
+    /// Sets this record's insert size.
+    ///
+    /// \param[in] iSize insert size
+    /// \returns reference to this record
+    ///
+    BamRecordImpl& InsertSize(int32_t iSize);
+
+    /// Sets this record's map quality.
+    ///
+    /// \param[in] mapQual mapping quality - value of 255 indicates "unknown"
+    /// \returns reference to this record
+    ///
+    BamRecordImpl& MapQuality(uint8_t mapQual);
+
+    /// Sets this record's mate's mapped position.
+    ///
+    /// \param[in] pos mapped position. A value of -1 indicates unmapped.
+    /// \returns reference to this record
+    ///
+    BamRecordImpl& MatePosition(PacBio::BAM::Position pos);
+
+    /// Sets this record's mate's mapped reference ID
+    ///
+    /// \param[in] id reference ID. A value of -1 indicates unmapped.
+    /// \returns reference to this record
+    ///
+    BamRecordImpl& MateReferenceId(int32_t id);
+
+    /// Sets this record's mapped position.
+    ///
+    /// \param[in] pos mapped position. A value of -1 indicates unmapped.
+    /// \returns reference to this record
+    ///
+    BamRecordImpl& Position(PacBio::BAM::Position pos);
+
+    /// Sets this record's mapped reference ID
+    ///
+    /// \param[in] id reference ID. A value of -1 indicates unmapped.
+    /// \returns reference to this record
+    ///
+    BamRecordImpl& ReferenceId(int32_t id);
+
+    /// \}
+
+public:
+    /// \name Alignment Flags
+    /// \{
+
+    /// \returns true if this record is a PCR/optical duplicate
+    bool IsDuplicate() const;
+
+    /// \returns true if this record failed quality controls
+    bool IsFailedQC() const;
+
+    /// \returns true if this record is the first mate of a pair
+    bool IsFirstMate() const;
+
+    /// \returns true if this record was mapped by aligner
+    bool IsMapped() const;
+
+    /// \returns true if this record's mate was mapped by aligner
+    bool IsMateMapped() const;
+
+    /// \returns true if this record's mate was mapped to the reverse strand
+    bool IsMateReverseStrand() const;
+
+    /// \returns true if this record comes from paired-end sequencing
+    bool IsPaired() const;
+
+    /// \returns true if this record is a read's primary alignment
+    bool IsPrimaryAlignment() const;
+
+    /// \returns true if this record & its mate were properly aligned
+    bool IsProperPair() const;
+
+    /// \returns true if this record was mapped to the reverse strand
+    bool IsReverseStrand() const;
+
+    /// \returns true if this record is the second mate of a pair
+    bool IsSecondMate() const;
+
+    /// \returns true if this record is a supplementary alignment
+    bool IsSupplementaryAlignment() const;
+
+    /// Sets whether this record is a PCR/optical duplicate
+    BamRecordImpl& SetDuplicate(bool ok);
+
+    /// Sets whether this record failed quality controls
+    BamRecordImpl& SetFailedQC(bool ok);
+
+    /// Sets whether this record is the first mate of a pair.
+    BamRecordImpl& SetFirstMate(bool ok);
+
+    /// Sets whether this record was aligned.
+    BamRecordImpl& SetMapped(bool ok);
+
+    /// Sets whether this record's mate was aligned.
+    BamRecordImpl& SetMateMapped(bool ok);
+
+    /// Sets whether this record's mate mapped to reverse strand.
+    BamRecordImpl& SetMateReverseStrand(bool ok);
+
+    /// Sets whether this record came from paired-end sequencing.
+    BamRecordImpl& SetPaired(bool ok);
+
+    /// Sets whether this record is a read's primary alignment.
+    BamRecordImpl& SetPrimaryAlignment(bool ok);
+
+    /// Sets whether this record & its mate were properly mapped, per the aligner.
+    BamRecordImpl& SetProperPair(bool ok);
+
+    /// Sets whether this record mapped to reverse strand.
+    BamRecordImpl& SetReverseStrand(bool ok);
+
+    /// Sets whether this record is the second mate of a pair.
+    BamRecordImpl& SetSecondMate(bool ok);
+
+    /// Sets whether this record is a supplementary alignment.
+    BamRecordImpl& SetSupplementaryAlignment(bool ok);
+
+    /// \}
+
+public:
+    /// \name Variable-length Data (sequence, qualities, etc.)
+    /// \{
+
+    /// \returns the record's CIGAR data as a Cigar object
+    Cigar CigarData() const;
+
+    /// Sets the record's CIGAR data using a Cigar object
+    ///
+    /// \param[in] cigar PacBio::BAM::Cigar object
+    /// \returns reference to this record
+    ///
+    BamRecordImpl& CigarData(const Cigar& cigar);
+
+    /// Sets the record's CIGAR data using a CIGAR-formatted string.
+    ///
+    /// \param[in] cigarString CIGAR-formatted string
+    /// \returns reference to this record
+    ///
+    BamRecordImpl& CigarData(const std::string& cigarString);
+
+    // TODO: CIGAR iterator - Cigar only or here as well ??
+
+    /// \returns the record's query name
+    std::string Name() const;
+
+    /// Sets the record's "query name".
+    ///
+    /// \param name new name
+    /// \returns reference to this record
+    ///
+    BamRecordImpl& Name(const std::string& name);
+
+    /// \returns the record's quality values (phred-style ASCII)
+    ///
+    /// \note Usually Qualities().size() == Sequence().size(). However, in
+    ///       some data sets, the quality values are not provided. In that
+    ///       case, this method will return an empty container.
+    ///
+    QualityValues Qualities() const;
+
+    /// \returns the record's DNA sequence.
+    std::string Sequence() const;
+
+    size_t SequenceLength() const;
+
+    /// \brief Sets the record's DNA sequence and quality values
+    ///
+    /// This is an overloaded function. Sets the DNA sequence and quality
+    /// values, using the length of \p sequence.
+    ///
+    /// \note When using this overload (and \p qualities is non-empty), the
+    ///       lengths of \p sequence and \p qualities \b must be equal.
+    ///
+    /// \todo How to handle mismatched lengths?
+    ///
+    /// \param[in] sequence  std::string containing DNA sequence
+    /// \param[in] qualities std::string containing ASCII quality values
+    ///
+    /// \returns reference to this record.
+    ///
+    /// \sa SetSequenceAndQualities(const char* sequence,
+    ///     const size_t sequenceLength, const char* qualities)
+    ///
+    BamRecordImpl& SetSequenceAndQualities(const std::string& sequence,
+                                           const std::string& qualities = std::string());
+
+    /// \brief Sets the record's DNA sequence and quality values.
+    ///
+    /// The \p sequence must consist of IUPAC nucleotide codes {=ACMGRSVTWYHKDBN}.
+    /// The \p qualities, if not empty, must consist of 'phred'-style ASCII
+    /// quality values. \p qualities may be an empty string or NULL pointer in
+    /// cases where there are no such data available.
+    ///
+    /// \param[in] sequence         C-string containing DNA sequence
+    /// \param[in] sequenceLength   length of DNA sequence
+    /// \param[in] qualities        C-string containing 'phred-style' ASCII
+    ///                             quality values
+    ///
+    /// \note \p sequence does \b NOT have to be NULL-terminated. Length is
+    ///       explicitly determined by the value of \p sequenceLength provided.
+    ///
+    /// \returns reference to this record.
+    ///
+    BamRecordImpl& SetSequenceAndQualities(const char* sequence, const size_t sequenceLength,
+                                           const char* qualities = nullptr);
+
+    /// \brief Sets the record's DNA sequence and quality values.
+    ///
+    /// The \p encodedSequence should be preencoded/packed into the BAM binary
+    /// format. The \p qualities, if not empty, must consist of 'phred'-style
+    /// ASCII quality values. \p qualities may be an empty string or NULL
+    /// pointer in cases where there are no such data available.
+    ///
+    /// \param[in] encodedSequence      C-string containing BAM-format-encoded
+    ///                                 DNA sequence
+    /// \param[in] rawSequenceLength    length of DNA sequence (not the encoded
+    ///                                 length)
+    /// \param[in] qualities            C-string containing 'phred-style' ASCII
+    ///                                 quality values
+    ///
+    /// \note \p encodedSequence does \b NOT have to be NULL-terminated. Length
+    ///       is explicitly determined by the value of \p rawSequenceLength
+    ///       provided.
+    ///
+    /// \returns reference to this record.
+    ///
+    /// \sa SetSequenceAndQualities(const char* sequence,
+    ///     const size_t sequenceLength, const char* qualities)
+    ///
+    BamRecordImpl& SetPreencodedSequenceAndQualities(const char* encodedSequence,
+                                                     const size_t rawSequenceLength,
+                                                     const char* qualities = nullptr);
+
+    /// \}
+
+public:
+    /// \name Tag Data
+    /// \{
+
+    /// \returns record's full tag data as a TagCollection object
+    TagCollection Tags() const;
+
+    /// \brief Sets the record's full tag data via a TagCollection object
+    ///
+    BamRecordImpl& Tags(const TagCollection& tags);
+
+    /// \brief Adds a new tag to this record.
+    ///
+    /// \param[in] tagName  2-character tag name.
+    /// \param[in] value    Tag object that describes the type & value of data
+    ///                     to be added
+    ///
+    /// \note Any value that can be used to implicitly construct a Tag is valid.
+    /// \code
+    ///     string s;
+    ///     vector<uint32_t> v;
+    ///     record.AddTag("XX", s); // will add a string-type tag
+    ///     record.AddTag("YY", v); // will add a uint32-array-type tag
+    /// \endcode
+    ///
+    /// \returns true if tag was successfully added.
+    ///
+    bool AddTag(const std::string& tagName, const Tag& value);
+
+    /// \brief Adds a new tag to this record.
+    ///
+    /// This is an overloaded method.
+    ///
+    /// \param[in] tag      BamRecordTag enum
+    /// \param[in] value    Tag object that describes the type & value of data
+    ///                     to be added
+    /// \returns true if tag was successfully added.
+    ///
+    bool AddTag(const BamRecordTag tag, const Tag& value);
+
+    /// \brief Adds a new tag to this record, with an optional modifier.
+    ///
+    /// \param[in] tagName              2-character tag name.
+    /// \param[in] value                Tag object that describes the type &
+    ///                                 value of data to be added
+    /// \param[in] additionalModifier   optional extra modifier (for explicit
+    ///                                 modification of an otherwise const Tag)
+    ///
+    /// \note Any value that can be used to implicitly construct a Tag is valid.
+    /// \code
+    ///     char c;
+    ///     string h;
+    ///     record.AddTag("XX", c, TagModifier::ASCII_CHAR); // will add a char-type tag
+    ///     record.AddTag("YY", h, TagModifier::HEX_STRING); // will add a hex string-type tag
+    /// \endcode
+    ///
+    /// \returns true if tag was successfully added.
+    ///
+    bool AddTag(const std::string& tagName, const Tag& value, const TagModifier additionalModifier);
+
+    /// \brief Adds a new tag to this record, with an optional modifier.
+    ///
+    /// This is an overloaded method.
+    ///
+    /// \param[in] tag                  BamRecordTag enum.
+    /// \param[in] value                Tag object that describes the type &
+    ///                                 value of data to be added
+    /// \param[in] additionalModifier   optional extra modifier (for explicit
+    ///                                 modification of an otherwise const Tag)
+    ///
+    /// \returns true if tag was successfully added.
+    ///
+    bool AddTag(const BamRecordTag tag, const Tag& value, const TagModifier additionalModifier);
+
+    /// \brief Edits an existing tag on this record.
+    ///
+    /// \param[in] tagName      2-character tag name. Name must be present
+    ///                         (see HasTag)
+    /// \param[in] newValue     Tag object that describes the type & value of
+    ///                         new data to be added
+    ///
+    /// \note Any value that can be used to implicitly construct a Tag is valid.
+    /// \code
+    ///     string s;
+    ///     vector<uint32_t> v;
+    ///     record.EditTag("XX", s); // will overwrite tag XX with a string-type tag
+    ///     record.EditTag("YY", v); // will overwrite tag YY with a uint32-array-type tag
+    /// \endcode
+    ///
+    /// \returns true if tag was successfully edited.
+    ///
+    bool EditTag(const std::string& tagName, const Tag& newValue);
+
+    /// \brief Edits an existing tag on this record.
+    ///
+    /// This is an overloaded method.
+    ///
+    /// \param[in] tag          BamRecordTag enum
+    /// \param[in] newValue     Tag object that describes the type & value of
+    ///                         new data to be added
+    ///
+    /// \returns true if tag was successfully edited.
+    ///
+    bool EditTag(const BamRecordTag tag, const Tag& newValue);
+
+    /// \brief Edits an existing tag on this record.
+    ///
+    /// \param[in] tagName              2-character tag name. Name must be
+    ///                                 present (see HasTag)
+    /// \param[in] value                Tag object that describes the type &
+    ///                                 value of new data to be added
+    /// \param[in] additionalModifier   optional extra modifier (for explicit
+    ///                                 modification of an otherwise const Tag)
+    ///
+    /// \note Any value that can be used to implicitly construct a Tag is valid.
+    /// \code
+    ///     char c;
+    ///     string h;
+    ///     record.EditTag("XX", c, TagModifier::ASCII_CHAR); // will overwrite tag XX with a char-type tag
+    ///     record.EditTag("YY", h, TagModifier::HEX_STRING); // will overwrite tag YY with a hex string-type tag
+    /// \endcode
+    ///
+    /// \returns true if tag was successfully edited.
+    ///
+    bool EditTag(const std::string& tagName, const Tag& value,
+                 const TagModifier additionalModifier);
+
+    /// \brief Edits an existing tag on this record.
+    ///
+    /// This is an overloaded method.
+    ///
+    /// \param[in] tag                  BamRecordTag enum
+    /// \param[in] value                Tag object that describes the type &
+    ///                                 value of new data to be added
+    /// \param[in] additionalModifier   optional extra modifier (for explicit
+    ///                                 modification of an otherwise const Tag)
+    ///
+    /// \returns true if tag was successfully edited.
+    ///
+    bool EditTag(const BamRecordTag tag, const Tag& value, const TagModifier additionalModifier);
+
+    /// \returns true if a tag with this name is present in this record.
+    bool HasTag(const std::string& tagName) const;
+
+    /// \returns true if this tag is present in this record.
+    ///
+    /// This is an overloaded method.
+    ///
+    bool HasTag(const BamRecordTag tag) const;
+
+    /// \brief Removes an existing tag from this record.
+    ///
+    /// \param[in] tagName  2-character tag name.
+    ///
+    /// \returns true if tag was actually removed (i.e. false if tagName
+    ///          previously unknown)
+    /// \sa HasTag
+    ///
+    bool RemoveTag(const std::string& tagName);
+
+    /// \brief Removes an existing tag from this record.
+    ///
+    /// This is an overloaded method.
+    ///
+    /// \param[in] tag  BamRecordTag enum
+    ///
+    /// \returns true if tag was actually removed (i.e. false if \p tag was
+    ///          previously unknown)
+    /// \sa HasTag
+    ///
+    bool RemoveTag(const BamRecordTag tag);
+
+    /// \brief Fetches a tag from this record.
+    ///
+    /// \param[in] tagName  2-character tag name.
+    ///
+    /// \returns Tag object for the requested name. If name is unknown, a
+    ///          default constructed Tag is returned (Tag::IsNull() is true).
+    ///
+    Tag TagValue(const std::string& tagName) const;
+
+    /// \brief Fetches a tag from this record.
+    ///
+    /// This is an overloaded method
+    ///
+    /// \param[in] tag  BamRecordTag enum
+    ///
+    /// \returns Tag object for the requested name. If name is unknown, a
+    ///          default constructed Tag is returned (Tag::IsNull() is true).
+    ///
+    Tag TagValue(const BamRecordTag tag) const;
+
+    // change above to Tag();
+
+    //    template<typename T>
+    //    T TagValue(const std::string& tagName) const;
+
+    /// \}
+
+private:
+    // returns a BamRecordImpl object, with a deep copy of @rawData contents
+    static BamRecordImpl FromRawData(const std::shared_ptr<bam1_t>& rawData);
+
+    // internal memory setup/expand methods
+    void InitializeData();
+    void MaybeReallocData();
+    void UpdateTagMap() const;  // allowed to be called from const methods
+                                // (lazy update on request)
+
+    // internal tag helper methods
+    bool AddTagImpl(const std::string& tagName, const Tag& value,
+                    const TagModifier additionalModifier);
+    bool RemoveTagImpl(const std::string& tagName);
+
+    int TagOffset(const std::string& tagName) const;
+
+    // internal CIGAR handling
+    void SetCigarData(const Cigar& cigar);
+
+    // core seq/qual logic shared by the public API
+    BamRecordImpl& SetSequenceAndQualitiesInternal(const char* sequence,
+                                                   const size_t sequenceLength,
+                                                   const char* qualities, bool isPreencoded);
+
+private:
+    // data members
+    std::shared_ptr<bam1_t> d_;
+    mutable std::unordered_map<uint16_t, int> tagOffsets_;
+
+    // friends
+    friend class BamRecordMemory;
+
+    // remove this when we drop support for htslib pre-v1.7
+    friend class SamWriter;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // BAMRECORDIMPL_H
diff --git a/include/pbbam/BamRecordTag.h b/include/pbbam/BamRecordTag.h

new file mode 100644 (file)

index 0000000..35ce216
--- /dev/null
+++ b/include/pbbam/BamRecordTag.h
@@ -0,0 +1,64 @@
+// File Description
+/// \file BamRecordTag.h
+/// \brief Defines the BamRecordTag enum.
+//
+// Author: Derek Barnett
+
+#ifndef BAMRECORDTAG_H
+#define BAMRECORDTAG_H
+
+#include "pbbam/Config.h"
+
+namespace PacBio {
+namespace BAM {
+
+enum class BamRecordTag
+{
+    ALT_LABEL_QV,
+    ALT_LABEL_TAG,
+    BARCODE_QUALITY,
+    BARCODES,
+    CONTEXT_FLAGS,
+    DELETION_QV,
+    DELETION_TAG,
+    HOLE_NUMBER,
+    INSERTION_QV,
+    IPD,
+    LABEL_QV,
+    LONG_CIGAR,
+    MERGE_QV,
+    NUM_PASSES,
+    PKMEAN,
+    PKMEAN_2,
+    PKMID,
+    PKMID_2,
+    PRE_PULSE_FRAMES,
+    PULSE_CALL,
+    PULSE_CALL_WIDTH,
+    PULSE_EXCLUSION,
+    PULSE_MERGE_QV,
+    PULSE_WIDTH,
+    QUERY_END,
+    QUERY_END_FRAME_NUMBER,
+    QUERY_START,
+    QUERY_START_FRAME_NUMBER,
+    READ_ACCURACY,
+    READ_GROUP,
+    SCRAP_REGION_TYPE,
+    SCRAP_ZMW_TYPE,
+    SIGNAL_TO_NOISE,
+    START_FRAME,
+    SUBSTITUTION_QV,
+    SUBSTITUTION_TAG,
+
+    //
+    // QUAL & SEQ are not tags per se, but faking them here simplifies data fetching
+    //
+    QUAL,
+    SEQ
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // BAMRECORDTAG_H
diff --git a/include/pbbam/BamRecordView.h b/include/pbbam/BamRecordView.h

new file mode 100644 (file)

index 0000000..7fe56f9
--- /dev/null
+++ b/include/pbbam/BamRecordView.h
@@ -0,0 +1,132 @@
+// File Description
+/// \file BamRecordView.h
+/// \brief Defines the BamRecordView class.
+//
+// Author: Derek Barnett
+
+#ifndef BAMRECORDVIEW_H
+#define BAMRECORDVIEW_H
+
+#include "pbbam/Config.h"
+
+#include <cstdint>
+
+#include "pbbam/BamRecord.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief Provides a re-usable "view" onto a BamRecord
+///
+/// This class acts as a convenience wrapper for working with per-base BamRecord
+/// data. Most of these BamRecord methods take a list of parameters, to adjust
+/// how the underlying data are presented to client code. Often these parameters
+/// will be re-used for each BamRecord method call. Thus, to simplify such
+/// client code, a BamRecordView can be used to state those parameters once, and
+/// then simply request the desired fields.
+///
+/// \internal
+/// \todo Sync up method names with BamRecord
+/// \endinternal
+///
+class PBBAM_EXPORT BamRecordView
+{
+public:
+    /// \brief Constructs a view onto \p record using the supplied parameters.
+    ///
+    /// For frame or QV data, if \p aligned is true, a value of 0 (Accuracy or
+    /// QualityValue) will be used at each inserted or padded base location.
+    ///
+    /// \param[in] record           BamRecord data source.
+    /// \param[in] orientation      Orientation of output.
+    /// \param[in] aligned          if true, gaps/padding will be inserted, per
+    ///                             Cigar info.
+    /// \param[in] exciseSoftClips  if true, any soft-clipped positions will be
+    ///                             removed from query ends
+    /// \param[in] pulseBehavior    PulseBehavior setting for this view (default: PulseBehavior::ALL)
+    BamRecordView(const BamRecord& record, const Orientation orientation, const bool aligned,
+                  const bool exciseSoftClips,
+                  const PulseBehavior pulseBehavior = PulseBehavior::ALL);
+
+public:
+    /// \returns BamRecord::AltLabelQV with this view's parameters applied
+    QualityValues AltLabelQVs() const;
+
+    /// \returns BamRecord::AltLabelTag with this view's parameters applied
+    std::string AltLabelTags() const;
+
+    /// \returns BamRecord::DeletionQV with this view's parameters applied
+    QualityValues DeletionQVs() const;
+
+    /// \returns BamRecord::DeletionTag with this view's parameters applied
+    std::string DeletionTags() const;
+
+    /// \returns BamRecord::InsertionQV with this view's parameters applied
+    QualityValues InsertionQVs() const;
+
+    /// \returns BamRecord::IPD with this view's parameters applied
+    Frames IPD() const;
+
+    /// \returns BamRecord::LabelQV with this view's parameters applied
+    QualityValues LabelQVs() const;
+
+    /// \returns BamRecord::MergeQV with this view's parameters applied
+    QualityValues MergeQVs() const;
+
+    /// \returns BamRecord::PulseMergeQV with this view's parameters applied
+    QualityValues PulseMergeQVs() const;
+
+    /// \returns BamRecord::Pkmean with this view's parameters applied
+    std::vector<float> Pkmean() const;
+
+    /// \returns BamRecord::Pkmid with this view's parameters applied
+    std::vector<float> Pkmid() const;
+
+    /// \returns BamRecord::Pkmean2 with this view's parameters applied
+    std::vector<float> Pkmean2() const;
+
+    /// \returns BamRecord::Pkmid2 with this view's parameters applied
+    std::vector<float> Pkmid2() const;
+
+    /// \returns BamRecord::PreBaseFrames with this view's parameters applied
+    Frames PrebaseFrames() const;
+
+    /// \returns BamRecord::PrePulseFrames with this view's parameters applied
+    Frames PrePulseFrames() const;
+
+    /// \returns BamRecord::PulseCalls with this view's parameters applied
+    std::string PulseCalls() const;
+
+    /// \returns BamRecord::PulseCallWidth with this view's parameters applied
+    Frames PulseCallWidth() const;
+
+    /// \returns BamRecord::PulseWidths with this view's parameters applied
+    Frames PulseWidths() const;
+
+    /// \returns BamRecord::Qualities with this view's parameters applied
+    QualityValues Qualities() const;
+
+    /// \returns BamRecord::Sequence with this view's parameters applied
+    std::string Sequence() const;
+
+    /// \returns BamRecord::StartFrame with this view's parameters applied
+    std::vector<uint32_t> StartFrames() const;
+
+    /// \returns BamRecord::SubstitutionQV with this view's parameters applied
+    QualityValues SubstitutionQVs() const;
+
+    /// \returns BamRecord::SubstitutionTag with this view's parameters applied
+    std::string SubstitutionTags() const;
+
+private:
+    const BamRecord& record_;  // non-owning reference; record must outlive this view
+    Orientation orientation_;
+    bool aligned_;
+    bool exciseSoftClips_;
+    PulseBehavior pulseBehavior_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // BAMRECORDVIEW_H
diff --git a/include/pbbam/BamTagCodec.h b/include/pbbam/BamTagCodec.h

new file mode 100644 (file)

index 0000000..6847d95
--- /dev/null
+++ b/include/pbbam/BamTagCodec.h
@@ -0,0 +1,92 @@
+// File Description
+/// \file BamTagCodec.h
+/// \brief Defines the BamTagCodec class.
+//
+// Author: Derek Barnett
+
+#ifndef BAMTAGCODEC_H
+#define BAMTAGCODEC_H
+
+#include "pbbam/Config.h"
+
+#include <cstdint>
+#include <vector>
+
+#include "pbbam/TagCollection.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The BamTagCodec class provides binary encoding/decoding of %BAM tag
+///        data.
+///
+/// \note BamTagCodec is mostly an implementation and/or testing detail, and may
+///       be removed from the public API.
+///
+class PBBAM_EXPORT BamTagCodec
+{
+public:
+    /// \name Tag Collection Methods
+    /// \{
+
+    /// \brief Creates a TagCollection from raw BAM data.
+    ///
+    /// \param[in] data     BAM-formatted (binary) tag data
+    /// \returns TagCollection containing tag data
+    ///
+    static TagCollection Decode(const std::vector<uint8_t>& data);
+
+    /// \brief Creates binary BAM data from a TagCollection.
+    ///
+    /// \param[in] tags     TagCollection containing tag data
+    /// \returns vector of bytes (encoded BAM data)
+    ///
+    static std::vector<uint8_t> Encode(const PacBio::BAM::TagCollection& tags);
+
+    /// \}
+
+public:
+    /// \name Per-Tag Methods
+    /// \{
+
+    /// \brief Determines the SAM/BAM tag code for a Tag.
+    ///
+    /// \param[in] tag                  Tag object to check
+    /// \param[in] additionalModifier   optional extra modifier (allows explicit
+    ///                                 modification of an otherwise const Tag); default: NONE
+    ///
+    /// \returns the SAM/BAM single char code for tag type
+    ///
+    static uint8_t TagTypeCode(const PacBio::BAM::Tag& tag,
+                               const TagModifier& additionalModifier = TagModifier::NONE);
+
+    /// \brief Encodes a single Tag's contents in %BAM binary format.
+    ///
+    /// \note This method does \b NOT encode the tag name & tag type. It does
+    ///       include the element type for array-type tags.
+    ///
+    /// \param[in] tag                  Tag object containing data to encode
+    /// \param[in] additionalModifier   optional extra modifier (allows explicit
+    ///                                 modification of an otherwise const Tag); default: NONE
+    ///
+    /// \returns vector of bytes (encoded BAM data)
+    ///
+    static std::vector<uint8_t> ToRawData(
+        const PacBio::BAM::Tag& tag, const TagModifier& additionalModifier = TagModifier::NONE);
+
+    /// \brief Creates a Tag object from binary BAM data.
+    ///
+    /// \param[in] rawData      raw BAM bytes (assumed to be the result of
+    ///                         htslib's bam_aux_get())
+    ///
+    /// \returns resulting Tag object
+    ///
+    static PacBio::BAM::Tag FromRawData(uint8_t* rawData);
+
+    /// \}
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // BAMTAGCODEC_H
diff --git a/include/pbbam/BamWriter.h b/include/pbbam/BamWriter.h

new file mode 100644 (file)

index 0000000..60945d8
--- /dev/null
+++ b/include/pbbam/BamWriter.h
@@ -0,0 +1,220 @@
+// File Description
+/// \file BamWriter.h
+/// \brief Defines the BamWriter class.
+//
+// Author: Derek Barnett
+
+#ifndef BAMWRITER_H
+#define BAMWRITER_H
+
+#include "pbbam/Config.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <string>
+
+#include <htslib/sam.h>
+
+#include "pbbam/BamHeader.h"
+#include "pbbam/BamRecord.h"
+#include "pbbam/IRecordWriter.h"
+
+namespace PacBio {
+namespace BAM {
+
+class BamFile;
+
+/// \brief The BamWriter class provides a writing interface for creating
+///        new %BAM files.
+///
+/// \note The underlying buffered data may not be flushed to the file until the
+///       destructor is called. Trying to access the file (reading, stat-ing,
+///       indexing, etc.) before the BamWriter is destroyed yields undefined
+///       behavior. Enclose the BamWriter in some form of local scope (curly
+///       braces, a separate function, etc.) to ensure that its destructor is
+///       called before proceeding to read-based operations.
+///
+/// \code{.cpp}
+///  {
+///     BamWriter w(...);
+///     // write data
+///  }
+///  // now safe to access the new file
+/// \endcode
+///
+///
+class PBBAM_EXPORT BamWriter : public IRecordWriter
+{
+public:
+    /// \brief This enum allows you to control the compression level of the
+    ///        output %BAM file.
+    ///
+    /// Values are equivalent to zlib compression levels. See its documentation
+    /// for more details: http://www.zlib.net/manual.html
+    ///
+    enum CompressionLevel
+    {
+        CompressionLevel_0 = 0,
+        CompressionLevel_1 = 1,
+        CompressionLevel_2 = 2,
+        CompressionLevel_3 = 3,
+        CompressionLevel_4 = 4,
+        CompressionLevel_5 = 5,
+        CompressionLevel_6 = 6,
+        CompressionLevel_7 = 7,
+        CompressionLevel_8 = 8,
+        CompressionLevel_9 = 9,
+
+        DefaultCompression = -1,
+        NoCompression = CompressionLevel_0,
+        FastCompression = CompressionLevel_1,
+        BestCompression = CompressionLevel_9
+    };
+
+    /// \brief This enum allows you to control whether BAI bin numbers are
+    ///        calculated for output records.
+    ///
+    /// For most cases, the default behavior (ON) should be retained for maximum
+    /// compatibility with downstream tools (e.g. samtools index). Disabling bin
+    /// calculation should only be used if all records are known to never be
+    /// mapped, and even then only if profiling reveals the calculation to
+    /// affect extremely performance-sensitive, "critical paths".
+    ///
+    enum BinCalculationMode
+    {
+        BinCalculation_ON = 0,
+        BinCalculation_OFF
+    };
+
+    ///
+    /// \brief The Config struct provides a "parameter object" for BamWriter
+    ///        settings. This allows for writer configuration without having to
+    ///        refer to ordering of parameters, default values, etc.
+    ///
+    struct Config
+    {
+        // zlib compression level
+        CompressionLevel compressionLevel = DefaultCompression;
+
+        // The number of threads for compression. If set to 0, BamWriter will
+        // attempt to determine a reasonable estimate. If set to 1, this will
+        // force single-threaded execution. No checks are made against an upper limit.
+        size_t numThreads = 4;
+
+        // If ON, ensures that proper BAI bin numbers are provided for all records.
+        BamWriter::BinCalculationMode binCalculationMode = BamWriter::BinCalculation_ON;
+
+        // If true, write to <filename>.tmp, and rename to <filename> in dtor.
+        // This allows downstream checks to see if BAM file may be truncated
+        // due to early termination (e.g. a thrown exception). If false, write
+        // directly to <filename>.
+        bool useTempFile = true;
+    };
+
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Opens a %BAM file for writing & writes the header information.
+    ///
+    /// \note Set \p filename to "-" for stdout.
+    ///
+    /// \param[in] filename         path to output %BAM file
+    /// \param[in] header           BamHeader object
+    /// \param[in] compressionLevel zlib compression level
+    /// \param[in] numThreads       number of threads for compression. If set to
+    ///                             0, BamWriter will attempt to determine a
+    ///                             reasonable estimate. If set to 1, this will
+    ///                             force single-threaded execution. No checks
+    ///                             are made against an upper limit.
+    ///
+    /// \param[in] binCalculationMode BAI bin calculation mode. The default
+    ///            behavior will ensure proper bin numbers are provided for all
+    ///            records written. This extra step may be turned off when bin
+    ///            numbers are not needed. Though if in doubt, keep the default.
+    ///
+    /// \param[in] useTempFile      If true, write to <filename>.tmp, and rename
+    ///                             to <filename>. This provides for downstream
+    ///                             checks to see if BAM file may be truncated
+    ///                             due to early termination (a thrown exception).
+    ///
+    /// \throws std::runtime_error if there was a problem opening the file for
+    ///         writing or if an error occurred while writing the header
+    ///
+    BamWriter(const std::string& filename, const BamHeader& header,
+              const BamWriter::CompressionLevel compressionLevel = BamWriter::DefaultCompression,
+              const size_t numThreads = 4,
+              const BinCalculationMode binCalculationMode = BamWriter::BinCalculation_ON,
+              const bool useTempFile = true);
+
+    ///
+    /// \brief Opens a %BAM file for writing & writes the header information.
+    ///
+    /// \param[in] filename     path to output %BAM file
+    /// \param[in] header       BamHeader object
+    /// \param[in] config       container for additional configuration options
+    ///
+    /// \throws std::runtime_error if there was a problem opening the file for
+    ///         writing or if an error occurred while writing the header
+    ///
+    BamWriter(const std::string& filename, const BamHeader& header,
+              const BamWriter::Config& config);
+
+    BamWriter(BamWriter&&) noexcept;
+    BamWriter& operator=(BamWriter&&) noexcept;
+
+    /// Fully flushes all buffered data & closes file.
+    ~BamWriter() override;
+
+    /// \}
+
+public:
+    /// \name Data Writing & Resource Management
+    /// \{
+
+    /// \brief Try to flush any buffered data to file.
+    ///
+    /// \note The underlying implementation doesn't necessarily flush buffered
+    ///       data immediately, especially in a multithreaded writer situation.
+    ///       Let the BamWriter go out of scope to fully ensure flushing.
+    ///
+    /// \throws std::runtime_error if flush fails
+    ///
+    void TryFlush() override;
+
+    /// \brief Write a record to the output %BAM file.
+    ///
+    /// \param[in] record BamRecord object
+    ///
+    /// \throws std::runtime_error on failure to write
+    ///
+    void Write(const BamRecord& record) override;
+
+    /// \brief Write a record to the output %BAM file.
+    ///
+    /// \param[in] record BamRecord object
+    /// \param[out] vOffset BGZF virtual offset to start of \p record
+    ///
+    /// \throws std::runtime_error on failure to write
+    ///
+    void Write(const BamRecord& record, int64_t* vOffset);
+
+    /// \brief Write a record to the output %BAM file.
+    ///
+    /// \param[in] recordImpl BamRecordImpl object
+    ///
+    /// \throws std::runtime_error on failure to write
+    ///
+    void Write(const BamRecordImpl& recordImpl) override;
+
+    /// \}
+
+private:
+    class BamWriterPrivate;
+    std::unique_ptr<BamWriterPrivate> d_;  // NOTE(review): <memory> not included directly
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // BAMWRITER_H
diff --git a/include/pbbam/BarcodeQuery.h b/include/pbbam/BarcodeQuery.h

new file mode 100644 (file)

index 0000000..29f7511
--- /dev/null
+++ b/include/pbbam/BarcodeQuery.h
@@ -0,0 +1,64 @@
+// File Description
+/// \file BarcodeQuery.h
+/// \brief Defines the BarcodeQuery class.
+//
+// Author: Derek Barnett
+
+#ifndef BARCODEQUERY_H
+#define BARCODEQUERY_H
+
+#include "pbbam/Config.h"
+
+#include <cstdint>
+#include <vector>
+
+#include "pbbam/internal/QueryBase.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The BarcodeQuery class provides iterable access to a DataSet's %BAM
+///        records, limiting results to those matching a particular barcode.
+///
+/// Example:
+/// \include code/BarcodeQuery.txt
+///
+/// \note Currently, all %BAM files must have a corresponding ".pbi" index file.
+///       Use BamFile::EnsurePacBioIndexExists before creating the query if one
+///       may not be present.
+///
+class PBBAM_EXPORT BarcodeQuery : public internal::IQuery
+{
+public:
+    /// \brief Creates a new BarcodeQuery, limiting record results to only those
+    ///        annotated with a particular barcode ID.
+    ///
+    /// \param[in] barcode  barcode ID used as the filtering criterion
+    /// \param[in] dataset  input data source(s)
+    ///
+    /// \sa BamRecord::Barcodes
+    ///
+    /// \throws std::runtime_error on failure to open/read underlying %BAM or PBI
+    ///         files.
+    ///
+    BarcodeQuery(const int16_t barcode, const DataSet& dataset);
+
+    ~BarcodeQuery() override;
+
+public:
+    /// \brief Main iteration point for record access.
+    ///
+    /// Most client code should not need to use this method directly. Use
+    /// iterators instead.
+    ///
+    bool GetNext(BamRecord& r) override;
+
+private:
+    class BarcodeQueryPrivate;
+    std::unique_ptr<BarcodeQueryPrivate> d_;  // NOTE(review): <memory> not included directly
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // BARCODEQUERY_H
diff --git a/include/pbbam/BgzipFastaWriter.h b/include/pbbam/BgzipFastaWriter.h

new file mode 100644 (file)

index 0000000..48408cf
--- /dev/null
+++ b/include/pbbam/BgzipFastaWriter.h
@@ -0,0 +1,47 @@
+// File Description
+/// \file BgzipFastaWriter.h
+/// \brief Defines the BgzipFastaWriter class.
+//
+// Author: Derek Barnett
+
+#ifndef BGZIPFASTAWRITER_H
+#define BGZIPFASTAWRITER_H
+
+#include "pbbam/Config.h"
+
+#include <string>
+
+#include "pbbam/BgzipWriter.h"
+#include "pbbam/IFastaWriter.h"
+
+namespace PacBio {
+namespace BAM {
+
+class BamRecord;
+class BamRecordImpl;
+class FastaSequence;
+
+class BgzipFastaWriter final : public IFastaWriter
+{
+public:
+    explicit BgzipFastaWriter(const std::string& fn);
+    BgzipFastaWriter(const std::string& fn, const BgzipWriterConfig& config);
+
+public:
+    // IFastaWriter
+    void Write(const FastaSequence& fastq);
+    void Write(const std::string& name, const std::string& bases);
+
+    // IRecordWriter
+    void TryFlush();
+    void Write(const BamRecord& bam);
+    void Write(const BamRecordImpl& bam);
+
+private:
+    BgzipWriter writer_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // BGZIPFASTAWRITER_H
diff --git a/include/pbbam/BgzipFastqWriter.h b/include/pbbam/BgzipFastqWriter.h

new file mode 100644 (file)

index 0000000..53cfc94
--- /dev/null
+++ b/include/pbbam/BgzipFastqWriter.h
@@ -0,0 +1,50 @@
+// File Description
+/// \file BgzipFastqWriter.h
+/// \brief Defines the BgzipFastqWriter class.
+//
+// Author: Derek Barnett
+
+#ifndef BGZIPFASTQWRITER_H
+#define BGZIPFASTQWRITER_H
+
+#include "pbbam/Config.h"
+
+#include <fstream>
+#include <iostream>
+#include <string>
+
+#include "pbbam/BgzipWriter.h"
+#include "pbbam/IFastqWriter.h"
+
+namespace PacBio {
+namespace BAM {
+
+class BamRecord;
+class BamRecordImpl;
+class FastqSequence;
+
+class BgzipFastqWriter final : public IFastqWriter
+{
+public:
+    explicit BgzipFastqWriter(const std::string& fn);
+    BgzipFastqWriter(const std::string& fn, const BgzipWriterConfig& config);
+
+public:
+    // IFastqWriter
+    void Write(const FastqSequence& fastq);
+    void Write(const std::string& name, const std::string& bases, const QualityValues& quals);
+    void Write(const std::string& name, const std::string& bases, const std::string& quals);
+
+    // IRecordWriter
+    void TryFlush();
+    void Write(const BamRecord& bam);
+    void Write(const BamRecordImpl& bam);
+
+private:
+    BgzipWriter writer_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // BGZIPFASTQWRITER_H
diff --git a/include/pbbam/BgzipWriter.h b/include/pbbam/BgzipWriter.h

new file mode 100644 (file)

index 0000000..30c9213
--- /dev/null
+++ b/include/pbbam/BgzipWriter.h
@@ -0,0 +1,81 @@
+// File Description
+/// \file BgzipWriter.h
+/// \brief Defines the BgzipWriter class.
+//
+// Author: Derek Barnett
+
+#ifndef BGZIPWRITER_H
+#define BGZIPWRITER_H
+
+#include "pbbam/Config.h"
+
+#include <memory>
+#include <string>
+
+namespace PacBio {
+namespace BAM {
+
+struct BgzipWriterConfig
+{
+    /// Compression level, equivalent to zlib-defined levels.
+    const size_t CompressionLevel = 0;
+
+    /// Number of threads for compression. If set to 0, the writer will attempt
+    /// to determine a reasonable estimate. If set to 1, this will force
+    /// single-threaded execution. No checks are made against an upper limit.
+    const size_t NumThreads = 4;
+
+    /// If true, write to <filename>.tmp, and rename to <filename> on closing.
+    /// This provides for downstream checks to see if the file may be truncated
+    /// due to early termination (e.g. a thrown exception).
+    const bool UseTempFile = true;
+};
+
+/// \brief The BgzipWriter writes BGZF-compressed data to a file.
+///
+class PBBAM_EXPORT BgzipWriter
+{
+public:
+    ///
+    /// Create a BgzipWriter, using default configuration parameters.
+    ///
+    explicit BgzipWriter(std::string filename);
+
+    ///
+    /// Create a BgzipWriter, using configuration provided.
+    ///
+    BgzipWriter(std::string filename, const BgzipWriterConfig& config);
+
+    BgzipWriter(BgzipWriter&&) noexcept;
+    BgzipWriter& operator=(BgzipWriter&&) noexcept;
+    ~BgzipWriter();
+
+public:
+    ///
+    /// \brief Writes raw bytes to BGZF file.
+    ///
+    /// \param data         data buffer
+    /// \param numBytes     num bytes (data size * sizeof(T))
+    ///
+    /// \returns number of bytes written
+    ///
+    size_t Write(const void* data, size_t numBytes);
+
+    ///
+    /// \brief Writes string data to BGZF file.
+    ///
+    /// \param data         data string
+    ///
+    /// \returns number of bytes written
+    ///
+    size_t Write(const std::string& data);
+
+private:
+    class BgzipWriterPrivate;
+    std::unique_ptr<BgzipWriterPrivate> d_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // BGZIPWRITER_H
diff --git a/include/pbbam/Cigar.h b/include/pbbam/Cigar.h

new file mode 100644 (file)

index 0000000..f3529d8
--- /dev/null
+++ b/include/pbbam/Cigar.h
@@ -0,0 +1,29 @@
+// File Description
+/// \file Cigar.h
+/// \brief Defines the Cigar class.
+//
+// Author: Derek Barnett
+
+#ifndef CIGAR_H
+#define CIGAR_H
+
+#include "pbbam/Config.h"
+
+#include <string>
+#include <vector>
+
+#include <pbcopper/data/Cigar.h>
+
+#include "pbbam/CigarOperation.h"
+
+namespace PacBio {
+namespace BAM {
+
+using Cigar PBBAM_DEPRECATED = PacBio::Data::Cigar;
+
+PBBAM_DEPRECATED constexpr auto ReferenceLength = PacBio::Data::ReferenceLength;
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // CIGAR_H
diff --git a/include/pbbam/CigarOperation.h b/include/pbbam/CigarOperation.h

new file mode 100644 (file)

index 0000000..17f4709
--- /dev/null
+++ b/include/pbbam/CigarOperation.h
@@ -0,0 +1,29 @@
+// File Description
+/// \file CigarOperation.h
+/// \brief Defines the CigarOperationType enum & CigarOperation class.
+//
+// Author: Derek Barnett
+
+#ifndef CIGAROPERATION_H
+#define CIGAROPERATION_H
+
+#include "pbbam/Config.h"
+
+#include <cstdint>
+#include <stdexcept>
+
+#include <pbcopper/data/CigarOperation.h>
+
+namespace PacBio {
+namespace BAM {
+
+using CigarOperation PBBAM_DEPRECATED = PacBio::Data::CigarOperation;
+using CigarOperationType PBBAM_DEPRECATED = PacBio::Data::CigarOperationType;
+
+PBBAM_DEPRECATED constexpr auto ConsumesQuery = PacBio::Data::ConsumesQuery;
+PBBAM_DEPRECATED constexpr auto ConsumesReference = PacBio::Data::ConsumesReference;
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // CIGAROPERATION_H
diff --git a/include/pbbam/ClipType.h b/include/pbbam/ClipType.h

new file mode 100644 (file)

index 0000000..cc48176
--- /dev/null
+++ b/include/pbbam/ClipType.h
@@ -0,0 +1,32 @@
+// File Description
+/// \file ClipType.h
+/// \brief Defines the ClipType enum.
+//
+// Author: Derek Barnett
+
+#ifndef CLIPTYPE_H
+#define CLIPTYPE_H
+
+#include "pbbam/Config.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief This enum defines the modes supported by BamRecord clipping
+///        operations.
+///
+/// Methods like BamRecord::Clip accept Position parameters - which may be in
+/// either polymerase or reference coordinates. Using this enum as a flag
+/// indicates how the positions should be interpreted.
+///
+enum class ClipType
+{
+    CLIP_NONE,         ///< No clipping will be performed.
+    CLIP_TO_QUERY,     ///< Clipping positions are in polymerase coordinates.
+    CLIP_TO_REFERENCE  ///< Clipping positions are in genomic coordinates.
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // CLIPTYPE_H
diff --git a/include/pbbam/Compare.h b/include/pbbam/Compare.h

new file mode 100644 (file)

index 0000000..d87ffb2
--- /dev/null
+++ b/include/pbbam/Compare.h
@@ -0,0 +1,471 @@
+// File Description
+/// \file Compare.h
+/// \brief Defines the Compare class & a number of function objects for
+///       comparing BamRecords.
+//
+// Author: Derek Barnett
+
+#ifndef COMPARE_H
+#define COMPARE_H
+
+#include "pbbam/Config.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <functional>
+#include <string>
+#include <utility>
+
+#include "pbbam/BamRecord.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The Compare class provides utilities for sorting collections of
+///        BamRecords.
+///
+/// \note The functors provided here currently only support std::less<T>
+///       comparisons (i.e. sorting by ascending value).
+///
+/// \include code/Compare.txt
+///
+struct PBBAM_EXPORT Compare
+{
+public:
+    /// \name Comparison Type
+    /// \{
+
+    /// \brief This enum defines the supported comparison types
+    ///        { ==, !=, <, <=, >, >=, & (contains), ~ (not contains) }.
+    ///
+    enum Type
+    {
+        EQUAL = 0,
+        NOT_EQUAL,
+        LESS_THAN,
+        LESS_THAN_EQUAL,
+        GREATER_THAN,
+        GREATER_THAN_EQUAL,
+        CONTAINS,
+        NOT_CONTAINS
+    };
+
+    /// \brief Convert operator string to Compare::Type.
+    ///
+    /// \include code/Compare_TypeFromOperator.txt
+    ///
+    /// \param[in] opString operator string. Can be C++-style operators
+    ///                     ("==", "!=", "<=", etc) or alpha equivalents
+    ///                     ("eq", "ne", "lte", etc).
+    ///
+    /// \returns comparison type from an operator string
+    /// \throws std::runtime_error if cannot convert opString to Compare::Type
+    /// \sa Compare::TypeToOperator
+    ///
+    static Compare::Type TypeFromOperator(const std::string& opString);
+
+    /// \brief Convert a Compare::Type to printable enum name.
+    ///
+    /// \include code/Compare_TypeToName.txt
+    ///
+    /// \param[in] type Compare::Type to convert
+    /// \returns the printable name for a Compare::Type enum value
+    /// \throws std::runtime_error on unknown Compare::Type
+    ///
+    static std::string TypeToName(const Compare::Type& type);
+
+    /// \brief Convert a Compare::Type to printable operator.
+    ///
+    /// \param[in] type     Compare::Type to convert
+    /// \param[in] asAlpha  (optional) flag to print using alpha equivalents
+    ///                     e.g. "lte" rather than "<="
+    /// \returns the printable operator string
+    /// \throws std::runtime_error on unknown Compare::Type
+    ///
+    static std::string TypeToOperator(const Compare::Type& type, bool asAlpha = false);
+
+    /// \}
+
+public:
+    /// \name Comparison Function Objects
+    /// \{
+
+    /// %Base class for all BamRecord compare functors.
+    ///
+    /// Mostly used for method signatures that can accept any comparator.
+    ///
+    /// Custom comparators may be used by inheriting from this class.
+    ///
+    struct Base : public std::function<bool(const BamRecord&, const BamRecord&)>
+    {
+    };
+
+private:
+    /// \internal
+    ///
+    /// Exists to provide the typedef we'll use in the actual
+    /// MemberFunctionBase, since we need to use it in the template signature.
+    /// This keeps that a lot easier to read.
+    ///
+    template <typename ValueType>
+    struct MemberFunctionBaseHelper : public Compare::Base
+    {
+        using MemberFnType = ValueType (BamRecord::*)() const;
+    };
+
+public:
+    /// \brief %Base class for all BamRecord compare functors that take a
+    ///        BamRecord function pointer and compare on its return type.
+    ///
+    /// Derived comparators usually need only declare the return value &
+    /// function pointer in the template signature. This class implements the
+    /// basic method-calling machinery.
+    ///
+    /// Custom comparators will work for any BamRecord member function that does
+    /// not take any input parameters.
+    ///
+    template <typename ValueType, typename MemberFunctionBaseHelper<ValueType>::MemberFnType fn,
+              typename CompareType = std::less<ValueType> >
+    struct MemberFunctionBase : public Compare::MemberFunctionBaseHelper<ValueType>
+    {
+        bool operator()(const BamRecord& lhs, const BamRecord& rhs) const;
+    };
+
+public:
+    /// \brief Compares on BamRecord::AlignedEnd.
+    ///
+    /// Example:
+    /// \include code/Compare_AlignedEnd.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct AlignedEnd : public MemberFunctionBase<Position, &BamRecord::AlignedEnd>
+    {
+    };
+
+    /// \brief Compares on BamRecord::AlignedStart.
+    ///
+    /// Example:
+    /// \include code/Compare_AlignedStart.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct AlignedStart : public MemberFunctionBase<Position, &BamRecord::AlignedStart>
+    {
+    };
+
+    /// \brief Compares on BamRecord::AlignedStrand
+    ///
+    /// Example:
+    /// \include code/Compare_AlignedStrand.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct AlignedStrand : public MemberFunctionBase<Strand, &BamRecord::AlignedStrand>
+    {
+    };
+
+    /// \brief Compares on BamRecord::BarcodeForward.
+    ///
+    /// Example:
+    /// \include code/Compare_BarcodeForward.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct BarcodeForward : public MemberFunctionBase<int16_t, &BamRecord::BarcodeForward>
+    {
+    };
+
+    /// \brief Compares on BamRecord::BarcodeQuality.
+    ///
+    /// Example:
+    /// \include code/Compare_BarcodeQuality.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct BarcodeQuality : public MemberFunctionBase<uint8_t, &BamRecord::BarcodeQuality>
+    {
+    };
+
+    /// \brief Compares on BamRecord::BarcodeReverse.
+    ///
+    /// Example:
+    /// \include code/Compare_BarcodeReverse.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct BarcodeReverse : public MemberFunctionBase<int16_t, &BamRecord::BarcodeReverse>
+    {
+    };
+
+    /// \brief Compares on BamRecord::FullName.
+    ///
+    /// Example:
+    /// \include code/Compare_FullName.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct FullName : public MemberFunctionBase<std::string, &BamRecord::FullName>
+    {
+    };
+
+    /// \brief Compares on BamRecord::LocalContextFlags.
+    ///
+    /// Example:
+    /// \include code/Compare_LocalContextFlag.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct LocalContextFlag
+        : public MemberFunctionBase<LocalContextFlags, &BamRecord::LocalContextFlags>
+    {
+    };
+
+    /// \brief Compares on BamRecord::MapQuality.
+    ///
+    /// Example:
+    /// \include code/Compare_MapQuality.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct MapQuality : public MemberFunctionBase<uint8_t, &BamRecord::MapQuality>
+    {
+    };
+
+    /// \brief Compares on BamRecord::MovieName.
+    ///
+    /// Example:
+    /// \include code/Compare_MovieName.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct MovieName : public MemberFunctionBase<std::string, &BamRecord::MovieName>
+    {
+    };
+
+    /// \brief Provides an operator() that is essentially a no-op for
+    ///        comparing/sorting.
+    ///
+    /// If used in a sorting operation, then no change will occur.
+    ///
+    struct None : public Compare::Base
+    {
+        bool operator()(const BamRecord&, const BamRecord&) const;
+    };
+
+    /// \brief Compares on BamRecord::NumDeletedBases.
+    ///
+    /// Example:
+    /// \include code/Compare_NumDeletedBases.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct NumDeletedBases : public MemberFunctionBase<size_t, &BamRecord::NumDeletedBases>
+    {
+    };
+
+    /// \brief Compares on BamRecord::NumInsertedBases.
+    ///
+    /// Example:
+    /// \include code/Compare_NumInsertedBases.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct NumInsertedBases : public MemberFunctionBase<size_t, &BamRecord::NumInsertedBases>
+    {
+    };
+
+    /// \brief Compares on BamRecord::NumMatches.
+    ///
+    /// Example:
+    /// \include code/Compare_NumMatches.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct NumMatches : public MemberFunctionBase<size_t, &BamRecord::NumMatches>
+    {
+    };
+
+    /// \brief Compares on BamRecord::NumMismatches.
+    ///
+    /// Example:
+    /// \include code/Compare_NumMismatches.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct NumMismatches : public MemberFunctionBase<size_t, &BamRecord::NumMismatches>
+    {
+    };
+
+    /// \brief Compares on BamRecord::QueryEnd.
+    ///
+    /// Example:
+    /// \include code/Compare_QueryEnd.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct QueryEnd : public MemberFunctionBase<Position, &BamRecord::QueryEnd>
+    {
+    };
+
+    /// \brief Compares on BamRecord::QueryStart.
+    ///
+    /// Example:
+    /// \include code/Compare_QueryStart.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct QueryStart : public MemberFunctionBase<Position, &BamRecord::QueryStart>
+    {
+    };
+
+    /// \brief Compares on BamRecord::ReadAccuracy.
+    ///
+    /// Example:
+    /// \include code/Compare_ReadAccuracy.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct ReadAccuracy : public MemberFunctionBase<Accuracy, &BamRecord::ReadAccuracy>
+    {
+    };
+
+    /// \brief Compares on BamRecord::ReadGroupId.
+    ///
+    /// \note Even though the ReadGroupId string contains hex values, it is
+    ///       still just a std::string. Comparisons will use lexical, not
+    ///       numeric ordering. If numeric ordering is desired, use
+    ///       Compare::ReadGroupNumericId instead.
+    ///
+    /// Example:
+    /// \include code/Compare_ReadGroupId.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct ReadGroupId : public MemberFunctionBase<std::string, &BamRecord::ReadGroupId>
+    {
+    };
+
+    /// \brief Compares on BamRecord::ReadGroupNumericId.
+    ///
+    /// Example:
+    /// \include code/Compare_ReadGroupNumericId.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct ReadGroupNumericId : public MemberFunctionBase<int32_t, &BamRecord::ReadGroupNumericId>
+    {
+    };
+
+    /// \brief Compares on BamRecord::ReferenceEnd.
+    ///
+    /// Example:
+    /// \include code/Compare_ReferenceEnd.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct ReferenceEnd : public MemberFunctionBase<Position, &BamRecord::ReferenceEnd>
+    {
+    };
+
+    /// \brief Compares on BamRecord::ReferenceId.
+    ///
+    /// Example:
+    /// \include code/Compare_ReferenceId.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct ReferenceId : public MemberFunctionBase<int32_t, &BamRecord::ReferenceId>
+    {
+    };
+
+    /// \brief Compares on BamRecord::ReferenceName.
+    ///
+    /// Example:
+    /// \include code/Compare_ReferenceName.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct ReferenceName : public MemberFunctionBase<std::string, &BamRecord::ReferenceName>
+    {
+    };
+
+    /// \brief Compares on BamRecord::ReferenceStart.
+    ///
+    /// Example:
+    /// \include code/Compare_ReferenceStart.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct ReferenceStart : public MemberFunctionBase<Position, &BamRecord::ReferenceStart>
+    {
+    };
+
+    /// \brief Compares on BamRecord::HoleNumber.
+    ///
+    /// Example:
+    /// \include code/Compare_Zmw.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct Zmw : public MemberFunctionBase<int32_t, &BamRecord::HoleNumber>
+    {
+    };
+
+    /// \}
+
+    template <typename T>
+    static inline bool Check(const T& lhs, const T& rhs, const Compare::Type cmp)
+    {
+        switch (cmp) {
+            case Compare::EQUAL:
+            case Compare::CONTAINS:
+                return lhs == rhs;
+            case Compare::LESS_THAN:
+                return lhs < rhs;
+            case Compare::LESS_THAN_EQUAL:
+                return lhs <= rhs;
+            case Compare::GREATER_THAN:
+                return lhs > rhs;
+            case Compare::GREATER_THAN_EQUAL:
+                return lhs >= rhs;
+            case Compare::NOT_EQUAL:
+            case Compare::NOT_CONTAINS:
+                return lhs != rhs;
+            default:
+                assert(false);
+                throw std::runtime_error{"unsupported compare type requested"};
+        }
+    }
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#include "pbbam/internal/Compare.inl"
+
+#endif  // COMPARE_H
diff --git a/include/pbbam/CompositeBamReader.h b/include/pbbam/CompositeBamReader.h

new file mode 100644 (file)

index 0000000..52bc913
--- /dev/null
+++ b/include/pbbam/CompositeBamReader.h
@@ -0,0 +1,286 @@
+// File Description
+/// \file CompositeBamReader.h
+/// \brief Defines the composite BAM readers, for working with multiple input
+///       files.
+//
+// Author: Derek Barnett
+
+#ifndef COMPOSITEBAMREADER_H
+#define COMPOSITEBAMREADER_H
+
+#include "pbbam/Config.h"
+
+#include <deque>
+#include <functional>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "pbbam/BaiIndexCache.h"
+#include "pbbam/BaiIndexedBamReader.h"
+#include "pbbam/BamFile.h"
+#include "pbbam/BamHeader.h"
+#include "pbbam/BamReader.h"
+#include "pbbam/BamRecord.h"
+#include "pbbam/DataSet.h"
+#include "pbbam/GenomicInterval.h"
+#include "pbbam/PbiIndexedBamReader.h"
+
+namespace PacBio {
+namespace BAM {
+
+namespace internal {
+
+/// \internal
+/// \brief The CompositeMergeItem class provides a helper struct for composite
+///        readers, containing a single-file reader and its "next" record.
+///
+struct CompositeMergeItem
+{
+public:
+    std::unique_ptr<BamReader> reader;
+    BamRecord record;
+
+public:
+    CompositeMergeItem(std::unique_ptr<BamReader> rdr);
+    CompositeMergeItem(std::unique_ptr<BamReader> rdr, BamRecord rec);
+};
+
+/// \internal
+/// \brief The CompositeMergeItemSorter class provides a helper function object
+///        for ordering composite reader results.
+///
+/// Essentially just extracts a BamRecord from its parent CompositeMergeItem for
+/// further checks.
+///
+template <typename CompareType>
+struct CompositeMergeItemSorter
+    : public std::function<bool(const CompositeMergeItem&, const CompositeMergeItem&)>
+{
+    bool operator()(const CompositeMergeItem& lhs, const CompositeMergeItem& rhs) const;
+};
+
+}  // namespace internal
+
+/// \brief The GenomicIntervalCompositeBamReader class provides read access to
+///        multiple %BAM files, limiting results to a genomic region.
+///
+/// Requires a ".bai" file for each input %BAM file.
+///
+/// Results will be returned in order of genomic coordinate (first by reference
+/// ID, then by position).
+///
+class PBBAM_EXPORT GenomicIntervalCompositeBamReader
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Constructs composite %BAM reader, that can be queried on genomic interval.
+    ///
+    /// \note Using this constructor means that an interval must be provided, via
+    ///       reader.Interval(i), before iterating.
+    ///
+    /// \param[in] bamFiles   input BamFile objects
+    ///
+    /// \throws std::runtime_error if either file (*.bam or *.bai) fails to open
+    ///         for reading, or if the interval is invalid
+    ///
+    GenomicIntervalCompositeBamReader(const std::vector<BamFile>& bamFiles);
+    GenomicIntervalCompositeBamReader(const std::vector<BamFile>& bamFiles,
+                                      const BaiIndexCache& cache);
+
+    /// \brief Constructs composite %BAM reader, that can be queried on genomic interval.
+    ///
+    /// \note Using this constructor means that an interval must be provided, via
+    ///       reader.Interval(i), before iterating.
+    ///
+    /// \param[in] dataset      input DataSet
+    ///
+    /// \throws std::runtime_error if either file (*.bam or *.bai) fails to open
+    ///         for reading, or if the interval is invalid
+    ///
+    GenomicIntervalCompositeBamReader(const DataSet& dataset);
+    GenomicIntervalCompositeBamReader(const DataSet& dataset, const BaiIndexCache& cache);
+
+    /// \brief Constructs composite %BAM reader, limiting record results to
+    ///        only those overlapping a GenomicInterval.
+    ///
+    /// \param[in] interval   genomic interval of interest
+    /// \param[in] bamFiles   input BamFile objects
+    ///
+    /// \throws std::runtime_error on failure to open/read underlying %BAM or
+    ///         BAI files.
+    ///
+    GenomicIntervalCompositeBamReader(const GenomicInterval& interval,
+                                      const std::vector<BamFile>& bamFiles);
+    GenomicIntervalCompositeBamReader(const GenomicInterval& interval,
+                                      const std::vector<BamFile>& bamFiles,
+                                      const BaiIndexCache& cache);
+
+    /// \brief Constructs composite %BAM reader, limiting record results to
+    ///        only those overlapping a GenomicInterval.
+    ///
+    /// \param[in] interval genomic interval of interest
+    /// \param[in] dataset  input data source(s)
+    ///
+    /// \throws std::runtime_error on failure to open/read underlying %BAM or
+    ///         BAI files.
+    ///
+    GenomicIntervalCompositeBamReader(const GenomicInterval& interval, const DataSet& dataset);
+    GenomicIntervalCompositeBamReader(const GenomicInterval& interval, const DataSet& dataset,
+                                      const BaiIndexCache& cache);
+
+    /// \}
+
+public:
+    /// \name Data Access
+    /// \{
+
+    /// Fetches next BAM record in the interval specified, storing in \p record
+    ///
+    /// \param[out] record
+    /// \returns true on success, false if no more data available.
+    ///
+    bool GetNext(BamRecord& record);
+
+    /// Sets a new genomic interval of interest.
+    ///
+    /// \returns reference to this reader
+    ///
+    GenomicIntervalCompositeBamReader& Interval(const GenomicInterval& interval);
+
+    /// \returns the current specified interval
+    ///
+    const GenomicInterval& Interval() const;
+
+    /// \}
+
+private:
+    void UpdateSort();
+
+private:
+    BaiIndexCache indexCache_;
+    GenomicInterval interval_;
+    std::deque<internal::CompositeMergeItem> mergeItems_;
+    std::vector<std::string> filenames_;
+};
+
+/// \brief Provides read access to multiple %BAM files, limiting results to those
+///        passing a PbiFilter.
+///
+/// Requires a ".pbi" file for each input %BAM file.
+///
+/// \note The template parameter OrderByType is not fully implemented at this
+///       time. Use of comparison functor (e.g. Compare::Zmw) for this will
+///       currently result in the proper "next" value <b> at each iteration
+///       step, independently, but not over the full data set. </b> If all
+///       files' "order-by" data values are accessible in increasing order
+///       within each file, then the expected ordering will be observed.
+///       However, if these data are not sorted within a file, the final results
+///       will appear unordered. \n
+///       \n
+///           Example:\n
+///           file 1: { 1, 5, 2, 6 } \n
+///           file 2: { 3, 8, 4, 7 } \n
+///           results: { 1, 3, 5, 2, 6, 8, 4, 7 } \n
+///       \n
+///       This is a known issue and will be addressed in a future update. But in
+///       the meantime, use of Compare::None as the OrderByType is recommended,
+///       to explicitly indicate that no particular ordering is expected.
+///
+template <typename OrderByType>
+class PBBAM_EXPORT PbiFilterCompositeBamReader
+{
+public:
+    using value_type = internal::CompositeMergeItem;
+    using merge_sorter_type = internal::CompositeMergeItemSorter<OrderByType>;
+    using container_type = std::deque<value_type>;
+    using iterator = typename container_type::iterator;
+    using const_iterator = typename container_type::const_iterator;
+
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    PbiFilterCompositeBamReader(const PbiFilter& filter, const std::vector<BamFile>& bamFiles);
+    PbiFilterCompositeBamReader(const PbiFilter& filter, const std::vector<BamFile>& bamFiles,
+                                const PbiIndexCache& cache);
+
+    PbiFilterCompositeBamReader(const PbiFilter& filter, const DataSet& dataset);
+    PbiFilterCompositeBamReader(const PbiFilter& filter, const DataSet& dataset,
+                                const PbiIndexCache& cache);
+
+    /// \}
+
+public:
+    /// \name Data Access
+    /// \{
+
+    /// Fetches next BAM record passing the current PBI filter.
+    ///
+    /// \returns true on success, false if no more data available.
+    ///
+    bool GetNext(BamRecord& record);
+
+    /// Sets a new PBI filter
+    ///
+    /// \returns reference to this reader
+    ///
+    PbiFilterCompositeBamReader& Filter(const PbiFilter& filter);
+
+    uint32_t NumReads() const;
+
+    /// \}
+
+private:
+    void UpdateSort();
+
+private:
+    PbiIndexCache indexCache_;
+    container_type mergeQueue_;
+    std::vector<std::string> filenames_;
+    uint32_t numReads_;
+};
+
+/// \brief The SequentialCompositeBamReader class provides read access to
+///        multiple %BAM files, reading through the entire contents of each
+///        file.
+///
+/// Input files will be accessed in the order provided to the constructor. Each
+/// file's contents will be exhausted before moving on to the next one (as
+/// opposed to a "round-robin" scheme).
+///
+class PBBAM_EXPORT SequentialCompositeBamReader
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    SequentialCompositeBamReader(std::vector<BamFile> bamFiles);
+    SequentialCompositeBamReader(const DataSet& dataset);
+
+    /// \}
+
+public:
+    /// \name Data Access
+    /// \{
+
+    /// Fetches next BAM record from the input files.
+    ///
+    /// \returns true on success, false if no more data available.
+    ///
+    bool GetNext(BamRecord& record);
+
+    /// \}
+
+private:
+    std::deque<std::unique_ptr<BamReader> > readers_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#include "pbbam/internal/CompositeBamReader.inl"
+
+#endif  // COMPOSITEBAMREADER_H
diff --git a/include/pbbam/CompositeFastaReader.h b/include/pbbam/CompositeFastaReader.h

new file mode 100644 (file)

index 0000000..cd50f9f
--- /dev/null
+++ b/include/pbbam/CompositeFastaReader.h
@@ -0,0 +1,63 @@
+// File Description
+/// \file CompositeFastaReader.h
+/// \brief Defines the composite FASTA reader, for working with multiple input
+///       files.
+//
+// Author: Derek Barnett
+
+#ifndef COMPOSITEFASTAREADER_H
+#define COMPOSITEFASTAREADER_H
+
+#include "pbbam/Config.h"
+
+#include <deque>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "pbbam/DataSet.h"
+#include "pbbam/FastaReader.h"
+#include "pbbam/FastaSequence.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The CompositeFastaReader class provides read access to
+///        multiple FASTA files, reading through the entire contents of each
+///        file.
+///
+/// Input files will be accessed in the order provided to the constructor. Each
+/// file's contents will be exhausted before moving on to the next one (as
+/// opposed to a "round-robin" scheme).
+///
+class PBBAM_EXPORT CompositeFastaReader
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    CompositeFastaReader(const std::vector<std::string>& fastaFiles);
+    CompositeFastaReader(const DataSet& dataset);
+
+    /// \}
+
+public:
+    /// \name Data Access
+    /// \{
+
+    /// Fetches next FASTA sequence.
+    ///
+    /// \returns true on success, false if no more data available.
+    ///
+    bool GetNext(FastaSequence& seq);
+
+    /// \}
+
+private:
+    std::deque<std::unique_ptr<FastaReader> > readers_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // COMPOSITEFASTAREADER_H
diff --git a/include/pbbam/Config.h b/include/pbbam/Config.h

new file mode 100644 (file)

index 0000000..a314d3e
--- /dev/null
+++ b/include/pbbam/Config.h
@@ -0,0 +1,67 @@
+// File Description
+/// \file Config.h
+/// \brief Defines library-wide macros & global variables.
+//
+// Author: Derek Barnett
+
+#ifndef PBBAM_CONFIG_H
+#define PBBAM_CONFIG_H
+
+/// \name Library Import/Export
+/// \{
+
+#ifndef PBBAM_EXPORT
+#if defined(WIN32)
+#define PBBAM_EXPORT __declspec(dllimport)
+#else
+#define PBBAM_EXPORT
+#endif
+#endif
+
+/// \}
+
+/// \name Switch for warnings for the pbbam -> pbcopper Data:: move
+/// \{
+
+#ifdef PACBIO_NODEPRECATED_API
+#define PBBAM_DEPRECATED [[deprecated("Use the version from pbcopper in Data::")]]
+#else
+#define PBBAM_DEPRECATED
+#endif
+
+/// \}
+
+namespace PacBio {
+namespace BAM {
+
+/// \name Verbosity Settings
+/// \{
+
+/// \brief Sets the desired verbosity level of htslib warnings.
+///
+/// Change this value to allow debug/warning statements from htslib itself.
+/// The valid range seems to be [0-3], where 0 indicates OFF, and 3 is the
+/// most verbose.
+///
+/// By default, pbbam disables htslib statements to keep output channels clean.
+/// We rely on exceptions & their associated messages instead.
+///
+/// This global variable is obviously not thread-safe by any means. But as a
+/// debug flag, it is unlikely to cause any real issues. The worst case would be
+/// unexpected presence/absence of output statements.
+///
+extern int HtslibVerbosity;
+
+///
+/// \brief DoesHtslibSupportLongCigar
+///
+/// \return true if runtime htslib is >= v1.7
+///
+bool DoesHtslibSupportLongCigar();
+
+/// \}
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // PBBAM_CONFIG_H
diff --git a/include/pbbam/DataSet.h b/include/pbbam/DataSet.h

new file mode 100644 (file)

index 0000000..886cd02
--- /dev/null
+++ b/include/pbbam/DataSet.h
@@ -0,0 +1,835 @@
+// File Description
+/// \file DataSet.h
+/// \brief Defines the DataSet class.
+//
+// Author: Derek Barnett
+
+#ifndef DATASET_H
+#define DATASET_H
+
+#include "pbbam/Config.h"
+
+#include <chrono>
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "pbbam/BamFile.h"
+#include "pbbam/DataSetTypes.h"
+#include "pbbam/GenomicInterval.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The DataSet class represents a %PacBio analysis dataset (e.g. from
+///        XML).
+///
+/// \nosubgrouping
+///
+/// It provides resource paths, filters, and metadata associated with a dataset
+/// under analysis.
+///
+class PBBAM_EXPORT DataSet
+{
+public:
+    /// \name DataSet Type
+    /// \{
+
+    /// \brief This enum defines the currently-supported DataSet types.
+    ///
+    enum TypeEnum
+    {
+        GENERIC = 0,
+        ALIGNMENT,
+        BARCODE,
+        CONSENSUS_ALIGNMENT,
+        CONSENSUS_READ,
+        CONTIG,
+        HDF_SUBREAD,
+        REFERENCE,
+        SUBREAD,
+        TRANSCRIPT,
+        TRANSCRIPT_ALIGNMENT
+    };
+
+    /// \brief Converts printable dataset type to type enum.
+    ///
+    /// \param[in] typeName printable dataset type
+    /// \returns dataset type enum
+    /// \throws std::runtime_error if \p typeName is unknown
+    ///
+    static DataSet::TypeEnum NameToType(const std::string& typeName);
+
+    /// \brief Converts dataset type enum to printable name.
+    ///
+    /// \param[in] type dataset type enum
+    /// \returns printable dataset type
+    /// \throws std::runtime_error if \p type is unknown
+    ///
+    static std::string TypeToName(const DataSet::TypeEnum& type);
+
+    /// \}
+
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Constructs an empty, generic DataSet.
+    ///
+    DataSet();
+
+    /// \brief Constructs an empty DataSet of the type specified.
+    ///
+    /// \param[in] type dataset type
+    /// \throws std::runtime_error if \p type is unknown
+    ///
+    DataSet(const DataSet::TypeEnum type);
+
+    /// \brief Constructs a DataSet from a %BAM file.
+    ///
+    /// This currently defaults to a SubreadSet, with an ExternalResource
+    /// pointing to BamFile::Filename.
+    ///
+    /// \param[in] bamFile  BamFile object
+    ///
+    DataSet(const BamFile& bamFile);
+
+    /// \brief Loads a DataSet from a file.
+    ///
+    /// \p filename may be one of the following types, indicated by its extension:\n
+    ///  - %BAM ("*.bam") \n
+    ///  - FOFN ("*.fofn") \n
+    ///  - FASTA ("*.fa" or "*.fasta") \n
+    ///  - DataSetXML ("*.xml") \n
+    ///
+    /// \param[in] filename  input filename
+    /// \throws std::runtime_error if \p filename has an unsupported extension,
+    ///         or if a valid DataSet could not be created from its contents
+    ///
+    DataSet(const std::string& filename);
+
+    /// \brief Constructs a DataSet from a list of files.
+    ///
+    /// \param[in] filenames  input filenames
+    /// \throws std::runtime_error if DataSet could not be created from
+    ///         \p filenames
+    ///
+    DataSet(const std::vector<std::string>& filenames);
+
+    DataSet(const DataSet&);
+    DataSet(DataSet&&) noexcept = default;
+    DataSet& operator=(const DataSet&);
+    DataSet& operator=(DataSet&&) noexcept = default;
+
+    /// \brief Creates a DataSet from "raw" XML data.
+    ///
+    /// \param[in] xml DataSetXML text
+    ///
+    static DataSet FromXml(const std::string& xml);
+
+    /// \}
+
+public:
+    /// \name Operators
+    /// \{
+
+    /// \brief Merges DataSet contents.
+    ///
+    /// Adds contents of \p other to this dataset object
+    ///
+    /// \param[in] other  some other dataset to add to this one
+    /// \returns reference to this dataset object
+    ///
+    DataSet& operator+=(const DataSet& other);
+
+    /// \}
+
+public:
+    /// \name Serialization
+    /// \{
+
+    /// \brief Saves dataset XML to file.
+    ///
+    /// \param[in] outputFilename destination for XML contents
+    ///
+    /// \throws std::runtime_error if file could not be opened or if DataSet
+    ///         elements could not be converted to XML
+    ///
+    void Save(const std::string& outputFilename) const;
+
+    /// \brief Saves dataset XML to output stream, e.g. std::cout,
+    ///        std::stringstream.
+    ///
+    /// \param[out] out destination for XML contents
+    ///
+    /// \throws std::runtime_error if DataSet elements could not be converted to
+    ///         XML
+    ///
+    void SaveToStream(std::ostream& out) const;
+
+    /// \}
+
+public:
+    /// \name Attributes
+    /// \{
+    ///
+
+    /// \brief Fetches the value of a DataSet root element's attribute.
+    ///
+    /// These are the attributes attached to the root dataset element: \n
+    /// \verbatim <SubreadSet foo="x" bar="y" /> \endverbatim
+    ///
+    /// Built-in accessors exist for the standard attributes (e.g. CreatedAt)
+    /// but additional attributes can be used as well via these generic
+    /// Attribute methods.
+    ///
+    /// \param[in] name root element's attribute name
+    /// \returns const reference to attribute's value (empty string if not
+    ///          present)
+    ///
+    const std::string& Attribute(const std::string& name) const;
+
+    /// \brief Fetches the value of dataset's CreatedAt attribute.
+    ///
+    /// \returns const reference to attribute's value (empty string if not
+    ///          present)
+    ///
+    const std::string& CreatedAt() const;
+
+    /// \brief Fetches the value of dataset's Format attribute.
+    ///
+    /// \returns const reference to attribute's value (empty string if not
+    ///          present)
+    ///
+    const std::string& Format() const;
+
+    /// \brief Fetches the value of dataset's MetaType attribute.
+    ///
+    /// \returns const reference to attribute's value (empty string if not
+    ///          present)
+    ///
+    const std::string& MetaType() const;
+
+    /// \brief Fetches the value of dataset's ModifiedAt attribute.
+    ///
+    /// \returns const reference to attribute's value (empty string if not
+    ///          present)
+    ///
+    const std::string& ModifiedAt() const;
+
+    /// \brief Fetches the value of dataset's Name attribute.
+    ///
+    /// \returns const reference to attribute's value (empty string if not
+    ///          present)
+    ///
+    const std::string& Name() const;
+
+    /// \brief Fetches the value of dataset's ResourceId attribute.
+    ///
+    /// \returns const reference to attribute's value (empty string if not
+    ///          present)
+    ///
+    const std::string& ResourceId() const;
+
+    /// \brief Fetches the value of dataset's Tags attribute.
+    ///
+    /// \returns const reference to attribute's value (empty string if not
+    ///          present)
+    ///
+    const std::string& Tags() const;
+
+    /// \brief Fetches the value of dataset's TimeStampedName attribute.
+    ///
+    /// \returns const reference to attribute's value (empty string if not
+    ///          present)
+    ///
+    const std::string& TimeStampedName() const;
+
+    /// \brief Fetches the value of dataset's UniqueId attribute.
+    ///
+    /// \returns const reference to attribute's value (empty string if not
+    ///          present)
+    ///
+    const std::string& UniqueId() const;
+
+    /// \brief Fetches the value of dataset's Version attribute.
+    ///
+    /// \returns const reference to attribute's value (empty string if not
+    ///          present)
+    ///
+    const std::string& Version() const;
+
+    /// \}
+
+public:
+    /// \name DataSet Type
+    /// \{
+
+    /// \brief Fetches the dataset's type.
+    ///
+    /// \returns dataset type enum
+    ///
+    PacBio::BAM::DataSet::TypeEnum Type() const;
+
+    /// \brief Fetches the dataset's type.
+    ///
+    /// \returns printable dataset type
+    ///
+    std::string TypeName() const;
+
+    /// \}
+
+public:
+    /// \name Child Elements
+    /// \{
+
+    /// \brief Fetches the dataset's Extensions element.
+    ///
+    /// \returns const reference to child element
+    /// \throws std::runtime_error if element does not exist
+    ///
+    const PacBio::BAM::Extensions& Extensions() const;
+
+    /// \brief Fetches the dataset's ExternalResources element.
+    ///
+    /// \returns const reference to child element
+    /// \throws std::runtime_error if element does not exist
+    ///
+    const PacBio::BAM::ExternalResources& ExternalResources() const;
+
+    /// \brief Fetches the dataset's Filters element.
+    ///
+    /// \returns const reference to child element
+    ///
+    const PacBio::BAM::Filters& Filters() const;
+
+    /// \brief Fetches the dataset's DataSetMetadata element.
+    ///
+    /// \returns const reference to child element
+    ///
+    const PacBio::BAM::DataSetMetadata& Metadata() const;
+
+    /// \brief Fetches the dataset's DataSets element.
+    ///
+    /// \returns const reference to child element
+    ///
+    const PacBio::BAM::SubDataSets& SubDataSets() const;
+
+    /// \}
+
+public:
+    /// \name Resource Handling
+    /// \{
+
+    /// \brief Returns all of this dataset's resource files, with relative
+    ///        filepaths already resolved.
+    ///
+    /// Includes both primary resources (e.g. subread BAM files), as well as all
+    /// secondary or child resources (e.g. index files, scraps BAM, etc).
+    ///
+    /// \returns vector of (resolved) filepaths
+    ///
+    /// \sa DataSet::ResolvedResourceIds
+    ///
+    std::vector<std::string> AllFiles() const;
+
+    /// \brief Returns this dataset's primary %BAM resources, with relative
+    ///        filepaths already resolved.
+    ///
+    /// Primary resources are those listed as top-level %ExternalResources, not
+    /// associated files (indices, references, scraps %BAMs, etc.).
+    ///
+    /// \returns vector of BamFiles
+    ///
+    /// \sa DataSet::ResolvedResourceIds
+    ///
+    std::vector<BamFile> BamFiles() const;
+
+    /// \brief Returns all filenames for BamFiles(), with paths resolved.
+    ///
+    //  Unlike BamFiles(), this does not actually open the BAM files.
+    //  (BamFile(fn) would read the header.)
+    //
+    /// \returns vector of std::string
+    ///
+    /// \sa DataSet::BamFiles
+    ///
+    std::vector<std::string> BamFilenames() const;
+
+    /// \brief Returns this dataset's primary FASTA resources, with relative
+    ///        filepaths already resolved.
+    ///
+    /// Primary resources are those listed as top-level %ExternalResources, not
+    /// associated files (indices, references, scraps %BAMs, etc.).
+    ///
+    /// \returns vector of filepaths to FASTA resources
+    ///
+    /// \sa DataSet::ResolvedResourceIds
+    ///
+    std::vector<std::string> FastaFiles() const;
+
+    ///
+    /// \returns (absolute) path for dataset
+    ///
+    const std::string& Path() const;
+
+    /// \brief Returns all primary external resource filepaths, with relative
+    ///        paths resolved.
+    ///
+    /// Primary resources are those listed as top-level %ExternalResources, not
+    /// associated files (indices, references, scraps %BAMs, etc.).
+    ///
+    /// \sa ResolvePath
+    ///
+    /// \returns resourceIds
+    ///
+    std::vector<std::string> ResolvedResourceIds() const;
+
+    /// \brief Resolves a filepath (that may be relative to the dataset).
+    ///
+    /// A DataSet's resources may be described using absolute filepaths or with
+    /// relative paths. For absolute paths, nothing is changed from the input.
+    /// For relative paths, these are resolved using the DataSet's own path
+    /// as a starting point. A DataSet's own path will be one of:\n
+    ///  1 - the location of its XML or %BAM input file, e.g. created using
+    ///      DataSet("foo.xml") or DataSet("foo.bam")\n
+    ///  2 - application's current working directory for all other DataSet
+    ///      construction methods { DataSet(), DataSet(type),
+    ///      DataSet("foo.fofn") }\n
+    ///
+    /// \param[in] originalPath     input file path (absolute or relative)
+    /// \returns resolved path
+    ///
+    std::string ResolvePath(const std::string& originalPath) const;
+
+    /// \returns sequence chemistry info for all read groups in this dataset
+    ///
+    /// \sa ReadGroupInfo::SequencingChemistry
+    ///
+    std::set<std::string> SequencingChemistries() const;
+
+    /// \brief Return a minimal list of genomic intervals covered by filters.
+    ///
+    /// \returns vector of genomic intervals
+    ///
+    /// \throws std::runtime_error if DataSet contains invalid or non-sensical
+    ///         filters, such as rname appearing twice, etc.
+    ///
+    std::vector<GenomicInterval> GenomicIntervals() const;
+
+    /// \}
+
+public:
+    /// \name XML Namespace Handling
+    /// \{
+
+    /// \brief Access this dataset's namespace info.
+    ///
+    /// \returns const reference to dataset's NamespaceRegistry
+    ///
+    const NamespaceRegistry& Namespaces() const;
+
+    /// \}
+
+public:
+    /// \name Attributes
+    /// \{
+
+    /// \brief Fetches the value of a DataSet root element's attribute.
+    ///
+    /// These are the attributes attached to the root dataset element: \n
+    /// \verbatim <SubreadSet foo="x" bar="y" /> \endverbatim
+    ///
+    /// Built-in accessors exist for the standard attributes (e.g. CreatedAt)
+    /// but additional attributes can be used as well via these generic methods.
+    ///
+    /// A new attribute will be created if it does not yet exist.
+    ///
+    /// \param[in] name root element's attribute name
+    /// \returns non-const reference to attribute's value (empty string if this
+    ///          is a new attribute)
+    ///
+    std::string& Attribute(const std::string& name);
+
+    /// \brief Fetches the value of dataset's CreatedAt attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to attribute's value (empty string if this
+    ///          is a new attribute)
+    ///
+    std::string& CreatedAt();
+
+    /// \brief Fetches the value of dataset's Format attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to attribute's value (empty string if this
+    ///          is a new attribute)
+    ///
+    std::string& Format();
+
+    /// \brief Fetches the value of dataset's MetaType attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to attribute's value (empty string if this
+    ///          is a new attribute)
+    ///
+    std::string& MetaType();
+
+    /// \brief Fetches the value of dataset's ModifiedAt attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to attribute's value (empty string if this
+    ///          is a new attribute)
+    ///
+    std::string& ModifiedAt();
+
+    /// \brief Fetches the value of dataset's Name attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to attribute's value (empty string if this
+    ///          is a new attribute)
+    ///
+    std::string& Name();
+
+    /// \brief Fetches the value of dataset's ResourceId attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to attribute's value (empty string if this
+    ///          is a new attribute)
+    ///
+    std::string& ResourceId();
+
+    /// \brief Fetches the value of dataset's Tags attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to attribute's value (empty string if this
+    ///          is a new attribute)
+    ///
+    std::string& Tags();
+
+    /// \brief Fetches the value of dataset's TimeStampedName attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to attribute's value (empty string if this
+    ///          is a new attribute)
+    ///
+    std::string& TimeStampedName();
+
+    /// \brief Fetches the value of dataset's UniqueId attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to attribute's value (empty string if this
+    ///          is a new attribute)
+    ///
+    std::string& UniqueId();
+
+    /// \brief Fetches the value of dataset's Version attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to attribute's value (empty string if this
+    ///          is a new attribute)
+    ///
+    std::string& Version();
+
+    /// \}
+
+public:
+    /// \name Attributes
+    /// \{
+
+    /// \brief Sets this dataset's XML attribute \p name, with \p value
+    ///
+    /// These are the attributes attached to the root dataset element: \n
+    /// \verbatim <SubreadSet foo="x" bar="y" /> \endverbatim
+    ///
+    /// Built-in accessors exist for the standard attributes (e.g. CreatedAt)
+    /// but additional attributes can be used as well via these generic methods.
+    ///
+    /// The attribute will be created if it does not yet exist.
+    ///
+    /// \param[in] name   root element's attribute name
+    /// \param[in] value  new value for the attribute
+    /// \returns reference to this dataset object
+    ///
+    DataSet& Attribute(const std::string& name, const std::string& value);
+
+    /// \brief Sets this dataset's CreatedAt attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \param[in] createdAt  new value for the attribute
+    /// \returns reference to this dataset object
+    ///
+    DataSet& CreatedAt(const std::string& createdAt);
+
+    /// \brief Sets this dataset's Format attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \param[in] format  new value for the attribute
+    /// \returns reference to this dataset object
+    ///
+    DataSet& Format(const std::string& format);
+
+    /// \brief Sets this dataset's MetaType attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \param[in] metatype  new value for the attribute
+    /// \returns reference to this dataset object
+    ///
+    DataSet& MetaType(const std::string& metatype);
+
+    /// \brief Sets this dataset's ModifiedAt attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \param[in] modifiedAt  new value for the attribute
+    /// \returns reference to this dataset object
+    ///
+    DataSet& ModifiedAt(const std::string& modifiedAt);
+
+    /// \brief Sets this dataset's Name attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \param[in] name  new value for the attribute
+    /// \returns reference to this dataset object
+    ///
+    DataSet& Name(const std::string& name);
+
+    /// \brief Sets this dataset's ResourceId attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \param[in] resourceId  new value for the attribute
+    /// \returns reference to this dataset object
+    ///
+    DataSet& ResourceId(const std::string& resourceId);
+
+    /// \brief Sets this dataset's Tags attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \param[in] tags  new value for the attribute
+    /// \returns reference to this dataset object
+    ///
+    DataSet& Tags(const std::string& tags);
+
+    /// \brief Sets this dataset's TimeStampedName attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \param[in] timeStampedName  new value for the attribute
+    /// \returns reference to this dataset object
+    ///
+    DataSet& TimeStampedName(const std::string& timeStampedName);
+
+    /// \brief Sets this dataset's UniqueId attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \param[in] uuid  new value for the attribute
+    /// \returns reference to this dataset object
+    ///
+    DataSet& UniqueId(const std::string& uuid);
+
+    /// \brief Sets this dataset's Version attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \param[in] version  new value for the attribute
+    /// \returns reference to this dataset object
+    ///
+    DataSet& Version(const std::string& version);
+
+    /// \}
+
+public:
+    /// \name DataSet Type
+    /// \{
+
+    /// \brief Edits dataset type.
+    ///
+    /// \param[in] type  new dataset type
+    /// \returns reference to this dataset object
+    ///
+    DataSet& Type(const PacBio::BAM::DataSet::TypeEnum type);
+
+    /// \}
+
+public:
+    /// \name Child Elements
+    /// \{
+
+    /// \brief Fetches the dataset's Extensions element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to child element
+    ///
+    PacBio::BAM::Extensions& Extensions();
+
+    /// \brief Fetches the dataset's ExternalResources element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to child element
+    ///
+    PacBio::BAM::ExternalResources& ExternalResources();
+
+    /// \brief Fetches the dataset's Filters element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to child element
+    ///
+    PacBio::BAM::Filters& Filters();
+
+    /// \brief Fetches the dataset's DataSetMetadata element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to child element
+    ///
+    PacBio::BAM::DataSetMetadata& Metadata();
+
+    /// \brief Fetches the dataset's DataSets element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to child element
+    ///
+    PacBio::BAM::SubDataSets& SubDataSets();
+
+    /// \}
+
+public:
+    /// \name Child Elements
+    /// \{
+
+    /// \brief Sets this dataset's Extensions element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \param[in] extensions  new value for the element
+    /// \returns reference to this dataset object
+    ///
+    DataSet& Extensions(const PacBio::BAM::Extensions& extensions);
+
+    /// \brief Sets this dataset's ExternalResources element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \param[in] resources  new value for the element
+    /// \returns reference to this dataset object
+    ///
+    DataSet& ExternalResources(const PacBio::BAM::ExternalResources& resources);
+
+    /// \brief Sets this dataset's Filters element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \param[in] filters  new value for the element
+    /// \returns reference to this dataset object
+    ///
+    DataSet& Filters(const PacBio::BAM::Filters& filters);
+
+    /// \brief Sets this dataset's DataSetMetadata element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \param[in] metadata  new value for the element
+    /// \returns reference to this dataset object
+    ///
+    DataSet& Metadata(const PacBio::BAM::DataSetMetadata& metadata);
+
+    /// \brief Sets this dataset's DataSets element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \param[in] subdatasets  new value for the element
+    /// \returns reference to this dataset object
+    ///
+    DataSet& SubDataSets(const PacBio::BAM::SubDataSets& subdatasets);
+
+    /// \}
+
+public:
+    /// \name XML Namespace Handling
+    /// \{
+
+    /// \brief Access this dataset's namespace info.
+    ///
+    /// \returns non-const reference to dataset's NamespaceRegistry
+    ///
+    NamespaceRegistry& Namespaces();
+
+    /// \}
+
+private:
+    std::unique_ptr<DataSetBase> d_;
+};
+
+/// \name DataSet Timestamp Utilities
+/// \{
+
+/// \brief Fetches current time, in "DataSetXML format".
+///
+/// \returns DataSetXML formatted timestamp
+///
+/// \sa ToDataSetFormat
+///
+PBBAM_EXPORT std::string CurrentTimestamp();
+
+/// \brief Converts a time_point to "DataSetXML-formatted" timestamp.
+///
+/// This is the format used as a component of the DataSet::TimeStampedName
+/// (yymmdd_HHmmssttt).
+///
+/// \returns "DataSetXML-formatted" timestamp
+///
+PBBAM_EXPORT std::string ToDataSetFormat(const std::chrono::system_clock::time_point& tp);
+
+/// \brief Converts a time_t to "DataSetXML-formatted" timestamp.
+///
+/// This is the format used as a component of the DataSet::TimeStampedName
+/// (yymmdd_HHmmssttt).
+///
+/// \returns "DataSetXML-formatted" timestamp
+///
+PBBAM_EXPORT std::string ToDataSetFormat(const time_t& tp);
+
+/// \brief Converts a time_point to ISO-8601 formatted timestamp.
+///
+/// This is the format used in DataSet::CreatedAt and DataSet::ModifiedAt.
+///
+/// \returns ISO-8601 formatted timestamp
+///
+PBBAM_EXPORT std::string ToIso8601(const std::chrono::system_clock::time_point& tp);
+
+/// \brief Converts a time_t to ISO-8601 formatted timestamp.
+///
+/// This is the format used in DataSet::CreatedAt and DataSet::ModifiedAt.
+///
+/// \returns ISO-8601 formatted timestamp
+///
+PBBAM_EXPORT std::string ToIso8601(const time_t& t);
+
+/// \}
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // DATASET_H
diff --git a/include/pbbam/DataSetTypes.h b/include/pbbam/DataSetTypes.h

new file mode 100644 (file)

index 0000000..4bab41d
--- /dev/null
+++ b/include/pbbam/DataSetTypes.h
@@ -0,0 +1,1211 @@
+// File Description
+/// \file DataSetTypes.h
+/// \brief Defines the public DataSet component classes.
+//
+// Author: Derek Barnett
+
+#ifndef DATASETTYPES_H
+#define DATASETTYPES_H
+
+#include "pbbam/Config.h"
+
+#include <string>
+
+#include "pbbam/BamFile.h"
+#include "pbbam/DataSetXsd.h"
+#include "pbbam/internal/DataSetBaseTypes.h"
+
+namespace PacBio {
+namespace BAM {
+
+///
+/// \brief The DNABarcode class represents a %DNABarcode element in
+///        DataSetXML, consisting of a Name and optional UniqueId.
+///
+class PBBAM_EXPORT DNABarcode : public internal::DataSetElement
+{
+public:
+    DNABarcode(const std::string& name);
+    DNABarcode(const std::string& name, const std::string& uuid);
+    DNABarcode(const std::string& name, const internal::FromInputXml& fromInputXml);
+    DNABarcode(const std::string& name, const std::string& uuid,
+               const internal::FromInputXml& fromInputXml);
+
+    const std::string& Name() const;
+    std::string& Name();
+    DNABarcode& Name(const std::string& name);
+
+    const std::string& UniqueId() const;
+    std::string& UniqueId();
+    DNABarcode& UniqueId(const std::string& name);
+};
+
+/// \brief The DNABarcodes class represents a %DNABarcodes element in DataSetXML.
+///
+/// The DNABarcodes element is essentially just a list of DNABarcode
+/// objects.
+///
+class PBBAM_EXPORT DNABarcodes : public internal::DataSetElement
+{
+public:
+    DNABarcodes();
+    DNABarcodes(const internal::FromInputXml& fromInputXml);
+
+public:
+    void Add(const DNABarcode& barcode);
+    void Remove(const DNABarcode& barcode);
+
+public:
+    using value_type = DNABarcode;
+    using iterator_type = internal::DataSetElementIterator<value_type>;
+    using const_iterator_type = internal::DataSetElementConstIterator<value_type>;
+
+    const value_type& operator[](size_t index) const;
+    value_type& operator[](size_t index);
+
+    iterator_type begin();
+    const_iterator_type begin() const;
+    const_iterator_type cbegin() const;
+    iterator_type end();
+    const_iterator_type end() const;
+    const_iterator_type cend() const;
+};
+
+class PBBAM_EXPORT BioSample : public internal::DataSetElement
+{
+public:
+    BioSample(const std::string& name);
+    BioSample(const std::string& name, const internal::FromInputXml& fromInputXml);
+
+    const PacBio::BAM::DNABarcodes& DNABarcodes() const;
+    PacBio::BAM::DNABarcodes& DNABarcodes();
+    BioSample& DNABarcodes(const PacBio::BAM::DNABarcodes& barcodes);
+
+    const std::string& Name() const;
+    std::string& Name();
+    BioSample& Name(const std::string& name);
+};
+
+/// \brief The BioSamples class represents a %BioSamples element in DataSetXML.
+///
+/// The BioSamples element is essentially just a list of BioSample
+/// objects.
+///
+class PBBAM_EXPORT BioSamples : public internal::DataSetElement
+{
+public:
+    BioSamples();
+    BioSamples(const internal::FromInputXml& fromInputXml);
+
+public:
+    void Add(const BioSample& sample);
+    void Remove(const BioSample& sample);
+
+public:
+    using value_type = BioSample;
+    using iterator_type = internal::DataSetElementIterator<value_type>;
+    using const_iterator_type = internal::DataSetElementConstIterator<value_type>;
+
+    const value_type& operator[](size_t index) const;
+    value_type& operator[](size_t index);
+
+    iterator_type begin();
+    const_iterator_type begin() const;
+    const_iterator_type cbegin() const;
+    iterator_type end();
+    const_iterator_type end() const;
+    const_iterator_type cend() const;
+};
+
+/// \brief The ExtensionElement class represents an %ExtensionElement element in
+///        DataSetXML.
+///
+class PBBAM_EXPORT ExtensionElement : public internal::DataSetElement
+{
+public:
+    ExtensionElement();
+    ExtensionElement(const internal::FromInputXml& fromInputXml);
+};
+
+/// \brief The Extensions class represents an %Extensions element in DataSetXML.
+///
+/// The Extensions element is essentially just a list of ExtensionElement
+/// objects.
+///
+class PBBAM_EXPORT Extensions : public internal::DataSetElement
+{
+public:
+    /// \brief Creates an empty extensions list.
+    Extensions();
+    Extensions(const internal::FromInputXml& fromInputXml);
+
+public:
+    using value_type = ExtensionElement;
+    using iterator_type = internal::DataSetElementIterator<value_type>;
+    using const_iterator_type = internal::DataSetElementConstIterator<value_type>;
+
+    const value_type& operator[](size_t index) const;
+    value_type& operator[](size_t index);
+
+    iterator_type begin();
+    const_iterator_type begin() const;
+    const_iterator_type cbegin() const;
+    iterator_type end();
+    const_iterator_type end() const;
+    const_iterator_type cend() const;
+};
+
+class ExternalResources;
+
+/// \brief The ExternalResource class represents an %ExternalResource element in
+///        DataSetXML.
+///
+/// An ExternalResource can itself have a child element, ExternalResources, that
+/// lists related files (e.g. index files).
+///
+class PBBAM_EXPORT ExternalResource : public internal::IndexedDataType
+{
+public:
+    /// \brief Creates an ExternalResource from a BamFile object.
+    ///
+    /// The metatype & resourceId are automatically set.
+    ///
+    ExternalResource(const BamFile& bamFile);
+
+    /// \brief Creates an ExternalResource with provided \p metatype and
+    ///        \p filename as resource ID.
+    ///
+    ExternalResource(const std::string& metatype, const std::string& filename);
+
+    ExternalResource(const std::string& metatype, const std::string& filename,
+                     const internal::FromInputXml& fromInputXml);
+
+public:
+    /// \brief Fetches the resource's ExternalResources child element.
+    ///
+    /// \returns const reference to child element
+    /// \throws std::runtime_error if element does not exist
+    ///
+    const PacBio::BAM::ExternalResources& ExternalResources() const;
+
+public:
+    /// \brief Fetches the resource's ExternalResources child element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to child element
+    ///
+    PacBio::BAM::ExternalResources& ExternalResources();
+
+    /// \brief Sets this resource's ExternalResources child element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \param[in] resources  new value for the element
+    /// \returns reference to this resource object
+    ///
+    ExternalResource& ExternalResources(const PacBio::BAM::ExternalResources& resources);
+
+public:
+    /// \brief Converts an ExternalResource to a BamFile object
+    ///
+    /// \returns corresponding BamFile object for this ExternalResource
+    /// \throws std::runtime_error if fails to open %BAM file (e.g. does not
+    ///         exist, not a %BAM file, etc.)
+    ///
+    /// \deprecated Use the results from DataSet::BamFiles instead. This method
+    ///             cannot resolve relative filepaths and will be removed in the
+    ///             near future.
+    ///
+    BamFile ToBamFile() const;
+};
+
+/// \brief The ExternalResources class represents an %ExternalResources element
+///        in DataSetXML.
+///
+/// The ExternalResources element is essentially just a list of ExternalResource
+/// elements.
+///
+class PBBAM_EXPORT ExternalResources : public internal::DataSetElement
+{
+public:
+    /// \brief Creates an empty resource list.
+    ExternalResources();
+    ExternalResources(const internal::FromInputXml& fromInputXml);
+
+    /// \brief Merges \p other resource list with this one.
+    ExternalResources& operator+=(const ExternalResources& other);
+
+public:
+    /// \brief Adds an ExternalResource to this list.
+    void Add(const ExternalResource& ext);
+
+    /// \brief Removes an ExternalResource from this list.
+    void Remove(const ExternalResource& ext);
+
+public:
+    /// \brief Converts resource list to BamFile objects.
+    ///
+    /// \deprecated Use DataSet::BamFiles instead. This method cannot resolve
+    ///             relative filepaths and will be removed in the near future.
+    ///
+    std::vector<BamFile> BamFiles() const;
+
+public:
+    using value_type = ExternalResource;
+    using iterator_type = internal::DataSetElementIterator<value_type>;
+    using const_iterator_type = internal::DataSetElementConstIterator<value_type>;
+
+    const value_type& operator[](size_t index) const;
+    value_type& operator[](size_t index);
+
+    iterator_type begin();
+    const_iterator_type begin() const;
+    const_iterator_type cbegin() const;
+    iterator_type end();
+    const_iterator_type end() const;
+    const_iterator_type cend() const;
+};
+
+/// \brief The FileIndex class represents a %FileIndex element in DataSetXML.
+///
+/// A FileIndex is used as an auxiliary to an ExternalResource, providing
+/// information about a data file's index file (e.g. for %BAM files, *.bai or
+/// *.pbi).
+///
+class PBBAM_EXPORT FileIndex : public internal::InputOutputDataType
+{
+public:
+    /// \brief Creates a FileIndex with provided \p metatype and \p filename as
+    ///        resource ID.
+    ///
+    FileIndex(const std::string& metatype, const std::string& filename);
+
+    FileIndex(const std::string& metatype, const std::string& filename,
+              const internal::FromInputXml& fromInputXml);
+};
+
+/// \brief The FileIndices class represents a %FileIndices element in DataSetXML.
+///
+/// The FileIndices element is essentially just a list of FileIndex elements,
+/// providing information about a data file's index files (e.g. for %BAM files
+/// this will usually be *.bai and/or *.pbi).
+///
+class PBBAM_EXPORT FileIndices : public internal::DataSetElement
+{
+public:
+    /// \brief Creates an empty index list.
+    FileIndices();
+    FileIndices(const internal::FromInputXml& fromInputXml);
+
+public:
+    /// \brief Adds a FileIndex to this list.
+    void Add(const FileIndex& index);
+
+    /// \brief Removes a FileIndex from this list.
+    void Remove(const FileIndex& index);
+
+public:
+    using value_type = FileIndex;
+    using iterator_type = internal::DataSetElementIterator<value_type>;
+    using const_iterator_type = internal::DataSetElementConstIterator<value_type>;
+
+    const value_type& operator[](size_t index) const;
+    value_type& operator[](size_t index);
+
+    iterator_type begin();
+    const_iterator_type begin() const;
+    const_iterator_type cbegin() const;
+    iterator_type end();
+    const_iterator_type end() const;
+    const_iterator_type cend() const;
+};
+
+/// \brief The Filter class represents a %Filter element in DataSetXML.
+///
+/// The Filter element allows analysis pipelines to describe filters on data
+/// that should be respected downstream, without needing to create filtered
+/// intermediate files.
+///
+/// A filter consists of a list of Property elements, each of which must be
+/// passed (logical AND) to pass the filter, e.g. property1 && property2 &&
+/// property3.
+///
+class PBBAM_EXPORT Filter : public internal::DataSetElement
+{
+public:
+    /// \brief Creates an empty filter.
+    Filter();
+    Filter(const internal::FromInputXml& fromInputXml);
+
+public:
+    /// \brief Fetches the filter's property list element.
+    ///
+    /// \returns const reference to child element
+    /// \throws std::runtime_error if element does not exist
+    ///
+    const PacBio::BAM::Properties& Properties() const;
+
+public:
+    /// \brief Fetches the filter's property list child element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to child element
+    ///
+    PacBio::BAM::Properties& Properties();
+
+    /// \brief Sets this filter's Properties child element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \param[in] properties new value for the element
+    /// \returns reference to this filter object
+    ///
+    Filter& Properties(const PacBio::BAM::Properties& properties);
+};
+
+/// \brief The Filters class represents a %Filters list element in DataSetXML.
+///
+/// The Filters element is essentially a list of Filter elements. For analysis
+/// purposes, each filter is considered separately (logical OR) to determine
+/// which data passes, e.g. filter1 || filter2 || filter3.
+///
+class PBBAM_EXPORT Filters : public internal::DataSetElement
+{
+public:
+    /// \brief Creates an empty filter list.
+    Filters();
+    Filters(const internal::FromInputXml& fromInputXml);
+
+    /// \brief Merges \p other filter list with this one.
+    Filters& operator+=(const Filters& other);
+
+public:
+    /// \brief Adds a filter to this list.
+    void Add(const Filter& filter);
+
+    /// \brief Removes a filter from this list.
+    void Remove(const Filter& filter);
+
+public:
+    using value_type = Filter;
+    using iterator_type = internal::DataSetElementIterator<value_type>;
+    using const_iterator_type = internal::DataSetElementConstIterator<value_type>;
+
+    const value_type& operator[](size_t index) const;
+    value_type& operator[](size_t index);
+
+    iterator_type begin();
+    const_iterator_type begin() const;
+    const_iterator_type cbegin() const;
+    iterator_type end();
+    const_iterator_type end() const;
+    const_iterator_type cend() const;
+};
+
+/// \brief The ParentTool class represents a %ParentTool element in DataSetXML.
+///
+class PBBAM_EXPORT ParentTool : public internal::BaseEntityType
+{
+public:
+    /// \brief Creates an empty %ParentTool element.
+    ParentTool();
+    ParentTool(const internal::FromInputXml& fromInputXml);
+};
+
+/// \brief The Property class represents a %Property element in DataSetXML.
+///
+/// A Property is the primary building block of %DataSetXML filtering. The
+/// %Property element describes a data record's property (or field), some value,
+/// and a comparison operator.
+///
+/// For example, one could filter all %BAM records with a read accuracy at or
+/// above 0.9. In C++ this could be constructed like:
+/// \code{.cpp}
+/// Property p("accuracy", "0.9", ">=");
+/// \endcode
+///
+class PBBAM_EXPORT Property : public internal::DataSetElement
+{
+public:
+    /// \brief Constructs a filter property.
+    Property(const std::string& name, const std::string& value, const std::string& op);
+    Property(const std::string& name, const std::string& value, const std::string& op,
+             const internal::FromInputXml& fromInputXml);
+
+public:
+    /// \brief Fetches the value of property's Name attribute.
+    ///
+    /// \returns const reference to attribute value
+    ///
+    const std::string& Name() const;
+
+    /// \brief Fetches the value of property's Operator attribute.
+    ///
+    /// \returns const reference to attribute value
+    ///
+    const std::string& Operator() const;
+
+    /// \brief Fetches the value of property's Value attribute.
+    ///
+    /// \returns const reference to attribute value
+    ///
+    const std::string& Value() const;
+
+public:
+    /// \brief Fetches the value of property's Name attribute.
+    ///
+    /// \returns non-const reference to attribute value
+    ///
+    std::string& Name();
+
+    /// \brief Fetches the value of property's Operator attribute.
+    ///
+    /// \returns non-const reference to attribute value
+    ///
+    std::string& Operator();
+
+    /// \brief Fetches the value of property's Value attribute.
+    ///
+    /// \returns non-const reference to attribute value
+    ///
+    std::string& Value();
+
+public:
+    /// \brief Sets this property's Name attribute.
+    ///
+    /// \param[in] name  new value for the attribute
+    /// \returns reference to this property object
+    ///
+    Property& Name(const std::string& name);
+
+    /// \brief Sets this property's Operator attribute.
+    ///
+    /// \param[in] op  new value for the attribute
+    /// \returns reference to this property object
+    ///
+    Property& Operator(const std::string& op);
+
+    /// \brief Sets this property's Value attribute.
+    ///
+    /// \param[in] value  new value for the attribute
+    /// \returns reference to this property object
+    ///
+    Property& Value(const std::string& value);
+};
+
+/// \brief The Properties class represents a %Properties list element in
+///        DataSetXML.
+///
+/// The Properties element is essentially a list of Property elements.
+///
+class PBBAM_EXPORT Properties : public internal::DataSetElement
+{
+public:
+    /// \brief Creates an empty property list.
+    Properties();
+    Properties(const internal::FromInputXml& fromInputXml);
+
+public:
+    /// \brief Adds a property to this list.
+    void Add(const Property& property);
+
+    /// \brief Removes a property from this list.
+    void Remove(const Property& property);
+
+public:
+    using value_type = Property;
+    using iterator_type = internal::DataSetElementIterator<value_type>;
+    using const_iterator_type = internal::DataSetElementConstIterator<value_type>;
+
+    const value_type& operator[](size_t index) const;
+    value_type& operator[](size_t index);
+
+    iterator_type begin();
+    const_iterator_type begin() const;
+    const_iterator_type cbegin() const;
+    iterator_type end();
+    const_iterator_type end() const;
+    const_iterator_type cend() const;
+};
+
+/// \brief The Provenance class represents a %Provenance element in DataSetXML.
+///
+class PBBAM_EXPORT Provenance : public internal::DataSetElement
+{
+public:
+    /// \brief Creates an empty provenance element.
+    Provenance();
+    Provenance(const internal::FromInputXml& fromInputXml);
+
+public:
+    /// \brief Fetches the value of CreatedBy attribute.
+    ///
+    /// \returns const reference to attribute value (empty string if not
+    ///          present)
+    ///
+    const std::string& CreatedBy() const;
+
+    /// \brief Fetches the value of CommonServicesInstanceId attribute.
+    ///
+    /// \returns const reference to attribute value (empty string if not
+    ///          present)
+    ///
+    const std::string& CommonServicesInstanceId() const;
+
+    /// \brief Fetches the value of CreatorUserId attribute.
+    ///
+    /// \returns const reference to attribute value (empty string if not
+    ///          present)
+    ///
+    const std::string& CreatorUserId() const;
+
+    /// \brief Fetches the value of ParentJobId attribute.
+    ///
+    /// \returns const reference to attribute value (empty string if not
+    ///          present)
+    ///
+    const std::string& ParentJobId() const;
+
+    /// \brief Fetches the ParentTool child element.
+    ///
+    /// \returns const reference to child element
+    /// \throws std::runtime_error if element does not exist
+    ///
+    const PacBio::BAM::ParentTool& ParentTool() const;
+
+public:
+    /// \brief Fetches the value of CreatedBy attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to attribute value (empty string if this is
+    ///          a new attribute)
+    ///
+    std::string& CreatedBy();
+
+    /// \brief Fetches the value of CommonServicesInstanceId attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to attribute value (empty string if this is
+    ///          a new attribute)
+    ///
+    std::string& CommonServicesInstanceId();
+
+    /// \brief Fetches the value of CreatorUserId attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to attribute value (empty string if this is
+    ///          a new attribute)
+    ///
+    std::string& CreatorUserId();
+
+    /// \brief Fetches the value of ParentJobId attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to attribute value (empty string if this is
+    ///          a new attribute)
+    ///
+    std::string& ParentJobId();
+
+    /// \brief Fetches the ParentTool child element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to child element
+    ///
+    PacBio::BAM::ParentTool& ParentTool();
+
+public:
+    /// \brief Sets the CreatedBy attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \param[in] createdBy  new value for the attribute
+    /// \returns reference to this object
+    ///
+    Provenance& CreatedBy(const std::string& createdBy);
+
+    /// \brief Sets the CommonServicesInstanceId attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \param[in] id  new value for the attribute
+    /// \returns reference to this object
+    ///
+    Provenance& CommonServicesInstanceId(const std::string& id);
+
+    /// \brief Sets the CreatorUserId attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \param[in] id  new value for the attribute
+    /// \returns reference to this object
+    ///
+    Provenance& CreatorUserId(const std::string& id);
+
+    /// \brief Sets the ParentJobId attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \param[in] id  new value for the attribute
+    /// \returns reference to this object
+    ///
+    Provenance& ParentJobId(const std::string& id);
+
+    /// \brief Sets the ParentTool child element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \param[in] tool  new value for the element
+    /// \returns reference to this object
+    ///
+    Provenance& ParentTool(const PacBio::BAM::ParentTool& tool);
+};
+
+/// \brief The DataSetMetadata class represents the %DataSetMetadata child
+///        element in DataSetXML.
+///
+/// A few top-level elements are built-in, but as pbbam is not primarily a
+/// DataSetXML API, most of the metadata hierarchy needs to be manually managed.
+///
+class PBBAM_EXPORT DataSetMetadata : public internal::DataSetElement
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    DataSetMetadata();
+    DataSetMetadata(const internal::FromInputXml& fromInputXml);
+
+    /// \brief Constructs a DataSetMetadata with required fields.
+    DataSetMetadata(const std::string& numRecords, const std::string& totalLength);
+    DataSetMetadata(const std::string& numRecords, const std::string& totalLength,
+                    const internal::FromInputXml& fromInputXml);
+
+    /// \}
+
+public:
+    /// \name Operators
+    /// \{
+
+    /// \brief Merges DataSetMetadata contents.
+    ///
+    /// Adds contents of \p other to this metadata object
+    ///
+    /// \param[in] other  some other metadata to add to this one
+    /// \returns reference to this object
+    ///
+    DataSetMetadata& operator+=(const DataSetMetadata& other);
+
+    /// \}
+
+public:
+    /// \name Child Elements
+    /// \{
+
+    /// \brief Fetches the text of the NumRecords element.
+    ///
+    /// \returns const reference to element text (empty string if not present)
+    ///
+    const std::string& NumRecords() const;
+
+    /// \brief Fetches the text of the TotalLength element.
+    ///
+    /// \returns const reference to element text (empty string if not present)
+    ///
+    const std::string& TotalLength() const;
+
+    /// \brief Fetches the Provenance element.
+    ///
+    /// \returns const reference to child element
+    /// \throws std::runtime_error if element does not exist
+    ///
+    const PacBio::BAM::Provenance& Provenance() const;
+
+    /// \brief Fetches the BioSamples element.
+    ///
+    /// \returns const reference to child element
+    /// \throws std::runtime_error if element does not exist
+    ///
+    const PacBio::BAM::BioSamples& BioSamples() const;
+
+    /// \}
+
+public:
+    /// \name Child Elements
+    /// \{
+
+    /// \brief Fetches the text of the NumRecords element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to element text
+    ///
+    std::string& NumRecords();
+
+    /// \brief Fetches the text of the TotalLength element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to element text
+    ///
+    std::string& TotalLength();
+
+    /// \brief Fetches Provenance element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to child element
+    ///
+    PacBio::BAM::Provenance& Provenance();
+
+    /// \brief Fetches BioSamples element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to child element
+    ///
+    PacBio::BAM::BioSamples& BioSamples();
+
+    /// \}
+
+public:
+    /// \name Child Elements
+    /// \{
+
+    /// \brief Sets the text of the NumRecords element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \returns reference to this metadata object
+    ///
+    DataSetMetadata& NumRecords(const std::string& numRecords);
+
+    /// \brief Sets the text of the TotalLength element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \returns reference to this metadata object
+    ///
+    DataSetMetadata& TotalLength(const std::string& totalLength);
+
+    /// \brief Sets the Provenance child element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \returns reference to this metadata object
+    ///
+    DataSetMetadata& Provenance(const PacBio::BAM::Provenance& provenance);
+
+    /// \brief Sets the BioSamples child element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \returns reference to this metadata object
+    ///
+    DataSetMetadata& BioSamples(const PacBio::BAM::BioSamples& samples);
+
+    /// \}
+};
+
+class SubDataSets;
+
+/// \brief The DataSetBase class provides the attributes & child elements shared
+///        by all dataset types.
+///
+/// Client code should not need to use this class directly. It should be
+/// considered as more of an implementation detail and may in fact be removed
+/// from public API in the future. The top-level DataSet is the recommended
+/// entry point.
+///
+class PBBAM_EXPORT DataSetBase : public internal::StrictEntityType
+{
+public:
+    /// \brief Creates a DataSetBase object, or one of its subclasses, from an
+    ///        XML element name (e.g. SubreadSet)
+    ///
+    static std::shared_ptr<DataSetBase> Create(const std::string& typeName);
+    static std::shared_ptr<DataSetBase> Create(const std::string& typeName,
+                                               const internal::FromInputXml& fromInputXml);
+
+public:
+    /// \brief Creates an empty, generic DataSetBase.
+    DataSetBase();
+    DataSetBase(const internal::FromInputXml& fromInputXml);
+
+protected:
+    /// \brief Creates a DataSetBase with key values initialized.
+    DataSetBase(const std::string& metatype, const std::string& label, const XsdType& xsd);
+    DataSetBase(const std::string& metatype, const std::string& label,
+                const internal::FromInputXml& fromInputXml, const XsdType& xsd);
+
+    /// \brief Returns a new DataSetBase containing a deep copy of contents
+    DataSetBase* DeepCopy() const;
+
+public:
+    /// \brief Merges dataset contents.
+    ///
+    /// Adds contents of \p other to this dataset object
+    ///
+    /// \param[in] other  some other dataset to add to this one
+    /// \returns reference to this dataset object
+    ///
+    DataSetBase& operator+=(const DataSetBase& other);
+
+public:
+    /// \brief Fetches the dataset's ExternalResources element.
+    ///
+    /// \returns const reference to child element
+    /// \throws std::runtime_error if element does not exist
+    ///
+    const PacBio::BAM::ExternalResources& ExternalResources() const;
+
+    /// \brief Fetches the dataset's Filters element.
+    ///
+    /// \returns const reference to child element
+    ///
+    const PacBio::BAM::Filters& Filters() const;
+
+    /// \brief Fetches the dataset's DataSetMetadata element.
+    ///
+    /// \returns const reference to child element
+    ///
+    const PacBio::BAM::DataSetMetadata& Metadata() const;
+
+    /// \brief Fetches the dataset's DataSets element.
+    ///
+    /// \returns const reference to child element
+    ///
+    const PacBio::BAM::SubDataSets& SubDataSets() const;
+
+public:
+    /// \brief Access this dataset's namespace info.
+    ///
+    /// \returns const reference to dataset's NamespaceRegistry
+    ///
+    const NamespaceRegistry& Namespaces() const;
+
+public:
+    /// \brief Fetches the dataset's ExternalResources element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to child element
+    ///
+    PacBio::BAM::ExternalResources& ExternalResources();
+
+    /// \brief Fetches the dataset's Filters element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to child element
+    ///
+    PacBio::BAM::Filters& Filters();
+
+    /// \brief Fetches the dataset's DataSetMetadata element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to child element
+    ///
+    PacBio::BAM::DataSetMetadata& Metadata();
+
+    /// \brief Fetches the dataset's DataSets element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to child element
+    ///
+    PacBio::BAM::SubDataSets& SubDataSets();
+
+public:
+    /// \brief Sets this dataset's ExternalResources element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \param[in] resources  new value for the element
+    /// \returns reference to this dataset object
+    ///
+    DataSetBase& ExternalResources(const PacBio::BAM::ExternalResources& resources);
+
+    /// \brief Sets this dataset's Filters element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \param[in] filters  new value for the element
+    /// \returns reference to this dataset object
+    ///
+    DataSetBase& Filters(const PacBio::BAM::Filters& filters);
+
+    /// \brief Sets this dataset's DataSetMetadata element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \param[in] metadata  new value for the element
+    /// \returns reference to this dataset object
+    ///
+    DataSetBase& Metadata(const PacBio::BAM::DataSetMetadata& metadata);
+
+    /// \brief Sets this dataset's DataSets element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \param[in] subdatasets  new value for the element
+    /// \returns reference to this dataset object
+    ///
+    DataSetBase& SubDataSets(const PacBio::BAM::SubDataSets& subdatasets);
+
+public:
+    /// \brief Access this dataset's namespace info.
+    ///
+    /// \returns non-const reference to dataset's NamespaceRegistry
+    ///
+    NamespaceRegistry& Namespaces();
+
+public:
+    /// \brief Saves dataset XML to file.
+    ///
+    /// \param[in] outputFilename destination for XML contents
+    ///
+    /// \throws std::runtime_error if file could not be opened or if DataSet
+    ///         elements could not be converted to XML
+    ///
+    void Save(const std::string& outputFilename);
+
+    /// \brief Saves dataset XML to output stream, e.g. std::cout,
+    ///        std::stringstream.
+    ///
+    /// \param[out] out destination for XML contents
+    ///
+    /// \throws std::runtime_error if DataSet elements could not be converted to
+    ///         XML
+    ///
+    void SaveToStream(std::ostream& out);
+
+public:
+    ///
+    /// \returns true if dataset was read from XML input
+    ///
+    bool FromInputXml() const;
+
+    ///
+    /// \brief Indicate that dataset was read from XML input
+    ///
+    void FromInputXml(bool ok);
+
+    ///
+    /// \returns (absolute) path for dataset
+    ///
+    const std::string& Path() const;
+
+    ///
+    /// \brief Set dataset path
+    ///
+    void Path(const std::string& path);
+
+private:
+    NamespaceRegistry registry_;
+    std::string path_;
+    bool fromInputXml_ = false;
+};
+
+/// \brief The AlignmentSet class represents an %AlignmentSet root element in
+///        DataSetXML.
+///
+class PBBAM_EXPORT AlignmentSet : public DataSetBase
+{
+public:
+    /// \brief Creates an empty AlignmentSet dataset.
+    AlignmentSet();
+    AlignmentSet(const internal::FromInputXml& fromInputXml);
+};
+
+/// \brief The BarcodeSet class represents a %BarcodeSet root element in
+///        DataSetXML.
+///
+class PBBAM_EXPORT BarcodeSet : public DataSetBase
+{
+public:
+    /// \brief Creates an empty BarcodeSet dataset.
+    BarcodeSet();
+    BarcodeSet(const internal::FromInputXml& fromInputXml);
+};
+
+/// \brief The ConsensusAlignmentSet class represents a %ConsensusAlignmentSet
+///        root element in DataSetXML.
+///
+class PBBAM_EXPORT ConsensusAlignmentSet : public DataSetBase
+{
+public:
+    /// \brief Creates an empty ConsensusAlignmentSet dataset.
+    ConsensusAlignmentSet();
+    ConsensusAlignmentSet(const internal::FromInputXml& fromInputXml);
+};
+
+/// \brief The ConsensusReadSet class represents a %ConsensusReadSet root
+///        element in DataSetXML.
+///
+class PBBAM_EXPORT ConsensusReadSet : public DataSetBase
+{
+public:
+    /// \brief Creates an empty ConsensusReadSet dataset.
+    ConsensusReadSet();
+    ConsensusReadSet(const internal::FromInputXml& fromInputXml);
+};
+
+/// \brief The ContigSet class represents a %ContigSet root element in
+///        DataSetXML.
+///
+class PBBAM_EXPORT ContigSet : public DataSetBase
+{
+public:
+    /// \brief Creates an empty ContigSet dataset.
+    ContigSet();
+    ContigSet(const internal::FromInputXml& fromInputXml);
+};
+
+/// \brief The HdfSubreadSet class represents a %HdfSubreadSet root element in
+///        DataSetXML.
+///
+class PBBAM_EXPORT HdfSubreadSet : public DataSetBase
+{
+public:
+    /// \brief Creates an empty HdfSubreadSet dataset.
+    HdfSubreadSet();
+    HdfSubreadSet(const internal::FromInputXml& fromInputXml);
+};
+
+/// \brief The ReferenceSet class represents a %ReferenceSet root element in
+///        DataSetXML.
+///
+class PBBAM_EXPORT ReferenceSet : public DataSetBase
+{
+public:
+    /// \brief Creates an empty ReferenceSet dataset.
+    ReferenceSet();
+    ReferenceSet(const internal::FromInputXml& fromInputXml);
+};
+
+/// \brief The SubDataSets class represents a %DataSets list element in
+///        DataSetXML.
+///
+/// The SubDataSets element is essentially a list of DataSets.
+///
+class PBBAM_EXPORT SubDataSets : public internal::DataSetElement
+{
+public:
+    /// \brief Creates an empty list of sub-datasets.
+    SubDataSets();
+    SubDataSets(const internal::FromInputXml& fromInputXml);
+
+public:
+    /// \brief Adds \p other sub-dataset to this list.
+    SubDataSets& operator+=(const DataSetBase& other);  // single
+
+    /// \brief Adds \p other sub-dataset list to this list.
+    SubDataSets& operator+=(const SubDataSets& other);  // list
+
+public:
+    /// \brief Adds a sub-dataset to this list.
+    void Add(const DataSetBase& subdataset);
+
+    /// \brief Removes a sub-dataset from this list.
+    void Remove(const DataSetBase& subdataset);
+
+public:
+    using value_type = DataSetBase;
+    using iterator_type = internal::DataSetElementIterator<value_type>;
+    using const_iterator_type = internal::DataSetElementConstIterator<value_type>;
+
+    const value_type& operator[](size_t index) const;
+    value_type& operator[](size_t index);
+
+    iterator_type begin();
+    const_iterator_type begin() const;
+    const_iterator_type cbegin() const;
+    iterator_type end();
+    const_iterator_type end() const;
+    const_iterator_type cend() const;
+};
+
+/// \brief The SubreadSet class represents a %SubreadSet root element in
+///        DataSetXML.
+///
+class PBBAM_EXPORT SubreadSet : public DataSetBase
+{
+public:
+    /// \brief Creates an empty SubreadSet dataset.
+    SubreadSet();
+    SubreadSet(const internal::FromInputXml& fromInputXml);
+};
+
+/// \brief The TranscriptSet class represents a %TranscriptSet root element in
+///        DataSetXML.
+///
+class PBBAM_EXPORT TranscriptSet : public DataSetBase
+{
+public:
+    /// \brief Creates an empty TranscriptSet dataset.
+    TranscriptSet();
+    TranscriptSet(const internal::FromInputXml& fromInputXml);
+};
+
+/// \brief The TranscriptAlignmentSet class represents a %TranscriptAlignmentSet
+///        root element in DataSetXML.
+///
+class PBBAM_EXPORT TranscriptAlignmentSet : public DataSetBase
+{
+public:
+    /// \brief Creates an empty TranscriptAlignmentSet dataset.
+    TranscriptAlignmentSet();
+    TranscriptAlignmentSet(const internal::FromInputXml& fromInputXml);
+};
+
+enum class XmlElementType
+{
+    GENERIC_ELEMENT,
+    DATASET_METADATA,
+    BIOSAMPLE,
+    BIOSAMPLES,
+    DNA_BARCODE,
+    DNA_BARCODES,
+    EXTENSION,
+    EXTENSIONS,
+    EXTERNAL_RESOURCE,
+    EXTERNAL_RESOURCES,
+    FILE_INDEX,
+    FILE_INDICES,
+    FILTER,
+    FILTERS,
+    PARENT_TOOL,
+    PROPERTY,
+    PROPERTIES,
+    PROVENANCE,
+
+    GENERIC_DATASET,
+    ALIGNMENT_SET,
+    BARCODE_SET,
+    CONSENSUS_ALIGNMENT_SET,
+    CONSENSUS_READ_SET,
+    CONTIG_SET,
+    HDF_SUBREAD_SET,
+    REFERENCE_SET,
+    SUBREAD_SET,
+    TRANSCRIPT_SET,
+    TRANSCRIPT_ALIGNMENT_SET,
+    SUBDATASETS
+};
+
+/// \returns the enum value for the requested XML element
+///          (generic if not a built-in element type)
+XmlElementType ElementTypeFromName(const std::string& name);
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // DATASETTYPES_H
diff --git a/include/pbbam/DataSetXsd.h b/include/pbbam/DataSetXsd.h

new file mode 100644 (file)

index 0000000..df18595
--- /dev/null
+++ b/include/pbbam/DataSetXsd.h
@@ -0,0 +1,116 @@
+// File Description
+/// \file DataSetXsd.h
+/// \brief Defines the XSD- and namespace-related classes for DataSetXML.
+//
+// Author: Derek Barnett
+
+#ifndef DATASETXSD_H
+#define DATASETXSD_H
+
+#include "pbbam/Config.h"
+
+#include <map>
+#include <string>
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The XsdType enum defines the supported XSD namespaces.
+///
+enum class XsdType
+{
+    NONE,
+    AUTOMATION_CONSTRAINTS,
+    BASE_DATA_MODEL,
+    COLLECTION_METADATA,
+    COMMON_MESSAGES,
+    DATA_MODEL,
+    DATA_STORE,
+    DATASETS,
+    DECL_DATA,
+    PART_NUMBERS,
+    PRIMARY_METRICS,
+    REAGENT_KIT,
+    RIGHTS_AND_ROLES,
+    SAMPLE_INFO,
+    SEEDING_DATA
+};
+
+/// \brief The NamespaceInfo class provides XML namespace info (prefix & URI).
+///
+class PBBAM_EXPORT NamespaceInfo
+{
+public:
+    /// \brief Creates an empty entry.
+    ///
+    /// This constructor only exists for STL container compatibility.
+    ///
+    NamespaceInfo() = default;
+
+    /// \brief Creates a valid info entry.
+    NamespaceInfo(std::string name, std::string uri);
+
+public:
+    /// \brief Fetches namespace name (i.e. prefix)
+    const std::string& Name() const;
+
+    /// \brief Fetches namespace URI.
+    const std::string& Uri() const;
+
+private:
+    std::string name_;
+    std::string uri_;
+};
+
+/// \brief The NamespaceRegistry class provides a per-dataset registry of XML
+///        namespace information.
+///
+/// This is used to format XML output - properly prefixing element labels with
+/// namespace as appropriate.
+///
+class PBBAM_EXPORT NamespaceRegistry
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    NamespaceRegistry();
+
+    /// \}
+
+public:
+    /// \name Registry Access
+    /// \{
+
+    /// \brief Fetches namespace info for the dataset's default XSD type.
+    const NamespaceInfo& DefaultNamespace() const;
+
+    /// \brief Fetches dataset's default XSD type.
+    XsdType DefaultXsd() const;
+
+    /// \brief Fetches namespace info for the requested XSD type.
+    const NamespaceInfo& Namespace(const XsdType& xsd) const;
+
+    /// \brief Registers namespace info for a particular XSD type.
+    void Register(const XsdType& xsd, const NamespaceInfo& namespaceInfo);
+
+    /// \brief Updates dataset's default XSD type.
+    void SetDefaultXsd(const XsdType& xsd);
+
+    /// \brief Fetches the XSD type for \p elementLabel.
+    XsdType XsdForElement(const std::string& elementLabel) const;
+
+    /// \brief Fetches the XSD type for a particular URI.
+    XsdType XsdForUri(const std::string& uri) const;
+
+    /// \}
+
+private:
+    std::map<XsdType, NamespaceInfo> data_;
+    XsdType defaultXsdType_ = XsdType::DATASETS;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // DATASETXSD_H
diff --git a/include/pbbam/EntireFileQuery.h b/include/pbbam/EntireFileQuery.h

new file mode 100644 (file)

index 0000000..2d15fa7
--- /dev/null
+++ b/include/pbbam/EntireFileQuery.h
@@ -0,0 +1,65 @@
+// File Description
+/// \file EntireFileQuery.h
+/// \brief Defines the EntireFileQuery class.
+//
+// Author: Derek Barnett
+
+#ifndef ENTIREFILEQUERY_H
+#define ENTIREFILEQUERY_H
+
+#include "pbbam/Config.h"
+
+#include <memory>
+
+#include "pbbam/internal/QueryBase.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The EntireFileQuery class provides iterable access to a DataSet's
+///        %BAM records, reading through the entire contents of each file.
+///
+/// Input files will be accessed in the order listed in the DataSet.
+///
+/// \include code/EntireFileQuery.txt
+///
+/// Iteration is not limited to only 'const' records. The files themselves will
+/// not be affected, but individual records may be modified if needed.
+///
+/// \include code/EntireFileQuery_NonConst.txt
+///
+/// \note DataSets can be implicitly constructed from %BAM filenames as well.
+///       Thus a single %BAM file can be read through using the following:
+///
+/// \include code/EntireFileQuery_BamFilename.txt
+///
+class PBBAM_EXPORT EntireFileQuery : public internal::IQuery
+{
+public:
+    /// \brief Creates a new EntireFileQuery, reading through the entire
+    ///        contents of a dataset.
+    ///
+    /// \param[in] dataset  input data source(s)
+    /// \throws std::runtime_error on failure to open/read underlying %BAM
+    ///         files.
+    ///
+    EntireFileQuery(const PacBio::BAM::DataSet& dataset);
+    ~EntireFileQuery() override;
+
+public:
+    /// \brief Main iteration point for record access.
+    ///
+    /// Most client code should not need to use this method directly. Use
+    /// iterators instead.
+    ///
+    bool GetNext(BamRecord& r) override;
+
+private:
+    class EntireFileQueryPrivate;
+    std::unique_ptr<EntireFileQueryPrivate> d_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // ENTIREFILEQUERY_H
diff --git a/include/pbbam/FaiIndex.h b/include/pbbam/FaiIndex.h

new file mode 100644 (file)

index 0000000..a830058
--- /dev/null
+++ b/include/pbbam/FaiIndex.h
@@ -0,0 +1,101 @@
+// File Description
+/// \file FaiIndex.h
+/// \brief Defines the FaiIndex class.
+//
+// Author: Derek Barnett
+
+#ifndef FAIINDEX_H
+#define FAIINDEX_H
+
+#include "pbbam/Config.h"
+
+#include <cstdint>
+#include <iosfwd>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace PacBio {
+namespace BAM {
+
+struct FaiEntry
+{
+    /// Total length of this reference sequence, in bases
+    uint64_t Length = 0;
+
+    /// Offset in the FASTA/FASTQ file of this sequence's first base
+    uint64_t SeqOffset = 0;
+
+    /// The number of bases on each line
+    uint16_t NumBases = 0;
+
+    /// The number of bytes in each line, including the newline (allows for Windows newlines)
+    uint16_t NumBytes = 0;
+
+    /// Offset of sequence's first quality within the FASTQ file (-1 if FASTA only)
+    int64_t QualOffset = -1;
+};
+
+class FaiIndex
+{
+public:
+    ///
+    /// \brief Load FAI data from \p fn (*.fai)
+    ///
+    explicit FaiIndex(const std::string& fn);
+
+    FaiIndex();
+    FaiIndex(FaiIndex&&) noexcept;
+    FaiIndex& operator=(FaiIndex&&) noexcept;
+    ~FaiIndex();
+
+    ///
+    /// \brief Add new FAI line to index
+    ///
+    void Add(std::string name, FaiEntry entry);
+
+    ///
+    /// \returns FAI entry for sequence name
+    ///
+    const FaiEntry& Entry(const std::string& name) const;
+
+    ///
+    /// \returns FAI entry at \p row
+    ///
+    const FaiEntry& Entry(const uint32_t row) const;
+
+    ///
+    /// \returns true if sequence name found in index
+    ///
+    bool HasEntry(const std::string& name) const;
+
+    ///
+    /// \returns sequence names in FAI file
+    ///
+    const std::vector<std::string>& Names() const;
+
+    ///
+    /// \brief Save FAI data to file
+    ///
+    void Save(const std::string& fn) const;
+
+    ///
+    /// \brief Save FAI data to output stream
+    ///
+    void Save(std::ostream& out) const;
+
+private:
+    class FaiIndexPrivate;
+    std::unique_ptr<FaiIndexPrivate> d_;
+};
+
+bool operator==(const FaiEntry& lhs, const FaiEntry& rhs);
+
+// NOTE: FaiEntry output *does not* include the name column, FaiIndex::Save()
+//       handles this mapping
+std::ostream& operator<<(std::ostream& out, const FaiEntry& entry);
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // FAIINDEX_H
diff --git a/include/pbbam/FastaCache.h b/include/pbbam/FastaCache.h

new file mode 100644 (file)

index 0000000..ab91a6f
--- /dev/null
+++ b/include/pbbam/FastaCache.h
@@ -0,0 +1,60 @@
+// File Description
+/// \file FastaCache.h
+/// \brief Defines the FastaCache
+//
+// Author: Derek Barnett
+
+#ifndef FASTACACHE_H
+#define FASTACACHE_H
+
+#include "pbbam/Config.h"
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "pbbam/FastaSequence.h"
+
+namespace PacBio {
+namespace BAM {
+
+class FastaCacheData
+{
+public:
+    explicit FastaCacheData(const std::string& filename);
+
+    /// \brief Fetches FASTA sequence for desired interval.
+    ///
+    /// \param[in] name     reference sequence name
+    /// \param[in] begin    start position
+    /// \param[in] end      end position
+    ///
+    /// \returns sequence string at desired interval
+    ///
+    /// \throws std::runtime_error on failure to fetch sequence
+    ///
+    std::string Subsequence(const std::string& name, size_t begin, size_t end) const;
+
+    /// \returns the names of all sequences stored in the FASTA file
+    std::vector<std::string> Names() const;
+
+    /// \returns length of FASTA sequence
+    ///
+    /// \throws std::runtime_error if name is unknown
+    ///
+    size_t SequenceLength(const std::string& name) const;
+
+private:
+    std::vector<FastaSequence> cache_;
+    std::unordered_map<std::string, size_t> lookup_;
+};
+
+using FastaCache = std::shared_ptr<FastaCacheData>;
+
+FastaCache MakeFastaCache(const std::string& filename);
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // FASTACACHE_H
diff --git a/include/pbbam/FastaReader.h b/include/pbbam/FastaReader.h

new file mode 100644 (file)

index 0000000..daa2c30
--- /dev/null
+++ b/include/pbbam/FastaReader.h
@@ -0,0 +1,85 @@
+// File Description
+/// \file FastaReader.h
+/// \brief Defines the FastaReader class.
+//
+// Author: Derek Barnett
+
+#ifndef FASTAREADER_H
+#define FASTAREADER_H
+
+#include "pbbam/Config.h"
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "pbbam/FastaSequence.h"
+
+#include "pbbam/internal/QueryBase.h"
+
+namespace PacBio {
+namespace BAM {
+
+///
+/// \brief The FastaReader provides sequential access to FASTA records.
+///
+class FastaReader : public internal::QueryBase<FastaSequence>
+{
+public:
+    ///
+    /// \brief Reads all FASTA sequences from a file
+    ///
+    /// \param fn   FASTA filename
+    /// \return vector of FastaSequence results
+    ///
+    static std::vector<FastaSequence> ReadAll(const std::string& fn);
+
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    explicit FastaReader(const std::string& fn);
+
+    FastaReader(FastaReader&&) noexcept;
+    FastaReader& operator=(FastaReader&&) noexcept;
+    ~FastaReader() override;
+
+    /// \}
+
+public:
+    ///
+    /// \brief Main iteration point for record access (QueryBase override).
+    ///
+    /// Allows iteration with range-for:
+    /// \code{cpp}
+    ///
+    /// FastaReader reader{fn};
+    /// for (const FastaSequence& seq : reader) {
+    ///     // do stuff with seq
+    /// }
+    /// \endcode
+    ///
+    /// or you can iterate 'manually':
+    /// \code{cpp}
+    ///
+    /// FastaReader reader{fn};
+    /// FastaSequence seq;
+    /// while (reader.GetNext(seq)) {
+    ///     // do stuff with seq
+    /// }
+    /// \endcode
+    ///
+    /// \param[out] record
+    /// \return success/failure
+    ///
+    bool GetNext(FastaSequence& record) override;
+
+private:
+    class FastaReaderPrivate;
+    std::unique_ptr<FastaReaderPrivate> d_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // FASTAREADER_H
diff --git a/include/pbbam/FastaSequence.h b/include/pbbam/FastaSequence.h

new file mode 100644 (file)

index 0000000..ef769dd
--- /dev/null
+++ b/include/pbbam/FastaSequence.h
@@ -0,0 +1,82 @@
+// File Description
+/// \file FastaSequence.h
+/// \brief Defines the FastaSequence class.
+//
+// Author: Derek Barnett
+
+#ifndef FASTASEQUENCE_H
+#define FASTASEQUENCE_H
+
+#include "pbbam/Config.h"
+
+#include <string>
+
+namespace PacBio {
+namespace BAM {
+
+///
+/// \brief The FastaSequence class represents a FASTA record (name & bases)
+///
+class FastaSequence
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    ///
+    /// \brief FastaSequence
+    /// \param name
+    /// \param bases
+    ///
+    explicit FastaSequence(std::string name, std::string bases);
+
+    FastaSequence() = default;
+
+    /// \}
+
+public:
+    /// \name Attributes
+    /// \{
+
+    ///
+    /// \brief Name
+    /// \return
+    ///
+    const std::string& Name() const;
+
+    ///
+    /// \brief
+    ///
+    /// \param name
+    /// \return FastaSequence&
+    ///
+    FastaSequence& Name(std::string name);
+
+    ///
+    /// \brief Bases
+    /// \return
+    ///
+    const std::string& Bases() const;
+
+    ///
+    /// \brief
+    ///
+    /// \param bases
+    /// \return FastaSequence&
+    ///
+    FastaSequence& Bases(std::string bases);
+
+    /// \}
+
+    bool operator==(const FastaSequence& other) const;
+    bool operator!=(const FastaSequence& other) const;
+
+private:
+    std::string name_;
+    std::string bases_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // FASTASEQUENCE_H
diff --git a/include/pbbam/FastaSequenceQuery.h b/include/pbbam/FastaSequenceQuery.h

new file mode 100644 (file)

index 0000000..0886c6b
--- /dev/null
+++ b/include/pbbam/FastaSequenceQuery.h
@@ -0,0 +1,52 @@
+// File Description
+/// \file FastaSequenceQuery.h
+/// \brief Defines the FastaSequenceQuery class.
+//
+// Author: Derek Barnett
+
+#ifndef FASTASEQUENCEQUERY_H
+#define FASTASEQUENCEQUERY_H
+
+#include "pbbam/Config.h"
+
+#include <memory>
+#include <string>
+
+#include "pbbam/DataSet.h"
+#include "pbbam/FastaSequence.h"
+#include "pbbam/internal/QueryBase.h"
+
+namespace PacBio {
+namespace BAM {
+
+///
+/// \brief The FastaSequenceQuery class provides access to the FastaSequence records of a DataSet.
+///
+class FastaSequenceQuery : public internal::QueryBase<FastaSequence>
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    FastaSequenceQuery(const PacBio::BAM::DataSet& dataset);
+    ~FastaSequenceQuery() override;
+
+    /// \}
+
+public:
+    /// \brief Main iteration point for record access.
+    ///
+    /// Most client code should not need to use this method directly. Use
+    /// iterators instead.
+    ///
+    bool GetNext(FastaSequence& seq) override;
+
+private:
+    class FastaSequenceQueryPrivate;
+    std::unique_ptr<FastaSequenceQueryPrivate> d_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // FASTASEQUENCEQUERY_H
diff --git a/include/pbbam/FastaWriter.h b/include/pbbam/FastaWriter.h

new file mode 100644 (file)

index 0000000..3cbfd9f
--- /dev/null
+++ b/include/pbbam/FastaWriter.h
@@ -0,0 +1,47 @@
+// File Description
+/// \file FastaWriter.h
+/// \brief Defines the FastaWriter class.
+//
+// Author: Derek Barnett
+
+#ifndef FASTAWRITER_H
+#define FASTAWRITER_H
+
+#include "pbbam/Config.h"
+
+#include <fstream>
+#include <iostream>
+#include <string>
+
+#include "pbbam/IFastaWriter.h"
+
+namespace PacBio {
+namespace BAM {
+
+class BamRecord;
+class BamRecordImpl;
+class FastaSequence;
+
+class FastaWriter final : public IFastaWriter
+{
+public:
+    FastaWriter(const std::string& fn);
+
+public:
+    // IFastaWriter
+    void Write(const FastaSequence& fastq);
+    void Write(const std::string& name, const std::string& bases);
+
+    // IRecordWriter
+    void TryFlush();
+    void Write(const BamRecord& bam);
+    void Write(const BamRecordImpl& bam);
+
+private:
+    std::ofstream file_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // FASTAWRITER_H
diff --git a/include/pbbam/FastqReader.h b/include/pbbam/FastqReader.h

new file mode 100644 (file)

index 0000000..18c2c70
--- /dev/null
+++ b/include/pbbam/FastqReader.h
@@ -0,0 +1,85 @@
+// File Description
+/// \file FastqReader.h
+/// \brief Defines the FastqReader class.
+//
+// Author: Derek Barnett
+
+#ifndef FASTQREADER_H
+#define FASTQREADER_H
+
+#include "pbbam/Config.h"
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "pbbam/FastqSequence.h"
+
+#include "pbbam/internal/QueryBase.h"
+
+namespace PacBio {
+namespace BAM {
+
+///
+/// \brief The FastqReader provides sequential access to Fastq records.
+///
+class FastqReader : public internal::QueryBase<FastqSequence>
+{
+public:
+    ///
+    /// \brief Reads all Fastq sequences from a file
+    ///
+    /// \param fn   Fastq filename
+    /// \return vector of FastqSequence results
+    ///
+    static std::vector<FastqSequence> ReadAll(const std::string& fn);
+
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    explicit FastqReader(const std::string& fn);
+
+    FastqReader(FastqReader&&) noexcept;
+    FastqReader& operator=(FastqReader&&) noexcept;
+    ~FastqReader() override;
+
+    /// \}
+
+public:
+    ///
+    /// \brief Main iteration point for record access (QueryBase override).
+    ///
+    /// Allows iteration with range-for:
+    /// \code{cpp}
+    ///
+    /// FastqReader reader{fn};
+    /// for (const FastqSequence& seq : reader) {
+    ///     // do stuff with seq
+    /// }
+    /// \endcode
+    ///
+    /// or you can iterate 'manually':
+    /// \code{cpp}
+    ///
+    /// FastqReader reader{fn};
+    /// FastqSequence seq;
+    /// while (reader.GetNext(seq)) {
+    ///     // do stuff with seq
+    /// }
+    /// \endcode
+    ///
+    /// \param[out] record
+    /// \return success/failure
+    ///
+    bool GetNext(FastqSequence& record) override;
+
+private:
+    class FastqReaderPrivate;
+    std::unique_ptr<FastqReaderPrivate> d_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // FASTQREADER_H
diff --git a/include/pbbam/FastqSequence.h b/include/pbbam/FastqSequence.h

new file mode 100644 (file)

index 0000000..0f48d35
--- /dev/null
+++ b/include/pbbam/FastqSequence.h
@@ -0,0 +1,80 @@
+// File Description
+/// \file FastqSequence.h
+/// \brief Defines the FastqSequence class.
+//
+// Author: Derek Barnett
+
+#ifndef FASTQSEQUENCE_H
+#define FASTQSEQUENCE_H
+
+#include "pbbam/Config.h"
+
+#include <string>
+
+#include <pbbam/FastaSequence.h>
+#include <pbbam/QualityValues.h>
+
+namespace PacBio {
+namespace BAM {
+
+///
+/// \brief The FastqSequence class represents a FASTQ record (name, bases, and
+///        qualities)
+///
+class FastqSequence : public FastaSequence
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    ///
+    /// \brief FastqSequence
+    /// \param name
+    /// \param bases
+    /// \param qualities
+    ///
+    explicit FastqSequence(std::string name, std::string bases, QualityValues qualities);
+
+    ///
+    /// \brief FastqSequence
+    /// \param name
+    /// \param bases
+    /// \param qualities
+    ///
+    explicit FastqSequence(std::string name, std::string bases, std::string qualities);
+
+    FastqSequence() = default;
+
+    /// \}
+
+public:
+    /// \name Attributes
+    /// \{
+
+    ///
+    /// \brief Qualities
+    /// \return
+    ///
+    const QualityValues& Qualities() const;
+
+    ///
+    /// \brief
+    ///
+    /// \param quals
+    /// \return FastqSequence&
+    ///
+    FastqSequence& Qualities(QualityValues quals);
+
+    /// \}
+
+    bool operator==(const FastqSequence& other) const;
+    bool operator!=(const FastqSequence& other) const;
+
+private:
+    QualityValues qualities_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // FASTQSEQUENCE_H
diff --git a/include/pbbam/FastqWriter.h b/include/pbbam/FastqWriter.h

new file mode 100644 (file)

index 0000000..cb38d74
--- /dev/null
+++ b/include/pbbam/FastqWriter.h
@@ -0,0 +1,48 @@
+// File Description
+/// \file FastqWriter.h
+/// \brief Defines the FastqWriter class.
+//
+// Author: Derek Barnett
+
+#ifndef FASTQWRITER_H
+#define FASTQWRITER_H
+
+#include "pbbam/Config.h"
+
+#include <fstream>
+#include <iostream>
+#include <string>
+
+#include "pbbam/IFastqWriter.h"
+
+namespace PacBio {
+namespace BAM {
+
+class BamRecord;
+class BamRecordImpl;
+class FastqSequence;
+
+class FastqWriter final : public IFastqWriter
+{
+public:
+    FastqWriter(const std::string& fn);
+
+public:
+    // IFastqWriter
+    void Write(const FastqSequence& fastq);
+    void Write(const std::string& name, const std::string& bases, const QualityValues& quals);
+    void Write(const std::string& name, const std::string& bases, const std::string& quals);
+
+    // IRecordWriter
+    void TryFlush();
+    void Write(const BamRecord& bam);
+    void Write(const BamRecordImpl& bam);
+
+private:
+    std::ofstream file_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // FASTQWRITER_H
diff --git a/include/pbbam/FormatUtils.h b/include/pbbam/FormatUtils.h

new file mode 100644 (file)

index 0000000..e301320
--- /dev/null
+++ b/include/pbbam/FormatUtils.h
@@ -0,0 +1,44 @@
+// Author: Derek Barnett
+
+#ifndef FORMAT_UTILS_H
+#define FORMAT_UTILS_H
+
+#include "pbbam/Config.h"
+
+#include <string>
+#include <vector>
+
+#include <htslib/bgzf.h>
+
+namespace PacBio {
+namespace BAM {
+
+enum class HtslibCompression
+{
+    NONE,
+    GZIP,
+    BGZIP
+};
+
+class FormatUtils
+{
+public:
+    static const std::vector<std::string>& BedExtensions();
+    static const std::vector<std::string>& FastaExtensions();
+    static const std::vector<std::string>& FastqExtensions();
+
+    static bool IsBedFilename(const std::string& fn);
+    static bool IsFastaFilename(const std::string& fn);
+    static bool IsFastqFilename(const std::string& fn);
+
+    static HtslibCompression CompressionType(BGZF* fp);
+    static HtslibCompression CompressionType(const std::string& fn);
+
+private:
+    static bool IsFormat(const std::string& fn, const std::vector<std::string>& extensions);
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // FORMAT_UTILS_H
diff --git a/include/pbbam/FrameEncodingType.h b/include/pbbam/FrameEncodingType.h

new file mode 100644 (file)

index 0000000..f68810f
--- /dev/null
+++ b/include/pbbam/FrameEncodingType.h
@@ -0,0 +1,33 @@
+// File Description
+/// \file FrameEncodingType.h
+/// \brief Defines the FrameEncodingType enum.
+//
+// Author: Derek Barnett
+
+#ifndef FRAMEENCODINGTYPE_H
+#define FRAMEENCODINGTYPE_H
+
+#include "pbbam/Config.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief This enum defines the possible encoding modes used in Frames data
+/// (e.g. BamRecord::IPD or BamRecord::PulseWidth).
+///
+/// The LOSSY mode is the default in production output; LOSSLESS mode
+/// being used primarily for internal applications.
+///
+/// \sa https://github.com/PacificBiosciences/PacBioFileFormats/blob/3.0/BAM.rst
+///     for more information on pulse frame encoding schemes.
+///
+enum class FrameEncodingType
+{
+    LOSSY,    ///< 8-bit compression (using CodecV1) of frame data
+    LOSSLESS  ///< 16-bit native frame data
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // FRAMEENCODINGTYPE_H
diff --git a/include/pbbam/Frames.h b/include/pbbam/Frames.h

new file mode 100644 (file)

index 0000000..eb1a610
--- /dev/null
+++ b/include/pbbam/Frames.h
@@ -0,0 +1,26 @@
+// File Description
+/// \file Frames.h
+/// \brief Defines the Frames class.
+//
+// Author: Derek Barnett
+
+#ifndef FRAMES_H
+#define FRAMES_H
+
+#include "pbbam/Config.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+
+#include <pbcopper/data/Frames.h>
+
+namespace PacBio {
+namespace BAM {
+
+using Frames PBBAM_DEPRECATED = PacBio::Data::Frames;
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // FRAMES_H
diff --git a/include/pbbam/GenomicInterval.h b/include/pbbam/GenomicInterval.h

new file mode 100644 (file)

index 0000000..b8d493f
--- /dev/null
+++ b/include/pbbam/GenomicInterval.h
@@ -0,0 +1,22 @@
+// File Description
+/// \file GenomicInterval.h
+/// \brief Defines the GenomicInterval class.
+//
+// Author: Derek Barnett
+
+#ifndef GENOMICINTERVAL_H
+#define GENOMICINTERVAL_H
+
+#include "pbbam/Config.h"
+
+#include <pbcopper/data/GenomicInterval.h>
+
+namespace PacBio {
+namespace BAM {
+
+using GenomicInterval PBBAM_DEPRECATED = Data::GenomicInterval;
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // GENOMICINTERVAL_H
diff --git a/include/pbbam/GenomicIntervalQuery.h b/include/pbbam/GenomicIntervalQuery.h

new file mode 100644 (file)

index 0000000..ccaa508
--- /dev/null
+++ b/include/pbbam/GenomicIntervalQuery.h
@@ -0,0 +1,103 @@
+// File Description
+/// \file GenomicIntervalQuery.h
+/// \brief Defines the GenomicIntervalQuery class.
+//
+// Author: Derek Barnett
+
+#ifndef GENOMICINTERVALQUERY_H
+#define GENOMICINTERVALQUERY_H
+
+#include "pbbam/Config.h"
+
+#include <memory>
+
+#include "pbbam/BaiIndexCache.h"
+#include "pbbam/internal/QueryBase.h"
+
+namespace PacBio {
+namespace Data {
+
+class GenomicInterval;
+}
+
+namespace BAM {
+
+class BamRecord;
+class DataSet;
+
+/// \brief The GenomicIntervalQuery class provides iterable access to a
+///        DataSet's %BAM records, limiting results to those overlapping a
+///        GenomicInterval.
+///
+/// Example:
+/// \include code/GenomicIntervalQuery.txt
+///
+/// \note Currently, all %BAM files must have a corresponding ".bai" index file.
+///       Use BamFile::EnsureStandardIndexExists before creating the query if
+///       one may not be present.
+///
+class PBBAM_EXPORT GenomicIntervalQuery : public internal::IQuery
+{
+public:
+    /// \brief Constructs a new GenomicIntervalQuery, that can be used to retrieve
+    ///        only those records overlapping a GenomicInterval.
+    ///
+    /// \note Using this constructor means that an interval must be provided, via
+    ///       query.Interval(i), before iterating.
+    ///
+    /// \param[in] dataset  input data source(s)
+    ///
+    /// \throws std::runtime_error on failure to open/read underlying %BAM or
+    ///         BAI files.
+    ///
+    GenomicIntervalQuery(const DataSet& dataset);
+    GenomicIntervalQuery(const DataSet& dataset, const BaiIndexCache& cache);
+
+    /// \brief Constructs a new GenomicIntervalQuery, limiting record results to
+    ///        only those overlapping a GenomicInterval.
+    ///
+    /// \param[in] interval genomic interval of interest
+    /// \param[in] dataset  input data source(s)
+    ///
+    /// \throws std::runtime_error on failure to open/read underlying %BAM or
+    ///         BAI files.
+    ///
+    GenomicIntervalQuery(const Data::GenomicInterval& interval, const DataSet& dataset);
+    GenomicIntervalQuery(const Data::GenomicInterval& interval, const DataSet& dataset,
+                         const BaiIndexCache& cache);
+
+    ~GenomicIntervalQuery() override;
+
+public:
+    /// \brief Main iteration point for record access.
+    ///
+    /// Most client code should not need to use this method directly. Use
+    /// iterators instead.
+    ///
+    bool GetNext(BamRecord& r) override;
+
+public:
+    /// \brief Sets a new genomic interval.
+    ///
+    /// This allows the same dataset/query to be re-used over multiple regions of
+    /// interest:
+    ///
+    /// \include code/GenomicIntervalQuery_Reuse.txt
+    ///
+    /// \param[in] interval new genomic interval
+    /// \returns reference to this query
+    ///
+    GenomicIntervalQuery& Interval(const Data::GenomicInterval& interval);
+
+    /// \returns Current genomic interval active on this query.
+    const Data::GenomicInterval& Interval() const;
+
+private:
+    class GenomicIntervalQueryPrivate;  // implementation type, defined outside this header
+    std::unique_ptr<GenomicIntervalQueryPrivate> d_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // GENOMICINTERVALQUERY_H
diff --git a/include/pbbam/IFastaWriter.h b/include/pbbam/IFastaWriter.h

new file mode 100644 (file)

index 0000000..2573be3
--- /dev/null
+++ b/include/pbbam/IFastaWriter.h
@@ -0,0 +1,39 @@
+// File Description
+/// \file IFastaWriter.h
+/// \brief Defines the IFastaWriter interface.
+//
+// Author: Derek Barnett
+
+#ifndef IFASTAWRITER_H
+#define IFASTAWRITER_H
+
+#include "pbbam/Config.h"
+
+#include <string>
+
+#include "pbbam/IRecordWriter.h"
+
+namespace PacBio {
+namespace BAM {
+
+class FastaSequence;
+
+class IFastaWriter : public IRecordWriter  // abstract interface: adds FASTA-specific Write overloads
+{
+public:
+    virtual ~IFastaWriter();
+
+public:
+    using IRecordWriter::Write;  // keep the inherited IRecordWriter::Write overloads visible
+
+    virtual void Write(const FastaSequence& fasta) = 0;
+    virtual void Write(const std::string& name, const std::string& bases) = 0;
+
+protected:
+    IFastaWriter();  // protected: constructible only from derived classes
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // IFASTAWRITER_H
diff --git a/include/pbbam/IFastqWriter.h b/include/pbbam/IFastqWriter.h

new file mode 100644 (file)

index 0000000..3cc8167
--- /dev/null
+++ b/include/pbbam/IFastqWriter.h
@@ -0,0 +1,43 @@
+// File Description
+/// \file IFastqWriter.h
+/// \brief Defines the IFastqWriter interface.
+//
+// Author: Derek Barnett
+
+#ifndef IFASTQWRITER_H
+#define IFASTQWRITER_H
+
+#include "pbbam/Config.h"
+
+#include <string>
+
+#include "pbbam/IRecordWriter.h"
+#include "pbbam/QualityValues.h"
+
+namespace PacBio {
+namespace BAM {
+
+class FastqSequence;
+
+class IFastqWriter : public IRecordWriter  // abstract interface: adds FASTQ-specific Write overloads
+{
+public:
+    virtual ~IFastqWriter();
+
+public:
+    using IRecordWriter::Write;  // keep the inherited IRecordWriter::Write overloads visible
+
+    virtual void Write(const FastqSequence& fastq) = 0;
+    virtual void Write(const std::string& name, const std::string& bases,
+                       const QualityValues& quals) = 0;  // qualities given as QualityValues
+    virtual void Write(const std::string& name, const std::string& bases,
+                       const std::string& quals) = 0;  // qualities given as a string
+
+protected:
+    IFastqWriter();  // protected: constructible only from derived classes
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // IFASTQWRITER_H
diff --git a/include/pbbam/IRecordWriter.h b/include/pbbam/IRecordWriter.h

new file mode 100644 (file)

index 0000000..fbbcf69
--- /dev/null
+++ b/include/pbbam/IRecordWriter.h
@@ -0,0 +1,57 @@
+// File Description
+/// \file IRecordWriter.h
+/// \brief Defines the IRecordWriter interface.
+//
+// Author: Derek Barnett
+
+#ifndef IRECORDWRITER_H
+#define IRECORDWRITER_H
+
+#include "pbbam/Config.h"
+
+namespace PacBio {
+namespace BAM {
+
+class BamRecord;
+class BamRecordImpl;
+
+class IRecordWriter  // abstract base interface for record writers
+{
+public:
+    virtual ~IRecordWriter();
+
+public:
+    /// \brief Try to flush any buffered data to file.
+    ///
+    /// \note The underlying implementation may not necessarily flush buffered
+    ///       data immediately, especially in a multithreaded writer situation.
+    ///       Let the writer go out of scope to fully ensure flushing.
+    ///
+    /// \throws std::runtime_error if flush fails
+    ///
+    virtual void TryFlush() = 0;
+
+    /// \brief Write a record to the writer's output.
+    ///
+    /// \param[in] record BamRecord object
+    ///
+    /// \throws std::runtime_error on failure to write
+    ///
+    virtual void Write(const BamRecord& record) = 0;
+
+    /// \brief Write a record to the writer's output.
+    ///
+    /// \param[in] recordImpl BamRecordImpl object
+    ///
+    /// \throws std::runtime_error on failure to write
+    ///
+    virtual void Write(const BamRecordImpl& recordImpl) = 0;
+
+protected:
+    IRecordWriter();  // protected: constructible only from derived classes
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // IRECORDWRITER_H
diff --git a/include/pbbam/IndexedBamWriter.h b/include/pbbam/IndexedBamWriter.h

new file mode 100644 (file)

index 0000000..991db8e
--- /dev/null
+++ b/include/pbbam/IndexedBamWriter.h
@@ -0,0 +1,155 @@
+// File Description
+/// \file IndexedBamWriter.h
+/// \brief Defines the IndexedBamWriter class.
+//
+// Author: Derek Barnett
+
+#ifndef INDEXEDBAMWRITER_H
+#define INDEXEDBAMWRITER_H
+
+#include "pbbam/Config.h"
+
+#include <memory>
+#include <string>
+
+#include "pbbam/BamWriter.h"
+#include "pbbam/IRecordWriter.h"
+#include "pbbam/PbiBuilder.h"
+
+namespace PacBio {
+namespace BAM {
+
+class BamHeader;
+class BamRecord;
+class BamRecordImpl;
+
+struct IndexedBamWriterConfig  // bundles the arguments of IndexedBamWriter's constructor
+{
+    std::string outputFilename;
+    BamHeader header;
+
+    BamWriter::CompressionLevel bamCompressionLevel = BamWriter::DefaultCompression;
+    PbiBuilder::CompressionLevel pbiCompressionLevel = PbiBuilder::DefaultCompression;
+
+    // Number of threads used while writing to BAM file
+    size_t numBamThreads = 4;
+    // Number of threads used while writing to pbi file
+    size_t numPbiThreads = 4;
+    // Number of threads used while doing a trailing read of the BAM file being
+    // written (to help compute indexes)
+    size_t numGziThreads = 4;
+
+    // Max size in memory (in bytes) for temporary files before flushing to disk.
+    size_t tempFileBufferSize = 0x10000;
+};
+///
+/// \brief The IndexedBamWriter class writes records to a %BAM file; per its
+///        parameters it also produces a compressed PBI index as output.
+///
+/// A trailing reader re-reads the %BAM file being written to help compute
+/// the index (see numGziThreads). Construct either from explicit arguments
+/// or from an IndexedBamWriterConfig.
+///
+class IndexedBamWriter : public IRecordWriter
+{
+public:
+    ///
+    /// \brief IndexedBamWriter
+    ///
+    /// \param[in] outputFilename       path to output %BAM file
+    /// \param[in] header               BAM file header
+    ///
+    /// \param[in] bamCompressionLevel  zlib compression level for output BAM
+    /// \param[in] numBamThreads        number of threads for BAM compression.
+    ///                                 If set to 0, the writer will attempt to
+    ///                                 determine a reasonable estimate. If set
+    ///                                 to 1, this will force single-threaded
+    ///                                 execution. No checks are made against an
+    ///                                 upper limit.
+    ///
+    /// \param[in] pbiCompressionLevel  zlib compression level for output PBI
+    /// \param[in] numPbiThreads        number of threads for PBI compression.
+    ///                                 If set to 0, the writer will attempt to
+    ///                                 determine a reasonable estimate. If set
+    ///                                 to 1, this will force single-threaded
+    ///                                 execution. No checks are made against an
+    ///                                 upper limit.
+    /// \param[in] numGziThreads        number of threads used by the trailing
+    ///                                 reader process used to help compute indexes.
+    ///                                 If set to 0, the writer will attempt to
+    ///                                 determine a reasonable estimate. If set
+    ///                                 to 1, this will force single-threaded
+    ///                                 execution. No checks are made against an
+    ///                                 upper limit.
+    /// \param[in] tempFileBufferSize   Maximum number of bytes various temporary
+    ///                                 files can use before they flush to disk.
+    ///                                 Larger numbers require more resources but
+    ///                                 may increase disk IO efficiency.
+    ///
+    /// \throws std::runtime_error if there was a problem
+    ///
+    IndexedBamWriter(
+        const std::string& outputFilename, const BamHeader& header,
+        const BamWriter::CompressionLevel bamCompressionLevel = BamWriter::DefaultCompression,
+        const size_t numBamThreads = 4,
+        const PbiBuilder::CompressionLevel pbiCompressionLevel = PbiBuilder::DefaultCompression,
+        const size_t numPbiThreads = 4, const size_t numGziThreads = 4,
+        const size_t tempFileBufferSize = 0x10000);
+
+    /// \brief IndexedBamWriter
+    ///
+    /// \param[in] config  Struct containing all the parameters used to construct
+    ///                    this object.  See documentation for other constructor
+    ///                    for more details
+    IndexedBamWriter(const IndexedBamWriterConfig& config)
+        : IndexedBamWriter(config.outputFilename, config.header, config.bamCompressionLevel,
+                           config.numBamThreads, config.pbiCompressionLevel, config.numPbiThreads,
+                           config.numGziThreads, config.tempFileBufferSize)
+    {
+    }
+
+    IndexedBamWriter(IndexedBamWriter&&) noexcept;
+    IndexedBamWriter& operator=(IndexedBamWriter&&) noexcept;
+    ~IndexedBamWriter();
+
+public:
+    ///
+    /// \brief TryFlush
+    ///
+    void TryFlush() override;
+
+    ///
+    /// \brief Write
+    ///
+    /// \param[in] record
+    ///
+    void Write(const BamRecord& record) override;
+
+    ///
+    /// \brief Write
+    ///
+    /// \param[in] record
+    ///
+    void Write(const BamRecordImpl& record) override;
+
+    /// \brief MaxReaderLag
+    ///
+    /// Allows calling code to monitor how far (in bytes) the
+    /// reader thread trails behind the tip of the BAM file
+    /// currently being written.  May be useful for diagnosing
+    /// performance issues if the reader thread falls far enough behind
+    /// that caching is insufficient to prevent an IO hit from the
+    /// extra read operations.
+    ///
+    /// Note: Returns a "high water mark", not a current value.
+    size_t MaxReaderLag() const;
+
+private:
+    class IndexedBamWriterPrivate2;  // implementation type, defined outside this header
+    std::unique_ptr<IndexedBamWriterPrivate2> d_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // INDEXEDBAMWRITER_H
diff --git a/include/pbbam/IndexedFastaReader.h b/include/pbbam/IndexedFastaReader.h

new file mode 100644 (file)

index 0000000..7f9ac8d
--- /dev/null
+++ b/include/pbbam/IndexedFastaReader.h
@@ -0,0 +1,147 @@
+// File Description
+/// \file IndexedFastaReader.h
+/// \brief Defines the IndexedFastaReader class.
+//
+// Author: David Alexander
+
+#ifndef INDEXEDFASTAREADER_H
+#define INDEXEDFASTAREADER_H
+
+#include "pbbam/Config.h"
+
+#include <cstddef>
+#include <iostream>
+#include <memory>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include "pbbam/Orientation.h"
+#include "pbbam/Position.h"
+
+namespace PacBio {
+namespace Data {
+
+class GenomicInterval;
+}
+
+namespace BAM {
+
+class BamRecord;
+
+/// \brief The IndexedFastaReader class provides random-access to FASTA file
+///        data.
+///
+class IndexedFastaReader
+{
+
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    explicit IndexedFastaReader(std::string filename);
+
+    IndexedFastaReader(const IndexedFastaReader&);
+    IndexedFastaReader(IndexedFastaReader&&) noexcept;
+    IndexedFastaReader& operator=(const IndexedFastaReader&);
+    IndexedFastaReader& operator=(IndexedFastaReader&&) noexcept;
+    ~IndexedFastaReader();
+
+    /// \}
+
+public:
+    /// \name Sequence Access
+    /// \{
+
+    /// \brief Fetches FASTA sequence for desired interval.
+    ///
+    /// \param[in] id       reference sequence name
+    /// \param[in] begin    start position
+    /// \param[in] end      end position
+    ///
+    /// \returns sequence string at desired interval
+    ///
+    /// \throws std::runtime_error on failure to fetch sequence
+    ///
+    std::string Subsequence(const std::string& id, Position begin, Position end) const;
+
+    /// \brief Fetches FASTA sequence for desired interval.
+    ///
+    /// \param[in] interval desired interval
+    ///
+    /// \returns sequence string at desired interval
+    ///
+    /// \throws std::runtime_error on failure to fetch sequence
+    ///
+    std::string Subsequence(const Data::GenomicInterval& interval) const;
+
+    /// \brief Fetches FASTA sequence for desired interval.
+    ///
+    /// \param[in] htslibRegion htslib/samtools-formatted REGION string
+    ///                         representing the desired interval
+    ///
+    /// \returns sequence string at desired interval
+    ///
+    /// \throws std::runtime_error on failure to fetch sequence
+    ///
+    std::string Subsequence(const char* htslibRegion) const;
+
+    /// \brief Fetches FASTA sequence corresponding to a BamRecord, oriented and
+    ///        gapped as requested.
+    ///
+    /// For example, "native" orientation and "gapped" will return the reference
+    /// sequence with gaps inserted, as would align against the read in "native"
+    /// orientation.
+    ///
+    /// \param[in] bamRecord        input BamRecord to derive interval/CIGAR
+    ///                             data
+    /// \param[in] orientation      orientation of output
+    /// \param[in] gapped           if true, gaps/padding will be inserted, per
+    ///                             record's CIGAR info.
+    /// \param[in] exciseSoftClips  if true, any soft-clipped positions will be
+    ///                             removed from query ends
+    ///
+    /// \returns sequence string over the record's interval
+    ///
+    /// \throws std::runtime_error on failure to fetch sequence
+    ///
+    std::string ReferenceSubsequence(const BamRecord& bamRecord,
+                                     const Orientation orientation = Orientation::GENOMIC,
+                                     const bool gapped = false,
+                                     const bool exciseSoftClips = false) const;
+
+    /// \}
+
+public:
+    /// \name File Attributes
+    /// \{
+
+    /// \returns true if FASTA file contains a sequence matching \p name
+    bool HasSequence(const std::string& name) const;
+
+    /// \returns the name of the sequence at a specific index in the FASTA file
+    std::string Name(const size_t idx) const;
+
+    /// \returns the names of all sequences stored in the FASTA file
+    std::vector<std::string> Names() const;
+
+    /// \returns number of sequences stored in FASTA file
+    int NumSequences() const;
+
+    /// \returns length of FASTA sequence
+    ///
+    /// \throws std::runtime_error if length could not be determined
+    ///
+    int SequenceLength(const std::string& name) const;
+
+    /// \}
+
+private:
+    class IndexedFastaReaderPrivate;  // implementation type, defined outside this header
+    std::unique_ptr<IndexedFastaReaderPrivate> d_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // INDEXEDFASTAREADER_H
diff --git a/include/pbbam/IndexedFastqReader.h b/include/pbbam/IndexedFastqReader.h

new file mode 100644 (file)

index 0000000..e4fc6e4
--- /dev/null
+++ b/include/pbbam/IndexedFastqReader.h
@@ -0,0 +1,141 @@
+// File Description
+/// \file IndexedFastqReader.h
+/// \brief Defines the IndexedFastqReader class.
+//
+// Author: Derek Barnett
+
+#ifndef INDEXEDFASTQREADER_H
+#define INDEXEDFASTQREADER_H
+
+#include "pbbam/Config.h"
+
+#include <cstddef>
+#include <iostream>
+#include <memory>
+#include <stdexcept>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "pbbam/FastqReader.h"
+#include "pbbam/Orientation.h"
+#include "pbbam/Position.h"
+#include "pbbam/QualityValues.h"
+
+#include "internal/QueryBase.h"
+
+namespace PacBio {
+namespace Data {
+
+class GenomicInterval;
+}
+
+namespace BAM {
+
+class BamRecord;
+class IndexedFastqReaderImpl;
+
+/// \brief The IndexedFastqReader class provides random-access to FASTQ file
+///        data.
+///
+class IndexedFastqReader
+{
+
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    explicit IndexedFastqReader(std::string filename);
+
+    IndexedFastqReader(const IndexedFastqReader&);
+    IndexedFastqReader(IndexedFastqReader&&) noexcept;
+    IndexedFastqReader& operator=(const IndexedFastqReader& rhs);
+    IndexedFastqReader& operator=(IndexedFastqReader&&) noexcept;
+    ~IndexedFastqReader();
+
+    /// \}
+
+public:
+    /// \name Sequence Access
+    /// \{
+
+    /// \brief Fetches sequence & qualities for desired interval.
+    ///
+    /// \param[in] id       reference sequence name
+    /// \param[in] start    start position
+    /// \param[in] end      end position
+    ///
+    /// \returns sequence/QV pair for desired interval
+    ///
+    /// \throws std::runtime_error on failure to fetch data
+    ///
+    std::pair<std::string, QualityValues> Subsequence(const std::string& id, Position start,
+                                                      Position end);
+
+    /// \brief Fetches sequence & qualities for desired interval.
+    ///
+    /// \param[in] interval desired interval
+    ///
+    /// \returns sequence/QV pair for desired interval
+    ///
+    /// \throws std::runtime_error on failure to fetch data
+    ///
+    std::pair<std::string, QualityValues> Subsequence(const Data::GenomicInterval& interval);
+
+    /// \brief Fetches sequence & qualities corresponding to a BamRecord, oriented and
+    ///        gapped as requested.
+    ///
+    /// For example, "native" orientation and "gapped" will return the reference
+    /// sequence with gaps inserted, as would align against the read in "native"
+    /// orientation.
+    ///
+    /// \param[in] bamRecord        input BamRecord to derive interval/CIGAR
+    ///                             data
+    /// \param[in] orientation      orientation of output
+    /// \param[in] gapped           if true, gaps/padding will be inserted, per
+    ///                             record's CIGAR info.
+    /// \param[in] exciseSoftClips  if true, any soft-clipped positions will be
+    ///                             removed from query ends
+    ///
+    /// \returns sequence/QV pair over the record's interval
+    ///
+    /// \throws std::runtime_error on failure to fetch data
+    ///
+    std::pair<std::string, QualityValues> ReferenceSubsequence(
+        const BamRecord& bamRecord, const Orientation orientation = Orientation::GENOMIC,
+        const bool gapped = false, const bool exciseSoftClips = false);
+
+    /// \}
+
+public:
+    /// \name File Attributes
+    /// \{
+
+    /// \returns true if FASTQ file contains a sequence matching \p name
+    bool HasSequence(const std::string& name) const;
+
+    /// \returns the name of the sequence at a specific index in the FASTQ file
+    std::string Name(const size_t idx) const;
+
+    /// \returns the names of all sequences stored in the FASTQ file
+    std::vector<std::string> Names() const;
+
+    /// \returns number of sequences stored in FASTQ file
+    int NumSequences() const;
+
+    /// \returns length of FASTQ sequence
+    ///
+    /// \throws std::runtime_error if length could not be determined
+    ///
+    int SequenceLength(const std::string& name) const;
+
+    /// \}
+
+private:
+    std::unique_ptr<IndexedFastqReaderImpl> d_;  // implementation type is only forward-declared here
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // INDEXEDFASTQREADER_H
diff --git a/include/pbbam/Interval.h b/include/pbbam/Interval.h

new file mode 100644 (file)

index 0000000..2fb2205
--- /dev/null
+++ b/include/pbbam/Interval.h
@@ -0,0 +1,22 @@
+// File Description
+/// \file Interval.h
+/// \brief Defines the Interval class.
+//
+// Author: Derek Barnett
+
+#ifndef INTERVAL_H
+#define INTERVAL_H
+
+#include "pbbam/Config.h"
+
+#include <pbcopper/data/Interval.h>
+
+namespace PacBio {
+namespace BAM {
+
+using Interval PBBAM_DEPRECATED = Data::Interval;
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // INTERVAL_H
diff --git a/include/pbbam/LocalContextFlags.h b/include/pbbam/LocalContextFlags.h

new file mode 100644 (file)

index 0000000..2de7724
--- /dev/null
+++ b/include/pbbam/LocalContextFlags.h
@@ -0,0 +1,35 @@
+// File Description
+/// \file LocalContextFlags.h
+/// \brief Defines the LocalContextFlags enum & helper method(s).
+//
+// Author: Lance Hepler
+
+#ifndef LOCALCONTEXTFLAGS_H
+#define LOCALCONTEXTFLAGS_H
+
+#include "pbbam/Config.h"
+
+#include <pbcopper/data/LocalContextFlags.h>
+
+namespace PacBio {
+namespace BAM {
+
+using LocalContextFlags PBBAM_DEPRECATED = PacBio::Data::LocalContextFlags;
+
+// Because LocalContextFlags is a C-style (unscoped) enum and not a
+// C++11 enum class, we need to import each of its enumerators
+// into the containing scope individually.
+using PacBio::Data::LocalContextFlags::NO_LOCAL_CONTEXT;
+using PacBio::Data::LocalContextFlags::ADAPTER_BEFORE;
+using PacBio::Data::LocalContextFlags::ADAPTER_AFTER;
+using PacBio::Data::LocalContextFlags::BARCODE_BEFORE;
+using PacBio::Data::LocalContextFlags::BARCODE_AFTER;
+using PacBio::Data::LocalContextFlags::FORWARD_PASS;
+using PacBio::Data::LocalContextFlags::REVERSE_PASS;
+using PacBio::Data::LocalContextFlags::ADAPTER_BEFORE_BAD;
+using PacBio::Data::LocalContextFlags::ADAPTER_AFTER_BAD;
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // LOCALCONTEXTFLAGS_H
diff --git a/include/pbbam/MD5.h b/include/pbbam/MD5.h

new file mode 100644 (file)

index 0000000..de3e3fb
--- /dev/null
+++ b/include/pbbam/MD5.h
@@ -0,0 +1,24 @@
+// File Description
+/// \file MD5.h
+/// \brief Defines basic MD5 hash utilities
+//
+// Author: Brett Bowman
+
+#ifndef MD5_H
+#define MD5_H
+
+#include "pbbam/Config.h"
+
+#include <string>
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief MD5 hash of a string as a 32-digit hexadecimal string
+/// \param[in] str  input string to hash
+std::string MD5Hash(const std::string& str);
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // MD5_H
diff --git a/include/pbbam/MoveAppend.h b/include/pbbam/MoveAppend.h

new file mode 100644 (file)

index 0000000..5cb243d
--- /dev/null
+++ b/include/pbbam/MoveAppend.h
@@ -0,0 +1,52 @@
+// Author: Derek Barnett
+
+#ifndef MOVEAPPEND_H
+#define MOVEAPPEND_H
+
+#include "pbbam/Config.h"
+
+#include <iterator>
+#include <utility>
+#include <vector>
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief Appends content of src vector to dst vector using move semantics.
+///
+/// \param[in]     src  Input vector that will be empty after execution
+/// \param[in,out] dst  Output vector that will be appended to
+/// \note src and dst must be distinct vectors.
+template <typename T>
+inline void MoveAppend(std::vector<T>& src, std::vector<T>& dst) noexcept
+{
+    if (dst.empty()) {
+        dst = std::move(src);  // move-assign the whole vector rather than element-wise
+    } else {
+        dst.insert(dst.end(), std::make_move_iterator(src.begin()),
+                   std::make_move_iterator(src.end()));
+    }
+    src.clear();  // a moved-from vector's state is unspecified; make it empty as documented
+}
+
+/// \brief Appends content of src vector to dst vector using move semantics.
+///
+/// \param[in]     src  Input vector (rvalue); its elements are moved into dst
+/// \param[in,out] dst  Output vector that will be appended to
+/// \note src and dst must be distinct vectors.
+template <typename T>
+inline void MoveAppend(std::vector<T>&& src, std::vector<T>& dst) noexcept
+{
+    if (dst.empty()) {
+        dst = std::move(src);  // move-assign the whole vector rather than element-wise
+    } else {
+        dst.insert(dst.end(), std::make_move_iterator(src.begin()),
+                   std::make_move_iterator(src.end()));
+    }
+    src.clear();  // a moved-from vector's state is unspecified; leave it empty
+}
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // MOVEAPPEND_H
diff --git a/include/pbbam/Orientation.h b/include/pbbam/Orientation.h

new file mode 100644 (file)

index 0000000..a1241e9
--- /dev/null
+++ b/include/pbbam/Orientation.h
@@ -0,0 +1,22 @@
+// File Description
+/// \file Orientation.h
+/// \brief Provides BAM::Orientation, a PBBAM_DEPRECATED alias for PacBio::Data::Orientation.
+//
+// Author: Derek Barnett
+
+#ifndef ORIENTATION_H
+#define ORIENTATION_H
+
+#include "pbbam/Config.h"
+
+#include <pbcopper/data/Orientation.h>
+
+namespace PacBio {
+namespace BAM {
+
+using Orientation PBBAM_DEPRECATED = PacBio::Data::Orientation;
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // ORIENTATION_H
+\ No newline at end of file
diff --git a/include/pbbam/PbiBasicTypes.h b/include/pbbam/PbiBasicTypes.h

new file mode 100644 (file)

index 0000000..286251a
--- /dev/null
+++ b/include/pbbam/PbiBasicTypes.h
@@ -0,0 +1,78 @@
+// File Description
+/// \file PbiBasicTypes.h
+/// \brief Defines the basic data structures used in PBI lookups.
+//
+// Author: Derek Barnett
+
+#ifndef PBIBASICTYPES_H
+#define PBIBASICTYPES_H
+
+#include "pbbam/Config.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <deque>
+#include <utility>
+#include <vector>
+
+#include "pbbam/Compare.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The IndexResultBlock class represents a contiguous group of records
+///        returned from a PBI lookup.
+///
+/// Contiguous reads that satisfy a PBI lookup query will be merged down into a
+/// single block. This helps to minimize the number of seeks in subsequent read
+/// operations.
+///
+/// A PBI-enabled reader or query can iterate over a list of IndexResultBlocks;
+/// for each block, seeking to the first record and then sequentially reading
+/// 'numReads' consecutive records before needing to seek again.
+///
+struct PBBAM_EXPORT IndexResultBlock
+{
+public:
+    IndexResultBlock(size_t idx, size_t numReads);
+
+    IndexResultBlock() = default;
+
+public:
+    bool operator==(const IndexResultBlock& other) const;
+    bool operator!=(const IndexResultBlock& other) const;
+
+public:
+    size_t firstIndex_ = 0;  ///< index of block's first record in BAM/PBI files (e.g. i-th record)
+    size_t numReads_ = 0;    ///< number of reads in this block
+    int64_t virtualOffset_ = -1;  ///< virtual offset of first record in this block
+};
+
+/// \brief container of PBI result blocks
+///
+using IndexResultBlocks = std::deque<IndexResultBlock>;
+
+/// \brief container of raw PBI indices
+///
+/// This is the primary result of PbiFilter-associated classes. This raw list
+/// can participate in set operations (union, intersect) for compound filters,
+/// and then be merged down into IndexResultBlocks for actual data file
+/// random-access.
+///
+using IndexList = std::vector<size_t>;
+
+/// \brief pair representing a range of PBI indices: where interval
+///        is [first, second)
+///
+/// Used primarily by the PBI's CoordinateSortedData components.
+///
+/// \sa PbiReferenceEntry, PbiRawReferenceData, & ReferenceLookupData
+///
+using IndexRange = std::pair<size_t, size_t>;
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#include "pbbam/internal/PbiBasicTypes.inl"
+
+#endif  // PBIBASICTYPES_H
diff --git a/include/pbbam/PbiBuilder.h b/include/pbbam/PbiBuilder.h

new file mode 100644 (file)

index 0000000..6d8639f
--- /dev/null
+++ b/include/pbbam/PbiBuilder.h
@@ -0,0 +1,179 @@
+// File Description
+/// \file PbiBuilder.h
+/// \brief Defines the PbiBuilder class.
+//
+// Author: Derek Barnett
+
+#ifndef PBIBUILDER_H
+#define PBIBUILDER_H
+
+#include "pbbam/Config.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <string>
+
+namespace PacBio {
+namespace BAM {
+
+class BamRecord;
+class PbiRawData;
+
+/// \brief The PbiBuilder class constructs PBI index data from %BAM record data.
+///
+/// Records are added one-by-one. This allows for either whole-file indexing of
+/// existing %BAM files or for indexing "on-the-fly" alongside a %BAM file as it
+/// is generated.
+///
+/// For simple PBI creation from existing %BAM files, see PbiFile::CreateFrom.
+/// This is the recommended approach, unless finer control or additional
+/// processing is needed.
+///
+class PBBAM_EXPORT PbiBuilder
+{
+public:
+    /// \brief This enum allows you to control the compression level of the
+    ///        output PBI file.
+    ///
+    /// Values are equivalent to zlib compression levels. See its documentation
+    /// for more details: http://www.zlib.net/manual.html
+    ///
+    enum CompressionLevel
+    {
+        CompressionLevel_0 = 0,
+        CompressionLevel_1 = 1,
+        CompressionLevel_2 = 2,
+        CompressionLevel_3 = 3,
+        CompressionLevel_4 = 4,
+        CompressionLevel_5 = 5,
+        CompressionLevel_6 = 6,
+        CompressionLevel_7 = 7,
+        CompressionLevel_8 = 8,
+        CompressionLevel_9 = 9,
+
+        DefaultCompression = -1,
+        NoCompression = CompressionLevel_0,
+        FastCompression = CompressionLevel_1,
+        BestCompression = CompressionLevel_9
+    };
+
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Initializes builder to write data to \p pbiFilename.
+    ///
+    /// \param[in] pbiFilename      output filename
+    /// \param[in] compressionLevel zlib compression level
+    /// \param[in] numThreads       number of threads for compression. If set to
+    ///                             0, PbiBuilder will attempt to determine a
+    ///                             reasonable estimate. If set to 1, this will
+    ///                             force single-threaded execution. No checks
+    ///                             are made against an upper limit.
+    ///
+    /// \throws std::runtime_error if PBI file cannot be opened for writing
+    ///
+    PbiBuilder(const std::string& pbiFilename,
+               const PbiBuilder::CompressionLevel compressionLevel = PbiBuilder::DefaultCompression,
+               const size_t numThreads = 4);
+
+    /// \brief Initializes builder to write data to \p pbiFilename.
+    ///
+    /// Reference data-tracking structures will be initialized to expect
+    /// \p numReferenceSequences. (This is useful so that we can mark any
+    /// references that lack observed data appropriately).
+    ///
+    /// \param[in] pbiFilename              output filename
+    /// \param[in] numReferenceSequences    number of possible reference
+    ///                                     sequences, e.g. BamHeader::NumSequences
+    /// \param[in] compressionLevel zlib compression level
+    /// \param[in] numThreads       number of threads for compression. If set to
+    ///                             0, PbiBuilder will attempt to determine a
+    ///                             reasonable estimate. If set to 1, this will
+    ///                             force single-threaded execution. No checks
+    ///                             are made against an upper limit.
+    ///
+    /// \throws std::runtime_error if PBI file cannot be opened for writing
+    ///
+    PbiBuilder(const std::string& pbiFilename, const size_t numReferenceSequences,
+               const PbiBuilder::CompressionLevel compressionLevel = PbiBuilder::DefaultCompression,
+               const size_t numThreads = 4);
+
+    /// \brief Initializes builder to write data to \p pbiFilename.
+    ///
+    /// Reference data-tracking structures will be initialized to expect
+    /// \p numReferenceSequences, but only if \p isCoordinateSorted is true.
+    ///
+    /// \param[in] pbiFilename              output filename
+    /// \param[in] numReferenceSequences    number of possible reference
+    ///                                     sequences, e.g. BamHeader::NumSequences
+    /// \param[in] isCoordinateSorted       if false, disables reference
+    ///                                     sequence tracking
+    ///                                     (BamHeader::SortOrder != "coordinate")
+    /// \param[in] compressionLevel zlib compression level
+    /// \param[in] numThreads       number of threads for compression. If set to
+    ///                             0, PbiBuilder will attempt to determine a
+    ///                             reasonable estimate. If set to 1, this will
+    ///                             force single-threaded execution. No checks
+    ///                             are made against an upper limit.
+    ///
+    /// \throws std::runtime_error if PBI file cannot be opened for writing
+    ///
+    PbiBuilder(const std::string& pbiFilename, const size_t numReferenceSequences,
+               const bool isCoordinateSorted,
+               const PbiBuilder::CompressionLevel compressionLevel = PbiBuilder::DefaultCompression,
+               const size_t numThreads = 4);
+
+    /// \brief Destroys builder, writing its data out to PBI file.
+    ///
+    ///
+    /// \note Exceptions are swallowed. Use Close() if you want to catch them.
+    ///
+    ~PbiBuilder() noexcept;
+
+    /// \}
+
+public:
+    /// \name Index Building
+    /// \{
+
+    /// \brief Adds \p record's data to underlying raw data structure.
+    ///
+    /// \note \p vOffset is a BGZF \b virtual offset into the %BAM file. To get
+    ///          this value, you should use one of the following: \n
+    ///        - while reading existing %BAM: BamReader::VirtualTell \n
+    ///        - while writing new %BAM:      BamWriter::Write(const BamRecord& record, int64_t* vOffset) \n
+    ///
+    ///
+    /// To build a PBI index while generating a %BAM file:
+    /// \include code/PbiBuilder_WithWriter.txt
+    ///
+    /// To build a PBI index from an existing %BAM file:
+    /// \include code/PbiBuilder_WithReader.txt
+    ///
+    /// \param[in] record   input BamRecord to pull index data from
+    /// \param[in] vOffset  \b virtual offset into %BAM file where record begins
+    ///
+    void AddRecord(const BamRecord& record, const int64_t vOffset);
+
+    /// \brief Writes data out to PBI file & closes builder.
+    ///
+    /// \note Any exceptions are thrown to caller. If you don't care about
+    ///       catching exceptions with file I/O, just let the builder go out of
+    ///       scope and data will be written, but exceptions swallowed (to avoid
+    ///       throwing from destructor).
+    ///
+    void Close();
+
+    /// \}
+
+private:
+    class PbiBuilderPrivate;
+    std::unique_ptr<PbiBuilderPrivate> d_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // PBIBUILDER_H
diff --git a/include/pbbam/PbiFile.h b/include/pbbam/PbiFile.h

new file mode 100644 (file)

index 0000000..c1f1fff
--- /dev/null
+++ b/include/pbbam/PbiFile.h
@@ -0,0 +1,110 @@
+// File Description
+/// \file PbiFile.h
+/// \brief Defines the PbiFile enums, typedefs, and methods.
+//
+// Author: Derek Barnett
+
+#ifndef PBIFILE_H
+#define PBIFILE_H
+
+#include "pbbam/Config.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <string>
+
+#include "pbbam/PbiBuilder.h"
+
+namespace PacBio {
+namespace BAM {
+
+class BamFile;
+
+struct PbiFile
+{
+    // Groups the PBI-related enums, a typedef, and the CreateFrom() helper; no data members.
+    /// \brief This enum describes the PBI file sections
+    ///
+    enum Section
+    {
+        BASIC = 0x0000,      ///< BasicData     (required)
+        MAPPED = 0x0001,     ///< MappedData    (always optional)
+        REFERENCE = 0x0002,  ///< ReferenceData (always optional)
+        BARCODE = 0x0004,    ///< BarcodeData   (always optional)
+
+        ALL = BASIC | MAPPED | REFERENCE | BARCODE  ///< Synonym for 'all sections'
+    };
+
+    /// \brief Helper typedef for storing multiple Section flags.
+    ///
+    using Sections = uint16_t;
+
+    /// \brief This enum describes the PBI file version.
+    enum VersionEnum
+    {
+        Version_3_0_0 = 0x030000,  ///< v3.0.0
+        Version_3_0_1 = 0x030001,  ///< v3.0.1
+        Version_3_0_2 = 0x030002,  ///< v3.0.2
+
+        CurrentVersion = Version_3_0_2  ///< Synonym for the current PBI version.
+    };
+
+    ///
+    /// \brief Names the individual fields of the BasicData section.
+    ///
+    enum class BasicField
+    {
+        RG_ID,
+        Q_START,
+        Q_END,
+        ZMW,
+        READ_QUALITY,
+        CONTEXT_FLAG,
+        VIRTUAL_OFFSET
+    };
+
+    ///
+    /// \brief Names the individual fields of the MappedData section.
+    ///
+    enum class MappedField
+    {
+        T_ID,
+        T_START,
+        T_END,
+        A_START,
+        A_END,
+        N_M,
+        N_MM,
+        N_INS,
+        N_DEL,
+        MAP_QUALITY,
+        STRAND
+    };
+
+    ///
+    /// \brief Names the individual fields of the BarcodeData section.
+    ///
+    enum class BarcodeField
+    {
+        BC_FORWARD,
+        BC_REVERSE,
+        BC_QUALITY
+    };
+
+    /// \brief Builds PBI index data from the supplied %BAM file and writes a
+    ///        ".pbi" file.
+    ///
+    /// \param[in] bamFile          source %BAM file
+    /// \param[in] compressionLevel zlib compression level for the output PBI file
+    /// \param[in] numThreads       thread count; presumably as for PbiBuilder (TODO confirm)
+    /// \throws std::runtime_error if index file could not be created
+    static void CreateFrom(
+        const BamFile& bamFile,
+        const PbiBuilder::CompressionLevel compressionLevel = PbiBuilder::DefaultCompression,
+        const size_t numThreads = 4);
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // PBIFILE_H
diff --git a/include/pbbam/PbiFilter.h b/include/pbbam/PbiFilter.h

new file mode 100644 (file)

index 0000000..16c337d
--- /dev/null
+++ b/include/pbbam/PbiFilter.h
@@ -0,0 +1,250 @@
+// File Description
+/// \file PbiFilter.h
+/// \brief Defines the PbiFilter class & helper 'concept'.
+//
+// Author: Derek Barnett
+
+#ifndef PBIFILTER_H
+#define PBIFILTER_H
+
+#include "pbbam/Config.h"
+
+#include <cstddef>
+#include <memory>
+#include <string>
+#include <tuple>
+
+#include <boost/concept_check.hpp>
+
+#include "pbbam/DataSet.h"
+#include "pbbam/PbiBasicTypes.h"
+#include "pbbam/PbiRawData.h"
+#include "pbbam/Unused.h"
+
+namespace PacBio {
+namespace BAM {
+
+namespace internal {
+struct PbiFilterPrivate;
+}
+
+/// \brief The PbiFilterConcept class provides compile-time enforcement of the
+///        required interface for PbiFilter's child filters.
+///
+template <typename T>
+struct PbiFilterConcept
+{
+    BOOST_CONCEPT_USAGE(PbiFilterConcept)
+    {
+        // All PBI filters (built-in or client-defined) need only provide this
+        // interface:
+        //
+        //    bool Accepts(const PbiRawData& index, const size_t row) const;
+        //
+        PbiRawData index;
+        auto result = filter.Accepts(index, 0);  // this call must be well-formed for type T
+        UNUSED(result);
+    }
+
+private:
+    T filter;
+    //    PbiRawData index;
+};
+
+/// \brief The PbiFilter class provides a mechanism for performing PBI-enabled
+///        lookups.
+///
+/// The PbiFilter API is designed to be flexible, both built-in and for
+/// client-side customization. Built-in filters are provided for common queries,
+/// and client code can define and use custom filters as well. More complex
+/// filtering rules can be constructed via composition of simpler child filters.
+///
+/// Filter objects used as children of PbiFilter need only provide a method that
+/// matches this signature:
+///
+/// \include code/PbiFilter_Interface.txt
+///
+/// This requirement is enforced internally, using the PbiFilterConcept to
+/// require a compatible interface without requiring inheritance. This approach
+/// allows composition of heterogeneous filter types without worrying about a
+/// class hierarchy, pointer ownership across library/client boundaries, etc.
+///
+/// Thus a client application can define a custom filter if the built-in filters
+/// do not quite meet requirements. This filter may then be used in further
+/// PbiFilter composition, or passed directly to PbiFilterQuery.
+///
+/// \include code/PbiFilter_CustomFilter.txt
+///
+/// As mentioned above, complex filters can be built up using multiple "child"
+/// filters. These complex filters are constructed by using either
+/// PbiFilter::Union (logical-OR over all direct children) or
+/// PbiFilter::Intersection (logical-AND over direct children).
+///
+/// \include code/PbiFilter_Composition.txt
+///
+class PBBAM_EXPORT PbiFilter
+{
+public:
+    enum CompositionType
+    {
+        INTERSECT,
+        UNION
+    };
+
+public:
+    /// \name Set Operations
+    /// \{
+
+    /// \brief Creates a PbiFilter that acts as an intersection of the input
+    ///        filters.
+    ///
+    /// A record must satisfy \b all of this filter's direct "child" filters.
+    ///
+    /// \param[in] filters  vector of child filters
+    /// \returns composite filter
+    ///
+    static PbiFilter Intersection(std::vector<PbiFilter> filters);
+
+    /// \brief Creates a PbiFilter that acts as a union of the input filters.
+    ///
+    /// A record must satisfy \b any of this filter's direct "child" filters.
+    ///
+    /// \param[in] filters  vector of child filters
+    /// \returns composite filter
+    ///
+    static PbiFilter Union(std::vector<PbiFilter> filters);
+
+    /// \}
+
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Creates a PbiFilter from a %DataSet's described filters.
+    ///
+    /// A DataSet may contain a Filters element, itself a list of Filter
+    /// elements. Each Filter element will contain a Properties element, itself
+    /// a list of Property elements.
+    ///
+    /// The Filters hierarchy looks like this (in its XML output):
+    /// \verbinclude examples/plaintext/PbiFilter_DataSetXmlFilters.txt
+    ///
+    /// The resulting PbiFilter represents a union over all Filter elements,
+    /// with each Filter element requiring an intersection of all of its
+    /// Property criteria. These Property elements are mapped to built-in PBI
+    /// filter types. To use the labels in the example XML above, the filter
+    /// created here is equivalent to:
+    ///
+    /// (A && B) || (C && D)
+    ///
+    /// If a DataSet lacks any Filters, then an empty PbiFilter will be created
+    /// - corresponding to the dataset's entire contents.
+    ///
+    /// \param[in] dataset  maybe containing filters
+    /// \returns composite filter
+    ///
+    static PbiFilter FromDataSet(const DataSet& dataset);
+
+public:
+    /// \brief Creates an empty filter.
+    ///
+    /// \note An empty filter will result in all records being returned, e.g.
+    ///       for query iteration.
+    ///
+    /// \param[in] type composition type. Any additional child filters added to
+    ///                 this composite will be treated according to this type.
+    ///                 If INTERSECT, a record must match all child filters. If
+    ///                 UNION, a record must match any child filter.
+    ///
+    PbiFilter(const CompositionType type = INTERSECT);
+
+    /// \brief Creates a composite filter (of INTERSECT type) with an initial
+    ///        child filter.
+    ///
+    /// \note T must satisfy PbiFilterConcept
+    ///
+    /// \param[in] filter initial child filter
+    ///
+    template <typename T>
+    PbiFilter(T filter);
+
+    /// \brief Creates composite filter (of INTERSECT type) with a list of
+    ///        initial child filters.
+    ///
+    /// \param[in] filters initial child filters
+    ///
+    PbiFilter(std::vector<PbiFilter> filters);
+
+    PbiFilter(const PbiFilter&);
+    PbiFilter(PbiFilter&&) noexcept = default;
+    PbiFilter& operator=(const PbiFilter&);
+    PbiFilter& operator=(PbiFilter&&) noexcept = default;
+
+    /// \}
+
+public:
+    /// \name Composition
+    /// \{
+
+    /// \brief Adds a new child filter of type T.
+    ///
+    /// \param[in] filter   additional child filter. Type T must satisfy
+    ///                     PbiFilterConcept.
+    /// \returns reference to this filter
+    ///
+    template <typename T>
+    PbiFilter& Add(T filter);
+
+    /// \brief Adds a new child filter.
+    ///
+    /// \param[in] filter   additional child filter
+    /// \returns reference to this filter
+    ///
+    PbiFilter& Add(PbiFilter filter);
+
+    /// \brief Add child filters.
+    ///
+    /// \param[in] filters  additional child filters
+    /// \returns reference to this filter
+    ///
+    PbiFilter& Add(std::vector<PbiFilter> filters);
+
+    /// \returns true if this filter has no child filters.
+    bool IsEmpty() const;
+
+    /// \returns number of child filters
+    size_t NumChildren() const;
+
+    /// \returns filter type (intersect, union)
+    CompositionType Type() const;
+
+    /// \}
+
+public:
+    /// \name Lookup
+    /// \{
+
+    /// \brief Performs the PBI index lookup, combining the results of all
+    ///        child filters (if any).
+    ///
+    /// \param[in] idx  PBI (raw) index object
+    /// \param[in] row  record number in %BAM/PBI files
+    ///
+    /// \returns true if record at \p row passes this filter criteria,
+    ///          including children (if any)
+    ///
+    bool Accepts(const BAM::PbiRawData& idx, const size_t row) const;
+
+    /// \}
+
+private:
+    std::unique_ptr<internal::PbiFilterPrivate> d_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#include "pbbam/PbiFilterTypes.h"
+#include "pbbam/internal/PbiFilter.inl"
+
+#endif  // PBIFILTER_H
diff --git a/include/pbbam/PbiFilterQuery.h b/include/pbbam/PbiFilterQuery.h

new file mode 100644 (file)

index 0000000..5d2ca66
--- /dev/null
+++ b/include/pbbam/PbiFilterQuery.h
@@ -0,0 +1,80 @@
+// File Description
+/// \file PbiFilterQuery.h
+/// \brief Defines the PbiFilterQuery class.
+//
+// Author: Derek Barnett
+
+#ifndef PBIFILTERQUERY_H
+#define PBIFILTERQUERY_H
+
+#include "pbbam/Config.h"
+
+#include <vector>
+
+#include "pbbam/PbiFilter.h"
+#include "pbbam/internal/QueryBase.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The PbiFilterQuery class provides iterable access to a DataSet's %BAM
+///        records, limiting results to those matching filter criteria.
+///
+/// Example:
+/// \include code/PbiFilterQuery.txt
+///
+/// \note Currently, all %BAM files must have a corresponding ".pbi" index file.
+///       Use BamFile::EnsurePacBioIndexExists before creating the query if one
+///       may not be present.
+///
+class PBBAM_EXPORT PbiFilterQuery : public internal::IQuery
+{
+public:
+    ///
+    /// \brief Creates a new PbiFilterQuery, limiting record results to only
+    ///        those matching filter criteria defined in the DataSet XML.
+    ///
+    /// \param[in] dataset input data source(s)
+    ///
+    /// \throws std::runtime_error on failure to open/read underlying %BAM or
+    ///         PBI files.
+    ///
+    PbiFilterQuery(const DataSet& dataset);
+    /// \brief As above, but additionally takes a PbiIndexCache (\p cache).
+    PbiFilterQuery(const DataSet& dataset, const PbiIndexCache& cache);
+
+    /// \brief Creates a new PbiFilterQuery, limiting record results to only
+    ///        those matching filter criteria
+    ///
+    /// \param[in] filter   filtering criteria
+    /// \param[in] dataset  input data source(s)
+    ///
+    /// \throws std::runtime_error on failure to open/read underlying %BAM or
+    ///         PBI files.
+    ///
+    PbiFilterQuery(const PbiFilter& filter, const DataSet& dataset);
+    /// \brief As above, but additionally takes a PbiIndexCache (\p cache).
+    PbiFilterQuery(const PbiFilter& filter, const DataSet& dataset, const PbiIndexCache& cache);
+
+    ~PbiFilterQuery() override;
+
+    /// \brief Main iteration point for record access.
+    ///
+    /// Most client code should not need to use this method directly. Use
+    /// iterators instead.
+    ///
+    bool GetNext(BamRecord& r) override;
+
+    /// \brief Return number of records that pass the provided filter
+    ///
+    uint32_t NumReads() const;
+
+private:
+    class PbiFilterQueryPrivate;
+    std::unique_ptr<PbiFilterQueryPrivate> d_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // PBIFILTERQUERY_H
diff --git a/include/pbbam/PbiFilterTypes.h b/include/pbbam/PbiFilterTypes.h

new file mode 100644 (file)

index 0000000..1ea0bfa
--- /dev/null
+++ b/include/pbbam/PbiFilterTypes.h
@@ -0,0 +1,929 @@
+// File Description
+/// \file PbiFilterTypes.h
+/// \brief Defines the built-in PBI filters.
+//
+// Author: Derek Barnett
+
+#ifndef PBIFILTERTYPES_H
+#define PBIFILTERTYPES_H
+
+#include "pbbam/Config.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <unordered_map>
+
+#include <boost/optional.hpp>
+
+#include "pbbam/Compare.h"
+#include "pbbam/PbiFile.h"
+#include "pbbam/PbiFilter.h"
+
+namespace PacBio {
+namespace BAM {
+
+namespace internal {
+
+/// \internal
+///
+/// Provides basic container for value/compare-type pair
+///
+template <typename T>
+class FilterBase
+{
+public:
+    T value_;
+    boost::optional<std::vector<T>> multiValue_;
+    Compare::Type cmp_;
+
+protected:
+    FilterBase(T value, const Compare::Type cmp);
+    FilterBase(std::vector<T> values, const Compare::Type cmp = Compare::CONTAINS);
+
+    bool CompareHelper(const T& lhs) const;
+
+private:
+    bool CompareSingleHelper(const T& lhs) const;
+    bool CompareMultiHelper(const T& lhs) const;
+};
+
+/// \internal
+///
+/// Dispatches the lookup to BarcodeLookupData
+///
+template <typename T, PbiFile::BarcodeField field>
+class BarcodeDataFilterBase : public FilterBase<T>
+{
+public:
+    bool Accepts(const PbiRawData& idx, const size_t row) const;
+
+protected:
+    BarcodeDataFilterBase(T value, const Compare::Type cmp);
+    BarcodeDataFilterBase(std::vector<T> values, const Compare::Type cmp = Compare::CONTAINS);
+};
+
+/// \internal
+///
+/// Dispatches the lookup to BasicLookupData
+///
+template <typename T, PbiFile::BasicField field>
+class BasicDataFilterBase : public FilterBase<T>
+{
+public:
+    bool Accepts(const PbiRawData& idx, const size_t row) const;
+
+protected:
+    BasicDataFilterBase(T value, const Compare::Type cmp);
+    BasicDataFilterBase(std::vector<T> values, const Compare::Type cmp = Compare::CONTAINS);
+};
+
+/// \internal
+///
+/// Dispatches the lookup to MappedLookupData
+///
+template <typename T, PbiFile::MappedField field>
+class MappedDataFilterBase : public FilterBase<T>
+{
+public:
+    bool Accepts(const PbiRawData& idx, const size_t row) const;
+
+protected:
+    MappedDataFilterBase(T value, const Compare::Type cmp);
+    MappedDataFilterBase(std::vector<T> values, const Compare::Type cmp = Compare::CONTAINS);
+};
+
+}  // namespace internal
+
+/// \brief The PbiAlignedEndFilter class provides a PbiFilter-compatible filter
+///        on aligned end.
+///
+/// Example: \include code/PbiAlignedEndFilter.txt
+///
+/// \sa BamRecord::AlignedEnd
+///
+class PbiAlignedEndFilter
+    : public internal::MappedDataFilterBase<uint32_t, PbiFile::MappedField::A_END>
+{
+public:
+    /// \brief Creates a filter on aligned end.
+    ///
+    /// \param[in] position value to compare on
+    /// \param[in] cmp      compare type
+    ///
+    PbiAlignedEndFilter(const uint32_t position, const Compare::Type cmp = Compare::EQUAL);
+};
+
+/// \brief The PbiAlignedLengthFilter class provides a PbiFilter-compatible
+///        filter on aligned length.
+///
+/// Example: \include code/PbiAlignedLengthFilter.txt
+///
+/// \sa BamRecord::AlignedEnd, BamRecord::AlignedStart
+///
+class PbiAlignedLengthFilter : public internal::FilterBase<uint32_t>
+{
+public:
+    /// \brief Creates a filter on aligned length.
+    ///
+    /// \param[in] length value to compare on
+    /// \param[in] cmp      compare type
+    ///
+    PbiAlignedLengthFilter(const uint32_t length, const Compare::Type cmp = Compare::EQUAL);
+
+    /// \brief Performs the actual index lookup.
+    ///
+    /// Most client code should not need to use this method directly.
+    ///
+    bool Accepts(const PbiRawData& idx, const size_t row) const;
+};
+
+/// \brief The PbiAlignedStartFilter class provides a PbiFilter-compatible
+///        filter on aligned start.
+///
+/// Example: \include code/PbiAlignedStartFilter.txt
+///
+/// \sa BamRecord::AlignedStart
+///
+class PbiAlignedStartFilter
+    : public internal::MappedDataFilterBase<uint32_t, PbiFile::MappedField::A_START>
+{
+public:
+    /// \brief Creates a filter on aligned start.
+    ///
+    /// \param[in] position value to compare on
+    /// \param[in] cmp      compare type
+    ///
+    PbiAlignedStartFilter(const uint32_t position, const Compare::Type cmp = Compare::EQUAL);
+};
+
+/// \brief The PbiAlignedStrandFilter class provides a PbiFilter-compatible
+///        filter on aligned strand.
+///
+/// Example: \include code/PbiAlignedStrandFilter.txt
+///
+/// \sa BamRecord::AlignedStrand
+///
+class PbiAlignedStrandFilter
+    : public internal::MappedDataFilterBase<Strand, PbiFile::MappedField::STRAND>
+{
+public:
+    /// \brief Creates a strand filter.
+    ///
+    /// \param[in] strand  strand value to compare on
+    /// \param[in] cmp     compare type
+    ///
+    PbiAlignedStrandFilter(const Strand strand, const Compare::Type cmp = Compare::EQUAL);
+};
+
+/// \brief The PbiBarcodeFilter class provides a PbiFilter-compatible filter on
+///        barcode ID.
+///
+/// Any record with this barcode ID (forward or reverse) will pass this filter.
+///
+/// Example: \include code/PbiBarcodeFilter.txt
+///
+/// \sa BamRecord::BarcodeForward, BamRecord::BarcodeReverse
+///
+class PbiBarcodeFilter
+{
+public:
+    /// \brief Creates a single-value barcode filter.
+    ///
+    /// \param[in] barcode  barcode ID to compare on
+    /// \param[in] cmp      compare type
+    ///
+    PbiBarcodeFilter(const int16_t barcode, const Compare::Type cmp = Compare::EQUAL);
+
+    /// \brief Creates a whitelisted or blacklisted barcode filter.
+    ///
+    /// \note If compare type is Compare::CONTAINS, accepted records will match
+    ///       at least one value from whitelist, in bc_forward or bc_reverse.
+    ///       If compare type is Compare::NOT_CONTAINS, accepted records will
+    ///       match no barcodes in the blacklist.
+    ///
+    /// \param[in] barcodes barcode IDs
+    /// \param[in] cmp      compare type
+    ///
+    PbiBarcodeFilter(std::vector<int16_t> barcodes, const Compare::Type cmp = Compare::CONTAINS);
+
+    /// \brief Performs the actual index lookup.
+    ///
+    /// Most client code should not need to use this method directly.
+    ///
+    bool Accepts(const PbiRawData& idx, const size_t row) const;
+
+private:
+    PbiFilter compositeFilter_;
+};
+
+/// \brief The PbiBarcodeForwardFilter class provides a PbiFilter-compatible
+///        filter on forward barcode ID.
+///
+/// Example: \include code/PbiBarcodeForwardFilter.txt
+///
+/// \sa BamRecord::BarcodeForward
+///
+class PbiBarcodeForwardFilter
+    : public internal::BarcodeDataFilterBase<int16_t, PbiFile::BarcodeField::BC_FORWARD>
+{
+public:
+    /// \brief Creates a single-value forward barcode filter.
+    ///
+    /// \param[in] bcFwdId  (forward) barcode ID to compare on
+    /// \param[in] cmp      compare type
+    ///
+    PbiBarcodeForwardFilter(const int16_t bcFwdId, const Compare::Type cmp = Compare::EQUAL);
+
+    /// \brief Creates a whitelisted or blacklisted forward barcode filter.
+    ///
+    /// \note If compare type is Compare::CONTAINS, accepted records will match
+    ///       at least one value from whitelist, in bc_forward.
+    ///       If compare type is Compare::NOT_CONTAINS, accepted records will
+    ///       match no values in the blacklist.
+    ///
+    /// \param[in] barcodes barcode IDs
+    /// \param[in] cmp      compare type
+    ///
+    PbiBarcodeForwardFilter(std::vector<int16_t> barcodes,
+                            const Compare::Type cmp = Compare::CONTAINS);
+};
+
+/// \brief The PbiBarcodeQualityFilter class provides a PbiFilter-compatible
+///        filter on  barcode quality.
+///
+/// Example: \include code/PbiBarcodeQualityFilter.txt
+///
+/// \sa BamRecord::BarcodeQuality
+///
+class PbiBarcodeQualityFilter
+    : public internal::BarcodeDataFilterBase<uint8_t, PbiFile::BarcodeField::BC_QUALITY>
+{
+public:
+    /// \brief Creates a single-value barcode quality filter.
+    ///
+    /// \param[in] bcQuality    barcode quality to compare on
+    /// \param[in] cmp          compare type
+    ///
+    PbiBarcodeQualityFilter(const uint8_t bcQuality, const Compare::Type cmp = Compare::EQUAL);
+};
+
+/// \brief The PbiBarcodeReverseFilter class provides a PbiFilter-compatible
+///        filter on reverse barcode ID.
+///
+/// Example: \include code/PbiBarcodeReverseFilter.txt
+///
+/// \sa BamRecord::BarcodeReverse
+///
+class PbiBarcodeReverseFilter
+    : public internal::BarcodeDataFilterBase<int16_t, PbiFile::BarcodeField::BC_REVERSE>
+{
+public:
+    /// \brief Creates a single-value reverse barcode filter.
+    ///
+    /// \param[in] bcRevId  (reverse) barcode ID to compare on
+    /// \param[in] cmp      compare type
+    ///
+    PbiBarcodeReverseFilter(const int16_t bcRevId, const Compare::Type cmp = Compare::EQUAL);
+
+    /// \brief Creates a whitelisted or blacklisted reverse barcode filter.
+    ///
+    /// \note If compare type is Compare::CONTAINS, accepted records will match
+    ///       at least one value from whitelist, in bc_reverse.
+    ///       If compare type is Compare::NOT_CONTAINS, accepted records will
+    ///       match no values in the blacklist.
+    ///
+    /// \param[in] barcodes barcode IDs
+    /// \param[in] cmp      compare type
+    ///
+    PbiBarcodeReverseFilter(std::vector<int16_t> barcodes,
+                            const Compare::Type cmp = Compare::CONTAINS);
+};
+
+/// \brief The PbiBarcodesFilter class provides a PbiFilter-compatible filter on
+///        both forward & reverse barcode IDs.
+///
+/// A record must match both IDs to pass the filter.
+///
+/// Example: \include code/PbiBarcodesFilter.txt
+///
+/// \sa BamRecord::Barcodes
+///
+class PbiBarcodesFilter
+{
+public:
+    /// \brief Creates a barcodes filter from a std::pair of IDs.
+    ///
+    /// pair.first -> BarcodeForward\n
+    /// pair.second -> BarcodeReverse
+    ///
+    /// \param[in] barcodes barcode IDs to compare on
+    /// \param[in] cmp      compare type
+    ///
+    PbiBarcodesFilter(const std::pair<int16_t, int16_t> barcodes,
+                      const Compare::Type cmp = Compare::EQUAL);
+
+    /// \brief Creates a barcodes filter from forward & reverse IDs.
+    ///
+    /// \param[in] bcForward    forward barcode ID to compare on
+    /// \param[in] bcReverse    reverse barcode ID to compare on
+    /// \param[in] cmp          compare type
+    ///
+    PbiBarcodesFilter(const int16_t bcForward, const int16_t bcReverse,
+                      const Compare::Type cmp = Compare::EQUAL);
+
+    /// \brief Performs the actual index lookup.
+    ///
+    /// Most client code should not need to use this method directly.
+    ///
+    bool Accepts(const PbiRawData& idx, const size_t row) const;
+
+private:
+    PbiFilter compositeFilter_;
+};
+
+/// \brief The PbiIdentityFilter class provides a PbiFilter-compatible filter on
+///        read identity (% aligned match).
+///
+/// Read identity is equivalent to: 1.0 - (nMM + nDel + nIns)/readLength.
+///
+/// Example: \include code/PbiIdentityFilter.txt
+///
+class PbiIdentityFilter : public internal::FilterBase<float>
+{
+public:
+    /// \brief Creates a read identity filter.
+    ///
+    /// \param[in] identity value to compare on
+    /// \param[in] cmp      compare type
+    ///
+    PbiIdentityFilter(const float identity, const Compare::Type cmp = Compare::EQUAL);
+
+    /// \brief Performs the actual index lookup.
+    ///
+    /// Most client code should not need to use this method directly.
+    ///
+    bool Accepts(const PbiRawData& idx, const size_t row) const;
+};
+
+/// \brief The PbiLocalContextFilter class provides a PbiFilter-compatible
+///        filter on local context (adapter, barcode, etc.).
+///
+/// The primary Compare::Type operators intended for this filter are:
+/// Compare::EQUAL, Compare::NOT_EQUAL, Compare::CONTAINS, and
+/// Compare::NOT_CONTAINS.
+///
+/// Example: \include code/PbiLocalContextFilter.txt
+///
+class PbiLocalContextFilter
+    : public internal::BasicDataFilterBase<LocalContextFlags, PbiFile::BasicField::CONTEXT_FLAG>
+{
+public:
+    PbiLocalContextFilter(const LocalContextFlags& flags, const Compare::Type cmp = Compare::EQUAL);
+};
+
+/// \brief The PbiMapQualityFilter class provides a PbiFilter-compatible filter on
+///        mapping quality.
+///
+/// Example: \include code/PbiMapQualityFilter.txt
+///
+/// \sa BamRecord::MapQuality
+///
+class PbiMapQualityFilter
+    : public internal::MappedDataFilterBase<uint8_t, PbiFile::MappedField::MAP_QUALITY>
+{
+public:
+    /// \brief Creates a map quality filter.
+    ///
+    /// \param[in] mapQual  value to compare on
+    /// \param[in] cmp      compare type
+    ///
+    PbiMapQualityFilter(const uint8_t mapQual, const Compare::Type cmp = Compare::EQUAL);
+};
+
+/// \brief The PbiMovieNameFilter class provides a PbiFilter-compatible filter
+///        on movie name.
+///
+/// Example: \include code/PbiMovieNameFilter.txt
+///
+/// \sa BamRecord::MovieName
+///
+class PbiMovieNameFilter
+{
+public:
+    /// \brief Creates a single-value movie name filter.
+    ///
+    /// \param[in] movieName    movie name to compare on
+    /// \param[in] cmp          compare type
+    ///
+    /// \note With the default Compare::EQUAL, records will match movie name, exactly.
+    ///
+    PbiMovieNameFilter(const std::string& movieName, const Compare::Type cmp = Compare::EQUAL);
+
+    /// \brief Creates a whitelisted or blacklisted movie name filter.
+    ///
+    /// \note If compare type is Compare::CONTAINS, accepted records will match
+    ///       at least one value from the whitelist.
+    ///       If compare type is Compare::NOT_CONTAINS, accepted records will
+    ///       match no values in the blacklist.
+    ///
+    /// \param[in] movieNames   movie names
+    /// \param[in] cmp          compare type
+    ///
+    PbiMovieNameFilter(const std::vector<std::string>& movieNames,
+                       const Compare::Type cmp = Compare::CONTAINS);
+
+    /// \brief Performs the actual index lookup.
+    ///
+    /// Most client code should not need to use this method directly.
+    ///
+    bool Accepts(const PbiRawData& idx, const size_t row) const;
+
+private:
+    PbiFilter compositeFilter_;
+    Compare::Type cmp_;
+};
+
+/// \brief The PbiNumDeletedBasesFilter class provides a PbiFilter-compatible
+///        filter on the number of deleted bases.
+///
+/// Example: \include code/PbiNumDeletedBasesFilter.txt
+///
+/// \sa BamRecord::NumDeletedBases
+///
+class PbiNumDeletedBasesFilter
+    : public internal::MappedDataFilterBase<size_t, PbiFile::MappedField::N_DEL>
+{
+public:
+    /// \brief Creates a filter on the number of deleted bases.
+    ///
+    /// \param[in] numDeletions value to compare on
+    /// \param[in] cmp          compare type
+    ///
+    PbiNumDeletedBasesFilter(const size_t numDeletions, const Compare::Type cmp = Compare::EQUAL);
+};
+
+/// \brief The PbiNumInsertedBasesFilter class provides a PbiFilter-compatible
+///        filter on the number of inserted bases.
+///
+/// Example: \include code/PbiNumInsertedBasesFilter.txt
+///
+/// \sa BamRecord::NumInsertedBases
+///
+class PbiNumInsertedBasesFilter
+    : public internal::MappedDataFilterBase<size_t, PbiFile::MappedField::N_INS>
+{
+public:
+    /// \brief Creates a filter on the number of inserted bases.
+    ///
+    /// \param[in] numInsertions    value to compare on
+    /// \param[in] cmp              compare type
+    ///
+    PbiNumInsertedBasesFilter(const size_t numInsertions, const Compare::Type cmp = Compare::EQUAL);
+};
+
+/// \brief The PbiNumMatchesFilter class provides a PbiFilter-compatible filter
+///        on the number of matched bases.
+///
+/// Example: \include code/PbiNumMatchesFilter.txt
+///
+/// \sa BamRecord::NumMatches
+///
+class PbiNumMatchesFilter : public internal::MappedDataFilterBase<size_t, PbiFile::MappedField::N_M>
+{
+public:
+    /// \brief Creates a filter on the number of matched bases.
+    ///
+    /// \param[in] numMatchedBases  value to compare on
+    /// \param[in] cmp              compare type
+    ///
+    PbiNumMatchesFilter(const size_t numMatchedBases, const Compare::Type cmp = Compare::EQUAL);
+};
+
+/// \brief The PbiNumMismatchesFilter class provides a PbiFilter-compatible
+///        filter on the number of mismatched bases.
+///
+/// Example: \include code/PbiNumMismatchesFilter.txt
+///
+/// \sa BamRecord::NumMismatches
+///
+class PbiNumMismatchesFilter
+    : public internal::MappedDataFilterBase<size_t, PbiFile::MappedField::N_MM>
+{
+public:
+    /// \brief Creates a filter on the number of mismatched bases.
+    ///
+    /// \param[in] numMismatchedBases   value to compare on
+    /// \param[in] cmp                  compare type
+    ///
+    PbiNumMismatchesFilter(const size_t numMismatchedBases,
+                           const Compare::Type cmp = Compare::EQUAL);
+};
+
+/// \brief The PbiQueryEndFilter class provides a PbiFilter-compatible filter
+///        on query end.
+///
+/// Example: \include code/PbiQueryEndFilter.txt
+///
+/// \sa BamRecord::QueryEnd
+///
+class PbiQueryEndFilter : public internal::BasicDataFilterBase<int32_t, PbiFile::BasicField::Q_END>
+{
+public:
+    /// \brief Creates a filter on query end position.
+    ///
+    /// \param[in] position value to compare on
+    /// \param[in] cmp      compare type
+    ///
+    PbiQueryEndFilter(const int32_t position, const Compare::Type cmp = Compare::EQUAL);
+};
+
+/// \brief The PbiQueryLengthFilter class provides a PbiFilter-compatible filter
+///        on query length.
+///
+/// queryLength = (queryEnd - queryStart)
+///
+/// Example: \include code/PbiQueryLengthFilter.txt
+///
+/// \sa BamRecord::QueryEnd, BamRecord::QueryStart
+///
+class PbiQueryLengthFilter : public internal::FilterBase<int32_t>
+{
+public:
+    /// \brief Creates a filter on query length
+    ///
+    /// \param[in] length   value to compare on
+    /// \param[in] cmp      compare type
+    ///
+    PbiQueryLengthFilter(const int32_t length, const Compare::Type cmp = Compare::EQUAL);
+
+    /// \brief Performs the actual index lookup.
+    ///
+    /// Most client code should not need to use this method directly.
+    ///
+    bool Accepts(const PbiRawData& idx, const size_t row) const;
+};
+
+/// \brief The PbiQueryNameFilter class provides a PbiFilter-compatible filter
+///        on query name.
+///
+/// Example: \include code/PbiQueryNameFilter.txt
+///
+/// \sa BamRecord::FullName
+///
+class PbiQueryNameFilter
+{
+public:
+    /// \brief Creates a single-value query name filter.
+    ///
+    /// \param[in] qname    query name to compare on
+    /// \param[in] cmp      compare type
+    ///
+    /// \note With the default Compare::EQUAL, records will match query name, exactly.
+    ///
+    PbiQueryNameFilter(const std::string& qname, const Compare::Type cmp = Compare::EQUAL);
+
+    /// \brief Creates a whitelisted or blacklisted query name filter.
+    ///
+    /// \note If compare type is Compare::CONTAINS, accepted records will match
+    ///       at least one value from the whitelist.
+    ///       If compare type is Compare::NOT_CONTAINS, accepted records will
+    ///       match no values in the blacklist.
+    ///
+    /// \param[in] queryNames   query names
+    /// \param[in] cmp          compare type
+    ///
+    PbiQueryNameFilter(const std::vector<std::string>& queryNames,
+                       const Compare::Type cmp = Compare::CONTAINS);
+
+    PbiQueryNameFilter(const PbiQueryNameFilter& other);
+    ~PbiQueryNameFilter();
+
+    /// \brief Performs the actual index lookup.
+    ///
+    /// Most client code should not need to use this method directly.
+    ///
+    bool Accepts(const PbiRawData& idx, const size_t row) const;
+
+private:
+    struct PbiQueryNameFilterPrivate;
+    std::unique_ptr<PbiQueryNameFilterPrivate> d_;
+};
+
+/// \brief The PbiQueryStartFilter class provides a PbiFilter-compatible filter
+///        on query start.
+///
+/// Example: \include code/PbiQueryStartFilter.txt
+///
+/// \sa BamRecord::QueryStart
+///
+class PbiQueryStartFilter
+    : public internal::BasicDataFilterBase<int32_t, PbiFile::BasicField::Q_START>
+{
+public:
+    /// \brief Creates a filter on query start position.
+    ///
+    /// \param[in] position value to compare on
+    /// \param[in] cmp      compare type
+    ///
+    PbiQueryStartFilter(const int32_t position, const Compare::Type cmp = Compare::EQUAL);
+};
+
+/// \brief The PbiReadAccuracyFilter class provides a PbiFilter-compatible filter
+///        on read accuracy.
+///
+/// Example: \include code/PbiReadAccuracyFilter.txt
+///
+/// \sa BamRecord::ReadAccuracy
+///
+class PbiReadAccuracyFilter
+    : public internal::BasicDataFilterBase<Accuracy, PbiFile::BasicField::READ_QUALITY>
+{
+public:
+    /// \brief Creates a filter on read accuracy.
+    ///
+    /// \param[in] accuracy value to compare on
+    /// \param[in] cmp      compare type
+    ///
+    PbiReadAccuracyFilter(const Accuracy accuracy, const Compare::Type cmp = Compare::EQUAL);
+};
+
+/// \brief The PbiReadGroupFilter class provides a PbiFilter-compatible filter
+///        on read group.
+///
+/// Example: \include code/PbiReadGroupFilter.txt
+///
+/// \sa BamRecord::ReadGroup,
+///     BamRecord::ReadGroupId,
+///     BamRecord::ReadGroupNumericId
+///
+class PbiReadGroupFilter
+{
+public:
+    /// \brief Creates a filter on read group (numeric) ID value
+    ///
+    /// \param[in] rgId     numeric read group ID
+    /// \param[in] cmp      compare type
+    ///
+    /// \sa BamRecord::ReadGroupNumericId
+    ///
+    PbiReadGroupFilter(const int32_t rgId, const Compare::Type cmp = Compare::EQUAL);
+
+    /// \brief Creates a filter on printable read group ID value
+    ///
+    /// \param[in] rgId     read group ID string
+    /// \param[in] cmp      compare type
+    ///
+    /// \sa BamRecord::ReadGroupId
+    ///
+    PbiReadGroupFilter(const std::string& rgId, const Compare::Type cmp = Compare::EQUAL);
+
+    /// \brief Creates a filter on read group (object).
+    ///
+    /// \param[in] rg   read group object
+    /// \param[in] cmp  compare type
+    ///
+    /// \sa BamRecord::ReadGroup
+    ///
+    PbiReadGroupFilter(const ReadGroupInfo& rg, const Compare::Type cmp = Compare::EQUAL);
+
+    /// \brief Creates a whitelisted or blacklisted filter on read group numeric IDs.
+    ///
+    /// \note If compare type is Compare::CONTAINS, accepted records will match
+    ///       at least one value from the whitelist.
+    ///       If compare type is Compare::NOT_CONTAINS, accepted records will
+    ///       match no values in the blacklist.
+    ///
+    /// \param[in] rgIds    numeric read group IDs
+    /// \param[in] cmp      compare type
+    ///
+    PbiReadGroupFilter(const std::vector<int32_t>& rgIds,
+                       const Compare::Type cmp = Compare::CONTAINS);
+
+    /// \brief Creates a whitelisted or blacklisted filter on read group string IDs.
+    ///
+    /// \note If compare type is Compare::CONTAINS, accepted records will match
+    ///       at least one value from the whitelist.
+    ///       If compare type is Compare::NOT_CONTAINS, accepted records will
+    ///       match no values in the blacklist.
+    ///
+    /// \param[in] rgIds    read group ID strings
+    /// \param[in] cmp      compare type
+    ///
+    PbiReadGroupFilter(const std::vector<std::string>& rgIds,
+                       const Compare::Type cmp = Compare::CONTAINS);
+
+    /// \brief Creates a whitelisted or blacklisted filter on read group objects.
+    ///
+    /// \note If compare type is Compare::CONTAINS, accepted records will match
+    ///       at least one value from the whitelist.
+    ///       If compare type is Compare::NOT_CONTAINS, accepted records will
+    ///       match no values in the blacklist.
+    ///
+    /// \param[in] readGroups   ReadGroupInfo objects
+    /// \param[in] cmp          compare type (NOTE(review): default is Compare::EQUAL,
+    ///                         but the other list overloads use Compare::CONTAINS)
+    PbiReadGroupFilter(const std::vector<ReadGroupInfo>& readGroups,
+                       const Compare::Type cmp = Compare::EQUAL);
+
+    /// \brief Performs the actual index lookup.
+    ///
+    /// Most client code should not need to use this method directly.
+    ///
+    bool Accepts(const PbiRawData& idx, const size_t row) const;
+
+private:
+    // RGID number => barcode(s) filter
+    std::unordered_map<int32_t, boost::optional<std::vector<std::pair<int16_t, int16_t>>>> lookup_;
+    Compare::Type cmp_;
+};
+
+/// \brief The PbiReferenceEndFilter class provides a PbiFilter-compatible
+///        filter on reference end.
+///
+/// Example: \include code/PbiReferenceEndFilter.txt
+///
+/// \sa BamRecord::ReferenceEnd
+///
+class PbiReferenceEndFilter
+    : public internal::MappedDataFilterBase<uint32_t, PbiFile::MappedField::T_END>
+{
+public:
+    /// \brief Creates a filter on reference end.
+    ///
+    /// \param[in] tEnd     value to compare on
+    /// \param[in] cmp      compare type
+    ///
+    PbiReferenceEndFilter(const uint32_t tEnd, const Compare::Type cmp = Compare::EQUAL);
+};
+
+/// \brief The PbiReferenceIdFilter class provides a PbiFilter-compatible
+///        filter on reference ID.
+///
+/// Example: \include code/PbiReferenceIdFilter.txt
+///
+/// \sa BamRecord::ReferenceId
+///
+class PbiReferenceIdFilter
+    : public internal::MappedDataFilterBase<int32_t, PbiFile::MappedField::T_ID>
+{
+public:
+    /// \brief Creates a single-value reference ID filter.
+    ///
+    /// \param[in] tId  reference ID to compare on
+    /// \param[in] cmp  compare type
+    ///
+    PbiReferenceIdFilter(const int32_t tId, const Compare::Type cmp = Compare::EQUAL);
+
+    /// \brief Creates a whitelisted or blacklisted reference ID filter.
+    ///
+    /// \note If compare type is Compare::CONTAINS, accepted records will match
+    ///       at least one value from the whitelist.
+    ///       If compare type is Compare::NOT_CONTAINS, accepted records will
+    ///       match no values in the blacklist.
+    ///
+    /// \param[in] tIds reference IDs
+    /// \param[in] cmp  compare type
+    ///
+    PbiReferenceIdFilter(std::vector<int32_t> tIds, const Compare::Type cmp = Compare::CONTAINS);
+};
+
+/// \brief The PbiReferenceNameFilter class provides a PbiFilter-compatible
+///        filter on reference name.
+///
+/// Example: \include code/PbiReferenceNameFilter.txt
+///
+/// \sa BamRecord::ReferenceName
+///
+class PbiReferenceNameFilter
+{
+public:
+    /// \brief Creates a single-value reference name filter.
+    ///
+    /// \param[in] rname    reference name to compare on
+    /// \param[in] cmp      compare type
+    ///
+    PbiReferenceNameFilter(std::string rname, Compare::Type cmp = Compare::EQUAL);
+
+    /// \brief Creates a whitelisted or blacklisted reference name filter.
+    ///
+    /// \note If compare type is Compare::CONTAINS, accepted records will match
+    ///       at least one value from the whitelist.
+    ///       If compare type is Compare::NOT_CONTAINS, accepted records will
+    ///       match no values in the blacklist.
+    ///
+    /// \param[in] rnames   reference names
+    /// \param[in] cmp      compare type
+    ///
+    PbiReferenceNameFilter(std::vector<std::string> rnames,
+                           const Compare::Type cmp = Compare::CONTAINS);
+
+    /// \brief Performs the actual index lookup.
+    ///
+    /// Most client code should not need to use this method directly.
+    ///
+    bool Accepts(const PbiRawData& idx, const size_t row) const;
+
+private:
+    mutable bool initialized_ = false;
+    mutable PbiFilter subFilter_;
+    std::string rname_;
+    boost::optional<std::vector<std::string>> rnameWhitelist_;
+    Compare::Type cmp_;
+
+    // marked const so we can delay setup of filter in Accepts(), once we have
+    // access to PBI/BAM input. modified values marked mutable accordingly
+    void Initialize(const PbiRawData& idx) const;
+
+    void Validate() const;
+};
+
+/// \brief The PbiReferenceStartFilter class provides a PbiFilter-compatible
+///        filter on reference start.
+///
+/// Example: \include code/PbiReferenceStartFilter.txt
+///
+/// \sa BamRecord::ReferenceStart
+///
+class PbiReferenceStartFilter
+    : public internal::MappedDataFilterBase<uint32_t, PbiFile::MappedField::T_START>
+{
+public:
+    /// \brief Creates a filter on reference start.
+    ///
+    /// \param[in] tStart   value to compare on
+    /// \param[in] cmp      compare type
+    ///
+    PbiReferenceStartFilter(const uint32_t tStart, const Compare::Type cmp = Compare::EQUAL);
+};
+
+/// \brief The PbiZmwFilter class provides a PbiFilter-compatible filter on
+///        ZMW hole number.
+///
+/// Example: \include code/PbiZmwFilter.txt
+///
+/// \sa BamRecord::HoleNumber
+///
+class PbiZmwFilter : public internal::BasicDataFilterBase<int32_t, PbiFile::BasicField::ZMW>
+{
+public:
+    /// \brief Creates a single-value ZMW hole number filter.
+    ///
+    /// \param[in] zmw  value to compare on
+    /// \param[in] cmp  compare type
+    ///
+    PbiZmwFilter(const int32_t zmw, const Compare::Type cmp = Compare::EQUAL);
+
+    /// \brief Creates a whitelisted or blacklisted ZMW hole number filter.
+    ///
+    /// \note If compare type is Compare::CONTAINS, accepted records will match
+    ///       at least one value from the whitelist.
+    ///       If compare type is Compare::NOT_CONTAINS, accepted records will
+    ///       match no values in the blacklist.
+    ///
+    /// \param[in] zmws ZMW hole numbers
+    /// \param[in] cmp  compare type
+    ///
+    PbiZmwFilter(std::vector<int32_t> zmws, const Compare::Type cmp = Compare::CONTAINS);
+};
+
+// ----------------------------------------------
+// NOTE: modulo filtering only enabled for ZMW.
+//
+// I need to generalize more if we're going to use
+// this on more fields.
+// ----------------------------------------------
+
+enum class FilterHash
+{
+    UNSIGNED_LONG_CAST,
+    BOOST_HASH_COMBINE,
+};
+
+class PbiZmwModuloFilter
+{
+public:
+    PbiZmwModuloFilter(const uint32_t denominator, const uint32_t value,
+                       const FilterHash hashtype = FilterHash::UNSIGNED_LONG_CAST,
+                       const Compare::Type = Compare::EQUAL);
+
+    /// \brief Performs the actual index lookup.
+    ///
+    /// Most client code should not need to use this method directly.
+    ///
+    bool Accepts(const PbiRawData& idx, const size_t row) const;
+
+private:
+    uint32_t denominator_;
+    uint32_t value_;
+    FilterHash hash_;
+    Compare::Type cmp_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#include "pbbam/internal/PbiFilterTypes.inl"
+
+#endif  // PBIFILTERTYPES_H
diff --git a/include/pbbam/PbiIndexedBamReader.h b/include/pbbam/PbiIndexedBamReader.h

new file mode 100644 (file)

index 0000000..1dd8557
--- /dev/null
+++ b/include/pbbam/PbiIndexedBamReader.h
@@ -0,0 +1,124 @@
+// File Description
+/// \file PbiIndexedBamReader.h
+/// \brief Defines the PbiIndexedBamReader class.
+//
+// Author: Derek Barnett
+
+#ifndef PBIINDEXEDBAMREADER_H
+#define PBIINDEXEDBAMREADER_H
+
+#include "pbbam/Config.h"
+
+#include <string>
+
+#include "pbbam/BamFile.h"
+#include "pbbam/BamReader.h"
+#include "pbbam/PbiBasicTypes.h"
+#include "pbbam/PbiFilter.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The PbiIndexedBamReader class provides read-only iteration over %BAM
+///        records, limited to some filtering criteria.
+///
+/// The PacBio BAM index (*.pbi) is used to allow random-access operations.
+///
+class PBBAM_EXPORT PbiIndexedBamReader : public BamReader
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Constructs %BAM reader, with an initial filter.
+    ///
+    /// All reads that satisfy the filter will be available.
+    ///
+    /// \param[in] filter       PbiFilter or compatible object
+    /// \param[in] bamFilename  input %BAM filename
+    ///
+    /// \throws std::runtime_error if either file (*.bam or *.pbi) cannot be
+    ///         read
+    ///
+    PbiIndexedBamReader(PbiFilter filter, const std::string& bamFilename);
+    PbiIndexedBamReader(PbiFilter filter, const std::string& bamFilename,
+                        const std::shared_ptr<PbiRawData>& index);
+
+    /// \brief Constructs %BAM reader, with an initial filter.
+    ///
+    /// All reads that satisfy the filter will be available.
+    ///
+    /// \param[in] filter       PbiFilter or compatible object
+    /// \param[in] bamFile      input BamFile object
+    ///
+    /// \throws std::runtime_error if either file (*.bam or *.pbi) cannot be
+    ///         read
+    ///
+    PbiIndexedBamReader(PbiFilter filter, BamFile bamFile);
+    PbiIndexedBamReader(PbiFilter filter, BamFile bamFile,
+                        const std::shared_ptr<PbiRawData>& index);
+
+    /// \brief Constructs %BAM reader, with no initial filter.
+    ///
+    /// Useful for delaying either specifying the filtering criteria or
+    /// performing the PBI lookups.
+    ///
+    /// \param[in] bamFilename  input %BAM filename
+    ///
+    /// \throws std::runtime_error if either file (*.bam or *.pbi) cannot be
+    ///         read
+    ///
+    PbiIndexedBamReader(const std::string& bamFilename);
+    PbiIndexedBamReader(const std::string& bamFilename, const std::shared_ptr<PbiRawData>& index);
+
+    /// \brief Constructs %BAM reader, with no initial filter.
+    ///
+    /// Useful for delaying either specifying the filtering criteria or
+    /// performing the PBI lookups.
+    ///
+    /// \param[in] bamFile      input BamFile object
+    ///
+    /// \throws std::runtime_error if either file (*.bam or *.pbi) cannot be
+    ///         read
+    ///
+    PbiIndexedBamReader(BamFile bamFile);
+    PbiIndexedBamReader(BamFile bamFile, const std::shared_ptr<PbiRawData>& index);
+
+    ~PbiIndexedBamReader() override;
+
+    /// \}
+
+    /// \name Filtering & Index Data
+    /// \{
+
+    const BamFile& File() const;
+
+    /// \returns the current filter active on this reader
+    const PbiFilter& Filter() const;
+
+    uint32_t NumReads() const;
+
+    /// \brief Sets a new filter on the reader.
+    ///
+    /// \param[in] filter   PbiFilter or compatible object
+    /// \returns reference to this reader
+    ///
+    PbiIndexedBamReader& Filter(PbiFilter filter);
+
+    /// \returns list of index blocks (chunks of passing reads) currently in use
+    const IndexResultBlocks& IndexBlocks() const;
+
+    /// \}
+
+protected:
+    int ReadRawData(BGZF* bgzf, bam1_t* b) override;
+
+private:
+    class PbiIndexedBamReaderPrivate;
+    std::unique_ptr<PbiIndexedBamReaderPrivate> d_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // PBIINDEXEDBAMREADER_H
diff --git a/include/pbbam/PbiRawData.h b/include/pbbam/PbiRawData.h

new file mode 100644 (file)

index 0000000..e602e50
--- /dev/null
+++ b/include/pbbam/PbiRawData.h
@@ -0,0 +1,465 @@
+// File Description
+/// \file PbiRawData.h
+/// \brief Defines the classes used for working with raw PBI data.
+//
+// Author: Derek Barnett
+
+#ifndef PBIRAWDATA_H
+#define PBIRAWDATA_H
+
+#include "pbbam/Config.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "pbbam/PbiFile.h"
+
+namespace PacBio {
+namespace BAM {
+
+class BamFile;
+class BamRecord;
+class DataSet;
+
+/// \brief The PbiRawBarcodeData class represents the raw data stored in the
+///        "BarcodeData" section of the PBI index.
+///
+class PBBAM_EXPORT PbiRawBarcodeData
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Creates an empty data structure, preallocating space for a known
+    ///        number of records.
+    PbiRawBarcodeData(uint32_t numReads);
+
+    PbiRawBarcodeData() = default;
+
+    /// \}
+
+public:
+    /// \name Index Construction
+    /// \{
+
+    /// \brief Adds a record's barcode data.
+    ///
+    /// \param[in] b    %BAM record
+    ///
+    void AddRecord(const BamRecord& b);
+
+    /// \}
+
+public:
+    /// \name Raw Data Containers
+    /// \{
+
+    std::vector<int16_t> bcForward_;
+    std::vector<int16_t> bcReverse_;
+    std::vector<int8_t> bcQual_;
+
+    /// \}
+};
+
+/// \brief The PbiRawMappedData class represents the raw data stored in the
+///        "MappedData" section of the PBI index.
+///
+class PBBAM_EXPORT PbiRawMappedData
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Creates an empty data structure, preallocating space for a known
+    ///        number of records.
+    PbiRawMappedData(uint32_t numReads);
+
+    PbiRawMappedData() = default;
+
+    /// \}
+
+public:
+    /// \name Index Construction
+    /// \{
+
+    /// \brief Adds a record's mapping data.
+    ///
+    /// \param[in] b    %BAM record
+    ///
+    void AddRecord(const BamRecord& b);
+
+    /// \}
+
+public:
+    /// \name Index Data Query
+    /// \{
+
+    /// \brief Calculates the number of deleted bases for a particular record.
+    ///
+    /// Convenience method. Equivalent to:
+    /// \code{.cpp}
+    /// NumDeletedAndInsertedBasesAt(i).first;
+    /// \endcode
+    ///
+    /// \param[in] recordIndex  i-th record
+    /// \returns number of deleted bases
+    ///
+    uint32_t NumDeletedBasesAt(size_t recordIndex) const;
+
+    /// \brief Calculates the number of inserted bases for a particular record.
+    ///
+    /// Convenience method. Equivalent to:
+    /// \code{.cpp}
+    /// NumDeletedAndInsertedBasesAt(i).second;
+    /// \endcode
+    ///
+    /// \param[in] recordIndex  i-th record
+    /// \returns number of inserted bases
+    ///
+    uint32_t NumInsertedBasesAt(size_t recordIndex) const;
+
+    /// \brief Calculates the number of deleted & inserted bases for a
+    ///        particular record.
+    ///
+    /// \param[in] recordIndex  i-th record in the data set
+    /// \returns a pair consisting of (numDeletions,numInsertions)
+    ///
+    std::pair<uint32_t, uint32_t> NumDeletedAndInsertedBasesAt(size_t recordIndex) const;
+
+    /// \}
+
+public:
+    /// \name Raw Data Containers
+    /// \{
+
+    std::vector<int32_t> tId_;
+    std::vector<uint32_t> tStart_;
+    std::vector<uint32_t> tEnd_;
+    std::vector<uint32_t> aStart_;
+    std::vector<uint32_t> aEnd_;
+    std::vector<uint8_t> revStrand_;
+    std::vector<uint32_t> nM_;
+    std::vector<uint32_t> nMM_;
+    std::vector<uint8_t> mapQV_;
+
+    /// \}
+};
+
+/// \brief The PbiReferenceEntry class represents a single reference in the PBI
+///        CoordinateSorted section.
+///
+/// A reference entry consists of an associated reference ID (tId), as well as
+/// start and end indices into the %BAM or PBI.
+///
+/// \note Rows are given in the interval [start, end).
+///
+class PBBAM_EXPORT PbiReferenceEntry
+{
+public:
+    using ID = uint32_t;
+    using Row = uint32_t;
+
+public:
+    static const ID UNMAPPED_ID;
+    static const Row UNSET_ROW;
+
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Creates a default entry.
+    ///
+    /// - default ID:   PbiReferenceEntry::UNMAPPED_ID \n
+    /// - default rows: PbiReferenceEntry::UNSET_ROW
+    ///
+    PbiReferenceEntry();
+
+    /// \brief Creates a reference entry, with no rows set.
+    ///
+    /// - default rows: PbiReferenceEntry::UNSET_ROW
+    ///
+    PbiReferenceEntry(ID id);
+
+    /// \brief Creates a reference entry, with rows set.
+    ///
+    PbiReferenceEntry(ID id, Row beginRow, Row endRow);
+
+    bool operator==(const PbiReferenceEntry& other) const;
+
+    /// \}
+
+public:
+    /// \name Reference Data Members
+    /// \{
+
+    ID tId_;
+    Row beginRow_;
+    Row endRow_;
+
+    /// \}
+};
+
+/// \brief The PbiRawReferenceData class represents the raw data stored in the
+///        "CoordinateSortedData" section of the PBI index.
+///
+class PBBAM_EXPORT PbiRawReferenceData
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Creates an empty data structure, preallocating space for a
+    ///        number of references.
+    ///
+    /// This constructor is recommended as this is the safest way to ensure that
+    /// references without observed mappings are included in the final output.
+    ///
+    PbiRawReferenceData(uint32_t numRefs);
+
+    PbiRawReferenceData() = default;
+
+    /// \}
+
+public:
+    /// \name Raw Data Containers
+    /// \{
+
+    std::vector<PbiReferenceEntry> entries_;
+
+    /// \}
+};
+
+/// \brief The PbiRawBasicData class represents the raw data stored in the
+///        "BasicData" section of the PBI index.
+///
+class PBBAM_EXPORT PbiRawBasicData
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Creates an empty data structure, preallocating space for a known
+    ///        number of records.
+    PbiRawBasicData(uint32_t numReads);
+
+    PbiRawBasicData() = default;
+
+    /// \}
+
+public:
+    /// \name Index Construction
+    /// \{
+
+    /// \brief Adds a record's basic data.
+    ///
+    /// \param[in] b        %BAM record
+    /// \param[in] offset   \b virtual file offset where record begins
+    ///
+    void AddRecord(const BamRecord& b, int64_t offset);
+
+    /// \}
+
+public:
+    /// \name Raw Data Containers
+    /// \{
+
+    std::vector<int32_t> rgId_;
+    std::vector<int32_t> qStart_;
+    std::vector<int32_t> qEnd_;
+    std::vector<int32_t> holeNumber_;
+    std::vector<float> readQual_;
+    std::vector<uint8_t> ctxtFlag_;
+    std::vector<int64_t> fileOffset_;
+    std::vector<uint16_t> fileNumber_;
+
+    /// \}
+};
+
+/// \brief The PbiRawData class provides a representation of raw PBI index
+///        data, used mostly for construction or I/O.
+///
+/// The PbiRawData class itself provides access to a few high-level attributes
+/// (e.g. version, number of records, etc.). The actual index data is stored
+/// in its member components:
+///     PbiRawBasicData,
+///     PbiRawMappedData,
+///     PbiRawReferenceData, &
+///     PbiRawBarcodeData .
+///
+class PBBAM_EXPORT PbiRawData
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Loads raw PBI data from a file.
+    ///
+    /// \param[in] pbiFilename      ".pbi" filename
+    ///
+    /// \throws std::runtime_error if file contents cannot be loaded properly
+    ///
+    PbiRawData(std::string pbiFilename);
+
+    /// \brief Loads raw, aggregate PBI data from a dataset.
+    ///
+    /// This constructor creates a raw index object that contains an aggregation
+    /// of index data across the dataset.
+    ///
+    /// \note ReferenceData (the per-reference table for coordinate-sorted data)
+    ///       is not currently available for the index aggregate. All other
+    ///       per-record data sections will be present.
+    ///
+    /// \param[in] dataset  DataSet object
+    ///
+    /// \throws std::runtime_error if file(s) contents cannot be loaded properly
+    ///
+    explicit PbiRawData(const DataSet& dataset);
+
+    PbiRawData() = default;
+
+    /// \}
+
+public:
+    /// \name PBI General Attributes
+    /// \{
+
+    /// \returns true if index has BarcodeData section
+    bool HasBarcodeData() const;
+
+    /// \returns true if index has MappedData section
+    bool HasMappedData() const;
+
+    /// \returns true if index has ReferenceData section
+    bool HasReferenceData() const;
+
+    /// \returns true if index has \p section
+    /// \param[in] section PbiFile::Section identifier
+    ///
+    bool HasSection(const PbiFile::Section section) const;
+
+    /// \returns index filename ("*.pbi")
+    ///
+    /// \note Returns an empty string if the underlying data was calculated in
+    ///       code or aggregated from a DataSet, rather than loaded from a
+    ///       single PBI file.
+    ///
+    std::string Filename() const;
+
+    /// \returns enum flags representing the file sections present
+    PbiFile::Sections FileSections() const;
+
+    /// \returns the number of records in the PBI(s)
+    uint32_t NumReads() const;
+
+    /// \returns the PBI file's version
+    PbiFile::VersionEnum Version() const;
+
+    /// \}
+
+public:
+    /// \name Raw Data Components
+    /// \{
+
+    /// \returns const reference to BarcodeData lookup structure
+    ///
+    /// May be empty, check result of HasBarcodeData.
+    ///
+    const PbiRawBarcodeData& BarcodeData() const;
+
+    /// \returns const reference to BasicData lookup structure
+    const PbiRawBasicData& BasicData() const;
+
+    /// \returns const reference to MappedData lookup structure
+    ///
+    /// May be empty, check result of HasMappedData.
+    ///
+    const PbiRawMappedData& MappedData() const;
+
+    /// \returns const reference to reference data lookup structure
+    ///
+    /// May be empty, check result of HasReferenceData.
+    ///
+    const PbiRawReferenceData& ReferenceData() const;
+
+    /// \}
+
+public:
+    /// \name PBI General Attributes
+    /// \{
+
+    /// \brief Sets the file section flags.
+    ///
+    /// \param[in] sections     section flags
+    /// \returns reference to this index
+    ///
+    PbiRawData& FileSections(PbiFile::Sections sections);
+
+    /// \brief Sets the number of indexed records.
+    ///
+    /// \param[in] num  number of records
+    /// \returns reference to this index
+    ///
+    PbiRawData& NumReads(uint32_t num);
+
+    /// \brief Sets PBI file version.
+    ///
+    /// \param[in] version  file version
+    /// \returns reference to this index
+    ///
+    PbiRawData& Version(PbiFile::VersionEnum version);
+
+    /// \}
+
+public:
+    /// \name Raw Data Components
+    /// \{
+
+    /// \returns reference to BarcodeData lookup structure
+    ///
+    /// May be empty, check result of HasBarcodeData.
+    ///
+    PbiRawBarcodeData& BarcodeData();
+
+    /// \returns reference to BasicData lookup structure
+    PbiRawBasicData& BasicData();
+
+    /// \returns reference to MappedData lookup structure
+    ///
+    /// May be empty, check result of HasMappedData.
+    ///
+    PbiRawMappedData& MappedData();
+
+    /// \returns reference to reference data lookup structure
+    ///
+    /// May be empty, check result of HasReferenceData.
+    ///
+    PbiRawReferenceData& ReferenceData();
+
+    /// \}
+
+private:
+    std::string filename_;
+    PbiFile::VersionEnum version_ = PbiFile::CurrentVersion;
+    PbiFile::Sections sections_ = PbiFile::ALL;
+    uint32_t numReads_ = 0;
+    PbiRawBarcodeData barcodeData_;
+    PbiRawMappedData mappedData_;
+    PbiRawReferenceData referenceData_;
+    PbiRawBasicData basicData_;
+};
+
+// PBI index caching
+
+using PbiIndexCache = std::shared_ptr<std::vector<std::shared_ptr<PbiRawData>>>;
+
+PbiIndexCache MakePbiIndexCache(const DataSet& dataset);
+PbiIndexCache MakePbiIndexCache(const std::vector<BamFile>&);
+PbiIndexCache MakePbiIndexCache(const BamFile& bamFile);
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // PBIRAWDATA_H
diff --git a/include/pbbam/Position.h b/include/pbbam/Position.h

new file mode 100644 (file)

index 0000000..6fce22e
--- /dev/null
+++ b/include/pbbam/Position.h
@@ -0,0 +1,35 @@
+// File Description
+/// \file Position.h
+/// \brief Defines the Position typedef.
+//
+// Author: Derek Barnett
+
+#ifndef POSITION_H
+#define POSITION_H
+
+#include "pbbam/Config.h"
+
+#include <cstdint>
+
+#include <pbcopper/data/Position.h>
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief This type is used to refer to genomic positions.
+/// \typedef typedef int32_t PacBio::BAM::Position
+///
+/// We use a signed integer because SAM/BAM uses the -1 value to indicate
+/// unknown or unmapped positions.
+///
+using Position PBBAM_DEPRECATED = PacBio::Data::Position;
+
+/// \brief This constant is widely used as a "missing" or "invalid" position
+///        marker.
+///
+PBBAM_DEPRECATED constexpr Position UnmappedPosition{-1};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // POSITION_H
diff --git a/include/pbbam/ProgramInfo.h b/include/pbbam/ProgramInfo.h

new file mode 100644 (file)

index 0000000..f7101da
--- /dev/null
+++ b/include/pbbam/ProgramInfo.h
@@ -0,0 +1,178 @@
+// File Description
+/// \file ProgramInfo.h
+/// \brief Defines the ProgramInfo class.
+//
+// Author: Derek Barnett
+
+#ifndef PROGRAMINFO_H
+#define PROGRAMINFO_H
+
+#include "pbbam/Config.h"
+
+#include <map>
+#include <string>
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The ProgramInfo class represents a program entry (\@PG) in the SAM
+///        header.
+///
+class PBBAM_EXPORT ProgramInfo
+{
+public:
+    /// \name Conversion & Validation
+    /// \{
+
+    /// \brief Creates a ProgramInfo object from SAM-formatted text.
+    ///
+    /// \param[in] sam  SAM-formatted text
+    /// \returns program info object
+    ///
+    static ProgramInfo FromSam(const std::string& sam);
+
+    /// \brief Converts a ProgramInfo object to its SAM-formatted text.
+    ///
+    /// \param[in] prog     input ProgramInfo object
+    /// \returns SAM-formatted text (no trailing newline)
+    ///
+    static std::string ToSam(const ProgramInfo& prog);
+
+    /// \}
+
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Creates a program info object with an ID.
+    ///
+    /// \param[in] id       program ID (\@PG:ID)
+    ///
+    ProgramInfo(std::string id);
+
+    ProgramInfo() = default;
+
+    /// \}
+
+public:
+    /// \name Conversion & Validation
+    /// \{
+
+    /// \returns true if program info is valid
+    ///
+    /// Currently this checks to see that ProgramInfo::Id does not contain an
+    /// empty string.
+    ///
+    bool IsValid() const;
+
+    /// \brief Converts this object to its SAM-formatted text.
+    ///
+    /// \returns SAM-formatted text (no trailing newline)
+    ///
+    std::string ToSam() const;
+
+    /// \}
+
+public:
+    /// \name Attributes
+    /// \{
+
+    /// \returns string value of \@PG:CL
+    std::string CommandLine() const;
+
+    /// \returns any non-standard tags added to the \@PG entry
+    ///
+    /// Result map consists of {tagName => value}.
+    ///
+    std::map<std::string, std::string> CustomTags() const;
+
+    /// \returns string value of \@PG:DS
+    std::string Description() const;
+
+    /// \returns string value of \@PG:ID
+    std::string Id() const;
+
+    /// \returns string value of \@PG:PN
+    std::string Name() const;
+
+    /// \returns string value of \@PG:PP
+    std::string PreviousProgramId() const;
+
+    /// \returns string value of \@PG:VN
+    std::string Version() const;
+
+    /// \}
+
+public:
+    /// \name Attributes
+    /// \{
+
+    /// \brief Sets the value for \@PG:CL
+    ///
+    /// \param[in] cmd      new value
+    /// \returns reference to this object
+    ///
+    ProgramInfo& CommandLine(std::string cmd);
+
+    /// \brief Sets a new collection of non-standard tags.
+    ///
+    /// Custom tag map entries should consist of {tagName => value}.
+    ///
+    /// \param[in] custom      new tags
+    /// \returns reference to this object
+    ///
+    ProgramInfo& CustomTags(std::map<std::string, std::string> custom);
+
+    /// \brief Sets the value for \@PG:DS
+    ///
+    /// \param[in] description      new value
+    /// \returns reference to this object
+    ///
+    ProgramInfo& Description(std::string description);
+
+    /// \brief Sets the value for \@PG:ID
+    ///
+    /// \param[in] id      new value
+    /// \returns reference to this object
+    ///
+    ProgramInfo& Id(std::string id);
+
+    /// \brief Sets the value for \@PG:PN
+    ///
+    /// \param[in] name      new value
+    /// \returns reference to this object
+    ///
+    ProgramInfo& Name(std::string name);
+
+    /// \brief Sets the value for \@PG:PP
+    ///
+    /// \param[in] id      new value
+    /// \returns reference to this object
+    ///
+    ProgramInfo& PreviousProgramId(std::string id);
+
+    /// \brief Sets the value for \@PG:VN
+    ///
+    /// \param[in] version      new value
+    /// \returns reference to this object
+    ///
+    ProgramInfo& Version(std::string version);
+
+    /// \}
+
+private:
+    std::string commandLine_;        // CL:<CommandLine>
+    std::string description_;        // DS:<Description>
+    std::string id_;                 // ID:<ID>  * must be unique for valid SAM *
+    std::string name_;               // PN:<Name>
+    std::string previousProgramId_;  // PP:<PreviousProgramID>
+    std::string version_;            // VN:<Version>
+
+    // custom attributes
+    std::map<std::string, std::string> custom_;  // tag => value
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // PROGRAMINFO_H
diff --git a/include/pbbam/PulseBehavior.h b/include/pbbam/PulseBehavior.h

new file mode 100644 (file)

index 0000000..b01fe50
--- /dev/null
+++ b/include/pbbam/PulseBehavior.h
@@ -0,0 +1,27 @@
+// File Description
+/// \file PulseBehavior.h
+/// \brief Defines the PulseBehavior enum.
+//
+// Author: Derek Barnett
+
+#ifndef PULSEBEHAVIOR_H
+#define PULSEBEHAVIOR_H
+
+#include "pbbam/Config.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief This enum defines the pulsecall modes supported by BamRecord tag
+///        accessors.
+///
+enum class PulseBehavior
+{
+    BASECALLS_ONLY,  ///< "Squashed" pulses not included, only basecalls.
+    ALL              ///< All pulses included.
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // PULSEBEHAVIOR_H
diff --git a/include/pbbam/PulseExclusionReason.h b/include/pbbam/PulseExclusionReason.h

new file mode 100644 (file)

index 0000000..c5a6ad3
--- /dev/null
+++ b/include/pbbam/PulseExclusionReason.h
@@ -0,0 +1,30 @@
+// File Description
+/// \file PulseExclusionReason.h
+/// \brief Defines the PulseExclusionReason enum.
+//
+// Author: Derek Barnett
+
+#ifndef PULSE_EXCLUSION_REASON_H
+#define PULSE_EXCLUSION_REASON_H
+
+#include "pbbam/Config.h"
+
+#include <cstdint>
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief This enum defines the possible pulse exclusion reasons
+///
+enum class PulseExclusionReason : uint8_t
+{
+    BASE = 0,
+    SHORT_PULSE,
+    BURST,
+    PAUSE
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // PULSE_EXCLUSION_REASON_H
diff --git a/include/pbbam/QNameQuery.h b/include/pbbam/QNameQuery.h

new file mode 100644 (file)

index 0000000..50ebda8
--- /dev/null
+++ b/include/pbbam/QNameQuery.h
@@ -0,0 +1,60 @@
+// File Description
+/// \file QNameQuery.h
+/// \brief Defines the QNameQuery class.
+//
+// Author: Derek Barnett
+
+#ifndef QNAMEQUERY_H
+#define QNAMEQUERY_H
+
+#include "pbbam/Config.h"
+
+#include <memory>
+
+#include "pbbam/internal/QueryBase.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The QNameQuery class provides iterable access to a DataSet's records,
+///        with each iteration of the query returning a contiguous block of
+///        records that share a name.
+///
+/// There is no random-access here. It is simply a sequential read-through,
+/// grouping contiguous results that share a BamRecord::FullName.
+///
+/// \note The name is not ideal - but for legacy reasons, it will remain as-is
+///       for now. It will likely become something more explicit, like
+///       "SequentialQNameGroupQuery", so that the name "QNameQuery" will be
+///       available for a built-in query on a QNAME filter (or whitelist). This
+///       will make it more consistent with other queries (ReadAccuracyQuery,
+///       SubreadLengthQuery, ZmwQuery, etc).
+///
+class PBBAM_EXPORT QNameQuery : public internal::IGroupQuery
+{
+public:
+    /// \brief Creates a new QNameQuery.
+    ///
+    /// \param[in] dataset      input data source(s)
+    ///
+    /// \throws std::runtime_error on failure to open/read underlying %BAM files
+    ///
+    QNameQuery(const DataSet& dataset);
+    ~QNameQuery() override;
+
+    /// \brief Main iteration point for record access.
+    ///
+    /// Most client code should not need to use this method directly. Use
+    /// iterators instead.
+    ///
+    bool GetNext(std::vector<BamRecord>& records) override;
+
+private:
+    class QNameQueryPrivate;
+    std::unique_ptr<QNameQueryPrivate> d_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // QNAMEQUERY_H
diff --git a/include/pbbam/QualityValue.h b/include/pbbam/QualityValue.h

new file mode 100644 (file)

index 0000000..14b9e5b
--- /dev/null
+++ b/include/pbbam/QualityValue.h
@@ -0,0 +1,26 @@
+// File Description
+/// \file QualityValue.h
+/// \brief Defines the QualityValue class.
+//
+// Author: Derek Barnett
+
+#ifndef QUALITYVALUE_H
+#define QUALITYVALUE_H
+
+#include "pbbam/Config.h"
+
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include <pbcopper/data/QualityValue.h>
+
+namespace PacBio {
+namespace BAM {
+
+using QualityValue PBBAM_DEPRECATED = PacBio::Data::QualityValue;
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // QUALITYVALUE_H
diff --git a/include/pbbam/QualityValues.h b/include/pbbam/QualityValues.h

new file mode 100644 (file)

index 0000000..5158244
--- /dev/null
+++ b/include/pbbam/QualityValues.h
@@ -0,0 +1,28 @@
+// File Description
+/// \file QualityValues.h
+/// \brief Defines the QualityValues class.
+//
+// Author: Derek Barnett
+
+#ifndef QUALITYVALUES_H
+#define QUALITYVALUES_H
+
+#include "pbbam/Config.h"
+
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include <pbcopper/data/QualityValues.h>
+
+#include "pbbam/QualityValue.h"
+
+namespace PacBio {
+namespace BAM {
+
+using QualityValues PBBAM_DEPRECATED = PacBio::Data::QualityValues;
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // QUALITYVALUES_H
diff --git a/include/pbbam/ReadAccuracyQuery.h b/include/pbbam/ReadAccuracyQuery.h

new file mode 100644 (file)

index 0000000..6fcea10
--- /dev/null
+++ b/include/pbbam/ReadAccuracyQuery.h
@@ -0,0 +1,69 @@
+// File Description
+/// \file ReadAccuracyQuery.h
+/// \brief Defines the ReadAccuracyQuery class.
+//
+// Author: Derek Barnett
+
+#ifndef READACCURACYQUERY_H
+#define READACCURACYQUERY_H
+
+#include "pbbam/Config.h"
+
+#include <vector>
+
+#include "pbbam/Accuracy.h"
+#include "pbbam/Compare.h"
+#include "pbbam/internal/QueryBase.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The ReadAccuracyQuery class provides iterable access to a DataSet's
+///        %BAM records, limiting results to those matching a read accuracy
+///        criterion.
+///
+/// Example:
+/// \include code/ReadAccuracyQuery.txt
+///
+/// \note Currently, all %BAM files must have a corresponding ".pbi" index file.
+///       Use BamFile::EnsurePacBioIndexExists before creating the query if one
+///       may not be present.
+///
+class PBBAM_EXPORT ReadAccuracyQuery : public internal::IQuery
+{
+public:
+    /// \brief Creates a new ReadAccuracyQuery, limiting record results to only
+    ///        those matching a read accuracy criterion.
+    ///
+    /// \param[in] accuracy     read accuracy value
+    /// \param[in] compareType  compare operator
+    /// \param[in] dataset      input data source(s)
+    ///
+    /// \sa BamRecord::ReadAccuracy
+    ///
+    /// \throws std::runtime_error on failure to open/read underlying %BAM or PBI
+    ///         files.
+    ///
+    ReadAccuracyQuery(const Accuracy accuracy, const Compare::Type compareType,
+                      const DataSet& dataset);
+
+    ~ReadAccuracyQuery() override;
+
+    /// \brief Main iteration point for record access.
+    ///
+    /// Most client code should not need to use this method directly. Use
+    /// iterators instead.
+    ///
+    bool GetNext(BamRecord& r) override;
+
+    uint32_t NumReads() const;
+
+private:
+    class ReadAccuracyQueryPrivate;
+    std::unique_ptr<ReadAccuracyQueryPrivate> d_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // READACCURACYQUERY_H
diff --git a/include/pbbam/ReadGroupInfo.h b/include/pbbam/ReadGroupInfo.h

new file mode 100644 (file)

index 0000000..3d86142
--- /dev/null
+++ b/include/pbbam/ReadGroupInfo.h
@@ -0,0 +1,713 @@
+// File Description
+/// \file ReadGroupInfo.h
+/// \brief Defines the ReadGroupInfo class.
+//
+// Author: Derek Barnett
+
+#ifndef READGROUPINFO_H
+#define READGROUPINFO_H
+
+#include "pbbam/Config.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <map>
+#include <string>
+#include <utility>
+
+#include <boost/optional.hpp>
+
+#include "pbbam/exception/InvalidSequencingChemistryException.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief This enum describes the base features that may be present in a read
+///        group's records.
+///
+/// This information is stored in its description (\@RG:DS).
+///
+enum class BaseFeature
+{
+    DELETION_QV,
+    DELETION_TAG,
+    INSERTION_QV,
+    MERGE_QV,
+    SUBSTITUTION_QV,
+    SUBSTITUTION_TAG,
+    IPD,
+    PULSE_WIDTH,
+    PKMID,
+    PKMEAN,
+    PKMID2,
+    PKMEAN2,
+    LABEL,
+    LABEL_QV,
+    ALT_LABEL,
+    ALT_LABEL_QV,
+    PULSE_MERGE_QV,
+    PULSE_CALL,
+    PRE_PULSE_FRAMES,
+    PULSE_CALL_WIDTH,
+    START_FRAME,
+    PULSE_EXCLUSION
+};
+
+/// \brief This enum describes the encoding types used for frame data within a
+///        read group's records.
+///
+/// This information is stored in its description (\@RG:DS).
+///
+enum class FrameCodec
+{
+    RAW,
+    V1
+};
+
+/// \brief This enum describes the experimental design of the barcodes within a
+///        read group's records.
+///
+/// This information is stored in its description (\@RG:DS).
+///
+enum class BarcodeModeType
+{
+    NONE,
+    SYMMETRIC,
+    ASYMMETRIC,
+    TAILED
+};
+
+/// \brief This enum describes the type of value encoded by barcode quality,
+///        within a read group's records.
+///
+/// This information is stored in its description (\@RG:DS).
+///
+enum class BarcodeQualityType
+{
+    NONE,
+    SCORE,
+    PROBABILITY
+};
+
+/// \brief This enum describes the instrument type / platform model,
+///        within a read group's records.
+///
+/// This information is stored in its platform model tag (\@RG:PM).
+///
+enum class PlatformModelType
+{
+    ASTRO,
+    RS,
+    SEQUEL,
+    SEQUELII
+};
+
+/// \brief The ReadGroupInfo class represents a read group entry (\@RG) in the
+///        SAM header.
+///
+class PBBAM_EXPORT ReadGroupInfo
+{
+public:
+    /// \name Conversion & Validation
+    /// \{
+
+    /// \brief Creates a ReadGroupInfo object from SAM-formatted text.
+    ///
+    /// \param[in] sam  SAM-formatted text
+    /// \returns read group info object
+    ///
+    static ReadGroupInfo FromSam(const std::string& sam);
+
+    /// \brief Converts a ReadGroupInfo object to its SAM-formatted text.
+    ///
+    /// \param[in] rg     input ReadGroupInfo object
+    /// \returns SAM-formatted text (no trailing newline)
+    ///
+    static std::string ToSam(const ReadGroupInfo& rg);
+
+    ///
+    /// \brief Extracts the base portion of a read group ID.
+    ///
+    /// \param[in] id   read group ID string (barcode labels optional)
+    /// \return the hash portion only of a read group ID, with (optional)
+    ///         barcode labels removed
+    ///
+    /// \sa ReadGroupInfo::BaseId
+    ///
+    static std::string GetBaseId(const std::string& id);
+
+    /// \brief Converts a read group ID (string) to its numeric value.
+    ///
+    /// \note Accepts the optional barcode-labeled IDs. These will be stripped
+    ///       and number calculated from the base value.
+    ///
+    /// \param[in] rgId     read group ID string
+    /// \returns numeric value of ID
+    ///
+    static int32_t IdToInt(const std::string& rgId);
+
+    /// \brief Converts a read group ID number to its string representation.
+    ///
+    /// \param[in] id     read group ID number
+    /// \returns hexadecimal string representation of ID
+    ///
+    static std::string IntToId(const int32_t id);
+
+    /// \returns sequencing chemistry from (bindingKit, sequencingKit,
+    ///          basecallerVersion)
+    ///
+    static std::string SequencingChemistryFromTriple(const std::string& bindingKit,
+                                                     const std::string& sequencingKit,
+                                                     const std::string& basecallerVersion);
+
+    /// \}
+
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Creates an empty read group of UNKNOWN read type.
+    ReadGroupInfo();
+
+    /// \brief Creates a read group info object with an ID.
+    ///
+    /// \note \p id can be a "standard" ID or contain barcode labels.
+    ///
+    /// \param[in] id   string representation of read group ID
+    ///
+    ReadGroupInfo(std::string id);
+
+    /// \brief Creates a read group info object from a movie name & read type.
+    ///
+    /// \param[in] movieName    sequencing movie name
+    /// \param[in] readType     string version of record type
+    ///
+    /// \sa RecordType
+    ///
+    ReadGroupInfo(std::string movieName, std::string readType);
+
+    /// \brief Creates a read group info object from a movie name, read type,
+    ///        and platform model.
+    ///
+    /// \param[in] movieName    sequencing movie name
+    /// \param[in] readType     string version of record type
+    /// \param[in] platform     platform model type
+    ///
+    /// \sa RecordType
+    ///
+    ReadGroupInfo(std::string movieName, std::string readType, PlatformModelType platform);
+
+    /// \brief Creates a read group info object with an ID.
+    ///
+    /// \param[in] baseId       string representation of numeric read group ID
+    /// \param[in] barcodes     barcode pair for this read group
+    ///
+    ReadGroupInfo(std::string baseId, std::pair<uint16_t, uint16_t> barcodes);
+
+    /// \brief Creates a read group info object from a movie name & read type.
+    ///
+    /// \param[in] movieName    sequencing movie name
+    /// \param[in] readType     string version of record type
+    /// \param[in] barcodes     barcode pair for this read group
+    ///
+    /// \sa RecordType
+    ///
+    ReadGroupInfo(std::string movieName, std::string readType,
+                  std::pair<uint16_t, uint16_t> barcodes);
+
+    /// \brief Creates a read group info object from a movie name, read type,
+    ///        and platform model.
+    ///
+    /// \param[in] movieName    sequencing movie name
+    /// \param[in] readType     string version of record type
+    /// \param[in] platform     platform model type
+    /// \param[in] barcodes     barcode pair for this read group
+    ///
+    /// \sa RecordType
+    ///
+    ReadGroupInfo(std::string movieName, std::string readType, PlatformModelType platform,
+                  std::pair<uint16_t, uint16_t> barcodes);
+
+    /// \}
+
+public:
+    /// \name Comparison Operators
+    /// \{
+
+    bool operator==(const ReadGroupInfo& other) const;
+
+    /// Enable sort on RG:ID
+    bool operator<(const ReadGroupInfo& other) const;
+
+    /// \}
+
+public:
+    /// \name Conversion & Validation
+    /// \{
+
+    /// \returns true if read group info is valid
+    ///
+    /// Currently this checks to see that ReadGroupInfo::Id does not contain an
+    /// empty string.
+    ///
+    bool IsValid() const;
+
+    /// \brief Converts this object to its SAM-formatted text.
+    ///
+    /// \returns SAM-formatted text (no trailing newline)
+    ///
+    std::string ToSam() const;
+
+    /// \}
+
+public:
+    /// \name Attributes
+    /// \{
+
+    /// \returns the number of barcode sequences in BarcodeFile, as stored in
+    ///          the description tag (\@RG:DS)
+    ///
+    /// \throws std::runtime_error if barcode data not set.
+    ///         Check HasBarcodeData if this data may be absent.
+    ///
+    size_t BarcodeCount() const;
+
+    /// \returns name of FASTA file containing barcode sequences, as stored in
+    ///          the description tag (\@RG:DS)
+    ///
+    /// \throws std::runtime_error if barcode data not set.
+    ///         Check HasBarcodeData if this data may be absent.
+    ///
+    std::string BarcodeFile() const;
+
+    /// \returns MD5 hash of the contents of BarcodeFile, as stored in the
+    ///          description tag (\@RG:DS)
+    ///
+    /// \throws std::runtime_error if barcode data not set.
+    ///         Check HasBarcodeData if this data may be absent.
+    ///
+    std::string BarcodeHash() const;
+
+    /// \returns experimental design type of barcodes, as stored in the
+    ///          description tag (\@RG:DS)
+    ///
+    /// \throws std::runtime_error if barcode data not set.
+    ///         Check HasBarcodeData if this data may be absent.
+    ///
+    BarcodeModeType BarcodeMode() const;
+
+    /// \returns type of value encoded by the 'bq' tag, as stored in the
+    ///          description tag (\@RG:DS)
+    ///
+    /// \throws std::runtime_error if barcode data is not set.
+    ///         Check HasBarcodeData if this data may be absent.
+    ///
+    BarcodeQualityType BarcodeQuality() const;
+
+    /// \returns barcode pair stored in the read group ID (\@RG:ID)
+    ///
+    /// \note This does **NOT** refer to any data in the description (DS) tag.
+    ///
+    boost::optional<std::pair<uint16_t, uint16_t>> Barcodes() const;
+
+    /// \returns forward barcode label stored in the read group ID (\@RG:ID)
+    ///
+    /// \note This does **NOT** refer to any data in the description (DS) tag.
+    ///
+    boost::optional<uint16_t> BarcodeForward() const;
+
+    /// \returns reverse barcode label stored in the read group ID (\@RG:ID)
+    ///
+    /// \note This does **NOT** refer to any data in the description (DS) tag.
+    ///
+    boost::optional<uint16_t> BarcodeReverse() const;
+
+    /// \returns basecaller version number (e.g. "2.1")
+    std::string BasecallerVersion() const;
+
+    /// \returns tag name in use for the specified base feature
+    std::string BaseFeatureTag(BaseFeature feature) const;
+
+    /// \returns the hash portion only of a read group ID, with (optional)
+    ///          barcode labels removed
+    ///
+    /// For most read groups (without barcode labels), this will be the same as
+    /// Id(). However, for those read groups with barcode labels, this method
+    /// will return the ID without those labels.
+    ///
+    /// Id() should be preferred over this method in most cases. This is
+    /// intended for use with hash-string or integers directly.
+    ///
+    /// For "ID:12345678":
+    ///     rg.Id()     -> "12345678"
+    ///     rg.BaseId() -> "12345678"
+    ///
+    /// For "ID:12345678/0--0":
+    ///     rg.Id()     -> "12345678/0--0"
+    ///     rg.BaseId() -> "12345678"
+    ///
+    /// \sa Id
+    ///
+    std::string BaseId() const;
+
+    /// \returns binding kit part number (e.g. "100236500")
+    std::string BindingKit() const;
+
+    /// \returns true if reads are classified as spike-in controls
+    bool Control() const;
+
+    /// \returns any non-standard tags added to the \@RG entry
+    ///
+    /// Result map consists of {tagName => value}.
+    ///
+    std::map<std::string, std::string> CustomTags() const;
+
+    /// \returns string value of \@RG:DT
+    std::string Date() const;
+
+    /// \returns string value of \@RG:FO
+    std::string FlowOrder() const;
+
+    /// \returns frame rate in Hz
+    std::string FrameRateHz() const;
+
+    /// \returns true if the read group description (\@RG:DS) contains barcode data
+    ///
+    /// \note This does **NOT** refer to the optional barcode labels.
+    ///
+    bool HasBarcodeData() const;
+
+    /// \returns true if read group has an entry for the specified base feature
+    bool HasBaseFeature(BaseFeature feature) const;
+
+    /// \returns full string value of \@RG:ID, including any optional barcode
+    ///          labels that are present
+    ///
+    /// This method should be preferred over BaseId() in most cases,
+    /// e.g. mapping between header info.
+    ///
+    /// For "ID:12345678":
+    ///     rg.Id()     -> "12345678"
+    ///     rg.BaseId() -> "12345678"
+    ///
+    /// For "ID:12345678/0--0":
+    ///     rg.Id()     -> "12345678/0--0"
+    ///     rg.BaseId() -> "12345678"
+    ///
+    /// \sa BaseId
+    ///
+    std::string Id() const;
+
+    /// \returns codec type in use for IPD
+    FrameCodec IpdCodec() const;
+
+    /// \returns string value of \@RG:KS
+    std::string KeySequence() const;
+
+    /// \returns string value of \@RG:LB
+    std::string Library() const;
+
+    /// \returns movie name (stored in \@RG:PU)
+    std::string MovieName() const;
+
+    /// \returns string value of \@RG:PL
+    std::string Platform() const;
+
+    /// \returns platform model type (stored in \@RG:PM)
+    PlatformModelType PlatformModel() const;
+
+    /// \returns string value of \@RG:PI
+    std::string PredictedInsertSize() const;
+
+    /// \returns string value of \@RG:PG
+    std::string Programs() const;
+
+    /// \returns codec type in use for PulseWidth
+    FrameCodec PulseWidthCodec() const;
+
+    /// \returns string value of read type
+    std::string ReadType() const;
+
+    /// \returns string value of \@RG:SM
+    std::string Sample() const;
+
+    /// \returns string value of \@RG:CN
+    std::string SequencingCenter() const;
+
+    /// \returns sequencing chemistry name
+    std::string SequencingChemistry() const;
+
+    /// \returns sequencing kit part number
+    std::string SequencingKit() const;
+
+    /// \}
+
+public:
+    /// \name Attributes
+    /// \{
+
+    /// \brief Sets barcode data for the read group's description tag.
+    ///
+    /// Barcode fields are either absent or all must be present.
+    ///
+    /// \param[in] barcodeFile      barcode filename
+    /// \param[in] barcodeHash      MD5 hash of barcode file
+    /// \param[in] barcodeCount     number of records in barcode file
+    /// \param[in] barcodeMode      experimental design of barcodes
+    /// \param[in] barcodeQuality   type of barcode quality value
+    ///
+    /// \sa BarcodeFile \n
+    ///     BarcodeHash \n
+    ///     BarcodeCount \n
+    ///     BarcodeMode \n
+    ///     BarcodeQuality \n
+    ///     ReadGroupInfo::ClearBarcodeData
+    ///
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& BarcodeData(std::string barcodeFile, std::string barcodeHash,
+                               size_t barcodeCount, BarcodeModeType barcodeMode,
+                               BarcodeQualityType barcodeQuality);
+
+    /// \brief Sets the basecaller version number.
+    ///
+    /// \param[in] versionNumber   new value
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& BasecallerVersion(std::string versionNumber);
+
+    /// \brief Sets the tag to be used for a particular base feature.
+    ///
+    /// \param[in] feature      feature type being updated
+    /// \param[in] tag          new value
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& BaseFeatureTag(BaseFeature feature, std::string tag);
+
+    /// \brief Sets the binding kit part number.
+    ///
+    /// \param[in] kitNumber    new value
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& BindingKit(std::string kitNumber);
+
+    /// \brief Removes all barcode data from this read group.
+    ///
+    /// \returns reference to this read group
+    ///
+    ReadGroupInfo& ClearBarcodeData();
+
+    /// \brief Removes all base features from this read group.
+    ///
+    /// \returns reference to this read group
+    ///
+    ReadGroupInfo& ClearBaseFeatures();
+
+    /// \brief Sets whether read group's records are classified as spike-in
+    ///        controls.
+    ///
+    /// \param[in] ctrl     true if records are spike-in controls
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& Control(bool ctrl);
+
+    /// \brief Sets a new collection of non-standard tags.
+    ///
+    /// Custom tag map entries should consist of {tagName => value}.
+    ///
+    /// \param[in] custom      new tags
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& CustomTags(std::map<std::string, std::string> custom);
+
+    /// \brief Sets the value for \@RG:DT
+    ///
+    /// \param[in] date      new value
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& Date(std::string date);
+
+    /// \brief Sets the value for \@RG:FO
+    ///
+    /// \param[in] order     new value
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& FlowOrder(std::string order);
+
+    /// \brief Sets the frame rate.
+    ///
+    /// \param[in] frameRateHz     string value of frame rate in Hz
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& FrameRateHz(std::string frameRateHz);
+
+    /// \brief Sets the read group's ID.
+    ///
+    /// \param[in] id     string value of ID
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& Id(std::string id);
+
+    /// \brief Sets the read group's ID, from movie name & read type
+    ///
+    /// \param[in] movieName    sequencing movie name
+    /// \param[in] readType     string version of read type
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& Id(const std::string& movieName, const std::string& readType);
+
+    /// \brief Sets the codec type used for IPD
+    ///
+    /// \param[in] codec    codec type
+    /// \param[in] tag      IPD tag
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& IpdCodec(FrameCodec codec, std::string tag = std::string());
+
+    /// \brief Sets the value for \@RG:KS
+    ///
+    /// \param[in] sequence      new value
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& KeySequence(std::string sequence);
+
+    /// \brief Sets the value for \@RG:LB
+    ///
+    /// \param[in] library      new value
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& Library(std::string library);
+
+    /// \brief Sets the value for movie name (stored in \@RG:PU).
+    ///
+    /// \param[in] movieName    new value
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& MovieName(std::string movieName);
+
+    /// \brief Sets the value for \@RG:PI
+    ///
+    /// \param[in] size         new value
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& PredictedInsertSize(std::string size);
+
+    /// \brief Sets the value for \@RG:PG
+    ///
+    /// \param[in] programs     new value
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& Programs(std::string programs);
+
+    /// \brief Sets the value for \@RG:PM
+    ///
+    /// \param[in] platform     new value
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& PlatformModel(PlatformModelType platform);
+
+    /// \brief Sets the codec type used for PulseWidth
+    ///
+    /// \param[in] codec    codec type
+    /// \param[in] tag      pulse width tag
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& PulseWidthCodec(FrameCodec codec, std::string tag = std::string());
+
+    /// \brief Sets the read type.
+    ///
+    /// \param[in] type    new value
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& ReadType(std::string type);
+
+    /// \brief Removes a particular base feature from this read group.
+    ///
+    /// \param[in] feature      feature to remove
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& RemoveBaseFeature(BaseFeature feature);
+
+    /// \brief Sets the value for \@RG:SM
+    ///
+    /// \param[in] sample       new value
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& Sample(std::string sample);
+
+    /// \brief Sets the value for \@RG:CN
+    ///
+    /// \param[in] center       new value
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& SequencingCenter(std::string center);
+
+    /// \brief Sets the sequencing kit part number.
+    ///
+    /// \param[in] kitNumber    new value
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& SequencingKit(std::string kitNumber);
+
+    /// \}
+
+private:
+    std::string id_;                   // ID * must be unique for valid SAM *
+    std::string sequencingCenter_;     // CN
+    std::string date_;                 // DT * (ISO-8601) *
+    std::string flowOrder_;            // FO
+    std::string keySequence_;          // KS
+    std::string library_;              // LB
+    std::string programs_;             // PG
+    std::string predictedInsertSize_;  // PI
+    std::string movieName_;            // PU
+    std::string sample_;               // SM
+
+    PlatformModelType platformModel_ = PlatformModelType::SEQUEL;  // PM
+
+    // DS:<Description> components
+    std::string readType_;
+    std::string bindingKit_;
+    std::string sequencingKit_;
+    std::string basecallerVersion_;
+    mutable std::string sequencingChemistry_;
+    std::string frameRateHz_;
+    bool control_ = false;
+    FrameCodec ipdCodec_ = FrameCodec::V1;
+    FrameCodec pulseWidthCodec_ = FrameCodec::V1;
+    bool hasBarcodeData_ = false;
+    std::string barcodeFile_;
+    std::string barcodeHash_;
+    size_t barcodeCount_ = 0;
+    BarcodeModeType barcodeMode_ = BarcodeModeType::NONE;
+    BarcodeQualityType barcodeQuality_ = BarcodeQualityType::NONE;
+    std::map<BaseFeature, std::string> features_;
+
+    // (optional) barcode label handling
+    boost::optional<std::pair<uint16_t, uint16_t>> barcodes_ = boost::none;
+    std::string baseId_;
+
+    // custom attributes
+    std::map<std::string, std::string> custom_;  // tag => value
+
+private:
+    std::string EncodeSamDescription() const;
+    void DecodeSamDescription(const std::string& description);
+    void DecodeBarcodeKey(const std::string& key, std::string value);
+    void DecodeFrameCodecKey(const std::string& key, std::string value);
+};
+
+/// \brief Creates a read group ID from a movie name & read type.
+///
+/// \param[in] movieName    sequencing movie name
+/// \param[in] readType     string version of read type
+///
+/// \returns hexadecimal string read group ID
+///
+PBBAM_EXPORT
+std::string MakeReadGroupId(const std::string& movieName, const std::string& readType);
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // READGROUPINFO_H
diff --git a/include/pbbam/RecordType.h b/include/pbbam/RecordType.h

new file mode 100644 (file)

index 0000000..856d831
--- /dev/null
+++ b/include/pbbam/RecordType.h
@@ -0,0 +1,56 @@
+// File Description
+/// \file RecordType.h
+/// \brief Defines the RecordType enum.
+//
+// Author: Derek Barnett
+
+#ifndef RECORDTYPE_H
+#define RECORDTYPE_H
+
+#include "pbbam/Config.h"
+
+#include <string>
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief This enum defines the possible PacBio BAM record types.
+///
+/// \sa ReadGroupInfo::ReadType
+///
+enum class RecordType
+{
+    ZMW,         ///< Polymerase read
+    HQREGION,    ///< High-quality region
+    SUBREAD,     ///< Subread
+    CCS,         ///< Circular consensus sequence
+    SCRAP,       ///< Additional sequence (barcodes, adapters, etc.)
+    UNKNOWN,     ///< Unknown read type
+    TRANSCRIPT,  ///< Transcript
+
+    POLYMERASE = ZMW  ///< \deprecated as of PacBio BAM spec v 3.0.4 (use RecordType::ZMW instead)
+};
+
+///
+/// \brief Checks whether a record type is CCS or Transcript.
+///
+/// CCS & Transcript type records handle queryStart/End in the same way. This
+/// status is checked in several places, so this is a convenient helper.
+///
+/// \param[in] type     record type to check
+/// \returns true if \p type is a CCS or Transcript record type
+bool IsCcsOrTranscript(const RecordType type);
+
+///
+/// \brief Returns string representation of RecordType
+///
+/// \param type
+/// \return std::string
+/// \throws std::runtime_error if type is unrecognized
+///
+std::string ToString(const RecordType type);
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // RECORDTYPE_H
diff --git a/include/pbbam/SNR.h b/include/pbbam/SNR.h

new file mode 100644 (file)

index 0000000..95a9f1c
--- /dev/null
+++ b/include/pbbam/SNR.h
@@ -0,0 +1,27 @@
+// File Description
+/// \file SNR.h
+/// \brief Defines the SNR struct
+//
+// Author: Lance Hepler, Derek Barnett
+
+#ifndef SNR_H
+#define SNR_H
+
+#include "pbbam/Config.h"
+
+#include <cstddef>
+#include <vector>
+
+#include <pbcopper/data/SNR.h>
+
+namespace PacBio {
+namespace BAM {
+
+using SNR PBBAM_DEPRECATED = PacBio::Data::SNR;
+
+PBBAM_DEPRECATED constexpr auto ClampSNR = PacBio::Data::ClampSNR;
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // SNR_H
diff --git a/include/pbbam/SamTagCodec.h b/include/pbbam/SamTagCodec.h

new file mode 100644 (file)

index 0000000..0c9ad89
--- /dev/null
+++ b/include/pbbam/SamTagCodec.h
@@ -0,0 +1,68 @@
+// File Description
+/// \file SamTagCodec.h
+/// \brief Defines the SamTagCodec class.
+//
+// Author: Derek Barnett
+
+#ifndef SAMTAGCODEC_H
+#define SAMTAGCODEC_H
+
+#include "pbbam/Config.h"
+
+#include <string>
+
+#include "pbbam/TagCollection.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The SamTagCodec class provides text-based encoding/decoding of %BAM
+///        tag data.
+///
+/// \note SamTagCodec is mostly an implementation and/or testing detail, and may
+///       be removed from the public API.
+///
+class PBBAM_EXPORT SamTagCodec
+{
+public:
+    /// \name Tag Collection Methods
+    /// \{
+
+    /// \brief Creates a TagCollection from SAM-formatted tag data.
+    ///
+    /// \param[in] tagString    SAM-formatted string
+    /// \returns resulting tag collection
+    ///
+    static TagCollection Decode(const std::string& tagString);
+
+    /// \brief Creates SAM-formatted string from a Tag.
+    ///
+    /// \param[in] name 2-character tag name
+    /// \param[in] tag  Tag instance containing data
+    ///
+    /// \return SAM-formatted string
+    ///
+    static std::string Encode(const std::string& name, const PacBio::BAM::Tag& tag);
+
+    /// \brief Creates SAM-formatted string from a TagCollection.
+    ///
+    /// \param[in] tags     TagCollection containing tag data
+    /// \returns SAM-formatted string
+    ///
+    static std::string Encode(const PacBio::BAM::TagCollection& tags);
+};
+
+///
+/// \brief creates a tag per the SAM/BAM text format
+///
+/// \param tag    tag name
+/// \param value  tag value
+///
+/// \return formatted tag string
+///
+std::string MakeSamTag(std::string tag, std::string value);
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // SAMTAGCODEC_H
diff --git a/include/pbbam/SamWriter.h b/include/pbbam/SamWriter.h

new file mode 100644 (file)

index 0000000..f0409bc
--- /dev/null
+++ b/include/pbbam/SamWriter.h
@@ -0,0 +1,96 @@
+// File Description
+/// \file SamWriter.h
+/// \brief Defines the SamWriter class.
+//
+// Author: Derek Barnett
+
+#ifndef SAMWRITER_H
+#define SAMWRITER_H
+
+#include "pbbam/Config.h"
+
+#include <memory>
+#include <string>
+
+#include "pbbam/BamHeader.h"
+#include "pbbam/BamRecord.h"
+#include "pbbam/IRecordWriter.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The SamWriter class provides a writing interface for creating
+///        new SAM files.
+///
+/// \note The underlying buffered data may not be flushed to the file until the
+///       destructor is called. Trying to access the file (reading, stat-ing,
+///       indexing, etc.) before the SamWriter is destroyed yields undefined
+///       behavior. Enclose the SamWriter in some form of local scope (curly
+///       braces, a separate function, etc.) to ensure that its destructor is
+///       called before proceeding to read-based operations.
+///
+/// \code{.cpp}
+///  {
+///     SamWriter w(...);
+///     // write data
+///  }
+///  // now safe to access the new file
+/// \endcode
+///
+///
+class SamWriter : public IRecordWriter
+{
+public:
+    /// \brief Opens a SAM file for writing & writes the header information.
+    ///
+    /// \note Set \p filename to "-" for stdout.
+    ///
+    /// \param[in] filename     path to output SAM file
+    /// \param[in] header       BamHeader object
+    ///
+    /// \throws std::runtime_error if there was a problem opening the file for
+    ///         writing or if an error occurred while writing the header
+    ///
+    SamWriter(std::string filename, const BamHeader& header);
+
+    SamWriter(SamWriter&&) noexcept;
+    SamWriter& operator=(SamWriter&&) noexcept;
+
+    /// Fully flushes all buffered data & closes file.
+    ~SamWriter() override;
+
+    /// \brief Try to flush any buffered data to file.
+    ///
+    /// \note The underlying implementation may not necessarily flush buffered
+    ///       data immediately, especially in a multithreaded writer situation.
+    ///       Let the SamWriter go out of scope to fully ensure flushing.
+    ///
+    /// \throws std::runtime_error if flush fails
+    ///
+    void TryFlush() override;
+
+    /// \brief Write a record to the output SAM file.
+    ///
+    /// \param[in] record BamRecord object
+    ///
+    /// \throws std::runtime_error on failure to write
+    ///
+    void Write(const BamRecord& record) override;
+
+    /// \brief Write a record to the output SAM file.
+    ///
+    /// \param[in] recordImpl BamRecordImpl object
+    ///
+    /// \throws std::runtime_error on failure to write
+    ///
+    void Write(const BamRecordImpl& recordImpl) override;
+
+private:
+    class SamWriterPrivate;
+    std::unique_ptr<SamWriterPrivate> d_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // SAMWRITER_H
diff --git a/include/pbbam/SequenceInfo.h b/include/pbbam/SequenceInfo.h

new file mode 100644 (file)

index 0000000..5361b01
--- /dev/null
+++ b/include/pbbam/SequenceInfo.h
@@ -0,0 +1,188 @@
+// File Description
+/// \file SequenceInfo.h
+/// \brief Defines the SequenceInfo class.
+//
+// Author: Derek Barnett
+
+#ifndef SEQUENCEINFO_H
+#define SEQUENCEINFO_H
+
+#include "pbbam/Config.h"
+
+#include <map>
+#include <string>
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The SequenceInfo class represents a sequence entry (\@SQ) in the SAM
+///        header.
+///
+class PBBAM_EXPORT SequenceInfo
+{
+public:
+    /// \name Conversion & Validation
+    /// \{
+
+    /// \brief Creates a SequenceInfo object from SAM-formatted text.
+    ///
+    /// \param[in] sam  SAM-formatted text
+    /// \returns sequence info object
+    ///
+    static SequenceInfo FromSam(const std::string& sam);
+
+    /// \brief Converts a SequenceInfo object to its SAM-formatted text.
+    ///
+    /// \param[in] seq     input SequenceInfo object
+    /// \returns SAM-formatted text (no trailing newline)
+    ///
+    static std::string ToSam(const SequenceInfo& seq);
+
+    /// \}
+
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Creates a sequence info object with name & (optional) length.
+    ///
+    /// \param[in] name       sequence name (\@SQ:SN)
+    /// \param[in] length     sequence length (\@SQ:LN)
+    ///
+    SequenceInfo(std::string name, std::string length = "0");
+
+    SequenceInfo() = default;
+
+    /// \}
+
+public:
+    /// \name Operators
+    /// \{
+
+    bool operator==(const SequenceInfo& other) const;
+    bool operator!=(const SequenceInfo& other) const;
+
+    /// \}
+
+public:
+    /// \name Conversion & Validation
+    /// \{
+
+    /// \returns true if sequence info is valid
+    ///
+    /// Currently this checks to see that Name is non-empty and Length is within
+    /// the accepted range.
+    ///
+    bool IsValid() const;
+
+    /// \brief Converts this object to its SAM-formatted text.
+    ///
+    /// \returns SAM-formatted text (no trailing newline)
+    ///
+    std::string ToSam() const;
+
+    /// \}
+
+public:
+    /// \name Attributes
+    /// \{
+
+    /// \returns string value of \@SQ:AS
+    std::string AssemblyId() const;
+
+    /// \returns string value of \@SQ:M5
+    std::string Checksum() const;
+
+    /// \returns any non-standard tags added to the \@SQ entry
+    ///
+    /// Result map consists of {tagName => value}.
+    ///
+    std::map<std::string, std::string> CustomTags() const;
+
+    /// \returns string value of \@SQ:LN
+    std::string Length() const;
+
+    /// \returns string value of \@SQ:SN
+    std::string Name() const;
+
+    /// \returns string value of \@SQ:SP
+    std::string Species() const;
+
+    /// \returns string value of \@SQ:UR
+    std::string Uri() const;
+
+    /// \}
+
+public:
+    /// \name Attributes
+    /// \{
+
+    /// \brief Sets the value for \@SQ:AS
+    ///
+    /// \param[in] id      new value
+    /// \returns reference to this object
+    ///
+    SequenceInfo& AssemblyId(std::string id);
+
+    /// \brief Sets the value for \@SQ:M5
+    ///
+    /// \param[in] checksum      new value
+    /// \returns reference to this object
+    ///
+    SequenceInfo& Checksum(std::string checksum);
+
+    /// \brief Sets a new collection of non-standard tags.
+    ///
+    /// Custom tag map entries should consist of {tagName => value}.
+    ///
+    /// \param[in] custom      new tags
+    /// \returns reference to this object
+    ///
+    SequenceInfo& CustomTags(std::map<std::string, std::string> custom);
+
+    /// \brief Sets the value for \@SQ:LN
+    ///
+    /// \param[in] length      new value
+    /// \returns reference to this object
+    ///
+    SequenceInfo& Length(std::string length);
+
+    /// \brief Sets the value for \@SQ:SN
+    ///
+    /// \param[in] name      new value
+    /// \returns reference to this object
+    ///
+    SequenceInfo& Name(std::string name);
+
+    /// \brief Sets the value for \@SQ:SP
+    ///
+    /// \param[in] species     new value
+    /// \returns reference to this object
+    ///
+    SequenceInfo& Species(std::string species);
+
+    /// \brief Sets the value for \@SQ:UR
+    ///
+    /// \param[in] uri      new value
+    /// \returns reference to this object
+    ///
+    SequenceInfo& Uri(std::string uri);
+
+    /// \}
+
+private:
+    std::string name_;        // SN:<Name>    * must be unique for valid SAM *
+    std::string length_;      // LN:<Length>  * must be within [0 - 2^31-1] *
+    std::string assemblyId_;  // AS:<AssemblyId>
+    std::string checksum_;    // M5:<Checksum>
+    std::string species_;     // SP:<Species>
+    std::string uri_;         // UR:<URI>
+
+    // custom attributes
+    std::map<std::string, std::string> custom_;  // tag => value
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // SEQUENCEINFO_H
diff --git a/include/pbbam/Strand.h b/include/pbbam/Strand.h

new file mode 100644 (file)

index 0000000..d27945f
--- /dev/null
+++ b/include/pbbam/Strand.h
@@ -0,0 +1,22 @@
+// File Description
+/// \file Strand.h
+/// \brief Defines the Strand enum.
+//
+// Author: Derek Barnett
+
+#ifndef STRAND_H
+#define STRAND_H
+
+#include "pbbam/Config.h"
+
+#include <pbcopper/data/Strand.h>
+
+namespace PacBio {
+namespace BAM {
+
+using Strand PBBAM_DEPRECATED = PacBio::Data::Strand;
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // STRAND_H
diff --git a/include/pbbam/StringUtilities.h b/include/pbbam/StringUtilities.h

new file mode 100644 (file)

index 0000000..abc90e1
--- /dev/null
+++ b/include/pbbam/StringUtilities.h
@@ -0,0 +1,44 @@
+// Author: Derek Barnett
+
+#ifndef PBBAM_STRINGUTILITIES_H
+#define PBBAM_STRINGUTILITIES_H
+
+#include "pbbam/Config.h"
+
+#include <string>
+#include <vector>
+
+namespace PacBio {
+namespace BAM {
+
+///
+/// \brief Joins tokens into a single string
+///
+/// \param tokens   input strings
+/// \param delim    delimiter character
+///
+/// \return joined string
+///
+std::string Join(const std::vector<std::string>& tokens, const char delim);
+
+/// \brief Splits a string into tokens
+///
+/// \param[in] line     input string
+/// \param[in] delim    character to split on
+///
+/// \returns vector of tokens
+///
+std::vector<std::string> Split(const std::string& line, const char delim = '\t');
+
+/// \brief Remove all whitespace from input string (start, end, & internal)
+///
+/// \param[in] input    original string
+///
+/// \returns new string with no whitespace
+///
+std::string RemoveAllWhitespace(std::string input);
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // PBBAM_STRINGUTILITIES_H
diff --git a/include/pbbam/SubreadLengthQuery.h b/include/pbbam/SubreadLengthQuery.h

new file mode 100644 (file)

index 0000000..aa17fb6
--- /dev/null
+++ b/include/pbbam/SubreadLengthQuery.h
@@ -0,0 +1,67 @@
+// File Description
+/// \file SubreadLengthQuery.h
+/// \brief Defines the SubreadLengthQuery class.
+//
+// Author: Derek Barnett
+
+#ifndef SUBREADLENGTHQUERY_H
+#define SUBREADLENGTHQUERY_H
+
+#include "pbbam/Config.h"
+
+#include <cstdint>
+#include <vector>
+
+#include "pbbam/Compare.h"
+#include "pbbam/internal/QueryBase.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The SubreadLengthQuery class provides iterable access to a DataSet's
+///        %BAM records, limiting results to those matching a subread length
+///        criterion.
+///
+/// Example:
+/// \include code/SubreadLengthQuery.txt
+///
+/// \note Currently, all %BAM files must have a corresponding ".pbi" index file.
+///       Use BamFile::EnsurePacBioIndexExists before creating the query if one
+///       may not be present.
+///
+class PBBAM_EXPORT SubreadLengthQuery : public internal::IQuery
+{
+public:
+    /// \brief Creates a new SubreadLengthQuery, limiting record results to only
+    ///        those matching a subread length criterion.
+    ///
+    /// \param[in] length       subread length value
+    /// \param[in] compareType  compare operator
+    /// \param[in] dataset      input data source(s)
+    ///
+    /// \throws std::runtime_error on failure to open/read underlying %BAM or PBI
+    ///         files.
+    ///
+    SubreadLengthQuery(const int32_t length, const Compare::Type compareType,
+                       const DataSet& dataset);
+
+    ~SubreadLengthQuery();
+
+    /// \brief Main iteration point for record access.
+    ///
+    /// Most client code should not need to use this method directly. Use
+    /// iterators instead.
+    ///
+    bool GetNext(BamRecord& r) override;
+
+    uint32_t NumReads() const;
+
+private:
+    class SubreadLengthQueryPrivate;
+    std::unique_ptr<SubreadLengthQueryPrivate> d_;  // NOTE(review): <memory> not included directly
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // SUBREADLENGTHQUERY_H
diff --git a/include/pbbam/Tag.h b/include/pbbam/Tag.h

new file mode 100644 (file)

index 0000000..79cdaf7
--- /dev/null
+++ b/include/pbbam/Tag.h
@@ -0,0 +1,413 @@
+// File Description
+/// \file Tag.h
+/// \brief Defines the Tag class.
+//
+// Author: Derek Barnett
+
+#ifndef TAG_H
+#define TAG_H
+
+#include "pbbam/Config.h"
+
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include <boost/variant.hpp>
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief This enum is used to describe the exact (C++) data type held by a
+///        Tag.
+///
+enum class TagDataType
+{
+    INVALID = 0,       ///< boost::blank
+    INT8,              ///< int8_t
+    UINT8,             ///< uint8_t
+    INT16,             ///< int16_t
+    UINT16,            ///< uint16_t
+    INT32 = 5,         ///< int32_t
+    UINT32,            ///< uint32_t
+    FLOAT,             ///< float
+    STRING,            ///< std::string
+    INT8_ARRAY,        ///< std::vector<int8_t>
+    UINT8_ARRAY = 10,  ///< std::vector<uint8_t>
+    INT16_ARRAY,       ///< std::vector<int16_t>
+    UINT16_ARRAY,      ///< std::vector<uint16_t>
+    INT32_ARRAY,       ///< std::vector<int32_t>
+    UINT32_ARRAY,      ///< std::vector<uint32_t>
+    FLOAT_ARRAY = 15   ///< std::vector<float>
+};
+
+/// \brief This enum provides additional instructions on interpreting the tag's
+///        value.
+///
+/// Some C++ data types (e.g. std::string) may represent more than one BAM tag
+/// type ('H' vs 'Z'). Thus a TagModifier may be used to indicate how to
+/// properly distinguish between these shared data types.
+///
+enum class TagModifier
+{
+    /// \brief This value indicates that the tag has no modifiers set.
+    ///
+    NONE = 0,
+
+    /// \brief This modifier marks an integer as ASCII.
+    ///
+    /// SAM/BAM has the concept of an ASCII character that is distinct from an
+    /// 8-bit integer. However, there is no such pure separation in C++ - as
+    /// int8_t/uint8_t are likely implemented as typedefs around char/unsigned
+    /// char. Thus this modifier can be used to indicate a tag's value should be
+    /// interpreted as a printable, ASCII character.
+    ///
+    ASCII_CHAR,
+
+    /// \brief This modifier marks std::string data as "hex string", rather than
+    ///        a regular string.
+    ///
+    /// SAM/BAM has a distinction between regular strings and "Hex format"
+    /// strings. However, they are both manipulated in C++ via std::string. Thus
+    /// this modifier can be used to indicate that a tag's string data should be
+    /// interpreted as "Hex format" rather than a regular, literal string.
+    ///
+    HEX_STRING
+};
+
+/// \brief The Tag class represents a SAM/BAM record tag value.
+///
+/// SAM/BAM tags may store values from a variety of types: varying fixed-width
+/// integers, strings, arrays of data, etc.
+///
+/// The Tag class allow tags to be handled in a generic fashion, while
+/// maintaining a high level of type-safety. Only those types recognized by the
+/// SAM/BAM format are allowed, and extracting the value from a tag is subject
+/// to allowed conversion rules, as well.
+///
+// Inspired by (but greatly simplified & modified from) the boost::variant
+// wrapper approach taken by DynamicCpp (https://code.google.com/p/dynamic-cpp)
+//
+class PBBAM_EXPORT Tag
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Creates a Tag from a signed 8-bit integer or character.
+    ///
+    /// Without a TagModifier, the resulting Tag will be annotated as containing
+    /// an 8-bit integer, whether the input \p value was an integer or a char.
+    /// For ASCII tags, use one of these methods:
+    /// \include code/Tag_AsciiCtor.txt
+    ///
+    Tag(int8_t value);
+
+    /// \brief Creates a Tag from a signed 8-bit integer or character,
+    ///        applying the provided modifier.
+    ///
+    /// This method allows direct construction of an ASCII character, rather
+    /// than an 8-bit integer (e.g. Tag('A', TagModifier::ASCII_CHAR) ).
+    ///
+    /// \throws std::runtime_error if \p mod is not valid for int8_t data
+    ///
+    Tag(int8_t value, const TagModifier mod);
+
+    /// \brief Creates a Tag from an unsigned 8-bit integer or character.
+    ///
+    /// Without a TagModifier, the resulting Tag will be annotated as containing
+    /// an 8-bit unsigned integer, whether the input \p value was an integer or
+    /// a char. For ASCII tags, use one of these methods:
+    /// \include code/Tag_AsciiCtor.txt
+    ///
+    Tag(uint8_t value);
+
+    /// \brief Creates a Tag from 16-bit integer.
+    Tag(int16_t value);
+
+    /// \brief Creates a Tag from 16-bit unsigned integer.
+    Tag(uint16_t value);
+
+    /// \brief Creates a Tag from 32-bit signed integer.
+    Tag(int32_t value);
+
+    /// \brief Creates a Tag from 32-bit unsigned integer.
+    Tag(uint32_t value);
+
+    /// \brief Creates a Tag from floating-point value.
+    Tag(float value);
+
+    /// \brief Creates a Tag from string data.
+    Tag(std::string value);
+
+    /// \brief Creates a Tag from string data, adding modifier.
+    ///
+    /// \throws std::runtime_error if \p mod is not valid for string data
+    ///
+    Tag(std::string value, TagModifier mod);
+
+    /// \brief Creates a Tag from a vector of 8-bit integers.
+    Tag(std::vector<int8_t> value);
+
+    /// \brief Creates a Tag from a vector of 8-bit unsigned integers.
+    Tag(std::vector<uint8_t> value);
+
+    /// \brief Creates a Tag from a vector of 16-bit integers.
+    Tag(std::vector<int16_t> value);
+
+    /// \brief Creates a Tag from a vector of 16-bit unsigned integers.
+    Tag(std::vector<uint16_t> value);
+
+    /// \brief Creates a Tag from a vector of 32-bit integers.
+    Tag(std::vector<int32_t> value);
+
+    /// \brief Creates a Tag from a vector of 32-bit unsigned integers.
+    Tag(std::vector<uint32_t> value);
+
+    /// \brief Creates a Tag from a vector of floating-point values.
+    Tag(std::vector<float> value);
+
+    Tag() = default;
+
+    Tag& operator=(boost::blank value);
+    Tag& operator=(int8_t value);
+    Tag& operator=(uint8_t value);
+    Tag& operator=(int16_t value);
+    Tag& operator=(uint16_t value);
+    Tag& operator=(int32_t value);
+    Tag& operator=(uint32_t value);
+    Tag& operator=(float value);
+    Tag& operator=(std::string value);
+    Tag& operator=(std::vector<int8_t> value);
+    Tag& operator=(std::vector<uint8_t> value);
+    Tag& operator=(std::vector<int16_t> value);
+    Tag& operator=(std::vector<uint16_t> value);
+    Tag& operator=(std::vector<int32_t> value);
+    Tag& operator=(std::vector<uint32_t> value);
+    Tag& operator=(std::vector<float> value);
+
+    bool operator==(const Tag& other) const;
+    bool operator!=(const Tag& other) const;
+
+    /// \}
+
+public:
+    /// \name Data Conversion & Validation
+    /// \{
+
+    /// \brief Converts the tag value to an ASCII character.
+    ///
+    /// Tag must hold an integral type, within the valid ASCII range [33-127].
+    ///
+    /// \returns ASCII character value
+    /// \throws std::runtime_error if not ASCII-compatible
+    ///
+    char ToAscii() const;
+
+    /// \returns tag data as signed 8-bit (casting if needed)
+    /// \throws std::runtime_error if not integral data, or out of valid range
+    int8_t ToInt8() const;
+
+    /// \returns tag data as unsigned 8-bit (casting if needed)
+    /// \throws std::runtime_error if not integral data, or out of valid range
+    uint8_t ToUInt8() const;
+
+    /// \returns tag data as signed 16-bit (casting if needed)
+    /// \throws std::runtime_error if not integral data, or out of valid range
+    int16_t ToInt16() const;
+
+    /// \returns tag data as unsigned 16-bit (casting if needed)
+    /// \throws std::runtime_error if not integral data, or out of valid range
+    uint16_t ToUInt16() const;
+
+    /// \returns tag data as signed 32-bit (casting if needed)
+    /// \throws std::runtime_error if not integral data, or out of valid range
+    int32_t ToInt32() const;
+
+    /// \returns tag data as unsigned 32-bit (casting if needed)
+    /// \throws std::runtime_error if not integral data, or out of valid range
+    uint32_t ToUInt32() const;
+
+    /// \returns tag data as float
+    /// \throws std::runtime_error if tag does not contain a value of
+    ///         explicit type: float
+    float ToFloat() const;
+
+    /// \returns tag data as std::string
+    /// \throws std::runtime_error if tag does not contain a value of explicit
+    ///         type: std::string
+    std::string ToString() const;
+
+    /// \returns tag data as std::vector<int8_t>
+    /// \throws std::runtime_error if tag does not contain a value of explicit
+    ///         type: std::vector<int8_t>
+    std::vector<int8_t> ToInt8Array() const;
+
+    /// \returns tag data as std::vector<uint8_t>
+    /// \throws std::runtime_error if tag does not contain a value of explicit
+    ///         type: std::vector<uint8_t>
+    std::vector<uint8_t> ToUInt8Array() const;
+
+    /// \returns tag data as std::vector<int16_t>
+    /// \throws std::runtime_error if tag does not contain a value of explicit
+    ///         type: std::vector<int16_t>
+    std::vector<int16_t> ToInt16Array() const;
+
+    /// \returns tag data as std::vector<uint16_t>
+    /// \throws std::runtime_error if tag does not contain a value of explicit
+    ///         type: std::vector<uint16_t>
+    std::vector<uint16_t> ToUInt16Array() const;
+
+    /// \returns tag data as std::vector<int32_t>
+    /// \throws std::runtime_error if tag does not contain a value of explicit
+    ///         type: std::vector<int32_t>
+    std::vector<int32_t> ToInt32Array() const;
+
+    /// \returns tag data as std::vector<uint32_t>
+    /// \throws std::runtime_error if tag does not contain a value of explicit
+    ///         type: std::vector<uint32_t>
+    std::vector<uint32_t> ToUInt32Array() const;
+
+    /// \returns tag data as std::vector<float>
+    /// \throws std::runtime_error if tag does not contain a value of explicit
+    ///         type: std::vector<float>
+    std::vector<float> ToFloatArray() const;
+
+    /// \}
+
+public:
+    /// \name Data Conversion & Validation
+    /// \{
+
+    /// \returns true if tag is null (e.g. default-constructed)
+    bool IsNull() const;
+
+    /// \returns true if tag contains a value of type: int8_t
+    bool IsInt8() const;
+
+    /// \returns true if tag contains a value of type: uint8_t
+    bool IsUInt8() const;
+
+    /// \returns true if tag contains a value of type: int16_t
+    bool IsInt16() const;
+
+    /// \returns true if tag contains a value of type: uint16_t
+    bool IsUInt16() const;
+
+    /// \returns true if tag contains a value of type: int32_t
+    bool IsInt32() const;
+
+    /// \returns true if tag contains a value of type: uint32_t
+    bool IsUInt32() const;
+
+    /// \returns true if tag contains a value of type: float
+    bool IsFloat() const;
+
+    /// \returns true if tag contains a value of type: std::string
+    bool IsString() const;
+
+    /// \returns true if tag contains a value of type: std::string \b AND has a
+    ///          TagModifier of TagModifier::HEX_STRING
+    bool IsHexString() const;
+
+    /// \returns true if tag contains a value of type: std::vector<int8_t>
+    bool IsInt8Array() const;
+
+    /// \returns true if tag contains a value of type: std::vector<uint8_t>
+    bool IsUInt8Array() const;
+
+    /// \returns true if tag contains a value of type: std::vector<int16_t>
+    bool IsInt16Array() const;
+
+    /// \returns true if tag contains a value of type: std::vector<uint16_t>
+    bool IsUInt16Array() const;
+
+    /// \returns true if tag contains a value of type: std::vector<int32_t>
+    bool IsInt32Array() const;
+
+    /// \returns true if tag contains a value of type: std::vector<uint32_t>
+    bool IsUInt32Array() const;
+
+    /// \returns true if tag contains a value of type: std::vector<float>
+    bool IsFloatArray() const;
+
+    /// \returns true if tag contains a value with any signed integer type
+    bool IsSignedInt() const;
+
+    /// \returns true if tag contains a value with any unsigned integer type
+    bool IsUnsignedInt() const;
+
+    /// \returns true if tag contains a value with any integer type
+    bool IsIntegral() const;
+
+    /// \returns true if tag contains a value with any integer or float type
+    bool IsNumeric() const;
+
+    /// \returns true if tag contains a vector containing signed integers
+    bool IsSignedArray() const;
+
+    /// \returns true if tag contains a vector containing unsigned integers
+    bool IsUnsignedArray() const;
+
+    /// \returns true if tag contains a vector containing integers
+    bool IsIntegralArray() const;
+
+    /// \returns true if tag contains a vector (integers or floats)
+    bool IsArray() const;
+
+    /// \}
+
+public:
+    /// \name Type & Modifier Attributes
+    /// \{
+
+    /// \returns enum value for current tag data
+    TagDataType Type() const;
+
+    /// \returns printable type name for current tag data
+    std::string Typename() const;
+
+    /// \returns true if tag data modifier \p m is set
+    bool HasModifier(const TagModifier m) const;
+
+    /// \returns current tag data modifier
+    TagModifier Modifier() const;
+
+    /// \brief Sets tag data modifier.
+    ///
+    /// \param[in] m    new modifier value
+    ///
+    /// \returns reference to this tag
+    Tag& Modifier(const TagModifier m);
+
+    /// \}
+
+private:
+    // clang-format off
+    // NOTE - keep this synced with TagDataType enum ordering
+    using var_t = boost::variant<boost::blank, // <-- default constructor creates variant of this type
+                                 int8_t,
+                                 uint8_t,
+                                 int16_t,
+                                 uint16_t,
+                                 int32_t,
+                                 uint32_t,
+                                 float,
+                                 std::string,
+                                 std::vector<int8_t>,
+                                 std::vector<uint8_t>,
+                                 std::vector<int16_t>,
+                                 std::vector<uint16_t>,
+                                 std::vector<int32_t>,
+                                 std::vector<uint32_t>,
+                                 std::vector<float> >;
+
+    var_t data_;
+    TagModifier modifier_ = TagModifier::NONE;
+    // clang-format on
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // TAG_H
diff --git a/include/pbbam/TagCollection.h b/include/pbbam/TagCollection.h

new file mode 100644 (file)

index 0000000..ef3de0a
--- /dev/null
+++ b/include/pbbam/TagCollection.h
@@ -0,0 +1,35 @@
+// File Description
+/// \file TagCollection.h
+/// \brief Defines the TagCollection class.
+//
+// Author: Derek Barnett
+
+#ifndef TAGCOLLECTION_H
+#define TAGCOLLECTION_H
+
+#include "pbbam/Config.h"
+
+#include <map>
+#include <string>
+
+#include "pbbam/Tag.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The TagCollection class represents a collection (or "dictionary") of
+///        tags.
+///
+/// Tags are mapped to their tag name, a 2-character string.
+///
+class PBBAM_EXPORT TagCollection : public std::map<std::string, Tag>
+{
+public:
+    /// \returns true if the collection contains a tag with \p name
+    bool Contains(const std::string& name) const;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // TAGCOLLECTION_H
diff --git a/include/pbbam/TextFileReader.h b/include/pbbam/TextFileReader.h

new file mode 100644 (file)

index 0000000..a053046
--- /dev/null
+++ b/include/pbbam/TextFileReader.h
@@ -0,0 +1,87 @@
+// File Description
+/// \file TextFileReader.h
+/// \brief Defines the TextFileReader class.
+//
+// Author: Derek Barnett
+
+#ifndef PBBAM_TEXTFILEREADER_H
+#define PBBAM_TEXTFILEREADER_H
+
+#include "pbbam/Config.h"
+
+#include <memory>
+#include <string>
+
+#include "pbbam/internal/QueryBase.h"
+
+namespace PacBio {
+namespace BAM {
+
+///
+/// \brief The TextFileReader class provides line-by-line access to text files.
+///
+/// Supports plain text or gzipped (gzip or bgzip).
+///
+/// \note This is a general-purpose file reader. For FASTA/FASTQ, use the dedicated
+///       FastaReader or FastqReader for better performance.
+///
+class TextFileReader : public internal::QueryBase<std::string>
+{
+public:
+    ///
+    /// \brief Reads all lines from a text file
+    ///
+    /// \param fn    filename
+    /// \return vector of lines
+    ///
+    static std::vector<std::string> ReadAll(const std::string& fn);
+
+public:
+    ///
+    /// \brief Constructs a TextFileReader.
+    ///
+    /// \param filename  text file to read
+    ///
+    explicit TextFileReader(std::string filename);
+
+    TextFileReader(TextFileReader&&) noexcept;
+    TextFileReader& operator=(TextFileReader&&) noexcept;
+    ~TextFileReader();
+
+public:
+    //
+    /// \brief GetNext
+    ///
+    /// Allows iteration with range-for:
+    /// \code{cpp}
+    ///
+    /// TextFileReader reader{fn};
+    /// for (const std::string& line : reader) {
+    ///     // do stuff with line
+    /// }
+    /// \endcode
+    ///
+    /// or you can iterate 'manually':
+    /// \code{cpp}
+    ///
+    /// TextFileReader reader{fn};
+    /// std::string line;
+    /// while (reader.GetNext(line)) {
+    ///     // do stuff with line
+    /// }
+    /// \endcode
+    ///
+    /// \param[out] line
+    /// \return success/failure
+    ///
+    bool GetNext(std::string& line) override;
+
+private:
+    class TextFileReaderPrivate;
+    std::unique_ptr<TextFileReaderPrivate> d_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // PBBAM_TEXTFILEREADER_H
diff --git a/include/pbbam/TextFileWriter.h b/include/pbbam/TextFileWriter.h

new file mode 100644 (file)

index 0000000..5ce62d5
--- /dev/null
+++ b/include/pbbam/TextFileWriter.h
@@ -0,0 +1,60 @@
+
+// File Description
+/// \file TextFileWriter.h
+/// \brief Defines the TextFileWriter class.
+//
+// Author: Derek Barnett
+
+#ifndef PBBAM_TEXTFILEWRITER_H
+#define PBBAM_TEXTFILEWRITER_H
+
+#include "pbbam/Config.h"
+
+#include <memory>
+#include <string>
+
+namespace PacBio {
+namespace BAM {
+
+///
+/// \brief The TextFileWriter class provides line-by-line writing of text files.
+///
+/// Supports plain text or gzipped. For explicitly-bgzipped text, use
+/// BgzipWriter instead.
+///
+/// \note This is a general-purpose file writer. For FASTA/FASTQ, use the
+///       dedicated FastaWriter/FastqWriter or BgzipFastaWriter/BgzipFastqWriter
+///       for better performance.
+///
+class TextFileWriter
+{
+public:
+    ///
+    /// \brief Constructs a TextFileWriter.
+    ///
+    /// \param filename  suffix ".gz" indicates gzipped output
+    ///
+    explicit TextFileWriter(const std::string& filename);
+
+    TextFileWriter(TextFileWriter&&) noexcept;
+    TextFileWriter& operator=(TextFileWriter&&) noexcept;
+    ~TextFileWriter();
+
+public:
+    ///
+    /// \brief Write
+    ///
+    ///
+    /// \param line
+    ///
+    void Write(const std::string& line);
+
+private:
+    class TextFileWriterPrivate;
+    std::unique_ptr<TextFileWriterPrivate> d_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // PBBAM_TEXTFILEWRITER_H
+\ No newline at end of file
diff --git a/include/pbbam/Unused.h b/include/pbbam/Unused.h

new file mode 100644 (file)

index 0000000..6b72ff9
--- /dev/null
+++ b/include/pbbam/Unused.h
@@ -0,0 +1,15 @@
+#ifndef PBBAM_UNUSED_H
+#define PBBAM_UNUSED_H
+
+namespace PacBio {
+namespace BAM {
+
+template <typename T>
+void UNUSED(const T&)
+{
+}
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // PBBAM_UNUSED_H
diff --git a/include/pbbam/Validator.h b/include/pbbam/Validator.h

new file mode 100644 (file)

index 0000000..a345107
--- /dev/null
+++ b/include/pbbam/Validator.h
@@ -0,0 +1,155 @@
+// File Description
+/// \file Validator.h
+/// \brief Defines the Validator class.
+//
+// Author: Derek Barnett
+
+#ifndef VALIDATOR_H
+#define VALIDATOR_H
+
+#include "pbbam/Config.h"
+
+#include <cstddef>
+#include <limits>
+
+#include "pbbam/exception/ValidationException.h"
+
+namespace PacBio {
+namespace BAM {
+
+class BamFile;
+class BamHeader;
+class BamRecord;
+class ReadGroupInfo;
+
+/// \brief The Validator class provides validation for %BAM data.
+///
+/// There are 2 ways to use this class. If you are only concerned with a quick &
+/// dirty, yes/no validation, then you can use the IsValid() methods. This will
+/// swallow the specific cause of the failure, but you don't have to catch an
+/// exception and handle it in your client code. If you want to know,
+/// specifically, what failed, then you can use the Validate*() methods that
+/// will throw a ValidationException if the object is invalid. This exception
+/// will provide more details as to what failed and why.
+///
+/// See documentation for Config.h for details on building pbbam with
+/// auto-validation enabled.
+///
+class PBBAM_EXPORT Validator
+{
+public:
+    /// \brief Checks that a %BAM file conforms to the %PacBio specification.
+    ///
+    /// When \p entireFile is false, this method only checks file metadata. If
+    /// \p entireFile is true, all records are checked as well.
+    ///
+    /// \param[in] file         %BAM file to validate
+    /// \param[in] entireFile   check records in addition to metadata
+    /// \returns true if \p file passes validation checks
+    ///
+    /// \sa Validator::ValidateFileMetadata, Validator::ValidateEntireFile
+    ///
+    static bool IsValid(const BamFile& file, const bool entireFile);
+
+    /// \brief Checks that a %BAM header conforms to the %PacBio specification.
+    ///
+    /// \returns true if \p header passes validation checks
+    ///
+    /// \sa Validator::Validate(const BamHeader& header)
+    ///
+    static bool IsValid(const BamHeader& header);
+
+    /// \brief Checks that a %BAM read group conforms to the %PacBio
+    ///        specification.
+    ///
+    /// \returns true if \p rg passes validation checks
+    ///
+    /// \sa Validator::Validate(const ReadGroupInfo& rg)
+    ///
+    static bool IsValid(const ReadGroupInfo& rg);
+
+    /// \brief Checks that a %BAM record conforms to the %PacBio specification.
+    ///
+    /// \returns true if \p record passes validation checks
+    ///
+    /// \sa Validator::Validate(const BamRecord& record)
+    ///
+    static bool IsValid(const BamRecord& record);
+
+    Validator() = delete;
+
+    /// \brief Checks that a %BAM file's header conforms to the
+    ///        %PacBio specification.
+    ///
+    /// This validation step checks the SAM/%BAM version number, sort order,
+    /// PacBioBAM version number, and calls Validate(readGroup) internally for
+    /// all read groups.
+    ///
+    /// \param[in] header       %BAM header to validate
+    /// \param[in] maxErrors    maximum number of errors to allow before throwing
+    ///
+    /// \throws ValidationException if \p header fails validation checks
+    ///
+    static void Validate(const BamHeader& header,
+                         const size_t maxErrors = std::numeric_limits<size_t>::max());
+
+    /// \brief Checks that a %BAM read group conforms to the %PacBio
+    ///        specification.
+    ///
+    /// \param[in] rg           %BAM read group to validate
+    /// \param[in] maxErrors    maximum number of errors to allow before throwing
+    ///
+    /// \throws ValidationException if \p rg fails validation checks
+    ///
+    static void Validate(const ReadGroupInfo& rg,
+                         const size_t maxErrors = std::numeric_limits<size_t>::max());
+
+    /// \brief Checks that a %BAM record conforms to the %PacBio specification.
+    ///
+    /// \param[in] record       %BAM record to validate
+    /// \param[in] maxErrors    maximum number of errors to allow before throwing
+    ///
+    /// \throws ValidationException if \p record fails validation checks
+    ///
+    static void Validate(const BamRecord& record,
+                         const size_t maxErrors = std::numeric_limits<size_t>::max());
+
+    /// \brief Checks that a %BAM file's (entire) contents conform to the
+    ///        %PacBio specification.
+    ///
+    /// This is equivalent to:
+    ///
+    /// \code
+    /// Validator::ValidateFileMetadata(file);
+    /// EntireFileQuery query(file);
+    /// for (const BamRecord& record : query)
+    ///     Validator::Validate(record);
+    /// \endcode
+    ///
+    /// \param[in] file         %BAM file to validate
+    /// \param[in] maxErrors    maximum number of errors to allow before throwing
+    ///
+    /// \throws ValidationException if \p file fails validation checks
+    ///
+    static void ValidateEntireFile(const BamFile& file,
+                                   const size_t maxErrors = std::numeric_limits<size_t>::max());
+
+    /// \brief Checks that a %BAM file's metadata conforms to the
+    ///        %PacBio specification.
+    ///
+    /// This validation step checks the filename, ensures EOF marker, and
+    /// presence of PBI. It also calls Validate(file.Header()) internally.
+    ///
+    /// \param[in] file         %BAM file to validate
+    /// \param[in] maxErrors    maximum number of errors to allow before throwing
+    ///
+    /// \throws ValidationException if \p file fails validation checks
+    ///
+    static void ValidateFileMetadata(const BamFile& file,
+                                     const size_t maxErrors = std::numeric_limits<size_t>::max());
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // VALIDATOR_H
diff --git a/include/pbbam/ZmwChunkedFastaReader.h b/include/pbbam/ZmwChunkedFastaReader.h

new file mode 100644 (file)

index 0000000..2ad208b
--- /dev/null
+++ b/include/pbbam/ZmwChunkedFastaReader.h
@@ -0,0 +1,100 @@
+// File Description
+/// \file ZmwChunkedFastaReader.h
+/// \brief Defines the ZmwChunkedFastaReader class.
+//
+// Author: Derek Barnett
+
+#ifndef ZMWCHUNKEDFASTAREADER_H
+#define ZMWCHUNKEDFASTAREADER_H
+
+#include "pbbam/Config.h"
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "pbbam/FastaSequence.h"
+#include "pbbam/internal/QueryBase.h"
+
+namespace PacBio {
+namespace BAM {
+
+///
+/// \brief The ZmwChunkedFastaReader provides sequential access to FASTA records,
+///         where iteration is bounded by chunks of (unique) ZMWs.
+///
+class ZmwChunkedFastaReader : public internal::QueryBase<FastaSequence>
+{
+public:
+    ///
+    /// Construct a new ZMW-chunked FASTA reader.
+    ///
+    /// \param fn           FASTA file, must have a *.fai index
+    /// \param numChunks    desired number of chunks
+    ///
+    /// Actual chunk count may be smaller than the requested number, if the input
+    /// size is smaller.
+    ///
+    ZmwChunkedFastaReader(const std::string& fn, const size_t numChunks);
+
+    ZmwChunkedFastaReader(ZmwChunkedFastaReader&&) noexcept;
+    ZmwChunkedFastaReader& operator=(ZmwChunkedFastaReader&&) noexcept;
+    ~ZmwChunkedFastaReader();
+
+    ///
+    /// \returns the number of chunks available.
+    ///
+    /// Actual chunk count may be smaller than the requested number, if the input
+    /// size is smaller.
+    ///
+    size_t NumChunks() const;
+
+    ///
+    /// Sets current chunk to start iterating over.
+    ///
+    ZmwChunkedFastaReader& Chunk(size_t chunkId);
+
+    ///
+    /// \returns the current chunk in use
+    ///
+    size_t Chunk() const;
+
+public:
+    ///
+    /// \brief GetNext
+    ///
+    /// Allows iteration with range-for:
+    /// \code{cpp}
+    ///
+    /// ZmwChunkedFastaReader reader{fn, numChunks};
+    /// reader.Chunk(4);
+    /// for (const FastaSequence& seq : reader) {
+    ///     // do stuff with seq
+    /// }
+    /// \endcode
+    ///
+    /// or you can iterate 'manually':
+    /// \code{cpp}
+    ///
+    /// ZmwChunkedFastaReader reader{fn, numChunks};
+    /// reader.Chunk(4);
+    /// FastaSequence seq;
+    /// while (reader.GetNext(seq)) {
+    ///     // do stuff with seq
+    /// }
+    /// \endcode
+    ///
+    /// \param[out] record
+    /// \return success/failure
+    ///
+    bool GetNext(FastaSequence& record);
+
+private:
+    class ZmwChunkedFastaReaderPrivate;
+    std::unique_ptr<ZmwChunkedFastaReaderPrivate> d_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // ZMWCHUNKEDFASTAREADER_H
diff --git a/include/pbbam/ZmwChunkedFastqReader.h b/include/pbbam/ZmwChunkedFastqReader.h

new file mode 100644 (file)

index 0000000..d0d20d9
--- /dev/null
+++ b/include/pbbam/ZmwChunkedFastqReader.h
@@ -0,0 +1,102 @@
+
+
+// File Description
+/// \file ZmwChunkedFastqReader.h
+/// \brief Defines the ZmwChunkedFastqReader class.
+//
+// Author: Derek Barnett
+
+#ifndef ZMWCHUNKEDFASTQREADER_H
+#define ZMWCHUNKEDFASTQREADER_H
+
+#include "pbbam/Config.h"
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "pbbam/FastqSequence.h"
+#include "pbbam/internal/QueryBase.h"
+
+namespace PacBio {
+namespace BAM {
+
+///
+/// \brief The ZmwChunkedFastqReader provides sequential access to FASTQ records,
+///         where iteration is bounded by chunks of (unique) ZMWs.
+///
+class ZmwChunkedFastqReader : public internal::QueryBase<FastqSequence>
+{
+public:
+    ///
+    /// Construct a new ZMW-chunked FASTQ reader.
+    ///
+    /// \param fn           FASTQ file, must have a *.fai index
+    /// \param numChunks    desired number of chunks
+    ///
+    /// Actual chunk count may be smaller than the requested number, if the input
+    /// size is smaller.
+    ///
+    ZmwChunkedFastqReader(const std::string& fn, const size_t numChunks);
+
+    ZmwChunkedFastqReader(ZmwChunkedFastqReader&&) noexcept;
+    ZmwChunkedFastqReader& operator=(ZmwChunkedFastqReader&&) noexcept;
+    ~ZmwChunkedFastqReader();
+
+    ///
+    /// \returns the number of chunks available.
+    ///
+    /// Actual chunk count may be smaller than the requested number, if the input
+    /// size is smaller.
+    ///
+    size_t NumChunks() const;
+
+    ///
+    /// Sets current chunk to start iterating over.
+    ///
+    ZmwChunkedFastqReader& Chunk(size_t chunkId);
+
+    ///
+    /// \returns the current chunk in use
+    ///
+    size_t Chunk() const;
+
+public:
+    ///
+    /// \brief GetNext
+    ///
+    /// Allows iteration with range-for:
+    /// \code{cpp}
+    ///
+    /// ZmwChunkedFastqReader reader{fn, numChunks};
+    /// reader.Chunk(4);
+    /// for (const FastqSequence& seq : reader) {
+    ///     // do stuff with seq
+    /// }
+    /// \endcode
+    ///
+    /// or you can iterate 'manually':
+    /// \code{cpp}
+    ///
+    /// ZmwChunkedFastqReader reader{fn, numChunks};
+    /// reader.Chunk(4);
+    /// FastqSequence seq;
+    /// while (reader.GetNext(seq)) {
+    ///     // do stuff with seq
+    /// }
+    /// \endcode
+    ///
+    /// \param[out] record
+    /// \return success/failure
+    ///
+    bool GetNext(FastqSequence& record);
+
+private:
+    class ZmwChunkedFastqReaderPrivate;
+    std::unique_ptr<ZmwChunkedFastqReaderPrivate> d_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // ZMWCHUNKEDFASTQREADER_H
diff --git a/include/pbbam/ZmwGroupQuery.h b/include/pbbam/ZmwGroupQuery.h

new file mode 100644 (file)

index 0000000..c708e0f
--- /dev/null
+++ b/include/pbbam/ZmwGroupQuery.h
@@ -0,0 +1,62 @@
+// File Description
+/// \file ZmwGroupQuery.h
+/// \brief Defines the ZmwGroupQuery class.
+//
+// Author: Derek Barnett
+
+#ifndef ZMWGROUPQUERY_H
+#define ZMWGROUPQUERY_H
+
+#include "pbbam/Config.h"
+
+#include <cstdint>
+#include <vector>
+
+#include "pbbam/internal/QueryBase.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The ZmwGroupQuery class provides iterable access to a DataSet's
+///        %BAM records, limiting results to those matching a ZMW hole number
+///        whitelist, and grouping those results by hole number.
+///
+/// Example:
+/// \include code/ZmwGroupQuery.txt
+///
+/// \note Currently, all %BAM files must have a corresponding ".pbi" index file.
+///       Use BamFile::EnsurePacBioIndexExists before creating the query if one
+///       may not be present.
+///
+class PBBAM_EXPORT ZmwGroupQuery : public internal::IGroupQuery
+{
+public:
+    /// \brief Creates a new ZmwGroupQuery, limiting record results to only
+    ///        those matching a ZMW hole number criterion.
+    ///
+    /// \param[in] zmwWhitelist     vector of allowed ZMW hole numbers
+    /// \param[in] dataset          input data source(s)
+    ///
+    /// \throws std::runtime_error on failure to open/read underlying %BAM or
+    ///         PBI files.
+    ///
+    ZmwGroupQuery(const std::vector<int32_t>& zmwWhitelist, const DataSet& dataset);
+    ~ZmwGroupQuery();
+
+public:
+    /// \brief Main iteration point for record access.
+    ///
+    /// Most client code should not need to use this method directly. Use
+    /// iterators instead.
+    ///
+    bool GetNext(std::vector<BamRecord>& records) override;
+
+private:
+    class ZmwGroupQueryPrivate;
+    std::unique_ptr<ZmwGroupQueryPrivate> d_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // ZMWGROUPQUERY_H
diff --git a/include/pbbam/ZmwQuery.h b/include/pbbam/ZmwQuery.h

new file mode 100644 (file)

index 0000000..37ee529
--- /dev/null
+++ b/include/pbbam/ZmwQuery.h
@@ -0,0 +1,62 @@
+// File Description
+/// \file ZmwQuery.h
+/// \brief Defines the ZmwQuery class.
+//
+// Author: Derek Barnett
+
+#ifndef ZMWQUERY_H
+#define ZMWQUERY_H
+
+#include "pbbam/Config.h"
+
+#include <cstdint>
+#include <vector>
+
+#include "pbbam/internal/QueryBase.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The ZmwQuery class provides iterable access to a DataSet's
+///        %BAM records, limiting results to those matching a ZMW hole number
+///        whitelist.
+///
+/// Example:
+/// \include code/ZmwQuery.txt
+///
+/// \note Currently, all %BAM files must have a corresponding ".pbi" index file.
+///       Use BamFile::EnsurePacBioIndexExists before creating the query if one
+///       may not be present.
+///
+class PBBAM_EXPORT ZmwQuery : public internal::IQuery
+{
+public:
+    /// \brief Creates a new ZmwQuery, limiting record results to only
+    ///        those matching a ZMW hole number criterion.
+    ///
+    /// \param[in] zmwWhitelist     vector of allowed ZMW hole numbers
+    /// \param[in] dataset          input data source(s)
+    ///
+    /// \throws std::runtime_error on failure to open/read underlying %BAM or
+    ///         PBI files.
+    ///
+    ZmwQuery(std::vector<int32_t> zmwWhitelist, const DataSet& dataset);
+
+    ~ZmwQuery();
+
+    /// \brief Main iteration point for record access.
+    ///
+    /// Most client code should not need to use this method directly. Use
+    /// iterators instead.
+    ///
+    bool GetNext(BamRecord& r) override;
+
+private:
+    class ZmwQueryPrivate;
+    std::unique_ptr<ZmwQueryPrivate> d_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // ZMWQUERY_H
diff --git a/include/pbbam/ZmwType.h b/include/pbbam/ZmwType.h

new file mode 100644 (file)

index 0000000..6447cc7
--- /dev/null
+++ b/include/pbbam/ZmwType.h
@@ -0,0 +1,28 @@
+// File Description
+/// \file ZmwType.h
+/// \brief Defines the ZmwType enum.
+//
+// Author: Armin Töpfer
+
+#ifndef ZMWTYPE_H
+#define ZMWTYPE_H
+
+#include "pbbam/Config.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief This enum defines the different ZMW categories of scraps
+///
+enum class ZmwType : char
+{
+    CONTROL = 'C',
+    MALFORMED = 'M',
+    NORMAL = 'N',
+    SENTINEL = 'S'
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // ZMWTYPE_H
diff --git a/include/pbbam/ZmwTypeMap.h b/include/pbbam/ZmwTypeMap.h

new file mode 100644 (file)

index 0000000..3e0fffa
--- /dev/null
+++ b/include/pbbam/ZmwTypeMap.h
@@ -0,0 +1,31 @@
+// File Description
+/// \file ZmwTypeMap.h
+/// \brief Defines the ZmwTypeMap class.
+//
+// Author: Armin Töpfer
+
+#ifndef ZMWTYPEMAP_H
+#define ZMWTYPEMAP_H
+
+#include "pbbam/Config.h"
+
+#include <map>
+
+#include "pbbam/ZmwType.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The ZmwTypeMap class provides mapping between char codes and
+///        ZmwType enum keys.
+///
+class ZmwTypeMap
+{
+public:
+    static std::map<char, ZmwType> ParseChar;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // ZMWTYPEMAP_H
diff --git a/include/pbbam/bed/BedReader.h b/include/pbbam/bed/BedReader.h

new file mode 100644 (file)

index 0000000..fdfad8c
--- /dev/null
+++ b/include/pbbam/bed/BedReader.h
@@ -0,0 +1,86 @@
+// File Description
+/// \file BedReader.h
+/// \brief Defines the BedReader class.
+//
+// Author: Derek Barnett
+
+#ifndef PBBAM_BED_BEDREADER_H
+#define PBBAM_BED_BEDREADER_H
+
+#include "pbbam/Config.h"
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "pbbam/GenomicInterval.h"
+#include "pbbam/internal/QueryBase.h"
+
+namespace PacBio {
+namespace BAM {
+
+///
+/// \brief The BedReader provides sequential access to BED records.
+///
+/// Supports plain text or gzipped (gzip or bgzip).
+///
+class BedReader : public internal::QueryBase<GenomicInterval>
+{
+public:
+    ///
+    /// \brief Reads all BED intervals from a file
+    ///
+    /// \param fn   BED filename
+    /// \return vector of intervals
+    ///
+    static std::vector<GenomicInterval> ReadAll(const std::string& fn);
+
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    explicit BedReader(const std::string& fn);
+
+    BedReader(BedReader&&) noexcept;
+    BedReader& operator=(BedReader&&) noexcept;
+    ~BedReader();
+
+    /// \}
+
+public:
+    ///
+    /// \brief GetNext
+    ///
+    /// Allows iteration with range-for:
+    /// \code{cpp}
+    ///
+    /// BedReader reader{fn};
+    /// for (const auto& interval : reader) {
+    ///     // do stuff with interval
+    /// }
+    /// \endcode
+    ///
+    /// or you can iterate 'manually':
+    /// \code{cpp}
+    ///
+    /// BedReader reader{fn};
+    /// GenomicInterval interval;
+    /// while (reader.GetNext(interval)) {
+    ///     // do stuff with interval
+    /// }
+    /// \endcode
+    ///
+    /// \param[out] interval
+    /// \return success/failure
+    ///
+    bool GetNext(GenomicInterval& interval);
+
+private:
+    class BedReaderPrivate;
+    std::unique_ptr<BedReaderPrivate> d_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // PBBAM_BED_BEDREADER_H
diff --git a/include/pbbam/bed/BedWriter.h b/include/pbbam/bed/BedWriter.h

new file mode 100644 (file)

index 0000000..1b0b1fd
--- /dev/null
+++ b/include/pbbam/bed/BedWriter.h
@@ -0,0 +1,42 @@
+// File Description
+/// \file BedWriter.h
+/// \brief Defines the BedWriter class.
+//
+// Author: Derek Barnett
+
+#ifndef PBBAM_BEDWRITER_H
+#define PBBAM_BEDWRITER_H
+
+#include "pbbam/Config.h"
+
+#include <memory>
+
+namespace PacBio {
+namespace Data {
+
+class GenomicInterval;
+}
+
+namespace BAM {
+
+class BedWriter
+{
+public:
+    explicit BedWriter(const std::string& fn);
+
+    BedWriter(BedWriter&&) noexcept;
+    BedWriter& operator=(BedWriter&&) noexcept;
+    ~BedWriter();
+
+public:
+    void Write(const Data::GenomicInterval& interval);
+
+private:
+    class BedWriterPrivate;
+    std::unique_ptr<BedWriterPrivate> d_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // PBBAM_BEDWRITER_H
diff --git a/include/pbbam/ccs/CCSHeader.h b/include/pbbam/ccs/CCSHeader.h

new file mode 100644 (file)

index 0000000..22835f9
--- /dev/null
+++ b/include/pbbam/ccs/CCSHeader.h
@@ -0,0 +1,25 @@
+// Author: Derek Barnett
+
+#ifndef PBBAM_CCS_CCSHEADER_H
+#define PBBAM_CCS_CCSHEADER_H
+
+#include "pbbam/Config.h"
+
+#include <string>
+
+namespace PacBio {
+namespace CCS {
+
+struct CCSHeader
+{
+    std::string MovieName;
+    std::string BindingKit;
+    std::string SequencingKit;
+    std::string BasecallerVersion;
+    std::string FrameRate;
+};
+
+}  // namespace CCS
+}  // namespace PacBio
+
+#endif  // PBBAM_CCS_CCSHEADER_H
diff --git a/include/pbbam/ccs/CCSPbiBuilder.h b/include/pbbam/ccs/CCSPbiBuilder.h

new file mode 100644 (file)

index 0000000..f733a24
--- /dev/null
+++ b/include/pbbam/ccs/CCSPbiBuilder.h
@@ -0,0 +1,54 @@
+#ifndef PBBAM_CCS_CCSPBIBUILDER_H
+#define PBBAM_CCS_CCSPBIBUILDER_H
+
+#include "pbbam/Config.h"
+
+#include <memory>
+#include <string>
+
+#include "pbbam/PbiBuilder.h"
+#include "pbbam/ccs/CCSHeader.h"
+#include "pbbam/ccs/CCSRecord.h"
+
+namespace PacBio {
+namespace CCS {
+
+struct CCSHeader;
+struct CCSRecord;
+
+struct CCSPbiBuilderConfig  // optional settings accepted by the CCSPbiBuilder constructors
+{
+    using PbiBuilder = PacBio::BAM::PbiBuilder;  // local shorthand for the BAM-namespace type
+
+    // zlib compression level for PBI file (defaults to PbiBuilder::DefaultCompression)
+    PbiBuilder::CompressionLevel CompressionLevel = PbiBuilder::DefaultCompression;
+
+    // Number of threads to use in PBI file compression (default: 4). Only active during
+    // CCSPbiBuilder::Close().
+    size_t NumThreads = 4;
+};
+
+class CCSPbiBuilder
+{
+public:
+public:
+    CCSPbiBuilder(const std::string& pbiFilename, const std::string& movieName,
+                  const CCSPbiBuilderConfig& config = CCSPbiBuilderConfig());
+    CCSPbiBuilder(const std::string& pbiFilename, const CCSHeader& header,
+                  const CCSPbiBuilderConfig& config = CCSPbiBuilderConfig());
+    ~CCSPbiBuilder();
+
+public:
+    void AddRecord(const CCSRecord& record);
+    void Close();
+    const std::string& MovieName() const;
+
+private:
+    class CCSPbiBuilderPrivate;
+    std::unique_ptr<CCSPbiBuilderPrivate> d_;
+};
+
+}  // namespace CCS
+}  // namespace PacBio
+
+#endif  //  PBBAM_CCS_CCSPBIBUILDER_H
diff --git a/include/pbbam/ccs/CCSRecord.h b/include/pbbam/ccs/CCSRecord.h

new file mode 100644 (file)

index 0000000..85be3dd
--- /dev/null
+++ b/include/pbbam/ccs/CCSRecord.h
@@ -0,0 +1,40 @@
+// Author: Derek Barnett
+
+#ifndef PBBAM_CCS_CCSRECORD_H
+#define PBBAM_CCS_CCSRECORD_H
+
+#include "pbbam/Config.h"
+
+#include "pbbam/Accuracy.h"
+#include "pbbam/Frames.h"
+#include "pbbam/LocalContextFlags.h"
+#include "pbbam/Position.h"
+#include "pbbam/SNR.h"
+
+namespace PacBio {
+namespace CCS {
+
+struct CCSRecord  // plain data record; the initializers below give every member a default
+{
+    int32_t HoleNumber = 0;
+
+    PacBio::BAM::Position QueryStart = 0;
+
+    PacBio::BAM::Position QueryEnd = 0;
+
+    PacBio::BAM::LocalContextFlags LocalContextFlags =
+        PacBio::BAM::LocalContextFlags::NO_LOCAL_CONTEXT;
+
+    PacBio::BAM::Accuracy Accuracy = 0.0f;
+
+    PacBio::Data::SNR SignalToNoise = {0.0, 0.0, 0.0, 0.0};  // four values, all zero by default
+
+    std::string Sequence;  // default-constructed, i.e. empty
+
+    PacBio::BAM::Frames PulseWidths;  // default-constructed
+};
+
+}  // namespace CCS
+}  // namespace PacBio
+
+#endif  // PBBAM_CCS_CCSRECORD_H
diff --git a/include/pbbam/ccs/CCSRecordFormat.h b/include/pbbam/ccs/CCSRecordFormat.h

new file mode 100644 (file)

index 0000000..123491a
--- /dev/null
+++ b/include/pbbam/ccs/CCSRecordFormat.h
@@ -0,0 +1,29 @@
+#ifndef PBBAM_CCS_CCSRECORDFORMAT_H
+#define PBBAM_CCS_CCSRECORDFORMAT_H
+
+#include "pbbam/Config.h"
+
+#include <string>
+#include <vector>
+
+#include "pbbam/ccs/CCSHeader.h"
+#include "pbbam/ccs/CCSRecord.h"
+
+namespace PacBio {
+namespace CCS {
+
+struct CCSRecordFormat  // static-only helpers converting CCSHeader / CCSRecord to and from text
+{
+    // header: parsed from / rendered to a list of strings (presumably one per line - confirm)
+    static CCSHeader DeserializeHeader(const std::vector<std::string>& lines);
+    static std::vector<std::string> SerializeHeader(const CCSHeader& header);
+
+    // record: parsed from / rendered to a single string
+    static CCSRecord DeserializeRecord(const std::string& line);
+    static std::string SerializeRecord(const CCSRecord& record);
+};
+
+}  // namespace CCS
+}  // namespace PacBio
+
+#endif  //  PBBAM_CCS_CCSRECORDFORMAT_H
diff --git a/include/pbbam/ccs/CCSRecordReader.h b/include/pbbam/ccs/CCSRecordReader.h

new file mode 100644 (file)

index 0000000..cc77491
--- /dev/null
+++ b/include/pbbam/ccs/CCSRecordReader.h
@@ -0,0 +1,42 @@
+// Author: Derek Barnett
+
+#ifndef PBBAM_CCS_CCSRECORDREADER_H
+#define PBBAM_CCS_CCSRECORDREADER_H
+
+#include "pbbam/Config.h"
+
+#include <iosfwd>
+#include <memory>
+#include <string>
+
+#include "pbbam/ccs/CCSHeader.h"
+#include "pbbam/ccs/CCSRecord.h"
+#include "pbbam/internal/QueryBase.h"
+
+namespace PacBio {
+namespace CCS {
+
+///
+/// Reads CCSRecords (from stdin per the original note; a std::istream can also be supplied)
+///
+class CCSRecordReader : public PacBio::BAM::internal::QueryBase<CCSRecord>
+{
+public:
+    CCSRecordReader();
+    CCSRecordReader(std::istream& in);  // NOTE(review): not explicit; implicit conversion allowed
+    ~CCSRecordReader();
+
+public:
+    const CCSHeader& Header() const;
+
+    bool GetNext(CCSRecord& record);  // fills `record`; bool presumably signals success - confirm
+
+private:
+    class CCSRecordReaderPrivate;  // only forward-declared here; defined outside this header
+    std::unique_ptr<CCSRecordReaderPrivate> d_;  // owning pointer to the private implementation
+};
+
+}  // namespace CCS
+}  // namespace PacBio
+
+#endif  // PBBAM_CCS_CCSRECORDREADER_H
diff --git a/include/pbbam/ccs/CCSRecordWriter.h b/include/pbbam/ccs/CCSRecordWriter.h

new file mode 100644 (file)

index 0000000..0e6edfe
--- /dev/null
+++ b/include/pbbam/ccs/CCSRecordWriter.h
@@ -0,0 +1,50 @@
+#ifndef PBBAM_CCS_CCSRECORDWRITER_H
+#define PBBAM_CCS_CCSRECORDWRITER_H
+
+#include "pbbam/Config.h"
+
+#include <iosfwd>
+#include <memory>
+#include <string>
+
+#include "pbbam/ccs/CCSHeader.h"
+#include "pbbam/ccs/CCSRecord.h"
+
+namespace PacBio {
+namespace CCS {
+
+///
+/// Writes CCSRecords (to stdout per the original note; a std::ostream can also be supplied)
+///
+class CCSRecordWriter
+{
+public:
+    ///
+    /// \brief Construct a new CCSRecordWriter object
+    ///
+    /// \param header   the CCS header (how it is used is defined by the implementation)
+    ///
+    CCSRecordWriter(const CCSHeader& header);
+
+    CCSRecordWriter(const CCSHeader& header, std::ostream& out);
+
+    ~CCSRecordWriter();
+
+    ///
+    /// \brief Writes a single record.
+    ///
+    /// \param record   the record to write
+    ///
+    /// \note This method returns void; it has no success/failure return value.
+    ///
+    void Write(const CCSRecord& record);
+
+private:
+    class CCSRecordWriterPrivate;
+    std::unique_ptr<CCSRecordWriterPrivate> d_;
+};
+
+}  // namespace CCS
+}  // namespace PacBio
+
+#endif  //  PBBAM_CCS_CCSRECORDWRITER_H
diff --git a/include/pbbam/exception/BundleChemistryMappingException.h b/include/pbbam/exception/BundleChemistryMappingException.h

new file mode 100644 (file)

index 0000000..1a45754
--- /dev/null
+++ b/include/pbbam/exception/BundleChemistryMappingException.h
@@ -0,0 +1,45 @@
+// File Description
+/// \file BundleChemistryMappingException.h
+/// \brief Defines the BundleChemistryMappingException class.
+//
+// Author: Derek Barnett, Lance Hepler
+
+#ifndef BUNDLECHEMISTRYMAPPINGEXCEPTION_H
+#define BUNDLECHEMISTRYMAPPINGEXCEPTION_H
+
+#include "pbbam/Config.h"
+
+#include <exception>
+#include <string>
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The BundleChemistryMappingException class represents an exception
+///        that will be thrown when an invalid sequencing chemistry combination
+///        is encountered.
+///
+class BundleChemistryMappingException : public std::exception
+{
+public:
+    BundleChemistryMappingException(std::string mappingXml, std::string msg)
+        : mappingXml_(std::move(mappingXml))
+        , what_(std::string("invalid ") + mappingXml_ + ": " + std::move(msg))
+    {
+    }
+
+    // This is a work around for the Intel PHI compiler (icpc)
+    ~BundleChemistryMappingException() throw() {}
+
+public:
+    const char* what() const noexcept override { return what_.c_str(); }
+
+protected:
+    std::string mappingXml_;
+    std::string what_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // BUNDLECHEMISTRYMAPPINGEXCEPTION_H
diff --git a/include/pbbam/exception/InvalidSequencingChemistryException.h b/include/pbbam/exception/InvalidSequencingChemistryException.h

new file mode 100644 (file)

index 0000000..0836f38
--- /dev/null
+++ b/include/pbbam/exception/InvalidSequencingChemistryException.h
@@ -0,0 +1,63 @@
+// File Description
+/// \file InvalidSequencingChemistryException.h
+/// \brief Defines the InvalidSequencingChemistryException class.
+//
+// Author: Derek Barnett
+
+#ifndef INVALIDSEQUENCINGCHEMISTRYEXCEPTION_H
+#define INVALIDSEQUENCINGCHEMISTRYEXCEPTION_H
+
+#include "pbbam/Config.h"
+
+#include <exception>
+#include <sstream>
+#include <string>
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The InvalidSequencingChemistryException class represents an exception
+///        that will be thrown when an invalid sequencing chemistry combination
+///        is encountered.
+///
+class InvalidSequencingChemistryException : public std::exception
+{
+public:
+    InvalidSequencingChemistryException(std::string bindingKit, std::string sequencingKit,
+                                        std::string basecallerVersion)
+        : bindingKit_(std::move(bindingKit))
+        , sequencingKit_(std::move(sequencingKit))
+        , basecallerVersion_(std::move(basecallerVersion))
+    {
+        std::ostringstream s;
+        s << "unsupported sequencing chemistry combination:\n"
+          << "    binding kit:        " << bindingKit_ << '\n'
+          << "    sequencing kit:     " << sequencingKit_ << '\n'
+          << "    basecaller version: " << basecallerVersion_ << '\n';
+        what_ = s.str();
+    }
+
+    // This is a work around for the Intel PHI compiler (icpc)
+    ~InvalidSequencingChemistryException() throw() {}
+
+public:
+    const std::string& BindingKit() const { return bindingKit_; }
+
+    const std::string& SequencingKit() const { return sequencingKit_; }
+
+    const std::string& BasecallerVersion() const { return basecallerVersion_; }
+
+public:
+    const char* what() const noexcept override { return what_.c_str(); }
+
+protected:
+    std::string bindingKit_;
+    std::string sequencingKit_;
+    std::string basecallerVersion_;
+    std::string what_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // INVALIDSEQUENCINGCHEMISTRYEXCEPTION_H
diff --git a/include/pbbam/exception/ValidationException.h b/include/pbbam/exception/ValidationException.h

new file mode 100644 (file)

index 0000000..804f135
--- /dev/null
+++ b/include/pbbam/exception/ValidationException.h
@@ -0,0 +1,60 @@
+// File Description
+/// \file ValidationException.h
+/// \brief Defines the ValidationException class.
+//
+// Author: Derek Barnett
+
+#ifndef VALIDATIONEXCEPTION_H
+#define VALIDATIONEXCEPTION_H
+
+#include "pbbam/Config.h"
+
+#include <map>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The ValidationExecption represents an exception that will be thrown
+///        when any error is encountered using the Validator API. In addition to
+///        a default display message, it provides programmatic access to all
+///        reported error messages.
+///
+/// \sa Validator::Validate(const BamRecord& record)
+///
+class ValidationException : public std::runtime_error
+{
+public:
+    using ErrorList = std::vector<std::string>;
+    using ErrorMap = std::map<std::string, ErrorList>;
+
+public:
+    ValidationException(ErrorMap fileErrors, ErrorMap readGroupErrors, ErrorMap recordErrors);
+
+    // This is a work around for the Intel PHI compiler (icpc)
+    ~ValidationException() throw() {}
+
+public:
+    const ErrorMap& FileErrors() const;
+    const ErrorMap& ReadGroupErrors() const;
+    const ErrorMap& RecordErrors() const;
+
+    const char* what() const noexcept override;
+
+private:
+    ErrorMap fileErrors_;
+    ErrorMap readGroupErrors_;
+    ErrorMap recordErrors_;
+    std::string msg_;
+
+private:
+    void FormatMessage();
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // VALIDATIONEXCEPTION_H
diff --git a/include/pbbam/internal/Compare.inl b/include/pbbam/internal/Compare.inl

new file mode 100644 (file)

index 0000000..663556f
--- /dev/null
+++ b/include/pbbam/internal/Compare.inl
@@ -0,0 +1,43 @@
+// File Description
+/// \file Compare.inl
+/// \brief Inline implementations for the Compare class & inner classes.
+//
+// Author: Derek Barnett
+
+#include "pbbam/Compare.h"
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+template <typename FnType, FnType> struct MemberFnProxy;
+// Specialization for const member functions: call() invokes MemFn on the given object.
+template <typename Class, typename Ret, typename... Params, Ret (Class::*MemFn)(Params...) const>
+struct MemberFnProxy<Ret (Class::*)(Params...) const, MemFn>
+{
+    static Ret call(const Class& target, Params&&... params)
+    {
+        return (target.*MemFn)(std::forward<Params>(params)...);
+    }
+};
+
+} // namespace internal
+
+template<typename ValueType,
+         typename Compare::MemberFunctionBaseHelper<ValueType>::MemberFnType fn,
+         typename CompareType>
+inline bool Compare::MemberFunctionBase<ValueType, fn, CompareType>::operator()(const BamRecord& lhs,
+                                                                                const BamRecord& rhs) const
+{
+    using MemberFnTypeImpl = typename Compare::MemberFunctionBaseHelper<ValueType>::MemberFnType;
+    using Proxy = internal::MemberFnProxy<MemberFnTypeImpl, fn>;
+
+    CompareType cmp;
+    return cmp(Proxy::call(lhs), Proxy::call(rhs));
+}
+
+// Null comparator: never reports the first record as ordered before the second.
+inline bool Compare::None::operator()(const BamRecord&, const BamRecord&) const { return false; }
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/CompositeBamReader.inl b/include/pbbam/internal/CompositeBamReader.inl

new file mode 100644 (file)

index 0000000..fcc6ab5
--- /dev/null
+++ b/include/pbbam/internal/CompositeBamReader.inl
@@ -0,0 +1,360 @@
+// File Description
+/// \file CompositeBamReader.inl
+/// \brief Inline implementations for the composite BAM readers, for
+///        working with multiple input files.
+//
+// Author: Derek Barnett
+
+#include "pbbam/CompositeBamReader.h"
+
+#include <algorithm>
+#include <iostream>
+#include <set>
+#include <sstream>
+#include <stdexcept>
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// -----------------------------------
+// Merging helpers
+// -----------------------------------
+
+// Takes ownership of `rdr`; the record member is not set by this overload.
+inline CompositeMergeItem::CompositeMergeItem(std::unique_ptr<BamReader> rdr)
+    : reader{std::move(rdr)}
+{}
+
+// Takes ownership of `rdr` and moves `rec` into the record member.
+inline CompositeMergeItem::CompositeMergeItem(std::unique_ptr<BamReader> rdr, BamRecord rec)
+    : reader{std::move(rdr)}, record{std::move(rec)}
+{}
+
+template <typename CompareType>
+inline bool CompositeMergeItemSorter<CompareType>::operator()(const CompositeMergeItem& lhs,
+                                                              const CompositeMergeItem& rhs) const
+{
+    const auto& l = lhs.record;
+    const auto& r = rhs.record;
+    return CompareType()(l, r);
+}
+
+}  // namespace internal
+
+// -----------------------------------
+// GenomicIntervalCompositeBamReader
+// -----------------------------------
+
+inline GenomicIntervalCompositeBamReader::GenomicIntervalCompositeBamReader(
+    const std::vector<BamFile>& bamFiles)
+    : GenomicIntervalCompositeBamReader{bamFiles, MakeBaiIndexCache(bamFiles)}
+{
+}
+
+inline GenomicIntervalCompositeBamReader::GenomicIntervalCompositeBamReader(
+    const std::vector<BamFile>& bamFiles, const BaiIndexCache& cache)
+{
+    indexCache_ = cache;
+
+    filenames_.reserve(bamFiles.size());
+    for (const auto& bamFile : bamFiles)
+        filenames_.push_back(bamFile.Filename());
+}
+
+inline GenomicIntervalCompositeBamReader::GenomicIntervalCompositeBamReader(const DataSet& dataset)
+    : GenomicIntervalCompositeBamReader{dataset.BamFiles()}
+{
+}
+
+inline GenomicIntervalCompositeBamReader::GenomicIntervalCompositeBamReader(
+    const DataSet& dataset, const BaiIndexCache& cache)
+    : GenomicIntervalCompositeBamReader{dataset.BamFiles(), cache}
+{
+}
+
+inline GenomicIntervalCompositeBamReader::GenomicIntervalCompositeBamReader(
+    const GenomicInterval& interval, const std::vector<BamFile>& bamFiles)
+    : GenomicIntervalCompositeBamReader{interval, bamFiles, MakeBaiIndexCache(bamFiles)}
+{
+}
+
+inline GenomicIntervalCompositeBamReader::GenomicIntervalCompositeBamReader(
+    const GenomicInterval& interval, const std::vector<BamFile>& bamFiles,
+    const BaiIndexCache& cache)
+    : GenomicIntervalCompositeBamReader{bamFiles, cache}
+{
+    Interval(interval);
+}
+
+inline GenomicIntervalCompositeBamReader::GenomicIntervalCompositeBamReader(
+    const GenomicInterval& interval, const DataSet& dataset)
+    : GenomicIntervalCompositeBamReader{interval, dataset.BamFiles()}
+{
+}
+
+inline GenomicIntervalCompositeBamReader::GenomicIntervalCompositeBamReader(
+    const GenomicInterval& interval, const DataSet& dataset, const BaiIndexCache& cache)
+    : GenomicIntervalCompositeBamReader{interval, dataset.BamFiles(), cache}
+{
+}
+
+inline bool GenomicIntervalCompositeBamReader::GetNext(BamRecord& record)
+{
+    // nothing left to read
+    if (mergeItems_.empty()) return false;
+
+    // non-destructive 'pop' of first item from queue
+    auto firstIter = mergeItems_.begin();
+    auto firstItem =
+        internal::CompositeMergeItem{std::move(firstIter->reader), std::move(firstIter->record)};
+    mergeItems_.pop_front();
+
+    // store its record in our output record
+    std::swap(record, firstItem.record);
+
+    // try fetch 'next' from first item's reader
+    // if successful, re-insert it into container & re-sort on our new values
+    // otherwise, this item will go out of scope & reader destroyed
+    if (firstItem.reader->GetNext(firstItem.record)) {
+        mergeItems_.push_front(std::move(firstItem));
+        UpdateSort();
+    }
+
+    // return success
+    return true;
+}
+
+// Accessor for the stored genomic interval.
+// Returns interval_ by const reference; nothing is copied or modified.
+inline const GenomicInterval& GenomicIntervalCompositeBamReader::Interval() const
+{ return interval_; }
+
+inline GenomicIntervalCompositeBamReader& GenomicIntervalCompositeBamReader::Interval(
+    const GenomicInterval& interval)
+{
+    // reset readers
+    mergeItems_.clear();
+
+    // create readers for files
+    std::deque<internal::CompositeMergeItem> updatedMergeItems;
+    std::vector<std::string> missingBai;
+    for (size_t i = 0; i < filenames_.size(); ++i) {
+        const BamFile bamFile{filenames_.at(i)};
+        if (bamFile.StandardIndexExists()) {
+            internal::CompositeMergeItem item{std::unique_ptr<BamReader>{
+                new BaiIndexedBamReader{interval, std::move(bamFile), indexCache_->at(i)}}};
+            if (item.reader->GetNext(item.record)) updatedMergeItems.push_back(std::move(item));
+            // else not an error, simply no data matching interval
+        } else {
+            // maybe handle PBI-backed interval searches if BAI missing, but for now treat as error
+            missingBai.push_back(bamFile.Filename());
+        }
+    }
+
+    // throw if any files missing BAI
+    if (!missingBai.empty()) {
+        std::ostringstream e;
+        e << "GenomicIntervalCompositeBamReader: failed to open because the following files are "
+             "missing a *.bai index:\n";
+        for (const auto& fn : missingBai)
+            e << "  " << fn << '\n';
+        throw std::runtime_error{e.str()};
+    }
+
+    // update our actual container and return
+    mergeItems_ = std::move(updatedMergeItems);
+    UpdateSort();
+    return *this;
+}
+
+struct OrderByPosition
+{
+    static inline bool less_than(const BamRecord& lhs, const BamRecord& rhs)
+    {
+        const int32_t lhsId = lhs.ReferenceId();
+        const int32_t rhsId = rhs.ReferenceId();
+        if (lhsId == -1) return false;
+        if (rhsId == -1) return true;
+
+        if (lhsId == rhsId)
+            return lhs.ReferenceStart() < rhs.ReferenceStart();
+        else
+            return lhsId < rhsId;
+    }
+
+    static inline bool equals(const BamRecord& lhs, const BamRecord& rhs)
+    {
+        return lhs.ReferenceId() == rhs.ReferenceId() &&
+               lhs.ReferenceStart() == rhs.ReferenceStart();
+    }
+};
+
+// Comparator for sorting merge items: orders them by the position of each item's record.
+// (No std::binary_function base: it is deprecated since C++11 and removed in C++17.)
+struct PositionSorter
+{
+    bool operator()(const internal::CompositeMergeItem& lhs,
+                    const internal::CompositeMergeItem& rhs) const
+    {
+        // delegate to the shared (reference ID, reference start) ordering
+        return OrderByPosition::less_than(lhs.record, rhs.record);
+    }
+};
+
+// Re-sorts mergeItems_ with PositionSorter, so the item GetNext() takes from the
+// front is the first one in position order.
+inline void GenomicIntervalCompositeBamReader::UpdateSort()
+{ std::sort(mergeItems_.begin(), mergeItems_.end(), PositionSorter{}); }
+
+// ------------------------------
+// PbiFilterCompositeBamReader
+// ------------------------------
+
+template <typename OrderByType>
+inline PbiFilterCompositeBamReader<OrderByType>::PbiFilterCompositeBamReader(
+    const PbiFilter& filter, const std::vector<BamFile>& bamFiles)
+    : PbiFilterCompositeBamReader{filter, bamFiles, MakePbiIndexCache(bamFiles)}
+{
+}
+
+template <typename OrderByType>
+inline PbiFilterCompositeBamReader<OrderByType>::PbiFilterCompositeBamReader(
+    const PbiFilter& filter, const std::vector<BamFile>& bamFiles, const PbiIndexCache& cache)
+    : indexCache_{cache}, numReads_{0}
+{
+    filenames_.reserve(bamFiles.size());
+    for (const auto& bamFile : bamFiles)
+        filenames_.push_back(bamFile.Filename());
+    Filter(filter);
+}
+
+template <typename OrderByType>
+inline PbiFilterCompositeBamReader<OrderByType>::PbiFilterCompositeBamReader(
+    const PbiFilter& filter, const DataSet& dataset)
+    : PbiFilterCompositeBamReader{filter, dataset.BamFiles()}
+{
+}
+
+template <typename OrderByType>
+inline PbiFilterCompositeBamReader<OrderByType>::PbiFilterCompositeBamReader(
+    const PbiFilter& filter, const DataSet& dataset, const PbiIndexCache& cache)
+    : PbiFilterCompositeBamReader{filter, dataset.BamFiles(), cache}
+{
+}
+
+template <typename OrderByType>
+inline bool PbiFilterCompositeBamReader<OrderByType>::GetNext(BamRecord& record)
+{
+    // nothing left to read
+    if (mergeQueue_.empty()) return false;
+
+    // non-destructive 'pop' of first item from queue
+    auto firstIter = mergeQueue_.begin();
+    value_type firstItem{std::move(firstIter->reader), std::move(firstIter->record)};
+    mergeQueue_.pop_front();
+
+    // store its record in our output record
+    std::swap(record, firstItem.record);
+
+    // try fetch 'next' from first item's reader
+    // if successful, re-insert it into container & re-sort on our new values
+    // otherwise, this item will go out of scope & reader destroyed
+    if (firstItem.reader->GetNext(firstItem.record)) {
+        mergeQueue_.push_front(std::move(firstItem));
+        UpdateSort();
+    }
+
+    // return success
+    return true;
+}
+
+template <typename OrderByType>
+inline PbiFilterCompositeBamReader<OrderByType>& PbiFilterCompositeBamReader<OrderByType>::Filter(
+    const PbiFilter& filter)
+{
+    // std::cerr << "PbiFilterCompositeBamReader<OrderByType>::Filter()\n";
+
+    // reset reader queue
+    mergeQueue_.clear();
+
+    // create readers for files
+    container_type updatedMergeItems;
+    std::vector<std::string> missingPbi;
+    for (size_t i = 0; i < filenames_.size(); ++i) {
+        const BamFile bamFile{filenames_.at(i)};
+        if (bamFile.PacBioIndexExists()) {
+            auto item = internal::CompositeMergeItem{std::unique_ptr<BamReader>{
+                new PbiIndexedBamReader{filter, std::move(bamFile), indexCache_->at(i)}}};
+            if (item.reader->GetNext(item.record)) updatedMergeItems.push_back(std::move(item));
+            // else not an error, simply no data matching filter
+        } else
+            missingPbi.push_back(filenames_.at(i));
+    }
+
+    // throw if any files missing PBI
+    if (!missingPbi.empty()) {
+        std::ostringstream e;
+        e << "PbiFilterCompositeBamReader: failed to open because the following files are "
+             "missing a *.pbi index:\n";
+        for (const auto& fn : missingPbi)
+            e << "  " << fn << '\n';
+        throw std::runtime_error{e.str()};
+    }
+
+    // update our actual container, store num matching reads, sort & and return
+    mergeQueue_ = std::move(updatedMergeItems);
+    numReads_ = 0;
+    for (const auto& item : mergeQueue_) {
+        auto* pbiReader = dynamic_cast<PbiIndexedBamReader*>(item.reader.get());
+        numReads_ += pbiReader->NumReads();
+    }
+    UpdateSort();
+    return *this;
+}
+
+// Returns the read count held in numReads_, which Filter() recomputes by summing
+// NumReads() over the readers it keeps.
+template <typename OrderByType>
+inline uint32_t PbiFilterCompositeBamReader<OrderByType>::NumReads() const
+{ return numReads_; }
+
+// Re-sorts the merge queue with merge_sorter_type; std::stable_sort keeps the existing
+// relative order of items that compare equal.
+template <typename OrderByType>
+inline void PbiFilterCompositeBamReader<OrderByType>::UpdateSort()
+{ std::stable_sort(mergeQueue_.begin(), mergeQueue_.end(), merge_sorter_type{}); }
+
+// ------------------------------
+// SequentialCompositeBamReader
+// ------------------------------
+
+// Creates one BamReader per input file, in the order given (each file is moved into its reader).
+inline SequentialCompositeBamReader::SequentialCompositeBamReader(std::vector<BamFile> bamFiles)
+{
+    for (auto& file : bamFiles) {
+        readers_.emplace_back(std::make_unique<BamReader>(std::move(file)));
+    }
+}
+inline SequentialCompositeBamReader::SequentialCompositeBamReader(const DataSet& dataset)
+    : SequentialCompositeBamReader{dataset.BamFiles()}
+{}  // dataset overload: delegates with the dataset's BAM files
+
+// Fills `record` from the first reader that still has data.
+// Returns false only when every reader has been used up.
+inline bool SequentialCompositeBamReader::GetNext(BamRecord& record)
+{
+    // The front reader is asked for a record; a reader that has nothing more
+    // to give is dropped before the next one is tried.
+    for (; !readers_.empty(); readers_.pop_front()) {
+        if (readers_.front()->GetNext(record)) {
+            return true;
+        }
+    }
+
+    // no readers available
+    return false;
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/include/pbbam/internal/DataSetBaseTypes.h b/include/pbbam/internal/DataSetBaseTypes.h

new file mode 100644 (file)

index 0000000..8d3c13a
--- /dev/null
+++ b/include/pbbam/internal/DataSetBaseTypes.h
@@ -0,0 +1,149 @@
+// Author: Derek Barnett
+
+#ifndef DATASETBASETYPES_H
+#define DATASETBASETYPES_H
+
+#include "pbbam/Config.h"
+
+#include "pbbam/internal/DataSetElement.h"
+
+#include <string>
+
+namespace PacBio {
+namespace BAM {
+
+class DataSetMetadata;
+class Extensions;
+class ExternalResources;
+class FileIndices;
+class Filters;
+class Properties;
+class Provenance;
+
+namespace internal {
+
+class BaseEntityType : public DataSetElement  // base for entities (CreatedAt, Name, Version...)
+{
+protected:  // constructors are protected: only derived classes (or friends) can construct
+    BaseEntityType(const std::string& label, const XsdType& xsd = XsdType::BASE_DATA_MODEL);
+    BaseEntityType(const std::string& label, const FromInputXml& fromInputXml,
+                   const XsdType& xsd = XsdType::BASE_DATA_MODEL);
+
+public:  // read-only accessors (const overloads)
+    const std::string& CreatedAt() const;
+    const std::string& Description() const;
+    const PacBio::BAM::Extensions& Extensions() const;
+    const std::string& Format() const;
+    const std::string& ModifiedAt() const;
+    const std::string& Name() const;
+    const std::string& ResourceId() const;
+    const std::string& Tags() const;
+    const std::string& Version() const;
+    // mutable accessors: non-const overloads returning non-const references
+    std::string& CreatedAt();
+    std::string& Description();
+    PacBio::BAM::Extensions& Extensions();
+    std::string& Format();
+    std::string& ModifiedAt();
+    std::string& Name();
+    std::string& ResourceId();
+    std::string& Tags();
+    std::string& Version();
+    // setters: each returns BaseEntityType& (presumably *this, to allow chaining - confirm)
+    BaseEntityType& CreatedAt(const std::string& createdAt);
+    BaseEntityType& Description(const std::string& description);
+    BaseEntityType& Extensions(const PacBio::BAM::Extensions& extensions);
+    BaseEntityType& Format(const std::string& format);
+    BaseEntityType& ModifiedAt(const std::string& modifiedAt);
+    BaseEntityType& Name(const std::string& name);
+    BaseEntityType& ResourceId(const std::string& resourceId);
+    BaseEntityType& Tags(const std::string& tags);
+    BaseEntityType& Version(const std::string& version);
+};
+
+class DataEntityType : public BaseEntityType  // adds checksum/value/meta-type style attributes
+{
+protected:  // constructors are protected: only derived classes (or friends) can construct
+    DataEntityType(const std::string& label, const XsdType& xsd = XsdType::BASE_DATA_MODEL);
+    DataEntityType(const std::string& label, const FromInputXml& fromInputXml,
+                   const XsdType& xsd = XsdType::BASE_DATA_MODEL);
+
+public:  // read-only accessors (const overloads)
+    const std::string& Checksum() const;
+    const std::string& EncodedValue() const;
+    const std::string& MetaType() const;
+    const std::string& SimpleValue() const;
+    const std::string& TimeStampedName() const;
+    const std::string& UniqueId() const;
+    const std::string& ValueDataType() const;
+    // mutable accessors: non-const overloads returning non-const references
+    std::string& Checksum();
+    std::string& EncodedValue();
+    std::string& MetaType();
+    std::string& SimpleValue();
+    std::string& TimeStampedName();
+    std::string& UniqueId();
+    std::string& ValueDataType();
+    // setters: each returns DataEntityType& (presumably *this, to allow chaining - confirm)
+    DataEntityType& Checksum(const std::string& checksum);
+    DataEntityType& EncodedValue(const std::string& encodedValue);
+    DataEntityType& MetaType(const std::string& metatype);
+    DataEntityType& SimpleValue(const std::string& simpleValue);
+    DataEntityType& TimeStampedName(const std::string& timeStampedName);
+    DataEntityType& UniqueId(const std::string& uuid);
+    DataEntityType& ValueDataType(const std::string& valueDataType);
+};
+
+class StrictEntityType : public BaseEntityType  // constructors additionally take a metatype
+{
+protected:  // constructors are protected: only derived classes (or friends) can construct
+    StrictEntityType(const std::string& metatype, const std::string& label,
+                     const XsdType& xsd = XsdType::BASE_DATA_MODEL);
+    StrictEntityType(const std::string& metatype, const std::string& label,
+                     const FromInputXml& fromInputXml,
+                     const XsdType& xsd = XsdType::BASE_DATA_MODEL);
+
+public:  // read-only accessors (const overloads)
+    const std::string& MetaType() const;
+    const std::string& TimeStampedName() const;
+    const std::string& UniqueId() const;
+    // mutable accessors: non-const overloads returning non-const references
+    std::string& MetaType();
+    std::string& TimeStampedName();
+    std::string& UniqueId();
+    // setters: each returns StrictEntityType& (presumably *this, to allow chaining - confirm)
+    StrictEntityType& MetaType(const std::string& metatype);
+    StrictEntityType& TimeStampedName(const std::string& timeStampedName);
+    StrictEntityType& UniqueId(const std::string& uuid);
+};
+
+class InputOutputDataType : public StrictEntityType  // constructors additionally take a filename
+{
+protected:  // declares constructors only; no further members are added in this class
+    InputOutputDataType(const std::string& metatype, const std::string& filename,
+                        const std::string& label, const XsdType& xsd = XsdType::BASE_DATA_MODEL);
+    InputOutputDataType(const std::string& metatype, const std::string& filename,
+                        const std::string& label, const FromInputXml& fromInputXml,
+                        const XsdType& xsd = XsdType::BASE_DATA_MODEL);
+};
+
+class IndexedDataType : public InputOutputDataType  // adds access to a FileIndices element
+{
+protected:  // constructors are protected: only derived classes (or friends) can construct
+    IndexedDataType(const std::string& metatype, const std::string& filename,
+                    const std::string& label, const XsdType& xsd = XsdType::BASE_DATA_MODEL);
+    IndexedDataType(const std::string& metatype, const std::string& filename,
+                    const std::string& label, const FromInputXml& fromInputXml,
+                    const XsdType& xsd = XsdType::BASE_DATA_MODEL);
+
+public:  // const accessor, mutable accessor, and setter for FileIndices
+    const PacBio::BAM::FileIndices& FileIndices() const;
+    PacBio::BAM::FileIndices& FileIndices();
+    IndexedDataType& FileIndices(const PacBio::BAM::FileIndices& indices);
+};
+
+}  // namespace internal
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // DATASETBASETYPES_H
diff --git a/include/pbbam/internal/DataSetElement.h b/include/pbbam/internal/DataSetElement.h

new file mode 100644 (file)

index 0000000..38c7a3f
--- /dev/null
+++ b/include/pbbam/internal/DataSetElement.h
@@ -0,0 +1,202 @@
+// Author: Derek Barnett
+
+#ifndef DATASETELEMENT_H
+#define DATASETELEMENT_H
+
+#include "pbbam/Config.h"
+
+#include "pbbam/DataSetXsd.h"
+
+#include <algorithm>
+#include <cassert>
+#include <map>
+#include <memory>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include <boost/utility/string_ref.hpp>
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// Stores an XML element name as a single qualified string plus the sizes/offset
+// needed to view its prefix and local-name parts without copying.
+class XmlName
+{
+    //    qualified name
+    //       |
+    //  --------------
+    // <pbns:node_name >
+    //  ---- ---------
+    //   |        |
+    //  prefix    local name
+
+public:
+    // Builds from a full (possibly prefixed) name; the first ':' (if present and not
+    // at position 0) separates prefix from local name. Not 'explicit', so a
+    // std::string converts implicitly to XmlName.
+    XmlName(std::string fullName, bool verbatim = false);
+    // Builds "prefix:localName" (or just "localName" when prefix is empty);
+    // the resulting name is always flagged verbatim.
+    XmlName(const std::string& localName, const std::string& prefix);
+
+public:
+    // Equality is decided by the qualified name only.
+    bool operator==(const XmlName& other) const;
+    bool operator!=(const XmlName& other) const;
+
+public:
+    // Views into qualifiedName_; they are only valid while this XmlName is alive
+    // and unmodified.
+    const boost::string_ref LocalName() const;
+    const boost::string_ref Prefix() const;
+    const std::string& QualifiedName() const;
+    // Returns the 'verbatim' flag given at construction.
+    bool Verbatim() const;
+
+private:
+    std::string qualifiedName_;  // full name, including "prefix:" when present
+    size_t prefixSize_;          // number of characters in the prefix (0 if none)
+    size_t localNameOffset_;     // index in qualifiedName_ where the local name starts
+    size_t localNameSize_;       // number of characters in the local name
+    bool verbatim_;              // flag supplied by the constructor
+};
+
+// Empty tag type used purely for constructor overload selection. The DataSetElement
+// constructor that accepts it stores its label with the 'verbatim' flag set.
+struct FromInputXml
+{
+};
+
+// Generic XML-like tree node: a label (XmlName), text, a string->string attribute
+// map, and an ordered list of shared child nodes.
+class DataSetElement
+{
+public:
+    // Builds a node with the given label (parsed non-verbatim) and XSD type.
+    DataSetElement(const std::string& label, const XsdType& xsd = XsdType::NONE);
+    // Tag-dispatched overload: stores the label with the verbatim flag set.
+    DataSetElement(const std::string& label, const FromInputXml& fromInputXml,
+                   const XsdType& xsd = XsdType::NONE);
+
+    virtual ~DataSetElement() = default;
+
+public:
+    // Compares xsd, label, text, attributes and children. Children are compared as
+    // shared_ptr values (i.e. by pointer identity), not by pointed-to content.
+    bool operator==(const DataSetElement& other) const;
+    bool operator!=(const DataSetElement& other) const;
+
+public:
+    // Const lookup returns SharedNullString() when the attribute is absent.
+    const std::string& Attribute(const std::string& name) const;
+    // Non-const lookup uses map::operator[], so a missing attribute is inserted
+    // with an empty value.
+    std::string& Attribute(const std::string& name);
+    const std::map<std::string, std::string>& Attributes() const;
+    std::map<std::string, std::string>& Attributes();
+    bool HasAttribute(const std::string& name) const;
+
+    const std::vector<std::shared_ptr<DataSetElement>>& Children() const;
+    std::vector<std::shared_ptr<DataSetElement>>& Children();
+    // True if any child's local or qualified name matches 'label'.
+    bool HasChild(const std::string& label) const;
+
+    // Accessors forwarding to the stored XmlName label.
+    const boost::string_ref LocalNameLabel() const;
+    const boost::string_ref PrefixLabel() const;
+    const std::string& QualifiedNameLabel() const;
+    bool IsVerbatimLabel() const;
+
+    const std::string& Text() const;
+    std::string& Text();
+
+    const XsdType& Xsd() const;
+
+public:
+    // Setters. Label() replaces the label and marks it verbatim.
+    void Attribute(const std::string& name, const std::string& value);
+    void Label(const std::string& label);
+    void Text(const std::string& text);
+
+public:
+    size_t NumAttributes() const;
+    size_t NumChildren() const;
+    // Same value as NumChildren().
+    size_t Size() const;
+
+public:
+    // Appends a copy of 'e' held as its concrete type T.
+    template <typename T>
+    void AddChild(const T& e);
+
+    // Appends a copy of 'e' (as a plain DataSetElement).
+    void AddChild(const DataSetElement& e);
+    // Appends the given pointer as-is (shared ownership).
+    void AddChild(std::shared_ptr<DataSetElement> e);
+    // Removes every child that compares equal to 'e'.
+    void RemoveChild(const DataSetElement& e);
+
+    // Index-based child access, cast to T via dynamic_cast. An out-of-range index
+    // throws from vector::at(); a null stored pointer throws std::runtime_error.
+    template <typename T>
+    const T& Child(size_t index) const;
+
+    template <typename T>
+    T& Child(size_t index);
+
+    // Label-based child access. The const overload does not create children.
+    template <typename T>
+    const T& Child(const std::string& label) const;
+
+    // The non-const overload appends a new child when no child matches 'label'.
+    template <typename T>
+    T& Child(const std::string& label);
+
+    // operator[] overloads forward to the matching Child<T>() overload.
+    template <typename T>
+    const T& operator[](size_t index) const;
+
+    template <typename T>
+    T& operator[](size_t index);
+
+    template <typename T = DataSetElement>
+    const T& operator[](const std::string& label) const;
+
+    template <typename T = DataSetElement>
+    T& operator[](const std::string& label);
+
+protected:
+    // String returned by const lookups when the requested item does not exist.
+    // NOTE(review): defined outside this header; presumably an empty string - confirm.
+    static const std::string& SharedNullString();
+
+public:
+    // Text of the child with the given label. The const overload returns
+    // SharedNullString() if there is no such child; the non-const overload and the
+    // setter create the child first when it is missing.
+    const std::string& ChildText(const std::string& label) const;
+    std::string& ChildText(const std::string& label);
+    void ChildText(const std::string& label, const std::string& text);
+
+protected:
+    XsdType xsd_;
+    XmlName label_;
+    std::string text_;
+    std::map<std::string, std::string> attributes_;
+    std::vector<std::shared_ptr<DataSetElement>> children_;
+
+private:
+    // Position of the first child whose local or qualified name equals 'label',
+    // or -1 if there is none.
+    int IndexOf(const std::string& label) const;
+};
+
+// Shared state and comparison logic for the child iterators below: a (parent, index)
+// pair identifying one child position of a DataSetElement.
+class DataSetElementIteratorBase
+{
+public:
+    // Two iterators are equal when both parent pointer and index match.
+    bool operator==(const DataSetElementIteratorBase& other) const;
+    bool operator!=(const DataSetElementIteratorBase& other) const;
+
+protected:
+    // Only constructible through the derived iterator types.
+    DataSetElementIteratorBase(const DataSetElement* parent, size_t i);
+    // Moves to the next position; once the index is already at/after the parent's
+    // child count, the parent pointer is reset to nullptr instead.
+    void Advance();
+
+protected:
+    const DataSetElement* parent_;  // element whose children are iterated (non-owning)
+    size_t index_;                  // current child position
+};
+
+// Forward-style iterator over a DataSetElement's children, yielding each child as T&.
+template <typename T>
+class DataSetElementIterator : public DataSetElementIteratorBase
+{
+public:
+    // 'parent' is the element being iterated, 'i' the starting child index.
+    DataSetElementIterator(const DataSetElement* parent, size_t i);
+
+    // Access the child at the current index (via DataSetElement::Child<T>).
+    T& operator*();
+    T* operator->();
+
+    // Pre- and post-increment; both advance by one position.
+    DataSetElementIterator& operator++();
+    DataSetElementIterator operator++(int);
+};
+
+// Read-only counterpart of DataSetElementIterator, yielding each child as const T&.
+template <typename T>
+class DataSetElementConstIterator : public DataSetElementIteratorBase
+{
+public:
+    // 'parent' is the element being iterated, 'i' the starting child index.
+    DataSetElementConstIterator(const DataSetElement* parent, size_t i);
+
+    // Access the child at the current index (via DataSetElement::Child<const T>).
+    const T& operator*() const;
+    const T* operator->() const;
+
+    // Pre- and post-increment; both advance by one position.
+    DataSetElementConstIterator& operator++();
+    DataSetElementConstIterator operator++(int);
+};
+
+}  // namespace internal
+}  // namespace BAM
+}  // namespace PacBio
+
+#include "pbbam/internal/DataSetElement.inl"
+
+#endif  // DATASETELEMENT_H
diff --git a/include/pbbam/internal/DataSetElement.inl b/include/pbbam/internal/DataSetElement.inl

new file mode 100644 (file)

index 0000000..7e5f261
--- /dev/null
+++ b/include/pbbam/internal/DataSetElement.inl
@@ -0,0 +1,411 @@
+// Author: Derek Barnett
+
+#include "pbbam/internal/DataSetElement.h"
+
+#include <iostream>
+#include <stdexcept>
+#include <tuple>
+#include <typeinfo>
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// ----------------
+// DataSetElement
+// ----------------
+
+// Creates an element with the given XSD type; the label is handed to XmlName with
+// its default (non-verbatim) flag. Text, attributes and children start out empty.
+inline DataSetElement::DataSetElement(const std::string& label, const XsdType& xsd)
+    : xsd_{xsd}
+    , label_{label}
+{
+}
+
+// Tag-dispatched constructor: the unnamed FromInputXml argument only selects this
+// overload, which stores the label with the verbatim flag set to true.
+inline DataSetElement::DataSetElement(const std::string& label, const FromInputXml&,
+                                      const XsdType& xsd)
+    : xsd_{xsd}
+    , label_{label, true}
+{
+}
+
+// Member-wise equality, evaluated in declaration order and stopping at the first
+// mismatch. Note that children_ holds shared_ptrs, so children compare by pointer.
+inline bool DataSetElement::operator==(const DataSetElement& other) const
+{
+    return xsd_ == other.xsd_ && label_ == other.label_ && text_ == other.text_ &&
+           attributes_ == other.attributes_ && children_ == other.children_;
+}
+
+// Exact negation of operator==.
+inline bool DataSetElement::operator!=(const DataSetElement& other) const
+{
+    const bool same = (*this == other);
+    return !same;
+}
+
+// Index subscript (read-only): shorthand for Child<T>(index) const.
+template <typename T>
+const T& DataSetElement::operator[](size_t index) const
+{
+    return this->Child<T>(index);
+}
+
+// Index subscript (mutable): shorthand for Child<T>(index).
+template <typename T>
+T& DataSetElement::operator[](size_t index)
+{
+    return this->Child<T>(index);
+}
+
+// Label subscript (read-only): shorthand for Child<T>(label) const.
+template <typename T>
+const T& DataSetElement::operator[](const std::string& label) const
+{
+    return this->Child<T>(label);
+}
+
+// Label subscript (mutable): shorthand for Child<T>(label), which appends a new
+// child when no existing child matches the label.
+template <typename T>
+T& DataSetElement::operator[](const std::string& label)
+{
+    return this->Child<T>(label);
+}
+
+// Appends a copy of 'e', allocated as its concrete type T so the dynamic type of
+// the stored child is preserved.
+template <typename T>
+inline void DataSetElement::AddChild(const T& e)
+{
+    std::shared_ptr<T> copy = std::make_shared<T>(e);
+    children_.push_back(std::move(copy));
+}
+
+// Appends a copy of 'e' as a plain DataSetElement.
+inline void DataSetElement::AddChild(const DataSetElement& e)
+{
+    std::shared_ptr<DataSetElement> copy = std::make_shared<DataSetElement>(e);
+    children_.push_back(std::move(copy));
+}
+
+// Appends an already-allocated child, sharing ownership with the caller. The
+// parameter is taken by value, so it is moved into the container rather than
+// copied again (avoids an extra atomic reference-count increment/decrement).
+inline void DataSetElement::AddChild(std::shared_ptr<DataSetElement> e)
+{
+    children_.push_back(std::move(e));
+}
+
+// Mutable attribute access. Uses map::operator[], so asking for an attribute that
+// does not exist yet inserts it with an empty value.
+inline std::string& DataSetElement::Attribute(const std::string& name)
+{
+    std::string& value = attributes_[name];
+    return value;
+}
+
+// Read-only attribute access. Never inserts: a missing attribute yields the
+// shared null string instead.
+inline const std::string& DataSetElement::Attribute(const std::string& name) const
+{
+    const auto found = attributes_.find(name);
+    return (found != attributes_.cend()) ? found->second : SharedNullString();
+}
+
+// Sets (or overwrites) the attribute 'name' to 'value'.
+inline void DataSetElement::Attribute(const std::string& name, const std::string& value)
+{
+    std::string& slot = attributes_[name];
+    slot = value;
+}
+
+// Read-only view of the whole attribute map.
+inline const std::map<std::string, std::string>& DataSetElement::Attributes() const
+{
+    return this->attributes_;
+}
+
+// Mutable access to the whole attribute map.
+inline std::map<std::string, std::string>& DataSetElement::Attributes()
+{
+    return this->attributes_;
+}
+
+// Returns the child at 'index', viewed as const T.
+//
+// Throws std::out_of_range (from vector::at) for an invalid index, and
+// std::runtime_error if the stored pointer is null or if the child's dynamic type
+// is not T. The latter check prevents dereferencing the null pointer that
+// dynamic_cast returns on a failed pointer cast.
+template <typename T>
+inline const T& DataSetElement::Child(size_t index) const
+{
+    const DataSetElement* child = children_.at(index).get();
+    if (child == nullptr)
+        throw std::runtime_error{"DataSetElement: cannot access null child at index: " +
+                                 std::to_string(index)};
+    const T* c = dynamic_cast<const T*>(child);
+    if (c == nullptr)
+        throw std::runtime_error{"DataSetElement: child at index " + std::to_string(index) +
+                                 " is not of the requested type"};
+    return *c;
+}
+
+// Returns the child at 'index', viewed as T.
+//
+// Throws std::out_of_range (from vector::at) for an invalid index, and
+// std::runtime_error if the stored pointer is null or if the child's dynamic type
+// is not T. The latter check prevents dereferencing the null pointer that
+// dynamic_cast returns on a failed pointer cast.
+template <typename T>
+inline T& DataSetElement::Child(size_t index)
+{
+    DataSetElement* child = children_.at(index).get();
+    if (child == nullptr)
+        throw std::runtime_error{"DataSetElement: cannot access null child at index: " +
+                                 std::to_string(index)};
+    T* c = dynamic_cast<T*>(child);
+    if (c == nullptr)
+        throw std::runtime_error{"DataSetElement: child at index " + std::to_string(index) +
+                                 " is not of the requested type"};
+    return *c;
+}
+
+// Read-only lookup by label. IndexOf() reports -1 when nothing matches; converted
+// to size_t that is an out-of-range index, so the index-based overload rejects it
+// through vector::at().
+template <typename T>
+inline const T& DataSetElement::Child(const std::string& label) const
+{
+    return Child<T>(IndexOf(label));
+}
+
+// Mutable lookup by label. When no child matches, a default-constructed T is
+// appended and returned, so this overload always yields a valid child.
+template <typename T>
+inline T& DataSetElement::Child(const std::string& label)
+{
+    const int found = IndexOf(label);
+    if (found < 0) {
+        // not present yet: create it at the end and hand back that new last child
+        AddChild(T());
+        return Child<T>(NumChildren() - 1);
+    }
+    assert(static_cast<size_t>(found) < NumChildren());
+    return Child<T>(found);
+}
+
+// Specialization for plain DataSetElement children: a missing child is created
+// with 'label' as its label (DataSetElement has no default constructor).
+template <>
+inline DataSetElement& DataSetElement::Child<DataSetElement>(const std::string& label)
+{
+    const int found = IndexOf(label);
+    if (found < 0) {
+        // not present yet: create it at the end and hand back that new last child
+        AddChild(DataSetElement{label});
+        return Child<DataSetElement>(NumChildren() - 1);
+    }
+    assert(static_cast<size_t>(found) < NumChildren());
+    return Child<DataSetElement>(found);
+}
+
+// Read-only view of the child list.
+inline const std::vector<std::shared_ptr<DataSetElement>>& DataSetElement::Children() const
+{
+    return this->children_;
+}
+
+// Mutable access to the child list.
+inline std::vector<std::shared_ptr<DataSetElement>>& DataSetElement::Children()
+{
+    return this->children_;
+}
+
+// Text of the child named 'label', or the shared null string if there is no such
+// child. Never modifies the element.
+inline const std::string& DataSetElement::ChildText(const std::string& label) const
+{
+    return HasChild(label) ? Child<DataSetElement>(label).Text() : SharedNullString();
+}
+
+// Mutable text of the child named 'label'; the child is created (with empty text)
+// first if it does not exist yet.
+inline std::string& DataSetElement::ChildText(const std::string& label)
+{
+    const bool missing = !HasChild(label);
+    if (missing) {
+        AddChild(DataSetElement(label));
+    }
+    DataSetElement& child = Child<DataSetElement>(label);
+    return child.Text();
+}
+
+// True if an attribute with this exact name is stored.
+inline bool DataSetElement::HasAttribute(const std::string& name) const
+{
+    return attributes_.count(name) != 0;
+}
+
+// True if IndexOf() finds a child for 'label' (it reports -1 when none matches).
+inline bool DataSetElement::HasChild(const std::string& label) const
+{
+    const int position = IndexOf(label);
+    return position != -1;
+}
+
+inline int DataSetElement::IndexOf(const std::string& label) const
+{
+    const size_t count = NumChildren();
+    for (size_t i = 0; i < count; ++i) {
+        const DataSetElement& child = *(children_.at(i).get());
+        if (child.LocalNameLabel() == label || child.QualifiedNameLabel() == label ||
+            child.label_ == label)
+            return i;
+    }
+    return -1;
+}
+
+// Local-name part of this element's label (view into the stored label).
+inline const boost::string_ref DataSetElement::LocalNameLabel() const
+{
+    return label_.LocalName();
+}
+
+// Prefix part of this element's label (view into the stored label; empty if none).
+inline const boost::string_ref DataSetElement::PrefixLabel() const
+{
+    return label_.Prefix();
+}
+
+// Full qualified label, including the prefix when present.
+inline const std::string& DataSetElement::QualifiedNameLabel() const
+{
+    const std::string& qualified = label_.QualifiedName();
+    return qualified;
+}
+
+// Replaces this element's label; the new label is always stored as verbatim.
+inline void DataSetElement::Label(const std::string& label)
+{
+    XmlName replacement(label, true);
+    label_ = std::move(replacement);
+}
+
+// Number of stored attributes.
+inline size_t DataSetElement::NumAttributes() const
+{
+    return attributes_.size();
+}
+
+// Number of direct children.
+inline size_t DataSetElement::NumChildren() const
+{
+    return children_.size();
+}
+
+// Container-style alias: also the number of direct children.
+inline size_t DataSetElement::Size() const
+{
+    return children_.size();
+}
+
+// Removes every child that compares equal to 'e'.
+//
+// Survivors are moved into a fresh vector which then replaces children_. Removed
+// children are therefore only released after the loop has finished, which keeps
+// 'e' valid throughout even when it refers to one of this element's own children.
+inline void DataSetElement::RemoveChild(const DataSetElement& e)
+{
+    std::vector<std::shared_ptr<DataSetElement>> kept;
+    for (auto& candidate : children_) {
+        const bool matches = (*candidate == e);
+        if (!matches) {
+            kept.push_back(std::move(candidate));
+        }
+    }
+    children_ = std::move(kept);
+}
+
+// Sets the text of the child named 'label', creating that child if necessary.
+inline void DataSetElement::ChildText(const std::string& label, const std::string& text)
+{
+    if (HasChild(label)) {
+        Child<DataSetElement>(label).Text(text);
+        return;
+    }
+
+    // no such child yet: build it with the requested text and append a copy
+    DataSetElement fresh(label);
+    fresh.Text(text);
+    AddChild(fresh);
+}
+
+// Whether the stored label carries the verbatim flag.
+inline bool DataSetElement::IsVerbatimLabel() const
+{
+    return label_.Verbatim();
+}
+
+// Read-only access to the element text.
+inline const std::string& DataSetElement::Text() const
+{
+    return this->text_;
+}
+
+// Mutable access to the element text.
+inline std::string& DataSetElement::Text()
+{
+    return this->text_;
+}
+
+// Replaces the element text.
+inline void DataSetElement::Text(const std::string& text)
+{
+    this->text_ = text;
+}
+
+// XSD type given at construction.
+inline const XsdType& DataSetElement::Xsd() const
+{
+    return this->xsd_;
+}
+
+// ----------------------------
+// DataSetElementIteratorBase
+// ----------------------------
+
+// Stores the element being iterated and the starting child position.
+inline DataSetElementIteratorBase::DataSetElementIteratorBase(const DataSetElement* parent,
+                                                              size_t i)
+    : parent_{parent}
+    , index_{i}
+{
+}
+
+// Iterators are equal when they refer to the same parent and the same position.
+inline bool DataSetElementIteratorBase::operator==(const DataSetElementIteratorBase& other) const
+{
+    return parent_ == other.parent_ && index_ == other.index_;
+}
+
+// Exact negation of operator==.
+inline bool DataSetElementIteratorBase::operator!=(const DataSetElementIteratorBase& other) const
+{
+    const bool same = (*this == other);
+    return !same;
+}
+
+// Steps to the next child position. If the index is already at or beyond the
+// parent's child count, the index is left alone and the parent pointer is
+// cleared instead.
+inline void DataSetElementIteratorBase::Advance()
+{
+    if (index_ < parent_->NumChildren()) {
+        ++index_;
+        return;
+    }
+    parent_ = nullptr;
+}
+
+// ------------------------
+// DataSetElementIterator
+// ------------------------
+
+// Forwards parent and starting position to the shared iterator base.
+template <typename T>
+inline DataSetElementIterator<T>::DataSetElementIterator(const DataSetElement* parent, size_t i)
+    : DataSetElementIteratorBase{parent, i}
+{
+}
+
+// Returns the child at the current position as a mutable T&.
+//
+// The base class stores the parent as pointer-to-const, which would select the
+// const Child() overload and yield a const T& that cannot bind to the T& return
+// type for non-const T. Constness is therefore cast away here so the mutable
+// Child() overload is used.
+// NOTE(review): this assumes mutable iterators are only created from non-const
+// elements - confirm at the begin()/end() call sites.
+template <typename T>
+inline T& DataSetElementIterator<T>::operator*()
+{
+    DataSetElement* const mutableParent = const_cast<DataSetElement*>(parent_);
+    return mutableParent->template Child<T>(index_);
+}
+
+// Member access: address of the child that operator*() yields.
+template <typename T>
+inline T* DataSetElementIterator<T>::operator->()
+{
+    T& current = operator*();
+    return &current;
+}
+
+// Pre-increment: advance, then return this iterator.
+template <typename T>
+inline DataSetElementIterator<T>& DataSetElementIterator<T>::operator++()
+{
+    this->Advance();
+    return *this;
+}
+
+// Post-increment: advance, but return a copy of the iterator as it was before.
+template <typename T>
+inline DataSetElementIterator<T> DataSetElementIterator<T>::operator++(int)
+{
+    DataSetElementIterator<T> previous = *this;
+    operator++();
+    return previous;
+}
+
+// -----------------------------
+// DataSetElementConstIterator
+// -----------------------------
+
+// Forwards parent and starting position to the shared iterator base.
+template <typename T>
+inline DataSetElementConstIterator<T>::DataSetElementConstIterator(const DataSetElement* parent,
+                                                                   size_t i)
+    : DataSetElementIteratorBase{parent, i}
+{
+}
+
+// Returns the child at the current position as a read-only reference.
+template <typename T>
+inline const T& DataSetElementConstIterator<T>::operator*() const
+{
+    const T& current = parent_->template Child<const T>(index_);
+    return current;
+}
+
+// Member access: address of the child that operator*() yields.
+template <typename T>
+inline const T* DataSetElementConstIterator<T>::operator->() const
+{
+    const T& current = operator*();
+    return &current;
+}
+
+// Pre-increment: advance, then return this iterator.
+template <typename T>
+inline DataSetElementConstIterator<T>& DataSetElementConstIterator<T>::operator++()
+{
+    this->Advance();
+    return *this;
+}
+
+// Post-increment: advance, but return a copy of the iterator as it was before.
+template <typename T>
+DataSetElementConstIterator<T> DataSetElementConstIterator<T>::operator++(int)
+{
+    DataSetElementConstIterator<T> previous = *this;
+    operator++();
+    return previous;
+}
+
+// ----------------
+// XmlName
+// ----------------
+
+inline XmlName::XmlName(std::string fullName, bool verbatim)
+    : qualifiedName_(std::move(fullName))
+    , prefixSize_(0)
+    , localNameOffset_(0)
+    , localNameSize_(0)
+    , verbatim_(verbatim)
+{
+    const size_t colonFound = qualifiedName_.find(':');
+    if (colonFound == std::string::npos || colonFound == 0)
+        localNameSize_ = qualifiedName_.size();
+    else {
+        prefixSize_ = colonFound;
+        localNameSize_ = (qualifiedName_.size() - colonFound) - 1;
+    }
+
+    // adjust for colon if prefix present
+    localNameOffset_ = prefixSize_;
+    if (prefixSize_ != 0) ++localNameOffset_;
+}
+
+// Assembles the qualified name from its parts: "prefix:localName", or just
+// "localName" when the prefix is empty. Names built this way are always verbatim.
+inline XmlName::XmlName(const std::string& localName, const std::string& prefix)
+    : prefixSize_(prefix.size())
+    , localNameOffset_(prefixSize_)
+    , localNameSize_(localName.size())
+    , verbatim_(true)
+{
+    const bool hasPrefix = (prefixSize_ != 0);
+
+    qualifiedName_.reserve(localNameSize_ + prefixSize_ + 1);
+    if (hasPrefix) {
+        qualifiedName_ += prefix;
+        qualifiedName_ += ':';
+        ++localNameOffset_;  // skip the separating colon
+    }
+    qualifiedName_ += localName;
+}
+
+// Names are equal when their qualified strings are equal; the verbatim flag is
+// not part of the comparison.
+inline bool XmlName::operator==(const XmlName& other) const
+{
+    return this->qualifiedName_ == other.qualifiedName_;
+}
+
+// Exact negation of operator==.
+inline bool XmlName::operator!=(const XmlName& other) const
+{
+    const bool same = (*this == other);
+    return !same;
+}
+
+// Non-owning view of the local-name part of the qualified name.
+inline const boost::string_ref XmlName::LocalName() const
+{
+    const char* const begin = qualifiedName_.data() + localNameOffset_;
+    return boost::string_ref(begin, localNameSize_);
+}
+
+// Non-owning view of the prefix part (empty when there is no prefix).
+inline const boost::string_ref XmlName::Prefix() const
+{
+    const char* const begin = qualifiedName_.data();
+    return boost::string_ref(begin, prefixSize_);
+}
+
+// Full name, including "prefix:" when present.
+inline const std::string& XmlName::QualifiedName() const
+{
+    return this->qualifiedName_;
+}
+
+// Flag supplied at construction.
+inline bool XmlName::Verbatim() const
+{
+    return this->verbatim_;
+}
+
+}  // namespace internal
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/include/pbbam/internal/PbiBasicTypes.inl b/include/pbbam/internal/PbiBasicTypes.inl

new file mode 100644 (file)

index 0000000..168813e
--- /dev/null
+++ b/include/pbbam/internal/PbiBasicTypes.inl
@@ -0,0 +1,28 @@
+// File Description
+/// \file PbiBasicTypes.inl
+/// \brief Inline implementations for the basic data structures used in PBI lookups.
+//
+// Author: Derek Barnett
+
+#include "pbbam/PbiBasicTypes.h"
+
+#include <tuple>
+
+namespace PacBio {
+namespace BAM {
+
+inline IndexResultBlock::IndexResultBlock(size_t idx, size_t numReads)
+    : firstIndex_{idx}
+    , numReads_{numReads}
+{ }
+
+/// Blocks are equal when first index, read count and virtual offset all match.
+inline bool IndexResultBlock::operator==(const IndexResultBlock& other) const
+{
+    return firstIndex_ == other.firstIndex_ &&
+           numReads_ == other.numReads_ &&
+           virtualOffset_ == other.virtualOffset_;
+}
+
+/// Exact negation of operator==.
+inline bool IndexResultBlock::operator!=(const IndexResultBlock& other) const
+{
+    const bool same = (*this == other);
+    return !same;
+}
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/PbiFilter.inl b/include/pbbam/internal/PbiFilter.inl

new file mode 100644 (file)

index 0000000..276c445
--- /dev/null
+++ b/include/pbbam/internal/PbiFilter.inl
@@ -0,0 +1,227 @@
+// File Description
+/// \file PbiFilter.inl
+/// \brief Inline implementations for the PbiFilter class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/PbiFilter.h"
+
+#include <algorithm>
+#include <iostream>
+#include <map>
+#include <set>
+#include <vector>
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+/// \internal
+///
+/// This class wraps the basic PBI filter (whether a property filter or some operator,
+/// e.g. union, intersect, etc.). The wrapper allows PbiFilters to hold heterogeneous,
+/// recursive filter types - without exposing pointers & worrying about memory ownership
+/// issues between client & library.
+///
+/// Filters can be given by value from client code and we will wrap them for composition.
+///
+/// \code{.cpp}
+///    PbiFilter f1(PbiZmwFilter(42));
+///    PbiFilter f2;
+///    f2.Add(PbiQueryLengthFilter(3000, GREATER_THAN_EQUAL));
+///    f2.Add(MyApplicationCustomFilter("foo"));
+///    PbiFilter intersect = PbiFilter::Intersect(f1, f2);
+///    ...
+/// \endcode
+///
+struct FilterWrapper
+{
+public:
+    /// Wraps any filter object of type T; the value is moved into a heap-allocated
+    /// WrapperImpl<T>.
+    template <typename T>
+    FilterWrapper(T x);
+
+    /// Copying clones the wrapped filter (deep copy); moving transfers ownership.
+    FilterWrapper(const FilterWrapper& other);
+    FilterWrapper(FilterWrapper&&) noexcept = default;
+    FilterWrapper& operator=(const FilterWrapper& other);
+    FilterWrapper& operator=(FilterWrapper&&) noexcept = default;
+
+public:
+    /// Forwards to the wrapped filter's Accepts(idx, row).
+    bool Accepts(const PacBio::BAM::PbiRawData& idx, const size_t row) const;
+
+private:
+    /// Type-erased interface every wrapped filter is accessed through.
+    struct WrapperInterface
+    {
+        virtual ~WrapperInterface() = default;
+        /// Returns a newly allocated copy; the caller takes ownership.
+        virtual WrapperInterface* Clone() const = 0;
+        virtual bool Accepts(const PacBio::BAM::PbiRawData& idx, const size_t row) const = 0;
+    };
+
+    /// Concrete holder for a filter of type T.
+    template <typename T>
+    struct WrapperImpl : public WrapperInterface
+    {
+        WrapperImpl(T x);
+        WrapperImpl(const WrapperImpl& other);
+        WrapperInterface* Clone() const override;
+        bool Accepts(const PacBio::BAM::PbiRawData& idx, const size_t row) const override;
+        T data_;  ///< the wrapped filter value
+    };
+
+private:
+    /// Owning pointer to the type-erased filter.
+    std::unique_ptr<WrapperInterface> self_;
+};
+
+// ---------------
+// FilterWrapper
+// ---------------
+
+/// Type-erasing constructor: moves \p x into a heap-allocated WrapperImpl<T>.
+template <typename T>
+inline FilterWrapper::FilterWrapper(T x)
+    : self_{std::make_unique<WrapperImpl<T>>(std::move(x))}
+{
+}
+
+/// Copy constructor: takes ownership of a clone of the filter held by \p other.
+inline FilterWrapper::FilterWrapper(const FilterWrapper& other)
+    : self_{other.self_->Clone()}
+{
+}
+
+/// Copy assignment: clones the filter held by \p other first, then replaces (and
+/// thereby destroys) the filter previously held by this wrapper.
+inline FilterWrapper& FilterWrapper::operator=(const FilterWrapper& other)
+{
+    std::unique_ptr<WrapperInterface> duplicate{other.self_->Clone()};
+    self_ = std::move(duplicate);
+    return *this;
+}
+
+/// Asks the wrapped filter whether it accepts record \p row of index \p idx.
+inline bool FilterWrapper::Accepts(const PbiRawData& idx, const size_t row) const
+{
+    const WrapperInterface& wrapped = *self_;
+    return wrapped.Accepts(idx, row);
+}
+
+// ----------------
+// WrapperImpl<T>
+// ----------------
+
+/// Stores the filter by value (moved in) and checks at compile time that T
+/// satisfies PbiFilterConcept.
+template <typename T>
+inline FilterWrapper::WrapperImpl<T>::WrapperImpl(T x)
+    : FilterWrapper::WrapperInterface{}
+    , data_(std::move(x))
+{
+    BOOST_CONCEPT_ASSERT((PbiFilterConcept<T>));
+}
+
+/// Copy constructor: copies the wrapped filter value.
+template <typename T>
+inline FilterWrapper::WrapperImpl<T>::WrapperImpl(const WrapperImpl& other)
+    : FilterWrapper::WrapperInterface{}
+    , data_(other.data_)
+{
+}
+
+/// Returns a heap-allocated copy of this holder; ownership passes to the caller
+/// (FilterWrapper stores the result in a unique_ptr).
+template <typename T>
+inline FilterWrapper::WrapperInterface* FilterWrapper::WrapperImpl<T>::Clone() const
+{
+    WrapperImpl* const duplicate = new WrapperImpl(*this);
+    return duplicate;
+}
+
+/// Forwards the query to the wrapped filter value.
+template <typename T>
+inline bool FilterWrapper::WrapperImpl<T>::Accepts(const PbiRawData& idx, const size_t row) const
+{
+    const bool accepted = data_.Accepts(idx, row);
+    return accepted;
+}
+
+struct PbiFilterPrivate
+{
+    PbiFilterPrivate(PbiFilter::CompositionType type = PbiFilter::INTERSECT) : type_{type} {}
+
+    template <typename T>
+    void Add(T filter)
+    {
+        filters_.emplace_back(std::move(filter));
+    }
+
+    std::unique_ptr<internal::PbiFilterPrivate> DeepCopy()
+    {
+        auto copy = std::make_unique<PbiFilterPrivate>(type_);
+        copy->filters_ = this->filters_;
+        return copy;
+    }
+
+    bool Accepts(const PbiRawData& idx, const size_t row) const
+    {
+        // no filter -> accepts every record
+        if (filters_.empty()) return true;
+
+        // intersection of child filters
+        if (type_ == PbiFilter::INTERSECT) {
+            for (const auto& filter : filters_) {
+                if (!filter.Accepts(idx, row)) return false;  // break early on failure
+            }
+            return true;  // all passed
+        }
+
+        // union of child filters
+        else if (type_ == PbiFilter::UNION) {
+            for (const auto& filter : filters_) {
+                if (filter.Accepts(idx, row)) return true;  // break early on pass
+            }
+            return false;  // none passed
+        }
+
+        else
+            //assert(false); // invalid composite filter type
+            throw std::runtime_error{"PbiFilter: invalid composite filter type"};
+    }
+
+    PbiFilter::CompositionType type_;
+    std::vector<FilterWrapper> filters_;
+};
+
+}  // namespace internal
+
+/// Creates an empty composite filter with the given composition type.
+inline PbiFilter::PbiFilter(const CompositionType type)
+    : d_{std::make_unique<internal::PbiFilterPrivate>(type)}
+{
+}
+
+/// Creates a filter (default composition type) holding \p filter as its only child.
+template <typename T>
+inline PbiFilter::PbiFilter(T filter)
+    : d_{std::make_unique<internal::PbiFilterPrivate>()}
+{
+    this->Add(std::move(filter));
+}
+
+/// Creates a filter (default composition type) holding all of \p filters as children.
+inline PbiFilter::PbiFilter(std::vector<PbiFilter> filters)
+    : d_{std::make_unique<internal::PbiFilterPrivate>()}
+{
+    this->Add(std::move(filters));
+}
+
+/// Copy constructor: deep-copies the private implementation of \p other.
+inline PbiFilter::PbiFilter(const PbiFilter& other)
+    : d_{other.d_->DeepCopy()}
+{
+}
+
+/// Copy assignment: the deep copy of \p other is created first and then replaces
+/// this filter's implementation.
+inline PbiFilter& PbiFilter::operator=(const PbiFilter& other)
+{
+    auto duplicate = other.d_->DeepCopy();
+    d_ = std::move(duplicate);
+    return *this;
+}
+
+/// Evaluates this filter for record \p row of \p idx (delegates to the private
+/// implementation).
+inline bool PbiFilter::Accepts(const PacBio::BAM::PbiRawData& idx, const size_t row) const
+{
+    const bool accepted = d_->Accepts(idx, row);
+    return accepted;
+}
+
+/// Adds an arbitrary filter object as a child; returns *this for chaining.
+template <typename T>
+inline PbiFilter& PbiFilter::Add(T filter)
+{
+    internal::PbiFilterPrivate& impl = *d_;
+    impl.Add(std::move(filter));
+    return *this;
+}
+
+/// Adds another PbiFilter as a (nested) child; returns *this for chaining.
+inline PbiFilter& PbiFilter::Add(PbiFilter filter)
+{
+    internal::PbiFilterPrivate& impl = *d_;
+    impl.Add(std::move(filter));
+    return *this;
+}
+
+/// Adds every filter in \p filters as a child, in order; returns *this for chaining.
+/// The vector is taken by value, so its elements are moved out.
+inline PbiFilter& PbiFilter::Add(std::vector<PbiFilter> filters)
+{
+    for (PbiFilter& filter : filters) {
+        d_->Add(std::move(filter));
+    }
+    return *this;
+}
+
+/// True if this filter has no child filters.
+inline bool PbiFilter::IsEmpty() const
+{
+    return d_->filters_.empty();
+}
+
+/// Number of direct child filters.
+inline size_t PbiFilter::NumChildren() const
+{
+    return d_->filters_.size();
+}
+
+/// Composition type used to combine the child filters.
+inline PbiFilter::CompositionType PbiFilter::Type() const
+{
+    return d_->type_;
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/include/pbbam/internal/PbiFilterTypes.inl b/include/pbbam/internal/PbiFilterTypes.inl

new file mode 100644 (file)

index 0000000..7e043a4
--- /dev/null
+++ b/include/pbbam/internal/PbiFilterTypes.inl
@@ -0,0 +1,508 @@
+// File Description
+/// \file PbiFilterTypes.inl
+/// \brief Inline implementations for the built-in PBI filters.
+//
+// Author: Derek Barnett
+
+#include "pbbam/PbiFilterTypes.h"
+
+#include <cassert>
+#include <stdexcept>
+
+#include <boost/functional/hash/hash.hpp>
+
+namespace PacBio {
+namespace BAM {
+
+namespace internal {
+
+/// Single-value filter: compares each record's value against \p value using \p cmp.
+template <typename T>
+inline FilterBase<T>::FilterBase(T value, const Compare::Type cmp)
+    : value_{std::move(value)}
+    , cmp_{cmp}
+{
+}
+
+template <typename T>
+inline FilterBase<T>::FilterBase(std::vector<T> values, const Compare::Type cmp)
+    : multiValue_{std::move(values)}, cmp_{cmp}
+{
+    // "=="/"!=" can come in from XML, e.g. <Property Name="zmw" Operator="==" Value="(x,y,z)" />"
+    // switch to whitelist/blacklist containment for multi-value filters
+    if (cmp_ == Compare::EQUAL)
+        cmp_ = Compare::CONTAINS;
+    else if (cmp_ == Compare::NOT_EQUAL)
+        cmp_ = Compare::NOT_CONTAINS;
+
+    if (cmp_ != Compare::CONTAINS && cmp_ != Compare::NOT_CONTAINS) {
+        throw std::runtime_error{
+            "PbiFilter: multi-valued filters (e.g. whitelists) can only check containment."};
+    }
+}
+
+/// Dispatches to the single-value comparison when no value list is set, and to
+/// the multi-value (containment) comparison otherwise.
+template <typename T>
+inline bool FilterBase<T>::CompareHelper(const T& lhs) const
+{
+    const bool isSingleValue = (multiValue_ == boost::none);
+    return isSingleValue ? CompareSingleHelper(lhs) : CompareMultiHelper(lhs);
+}
+
+template <typename T>
+inline bool FilterBase<T>::CompareMultiHelper(const T& lhs) const
+{
+    // multi-value filters are whitelist/blacklist
+    assert(cmp_ == Compare::CONTAINS || cmp_ == Compare::NOT_CONTAINS);
+
+    // whitelist - return true on any hit
+    if (cmp_ == Compare::CONTAINS) {
+        for (const auto x : multiValue_.get())
+            if (x == lhs) return true;
+        return false;
+    }
+    // blacklist - return false on any hit
+    else {
+        for (const auto x : multiValue_.get())
+            if (x == lhs) return false;
+        return true;
+    }
+}
+
+/// Single-value comparison: delegates to Compare::Check with the stored value
+/// and comparison type.
+template <typename T>
+inline bool FilterBase<T>::CompareSingleHelper(const T& lhs) const
+{
+    const bool passed = Compare::Check(lhs, value_, cmp_);
+    return passed;
+}
+
+template <>
+inline bool FilterBase<LocalContextFlags>::CompareSingleHelper(const LocalContextFlags& lhs) const
+{
+    switch (cmp_) {
+        case Compare::EQUAL:
+            return lhs == value_;
+        case Compare::LESS_THAN:
+            return lhs < value_;
+        case Compare::LESS_THAN_EQUAL:
+            return lhs <= value_;
+        case Compare::GREATER_THAN:
+            return lhs > value_;
+        case Compare::GREATER_THAN_EQUAL:
+            return lhs >= value_;
+        case Compare::NOT_EQUAL:
+            return lhs != value_;
+        case Compare::CONTAINS:
+            return ((lhs & value_) != 0);
+        case Compare::NOT_CONTAINS:
+            return ((lhs & value_) == 0);
+
+        default:
+            assert(false);
+            throw std::runtime_error{"PbiFilter: unknown compare type (" +
+                                     Compare::TypeToName(cmp_) + ")"};
+    }
+}
+
+// BarcodeDataFilterBase
+
+/// Single-value barcode filter; forwards to FilterBase<T>.
+template <typename T, PbiFile::BarcodeField field>
+inline BarcodeDataFilterBase<T, field>::BarcodeDataFilterBase(T value, const Compare::Type cmp)
+    : FilterBase<T>{std::move(value), cmp}
+{
+}
+
+/// Multi-value (whitelist/blacklist) barcode filter; forwards to FilterBase<T>.
+template <typename T, PbiFile::BarcodeField field>
+inline BarcodeDataFilterBase<T, field>::BarcodeDataFilterBase(std::vector<T> values,
+                                                              const Compare::Type cmp)
+    : FilterBase<T>{std::move(values), cmp}
+{
+}
+
+template <typename T, PbiFile::BarcodeField field>
+inline bool BarcodeDataFilterBase<T, field>::BarcodeDataFilterBase::Accepts(const PbiRawData& idx,
+                                                                            const size_t row) const
+{
+    const PbiRawBarcodeData& barcodeData = idx.BarcodeData();
+    switch (field) {
+        case PbiFile::BarcodeField::BC_FORWARD:
+            return FilterBase<T>::CompareHelper(barcodeData.bcForward_.at(row));
+        case PbiFile::BarcodeField::BC_REVERSE:
+            return FilterBase<T>::CompareHelper(barcodeData.bcReverse_.at(row));
+        case PbiFile::BarcodeField::BC_QUALITY:
+            return FilterBase<T>::CompareHelper(barcodeData.bcQual_.at(row));
+        default:
+            assert(false);
+            throw std::runtime_error{"PbiFilter: unknown barcode field requested."};
+    }
+}
+
+// BasicDataFilterBase
+
+/// Single-value basic-data filter; forwards to FilterBase<T>.
+template <typename T, PbiFile::BasicField field>
+inline BasicDataFilterBase<T, field>::BasicDataFilterBase(T value, const Compare::Type cmp)
+    : FilterBase<T>{std::move(value), cmp}
+{
+}
+
+/// Multi-value (whitelist/blacklist) basic-data filter; forwards to FilterBase<T>.
+template <typename T, PbiFile::BasicField field>
+inline BasicDataFilterBase<T, field>::BasicDataFilterBase(std::vector<T> values,
+                                                          const Compare::Type cmp)
+    : FilterBase<T>{std::move(values), cmp}
+{
+}
+
+template <typename T, PbiFile::BasicField field>
+inline bool BasicDataFilterBase<T, field>::BasicDataFilterBase::Accepts(const PbiRawData& idx,
+                                                                        const size_t row) const
+{
+    const PbiRawBasicData& basicData = idx.BasicData();
+    switch (field) {
+        case PbiFile::BasicField::RG_ID:
+            return FilterBase<T>::CompareHelper(basicData.rgId_.at(row));
+        case PbiFile::BasicField::Q_START:
+            return FilterBase<T>::CompareHelper(basicData.qStart_.at(row));
+        case PbiFile::BasicField::Q_END:
+            return FilterBase<T>::CompareHelper(basicData.qEnd_.at(row));
+        case PbiFile::BasicField::ZMW:
+            return FilterBase<T>::CompareHelper(basicData.holeNumber_.at(row));
+        case PbiFile::BasicField::READ_QUALITY:
+            return FilterBase<T>::CompareHelper(basicData.readQual_.at(row));
+        // NOTE(DB): PbiFile::BasicField::CONTEXT_FLAG has its own specialization
+        default:
+            assert(false);
+            throw std::runtime_error{"PbiFilter: unknown basic data field requested."};
+    }
+}
+
+// this typedef exists purely so that the next method signature isn't 2 screen widths long
+using LocalContextFilter__ =
+    BasicDataFilterBase<LocalContextFlags, PbiFile::BasicField::CONTEXT_FLAG>;
+
+template <>
+inline bool LocalContextFilter__::BasicDataFilterBase::Accepts(const PbiRawData& idx,
+                                                               const size_t row) const
+{
+    const auto& basicData = idx.BasicData();
+    const auto rowFlags = static_cast<LocalContextFlags>(basicData.ctxtFlag_.at(row));
+    return FilterBase<LocalContextFlags>::CompareHelper(rowFlags);
+}
+
+// MappedDataFilterBase
+
+/// Single-value mapped-data filter; forwards to FilterBase<T>.
+template <typename T, PbiFile::MappedField field>
+inline MappedDataFilterBase<T, field>::MappedDataFilterBase(T value, const Compare::Type cmp)
+    : FilterBase<T>{std::move(value), cmp}
+{
+}
+
+/// Multi-value (whitelist/blacklist) mapped-data filter; forwards to FilterBase<T>.
+template <typename T, PbiFile::MappedField field>
+inline MappedDataFilterBase<T, field>::MappedDataFilterBase(std::vector<T> values,
+                                                            const Compare::Type cmp)
+    : FilterBase<T>{std::move(values), cmp}
+{
+}
+// STRAND specialization: revStrand_ == 1 maps to Strand::REVERSE, anything else to FORWARD.
+template <>
+inline bool
+MappedDataFilterBase<Strand, PbiFile::MappedField::STRAND>::MappedDataFilterBase::Accepts(
+    const PbiRawData& idx, const size_t row) const
+{
+    const PbiRawMappedData& mappedData = idx.MappedData();
+    const Strand strand = (mappedData.revStrand_.at(row) == 1 ? Strand::REVERSE : Strand::FORWARD);
+    return FilterBase<Strand>::CompareHelper(strand);
+}
+// Generic case: `field` picks which mapped-data value of `row` is handed to CompareHelper.
+template <typename T, PbiFile::MappedField field>
+inline bool MappedDataFilterBase<T, field>::MappedDataFilterBase::Accepts(const PbiRawData& idx,
+                                                                          const size_t row) const
+{
+    const PbiRawMappedData& mappedData = idx.MappedData();
+    switch (field) {
+        case PbiFile::MappedField::T_ID:
+            return FilterBase<T>::CompareHelper(mappedData.tId_.at(row));
+        case PbiFile::MappedField::T_START:
+            return FilterBase<T>::CompareHelper(mappedData.tStart_.at(row));
+        case PbiFile::MappedField::T_END:
+            return FilterBase<T>::CompareHelper(mappedData.tEnd_.at(row));
+        case PbiFile::MappedField::A_START:
+            return FilterBase<T>::CompareHelper(mappedData.aStart_.at(row));
+        case PbiFile::MappedField::A_END:
+            return FilterBase<T>::CompareHelper(mappedData.aEnd_.at(row));
+        case PbiFile::MappedField::N_M:
+            return FilterBase<T>::CompareHelper(mappedData.nM_.at(row));
+        case PbiFile::MappedField::N_MM:
+            return FilterBase<T>::CompareHelper(mappedData.nMM_.at(row));
+        case PbiFile::MappedField::N_DEL:  // read through an accessor, not a data member
+            return FilterBase<T>::CompareHelper(mappedData.NumDeletedBasesAt(row));
+        case PbiFile::MappedField::N_INS:  // read through an accessor, not a data member
+            return FilterBase<T>::CompareHelper(mappedData.NumInsertedBasesAt(row));
+        case PbiFile::MappedField::MAP_QUALITY:
+            return FilterBase<T>::CompareHelper(mappedData.mapQV_.at(row));
+        default:  // e.g. STRAND, which is served by its explicit specialization instead
+            assert(false);
+            throw std::runtime_error{"PbiFilter: unknown mapped data field requested."};
+    }
+}
+
+}  // namespace internal
+
+// PbiAlignedEndFilter: passes `position` and `cmp` to the MappedDataFilterBase
+// instantiation for uint32_t / PbiFile::MappedField::A_END.
+inline PbiAlignedEndFilter::PbiAlignedEndFilter(const uint32_t position, const Compare::Type cmp)
+    : internal::MappedDataFilterBase<uint32_t, PbiFile::MappedField::A_END>{position, cmp}
+{}
+
+// PbiAlignedLengthFilter: passes `length` and `cmp` straight to internal::FilterBase<uint32_t>;
+// the row test (Accepts) for this filter is not defined in this part of the file.
+inline PbiAlignedLengthFilter::PbiAlignedLengthFilter(const uint32_t length,
+                                                      const Compare::Type cmp)
+    : internal::FilterBase<uint32_t>{length, cmp}
+{}
+
+// PbiAlignedStartFilter: passes `position` and `cmp` to the MappedDataFilterBase
+// instantiation for uint32_t / PbiFile::MappedField::A_START.
+inline PbiAlignedStartFilter::PbiAlignedStartFilter(const uint32_t position,
+                                                    const Compare::Type cmp)
+    : internal::MappedDataFilterBase<uint32_t, PbiFile::MappedField::A_START>{position, cmp}
+{}
+
+// PbiAlignedStrandFilter: strand filter restricted to equality-style comparisons; any `cmp`
+// other than EQUAL / NOT_EQUAL throws std::runtime_error (after the base is initialized).
+inline PbiAlignedStrandFilter::PbiAlignedStrandFilter(const Strand strand, const Compare::Type cmp)
+    : internal::MappedDataFilterBase<Strand, PbiFile::MappedField::STRAND>{strand, cmp}
+{
+    if (cmp != Compare::EQUAL && cmp != Compare::NOT_EQUAL)
+        throw std::runtime_error{
+            "PbiFilter: compare type for aligned strand must be either EQUAL or NOT_EQUAL"};
+}
+
+// PbiBarcodeFilter
+// Single-barcode ctor: PbiFilter::Union of a forward and a reverse filter on `barcode`.
+inline PbiBarcodeFilter::PbiBarcodeFilter(const int16_t barcode, const Compare::Type cmp)
+    : compositeFilter_{PbiFilter::Union(
+          {PbiBarcodeForwardFilter{barcode, cmp}, PbiBarcodeReverseFilter{barcode, cmp}})}
+{}
+// Multi-barcode ctor: same Union; each sub-filter is built from its own copy of `barcodes`.
+inline PbiBarcodeFilter::PbiBarcodeFilter(std::vector<int16_t> barcodes, const Compare::Type cmp)
+    : compositeFilter_{PbiFilter::Union(
+          {PbiBarcodeForwardFilter{barcodes, cmp}, PbiBarcodeReverseFilter{barcodes, cmp}})}
+{}
+// Delegates the decision for `row` to the composite filter built by the constructor.
+inline bool PbiBarcodeFilter::Accepts(const PbiRawData& idx, const size_t row) const
+{
+    return compositeFilter_.Accepts(idx, row);
+}
+
+// PbiBarcodeForwardFilter (BarcodeDataFilterBase for int16_t / BarcodeField::BC_FORWARD)
+// Single-id ctor: passes `bcFwdId` and `cmp` to the base.
+inline PbiBarcodeForwardFilter::PbiBarcodeForwardFilter(const int16_t bcFwdId,
+                                                        const Compare::Type cmp)
+    : internal::BarcodeDataFilterBase<int16_t, PbiFile::BarcodeField::BC_FORWARD>{bcFwdId, cmp}
+{}
+// List ctor: moves `barcodes` into the base along with `cmp`.
+inline PbiBarcodeForwardFilter::PbiBarcodeForwardFilter(std::vector<int16_t> barcodes,
+                                                        const Compare::Type cmp)
+    : internal::BarcodeDataFilterBase<int16_t, PbiFile::BarcodeField::BC_FORWARD>{
+          std::move(barcodes), cmp}
+{}
+
+// PbiBarcodeQualityFilter: passes `bcQuality` and `cmp` to the BarcodeDataFilterBase
+// instantiation for uint8_t / PbiFile::BarcodeField::BC_QUALITY.
+inline PbiBarcodeQualityFilter::PbiBarcodeQualityFilter(const uint8_t bcQuality,
+                                                        const Compare::Type cmp)
+    : internal::BarcodeDataFilterBase<uint8_t, PbiFile::BarcodeField::BC_QUALITY>{bcQuality, cmp}
+{}
+
+// PbiBarcodeReverseFilter (BarcodeDataFilterBase for int16_t / BarcodeField::BC_REVERSE)
+// Single-id ctor: passes `bcRevId` and `cmp` to the base.
+inline PbiBarcodeReverseFilter::PbiBarcodeReverseFilter(const int16_t bcRevId,
+                                                        const Compare::Type cmp)
+    : internal::BarcodeDataFilterBase<int16_t, PbiFile::BarcodeField::BC_REVERSE>{bcRevId, cmp}
+{}
+// List ctor: moves `barcodes` into the base along with `cmp`.
+inline PbiBarcodeReverseFilter::PbiBarcodeReverseFilter(std::vector<int16_t> barcodes,
+                                                        const Compare::Type cmp)
+    : internal::BarcodeDataFilterBase<int16_t, PbiFile::BarcodeField::BC_REVERSE>{
+          std::move(barcodes), cmp}
+{}
+
+// PbiBarcodesFilter
+// Pair ctor: delegates to the (forward, reverse, cmp) ctor using .first and .second.
+inline PbiBarcodesFilter::PbiBarcodesFilter(const std::pair<int16_t, int16_t> barcodes,
+                                            const Compare::Type cmp)
+    : PbiBarcodesFilter{barcodes.first, barcodes.second, cmp}
+{}
+// PbiFilter::Intersection of a forward filter on `bcForward` and a reverse one on `bcReverse`.
+inline PbiBarcodesFilter::PbiBarcodesFilter(const int16_t bcForward, const int16_t bcReverse,
+                                            const Compare::Type cmp)
+    : compositeFilter_{PbiFilter::Intersection(
+          {PbiBarcodeForwardFilter{bcForward, cmp}, PbiBarcodeReverseFilter{bcReverse, cmp}})}
+{}
+// Delegates the decision for `row` to the composite filter built by the constructor.
+inline bool PbiBarcodesFilter::Accepts(const PbiRawData& idx, const size_t row) const
+{
+    return compositeFilter_.Accepts(idx, row);
+}
+
+// PbiIdentityFilter: passes `identity` and `cmp` straight to internal::FilterBase<float>;
+// the row test (Accepts) for this filter is not defined in this part of the file.
+inline PbiIdentityFilter::PbiIdentityFilter(const float identity, const Compare::Type cmp)
+    : internal::FilterBase<float>{identity, cmp}
+{}
+
+// PbiLocalContextFilter: passes `flags` and `cmp` to the BasicDataFilterBase instantiation
+// for LocalContextFlags / PbiFile::BasicField::CONTEXT_FLAG.
+inline PbiLocalContextFilter::PbiLocalContextFilter(const LocalContextFlags& flags,
+                                                    const Compare::Type cmp)
+    : internal::BasicDataFilterBase<LocalContextFlags, PbiFile::BasicField::CONTEXT_FLAG>{flags,
+                                                                                          cmp}
+{}
+
+// PbiMapQualityFilter: passes `mapQual` and `cmp` to the MappedDataFilterBase
+// instantiation for uint8_t / PbiFile::MappedField::MAP_QUALITY.
+inline PbiMapQualityFilter::PbiMapQualityFilter(const uint8_t mapQual, const Compare::Type cmp)
+    : internal::MappedDataFilterBase<uint8_t, PbiFile::MappedField::MAP_QUALITY>{mapQual, cmp}
+{}
+
+// PbiMovieNameFilter: the composite filter answers "is this row a match?"; cmp_ then decides
+// whether a match is accepted (EQUAL/CONTAINS) or rejected (NOT_EQUAL/NOT_CONTAINS).
+inline bool PbiMovieNameFilter::Accepts(const PbiRawData& idx, const size_t row) const
+{
+    const bool found = compositeFilter_.Accepts(idx, row);
+    if (cmp_ == Compare::EQUAL || cmp_ == Compare::CONTAINS)
+        return found;
+    else if (cmp_ == Compare::NOT_EQUAL || cmp_ == Compare::NOT_CONTAINS)
+        return !found;
+    else  // any other comparison type is rejected at query time
+        throw std::runtime_error{"unsupported compare type on movie name filter"};
+}
+
+// PbiNumDeletedBasesFilter: passes `numDeletions` and `cmp` to the MappedDataFilterBase
+// instantiation for size_t / PbiFile::MappedField::N_DEL.
+inline PbiNumDeletedBasesFilter::PbiNumDeletedBasesFilter(const size_t numDeletions,
+                                                          const Compare::Type cmp)
+    : internal::MappedDataFilterBase<size_t, PbiFile::MappedField::N_DEL>{numDeletions, cmp}
+{}
+
+// PbiNumInsertedBasesFilter: passes `numInsertions` and `cmp` to the MappedDataFilterBase
+// instantiation for size_t / PbiFile::MappedField::N_INS.
+inline PbiNumInsertedBasesFilter::PbiNumInsertedBasesFilter(const size_t numInsertions,
+                                                            const Compare::Type cmp)
+    : internal::MappedDataFilterBase<size_t, PbiFile::MappedField::N_INS>{numInsertions, cmp}
+{}
+
+// PbiNumMatchesFilter: passes `numMatchedBases` and `cmp` to the MappedDataFilterBase
+// instantiation for size_t / PbiFile::MappedField::N_M.
+inline PbiNumMatchesFilter::PbiNumMatchesFilter(const size_t numMatchedBases,
+                                                const Compare::Type cmp)
+    : internal::MappedDataFilterBase<size_t, PbiFile::MappedField::N_M>{numMatchedBases, cmp}
+{}
+
+// PbiNumMismatchesFilter: passes `numMismatchedBases` and `cmp` to the MappedDataFilterBase
+// instantiation for size_t / PbiFile::MappedField::N_MM.
+inline PbiNumMismatchesFilter::PbiNumMismatchesFilter(const size_t numMismatchedBases,
+                                                      const Compare::Type cmp)
+    : internal::MappedDataFilterBase<size_t, PbiFile::MappedField::N_MM>{numMismatchedBases, cmp}
+{}
+
+// PbiQueryEndFilter: passes `position` and `cmp` to the BasicDataFilterBase
+// instantiation for int32_t / PbiFile::BasicField::Q_END.
+inline PbiQueryEndFilter::PbiQueryEndFilter(const int32_t position, const Compare::Type cmp)
+    : internal::BasicDataFilterBase<int32_t, PbiFile::BasicField::Q_END>{position, cmp}
+{}
+
+// PbiQueryLengthFilter: passes `length` and `cmp` straight to internal::FilterBase<int32_t>;
+// the row test (Accepts) for this filter is not defined in this part of the file.
+inline PbiQueryLengthFilter::PbiQueryLengthFilter(const int32_t length, const Compare::Type cmp)
+    : internal::FilterBase<int32_t>{length, cmp}
+{}
+
+// PbiQueryStartFilter: passes `position` and `cmp` to the BasicDataFilterBase
+// instantiation for int32_t / PbiFile::BasicField::Q_START.
+inline PbiQueryStartFilter::PbiQueryStartFilter(const int32_t position, const Compare::Type cmp)
+    : internal::BasicDataFilterBase<int32_t, PbiFile::BasicField::Q_START>{position, cmp}
+{}
+
+// PbiReadAccuracyFilter: passes `accuracy` and `cmp` to the BasicDataFilterBase
+// instantiation for Accuracy / PbiFile::BasicField::READ_QUALITY.
+inline PbiReadAccuracyFilter::PbiReadAccuracyFilter(const Accuracy accuracy,
+                                                    const Compare::Type cmp)
+    : internal::BasicDataFilterBase<Accuracy, PbiFile::BasicField::READ_QUALITY>{accuracy, cmp}
+{}
+
+// PbiReferenceEndFilter: passes `tEnd` and `cmp` to the MappedDataFilterBase
+// instantiation for uint32_t / PbiFile::MappedField::T_END.
+inline PbiReferenceEndFilter::PbiReferenceEndFilter(const uint32_t tEnd, const Compare::Type cmp)
+    : internal::MappedDataFilterBase<uint32_t, PbiFile::MappedField::T_END>{tEnd, cmp}
+{}
+
+// PbiReferenceIdFilter (MappedDataFilterBase for int32_t / MappedField::T_ID)
+// Single-id ctor: passes `tId` and `cmp` to the base.
+inline PbiReferenceIdFilter::PbiReferenceIdFilter(const int32_t tId, const Compare::Type cmp)
+    : internal::MappedDataFilterBase<int32_t, PbiFile::MappedField::T_ID>{tId, cmp}
+{}
+// List ctor: moves `tIds` into the base along with `cmp`.
+inline PbiReferenceIdFilter::PbiReferenceIdFilter(std::vector<int32_t> tIds,
+                                                  const Compare::Type cmp)
+    : internal::MappedDataFilterBase<int32_t, PbiFile::MappedField::T_ID>{std::move(tIds), cmp}
+{}
+
+// PbiReferenceStartFilter: passes `tStart` and `cmp` to the MappedDataFilterBase
+// instantiation for uint32_t / PbiFile::MappedField::T_START.
+inline PbiReferenceStartFilter::PbiReferenceStartFilter(const uint32_t tStart,
+                                                        const Compare::Type cmp)
+    : internal::MappedDataFilterBase<uint32_t, PbiFile::MappedField::T_START>{tStart, cmp}
+{}
+
+// PbiZmwFilter (BasicDataFilterBase for int32_t / BasicField::ZMW)
+// Single-value ctor: passes `zmw` and `cmp` to the base.
+inline PbiZmwFilter::PbiZmwFilter(const int32_t zmw, const Compare::Type cmp)
+    : internal::BasicDataFilterBase<int32_t, PbiFile::BasicField::ZMW>{zmw, cmp}
+{}
+// List ctor: moves `whitelist` into the base along with `cmp`.
+inline PbiZmwFilter::PbiZmwFilter(std::vector<int32_t> whitelist, const Compare::Type cmp)
+    : internal::BasicDataFilterBase<int32_t, PbiFile::BasicField::ZMW>{std::move(whitelist), cmp}
+{}
+
+// PbiZmwModuloFilter: records the modulus, the remainder to compare against, the hash
+// scheme and the comparison type; the arguments are stored as given, without validation here.
+inline PbiZmwModuloFilter::PbiZmwModuloFilter(const uint32_t denominator, const uint32_t value,
+                                              const FilterHash hashType, const Compare::Type cmp)
+    : denominator_{denominator}, value_{value}, hash_{hashType}, cmp_{cmp}
+{}
+// Hash used for FilterHash::UNSIGNED_LONG_CAST: plain conversion of `zm` to uint32_t.
+inline uint32_t UnsignedLongIntCast(const int32_t zm) { return static_cast<uint32_t>(zm); }
+// Hash used for FilterHash::BOOST_HASH_COMBINE: mixes the two 16-bit halves of `zm`.
+inline uint32_t BoostHashCombine(const int32_t zm)
+{
+    constexpr static const uint16_t mask = 0xFFFF;
+    // split into halves; the mask keeps only the low 16 bits of each operand
+    const uint16_t upper = (zm >> 16) & mask;
+    const uint16_t lower = zm & mask;
+
+    // FIXME: discrepancies with Python API. Will return to nail down.
+    // fold the halves, upper first, into a zero seed; the result is narrowed to 32 bits
+    size_t seed = 0;
+    boost::hash_combine(seed, upper);
+    boost::hash_combine(seed, lower);
+    return static_cast<uint32_t>(seed);
+}
+
+inline bool PbiZmwModuloFilter::Accepts(const PbiRawData& idx, const size_t row) const
+{
+    const auto zm = idx.BasicData().holeNumber_.at(row);
+
+    uint32_t hashValue;
+    switch (hash_) {
+        case FilterHash::UNSIGNED_LONG_CAST: {
+            hashValue = UnsignedLongIntCast(zm);
+            break;
+        }
+
+        case FilterHash::BOOST_HASH_COMBINE: {
+            hashValue = BoostHashCombine(zm);
+            break;
+        }
+
+        default:
+            throw std::runtime_error{"unsupported filter hash type"};
+    }
+
+    const auto modResult = hashValue % denominator_;
+    return Compare::Check(modResult, value_, cmp_);
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/include/pbbam/internal/QueryBase.h b/include/pbbam/internal/QueryBase.h

new file mode 100644 (file)

index 0000000..427c49a
--- /dev/null
+++ b/include/pbbam/internal/QueryBase.h
@@ -0,0 +1,107 @@
+// Author: Derek Barnett
+
+#ifndef QUERYBASE_H
+#define QUERYBASE_H
+
+#include "pbbam/Config.h"
+
+#include "pbbam/BamFile.h"
+#include "pbbam/BamRecord.h"
+#include "pbbam/DataSet.h"
+
+#include <cassert>
+#include <memory>
+#include <vector>
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+template <typename T>
+class QueryBase;
+// Common state for query iterators: the query being read and the most recently read record.
+template <typename T>
+class QueryIteratorBase
+{
+public:
+    ~QueryIteratorBase() = default;
+    // Equality is defined in QueryBase.inl as a comparison of the query_ pointers.
+    bool operator==(const QueryIteratorBase<T>& other) const;
+    bool operator!=(const QueryIteratorBase<T>& other) const;
+
+protected:
+    QueryIteratorBase() = default;           // leaves query_ null
+    QueryIteratorBase(QueryBase<T>& query);  // binds to `query`, then calls ReadNext()
+
+    void ReadNext();  // pulls the next record via query_->GetNext(); nulls query_ on false
+
+protected:
+    QueryBase<T>* query_ = nullptr;  // never deleted by this class (destructor is defaulted)
+    T record_;                       // storage handed to GetNext()
+};
+// Mutable iterator: dereferencing yields the record stored in the base class.
+template <typename T>
+class QueryIterator : public QueryIteratorBase<T>
+{
+public:
+    QueryIterator() = default;
+    QueryIterator(QueryBase<T>& query);
+    // Both accessors refer to QueryIteratorBase<T>::record_.
+    T& operator*();
+    T* operator->();
+    // Pre-increment reads the next record; post-increment returns a copy taken beforehand.
+    QueryIterator<T>& operator++();
+    QueryIterator<T> operator++(int);
+};
+// Read-only iterator: same machinery as QueryIterator, but exposes the record as const.
+template <typename T>
+class QueryConstIterator : public QueryIteratorBase<T>
+{
+public:
+    QueryConstIterator() = default;
+    QueryConstIterator(const QueryBase<T>& query);  // QueryBase.inl casts away the const
+    // Both accessors refer to QueryIteratorBase<T>::record_.
+    const T& operator*() const;
+    const T* operator->() const;
+    // Pre-increment reads the next record; post-increment returns a copy taken beforehand.
+    QueryConstIterator<T>& operator++();
+    QueryConstIterator<T> operator++(int);
+};
+// Abstract record source: subclasses implement GetNext(); iteration is layered on top of it.
+template <typename T>
+class QueryBase
+{
+    // iterator types returned by begin()/end() below
+public:
+    using iterator = QueryIterator<T>;
+    using const_iterator = QueryConstIterator<T>;
+
+public:
+    virtual ~QueryBase() = default;
+
+public:
+    QueryConstIterator<T> begin() const;
+    QueryConstIterator<T> cbegin() const;
+    QueryIterator<T> begin();
+    // end iterators are default-constructed, i.e. they hold a null query pointer
+    QueryConstIterator<T> end() const;
+    QueryConstIterator<T> cend() const;
+    QueryIterator<T> end();
+
+public:
+    virtual bool GetNext(T& r) = 0;  // a false return is treated as end-of-data by ReadNext()
+
+protected:
+    QueryBase() = default;
+};
+// Shorthand for queries yielding one BamRecord, or one vector of BamRecords, per step.
+using IQuery = QueryBase<BamRecord>;
+using IGroupQuery = QueryBase<std::vector<BamRecord>>;
+
+}  // namespace internal
+}  // namespace BAM
+}  // namespace PacBio
+
+#include "pbbam/internal/QueryBase.inl"
+
+#endif  // QUERYBASE_H
diff --git a/include/pbbam/internal/QueryBase.inl b/include/pbbam/internal/QueryBase.inl

new file mode 100644 (file)

index 0000000..49cf860
--- /dev/null
+++ b/include/pbbam/internal/QueryBase.inl
@@ -0,0 +1,122 @@
+// Author: Derek Barnett
+
+#include "pbbam/internal/QueryBase.h"
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// -------------------
+// QueryIteratorBase
+// -------------------
+// Binding constructor: stores the query's address and immediately reads the first record.
+template<typename T>
+inline QueryIteratorBase<T>::QueryIteratorBase(QueryBase<T>& query)
+    : query_{&query}
+{ ReadNext(); }
+// Two iterators are equal when their query_ pointers match; record contents are ignored.
+template<typename T> inline
+bool QueryIteratorBase<T>::operator==(const QueryIteratorBase<T>& other) const
+{ return query_ == other.query_; }
+// Defined as the negation of operator==.
+template<typename T> inline
+bool QueryIteratorBase<T>::operator!=(const QueryIteratorBase<T>& other) const
+{ return !(*this == other); }
+
+// -------------------
+// QueryIterator
+// -------------------
+// Construction is delegated to the base class, which reads the first record.
+template<typename T> inline
+QueryIterator<T>::QueryIterator(QueryBase<T>& query)
+    : QueryIteratorBase<T>{query}
+{ }
+// Gives mutable access to the record held by the base class.
+template<typename T> inline
+T& QueryIterator<T>::operator*()
+{ return QueryIteratorBase<T>::record_; }
+// Address of the same record returned by operator*.
+template<typename T> inline
+T* QueryIterator<T>::operator->()
+{ return &(operator*()); }
+// Pre-increment: replaces the held record with the next one from the query.
+template<typename T> inline
+QueryIterator<T>& QueryIterator<T>::operator++()
+{ QueryIteratorBase<T>::ReadNext(); return *this; }
+// Post-increment: copies the iterator (record included), advances, returns the copy.
+template<typename T> inline
+QueryIterator<T> QueryIterator<T>::operator++(int)
+{
+    QueryIterator<T> result(*this);
+    ++(*this);
+    return result;
+}
+
+// --------------------
+// QueryConstIterator
+// --------------------
+// The const is cast away because the base class reads through a non-const query reference.
+template<typename T> inline
+QueryConstIterator<T>::QueryConstIterator(const QueryBase<T>& query)
+    : QueryIteratorBase<T>{const_cast<QueryBase<T>&>(query)}
+{ }
+// Gives read-only access to the record held by the base class.
+template<typename T> inline
+const T& QueryConstIterator<T>::operator*() const
+{ return QueryIteratorBase<T>::record_; }
+// Address of the same record returned by operator*.
+template<typename T> inline
+const T* QueryConstIterator<T>::operator->() const
+{ return &(operator*()); }
+// Pre-increment: replaces the held record with the next one from the query.
+template<typename T> inline
+QueryConstIterator<T>& QueryConstIterator<T>::operator++()
+{ QueryIteratorBase<T>::ReadNext(); return *this; }
+// Post-increment: copies the iterator (record included), advances, returns the copy.
+template<typename T> inline
+QueryConstIterator<T> QueryConstIterator<T>::operator++(int)
+{
+    QueryConstIterator<T> result(*this);
+    ++(*this);
+    return result;
+}
+
+// -----------
+// QueryBase
+// -----------
+// Every begin()/cbegin() call builds a new iterator, whose constructor reads one record.
+template<typename T> inline
+QueryConstIterator<T> QueryBase<T>::begin() const
+{ return QueryConstIterator<T>(*this); }
+// Non-const overload: returns the mutable iterator type.
+template<typename T> inline
+QueryIterator<T> QueryBase<T>::begin()
+{ return QueryIterator<T>(*this); }
+// Same as begin() const.
+template<typename T> inline
+QueryConstIterator<T> QueryBase<T>::cbegin() const
+{ return QueryConstIterator<T>(*this); }
+// End iterators are default-constructed: no query attached, nothing is read.
+template<typename T> inline
+QueryConstIterator<T> QueryBase<T>::cend() const
+{ return QueryConstIterator<T>(); }
+// Same as cend().
+template<typename T> inline
+QueryConstIterator<T> QueryBase<T>::end() const
+{ return QueryConstIterator<T>(); }
+// Non-const overload: returns the mutable iterator type.
+template<typename T> inline
+QueryIterator<T> QueryBase<T>::end()
+{ return QueryIterator<T>(); }
+// Fetches the next record into record_; on failure the iterator becomes equal to end().
+template<typename T>
+inline void QueryIteratorBase<T>::ReadNext()
+{
+    assert(query_);  // advancing an end/default iterator is a caller error
+    if (!query_->GetNext(record_))
+        query_ = nullptr;  // null query_ is what default-constructed (end) iterators hold
+}
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/vcf/VcfFile.h b/include/pbbam/vcf/VcfFile.h

new file mode 100644 (file)

index 0000000..8d6c356
--- /dev/null
+++ b/include/pbbam/vcf/VcfFile.h
@@ -0,0 +1,32 @@
+// Author: Derek Barnett
+
+#ifndef PBBAM_VCF_VCFFILE_H
+#define PBBAM_VCF_VCFFILE_H
+
+#include "pbbam/Config.h"
+
+#include <string>
+
+#include <pbbam/vcf/VcfHeader.h>
+
+namespace PacBio {
+namespace VCF {
+// Pairs a file name with a VcfHeader; how the header is obtained is not visible in this header.
+class VcfFile
+{
+public:
+    explicit VcfFile(std::string fn);
+
+public:
+    const std::string& Filename() const;
+    const VcfHeader& Header() const;
+
+private:
+    std::string filename_;  // presumably the `fn` passed to the constructor - confirm in .cpp
+    VcfHeader header_;
+};
+
+}  // namespace VCF
+}  // namespace PacBio
+
+#endif  // PBBAM_VCF_VCFFILE_H
diff --git a/include/pbbam/vcf/VcfFormat.h b/include/pbbam/vcf/VcfFormat.h

new file mode 100644 (file)

index 0000000..b6885ee
--- /dev/null
+++ b/include/pbbam/vcf/VcfFormat.h
@@ -0,0 +1,107 @@
+// Author: Derek Barnett
+
+#ifndef PBBAM_VCF_VCFFORMAT_H
+#define PBBAM_VCF_VCFFORMAT_H
+
+#include "pbbam/Config.h"
+
+#include <iosfwd>
+#include <string>
+
+#include <pbbam/vcf/VcfHeader.h>
+#include <pbbam/vcf/VcfVariant.h>
+
+namespace PacBio {
+namespace VCF {
+/// Collection of static VCF parse/format entry points; the struct has no data members.
+struct VcfFormat
+{
+public:
+    /// \name General format info
+    /// \{
+
+    static const char* CurrentVersion();
+
+    /// \}
+
+public:
+    /// \name Header format
+    /// \{
+    // NOTE(review): only declarations are visible here; behavior lives in the implementation.
+    static VcfHeader ParsedHeader(const std::string& text);
+
+    static std::string FormattedHeader(const VcfHeader& header);
+
+    static VcfHeader HeaderFromFile(const std::string& fn);
+
+    static VcfHeader HeaderFromStream(std::istream& in);
+
+    /// \}
+
+public:
+    /// \name Variant format
+    /// \{
+
+    static VcfVariant ParsedVariant(const std::string& line);
+
+    static std::string FormattedVariant(const VcfVariant& var);
+
+    /// \}
+
+    // ---------------------------------------------------------------------- //
+    // The following methods are mostly internal helpers, exposed here for    //
+    // testing. Client code should probably not need these, but they are      //
+    // available here if needed.                                              //
+    // ---------------------------------------------------------------------- //
+
+public:
+    /// \internal
+    /// \name Header format helpers
+    /// \{
+    // Four of the five Parsed*Definition helpers take `line` by value (General takes const&).
+    static ContigDefinition ParsedContigDefinition(std::string line);
+
+    static FilterDefinition ParsedFilterDefinition(std::string line);
+
+    static FormatDefinition ParsedFormatDefinition(std::string line);
+
+    static GeneralDefinition ParsedGeneralDefinition(const std::string& line);
+
+    static InfoDefinition ParsedInfoDefinition(std::string line);
+
+    static std::string FormattedContigDefinition(const ContigDefinition& def);
+
+    static std::string FormattedFilterDefinition(const FilterDefinition& def);
+
+    static std::string FormattedFormatDefinition(const FormatDefinition& def);
+
+    static std::string FormattedGeneralDefinition(const GeneralDefinition& def);
+
+    static std::string FormattedInfoDefinition(const InfoDefinition& def);
+
+    /// \}
+
+public:
+    /// \internal
+    /// \name Variant format helpers
+    /// \{
+
+    static std::string FormattedInfoField(const InfoField& field);
+
+    static std::string FormattedInfoFields(const std::vector<InfoField>& fields);
+
+    static std::string FormattedGenotypeField(const GenotypeField& field);
+
+    static InfoField ParsedInfoField(const std::string& text);
+
+    static std::vector<InfoField> ParsedInfoFields(const std::string& text);
+
+    static GenotypeField ParsedGenotypeField(const std::string& field);
+
+    /// \}
+};
+
+}  // namespace VCF
+}  // namespace PacBio
+
+#endif  // PBBAM_VCF_VCFFORMAT_H
diff --git a/include/pbbam/vcf/VcfHeader.h b/include/pbbam/vcf/VcfHeader.h

new file mode 100644 (file)

index 0000000..49bc62d
--- /dev/null
+++ b/include/pbbam/vcf/VcfHeader.h
@@ -0,0 +1,115 @@
+// Author: Derek Barnett
+
+#ifndef PBBAM_VCF_VCFHEADER_H
+#define PBBAM_VCF_VCFHEADER_H
+
+#include "pbbam/Config.h"
+
+#include <stdexcept>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include <boost/optional.hpp>
+
+#include <pbbam/vcf/VcfHeaderTypes.h>
+
+namespace PacBio {
+namespace VCF {
+// In-memory VCF header: per-category definition lists plus string-keyed lookup tables.
+class VcfHeader
+{
+public:
+    VcfHeader();
+
+    explicit VcfHeader(const std::string& hdrText);
+
+public:
+    // general lines (read-only accessors)
+
+    size_t NumLines() const;
+
+    const std::string& FileDate() const;
+    const std::string& Version() const;
+
+    const std::vector<PacBio::VCF::GeneralDefinition>& GeneralDefinitions() const;
+    const PacBio::VCF::GeneralDefinition& GeneralDefinition(const std::string& id) const;
+
+    // ##contig
+    const std::vector<PacBio::VCF::ContigDefinition>& ContigDefinitions() const;
+    const PacBio::VCF::ContigDefinition& ContigDefinition(const std::string& id) const;
+
+    // INFO
+
+    const std::vector<PacBio::VCF::InfoDefinition>& InfoDefinitions() const;
+    const PacBio::VCF::InfoDefinition& InfoDefinition(const std::string& id) const;
+
+    // FILTER
+
+    const std::vector<PacBio::VCF::FilterDefinition>& FilterDefinitions() const;
+    const PacBio::VCF::FilterDefinition& FilterDefinition(const std::string& id) const;
+
+    // FORMAT
+
+    const std::vector<PacBio::VCF::FormatDefinition>& FormatDefinitions() const;
+    const PacBio::VCF::FormatDefinition& FormatDefinition(const std::string& id) const;
+
+    // samples
+
+    size_t IndexOfSample(const Sample& sample) const;
+    const Sample& SampleAt(size_t index) const;
+    const std::vector<Sample>& Samples() const;
+
+public:
+    // general lines (mutators; each returns VcfHeader&, presumably *this - confirm in .cpp)
+
+    VcfHeader& FileDate(std::string fileDate);
+    VcfHeader& Version(std::string version);
+
+    VcfHeader& AddGeneralDefinition(PacBio::VCF::GeneralDefinition def);
+    VcfHeader& GeneralDefinitions(std::vector<PacBio::VCF::GeneralDefinition> defs);
+
+    // ##contig
+    VcfHeader& AddContigDefinition(PacBio::VCF::ContigDefinition def);
+    VcfHeader& ContigDefinitions(std::vector<PacBio::VCF::ContigDefinition> defs);
+
+    // INFO
+
+    VcfHeader& AddInfoDefinition(PacBio::VCF::InfoDefinition info);
+    VcfHeader& InfoDefinitions(std::vector<PacBio::VCF::InfoDefinition> defs);
+
+    // FILTER
+
+    VcfHeader& AddFilterDefinition(PacBio::VCF::FilterDefinition filter);
+    VcfHeader& FilterDefinitions(std::vector<PacBio::VCF::FilterDefinition> defs);
+
+    // FORMAT
+
+    VcfHeader& AddFormatDefinition(PacBio::VCF::FormatDefinition format);
+    VcfHeader& FormatDefinitions(std::vector<PacBio::VCF::FormatDefinition> defs);
+
+    // samples
+
+    VcfHeader& AddSample(std::string sample);
+    VcfHeader& Samples(std::vector<std::string> names);
+
+private:
+    std::vector<PacBio::VCF::GeneralDefinition> generalDefinitions_;
+    std::vector<PacBio::VCF::ContigDefinition> contigDefinitions_;
+    std::vector<PacBio::VCF::InfoDefinition> infoDefinitions_;
+    std::vector<PacBio::VCF::FilterDefinition> filterDefinitions_;
+    std::vector<PacBio::VCF::FormatDefinition> formatDefinitions_;
+    std::vector<PacBio::VCF::Sample> samples_;
+    // string -> size_t tables; presumably id -> index into the matching vector above (confirm)
+    std::unordered_map<std::string, size_t> generalLookup_;
+    std::unordered_map<std::string, size_t> contigLookup_;
+    std::unordered_map<std::string, size_t> infoLookup_;
+    std::unordered_map<std::string, size_t> filterLookup_;
+    std::unordered_map<std::string, size_t> formatLookup_;
+    std::unordered_map<std::string, size_t> sampleLookup_;
+};
+
+}  // namespace VCF
+}  // namespace PacBio
+
+#endif  // PBBAM_VCF_VCFHEADER_H
diff --git a/include/pbbam/vcf/VcfHeaderTypes.h b/include/pbbam/vcf/VcfHeaderTypes.h

new file mode 100644 (file)

index 0000000..9dd8fbf
--- /dev/null
+++ b/include/pbbam/vcf/VcfHeaderTypes.h
@@ -0,0 +1,126 @@
+
+#ifndef PBBAM_VCF_VCFHEADERTYPES_H
+#define PBBAM_VCF_VCFHEADERTYPES_H
+
+#include "pbbam/Config.h"
+
+#include <string>
+#include <utility>
+#include <vector>
+
+#include <boost/optional.hpp>
+
+namespace PacBio {
+namespace VCF {
+// Samples are represented as plain strings.
+using Sample = std::string;
+// Contig definition: an id plus an ordered list of (string, string) attribute pairs.
+class ContigDefinition
+{
+public:
+    explicit ContigDefinition(std::string id);
+    ContigDefinition(std::string id, std::vector<std::pair<std::string, std::string>> attributes);
+
+public:
+    const std::string& Id() const;
+    const std::vector<std::pair<std::string, std::string>>& Attributes() const;
+    // mutators return ContigDefinition& (presumably *this, for chaining - confirm in .cpp)
+    ContigDefinition& AddAttribute(std::string id, std::string value);
+    ContigDefinition& AddAttribute(std::pair<std::string, std::string> attribute);
+    ContigDefinition& Attributes(std::vector<std::pair<std::string, std::string>> attributes);
+
+private:
+    std::string id_;
+    std::vector<std::pair<std::string, std::string>> attributes_;
+};
+
+///
+/// \brief Filter definition: an id and a description, exposed through const getters only.
+///
+class FilterDefinition
+{
+public:
+    FilterDefinition(std::string id, std::string description);
+
+    const std::string& Id() const;
+    const std::string& Description() const;
+
+private:
+    // required fields
+    std::string id_;
+    std::string description_;
+};
+
+///
+/// \brief Format definition: id, number, type and description, all held as strings.
+///
+class FormatDefinition
+{
+public:
+    FormatDefinition(std::string id, std::string number, std::string type, std::string description);
+
+    const std::string& Id() const;
+    const std::string& Number() const;
+    const std::string& Type() const;
+    const std::string& Description() const;
+
+private:
+    std::string id_;
+    std::string number_;  // TODO: enum
+    std::string type_;    // TODO: enum
+    std::string description_;
+};
+
+///
+/// \brief General definition: an id and its text, exposed through const getters only.
+///
+class GeneralDefinition
+{
+public:
+    GeneralDefinition(std::string id, std::string text);
+
+    const std::string& Id() const;
+    const std::string& Text() const;
+
+private:
+    // required fields
+    std::string id_;
+    std::string text_;
+};
+
+///
+/// \brief Info definition: four required string fields plus optional source and version.
+///
+class InfoDefinition
+{
+public:
+    InfoDefinition(std::string id, std::string number, std::string type, std::string description,
+                   std::string source = std::string{}, std::string version = std::string{});
+
+    const std::string& Id() const;
+    const std::string& Number() const;
+    const std::string& Type() const;
+    const std::string& Description() const;
+    const boost::optional<std::string>& Source() const;
+    const boost::optional<std::string>& Version() const;
+    // NOTE(review): how the ctor's empty-string defaults map onto the optionals is not visible here
+    InfoDefinition& Source(std::string s);
+    InfoDefinition& Version(std::string v);
+
+private:
+    // required fields
+    // (functionally const, not marked as such to still allow moves)
+    std::string id_;
+    std::string number_;  // TODO: enum
+    std::string type_;    // TODO: enum
+    std::string description_;
+
+    // optional fields - settable after ctor
+    boost::optional<std::string> source_;
+    boost::optional<std::string> version_;
+};
+
+}  // namespace VCF
+}  // namespace PacBio
+
+#endif  // PBBAM_VCF_VCFHEADERTYPES_H
diff --git a/include/pbbam/vcf/VcfQuery.h b/include/pbbam/vcf/VcfQuery.h

new file mode 100644 (file)

index 0000000..c0c8a6d
--- /dev/null
+++ b/include/pbbam/vcf/VcfQuery.h
@@ -0,0 +1,40 @@
+// Author: Derek Barnett
+
+#ifndef PBBAM_VCF_VCFQUERY_H
+#define PBBAM_VCF_VCFQUERY_H
+
+#include "pbbam/Config.h"
+
+#include <string>
+
+#include <pbbam/vcf/VcfFile.h>
+#include <pbbam/vcf/VcfReader.h>
+#include <pbbam/vcf/VcfVariant.h>
+
+#include <pbbam/internal/QueryBase.h>
+
+namespace PacBio {
+namespace VCF {
+// QueryBase<VcfVariant> implementation that owns a VcfReader (iteration comes from QueryBase).
+class VcfQuery : public PacBio::BAM::internal::QueryBase<VcfVariant>
+{
+public:
+    explicit VcfQuery(std::string fn);
+    explicit VcfQuery(const VcfFile& file);
+
+public:
+    /// \brief Main iteration point for record access.
+    ///
+    /// Most client code should not need to use this method directly. Use
+    /// iterators instead.
+    ///
+    bool GetNext(VcfVariant& var) override;
+
+private:
+    VcfReader reader_;  // presumably what GetNext() reads from - confirm in .cpp
+};
+
+}  // namespace VCF
+}  // namespace PacBio
+
+#endif  // PBBAM_VCF_VCFQUERY_H
diff --git a/include/pbbam/vcf/VcfReader.h b/include/pbbam/vcf/VcfReader.h

new file mode 100644 (file)

index 0000000..f4fbdbb
--- /dev/null
+++ b/include/pbbam/vcf/VcfReader.h
@@ -0,0 +1,46 @@
+// Author: Derek Barnett
+
+#ifndef PBBAM_VCF_VCFREADER_H
+#define PBBAM_VCF_VCFREADER_H
+
+#include "pbbam/Config.h"
+
+#include <fstream>
+#include <iostream>
+#include <memory>
+#include <string>
+
+#include <pbbam/vcf/VcfFile.h>
+#include <pbbam/vcf/VcfHeader.h>
+#include <pbbam/vcf/VcfVariant.h>
+
+namespace PacBio {
+namespace VCF {
+
+///
+/// \brief Reader holding an input file stream, a VcfHeader and a one-line text buffer.
+///
+class VcfReader
+{
+public:
+    explicit VcfReader(std::string fn);
+    explicit VcfReader(const VcfFile& file);
+
+public:
+    const VcfHeader& Header() const;
+    // NOTE(review): presumably returns false once no further variant is available - confirm
+    bool GetNext(VcfVariant& var);
+
+private:
+    void FetchNext();
+
+private:
+    std::ifstream in_;
+    VcfHeader header_;
+    std::string line_;
+};
+
+}  // namespace VCF
+}  // namespace PacBio
+
+#endif  // PBBAM_VCF_VCFREADER_H
diff --git a/include/pbbam/vcf/VcfSort.h b/include/pbbam/vcf/VcfSort.h

new file mode 100644 (file)

index 0000000..02c3aa1
--- /dev/null
+++ b/include/pbbam/vcf/VcfSort.h
@@ -0,0 +1,32 @@
+// Author: Derek Barnett
+
+#ifndef PBBAM_VCF_VCFSORT_H
+#define PBBAM_VCF_VCFSORT_H
+
+#include "pbbam/Config.h"
+
+#include <string>
+
+#include "pbbam/vcf/VcfFile.h"
+
+namespace PacBio {
+namespace VCF {
+
+///
+/// \brief SortFile (overload taking an already constructed VcfFile)
+/// \param file             input, passed as a VcfFile
+/// \param outputFilename   name of the output file (sort order is not visible here)
+///
+void SortFile(const VcfFile& file, const std::string& outputFilename);
+
+///
+/// \brief SortFile (overload taking the input by file name)
+/// \param inputFilename    name of the input file
+/// \param outputFilename   name of the output file (sort order is not visible here)
+///
+void SortFile(const std::string& inputFilename, const std::string& outputFilename);
+
+}  // namespace VCF
+}  // namespace PacBio
+
+#endif  // PBBAM_VCF_VCFSORT_H
diff --git a/include/pbbam/vcf/VcfVariant.h b/include/pbbam/vcf/VcfVariant.h

new file mode 100644 (file)

index 0000000..84f8ad7
--- /dev/null
+++ b/include/pbbam/vcf/VcfVariant.h
@@ -0,0 +1,146 @@
+// Author: Derek Barnett
+
+#ifndef PBBAM_VCF_VARIANT_H
+#define PBBAM_VCF_VARIANT_H
+
+#include "pbbam/Config.h"
+
+#include <cassert>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include <boost/optional.hpp>
+
+#include <pbbam/Position.h>
+#include <pbbam/vcf/VcfHeaderTypes.h>
+
+namespace PacBio {
+namespace VCF {
+
+struct InfoField
+{
+    std::string id;
+    boost::optional<std::string> value;
+    boost::optional<std::vector<std::string>> values;
+};
+
+struct GenotypeData
+{
+    boost::optional<std::string> value;
+    boost::optional<std::vector<std::string>> values;
+};
+
+struct GenotypeField
+{
+    std::vector<GenotypeData> data;
+};
+
+class VcfVariant
+{
+public:
+    VcfVariant();
+
+    explicit VcfVariant(const std::string& text);
+
+    VcfVariant(std::string id, std::string chrom, PacBio::BAM::Position pos, std::string refAllele,
+               std::string altAllele);
+
+public:
+    // core fields
+
+    const std::string& Chrom() const;
+    VcfVariant& Chrom(std::string chrom);
+
+    PacBio::BAM::Position Position() const;
+    VcfVariant& Position(PacBio::BAM::Position pos);
+
+    const std::string& Id() const;
+    VcfVariant& Id(std::string id);
+
+    const std::string& RefAllele() const;
+    VcfVariant& RefAllele(std::string refAllele);
+
+    const std::string& AltAllele() const;
+    VcfVariant& AltAllele(std::string altAllele);
+
+    float Quality() const;
+    VcfVariant& Quality(float qual);
+
+    const std::string& Filter() const;
+    VcfVariant& Filter(std::string filter);
+
+    // convenience methods
+    bool IsDeletion() const;
+    bool IsInsertion() const;
+    bool IsQualityMissing() const;
+    bool IsSnp() const;
+
+public:
+    // info fields
+
+    VcfVariant& AddInfoField(InfoField field);
+    VcfVariant& RemoveInfoField(const std::string& id);
+
+    const std::vector<InfoField>& InfoFields() const;
+    VcfVariant& InfoFields(std::vector<InfoField> fields);
+
+    bool HasInfoField(const std::string& id) const;
+
+    const boost::optional<std::string> InfoValue(const std::string& id) const;
+    VcfVariant& InfoValue(const std::string& id, boost::optional<std::string> value);
+
+    const boost::optional<std::vector<std::string>> InfoValues(const std::string& id) const;
+    VcfVariant& InfoValues(const std::string& id, boost::optional<std::vector<std::string>> values);
+
+public:
+    // sample genotypes
+
+    // NOTE: if you want to look up by sample name, get the index from header
+
+    std::vector<std::string> GenotypeIds() const;
+    VcfVariant& GenotypeIds(std::vector<std::string> ids);
+
+    std::vector<GenotypeField> Genotypes() const;
+    VcfVariant& Genotypes(std::vector<GenotypeField> genotypes);
+
+    const boost::optional<std::string>& GenotypeValue(const size_t sampleIndex,
+                                                      const std::string& id) const;
+    VcfVariant& GenotypeValue(const size_t sampleIndex, const std::string& id,
+                              boost::optional<std::string> value);
+
+    const boost::optional<std::vector<std::string>>& GenotypeValues(const size_t sampleIndex,
+                                                                    const std::string& id) const;
+    VcfVariant& GenotypeValues(const size_t sampleIndex, const std::string& id,
+                               boost::optional<std::vector<std::string>> values);
+
+    bool IsSampleHeterozygous(const size_t sampleIndex) const;
+    bool IsSamplePhased(const size_t sampleIndex) const;
+
+private:
+    // FIXED data
+    std::string chrom_;
+    PacBio::BAM::Position pos_;
+    std::string id_;
+    std::string refAllele_;
+    std::string altAllele_;  // multiple? KISS, only add if needed
+    float qual_;
+    std::string filter_;
+
+    // INFO data
+    std::vector<InfoField> infoFields_;
+    std::unordered_map<std::string, size_t> infoLookup_;
+
+    // SAMPLE GENOTYPE data
+    std::vector<std::string> format_;  // order matches FORMAT string
+    std::unordered_map<std::string, size_t>
+        genotypeDataLookup_;                      // genotype ID -> genotypeField.data index
+    std::vector<GenotypeField> sampleGenotypes_;  // index matches sample order
+};
+
+}  // namespace VCF
+}  // namespace PacBio
+
+// #include "pbbam/vcf/internal/VcfVariant.inl"
+
+#endif  // PBBAM_VCF_VARIANT_H
diff --git a/include/pbbam/vcf/VcfWriter.h b/include/pbbam/vcf/VcfWriter.h

new file mode 100644 (file)

index 0000000..266d9a8
--- /dev/null
+++ b/include/pbbam/vcf/VcfWriter.h
@@ -0,0 +1,37 @@
+// Author: Derek Barnett
+
+#ifndef PBBAM_VCF_VCFWRITER_H
+#define PBBAM_VCF_VCFWRITER_H
+
+#include "pbbam/Config.h"
+
+#include <memory>
+#include <string>
+
+namespace PacBio {
+namespace VCF {
+
+class VcfHeader;
+class VcfVariant;
+
+class VcfWriter
+{
+public:
+    VcfWriter(std::string filename, const VcfHeader& header);
+
+    VcfWriter(VcfWriter&&) noexcept;
+    VcfWriter& operator=(VcfWriter&&) noexcept;
+    ~VcfWriter();
+
+public:
+    bool Write(const VcfVariant& var);
+
+private:
+    struct VcfWriterPrivate;
+    std::unique_ptr<VcfWriterPrivate> d_;
+};
+
+}  // namespace VCF
+}  // namespace PacBio
+
+#endif  // PBBAM_VCF_VCFWRITER_H
diff --git a/include/pbbam/virtual/VirtualPolymeraseBamRecord.h b/include/pbbam/virtual/VirtualPolymeraseBamRecord.h

new file mode 100644 (file)

index 0000000..372fa99
--- /dev/null
+++ b/include/pbbam/virtual/VirtualPolymeraseBamRecord.h
@@ -0,0 +1,23 @@
+// File Description
+/// \file VirtualPolymeraseBamRecord.h
+/// \brief Defines the VirtualPolymeraseBamRecord class.
+//
+// Author: Armin Töpfer
+
+#ifndef VIRTUALPOLYMERASEBAMRECORD_H
+#define VIRTUALPOLYMERASEBAMRECORD_H
+
+#include "pbbam/Config.h"
+
+#include "pbbam/virtual/VirtualZmwBamRecord.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \deprecated Use VirtualZmwBamRecord instead.
+using VirtualPolymeraseBamRecord = VirtualZmwBamRecord;
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // VIRTUALPOLYMERASEBAMRECORD_H
diff --git a/include/pbbam/virtual/VirtualPolymeraseCompositeReader.h b/include/pbbam/virtual/VirtualPolymeraseCompositeReader.h

new file mode 100644 (file)

index 0000000..a1df56f
--- /dev/null
+++ b/include/pbbam/virtual/VirtualPolymeraseCompositeReader.h
@@ -0,0 +1,23 @@
+// File Description
+/// \file VirtualPolymeraseCompositeReader.h
+/// \brief Defines the VirtualPolymeraseCompositeReader class.
+//
+// Author: Derek Barnett
+
+#ifndef VIRTUALPOLYMERASECOMPOSITEREADER_H
+#define VIRTUALPOLYMERASECOMPOSITEREADER_H
+
+#include "pbbam/Config.h"
+
+#include "pbbam/virtual/ZmwReadStitcher.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \deprecated Use ZmwReadStitcher instead.
+using VirtualPolymeraseCompositeReader = ZmwReadStitcher;
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // VIRTUALPOLYMERASECOMPOSITEREADER_H
diff --git a/include/pbbam/virtual/VirtualPolymeraseReader.h b/include/pbbam/virtual/VirtualPolymeraseReader.h

new file mode 100644 (file)

index 0000000..4c6a66e
--- /dev/null
+++ b/include/pbbam/virtual/VirtualPolymeraseReader.h
@@ -0,0 +1,23 @@
+// File Description
+/// \file VirtualPolymeraseReader.h
+/// \brief Defines the VirtualPolymeraseReader class.
+//
+// Author: Armin Töpfer
+
+#ifndef VIRTUALPOLYMERASEREADER_H
+#define VIRTUALPOLYMERASEREADER_H
+
+#include "pbbam/Config.h"
+
+#include "pbbam/virtual/ZmwReadStitcher.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \deprecated Use ZmwReadStitcher instead.
+using VirtualPolymeraseReader = ZmwReadStitcher;
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // VIRTUALPOLYMERASEREADER_H
diff --git a/include/pbbam/virtual/VirtualRegion.h b/include/pbbam/virtual/VirtualRegion.h

new file mode 100644 (file)

index 0000000..c56d6bb
--- /dev/null
+++ b/include/pbbam/virtual/VirtualRegion.h
@@ -0,0 +1,50 @@
+// File Description
+/// \file VirtualRegion.h
+/// \brief Defines the VirtualRegion class.
+//
+// Author: Armin Töpfer
+
+#ifndef VIRTUALREGION_H
+#define VIRTUALREGION_H
+
+#include "pbbam/Config.h"
+
+#include "pbbam/LocalContextFlags.h"
+#include "pbbam/virtual/VirtualRegionType.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The VirtualRegion represents an annotation of a polymerase region.
+///        All data members have initializers, so a default-constructed region is well-defined.
+class VirtualRegion
+{
+public:
+    VirtualRegionType type{};  ///< region kind; zero until set (not a named enumerator)
+    int beginPos = 0;          ///< begin position of the region
+    int endPos = 0;            ///< end position of the region
+    LocalContextFlags cxTag = LocalContextFlags::NO_LOCAL_CONTEXT;
+    int barcodeLeft = -1;
+    int barcodeRight = -1;
+    int score = 0;
+
+    /// \brief Creates a virtual region with basic type & position info.
+    ///
+    VirtualRegion(const VirtualRegionType type_, const int beginPos_, const int endPos_,
+                  const int score_ = 0);
+
+    /// \brief Creates a virtual region with type/position info, as well as context & barcode.
+    ///
+    VirtualRegion(const VirtualRegionType type_, const int beginPos_, const int endPos_,
+                  const LocalContextFlags cxTag_, const int barcodeLeft_, const int barcodeRight_,
+                  const int score_ = 0);
+
+    VirtualRegion() = default;  // relies on the in-class member initializers above
+
+    bool operator==(const VirtualRegion& v1) const;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // VIRTUALREGION_H
diff --git a/include/pbbam/virtual/VirtualRegionType.h b/include/pbbam/virtual/VirtualRegionType.h

new file mode 100644 (file)

index 0000000..c23d3ee
--- /dev/null
+++ b/include/pbbam/virtual/VirtualRegionType.h
@@ -0,0 +1,30 @@
+// File Description
+/// \file VirtualRegionType.h
+/// \brief Defines the VirtualRegionType enum.
+//
+// Author: Derek Barnett
+
+#ifndef REGIONTYPE_H
+#define REGIONTYPE_H
+
+#include "pbbam/Config.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief This enum defines the types of annotated region.
+///
+enum class VirtualRegionType  // : char
+{
+    ADAPTER = 0x41,   ///< Adapter region ('A')
+    BARCODE = 0x42,   ///< Barcode region ('B')
+    FILTERED = 0x46,  ///< Filtered subread ('F')
+    SUBREAD = 0x53,   ///< Subread ('S')
+    HQREGION = 0x48,  ///< High-quality region ('H')
+    LQREGION = 0x4C   ///< Low-quality region ('L'), i.e. outside the HQ region
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // REGIONTYPE_H
diff --git a/include/pbbam/virtual/VirtualRegionTypeMap.h b/include/pbbam/virtual/VirtualRegionTypeMap.h

new file mode 100644 (file)

index 0000000..d770a35
--- /dev/null
+++ b/include/pbbam/virtual/VirtualRegionTypeMap.h
@@ -0,0 +1,31 @@
+// File Description
+/// \file VirtualRegionTypeMap.h
+/// \brief Defines the VirtualRegionTypeMap class.
+//
+// Author: Derek Barnett
+
+#ifndef VIRTUALREGIONTYPEMAP_H
+#define VIRTUALREGIONTYPEMAP_H
+
+#include "pbbam/Config.h"
+
+#include <map>
+
+#include "pbbam/virtual/VirtualRegionType.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The VirtualRegionTypeMap class provides mapping between char codes and
+///        VirtualRegionType enum keys.
+///
+class VirtualRegionTypeMap
+{
+public:
+    static std::map<char, VirtualRegionType> ParseChar;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // VIRTUALREGIONTYPEMAP_H
diff --git a/include/pbbam/virtual/VirtualZmwBamRecord.h b/include/pbbam/virtual/VirtualZmwBamRecord.h

new file mode 100644 (file)

index 0000000..0fb1965
--- /dev/null
+++ b/include/pbbam/virtual/VirtualZmwBamRecord.h
@@ -0,0 +1,78 @@
+// File Description
+/// \file VirtualZmwBamRecord.h
+/// \brief Defines the VirtualZmwBamRecord class.
+//
+// Author: Armin Töpfer
+
+#ifndef VirtualZmwBAMRECORD_H
+#define VirtualZmwBAMRECORD_H
+
+#include "pbbam/Config.h"
+
+#include <sstream>
+#include <vector>
+
+#include "pbbam/BamHeader.h"
+#include "pbbam/BamRecord.h"
+#include "pbbam/virtual/VirtualRegion.h"
+#include "pbbam/virtual/VirtualRegionType.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The VirtualZmwBamRecord class represents a ZMW read stitched
+///        on-the-fly from subreads|hqregion + scraps.
+///
+class VirtualZmwBamRecord : public BamRecord
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Creates a "virtual" ZMW %BAM record, by re-stitching its
+    ///        constituent segments.
+    ///
+    /// \param[in] unorderedSources source data (subreads, scraps, etc.)
+    /// \param[in] header           %BAM header to associate with the new record
+    ///
+    /// \throws std::runtime_error on failure to stitch virtual record
+    ///
+    VirtualZmwBamRecord(std::vector<BamRecord> unorderedSources, const BamHeader& header);
+
+    /// \}
+
+    /// \name Virtual Record Attributes
+    /// \{
+
+    /// \returns true if requested VirtualRegionType has been annotated.
+    ///
+    bool HasVirtualRegionType(const VirtualRegionType regionType) const;
+
+    /// \returns IPD frame data
+    ///
+    Frames IPDV1Frames(Orientation orientation = Orientation::NATIVE) const;
+
+    /// \brief Provides all annotations of the polymerase read as a map (type => regions)
+    ///
+    std::map<VirtualRegionType, std::vector<VirtualRegion>> VirtualRegionsMap() const;
+
+    /// \brief Provides annotations of the polymerase read for a given VirtualRegionType.
+    ///
+    /// \param[in] regionType  requested region type
+    /// \returns regions that match the requested type (empty vector if none found).
+    ///
+    std::vector<VirtualRegion> VirtualRegionsTable(const VirtualRegionType regionType) const;
+
+    /// \}
+
+private:
+    std::vector<BamRecord> sources_;
+    std::map<VirtualRegionType, std::vector<VirtualRegion>> virtualRegionsMap_;
+
+    void StitchSources();
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // VirtualZmwBAMRECORD_H
diff --git a/include/pbbam/virtual/WhitelistedZmwReadStitcher.h b/include/pbbam/virtual/WhitelistedZmwReadStitcher.h

new file mode 100644 (file)

index 0000000..c5272f5
--- /dev/null
+++ b/include/pbbam/virtual/WhitelistedZmwReadStitcher.h
@@ -0,0 +1,103 @@
+// File Description
+/// \file WhitelistedZmwReadStitcher.h
+/// \brief Defines the WhitelistedZmwReadStitcher class.
+//
+// Author: Derek Barnett
+
+#ifndef WHITELISTEDZMWREADSTITCHER_H
+#define WHITELISTEDZMWREADSTITCHER_H
+
+#include "pbbam/Config.h"
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "pbbam/virtual/VirtualZmwBamRecord.h"
+
+namespace PacBio {
+namespace BAM {
+
+class DataSet;
+class PbiFilter;
+
+/// \brief The WhitelistedZmwReadStitcher class provides an interface for
+///        re-stitching "virtual" ZMW reads from their constituent parts,
+///        limiting results to only those reads originating from a 'whitelist'
+///         of ZMW hole numbers.
+///
+/// Whitelisted ZMWs that are not present in both primary and scraps BAMs
+/// will be "pre-removed." This ensures that, given client code like this:
+///
+/// \include code/WhitelistedZmwReadStitcher.txt
+///
+/// each iteration will always provide valid data - either a valid virtual
+/// record from Next() or a non-empty vector from NextRaw().
+///
+/// \note This reader requires that both input %BAM files also have associated
+///       PBI files available for query. See BamFile::EnsurePacBioIndexExists .
+///
+class PBBAM_EXPORT WhitelistedZmwReadStitcher
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Creates a reader that will operate on a primary %BAM file (e.g. subread data)
+    ///        and a scraps file, using a ZMW whitelist to filter the input.
+    ///
+    /// \param[in] zmwWhitelist         list of ZMWs to restrict iteration over
+    /// \param[in] primaryBamFilePath   hqregion.bam or subreads.bam file path
+    /// \param[in] scrapsBamFilePath    scraps.bam file path
+    ///
+    /// \note This reader requires that both input %BAM files also have associated PBI
+    ///       files available for query. See BamFile::EnsurePacBioIndexExists .
+    ///
+    /// \throws std::runtime_error if any files (*.bam and/or *.pbi) were not available for reading, or
+    ///         if malformed data encountered
+    ///
+    WhitelistedZmwReadStitcher(const std::vector<int32_t>& zmwWhitelist,
+                               const std::string& primaryBamFilePath,
+                               const std::string& scrapsBamFilePath);
+
+    ~WhitelistedZmwReadStitcher();
+
+    /// \}
+
+    /// \name Stitched Record Reading
+    /// \{
+
+    /// \returns true if more ZMWs are available for reading.
+    bool HasNext() const;
+
+    /// \returns the re-stitched polymerase read from the next ZMW in the whitelist
+    VirtualZmwBamRecord Next();
+
+    /// \returns the set of reads that belong to the next ZMW in the whitelist.
+    ///          This enables stitching records in a distinct thread.
+    ///
+    std::vector<BamRecord> NextRaw();
+
+    /// \}
+
+    /// \name File Headers
+    /// \{
+
+    /// \returns the BamHeader associated with this reader's "primary" %BAM file
+    BamHeader PrimaryHeader() const;
+
+    /// \returns the BamHeader associated with this reader's "scraps" %BAM file
+    BamHeader ScrapsHeader() const;
+
+    /// \}
+
+private:
+    class WhitelistedZmwReadStitcherPrivate;
+    std::unique_ptr<WhitelistedZmwReadStitcherPrivate> d_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // WHITELISTEDZMWREADSTITCHER_H
diff --git a/include/pbbam/virtual/ZmwReadStitcher.h b/include/pbbam/virtual/ZmwReadStitcher.h

new file mode 100644 (file)

index 0000000..711cfa7
--- /dev/null
+++ b/include/pbbam/virtual/ZmwReadStitcher.h
@@ -0,0 +1,89 @@
+// File Description
+/// \file ZmwReadStitcher.h
+/// \brief Defines the ZmwReadStitcher class.
+//
+// Author: Derek Barnett
+
+#ifndef ZMWREADSTITCHER_H
+#define ZMWREADSTITCHER_H
+
+#include "pbbam/Config.h"
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "pbbam/BamRecord.h"
+#include "pbbam/virtual/VirtualZmwBamRecord.h"
+
+namespace PacBio {
+namespace BAM {
+
+class DataSet;
+class PbiFilter;
+
+/// \brief The ZmwReadStitcher class provides an interface for re-stitching
+///        "virtual" polymerase reads from their constituent parts.
+///
+/// \note This reader requires that any input %BAM files also have associated PBI
+///       files available for query. See BamFile::EnsurePacBioIndexExists .
+///
+class PBBAM_EXPORT ZmwReadStitcher
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// entire file, from BAM names
+    ZmwReadStitcher(std::string primaryBamFilePath, std::string scrapsBamFilePath);
+
+    /// filtered input from BAM names
+    ZmwReadStitcher(std::string primaryBamFilePath, std::string scrapsBamFilePath,
+                    PbiFilter filter);
+
+    /// maybe filtered, from DataSet input
+    ZmwReadStitcher(const DataSet& dataset);
+
+    ~ZmwReadStitcher();
+
+    /// \}
+
+    /// \name File Headers
+    /// \{
+
+    /// \returns the BamHeader associated with this reader's "primary" %BAM file
+    BamHeader PrimaryHeader() const;
+
+    /// \returns the BamHeader associated with this reader's "scraps" %BAM file
+    BamHeader ScrapsHeader() const;
+
+    /// \return the BamHeader associated with the newly stitched BAM data
+    BamHeader StitchedHeader() const;
+
+    /// \}
+
+    /// \name Stitched Record Reading
+    /// \{
+
+    /// \returns true if more ZMWs are available for reading.
+    bool HasNext();
+
+    /// \returns the next stitched polymerase read
+    VirtualZmwBamRecord Next();
+
+    /// \returns the next set of reads that belong to one ZMW.
+    ///          This enables stitching records in a distinct thread.
+    ///
+    std::vector<BamRecord> NextRaw();
+
+    /// \}
+
+private:
+    class ZmwReadStitcherPrivate;
+    std::unique_ptr<ZmwReadStitcherPrivate> d_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // ZMWREADSTITCHER_H
diff --git a/include/pbbam/virtual/ZmwWhitelistVirtualReader.h b/include/pbbam/virtual/ZmwWhitelistVirtualReader.h

new file mode 100644 (file)

index 0000000..af1ab7c
--- /dev/null
+++ b/include/pbbam/virtual/ZmwWhitelistVirtualReader.h
@@ -0,0 +1,23 @@
+// File Description
+/// \file ZmwWhitelistVirtualReader.h
+/// \brief Defines the ZmwWhitelistVirtualReader class.
+//
+// Author: Derek Barnett
+
+#ifndef ZMWWHITELISTVIRTUALREADER_H
+#define ZMWWHITELISTVIRTUALREADER_H
+
+#include "pbbam/Config.h"
+
+#include "pbbam/virtual/WhitelistedZmwReadStitcher.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \deprecated Use WhitelistedZmwReadStitcher instead.
+using ZmwWhitelistVirtualReader = WhitelistedZmwReadStitcher;
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // ZMWWHITELISTVIRTUALREADER_H
diff --git a/meson.build b/meson.build

new file mode 100644 (file)

index 0000000..7dc4ec2
--- /dev/null
+++ b/meson.build
@@ -0,0 +1,140 @@
+project(
+  'PacBioBAM',
+  'cpp',
+  version : '1.0.6',
+  default_options : [
+    'buildtype=release',
+    'warning_level=3',
+    'cpp_std=c++14',
+    'b_ndebug=if-release'],
+  license : 'BSD-3',
+  meson_version : '>= 0.46.0')
+
+############
+# CXXFLAGS #
+############
+
+pbbam_warning_flags = []
+cpp = meson.get_compiler('cpp')
+foreach cflag: [
+  '-Wduplicated-cond',
+  '-Wduplicated-branches',
+  '-Wlogical-op',
+  '-Wrestrict',
+  '-Wnull-dereference',
+  '-Wuseless-cast',
+  '-Wdouble-promotion',
+  '-Wshadow',
+  '-Wformat=1']
+    if cpp.has_argument(cflag)
+      pbbam_warning_flags += cflag
+    endif
+endforeach
+
+pbbam_macros = []
+if get_option('permissive-cigar')
+  pbbam_macros += ['-DPBBAM_PERMISSIVE_CIGAR']
+  warning('**********************************************')
+  warning('* You have enabled allowing "M" in BAM files *')
+  warning('*    This is an unsupported combination!     *')
+  warning('**********************************************')
+endif
+
+################
+# dependencies #
+################
+
+# threads
+pbbam_thread_dep = dependency('threads', required : true)
+
+# boost
+pbbam_boost_dep = dependency('boost', required : true)
+
+# TODO(dseifert): Add test for Winsock on Windows
+# Winsock for htslib on Windows
+#if(WIN32)
+#    set(SOCKET_LIBRARIES "ws2_32")
+#endif()
+
+# zlib
+pbbam_zlib_dep = dependency('zlib', required : true, fallback : ['zlib', 'zlib_dep'])
+
+# htslib
+pbbam_htslib_dep = dependency('htslib', required : true, version : '>=1.4', fallback : ['htslib', 'htslib_dep'])
+
+# pbcopper
+pbbam_pbcopper_dep = dependency('pbcopper', required : true, fallback : ['pbcopper', 'pbcopper_dep'])
+
+###########
+# headers #
+###########
+
+subdir('include')
+
+#####################
+# sources + library #
+#####################
+
+subdir('src')
+
+#########
+# tests #
+#########
+
+if not meson.is_subproject()
+  if get_option('build-tools') or get_option('tests')
+    pbbam_python = find_program('python')
+
+    if get_option('tests')
+      pbbam_clang_formatter = find_program('tools/check-formatting')
+      subdir('tests')
+    endif
+  endif
+endif
+
+#########
+# tools #
+#########
+
+if not meson.is_subproject()
+  if get_option('build-tools') or get_option('tests')
+    subdir('tools')
+  endif
+endif
+
+#################
+# documentation #
+#################
+
+if get_option('build-docs')
+  subdir('docs')
+endif
+
+###################
+# dependency info #
+###################
+
+if not meson.is_subproject()
+  # need to add pbcopper into 'Requires:' field,
+  # but Meson currently only allows this if it's a
+  # 'pkgconfig-dependency object' and not a subproject
+  pbbam_requires = []
+  if pbbam_pbcopper_dep.type_name() == 'pkgconfig'
+    pbbam_requires = [pbbam_pbcopper_dep]
+  endif 
+
+  import('pkgconfig').generate(
+    pbbam_lib,
+    version : meson.project_version(),
+    name : 'pbbam',
+    requires : pbbam_requires,
+    filebase : 'pbbam',
+    description : 'Library for accessing PacBio-compatible BAM files')
+endif
+
+pbbam_dep = declare_dependency(
+  include_directories : pbbam_include_directories,
+  link_with : pbbam_lib,
+  dependencies : [pbbam_htslib_dep, pbbam_pbcopper_dep],
+  version : meson.project_version(),
+  compile_args : pbbam_macros)
diff --git a/meson_options.txt b/meson_options.txt

new file mode 100644 (file)

index 0000000..56dbe36
--- /dev/null
+++ b/meson_options.txt
@@ -0,0 +1,24 @@
+option('build-tools',
+    type : 'boolean',
+    value : true,
+    description : 'Build PacBioBAM command line utilities (e.g. pbindex)')
+
+option('build-docs',
+    type : 'boolean',
+    value : false,
+    description : 'Build PacBioBAM\'s API documentation')
+
+option('auto-validate',
+    type : 'boolean',
+    value : false,
+    description : 'Build PacBioBAM with BAM validation')
+
+option('tests',
+    type : 'boolean',
+    value : true,
+    description : 'Enable dependencies required for testing')
+
+option('permissive-cigar',
+    type : 'boolean',
+    value : false,
+    description : 'Allows loading BAM records which contain "M" operations in CIGAR strings.')
diff --git a/scripts/ci/artifact.sh b/scripts/ci/artifact.sh

new file mode 100755 (executable)

index 0000000..7a2268c
--- /dev/null
+++ b/scripts/ci/artifact.sh
@@ -0,0 +1,65 @@
+#!/usr/bin/env bash
+set -vex
+
+############
+# ARTIFACT #
+############
+
+if [[ ${_create_artifact} != true ]]; then
+  echo "Not creating artifact (branch: ${bamboo_planRepository_branchName}), returning."
+  return 0
+fi
+
+# *never* create artifacts with ASAN enabled
+meson configure -Dprefix=/ -Db_sanitize=none "${CURRENT_BUILD_DIR:-build}"
+
+NEXUS_VERSION="$(${CURRENT_BUILD_DIR:-build}/tools/pbindex --version | grep -o -E '[[:digit:]]+(\.[[:digit:]]+)*')".${BUILD_NUMBER}
+case "${bamboo_planRepository_branchName}" in
+  develop)
+    VERSION="$(${CURRENT_BUILD_DIR:-build}/tools/pbindex --version | grep -o -E '[[:digit:]]+(\.[[:digit:]]+)*')".SNAPSHOT${BUILD_NUMBER}
+    NEXUS_REPO=maven-snapshots
+    ;;
+  master)
+    VERSION="${NEXUS_VERSION}"
+    NEXUS_REPO=maven-releases
+    ;;
+  *)
+    echo "You can only create artifacts from 'develop' or 'master' branches"
+    exit 1
+    ;;
+esac
+
+DESTDIR="${PWD}/staging" ninja -C "${CURRENT_BUILD_DIR:-build}" -v install
+
+# merge pbcopper and pbbam for PA
+pushd "${PWD}/staging/lib"
+  # GNU ld MRI script trick
+  # https://stackoverflow.com/a/23621751
+  echo "create libnew.a" >libnew.mri
+  for i in libpb*.a; do
+    echo "addlib ${i}" >>libnew.mri
+  done
+  echo save >>libnew.mri
+  echo end >>libnew.mri
+  ar -M <libnew.mri
+
+  rm libpb*.a libnew.mri
+  mv libnew.a libpbbam.a
+
+  # remove pkg-config because it confuses PA's build system
+  rm -rf pkgconfig
+popd
+
+if [[ ${_artifact_versionprepend:-false} == true ]]; then
+  ( cd staging && tar zcf ../pbbam-${VERSION}-x86_64.tgz . --transform "s,^\./,pbbam-${VERSION}/," )
+else
+  ( cd staging && tar zcf ../pbbam-${VERSION}-x86_64.tgz . )
+fi
+
+md5sum  pbbam-${VERSION}-x86_64.tgz | awk -e '{print $1}' >| pbbam-${VERSION}-x86_64.tgz.md5
+sha1sum pbbam-${VERSION}-x86_64.tgz | awk -e '{print $1}' >| pbbam-${VERSION}-x86_64.tgz.sha1
+
+NEXUS_URL=http://ossnexus.pacificbiosciences.com/repository/${NEXUS_REPO}/${NEXUS_PROJECT:-pacbio/sat/pbbam/pbbam}/${NEXUS_VERSION:-gcc-6.4.0}${NEXUS_TC}
+curl -vn --upload-file pbbam-${VERSION}-x86_64.tgz      ${NEXUS_URL}/pbbam-${VERSION}-x86_64.tgz
+curl -vn --upload-file pbbam-${VERSION}-x86_64.tgz.md5  ${NEXUS_URL}/pbbam-${VERSION}-x86_64.tgz.md5
+curl -vn --upload-file pbbam-${VERSION}-x86_64.tgz.sha1 ${NEXUS_URL}/pbbam-${VERSION}-x86_64.tgz.sha1
diff --git a/scripts/ci/build.sh b/scripts/ci/build.sh

new file mode 100755 (executable)

index 0000000..b1d7459
--- /dev/null
+++ b/scripts/ci/build.sh
@@ -0,0 +1,66 @@
+#!/usr/bin/env bash
+set -vex
+
+#########
+# BUILD #
+#########
+
+# on PA, need to first build pbcopper+htslib
+if [[ ${GCC_VERSION} == PA ]]; then
+  pushd _deps/pbcopper
+    meson \
+      --default-library static \
+      --libdir lib \
+      --wrap-mode nofallback \
+      --prefix "${bamboo_build_working_directory}/staging" \
+      -Dtests=false \
+      build .
+    ninja -C build -v install
+  popd
+
+  wget https://github.com/samtools/htslib/releases/download/1.9/htslib-1.9.tar.bz2
+  tar -xjf htslib-1.9.tar.bz2
+  pushd htslib-1.9
+    CFLAGS="-O3" ./configure \
+      --prefix="${bamboo_build_working_directory}/staging" \
+      --libdir="${bamboo_build_working_directory}/staging/lib" \
+      --disable-bz2 \
+      --disable-gcs \
+      --disable-libcurl \
+      --disable-lzma \
+      --disable-plugins \
+      --disable-s3
+
+    make -j install
+
+    # clean out unneeded cruft and shared libs,
+    # as -lhts will prefer shared libraries
+    rm -rf ${bamboo_build_working_directory}/staging/{bin,share}
+    rm -f ${bamboo_build_working_directory}/staging/lib/*.so*
+
+    # set pkg-config variables
+    export PKG_CONFIG_LIBDIR+=":${bamboo_build_working_directory}/staging/lib/pkgconfig"
+
+    # convert `-I` to `-isystem` in pkg-config file in order not to trigger -Werror
+    sed -e 's/-I/-isystem/g' -i "${bamboo_build_working_directory}/staging/lib/pkgconfig/htslib.pc"
+  popd
+fi
+
+# configure
+# '--wrap-mode nofallback' prevents meson from downloading
+# stuff from the internet or using subprojects.
+meson \
+  --werror \
+  --buildtype "${BUILDTYPE:-release}" \
+  --default-library "${LIBRARYTYPE:-shared}" \
+  --libdir lib \
+  --unity "${ENABLED_UNITY_BUILD:-off}" \
+  --wrap-mode nofallback \
+  --prefix "${PREFIX_ARG:-/}" \
+  -Db_coverage="${ENABLED_COVERAGE:-false}" \
+  -Db_sanitize="${ENABLED_SANITIZERS:-none}" \
+  -Dtests="${ENABLED_TESTS:-false}" \
+  "${CURRENT_BUILD_DIR:-build}" .
+
+# build
+ninja -C "${CURRENT_BUILD_DIR:-build}" -v
diff --git a/scripts/ci/install.sh b/scripts/ci/install.sh

new file mode 100755 (executable)

index 0000000..7bf937b
--- /dev/null
+++ b/scripts/ci/install.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+set -vex
+
+###########
+# INSTALL #
+###########
+
+if [[ ${_install_image} != true ]]; then
+  echo "Not installing image (branch: ${bamboo_planRepository_branchName}), returning."
+  return 0
+fi
+
+if [[ ${PREFIX_ARG} ]]; then
+  ## Cleaning out old installation from /mnt/software
+  rm -rf "${PREFIX_ARG}"/*
+fi
+
+# *never* install with ASAN enabled
+meson configure -Db_sanitize=none "${CURRENT_BUILD_DIR:-build}"
+
+DESTDIR="${DESTDIR:-/}" ninja -C "${CURRENT_BUILD_DIR:-build}" -v install
diff --git a/scripts/ci/setup.sh b/scripts/ci/setup.sh

new file mode 100755 (executable)

index 0000000..a01bdcc
--- /dev/null
+++ b/scripts/ci/setup.sh
@@ -0,0 +1,68 @@
+#!/usr/bin/env bash
+set -vex
+
+export ENABLED_TESTS="true"
+
+case "${bamboo_planRepository_branchName}" in
+  master)
+    _pbcopper_module="pbcopper/master"
+    ;;
+  *)
+    _pbcopper_module="pbcopper/develop"
+    ;;
+esac
+
+case "${GCC_VERSION}" in
+  next)
+    module load gcc/8.1.0
+    module load gtest
+    module load ${_pbcopper_module}
+    ;;
+
+  PA)
+    # have to build htslib for PA
+    module unload htslib
+    module load zlib
+    module load gtest/gcc48
+
+    # load SCL GCC
+    source /opt/rh/devtoolset-6/enable
+
+    export NEXUS_PROJECT=pacbio/seq/pa/pbbam
+    export NEXUS_TC=""
+    export _artifact_versionprepend="true"
+    ;;
+
+  *)
+    module load gcc
+    module load gtest
+    module load ${_pbcopper_module}
+    ;;
+esac
+
+module load ccache
+
+export CC="ccache gcc"
+export CXX="ccache g++"
+export CCACHE_BASEDIR="${PWD}"
+
+if [[ -z ${bamboo_planRepository_branchName+x} ]]; then
+  : #pass
+elif [[ ! -d /pbi/flash/bamboo/ccachedir ]]; then
+  echo "[WARNING] /pbi/flash/bamboo/ccachedir is missing"
+elif [[ $bamboo_planRepository_branchName == develop ]]; then
+  export CCACHE_DIR=/pbi/flash/bamboo/ccachedir/${bamboo_shortPlanKey}.${bamboo_shortJobKey}.develop
+  export CCACHE_TEMPDIR=/scratch/bamboo.ccache_tempdir
+elif [[ $bamboo_planRepository_branchName == master ]]; then
+  export CCACHE_DIR=/pbi/flash/bamboo/ccachedir/${bamboo_shortPlanKey}.${bamboo_shortJobKey}.master
+  export CCACHE_TEMPDIR=/scratch/bamboo.ccache_tempdir
+elif [[ $USER == bamboo ]]; then
+  _shortPlanKey=$(echo ${bamboo_shortPlanKey}|sed -e 's/[0-9]*$//')
+  export CCACHE_DIR=/pbi/flash/bamboo/ccachedir/${bamboo_shortPlanKey}.${bamboo_shortJobKey}
+  if [[ -d /pbi/flash/bamboo/ccachedir/${_shortPlanKey}.${bamboo_shortJobKey}.develop ]]; then
+    ( cd /pbi/flash/bamboo/ccachedir/
+      cp -a ${_shortPlanKey}.${bamboo_shortJobKey}.develop $CCACHE_DIR
+    )
+  fi
+  export CCACHE_TEMPDIR=/scratch/bamboo.ccache_tempdir
+fi
diff --git a/scripts/ci/test.sh b/scripts/ci/test.sh

new file mode 100755 (executable)

index 0000000..3abbd6c
--- /dev/null
+++ b/scripts/ci/test.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+set -vex
+
+########
+# TEST #
+########
+
+ninja -C "${CURRENT_BUILD_DIR:-build}" -v test
+
+############
+# COVERAGE #
+############
+
+if [[ ${ENABLED_COVERAGE:-false} == true ]]; then
+  module load gcov
+
+  pushd "${CURRENT_BUILD_DIR:-build}"
+  find . -type f -iname '*.o' | xargs gcov -acbrfu {} \; >/dev/null && \
+    mkdir coverage && pushd coverage && mv ../*.gcov . && \
+    sed -i -e 's@Source:@Source:../@' *.gcov && \
+    sed -i -e 's@Graph:@Graph:../@' *.gcov && \
+    sed -i -e 's@Data:@Data:../@' *.gcov && \
+    rm pugixml* && popd
+  popd
+fi
diff --git a/src/AlignmentPrinter.cpp b/src/AlignmentPrinter.cpp

new file mode 100644 (file)

index 0000000..b55736c
--- /dev/null
+++ b/src/AlignmentPrinter.cpp
@@ -0,0 +1,131 @@
+// File Description
+/// \file AlignmentPrinter.cpp
+/// \brief Implements the AlignmentPrinter class.
+//
+// Author: Armin Töpfer
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/AlignmentPrinter.h"
+
+#include <cassert>
+#include <cmath>
+#include <cstddef>
+#include <iomanip>
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#include <type_traits>
+
+namespace PacBio {
+namespace BAM {
+
+// Compile-time checks on AlignmentPrinter's special member functions:
+// the class must not be copy-constructible or copy-assignable ...
+static_assert(!std::is_copy_constructible<AlignmentPrinter>::value,
+              "AlignmentPrinter(const AlignmentPrinter&) is not = delete");
+static_assert(!std::is_copy_assignable<AlignmentPrinter>::value,
+              "AlignmentPrinter& operator=(const AlignmentPrinter&) is not = delete");
+
+// ... and its move operations must be declared non-throwing.
+static_assert(std::is_nothrow_move_constructible<AlignmentPrinter>::value,
+              "AlignmentPrinter(AlignmentPrinter&&) is not = noexcept");
+static_assert(std::is_nothrow_move_assignable<AlignmentPrinter>::value,
+              "AlignmentPrinter& operator=(AlignmentPrinter&&) is not = noexcept");
+
+// Creates a printer that fetches reference bases through its own
+// heap-allocated copy of \p ifr (the argument itself is not retained).
+AlignmentPrinter::AlignmentPrinter(const IndexedFastaReader& ifr)
+    : ifr_{std::make_unique<IndexedFastaReader>(ifr)}
+{
+}
+
+std::string AlignmentPrinter::Print(const BamRecord& record, const Orientation orientation)
+{
+    const std::string seq{record.Sequence(orientation, true, true)};
+    const std::string ref{ifr_->ReferenceSubsequence(record, orientation, true, true)};
+
+    if (seq.size() != ref.size()) {
+        std::ostringstream s;
+        s << "AlignmentPrinter: sequence and reference lengths are not equal:\n"
+          << "  seq: " << seq.size() << '\n'
+          << "  ref: " << ref.size();
+        throw std::runtime_error{s.str()};
+    }
+
+    int seqLength = 0;
+    float matches = 0;
+    std::string pretty;
+    Position refCoord = record.ReferenceStart();
+    Position seqCoord = BAM::IsCcsOrTranscript(record.Type()) ? 0 : record.QueryStart();
+
+    for (size_t i = 0; i < seq.size();) {
+        auto refCoordStr = std::to_string(refCoord);
+        auto seqCoordStr = std::to_string(seqCoord);
+
+        size_t maxCoordLength = std::max(refCoordStr.size(), seqCoordStr.size());
+        while (refCoordStr.size() < maxCoordLength)
+            refCoordStr = " " + refCoordStr;
+        while (seqCoordStr.size() < maxCoordLength)
+            seqCoordStr = " " + seqCoordStr;
+
+        std::string seqWrap{seqCoordStr + " : "};
+        std::string refWrap{refCoordStr + " : "};
+        std::string prettyWrap(maxCoordLength + 3, ' ');
+        prettyWrap.reserve(seq.size());
+
+        // clang-format off
+        for (int j = 0; i < seq.size() && j < 40; ++i, ++j) {
+            refWrap += ref[i];
+
+            if (seq[i] == ref[i]) {
+                ++matches;
+                if (refCoord == 0 || refCoord % 10)
+                    prettyWrap += '|';
+                else {
+                    prettyWrap += "\033" "[1m" "\x1b" "[31m";
+                    prettyWrap += '|';
+                    prettyWrap += "\033" "[0m" "\x1b" "[39;49m";
+                }
+                seqWrap += seq[i];
+            } else if (seq[i] == '-' || ref[i] == '-') {
+                prettyWrap += ' ';
+                seqWrap += seq[i];
+            } else {
+                prettyWrap += '.';
+                seqWrap += "\033" "[1m" "\x1b" "[31m";
+                seqWrap += seq[i];
+                seqWrap += "\033" "[0m" "\x1b" "[39;49m";
+            }
+            if (seq[i] != '-') {
+                ++seqLength;
+                ++seqCoord;
+            }
+            if (ref[i] != '-') {
+                ++refCoord;
+            }
+        }
+        // clang-format on
+
+        refCoordStr = std::to_string(refCoord);
+        seqCoordStr = std::to_string(seqCoord);
+
+        maxCoordLength = std::max(refCoordStr.size(), seqCoordStr.size());
+        while (refCoordStr.size() < maxCoordLength)
+            refCoordStr = " " + refCoordStr;
+        while (seqCoordStr.size() < maxCoordLength)
+            seqCoordStr = " " + seqCoordStr;
+
+        seqWrap += " : " + seqCoordStr;
+        refWrap += " : " + refCoordStr;
+
+        pretty += refWrap + '\n' + prettyWrap + '\n' + seqWrap + "\n\n";
+    }
+    const float similarity = matches / seq.size();
+
+    std::stringstream output;
+    output << "Read        : " << record.FullName() << '\n'
+           << "Reference   : " << record.ReferenceName() << "\n\n"
+           << "Read-length : " << seqLength << '\n'
+           << "Concordance : " << std::setprecision(3) << (similarity) << "\n\n"
+           << pretty;
+    return output.str();
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/Autovalidate.h b/src/Autovalidate.h

new file mode 100644 (file)

index 0000000..098c691
--- /dev/null
+++ b/src/Autovalidate.h
@@ -0,0 +1,24 @@
+// File Description
+/// \file Autovalidate.h
+/// \brief Sets the default macro for the autovalidation mode.
+//
+// Author: Derek Barnett
+
+#ifndef AUTOVALIDATE_H
+#define AUTOVALIDATE_H
+
+#include "pbbam/Config.h"
+
+// \brief Auto-validation
+//
+// To validate BAM components (header, records, etc.) you can either use the
+// Validator API provided, or enable auto-validation. To compile pbbam for
+// auto-validation, add the -Dauto-validate=true option to your Meson
+// invocation.
+//
+// If PBBAM_AUTOVALIDATE is not already defined, it defaults to 0 here.
+#ifndef PBBAM_AUTOVALIDATE
+#define PBBAM_AUTOVALIDATE 0
+#endif
+
+#endif  // AUTOVALIDATE_H
diff --git a/src/BaiIndexCache.cpp b/src/BaiIndexCache.cpp

new file mode 100644 (file)

index 0000000..8ea9bc8
--- /dev/null
+++ b/src/BaiIndexCache.cpp
@@ -0,0 +1,73 @@
+// File Description
+/// \file BaiIndexCache.cpp
+/// \brief Implements the BaiIndexCache class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/BaiIndexCache.h"
+
+#include <stdexcept>
+
+#include <htslib/sam.h>
+
+#include "MemoryUtils.h"
+#include "pbbam/BamFile.h"
+#include "pbbam/DataSet.h"
+
+namespace PacBio {
+namespace BAM {
+
+// Private data of BaiIndexCacheData: owns the loaded htslib index handle,
+// which is released through HtslibIndexDeleter.
+struct BaiIndexCacheData::BaiIndexCacheDataPrivate
+{
+    using IndexType = std::unique_ptr<hts_idx_t, HtslibIndexDeleter>;
+    IndexType htsIndex_;
+};
+
+// Convenience overload: delegates to the filename constructor using the
+// BAM file's filename.
+BaiIndexCacheData::BaiIndexCacheData(const BamFile& bamFile) : BaiIndexCacheData(bamFile.Filename())
+{
+}
+
+// Loads the *.bai index that belongs to \p bamFilename through htslib.
+// Throws std::runtime_error when bam_index_load() does not return an index.
+BaiIndexCacheData::BaiIndexCacheData(const std::string& bamFilename)
+    : d_{std::make_unique<BaiIndexCacheData::BaiIndexCacheDataPrivate>()}
+{
+    hts_idx_t* const loadedIndex = bam_index_load(bamFilename.c_str());
+    if (loadedIndex == nullptr) {
+        throw std::runtime_error{"BaiIndexCache: could not load *.bai index data for file: " +
+                                 bamFilename};
+    }
+
+    // hand ownership of the raw handle to the private data
+    d_->htsIndex_.reset(loadedIndex);
+}
+
+// Defaulted here, in the translation unit that defines BaiIndexCacheDataPrivate.
+BaiIndexCacheData::~BaiIndexCacheData() = default;
+
+// Asks htslib (bam_itr_queryi) for an iterator over [start, stop) on reference
+// \p refId, using the cached index. The raw pointer is returned as-is, so the
+// caller is responsible for it (BaiIndexedBamReader stores it in a unique_ptr
+// with HtslibIteratorDeleter).
+hts_itr_t* BaiIndexCacheData::IteratorForInterval(const int32_t refId, const Position start,
+                                                  const Position stop) const
+{
+    return bam_itr_queryi(d_->htsIndex_.get(), refId, start, stop);
+}
+
+using BaiIndexCache = std::shared_ptr<std::vector<std::shared_ptr<BaiIndexCacheData>>>;
+
+// Builds an index cache for all BAM files of \p dataset
+// (delegates to the std::vector<BamFile> overload).
+BaiIndexCache MakeBaiIndexCache(const DataSet& dataset)
+{
+    return MakeBaiIndexCache(dataset.BamFiles());
+}
+
+// Builds an index cache holding one entry per BAM file, in input order.
+// Loading any single index may throw (see BaiIndexCacheData's constructors).
+BaiIndexCache MakeBaiIndexCache(const std::vector<BamFile>& bamFiles)
+{
+    auto cache = std::make_shared<std::vector<std::shared_ptr<BaiIndexCacheData>>>();
+    for (const auto& bamFile : bamFiles) {
+        cache->push_back(std::make_shared<BaiIndexCacheData>(bamFile));
+    }
+    return cache;
+}
+
+// Builds a single-entry index cache for \p bamFile
+// (delegates to the std::vector<BamFile> overload).
+BaiIndexCache MakeBaiIndexCache(const BamFile& bamFile)
+{
+    return MakeBaiIndexCache(std::vector<BamFile>{bamFile});
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/BaiIndexedBamReader.cpp b/src/BaiIndexedBamReader.cpp

new file mode 100644 (file)

index 0000000..fd62b1e
--- /dev/null
+++ b/src/BaiIndexedBamReader.cpp
@@ -0,0 +1,141 @@
+// File Description
+/// \file BaiIndexedBamReader.cpp
+/// \brief Implements the BaiIndexedBamReader class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/BaiIndexedBamReader.h"
+
+#include <cassert>
+#include <cstddef>
+#include <sstream>
+#include <stdexcept>
+
+#include "MemoryUtils.h"
+#include "pbbam/BaiIndexCache.h"
+
+namespace PacBio {
+namespace BAM {
+
+class BaiIndexedBamReader::BaiIndexedBamReaderPrivate
+{
+public:
+    BaiIndexedBamReaderPrivate(BamFile file, const std::shared_ptr<BaiIndexCacheData>& index)
+        : file_{std::move(file)}, index_{index}
+    {
+        if (!index_) index_ = std::make_shared<BaiIndexCacheData>(file_);
+        assert(index_);  // should throw in cache load if failed
+    }
+
+    BaiIndexedBamReaderPrivate(BamFile file, const GenomicInterval& interval,
+                               const std::shared_ptr<BaiIndexCacheData>& indexCache)
+        : BaiIndexedBamReaderPrivate{std::move(file), indexCache}
+    {
+        Interval(file_.Header(), interval);
+    }
+
+    void Interval(const BamHeader& header, const GenomicInterval& interval)
+    {
+        htsIterator_.reset();
+
+        if (header.HasSequence(interval.Name())) {
+            auto id = header.SequenceId(interval.Name());
+            if (id >= 0 && static_cast<size_t>(id) < header.NumSequences()) {
+                htsIterator_.reset(
+                    index_->IteratorForInterval(id, interval.Start(), interval.Stop()));
+            }
+        }
+
+        if (!htsIterator_) {
+            std::ostringstream s;
+            s << "BaiIndexedBamReader: could not create iterator for requested region: "
+              << interval.Name() << " [" << interval.Start() << ", " << interval.Stop() << ')';
+            throw std::runtime_error{s.str()};
+        }
+    }
+
+    int ReadRawData(BGZF* bgzf, bam1_t* b)
+    {
+        assert(htsIterator_.get());
+        return hts_itr_next(bgzf, htsIterator_.get(), b, nullptr);
+    }
+
+    BamFile file_;
+    std::shared_ptr<BaiIndexCacheData> index_;
+    GenomicInterval interval_;
+    std::unique_ptr<hts_itr_t, HtslibIteratorDeleter> htsIterator_;
+};
+
+// Opens \p filename without a pre-loaded index (nullptr); delegates to the
+// (BamFile, index) constructor.
+BaiIndexedBamReader::BaiIndexedBamReader(std::string filename)
+    : BaiIndexedBamReader{BamFile{std::move(filename)}, nullptr}
+{
+}
+
+// Opens \p filename using the supplied (possibly shared) index; delegates to
+// the (BamFile, index) constructor.
+BaiIndexedBamReader::BaiIndexedBamReader(std::string filename,
+                                         const std::shared_ptr<BaiIndexCacheData>& index)
+    : BaiIndexedBamReader{BamFile{std::move(filename)}, index}
+{
+}
+
+// Creates a reader on \p bamFile without a pre-loaded index (nullptr), so the
+// private data loads the index itself. No interval is selected yet.
+BaiIndexedBamReader::BaiIndexedBamReader(BamFile bamFile)
+    : BamReader{bamFile.Filename()}
+    , d_{std::make_unique<BaiIndexedBamReaderPrivate>(std::move(bamFile), nullptr)}
+{
+}
+
+// Creates a reader on \p bamFile using the supplied index (loaded on the spot
+// by the private data if \p index is null). No interval is selected yet.
+BaiIndexedBamReader::BaiIndexedBamReader(BamFile bamFile,
+                                         const std::shared_ptr<BaiIndexCacheData>& index)
+    : BamReader{bamFile.Filename()}
+    , d_{std::make_unique<BaiIndexedBamReaderPrivate>(std::move(bamFile), index)}
+{
+}
+
+// Opens \p filename restricted to \p interval, without a pre-loaded index;
+// delegates to the (interval, BamFile, index) constructor.
+BaiIndexedBamReader::BaiIndexedBamReader(const GenomicInterval& interval, std::string filename)
+    : BaiIndexedBamReader{interval, BamFile{std::move(filename)}, nullptr}
+{
+}
+
+// Opens \p filename restricted to \p interval, using the supplied index;
+// delegates to the (interval, BamFile, index) constructor.
+BaiIndexedBamReader::BaiIndexedBamReader(const GenomicInterval& interval, std::string filename,
+                                         const std::shared_ptr<BaiIndexCacheData>& index)
+    : BaiIndexedBamReader{interval, BamFile{std::move(filename)}, index}
+{
+}
+
+// Creates a reader on \p bamFile restricted to \p interval, without a
+// pre-loaded index. Throws if no iterator can be created for the interval.
+BaiIndexedBamReader::BaiIndexedBamReader(const GenomicInterval& interval, BamFile bamFile)
+    : BamReader{bamFile.Filename()}
+    , d_{std::make_unique<BaiIndexedBamReaderPrivate>(std::move(bamFile), interval, nullptr)}
+{
+}
+
+// Creates a reader on \p bamFile restricted to \p interval, using the supplied
+// index. Throws if no iterator can be created for the interval.
+BaiIndexedBamReader::BaiIndexedBamReader(const GenomicInterval& interval, BamFile bamFile,
+                                         const std::shared_ptr<BaiIndexCacheData>& index)
+    : BamReader{bamFile.Filename()}
+    , d_{std::make_unique<BaiIndexedBamReaderPrivate>(std::move(bamFile), interval, index)}
+{
+}
+
+// Returns the BamFile this reader was created on.
+const BamFile& BaiIndexedBamReader::File() const { return d_->file_; }
+
+// Returns the interval object held by the private data.
+const GenomicInterval& BaiIndexedBamReader::Interval() const
+{
+    assert(d_);
+    return d_->interval_;
+}
+
+// Reads the next raw record through the private data's interval iterator and
+// returns its status code.
+int BaiIndexedBamReader::ReadRawData(BGZF* bgzf, bam1_t* b)
+{
+    assert(d_);
+    return d_->ReadRawData(bgzf, b);
+}
+
+// Selects a new interval to iterate over, resolved against this reader's
+// header. Throws std::runtime_error if no iterator can be created for it.
+// Returns *this to allow call chaining.
+BaiIndexedBamReader& BaiIndexedBamReader::Interval(const GenomicInterval& interval)
+{
+    assert(d_);
+    d_->Interval(Header(), interval);
+    return *this;
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/BamFile.cpp b/src/BamFile.cpp

new file mode 100644 (file)

index 0000000..a6bac47
--- /dev/null
+++ b/src/BamFile.cpp
@@ -0,0 +1,194 @@
+// File Description
+/// \file BamFile.cpp
+/// \brief Implements the BamFile class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/BamFile.h"
+
+#include <sys/stat.h>
+#include <cassert>
+#include <cstdint>
+#include <memory>
+#include <sstream>
+#include <string>
+
+#include <htslib/sam.h>
+
+#include "Autovalidate.h"
+#include "FileUtils.h"
+#include "MemoryUtils.h"
+#include "pbbam/PbiFile.h"
+
+namespace PacBio {
+namespace BAM {
+
+class BamFile::BamFilePrivate
+{
+public:
+    explicit BamFilePrivate(std::string fn) : filename_{std::move(fn)}, firstAlignmentOffset_{-1}
+    {
+        // ensure we've updated htslib verbosity with requested verbosity here
+        hts_verbose = (PacBio::BAM::HtslibVerbosity == -1 ? 0 : PacBio::BAM::HtslibVerbosity);
+
+        // attempt open
+        auto f = RawOpen();
+
+#if !defined(PBBAM_NO_CHECK_EOF) || PBBAM_AUTOVALIDATE
+        // sanity check on file
+        const auto eofCheck = bgzf_check_EOF(f->fp.bgzf);
+        if (eofCheck <= 0) {
+            // 1:  EOF present & correct
+            // 2:  not seekable (e.g. reading from stdin)
+            // 0:  EOF absent
+            // -1: some other error
+            std::ostringstream e;
+            if (eofCheck == 0)
+                e << "BamFile: missing EOF block in " << fn;
+            else
+                e << "BamFile: unknown error encountered while checking EOF in " << fn
+                  << " (status code = " << eofCheck << ')';
+            throw std::runtime_error{e.str()};
+        }
+#endif
+
+        // attempt fetch header
+        std::unique_ptr<bam_hdr_t, HtslibHeaderDeleter> hdr(sam_hdr_read(f.get()));
+        header_ = BamHeaderMemory::FromRawData(hdr.get());
+
+        // cache first alignment offset
+        firstAlignmentOffset_ = bgzf_tell(f->fp.bgzf);
+    }
+
+    std::unique_ptr<BamFilePrivate> DeepCopy()
+    {
+        return std::make_unique<BamFilePrivate>(filename_);
+    }
+
+    bool HasEOF() const
+    {
+        // streamed input is unknown, since it's not random-accessible
+        if (filename_ == "-") return false;
+
+        // attempt open
+        auto f = RawOpen();
+        return RawEOFCheck(f) == 1;
+    }
+
+    int RawEOFCheck(const std::unique_ptr<samFile, HtslibFileDeleter>& f) const
+    {
+        assert(f);
+        assert(f->fp.bgzf);
+        return bgzf_check_EOF(f->fp.bgzf);
+    }
+
+    std::unique_ptr<samFile, HtslibFileDeleter> RawOpen() const
+    {
+        std::unique_ptr<samFile, HtslibFileDeleter> f(sam_open(filename_.c_str(), "rb"));
+        if (!f || !f->fp.bgzf) throw std::runtime_error{"BamFile: could not open: " + filename_};
+        if (f->format.format != bam)
+            throw std::runtime_error{"BamFile: expected BAM, encountered un supported format: " +
+                                     filename_};
+        return f;
+    }
+
+    std::string filename_;
+    BamHeader header_;
+    int64_t firstAlignmentOffset_;
+};
+
+// Opens and inspects \p filename (see BamFilePrivate's constructor for the
+// checks performed and the exceptions thrown).
+BamFile::BamFile(std::string filename) : d_{std::make_unique<BamFilePrivate>(std::move(filename))}
+{
+}
+
+// Copy: builds fresh private data via DeepCopy(), which re-opens the file by name.
+BamFile::BamFile(const BamFile& other) : d_{other.d_->DeepCopy()} {}
+
+// Move: defaulted, takes over the other object's private data.
+BamFile::BamFile(BamFile&&) noexcept = default;
+
+// Copy assignment: replaces the private data with a deep copy of other's
+// (which re-opens the file by name); self-assignment is a no-op.
+BamFile& BamFile::operator=(const BamFile& other)
+{
+    if (this != &other) {
+        d_ = other.d_->DeepCopy();
+    }
+    return *this;
+}
+
+// Move assignment: defaulted, takes over the other object's private data.
+BamFile& BamFile::operator=(BamFile&&) noexcept = default;
+
+// Defaulted here, in the translation unit that defines BamFilePrivate.
+BamFile::~BamFile() = default;
+
+// Creates the PacBio index for this file via PbiFile::CreateFrom().
+void BamFile::CreatePacBioIndex() const { PbiFile::CreateFrom(*this); }
+
+// Builds the standard (*.bai) index for this file through htslib.
+// Throws std::runtime_error, including htslib's status code, on failure.
+void BamFile::CreateStandardIndex() const
+{
+    const auto status = bam_index_build(d_->filename_.c_str(), 0);
+    if (status == 0) return;  // success
+
+    std::ostringstream msg;
+    msg << "BamFile: could not create *.bai index for file: " << d_->filename_
+        << " (status code = " << status << ')';
+    throw std::runtime_error{msg.str()};
+}
+
+// Creates the *.pbi index only if its file does not exist yet
+// (an existing index is not checked for being up to date).
+void BamFile::EnsurePacBioIndexExists() const
+{
+    if (!PacBioIndexExists()) CreatePacBioIndex();
+}
+
+// Creates the *.bai index only if its file does not exist yet
+// (an existing index is not checked for being up to date).
+void BamFile::EnsureStandardIndexExists() const
+{
+    if (!StandardIndexExists()) CreateStandardIndex();
+}
+
+// Returns the filename this object was created with.
+const std::string& BamFile::Filename() const { return d_->filename_; }
+
+// Returns the offset (bgzf_tell, taken right after the header was read at
+// construction) where the alignment records begin.
+int64_t BamFile::FirstAlignmentOffset() const { return d_->firstAlignmentOffset_; }
+
+// Re-opens the file and reports whether a correct EOF block is present
+// (always false for streamed input "-").
+bool BamFile::HasEOF() const { return d_->HasEOF(); }
+
+// Returns true if the header contains a sequence named \p name.
+bool BamFile::HasReference(const std::string& name) const { return d_->header_.HasSequence(name); }
+
+// Returns the header parsed when the file was opened.
+const BamHeader& BamFile::Header() const { return d_->header_; }
+
+// A file counts as PacBio BAM when its header carries a non-empty PacBio BAM version.
+bool BamFile::IsPacBioBAM() const { return !d_->header_.PacBioBamVersion().empty(); }
+
+// Returns true if the file named by PacBioIndexFilename() exists.
+bool BamFile::PacBioIndexExists() const { return FileUtils::Exists(PacBioIndexFilename()); }
+
+// PacBio index filename: the BAM filename with ".pbi" appended.
+std::string BamFile::PacBioIndexFilename() const { return d_->filename_ + ".pbi"; }
+
+// Returns true if the *.pbi file was last modified at the same time as, or
+// later than, the BAM file (note the '<=': equal timestamps count as newer).
+bool BamFile::PacBioIndexIsNewer() const
+{
+    const auto bamTimestamp = FileUtils::LastModified(Filename());
+    const auto pbiTimestamp = FileUtils::LastModified(PacBioIndexFilename());
+    return bamTimestamp <= pbiTimestamp;
+}
+
+// Returns the header's sequence ID for the reference named \p name.
+int BamFile::ReferenceId(const std::string& name) const { return d_->header_.SequenceId(name); }
+
+// Returns the length of the reference named \p name
+// (resolves the name to its ID, then delegates to the ID overload).
+uint32_t BamFile::ReferenceLength(const std::string& name) const
+{
+    return ReferenceLength(ReferenceId(name));
+}
+
+// Returns the length of reference \p id. The header stores the length as
+// text, which is parsed with std::stoul (it throws on non-numeric text) and
+// then narrowed to uint32_t.
+uint32_t BamFile::ReferenceLength(const int id) const
+{
+    return std::stoul(d_->header_.SequenceLength(id));
+}
+
+// Returns the header's sequence name for reference \p id.
+std::string BamFile::ReferenceName(const int id) const { return d_->header_.SequenceName(id); }
+
+// Returns true if the file named by StandardIndexFilename() exists.
+bool BamFile::StandardIndexExists() const { return FileUtils::Exists(StandardIndexFilename()); }
+
+// Standard index filename: the BAM filename with ".bai" appended.
+std::string BamFile::StandardIndexFilename() const { return d_->filename_ + ".bai"; }
+
+// Returns true if the *.bai file was last modified at the same time as, or
+// later than, the BAM file (note the '<=': equal timestamps count as newer).
+bool BamFile::StandardIndexIsNewer() const
+{
+    const auto bamTimestamp = FileUtils::LastModified(Filename());
+    const auto baiTimestamp = FileUtils::LastModified(StandardIndexFilename());
+    return bamTimestamp <= baiTimestamp;
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/BamFileMerger.cpp b/src/BamFileMerger.cpp

new file mode 100644 (file)

index 0000000..b246619
--- /dev/null
+++ b/src/BamFileMerger.cpp
@@ -0,0 +1,260 @@
+// File Description
+/// \file BamFileMerger.cpp
+/// \brief Implements the BamFileMerger & helper classes.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/BamFileMerger.h"
+
+#include <memory>
+#include <set>
+#include <sstream>
+#include <stdexcept>
+#include <vector>
+
+#include "pbbam/BamFile.h"
+#include "pbbam/BamHeader.h"
+#include "pbbam/BamReader.h"
+#include "pbbam/BamRecord.h"
+#include "pbbam/CompositeBamReader.h"
+#include "pbbam/DataSet.h"
+#include "pbbam/IRecordWriter.h"
+#include "pbbam/IndexedBamWriter.h"
+#include "pbbam/PbiBuilder.h"
+#include "pbbam/PbiFilter.h"
+#include "pbbam/PbiIndexedBamReader.h"
+#include "pbbam/RecordType.h"
+
+namespace PacBio {
+namespace BAM {
+namespace {  // anonymous
+
+using CompositeMergeItem = internal::CompositeMergeItem;
+
+struct QNameSorter
+{
+    bool operator()(const CompositeMergeItem& lhs, const CompositeMergeItem& rhs) const
+    {
+        const BamRecord& l = lhs.record;
+        const BamRecord& r = rhs.record;
+
+        // movie name
+        std::string lMovieName, rMovieName;  // TODO(CD): memoize movienames?
+        try {
+            lMovieName = l.MovieName();
+        } catch (std::runtime_error const& err) {
+            std::ostringstream s;
+            s << "BamFileMerger: could not get movie name from file: " << lhs.reader->Filename()
+              << " (reason = " << err.what() << ')';
+            throw std::runtime_error(s.str());
+        }
+        try {
+            rMovieName = r.MovieName();
+        } catch (std::runtime_error const& err) {
+            std::ostringstream s;
+            s << "BamFileMerger: could not get movie name from file: " << rhs.reader->Filename()
+              << " (reason = " << err.what() << ')';
+            throw std::runtime_error(s.str());
+        }
+        const int cmp = lMovieName.compare(rMovieName);
+        if (cmp != 0) return cmp < 0;
+
+        // hole number
+        const auto lhsZmw = l.HoleNumber();
+        const auto rhsZmw = r.HoleNumber();
+        if (lhsZmw != rhsZmw) return lhsZmw < rhsZmw;
+
+        // shuffle CCS/transcript reads after all others
+        if (IsCcsOrTranscript(l.Type())) return false;
+        if (IsCcsOrTranscript(r.Type())) return true;
+
+        // sort on qStart, then finally qEnd
+        const auto lhsQStart = l.QueryStart();
+        const auto rhsQStart = r.QueryStart();
+        if (lhsQStart != rhsQStart) return lhsQStart < rhsQStart;
+
+        const auto lhsQEnd = l.QueryEnd();
+        const auto rhsQEnd = r.QueryEnd();
+        return lhsQEnd < rhsQEnd;
+    }
+};
+
+// Interface of the record sources used by the merger: hands out the merged
+// records one at a time.
+class ICollator
+{
+public:
+    // Stores the next record in the output parameter and returns true, or
+    // returns false when no records are left.
+    virtual bool GetNext(BamRecord&) = 0;
+    virtual ~ICollator() = default;
+
+protected:
+    // only constructible through derived classes
+    ICollator() = default;
+};
+
+// K-way merge over several readers. One pending record per reader is kept in
+// a multiset ordered by Comp; the smallest one is handed out on each call.
+template <typename Comp>
+class CollatorImpl : public ICollator
+{
+public:
+    // Takes ownership of the readers and primes the set with the first record
+    // of each; readers that yield no record at all are dropped right away.
+    CollatorImpl(std::vector<std::unique_ptr<BamReader>> readers) : ICollator()
+    {
+        for (auto&& reader : readers) {
+            auto item = CompositeMergeItem{std::move(reader)};
+            if (item.reader->GetNext(item.record)) mergeItems_.insert(std::move(item));
+        }
+    }
+
+    // Swaps the smallest pending record into \p record and refills from the
+    // reader it came from. Returns false once all readers are exhausted.
+    bool GetNext(BamRecord& record) override
+    {
+        if (mergeItems_.empty()) return false;
+
+        // Move first record into our result
+        // (set elements are const, hence the const_cast; the element is
+        // modified in place and then erased via its iterator right below)
+        auto& firstItem = const_cast<CompositeMergeItem&>(*mergeItems_.begin());
+        auto& firstRecord = firstItem.record;
+        std::swap(record, firstRecord);
+
+        // Try to read next record from current reader. If available, re-insert
+        // into the set. Otherwise, just drop it (dtor will release resource).
+        CompositeMergeItem tmp(std::move(firstItem));
+        mergeItems_.erase(mergeItems_.begin());
+        if (tmp.reader->GetNext(tmp.record)) mergeItems_.insert(std::move(tmp));
+        return true;
+    }
+
+private:
+    std::multiset<CompositeMergeItem, Comp> mergeItems_;
+};
+
+using QNameCollator = CollatorImpl<QNameSorter>;
+using AlignedCollator = CollatorImpl<PositionSorter>;
+
+std::vector<std::unique_ptr<BamReader>> MakeBamReaders(std::vector<BamFile> bamFiles,
+                                                       PbiFilter filter = PbiFilter{})
+{
+    std::vector<std::unique_ptr<BamReader>> readers;
+    for (auto& file : bamFiles) {
+        if (filter.IsEmpty())
+            readers.emplace_back(std::make_unique<BamReader>(std::move(file)));
+        else
+            readers.emplace_back(std::make_unique<PbiIndexedBamReader>(filter, std::move(file)));
+    }
+    assert(!readers.empty());
+    return readers;
+}
+
+// Wraps the readers in a collator: position-based for coordinate-sorted
+// input, QNAME-based otherwise.
+std::unique_ptr<ICollator> MakeCollator(std::vector<std::unique_ptr<BamReader>> readers,
+                                        const bool isCoordinateSorted = false)
+{
+    if (isCoordinateSorted) {
+        return std::make_unique<AlignedCollator>(std::move(readers));
+    }
+    return std::make_unique<QNameCollator>(std::move(readers));
+}
+
+std::unique_ptr<IRecordWriter> MakeBamWriter(const std::vector<std::unique_ptr<BamReader>>& readers,
+                                             const std::string& outputFilename,
+                                             const bool createPbi, const ProgramInfo& pgInfo)
+{
+    if (outputFilename.empty())
+        throw std::runtime_error{"BamFileMerger: no output filename provided"};
+
+    // read headers
+    std::vector<BamHeader> headers;
+    for (const auto& reader : readers)
+        headers.push_back(reader->Header());
+    assert(!headers.empty());
+
+    // merge headers
+    BamHeader mergedHeader = headers.at(0);
+    const std::string usingSortOrder = mergedHeader.SortOrder();
+    for (size_t i = 1; i < headers.size(); ++i) {
+        const auto& header = headers.at(i);
+        if (header.SortOrder() != usingSortOrder)
+            throw std::runtime_error{
+                "BamFileMerger: BAM file sort orders do not match, aborting merge"};
+        mergedHeader += header;
+    }
+
+    // maybe add program info
+    if (pgInfo.IsValid()) mergedHeader.AddProgram(pgInfo);
+
+    // create BAM writer (PBI-on-the-fly?)
+    if (createPbi)
+        return std::make_unique<IndexedBamWriter>(outputFilename, mergedHeader);
+    else
+        return std::make_unique<BamWriter>(outputFilename, mergedHeader);
+}
+
+}  // namespace anonymous
+
+void BamFileMerger::Merge(const std::vector<std::string>& bamFilenames,
+                          const std::string& outputFilename, bool createPbi,
+                          const ProgramInfo& pgInfo)
+{
+    std::vector<BamFile> bamFiles;
+    for (const auto& fn : bamFilenames)
+        bamFiles.emplace_back(fn);
+
+    auto readers = MakeBamReaders(std::move(bamFiles));
+    const bool isCoordinateSorted = readers.front()->Header().SortOrder() == "coordinate";
+
+    auto writer = MakeBamWriter(readers, outputFilename, createPbi, pgInfo);
+    auto collator = MakeCollator(std::move(readers), isCoordinateSorted);
+
+    BamRecord record;
+    while (collator->GetNext(record))
+        writer->Write(record);
+}
+
+void BamFileMerger::Merge(const DataSet& dataset, const std::string& outputFilename, bool createPbi,
+                          const ProgramInfo& pgInfo)
+{
+    std::vector<BamFile> bamFiles = dataset.BamFiles();
+    if (bamFiles.empty()) throw std::runtime_error{"BamFileMerger: no input filenames provided"};
+
+    auto readers = MakeBamReaders(std::move(bamFiles), PbiFilter::FromDataSet(dataset));
+    const bool isCoordinateSorted = readers.front()->Header().SortOrder() == "coordinate";
+
+    auto writer = MakeBamWriter(readers, outputFilename, createPbi, pgInfo);
+    auto collator = MakeCollator(std::move(readers), isCoordinateSorted);
+
+    BamRecord record;
+    while (collator->GetNext(record))
+        writer->Write(record);
+}
+
+void BamFileMerger::Merge(const std::vector<std::string>& bamFilenames, IRecordWriter& writer)
+{
+    std::vector<BamFile> bamFiles;
+    for (const auto& fn : bamFilenames)
+        bamFiles.emplace_back(fn);
+    if (bamFiles.empty()) throw std::runtime_error{"BamFileMerger: no input filenames provided"};
+
+    auto readers = MakeBamReaders(std::move(bamFiles));
+    const bool isCoordinateSorted = readers.front()->Header().SortOrder() == "coordinate";
+
+    auto collator = MakeCollator(std::move(readers), isCoordinateSorted);
+
+    BamRecord record;
+    while (collator->GetNext(record))
+        writer.Write(record);
+}
+
+// Merges the dataset's BAM files (restricted by the dataset's filters) into a
+// caller-provided writer. Throws std::runtime_error if the dataset contains
+// no BAM files.
+void BamFileMerger::Merge(const DataSet& dataset, IRecordWriter& writer)
+{
+    std::vector<BamFile> inputFiles = dataset.BamFiles();
+    if (inputFiles.empty()) {
+        throw std::runtime_error{"BamFileMerger: no input filenames provided"};
+    }
+
+    auto readers = MakeBamReaders(std::move(inputFiles), PbiFilter::FromDataSet(dataset));
+    const bool sortedByCoordinate = (readers.front()->Header().SortOrder() == "coordinate");
+    auto collator = MakeCollator(std::move(readers), sortedByCoordinate);
+
+    for (BamRecord record; collator->GetNext(record);) {
+        writer.Write(record);
+    }
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/BamHeader.cpp b/src/BamHeader.cpp

new file mode 100644 (file)

index 0000000..7a8bbc6
--- /dev/null
+++ b/src/BamHeader.cpp
@@ -0,0 +1,453 @@
+// File Description
+/// \file BamHeader.cpp
+/// \brief Implements the BamHeader class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/BamHeader.h"
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <set>
+#include <sstream>
+#include <type_traits>
+
+#include <htslib/hts.h>
+
+#include "Version.h"
+#include "pbbam/SamTagCodec.h"
+#include "pbbam/StringUtilities.h"
+
+namespace PacBio {
+namespace BAM {
+namespace {
+
+// Record-type prefixes: compared against the first three characters of each
+// SAM header line.
+const std::string BamHeaderPrefixHD{"@HD"};
+const std::string BamHeaderPrefixSQ{"@SQ"};
+const std::string BamHeaderPrefixRG{"@RG"};
+const std::string BamHeaderPrefixPG{"@PG"};
+const std::string BamHeaderPrefixCO{"@CO"};
+
+// Two-character field tags recognized on the @HD line.
+const std::string BamHeaderTokenVN{"VN"};
+const std::string BamHeaderTokenSO{"SO"};
+const std::string BamHeaderTokenpb{"pb"};
+
+// Two sort orders are merge-compatible only when they are the same string.
+bool CheckSortOrder(const std::string& lhs, const std::string& rhs)
+{
+    return lhs.compare(rhs) == 0;
+}
+
+// Both PacBio BAM version strings must parse to at least Version::Minimum.
+// 'rhs' is only parsed when 'lhs' passes.
+bool CheckPbVersion(const std::string& lhs, const std::string& rhs)
+{
+    using PbVersion = PacBio::BAM::Version;
+
+    if (!(PbVersion{lhs} >= PbVersion::Minimum)) return false;
+    return PbVersion{rhs} >= PbVersion::Minimum;
+}
+
+// Sequence lists only have to match when the sort order is "coordinate";
+// for every other sort order any pair of lists is accepted.
+bool CheckSequences(const std::string& sortOrder, const std::vector<SequenceInfo>& lhs,
+                    const std::vector<SequenceInfo>& rhs)
+{
+    if (sortOrder != "coordinate") return true;
+    return lhs == rhs;
+}
+
+// Verifies that 'lhs' and 'rhs' can be merged.
+//
+// Runs all three compatibility checks and, if any fails, throws a
+// std::runtime_error whose message lists every failed check.
+static void EnsureCanMerge(const BamHeader& lhs, const BamHeader& rhs)
+{
+    const bool sameSortOrder = CheckSortOrder(lhs.SortOrder(), rhs.SortOrder());
+    const bool versionsSupported = CheckPbVersion(lhs.PacBioBamVersion(), rhs.PacBioBamVersion());
+    const bool sameSequences = CheckSequences(lhs.SortOrder(), lhs.Sequences(), rhs.Sequences());
+
+    const bool anyFailure = !sameSortOrder || !versionsSupported || !sameSequences;
+    if (!anyFailure) return;
+
+    // build one message covering all failures
+    std::ostringstream msg;
+    msg << "BamHeader: could not merge headers:\n";
+
+    if (!sameSortOrder) {
+        msg << "  mismatched sort orders (@HD:SO) : (" << lhs.SortOrder() << ", "
+            << rhs.SortOrder() << ")\n";
+    }
+    if (!versionsSupported) {
+        msg << "  incompatible PacBio BAM versions (@HD:pb) : (" << lhs.PacBioBamVersion() << ", "
+            << rhs.PacBioBamVersion() << ")\n";
+    }
+    if (!sameSequences) {
+        msg << "  mismatched sequence lists (@SQ entries)\n";
+    }
+
+    throw std::runtime_error{msg.str()};
+}
+
+// Parses the fields of an "@HD" header line into 'hdr'.
+//
+// 'line' must hold at least four characters (the "@HD" prefix plus its
+// separator); the caller in this file only passes lines of length >= 5.
+// The remainder is split on tabs and each field is read as a two-character
+// tag, one separator character, then the value. VN, SO and pb are applied to
+// 'hdr'; fields with any other tag are ignored. Fields too short to hold a tag
+// and separator are skipped.
+void ParseHeaderLine(const std::string& line, BamHeader& hdr)
+{
+    // pop off '@HD\t', then split HD lines into tokens
+    const auto tokens = Split(line.substr(4), '\t');
+    for (const auto& token : tokens) {
+
+        // substr(3) throws std::out_of_range on anything shorter than "XX:",
+        // e.g. an empty field produced by a doubled or trailing tab
+        if (token.size() < 3) continue;
+
+        const auto tokenTag = token.substr(0, 2);
+        const auto tokenValue = token.substr(3);
+
+        // set header contents
+        if (tokenTag == BamHeaderTokenVN)
+            hdr.Version(tokenValue);
+        else if (tokenTag == BamHeaderTokenSO)
+            hdr.SortOrder(tokenValue);
+        else if (tokenTag == BamHeaderTokenpb)
+            hdr.PacBioBamVersion(tokenValue);
+    }
+}
+
+}  // anonymous
+
+// Compile-time guards: BamHeader must remain copy-constructible and
+// copy-assignable ...
+static_assert(std::is_copy_constructible<BamHeader>::value,
+              "BamHeader(const BamHeader&) is not = default");
+static_assert(std::is_copy_assignable<BamHeader>::value,
+              "BamHeader& operator=(const BamHeader&) is not = default");
+
+// ... and its move operations must not throw.
+static_assert(std::is_nothrow_move_constructible<BamHeader>::value,
+              "BamHeader(BamHeader&&) is not = noexcept");
+static_assert(std::is_nothrow_move_assignable<BamHeader>::value,
+              "BamHeader& operator=(BamHeader&&) is not = noexcept");
+
+// Private data for BamHeader; BamHeader reaches it through its d_ pointer.
+class BamHeader::BamHeaderPrivate
+{
+public:
+    // values of the @HD line: VN, pb and SO respectively
+    std::string version_;
+    std::string pacbioBamVersion_;
+    std::string sortOrder_;
+    // NOTE(review): only copied by DeepCopy() in this file; presumably holds
+    // additional @HD fields - confirm where it is populated
+    std::map<std::string, std::string> headerLineCustom_;
+
+    std::map<std::string, ReadGroupInfo> readGroups_;  // id => read group info
+    std::map<std::string, ProgramInfo> programs_;      // id => program info
+    // @CO lines, stored without the "@CO\t" prefix
+    std::vector<std::string> comments_;
+
+    // we need to preserve insertion order, use lookup for access by name
+    // (lookup maps sequence name => index into sequences_)
+    std::vector<SequenceInfo> sequences_;
+    std::map<std::string, int32_t> sequenceIdLookup_;
+};
+
+// Creates an empty header backed by freshly allocated private data.
+BamHeader::BamHeader()
+    : d_{std::make_shared<BamHeaderPrivate>()}
+{
+}
+
+// Builds a header from SAM-formatted header text, one record per line.
+//
+// Lines shorter than five characters, and lines whose first three characters
+// are not one of the known record types, are ignored. If an @HD line is seen
+// and leaves the version empty, the version is set from hts_version().
+BamHeader::BamHeader(const std::string& samHeaderText) : d_{std::make_shared<BamHeaderPrivate>()}
+{
+    std::istringstream input{samHeaderText};
+    for (std::string text; std::getline(input, text);) {
+
+        // too short to hold a record type plus any content
+        if (text.length() < 5) continue;
+
+        // dispatch on the record type at the start of the line
+        const std::string recordType = text.substr(0, 3);
+        if (recordType == BamHeaderPrefixHD) {
+            ParseHeaderLine(text, *this);
+            if (Version().empty()) Version(std::string{hts_version()});
+        } else if (recordType == BamHeaderPrefixSQ) {
+            AddSequence(SequenceInfo::FromSam(text));
+        } else if (recordType == BamHeaderPrefixRG) {
+            AddReadGroup(ReadGroupInfo::FromSam(text));
+        } else if (recordType == BamHeaderPrefixPG) {
+            AddProgram(ProgramInfo::FromSam(text));
+        } else if (recordType == BamHeaderPrefixCO) {
+            // keep only what follows "@CO" and its separator
+            AddComment(text.substr(4));
+        }
+    }
+}
+
+// Merges 'other' into this header.
+//
+// Throws (via EnsureCanMerge) if the headers are incompatible. Read groups and
+// programs from 'other' are added only when their ID is not already present;
+// every comment from 'other' is appended.
+BamHeader& BamHeader::operator+=(const BamHeader& other)
+{
+    EnsureCanMerge(*this, other);
+
+    // read groups: entries already present under the same ID win
+    const auto incomingReadGroups = other.ReadGroups();
+    for (const auto& readGroup : incomingReadGroups) {
+        if (HasReadGroup(readGroup.Id())) continue;
+        AddReadGroup(readGroup);
+    }
+
+    // programs: same rule
+    const auto incomingPrograms = other.Programs();
+    for (const auto& program : incomingPrograms) {
+        if (HasProgram(program.Id())) continue;
+        AddProgram(program);
+    }
+
+    // comments: appended unconditionally
+    const auto incomingComments = other.Comments();
+    for (const auto& text : incomingComments) {
+        AddComment(text);
+    }
+
+    return *this;
+}
+
+// Returns the merge of this header and 'other'; this header is left untouched
+// because the merge is performed on a deep copy.
+BamHeader BamHeader::operator+(const BamHeader& other) const
+{
+    BamHeader merged = DeepCopy();
+    merged += other;
+    return merged;
+}
+
+// Appends one comment to the header's comment list.
+BamHeader& BamHeader::AddComment(std::string comment)
+{
+    auto& commentList = d_->comments_;
+    commentList.push_back(std::move(comment));
+    return *this;
+}
+
+// Stores 'pg' under its ID, replacing any program already stored with that ID.
+BamHeader& BamHeader::AddProgram(ProgramInfo pg)
+{
+    auto& slot = d_->programs_[pg.Id()];
+    slot = std::move(pg);
+    return *this;
+}
+
+// Stores 'readGroup' under its ID, replacing any read group already stored
+// with that ID.
+BamHeader& BamHeader::AddReadGroup(ReadGroupInfo readGroup)
+{
+    auto& slot = d_->readGroups_[readGroup.Id()];
+    slot = std::move(readGroup);
+    return *this;
+}
+
+// Appends 'sequence' to the ordered sequence list and points its name at the
+// new position in the name lookup (overwriting an earlier entry of that name).
+BamHeader& BamHeader::AddSequence(SequenceInfo sequence)
+{
+    // the new entry lands at the current end of the list
+    const auto newIndex = d_->sequences_.size();
+    const std::string key = sequence.Name();
+
+    d_->sequences_.push_back(std::move(sequence));
+    d_->sequenceIdLookup_[key] = newIndex;
+    return *this;
+}
+
+// Removes all comments from the header.
+BamHeader& BamHeader::ClearComments()
+{
+    auto& commentList = d_->comments_;
+    commentList.clear();
+    return *this;
+}
+
+// Removes all program entries from the header.
+BamHeader& BamHeader::ClearPrograms()
+{
+    auto& programTable = d_->programs_;
+    programTable.clear();
+    return *this;
+}
+
+// Removes all read groups from the header.
+BamHeader& BamHeader::ClearReadGroups()
+{
+    auto& readGroupTable = d_->readGroups_;
+    readGroupTable.clear();
+    return *this;
+}
+
+// Removes all sequences, emptying both the ordered list and the name lookup.
+BamHeader& BamHeader::ClearSequences()
+{
+    auto& data = *d_;
+    data.sequences_.clear();
+    data.sequenceIdLookup_.clear();
+    return *this;
+}
+
+// Returns a copy of the header's comments.
+std::vector<std::string> BamHeader::Comments() const
+{
+    return d_->comments_;
+}
+
+// Replaces the header's comments with 'comments'.
+BamHeader& BamHeader::Comments(std::vector<std::string> comments)
+{
+    auto& commentList = d_->comments_;
+    commentList = std::move(comments);
+    return *this;
+}
+
+// Returns a header whose private data is a separate, member-for-member copy of
+// this header's private data.
+BamHeader BamHeader::DeepCopy() const
+{
+    BamHeader copy;
+
+    // memberwise copy of the whole private struct into the new header's own instance
+    *copy.d_ = *d_;
+    return copy;
+}
+
+// Returns true if a program with this ID is stored in the header.
+bool BamHeader::HasProgram(const std::string& id) const
+{
+    return d_->programs_.count(id) != 0;
+}
+
+// Returns true if a read group with this ID is stored in the header.
+bool BamHeader::HasReadGroup(const std::string& id) const
+{
+    return d_->readGroups_.count(id) != 0;
+}
+
+// Returns true if the name lookup has an entry for this sequence name.
+bool BamHeader::HasSequence(const std::string& name) const
+{
+    return d_->sequenceIdLookup_.count(name) != 0;
+}
+
+// Returns the number of sequences in the ordered sequence list.
+size_t BamHeader::NumSequences() const
+{
+    const auto& sequenceList = d_->sequences_;
+    return sequenceList.size();
+}
+
+// Returns the stored PacBio BAM version string (@HD:pb).
+std::string BamHeader::PacBioBamVersion() const
+{
+    return d_->pacbioBamVersion_;
+}
+
+// Stores 'version' as the PacBio BAM version (@HD:pb).
+//
+// The value is stored before it is checked: if it parses to a version older
+// than Version::Minimum, std::runtime_error is thrown with the new value
+// already in place.
+BamHeader& BamHeader::PacBioBamVersion(const std::string& version)
+{
+    d_->pacbioBamVersion_ = version;
+
+    const PacBio::BAM::Version parsed{version};
+    const bool tooOld = (parsed < Version::Minimum);
+    if (!tooOld) return *this;
+
+    throw std::runtime_error{
+        "BamHeader: invalid PacBio BAM version number (" + parsed.ToString() +
+        ") is older than the minimum supported version (" + Version::Minimum.ToString() + ")"};
+}
+
+// Returns a copy of the program stored under 'id'.
+//
+// Throws std::runtime_error if no such program exists.
+ProgramInfo BamHeader::Program(const std::string& id) const
+{
+    const auto& programTable = d_->programs_;
+    const auto found = programTable.find(id);
+    if (found != programTable.cend()) return found->second;
+
+    throw std::runtime_error{"BamHeader: program ID not found: " + id};
+}
+
+// Returns the IDs of all stored programs, in the map's key order.
+std::vector<std::string> BamHeader::ProgramIds() const
+{
+    const auto& programTable = d_->programs_;
+
+    std::vector<std::string> ids;
+    ids.reserve(programTable.size());
+    for (auto it = programTable.cbegin(); it != programTable.cend(); ++it) {
+        ids.push_back(it->first);
+    }
+    return ids;
+}
+
+// Returns copies of all stored programs, in the map's key (ID) order.
+std::vector<ProgramInfo> BamHeader::Programs() const
+{
+    const auto& programTable = d_->programs_;
+
+    std::vector<ProgramInfo> programs;
+    programs.reserve(programTable.size());
+    for (auto it = programTable.cbegin(); it != programTable.cend(); ++it) {
+        programs.push_back(it->second);
+    }
+    return programs;
+}
+
+// Replaces all stored programs with 'programs', keyed by each program's ID.
+// If several entries share an ID, the last one wins.
+BamHeader& BamHeader::Programs(std::vector<ProgramInfo> programs)
+{
+    d_->programs_.clear();
+
+    // iterate by non-const reference so std::move really moves each element
+    // out of the by-value parameter (a const reference would silently copy)
+    for (auto&& pg : programs)
+        d_->programs_[pg.Id()] = std::move(pg);
+    return *this;
+}
+
+// Returns a copy of the read group stored under 'id'.
+//
+// Throws std::runtime_error if no such read group exists.
+ReadGroupInfo BamHeader::ReadGroup(const std::string& id) const
+{
+    const auto& readGroupTable = d_->readGroups_;
+    const auto found = readGroupTable.find(id);
+    if (found != readGroupTable.cend()) return found->second;
+
+    throw std::runtime_error{"BamHeader: read group ID not found: " + id};
+}
+
+// Returns the IDs of all stored read groups, in the map's key order.
+std::vector<std::string> BamHeader::ReadGroupIds() const
+{
+    const auto& readGroupTable = d_->readGroups_;
+
+    std::vector<std::string> ids;
+    ids.reserve(readGroupTable.size());
+    for (auto it = readGroupTable.cbegin(); it != readGroupTable.cend(); ++it) {
+        ids.push_back(it->first);
+    }
+    return ids;
+}
+
+// Returns copies of all stored read groups, in the map's key (ID) order.
+std::vector<ReadGroupInfo> BamHeader::ReadGroups() const
+{
+    const auto& readGroupTable = d_->readGroups_;
+
+    std::vector<ReadGroupInfo> readGroups;
+    readGroups.reserve(readGroupTable.size());
+    for (auto it = readGroupTable.cbegin(); it != readGroupTable.cend(); ++it) {
+        readGroups.push_back(it->second);
+    }
+    return readGroups;
+}
+
+// Replaces all stored read groups with 'readGroups', keyed by each read
+// group's ID. If several entries share an ID, the last one wins.
+BamHeader& BamHeader::ReadGroups(std::vector<ReadGroupInfo> readGroups)
+{
+    auto& readGroupTable = d_->readGroups_;
+    readGroupTable.clear();
+    for (auto& incoming : readGroups) {
+        readGroupTable[incoming.Id()] = std::move(incoming);
+    }
+    return *this;
+}
+
+// Returns a copy of the sequence at position 'id' in the ordered list.
+// Access is bounds-checked through at().
+SequenceInfo BamHeader::Sequence(const int32_t id) const
+{
+    const auto& sequenceList = d_->sequences_;
+    return sequenceList.at(id);
+}
+
+// Returns a copy of the sequence registered under 'name', or a
+// default-constructed SequenceInfo if the name is unknown.
+SequenceInfo BamHeader::Sequence(const std::string& name) const
+{
+    // TODO: SequenceId(name) throws if not found, should we do so here as well?
+
+    const auto& lookup = d_->sequenceIdLookup_;
+    const auto found = lookup.find(name);
+    if (found == lookup.cend()) return SequenceInfo();
+
+    // the lookup stores the sequence's position in the ordered list
+    const auto position = found->second;
+    assert(position >= 0 && static_cast<size_t>(position) < d_->sequences_.size());
+    return d_->sequences_.at(position);
+}
+
+// Returns the list position recorded for the sequence named 'name'.
+//
+// Throws std::runtime_error if the name is unknown.
+int32_t BamHeader::SequenceId(const std::string& name) const
+{
+    const auto& lookup = d_->sequenceIdLookup_;
+    const auto found = lookup.find(name);
+    if (found != lookup.cend()) return found->second;
+
+    throw std::runtime_error{"BamHeader: sequence name not found: " + name};
+}
+
+// Returns the Length() value of the sequence at position 'id'.
+std::string BamHeader::SequenceLength(const int32_t id) const
+{
+    const SequenceInfo sequence = Sequence(id);
+    return sequence.Length();
+}
+
+// Returns the Name() value of the sequence at position 'id'.
+std::string BamHeader::SequenceName(const int32_t id) const
+{
+    const SequenceInfo sequence = Sequence(id);
+    return sequence.Name();
+}
+
+// Returns the names of all sequences, in list (insertion) order.
+std::vector<std::string> BamHeader::SequenceNames() const
+{
+    const auto& sequenceList = d_->sequences_;
+
+    std::vector<std::string> names;
+    names.reserve(sequenceList.size());
+    for (size_t i = 0; i < sequenceList.size(); ++i) {
+        names.push_back(sequenceList[i].Name());
+    }
+    return names;
+}
+
+// Returns a copy of the ordered sequence list.
+std::vector<SequenceInfo> BamHeader::Sequences() const
+{
+    return d_->sequences_;
+}
+
+// Replaces the header's sequences with 'sequences', keeping their order.
+BamHeader& BamHeader::Sequences(std::vector<SequenceInfo> sequences)
+{
+    // Clear the name lookup together with the list. Clearing only the list
+    // would leave names from the previous contents resolvable through
+    // HasSequence()/SequenceId(), pointing at stale positions.
+    ClearSequences();
+
+    for (auto&& seq : sequences)
+        AddSequence(std::move(seq));
+    return *this;
+}
+
+// Returns the stored sort order string (@HD:SO).
+std::string BamHeader::SortOrder() const
+{
+    return d_->sortOrder_;
+}
+
+// Stores 'order' as the sort order (@HD:SO). The value is not validated here.
+BamHeader& BamHeader::SortOrder(std::string order)
+{
+    auto& stored = d_->sortOrder_;
+    stored = std::move(order);
+    return *this;
+}
+
+// Renders the header as SAM text: one @HD line, then @SQ, @RG, @PG and @CO
+// lines in that order, each terminated by '\n'.
+//
+// Empty @HD values fall back to defaults: hts_version() for VN, "unknown" for
+// SO and Version::Current for pb.
+std::string BamHeader::ToSam() const
+{
+    // @HD values, with fallbacks for anything unset
+    std::string hdVersion = d_->version_;
+    if (hdVersion.empty()) hdVersion = std::string{hts_version()};
+
+    std::string hdSortOrder = d_->sortOrder_;
+    if (hdSortOrder.empty()) hdSortOrder = std::string{"unknown"};
+
+    std::string hdPbVersion = d_->pacbioBamVersion_;
+    if (hdPbVersion.empty()) hdPbVersion = Version::Current.ToString();
+
+    std::ostringstream sam;
+
+    // @HD
+    sam << BamHeaderPrefixHD;
+    sam << MakeSamTag(BamHeaderTokenVN, hdVersion);
+    sam << MakeSamTag(BamHeaderTokenSO, hdSortOrder);
+    sam << MakeSamTag(BamHeaderTokenpb, hdPbVersion);
+    sam << '\n';
+
+    // @SQ, in insertion order
+    for (const auto& sequence : d_->sequences_) {
+        sam << sequence.ToSam() << '\n';
+    }
+
+    // @RG, in ID order
+    for (const auto& entry : d_->readGroups_) {
+        sam << entry.second.ToSam() << '\n';
+    }
+
+    // @PG, in ID order
+    for (const auto& entry : d_->programs_) {
+        sam << entry.second.ToSam() << '\n';
+    }
+
+    // @CO - the prefix is re-attached here, comments are stored without it
+    for (const auto& text : d_->comments_) {
+        sam << BamHeaderPrefixCO << '\t' << text << '\n';
+    }
+
+    return sam.str();
+}
+
+// Returns the stored format version string (@HD:VN).
+std::string BamHeader::Version() const
+{
+    return d_->version_;
+}
+
+// Stores 'version' as the format version (@HD:VN). The value is not validated here.
+BamHeader& BamHeader::Version(std::string version)
+{
+    auto& stored = d_->version_;
+    stored = std::move(version);
+    return *this;
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/BamReader.cpp b/src/BamReader.cpp

new file mode 100644 (file)

index 0000000..7f454fd
--- /dev/null
+++ b/src/BamReader.cpp
@@ -0,0 +1,115 @@
+// File Description
+/// \file BamReader.cpp
+/// \brief Implements the BamReader class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/BamReader.h"
+
+#include <cassert>
+#include <cstdint>
+#include <cstdio>
+#include <iostream>
+#include <sstream>
+
+#include <htslib/bgzf.h>
+#include <htslib/hfile.h>
+#include <htslib/hts.h>
+#include <boost/optional.hpp>
+
+#include "Autovalidate.h"
+#include "MemoryUtils.h"
+#include "pbbam/Validator.h"
+
+namespace PacBio {
+namespace BAM {
+
+// Private implementation for BamReader: holds the filename, the open htslib
+// file handle and the header read from that file.
+class BamReader::BamReaderPrivate
+{
+public:
+    // Opens 'fn' for reading and loads its header.
+    //
+    // Throws std::runtime_error if the file cannot be opened, if it has no
+    // BGZF stream, or if no header can be read from it.
+    explicit BamReaderPrivate(std::string fn) : filename_{std::move(fn)}
+    {
+        htsFile_.reset(sam_open(filename_.c_str(), "rb"));
+        if (!htsFile_ || !htsFile_->fp.bgzf) {
+            throw std::runtime_error{"BamReader: could not open BAM file for reading: " +
+                                     filename_};
+        }
+
+        // sam_hdr_read() yields a null pointer on failure; report that here,
+        // with the filename, rather than handing a null header onwards
+        std::unique_ptr<bam_hdr_t, HtslibHeaderDeleter> hdr(sam_hdr_read(htsFile_.get()));
+        if (!hdr) {
+            throw std::runtime_error{"BamReader: could not read header from BAM file: " +
+                                     filename_};
+        }
+        header_ = BamHeaderMemory::FromRawData(hdr.get());
+    }
+
+    std::string filename_;
+    std::unique_ptr<samFile, HtslibFileDeleter> htsFile_;
+    BamHeader header_;
+};
+
+// Creates a reader on the filename "-".
+// NOTE(review): "-" is presumably htslib's name for standard input - confirm.
+BamReader::BamReader()
+    : internal::IQuery()
+    , d_{std::make_unique<BamReaderPrivate>("-")}
+{
+}
+
+// Creates a reader on the file named 'fn'. Opening the file and reading its
+// header happen in the BamReaderPrivate constructor, which throws on failure.
+BamReader::BamReader(std::string fn)
+    : internal::IQuery()
+    , d_{std::make_unique<BamReaderPrivate>(std::move(fn))}
+{
+}
+
+// Creates a reader on the file that 'bamFile' names; delegates to the
+// filename constructor.
+BamReader::BamReader(BamFile bamFile)
+    : BamReader(bamFile.Filename())
+{
+}
+
+// Defaulted here, in the source file where BamReaderPrivate is defined.
+BamReader::~BamReader() = default;
+
+// Returns the BGZF stream of the open htslib file handle.
+BGZF* BamReader::Bgzf() const
+{
+    const auto& file = d_->htsFile_;
+    return file->fp.bgzf;
+}
+
+// Returns the filename this reader was created with.
+const std::string& BamReader::Filename() const
+{
+    return d_->filename_;
+}
+
+// Returns the header that was read when the file was opened.
+const BamHeader& BamReader::Header() const
+{
+    return d_->header_;
+}
+
+// Reads the next record from the BAM stream into 'record'.
+//
+// Returns true if a record was read; the record then has its tags updated,
+// this reader's header attached and its cached positions reset. Returns false
+// when ReadRawData() reports status -1 (EOF or end of data range).
+// Throws std::runtime_error for any other negative status.
+bool BamReader::GetNext(BamRecord& record)
+{
+    assert(BamRecordMemory::GetRawData(record).get());
+
+    const auto result = ReadRawData(Bgzf(), BamRecordMemory::GetRawData(record).get());
+
+    // success
+    if (result >= 0) {
+        BamRecordMemory::UpdateRecordTags(record);
+        record.header_ = d_->header_;
+        record.ResetCachedPositions();
+
+#if PBBAM_AUTOVALIDATE
+        Validator::Validate(record);
+#endif
+        return true;
+    }
+
+    // EOF or end-of-data range (not an error)
+    else if (result == -1)
+        return false;
+
+    // error corrupted file
+    else {
+        std::ostringstream msg;
+        msg << "BamReader: cannot read from corrupted file: " << Filename() << '\n' << "  reason: ";
+        if (result == -2)
+            msg << "probably truncated";
+        else if (result == -3)
+            msg << "could not read BAM record's core data";
+        else if (result == -4)
+            msg << "could not read BAM record's variable-length data";
+        else
+            msg << "unknown reason (status code = " << result << ") (" << Filename() << ')';
+        throw std::runtime_error{msg.str()};
+    }
+}
+
+// Reads one raw record from 'bgzf' into 'b' and returns bam_read1()'s status.
+int BamReader::ReadRawData(BGZF* bgzf, bam1_t* b)
+{
+    const int status = bam_read1(bgzf, b);
+    return status;
+}
+
+// Moves the BGZF stream to 'virtualOffset'.
+//
+// Throws std::runtime_error if bgzf_seek() reports a non-zero status.
+void BamReader::VirtualSeek(int64_t virtualOffset)
+{
+    const bool seekFailed = (bgzf_seek(Bgzf(), virtualOffset, SEEK_SET) != 0);
+    if (seekFailed) {
+        throw std::runtime_error{"Failed to seek in BAM file"};
+    }
+}
+
+// Returns the BGZF stream's current position as reported by bgzf_tell().
+int64_t BamReader::VirtualTell() const
+{
+    BGZF* const stream = Bgzf();
+    return bgzf_tell(stream);
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/BamRecord.cpp b/src/BamRecord.cpp

new file mode 100644 (file)

index 0000000..c53445c
--- /dev/null
+++ b/src/BamRecord.cpp
@@ -0,0 +1,2006 @@
+// File Description
+/// \file BamRecord.cpp
+/// \brief Implements the BamRecord class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/BamRecord.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <iostream>
+#include <stdexcept>
+
+#include <htslib/sam.h>
+#include <boost/numeric/conversion/cast.hpp>
+
+#include <pbcopper/data/Clipping.h>
+#include <pbcopper/data/internal/ClippingImpl.h>
+
+#include "BamRecordTags.h"
+#include "MemoryUtils.h"
+#include "Pulse2BaseCache.h"
+#include "SequenceUtils.h"
+#include "pbbam/StringUtilities.h"
+#include "pbbam/ZmwTypeMap.h"
+#include "pbbam/virtual/VirtualRegionTypeMap.h"
+
+namespace PacBio {
+namespace BAM {
+namespace {
+
+// record type names, as matched by NameToType() below
+// ("ZMW" and "POLYMERASE" both map to RecordType::ZMW there)
+const std::string recordTypeName_ZMW{"ZMW"};
+const std::string recordTypeName_Polymerase{"POLYMERASE"};
+const std::string recordTypeName_HqRegion{"HQREGION"};
+const std::string recordTypeName_Subread{"SUBREAD"};
+const std::string recordTypeName_CCS{"CCS"};
+const std::string recordTypeName_Scrap{"SCRAP"};
+const std::string recordTypeName_Transcript{"TRANSCRIPT"};
+const std::string recordTypeName_Unknown{"UNKNOWN"};
+
+// Extracts the hole number from a record name.
+//
+// The name is split on '/'. A name whose first token is "transcript" must have
+// exactly two tokens; any other name must have exactly three. In both cases
+// the second token is parsed with std::stoi and returned.
+//
+// Throws std::runtime_error if the token count is wrong. std::stoi's own
+// exceptions propagate if the second token is not a number.
+int32_t HoleNumberFromName(const std::string& fullName)
+{
+    const auto mainTokens = Split(fullName, '/');
+
+    // with no tokens at all, at(0) would throw std::out_of_range instead of the
+    // runtime_error used for every other malformed name
+    // (presumably reachable for an empty name - confirm against Split)
+    if (mainTokens.size() == 0)
+        throw std::runtime_error{"BamRecord: malformed record name: " + fullName};
+
+    const bool isTranscript = (mainTokens.at(0) == "transcript");
+    if (isTranscript) {
+        if (mainTokens.size() != 2)
+            throw std::runtime_error{"BamRecord: malformed transcript record name: " + fullName};
+    } else {
+        if (mainTokens.size() != 3)
+            throw std::runtime_error{"BamRecord: malformed record name: " + fullName};
+    }
+
+    // the hole number is the second token in both layouts
+    return std::stoi(mainTokens.at(1));
+}
+
+// Extracts the query end from a record name.
+//
+// The name must split on '/' into exactly three tokens, and the third token
+// must split on '_' into exactly two; the second of those is parsed as an
+// integer and returned. Throws std::runtime_error on any other layout.
+Position QueryEndFromName(const std::string& fullName)
+{
+    const auto nameParts = Split(fullName, '/');
+    const bool nameOk = (nameParts.size() == 3);
+    if (!nameOk) throw std::runtime_error{"BamRecord: malformed record name: " + fullName};
+
+    const auto intervalParts = Split(nameParts.at(2), '_');
+    const bool intervalOk = (intervalParts.size() == 2);
+    if (!intervalOk) throw std::runtime_error{"BamRecord: malformed record name: " + fullName};
+
+    return std::stoi(intervalParts.at(1));
+}
+
+// Extracts the query start from a record name.
+//
+// The name must split on '/' into exactly three tokens, and the third token
+// must split on '_' into exactly two; the first of those is parsed as an
+// integer and returned. Throws std::runtime_error on any other layout.
+Position QueryStartFromName(const std::string& fullName)
+{
+    const auto nameParts = Split(fullName, '/');
+    const bool nameOk = (nameParts.size() == 3);
+    if (!nameOk) throw std::runtime_error{"BamRecord: malformed record name: " + fullName};
+
+    const auto intervalParts = Split(nameParts.at(2), '_');
+    const bool intervalOk = (intervalParts.size() == 2);
+    if (!intervalOk) throw std::runtime_error{"BamRecord: malformed record name: " + fullName};
+
+    return std::stoi(intervalParts.at(0));
+}
+
+// Shorthand for BamRecordTags::LabelFor().
+std::string Label(const BamRecordTag tag)
+{
+    return BamRecordTags::LabelFor(tag);
+}
+
+// Sets 'tag' to 'value' on 'impl': edits the tag if the record already has it,
+// adds it otherwise. Returns 'impl'.
+BamRecordImpl* CreateOrEdit(const BamRecordTag tag, const Tag& value, BamRecordImpl* impl)
+{
+    const bool alreadyPresent = impl->HasTag(tag);
+    if (alreadyPresent) {
+        impl->EditTag(tag, value);
+        return impl;
+    }
+
+    impl->AddTag(tag, value);
+    return impl;
+}
+
+// Computes {startOffset, endOffset} for 'record' from the clipping operations
+// at either end of its CIGAR.
+//
+// With no CIGAR operations the result is {0, seqLength}. Otherwise startOffset
+// is the summed length of leading soft clips and endOffset is seqLength minus
+// the summed length of trailing soft clips. Hard clips are stepped over,
+// except that a hard clip met after soft-clipped bases have already been
+// counted sets that offset to -1. An endOffset that comes out as 0 is reset to
+// seqLength.
+std::pair<int32_t, int32_t> AlignedOffsets(const BamRecord& record, const int seqLength)
+{
+    int32_t startOffset = 0;
+    int32_t endOffset = seqLength;
+
+    const auto b = BamRecordMemory::GetRawData(record);
+    uint32_t* cigarData = bam_get_cigar(b.get());
+    const size_t numCigarOps = b->core.n_cigar;
+    if (numCigarOps > 0) {
+
+        // start offset
+        // walk forward from the first op; stop at the first op that is not a clip
+        for (size_t i = 0; i < numCigarOps; ++i) {
+            const auto type = static_cast<CigarOperationType>(bam_cigar_op(cigarData[i]));
+            if (type == CigarOperationType::HARD_CLIP) {
+                // a hard clip is only stepped over while the offset is untouched
+                // (0) or equals seqLength; otherwise flag the offset with -1
+                if (startOffset != 0 && startOffset != seqLength) {
+                    startOffset = -1;
+                    break;
+                }
+            } else if (type == CigarOperationType::SOFT_CLIP)
+                startOffset += bam_cigar_oplen(cigarData[i]);
+            else
+                break;
+        }
+
+        // end offset
+        // same walk, backward from the last op (signed index so it can pass 0)
+        for (int i = numCigarOps - 1; i >= 0; --i) {
+            const auto type = static_cast<CigarOperationType>(bam_cigar_op(cigarData[i]));
+            if (type == CigarOperationType::HARD_CLIP) {
+                if (endOffset != 0 && endOffset != seqLength) {
+                    endOffset = -1;
+                    break;
+                }
+            } else if (type == CigarOperationType::SOFT_CLIP)
+                endOffset -= bam_cigar_oplen(cigarData[i]);
+            else
+                break;
+        }
+
+        // soft clips consumed the whole sequence: report seqLength instead of 0
+        if (endOffset == 0) endOffset = seqLength;
+    }
+    return {startOffset, endOffset};
+}
+
+// Moves 'count' elements starting at 'first' into the range starting at
+// 'result'; returns the iterator one past the last element written.
+template <class InputIt, class Size, class OutputIt>
+OutputIt Move_N(InputIt first, Size count, OutputIt result)
+{
+    const InputIt last = first + count;
+    return std::move(first, last, result);
+}
+
+// Returns the 'len' elements of 'input' starting at index 'pos', or an empty
+// container if 'input' is empty. The range is not bounds-checked.
+template <typename T>
+T ClipSeqQV(const T& input, const size_t pos, const size_t len)
+{
+    if (input.empty()) return {};
+
+    const auto clipBegin = input.cbegin() + pos;
+    const auto clipEnd = clipBegin + len;
+    return T{clipBegin, clipEnd};
+}
+
+// Clips pulse-indexed data to a window given in base coordinates.
+//
+// Starting from p2bCache->FindFirst(), the start index is advanced 'pos' times
+// with FindNext(); the end index is then advanced a further 'len - 1' times.
+// The inclusive range [start, end] of 'input' is returned, or an empty
+// container if 'input' is empty.
+//
+// NOTE(review): FindFirst()/FindNext() presumably return the pulse indices of
+// successive basecalled positions - confirm against Pulse2BaseCache.
+// NOTE(review): for len == 0 the second loop does not run and one element is
+// still returned - confirm callers never pass len == 0.
+template <typename T>
+T ClipPulse(const T& input, PacBio::BAM::Pulse2BaseCache* p2bCache, const size_t pos,
+            const size_t len)
+{
+    assert(p2bCache);
+    if (input.empty()) return {};
+
+    // find start
+    size_t start = p2bCache->FindFirst();
+    size_t basesSeen = 0;
+    while (basesSeen < pos) {
+        start = p2bCache->FindNext(start);
+        ++basesSeen;
+    }
+
+    // find end
+    // 'seen' starts at 1 because the element at 'start' already counts
+    size_t end = start;
+    size_t seen = 1;
+    while (seen < len) {
+        end = p2bCache->FindNext(end);
+        ++seen;
+    }
+
+    // return clipped
+    // 'end' is inclusive, hence the + 1
+    return {input.cbegin() + start, input.cbegin() + end + 1};
+}
+
+// Rewrites '*seq' in place according to the record's CIGAR.
+//
+// Nothing happens unless the record is mapped and at least one of 'aligned' /
+// 'exciseSoftClips' is set. Otherwise, per CIGAR operation:
+//   - hard clip, reference skip: contribute nothing to the output
+//   - soft clip: source elements are dropped if 'exciseSoftClips', else copied
+//   - deletion / padding: if 'aligned', 'deletionNullValue' /
+//     'paddingNullValue' is written once per position; the source is not advanced
+//   - anything else: source elements are moved across
+//
+// F is the container type, N the type of the gap fill values.
+template <typename F, typename N>
+void ClipAndGapify(const BamRecordImpl& impl, const bool aligned, const bool exciseSoftClips,
+                   F* seq, N paddingNullValue, N deletionNullValue)
+{
+    assert(seq);
+
+    const bool clipOrGapRequested = aligned || exciseSoftClips;
+    if (impl.IsMapped() && clipOrGapRequested) {
+        // determine final container length
+        // (the predicate mirrors the per-op handling in the second loop below)
+        auto incrementsOutputLength = [](const CigarOperationType type, const bool isAligned,
+                                         const bool exciseSoftClipsFromAln) {
+            if (type == CigarOperationType::HARD_CLIP ||
+                type == CigarOperationType::REFERENCE_SKIP) {
+                return false;
+            } else if (type == CigarOperationType::SOFT_CLIP && exciseSoftClipsFromAln) {
+                return false;
+            } else if (!isAligned && (type == CigarOperationType::DELETION ||
+                                      type == CigarOperationType::PADDING)) {
+                return false;
+            } else
+                return true;
+        };
+
+        size_t outputLength = 0;
+        const auto cigar = impl.CigarData();
+        for (const CigarOperation& op : cigar) {
+            if (incrementsOutputLength(op.Type(), aligned, exciseSoftClips))
+                outputLength += op.Length();
+        }
+
+        // move original data to temp, prep output container size
+        F originalSeq = std::move(*seq);
+        seq->resize(outputLength);
+
+        // apply CIGAR ops
+        // srcIndex walks originalSeq, dstIndex walks the resized output
+        size_t srcIndex = 0;
+        size_t dstIndex = 0;
+        for (const CigarOperation& op : cigar) {
+            const auto opType = op.Type();
+            const auto opLength = op.Length();
+
+            // nothing to do for hard-clipped & ref-skipped positions
+            if (opType == CigarOperationType::HARD_CLIP ||
+                opType == CigarOperationType::REFERENCE_SKIP) {
+                continue;
+            }
+
+            // maybe skip soft-clipped positions
+            else if (opType == CigarOperationType::SOFT_CLIP) {
+                if (exciseSoftClips)
+                    srcIndex += opLength;
+                else {
+                    Move_N(originalSeq.begin() + srcIndex, opLength, seq->begin() + dstIndex);
+                    srcIndex += opLength;
+                    dstIndex += opLength;
+                }
+            }
+
+            // maybe add deletion/padding values
+            // either way, srcIndex is not incremented
+            else if (opType == CigarOperationType::DELETION) {
+                if (aligned) {
+                    for (size_t i = 0; i < opLength; ++i) {
+                        (*seq)[dstIndex] = deletionNullValue;
+                        ++dstIndex;
+                    }
+                }
+            } else if (opType == CigarOperationType::PADDING) {
+                if (aligned) {
+                    for (size_t i = 0; i < opLength; ++i) {
+                        (*seq)[dstIndex] = paddingNullValue;
+                        ++dstIndex;
+                    }
+                }
+            }
+
+            // all other CIGAR ops
+            else {
+                Move_N(originalSeq.begin() + srcIndex, opLength, seq->begin() + dstIndex);
+                srcIndex += opLength;
+                dstIndex += opLength;
+            }
+        }
+    }
+}
+
+// Clips/gapifies a base string in place: '*' marks padding, '-' marks deletions.
+void ClipAndGapifyBases(const BamRecordImpl& impl, const bool aligned, const bool exciseSoftClips,
+                        std::string* seq)
+{
+    const char paddingChar = '*';
+    const char deletionChar = '-';
+    ClipAndGapify<std::string, char>(impl, aligned, exciseSoftClips, seq, paddingChar,
+                                     deletionChar);
+}
+
+// Clips/gapifies frame data in place; padding and deletion positions are
+// filled with 0.
+//
+// The frame values are taken out of 'frames', processed as a plain vector and
+// stored back.
+void ClipAndGapifyFrames(const BamRecordImpl& impl, const bool aligned, const bool exciseSoftClips,
+                         Frames* frames)
+{
+    assert(frames);
+    std::vector<uint16_t> data{std::move(frames->Data())};
+    ClipAndGapify<std::vector<uint16_t>, uint16_t>(impl, aligned, exciseSoftClips, &data, 0, 0);
+
+    // 'data' is not used again, so hand it over as an rvalue; this lets the
+    // setter take it without another copy where its signature allows
+    frames->Data(std::move(data));
+}
+
+// Clips/gapifies float data in place; padding and deletion positions are
+// filled with 0.0.
+void ClipAndGapifyPhotons(const BamRecordImpl& impl, const bool aligned, const bool exciseSoftClips,
+                          std::vector<float>* data)
+{
+    const float gapValue = 0.0;
+    ClipAndGapify<std::vector<float>, float>(impl, aligned, exciseSoftClips, data, gapValue,
+                                             gapValue);
+}
+
+// Clips/gapifies quality values in place; padding and deletion positions are
+// filled with QualityValue(0).
+void ClipAndGapifyQualities(const BamRecordImpl& impl, const bool aligned,
+                            const bool exciseSoftClips, QualityValues* quals)
+{
+    const QualityValue gapValue = QualityValue(0);
+    ClipAndGapify<QualityValues, QualityValue>(impl, aligned, exciseSoftClips, quals, gapValue,
+                                               gapValue);
+}
+
+// Clips/gapifies 32-bit unsigned data in place; padding and deletion positions
+// are filled with 0.
+void ClipAndGapifyUInts(const BamRecordImpl& impl, const bool aligned, const bool exciseSoftClips,
+                        std::vector<uint32_t>* data)
+{
+    const uint32_t gapValue = 0;
+    ClipAndGapify<std::vector<uint32_t>, uint32_t>(impl, aligned, exciseSoftClips, data, gapValue,
+                                                   gapValue);
+}
+
+// Clips/gapifies 8-bit unsigned data in place; padding and deletion positions
+// are filled with 0.
+void ClipAndGapifyUInt8s(const BamRecordImpl& impl, const bool aligned, const bool exciseSoftClips,
+                         std::vector<uint8_t>* data)
+{
+    const uint8_t gapValue = 0;
+    ClipAndGapify<std::vector<uint8_t>, uint8_t>(impl, aligned, exciseSoftClips, data, gapValue,
+                                                 gapValue);
+}
+
+// Maps a record type name to its RecordType. "ZMW" and "POLYMERASE" both map
+// to RecordType::ZMW; any unrecognized name maps to RecordType::UNKNOWN.
+RecordType NameToType(const std::string& name)
+{
+    RecordType type = RecordType::UNKNOWN;
+    if (name == recordTypeName_Subread)
+        type = RecordType::SUBREAD;
+    else if (name == recordTypeName_ZMW)
+        type = RecordType::ZMW;
+    else if (name == recordTypeName_Polymerase)
+        type = RecordType::ZMW;
+    else if (name == recordTypeName_HqRegion)
+        type = RecordType::HQREGION;
+    else if (name == recordTypeName_CCS)
+        type = RecordType::CCS;
+    else if (name == recordTypeName_Scrap)
+        type = RecordType::SCRAP;
+    else if (name == recordTypeName_Transcript)
+        type = RecordType::TRANSCRIPT;
+    return type;
+}
+
+// Reverse-complements '*bases' in place when the requested orientation differs
+// from the current one and the record is on the reverse strand; otherwise
+// leaves it alone. Pulse data uses the case-sensitive variant.
+void OrientBasesAsRequested(std::string* bases, Orientation current, Orientation requested,
+                            bool isReverseStrand, bool isPulse)
+{
+    assert(bases);
+
+    const bool needsFlip = (current != requested) && isReverseStrand;
+    if (!needsFlip) return;
+
+    if (isPulse) {
+        ReverseComplementCaseSens(*bases);
+    } else {
+        ReverseComplement(*bases);
+    }
+}
+
+// Reverses '*data' in place when the requested orientation differs from the
+// current one and the record is on the reverse strand; otherwise leaves it alone.
+template <typename Container>
+void OrientTagDataAsRequested(Container* data, Orientation current, Orientation requested,
+                              bool isReverseStrand)
+{
+    assert(data);
+
+    const bool needsFlip = (current != requested) && isReverseStrand;
+    if (!needsFlip) return;
+
+    std::reverse(data->begin(), data->end());
+}
+
+}  // namespace
+
+// Definition of the static class constant.
+// NOTE(review): presumably the scale factor between stored integer values and
+// photon counts - confirm against its uses.
+const float BamRecord::photonFactor = 10.0;
+
+// Creates an empty record; both cached aligned positions start out unmapped.
+BamRecord::BamRecord()
+    : alignedStart_{PacBio::BAM::UnmappedPosition}
+    , alignedEnd_{PacBio::BAM::UnmappedPosition}
+{
+}
+
+// Creates an empty record associated with 'header'; both cached aligned
+// positions start out unmapped.
+BamRecord::BamRecord(BamHeader header)
+    : header_{std::move(header)}
+    , alignedStart_{PacBio::BAM::UnmappedPosition}
+    , alignedEnd_{PacBio::BAM::UnmappedPosition}
+{}
+
+// Creates a record around an existing BamRecordImpl; both cached aligned
+// positions start out unmapped.
+BamRecord::BamRecord(BamRecordImpl impl)
+    : impl_{std::move(impl)}
+    , alignedStart_{PacBio::BAM::UnmappedPosition}
+    , alignedEnd_{PacBio::BAM::UnmappedPosition}
+{}
+
+// Copies the record data, header and cached aligned positions of 'other'.
+// The pulse-to-base cache is not copied; the new record starts without one.
+BamRecord::BamRecord(const BamRecord& other)
+    : impl_{other.impl_}
+    , header_{other.header_}
+    , alignedStart_{other.alignedStart_}
+    , alignedEnd_{other.alignedEnd_}
+{}
+
+// Move construction is the compiler-generated memberwise move, declared non-throwing.
+BamRecord::BamRecord(BamRecord&&) noexcept = default;
+
+// Copy-assigns the record data, header and cached aligned positions of
+// 'other'. This record's pulse-to-base cache is dropped rather than copied.
+// Self-assignment is a no-op.
+BamRecord& BamRecord::operator=(const BamRecord& other)
+{
+    if (this == &other) return *this;
+
+    impl_ = other.impl_;
+    header_ = other.header_;
+    alignedStart_ = other.alignedStart_;
+    alignedEnd_ = other.alignedEnd_;
+
+    // just reset, for now at least
+    p2bCache_.reset();
+    return *this;
+}
+
+// Move assignment is the compiler-generated memberwise move, declared non-throwing.
+BamRecord& BamRecord::operator=(BamRecord&&) noexcept = default;
+
+// Defaulted destructor, defined out of line in this source file.
+BamRecord::~BamRecord() = default;
+
+// Returns the cached aligned end, computing the aligned positions first if the
+// cached value is still UnmappedPosition.
+Position BamRecord::AlignedEnd() const
+{
+    const bool needsCalculation = (alignedEnd_ == PacBio::BAM::UnmappedPosition);
+    if (needsCalculation) {
+        CalculateAlignedPositions();
+    }
+    return alignedEnd_;
+}
+
+// Returns the cached aligned start, computing the aligned positions first if
+// the cached value is still UnmappedPosition.
+Position BamRecord::AlignedStart() const
+{
+    const bool needsCalculation = (alignedStart_ == PacBio::BAM::UnmappedPosition);
+    if (needsCalculation) {
+        CalculateAlignedPositions();
+    }
+    return alignedStart_;
+}
+
+// Returns Strand::REVERSE if the record is flagged as reverse strand,
+// Strand::FORWARD otherwise.
+Strand BamRecord::AlignedStrand() const
+{
+    if (impl_.IsReverseStrand()) return Strand::REVERSE;
+    return Strand::FORWARD;
+}
+
+// Fetches the ALT_LABEL_QV tag as quality values; all options are forwarded to
+// FetchQualities().
+QualityValues BamRecord::AltLabelQV(Orientation orientation, bool aligned, bool exciseSoftClips,
+                                    PulseBehavior pulseBehavior) const
+{
+    const auto tag = BamRecordTag::ALT_LABEL_QV;
+    return FetchQualities(tag, orientation, aligned, exciseSoftClips, pulseBehavior);
+}
+
+// Sets the ALT_LABEL_QV tag to the FASTQ string of 'altLabelQVs', creating the
+// tag if the record does not have it yet.
+BamRecord& BamRecord::AltLabelQV(const QualityValues& altLabelQVs)
+{
+    const auto tag = BamRecordTag::ALT_LABEL_QV;
+    CreateOrEdit(tag, altLabelQVs.Fastq(), &impl_);
+    return *this;
+}
+
+// Fetches the ALT_LABEL_TAG tag as a base string; all options are forwarded to
+// FetchBases().
+std::string BamRecord::AltLabelTag(Orientation orientation, bool aligned, bool exciseSoftClips,
+                                   PulseBehavior pulseBehavior) const
+{
+    const auto tag = BamRecordTag::ALT_LABEL_TAG;
+    return FetchBases(tag, orientation, aligned, exciseSoftClips, pulseBehavior);
+}
+
+// Sets the ALT_LABEL_TAG tag to 'tags', creating the tag if the record does
+// not have it yet.
+BamRecord& BamRecord::AltLabelTag(const std::string& tags)
+{
+    const auto tag = BamRecordTag::ALT_LABEL_TAG;
+    CreateOrEdit(tag, tags, &impl_);
+    return *this;
+}
+
+// Returns the first element of Barcodes(); throws whatever Barcodes() throws.
+int16_t BamRecord::BarcodeForward() const
+{
+    const auto barcodePair = Barcodes();
+    return barcodePair.first;
+}
+
+// Returns the second element of Barcodes(); throws whatever Barcodes() throws.
+int16_t BamRecord::BarcodeReverse() const
+{
+    const auto barcodePair = Barcodes();
+    return barcodePair.second;
+}
+
+// Returns the barcode quality tag's value, or 0 if the tag is missing.
+uint8_t BamRecord::BarcodeQuality() const
+{
+    const auto label = BamRecordTags::LabelFor(BamRecordTag::BARCODE_QUALITY);
+    const auto value = impl_.TagValue(label);
+
+    // ?? "missing" value for tags ?? should we consider boost::optional<T> for these kind of guys ??
+    if (value.IsNull()) {
+        return 0;
+    }
+    return value.ToUInt8();
+}
+
+// Stores 'quality' in the BARCODE_QUALITY tag, creating or overwriting it.
+// Returns *this to allow chaining.
+BamRecord& BamRecord::BarcodeQuality(const uint8_t quality)
+{
+    const auto tag = BamRecordTag::BARCODE_QUALITY;
+    CreateOrEdit(tag, quality, &impl_);
+    return *this;
+}
+
+std::pair<int16_t, int16_t> BamRecord::Barcodes() const
+{
+    const auto tagName = BamRecordTags::LabelFor(BamRecordTag::BARCODES);
+    const Tag bc = impl_.TagValue(tagName);
+    if (bc.IsNull())
+        throw std::runtime_error{"BamRecord: barcode tag (bc) was requested but is missing"};
+
+    // NOTE: barcodes are still stored, per the spec, as uint16, even though
+    // we're now using them as int16_t in the API (bug 31511)
+    //
+    if (!bc.IsUInt16Array())
+        throw std::runtime_error{
+            "BamRecord: barcode tag (bc) is malformed: should be a uint16_t array of size==2."};
+    const auto bcArray = bc.ToUInt16Array();
+    if (bcArray.size() != 2)
+        throw std::runtime_error{
+            "BamRecord: barcode tag (bc) is malformed: should be a uint16_t array of size==2."};
+
+    return {boost::numeric_cast<int16_t>(bcArray[0]), boost::numeric_cast<int16_t>(bcArray[1])};
+}
+
+// Stores the barcode pair in the BARCODES tag as a two-element uint16_t array, creating or
+// overwriting the tag. The conversion from int16_t is range-checked by boost::numeric_cast.
+// Returns *this to allow chaining.
+BamRecord& BamRecord::Barcodes(const std::pair<int16_t, int16_t>& barcodeIds)
+{
+    const auto first = boost::numeric_cast<uint16_t>(barcodeIds.first);
+    const auto second = boost::numeric_cast<uint16_t>(barcodeIds.second);
+    const std::vector<uint16_t> data{first, second};
+    CreateOrEdit(BamRecordTag::BARCODES, data, &impl_);
+    return *this;
+}
+
+void BamRecord::CalculateAlignedPositions() const
+{
+    // reset
+    ResetCachedPositions();
+
+    // skip if unmapped, or has no queryStart/End
+    if (!impl_.IsMapped()) return;
+
+    // get the query start/end
+    const auto seqLength = static_cast<int>(impl_.SequenceLength());
+    const bool isCcsOrTranscript = IsCcsOrTranscript(Type());
+    const Position qStart = isCcsOrTranscript ? 0 : QueryStart();
+    const Position qEnd = isCcsOrTranscript ? seqLength : QueryEnd();
+
+    if (qStart == PacBio::BAM::UnmappedPosition || qEnd == PacBio::BAM::UnmappedPosition) return;
+
+    // determine clipped end ranges
+    const auto alignedOffsets = AlignedOffsets(*this, seqLength);
+    const auto startOffset = alignedOffsets.first;
+    const auto endOffset = alignedOffsets.second;
+    if (endOffset == -1 || startOffset == -1) return;  // TODO: handle error more??
+
+    // store aligned positions (polymerase read coordinates)
+    if (impl_.IsReverseStrand()) {
+        alignedStart_ = qStart + (seqLength - endOffset);
+        alignedEnd_ = qEnd - startOffset;
+    } else {
+        alignedStart_ = qStart + startOffset;
+        alignedEnd_ = qEnd - (seqLength - endOffset);
+    }
+}
+
+// Builds the pulse-to-base cache from the PULSE_CALL tag unless it already exists.
+// Throws std::runtime_error if the cache is needed but the record has no pulse calls.
+void BamRecord::CalculatePulse2BaseCache() const
+{
+    if (!p2bCache_) {
+        if (!HasPulseCall()) {
+            throw std::runtime_error{
+                "BamRecord: cannot calculate pulse2base mapping without 'pc' tag."};
+        }
+
+        // all pulses, native orientation, no CIGAR-based clipping or gapping
+        const auto allPulses = FetchBases(BamRecordTag::PULSE_CALL, Orientation::NATIVE, false,
+                                          false, PulseBehavior::ALL);
+        p2bCache_ = std::make_unique<Pulse2BaseCache>(allPulses);
+    }
+}
+
+Cigar BamRecord::CigarData(bool exciseAllClips) const
+{
+    auto isClippingOp = [](const CigarOperation& op) {
+        const auto type = op.Type();
+        return type == CigarOperationType::SOFT_CLIP || type == CigarOperationType::HARD_CLIP;
+    };
+
+    auto cigar = impl_.CigarData();
+    if (exciseAllClips) {
+        cigar.erase(std::remove_if(cigar.begin(), cigar.end(), isClippingOp), cigar.end());
+    }
+    return cigar;
+}
+
+BamRecord& BamRecord::Clip(const ClipType clipType, const Position start, const Position end,
+                           const bool exciseFlankingInserts)
+{
+    switch (clipType) {
+        case ClipType::CLIP_NONE:
+            return *this;
+        case ClipType::CLIP_TO_QUERY:
+            // exciseFlankingInserts ignored, just clipping to query coordinates
+            return ClipToQuery(start, end);
+        case ClipType::CLIP_TO_REFERENCE:
+            return ClipToReference(start, end, exciseFlankingInserts);
+        default:
+            throw std::runtime_error{"BamRecord: unsupported clip type requested"};
+    }
+}
+
+// Static convenience: returns a clipped copy of 'input', leaving 'input' itself unmodified.
+BamRecord BamRecord::Clipped(const BamRecord& input, const ClipType clipType,
+                             const PacBio::BAM::Position start, const PacBio::BAM::Position end,
+                             const bool exciseFlankingInserts)
+{
+    BamRecord result(input);
+    result.Clip(clipType, start, end, exciseFlankingInserts);
+    return result;
+}
+
+// Returns a clipped copy of this record; this record itself is left unmodified.
+BamRecord BamRecord::Clipped(const ClipType clipType, const PacBio::BAM::Position start,
+                             const PacBio::BAM::Position end,
+                             const bool exciseFlankingInserts) const
+{
+    BamRecord clippedCopy(*this);
+    clippedCopy.Clip(clipType, start, end, exciseFlankingInserts);
+    return clippedCopy;
+}
+
+void BamRecord::ClipTags(const size_t clipFrom, const size_t clipLength)
+{
+    const auto ipdCodec = ReadGroup().IpdCodec();
+    const auto pwCodec = ReadGroup().PulseWidthCodec();
+
+    // update BAM tags
+    TagCollection tags = impl_.Tags();
+    if (HasDeletionQV())
+        tags[Label(BamRecordTag::DELETION_QV)] =
+            ClipSeqQV(DeletionQV(Orientation::NATIVE), clipFrom, clipLength).Fastq();
+    if (HasInsertionQV())
+        tags[Label(BamRecordTag::INSERTION_QV)] =
+            ClipSeqQV(InsertionQV(Orientation::NATIVE), clipFrom, clipLength).Fastq();
+    if (HasMergeQV())
+        tags[Label(BamRecordTag::MERGE_QV)] =
+            ClipSeqQV(MergeQV(Orientation::NATIVE), clipFrom, clipLength).Fastq();
+    if (HasSubstitutionQV())
+        tags[Label(BamRecordTag::SUBSTITUTION_QV)] =
+            ClipSeqQV(SubstitutionQV(Orientation::NATIVE), clipFrom, clipLength).Fastq();
+    if (HasIPD()) {
+        if (ipdCodec == FrameCodec::RAW)
+            tags[Label(BamRecordTag::IPD)] =
+                ClipSeqQV(IPD(Orientation::NATIVE).Data(), clipFrom, clipLength);
+        else if (ipdCodec == FrameCodec::V1)
+            tags[Label(BamRecordTag::IPD)] =
+                ClipSeqQV(IPD(Orientation::NATIVE).Encode(), clipFrom, clipLength);
+    }
+    if (HasPulseWidth()) {
+        if (pwCodec == FrameCodec::RAW)
+            tags[Label(BamRecordTag::PULSE_WIDTH)] =
+                ClipSeqQV(PulseWidth(Orientation::NATIVE).Data(), clipFrom, clipLength);
+        else if (pwCodec == FrameCodec::V1)
+            tags[Label(BamRecordTag::PULSE_WIDTH)] =
+                ClipSeqQV(PulseWidth(Orientation::NATIVE).Encode(), clipFrom, clipLength);
+    }
+    if (HasDeletionTag())
+        tags[Label(BamRecordTag::DELETION_TAG)] =
+            ClipSeqQV(DeletionTag(Orientation::NATIVE), clipFrom, clipLength);
+    if (HasSubstitutionTag())
+        tags[Label(BamRecordTag::SUBSTITUTION_TAG)] =
+            ClipSeqQV(SubstitutionTag(Orientation::NATIVE), clipFrom, clipLength);
+
+    // internal BAM tags
+    if (HasPulseCall()) {
+
+        // ensure p2bCache initialized
+        CalculatePulse2BaseCache();
+        Pulse2BaseCache* p2bCache = p2bCache_.get();
+
+        if (HasAltLabelQV())
+            tags[Label(BamRecordTag::ALT_LABEL_QV)] =
+                ClipPulse(AltLabelQV(Orientation::NATIVE), p2bCache, clipFrom, clipLength).Fastq();
+        if (HasLabelQV())
+            tags[Label(BamRecordTag::LABEL_QV)] =
+                ClipPulse(LabelQV(Orientation::NATIVE), p2bCache, clipFrom, clipLength).Fastq();
+        if (HasPulseMergeQV())
+            tags[Label(BamRecordTag::PULSE_MERGE_QV)] =
+                ClipPulse(PulseMergeQV(Orientation::NATIVE), p2bCache, clipFrom, clipLength)
+                    .Fastq();
+        if (HasAltLabelTag())
+            tags[Label(BamRecordTag::ALT_LABEL_TAG)] =
+                ClipPulse(AltLabelTag(Orientation::NATIVE), p2bCache, clipFrom, clipLength);
+        if (HasPulseCall())
+            tags[Label(BamRecordTag::PULSE_CALL)] =
+                ClipPulse(PulseCall(Orientation::NATIVE), p2bCache, clipFrom, clipLength);
+        if (HasPkmean())
+            tags[Label(BamRecordTag::PKMEAN)] = EncodePhotons(
+                ClipPulse(Pkmean(Orientation::NATIVE), p2bCache, clipFrom, clipLength));
+        if (HasPkmid())
+            tags[Label(BamRecordTag::PKMID)] = EncodePhotons(
+                ClipPulse(Pkmid(Orientation::NATIVE), p2bCache, clipFrom, clipLength));
+        if (HasPkmean2())
+            tags[Label(BamRecordTag::PKMEAN_2)] = EncodePhotons(
+                ClipPulse(Pkmean2(Orientation::NATIVE), p2bCache, clipFrom, clipLength));
+        if (HasPkmid2())
+            tags[Label(BamRecordTag::PKMID_2)] = EncodePhotons(
+                ClipPulse(Pkmid2(Orientation::NATIVE), p2bCache, clipFrom, clipLength));
+        if (HasPrePulseFrames())
+            tags[Label(BamRecordTag::PRE_PULSE_FRAMES)] = ClipPulse(
+                PrePulseFrames(Orientation::NATIVE).Data(), p2bCache, clipFrom, clipLength);
+        if (HasPulseCallWidth())
+            tags[Label(BamRecordTag::PULSE_CALL_WIDTH)] = ClipPulse(
+                PulseCallWidth(Orientation::NATIVE).Data(), p2bCache, clipFrom, clipLength);
+        if (HasStartFrame())
+            tags[Label(BamRecordTag::START_FRAME)] =
+                ClipPulse(StartFrame(Orientation::NATIVE), p2bCache, clipFrom, clipLength);
+    }
+
+    impl_.Tags(tags);
+}
+
+// Clips SEQ, QUAL and all tags to the window described by 'clipFrom' / 'clipLength'.
+//
+// Sequence and qualities are clipped in native orientation; for reverse-strand records they are
+// flipped back (reverse-complement / reverse) before being stored on the record.
+void BamRecord::ClipFields(const size_t clipFrom, const size_t clipLength)
+{
+    const bool isReverseStrand = (AlignedStrand() != Strand::FORWARD);
+
+    // clip in native orientation
+    std::string clippedSeq{ClipSeqQV(Sequence(Orientation::NATIVE), clipFrom, clipLength)};
+    QualityValues clippedQuals{ClipSeqQV(Qualities(Orientation::NATIVE), clipFrom, clipLength)};
+
+    // restore the orientation in which the record stores SEQ/QUAL
+    if (isReverseStrand) {
+        ReverseComplement(clippedSeq);
+        Reverse(clippedQuals);
+    }
+    impl_.SetSequenceAndQualities(clippedSeq, clippedQuals.Fastq());
+
+    // tags are clipped after SEQ/QUAL have been replaced
+    ClipTags(clipFrom, clipLength);
+}
+
+BamRecord& BamRecord::ClipToQuery(const Position start, const Position end)
+{
+    // cache original coords, skip out if clip not needed
+    const size_t seqLength = impl_.SequenceLength();
+    const bool isCcsOrTranscript = IsCcsOrTranscript(Type());
+    const Position origQStart = isCcsOrTranscript ? 0 : QueryStart();
+    const Position origQEnd = isCcsOrTranscript ? seqLength : QueryEnd();
+    if (start <= origQStart && end >= origQEnd) return *this;
+
+    // calculate clipping
+    Data::ClipToQueryConfig clipConfig{
+        impl_.SequenceLength(), origQStart,      origQEnd,          start,           end,
+        impl_.Position(),       AlignedStrand(), impl_.CigarData(), impl_.IsMapped()};
+    auto result = Data::ClipToQuery(clipConfig);
+
+    // update alignment info
+    if (IsMapped()) {
+        impl_.CigarData(std::move(result.cigar_));
+        impl_.Position(result.refPos_);
+    }
+
+    // clip SEQ, QUAL, tags
+    const auto clipFrom = result.clipOffset_;
+    const auto clipLength = (end - start);
+    ClipFields(clipFrom, clipLength);
+
+    // update query start/end
+    // TODO: update name to reflect new QS/QE ???
+    CreateOrEdit(BamRecordTag::QUERY_START, start, &impl_);
+    CreateOrEdit(BamRecordTag::QUERY_END, end, &impl_);
+
+    // reset any cached aligned start/end
+    ResetCachedPositions();
+    return *this;
+}
+
+BamRecord& BamRecord::ClipToReference(const Position start, const Position end,
+                                      const bool exciseFlankingInserts)
+{
+    // skip if not mapped, clipping to reference doesn't make sense
+    // or should we even consider throwing here?
+    if (!IsMapped()) return *this;
+
+    // cache original coords
+    const int seqLength = static_cast<int>(impl_.SequenceLength());
+    const bool isCcsOrTranscript = IsCcsOrTranscript(Type());
+    const Position origQStart = isCcsOrTranscript ? 0 : QueryStart();
+    const Position origQEnd = isCcsOrTranscript ? seqLength : QueryEnd();
+    const Position origTStart = ReferenceStart();
+    const Position origTEnd = ReferenceEnd();
+
+    // skip if already within requested clip range
+    if (start <= origTStart && end >= origTEnd) return *this;
+    assert(AlignedStart() >= origQStart);
+    assert(AlignedEnd() <= origQEnd);
+
+    // calculate clipping
+    Data::ClipToReferenceConfig clipConfig{
+        Data::ClipToQueryConfig{impl_.SequenceLength(), origQStart, origQEnd, start, end,
+                                impl_.Position(), AlignedStrand(), impl_.CigarData(),
+                                impl_.IsMapped()},
+        ReferenceEnd(), start, end, exciseFlankingInserts};
+    auto result = Data::ClipToReference(clipConfig);
+
+    // update CIGAR and position
+    impl_.CigarData(std::move(result.cigar_));
+    impl_.Position(result.refPos_);
+
+    // clip SEQ, QUAL, tags
+    const Position qStart = result.qStart_;
+    const Position qEnd = result.qEnd_;
+    const size_t clipFrom = result.clipOffset_;
+    const size_t clipLength = qEnd - qStart;
+    ClipFields(clipFrom, clipLength);
+
+    // update query start/end
+    CreateOrEdit(BamRecordTag::QUERY_START, qStart, &impl_);
+    CreateOrEdit(BamRecordTag::QUERY_END, qEnd, &impl_);
+
+    // reset any cached aligned start/end
+    ResetCachedPositions();
+    return *this;
+}
+
+// Fetches the DELETION_QV tag as quality values; all options are forwarded to FetchQualities.
+QualityValues BamRecord::DeletionQV(Orientation orientation, bool aligned,
+                                    bool exciseSoftClips) const
+{
+    const auto tag = BamRecordTag::DELETION_QV;
+    return FetchQualities(tag, orientation, aligned, exciseSoftClips);
+}
+
+// Stores 'deletionQVs' (FASTQ-encoded) in the DELETION_QV tag, creating or overwriting it.
+// Returns *this to allow chaining.
+BamRecord& BamRecord::DeletionQV(const QualityValues& deletionQVs)
+{
+    const auto tag = BamRecordTag::DELETION_QV;
+    CreateOrEdit(tag, deletionQVs.Fastq(), &impl_);
+    return *this;
+}
+
+// Fetches the DELETION_TAG tag as a string; all options are forwarded to FetchBases.
+std::string BamRecord::DeletionTag(Orientation orientation, bool aligned,
+                                   bool exciseSoftClips) const
+{
+    const auto tag = BamRecordTag::DELETION_TAG;
+    return FetchBases(tag, orientation, aligned, exciseSoftClips);
+}
+
+// Stores 'tags' in the DELETION_TAG tag, creating or overwriting it.
+// Returns *this to allow chaining.
+BamRecord& BamRecord::DeletionTag(const std::string& tags)
+{
+    const auto tag = BamRecordTag::DELETION_TAG;
+    CreateOrEdit(tag, tags, &impl_);
+    return *this;
+}
+
+// Converts photon values to their stored uint16_t form: each value is multiplied by
+// photonFactor and converted to uint16_t (inverse of the division in FetchPhotonsRaw).
+std::vector<uint16_t> BamRecord::EncodePhotons(const std::vector<float>& data)
+{
+    const auto count = data.size();
+
+    std::vector<uint16_t> encoded;
+    encoded.reserve(count);
+    for (size_t i = 0; i < count; ++i) {
+        encoded.emplace_back(data[i] * photonFactor);
+    }
+    return encoded;
+}
+
+// Returns the value of 'tag' as a string, exactly as stored (no orientation or clipping logic).
+std::string BamRecord::FetchBasesRaw(const BamRecordTag tag) const
+{
+    const Tag rawTag = impl_.TagValue(tag);
+    auto bases = rawTag.ToString();
+    return bases;
+}
+
+std::string BamRecord::FetchBases(const BamRecordTag tag, const Orientation orientation,
+                                  const bool aligned, const bool exciseSoftClips,
+                                  const PulseBehavior pulseBehavior) const
+{
+    const bool isBamSeq = (tag == BamRecordTag::SEQ);
+    const bool isPulse = BamRecordTags::IsPulse(tag);
+
+    // fetch raw
+    std::string bases;
+    Orientation current;
+    if (isBamSeq) {  // SEQ stored in genomic orientation
+        bases = impl_.Sequence();
+        current = Orientation::GENOMIC;
+    } else {  // all tags stored in native orientation
+        bases = FetchBasesRaw(tag);
+        current = Orientation::NATIVE;
+    }
+
+    // maybe strip 'squashed' pulse loci
+    if (isPulse && pulseBehavior == PulseBehavior::BASECALLS_ONLY) {
+        CalculatePulse2BaseCache();
+        bases = p2bCache_->RemoveSquashedPulses(bases);
+    }
+
+    // if we need to touch CIGAR
+    if (aligned || exciseSoftClips) {
+
+        if (isPulse && pulseBehavior != PulseBehavior::BASECALLS_ONLY)
+            throw std::runtime_error{
+                "BamRecord: cannot return data at all pulses when gapping and/or soft-clipping are "
+                "requested. "
+                "Use PulseBehavior::BASECALLS_ONLY instead."};
+
+        // force into genomic orientation
+        OrientBasesAsRequested(&bases, current, Orientation::GENOMIC, impl_.IsReverseStrand(),
+                               isPulse);
+        current = Orientation::GENOMIC;
+
+        // clip & gapify as requested
+        ClipAndGapifyBases(impl_, aligned, exciseSoftClips, &bases);
+    }
+
+    // return in the orientation requested
+    OrientBasesAsRequested(&bases, current, orientation, impl_.IsReverseStrand(), isPulse);
+    return bases;
+}
+
+// Returns the frame data stored in 'tag'.
+//
+//   - null tag        -> empty Frames
+//   - uint8_t array   -> lossy frame codes, decoded via Frames::Decode
+//   - otherwise       -> expected to be a uint16_t array of lossless frame data
+Frames BamRecord::FetchFramesRaw(const BamRecordTag tag) const
+{
+    const Tag frameTag = impl_.TagValue(tag);
+    if (frameTag.IsNull()) return {};  // throw ?
+
+    // lossy frame codes
+    const bool isLossy = frameTag.IsUInt8Array();
+    if (isLossy) {
+        const auto lossyCodes = frameTag.ToUInt8Array();
+        return Frames::Decode(lossyCodes);
+    }
+
+    // lossless frame data
+    assert(frameTag.IsUInt16Array());
+    return Frames{frameTag.ToUInt16Array()};
+}
+
+Frames BamRecord::FetchFrames(const BamRecordTag tag, const Orientation orientation,
+                              const bool aligned, const bool exciseSoftClips,
+                              const PulseBehavior pulseBehavior) const
+{
+    const bool isPulse = BamRecordTags::IsPulse(tag);
+
+    // fetch raw
+    Frames frames = FetchFramesRaw(tag);
+    Orientation current = Orientation::NATIVE;
+
+    // maybe strip 'squashed' pulse loci
+    if (isPulse && pulseBehavior == PulseBehavior::BASECALLS_ONLY) {
+        CalculatePulse2BaseCache();
+        frames.DataRaw() = p2bCache_->RemoveSquashedPulses(frames.Data());
+    }
+
+    // if we need to touch the CIGAR
+    if (aligned || exciseSoftClips) {
+
+        if (isPulse && pulseBehavior != PulseBehavior::BASECALLS_ONLY)
+            throw std::runtime_error{
+                "BamRecord: cannot return data at all pulses when gapping and/or soft-clipping are "
+                "requested. "
+                "Use PulseBehavior::BASECALLS_ONLY instead."};
+
+        // force into genomic orientation
+        OrientTagDataAsRequested(&frames, current, Orientation::GENOMIC, impl_.IsReverseStrand());
+        current = Orientation::GENOMIC;
+
+        // clip & gapify as requested
+        ClipAndGapifyFrames(impl_, aligned, exciseSoftClips, &frames);
+    }
+
+    // return in the orientation requested
+    OrientTagDataAsRequested(&frames, current, orientation, impl_.IsReverseStrand());
+    return frames;
+}
+
+std::vector<float> BamRecord::FetchPhotonsRaw(const BamRecordTag tag) const
+{
+    const Tag frameTag = impl_.TagValue(tag);
+    if (frameTag.IsNull()) return {};
+    if (!frameTag.IsUInt16Array())
+        throw std::runtime_error{"BamRecord: photons are not a uint16_t array, tag " +
+                                 BamRecordTags::LabelFor(tag)};
+
+    const auto data = frameTag.ToUInt16Array();
+    std::vector<float> photons;
+    photons.reserve(data.size());
+    for (const auto& d : data)
+        photons.emplace_back(d / photonFactor);
+    return photons;
+}
+
+std::vector<float> BamRecord::FetchPhotons(const BamRecordTag tag, const Orientation orientation,
+                                           const bool aligned, const bool exciseSoftClips,
+                                           const PulseBehavior pulseBehavior) const
+{
+    const bool isPulse = BamRecordTags::IsPulse(tag);
+
+    // fetch raw
+    auto data = FetchPhotonsRaw(tag);
+    Orientation current = Orientation::NATIVE;
+
+    if (isPulse && pulseBehavior == PulseBehavior::BASECALLS_ONLY) {
+        // strip 'squashed' pulse loci
+        CalculatePulse2BaseCache();
+        data = p2bCache_->RemoveSquashedPulses(data);
+    }
+
+    if (aligned || exciseSoftClips) {
+
+        if (isPulse && pulseBehavior != PulseBehavior::BASECALLS_ONLY)
+            throw std::runtime_error{
+                "BamRecord: cannot return data at all pulses when gapping and/or soft-clipping are "
+                "requested. "
+                "Use PulseBehavior::BASECALLS_ONLY instead."};
+
+        // force into genomic orientation
+        OrientTagDataAsRequested(&data, current, Orientation::GENOMIC, impl_.IsReverseStrand());
+        current = Orientation::GENOMIC;
+
+        // clip & gapify as requested
+        ClipAndGapifyPhotons(impl_, aligned, exciseSoftClips, &data);
+    }
+
+    // return in the orientation requested
+    OrientTagDataAsRequested(&data, current, orientation, impl_.IsReverseStrand());
+    return data;
+}
+
+// Returns the value of 'tag' parsed from its FASTQ string form (no orientation or clipping
+// logic).
+QualityValues BamRecord::FetchQualitiesRaw(const BamRecordTag tag) const
+{
+    const Tag rawTag = impl_.TagValue(tag);
+    const auto fastq = rawTag.ToString();
+    return QualityValues::FromFastq(fastq);
+}
+
+QualityValues BamRecord::FetchQualities(const BamRecordTag tag, const Orientation orientation,
+                                        const bool aligned, const bool exciseSoftClips,
+                                        const PulseBehavior pulseBehavior) const
+{
+    // requested data info
+    const bool isBamQual = (tag == BamRecordTag::QUAL);
+    const bool isPulse = BamRecordTags::IsPulse(tag);
+
+    // fetch raw
+    QualityValues quals;
+    Orientation current;
+    if (isBamQual) {  // QUAL stored in genomic orientation
+        quals = impl_.Qualities();
+        current = Orientation::GENOMIC;
+    } else {  // all tags stored in native orientation
+        quals = FetchQualitiesRaw(tag);
+        current = Orientation::NATIVE;
+    }
+
+    if (isPulse && pulseBehavior == PulseBehavior::BASECALLS_ONLY) {
+        // strip 'squashed' pulse loci
+        CalculatePulse2BaseCache();
+        quals = p2bCache_->RemoveSquashedPulses(quals);
+    }
+
+    // if we need to touch CIGAR
+    if (aligned || exciseSoftClips) {
+
+        if (isPulse && pulseBehavior != PulseBehavior::BASECALLS_ONLY)
+            throw std::runtime_error{
+                "BamRecord: cannot return data at all pulses when gapping and/or soft-clipping are "
+                "requested. "
+                "Use PulseBehavior::BASECALLS_ONLY instead."};
+
+        // force into genomic orientation
+        OrientTagDataAsRequested(&quals, current, Orientation::GENOMIC, impl_.IsReverseStrand());
+        current = Orientation::GENOMIC;
+
+        // clip & gapify as requested
+        ClipAndGapifyQualities(impl_, aligned, exciseSoftClips, &quals);
+    }
+
+    // return in the orientation requested
+    OrientTagDataAsRequested(&quals, current, orientation, impl_.IsReverseStrand());
+    return quals;
+}
+
+// Returns the uint32_t array stored in 'tag'. A null tag yields an empty vector; a tag of any
+// other type throws std::runtime_error.
+std::vector<uint32_t> BamRecord::FetchUInt32sRaw(const BamRecordTag tag) const
+{
+    const Tag arrayTag = impl_.TagValue(tag);
+    if (arrayTag.IsNull()) return {};
+
+    if (arrayTag.IsUInt32Array()) {
+        return arrayTag.ToUInt32Array();
+    }
+    throw std::runtime_error{"BamRecord: tag data are not a uint32_t array, tag " +
+                             BamRecordTags::LabelFor(tag)};
+}
+
+std::vector<uint32_t> BamRecord::FetchUInt32s(const BamRecordTag tag, const Orientation orientation,
+                                              const bool aligned, const bool exciseSoftClips,
+                                              const PulseBehavior pulseBehavior) const
+{
+    const bool isPulse = BamRecordTags::IsPulse(tag);
+
+    // fetch raw
+    auto arr = FetchUInt32sRaw(tag);
+    Orientation current = Orientation::NATIVE;
+
+    if (isPulse && pulseBehavior == PulseBehavior::BASECALLS_ONLY) {
+        // strip 'squashed' pulse loci
+        CalculatePulse2BaseCache();
+        arr = p2bCache_->RemoveSquashedPulses(arr);
+    }
+
+    if (aligned || exciseSoftClips) {
+
+        if (isPulse && pulseBehavior != PulseBehavior::BASECALLS_ONLY)
+            throw std::runtime_error{
+                "BamRecord: cannot return data at all pulses when gapping and/or soft-clipping are "
+                "requested. "
+                "Use PulseBehavior::BASECALLS_ONLY instead."};
+
+        // force into genomic orientation
+        OrientTagDataAsRequested(&arr, current, Orientation::GENOMIC, impl_.IsReverseStrand());
+        current = Orientation::GENOMIC;
+
+        // clip & gapify as requested
+        ClipAndGapifyUInts(impl_, aligned, exciseSoftClips, &arr);
+    }
+
+    // return in the orientation requested
+    OrientTagDataAsRequested(&arr, current, orientation, impl_.IsReverseStrand());
+    return arr;
+}
+
+// Returns the uint8_t array stored in 'tag'. A null tag yields an empty vector; a tag of any
+// other type throws std::runtime_error.
+std::vector<uint8_t> BamRecord::FetchUInt8sRaw(const BamRecordTag tag) const
+{
+    const Tag arrayTag = impl_.TagValue(tag);
+    if (arrayTag.IsNull()) return {};
+
+    if (arrayTag.IsUInt8Array()) {
+        return arrayTag.ToUInt8Array();
+    }
+    throw std::runtime_error{"BamRecord: tag data are not a uint8_t array, tag " +
+                             BamRecordTags::LabelFor(tag)};
+}
+
+std::vector<uint8_t> BamRecord::FetchUInt8s(const BamRecordTag tag, const Orientation orientation,
+                                            const bool aligned, const bool exciseSoftClips,
+                                            const PulseBehavior pulseBehavior) const
+{
+    const bool isPulse = BamRecordTags::IsPulse(tag);
+
+    // fetch raw
+    auto arr = FetchUInt8sRaw(tag);
+    Orientation current = Orientation::NATIVE;
+
+    if (isPulse && pulseBehavior == PulseBehavior::BASECALLS_ONLY) {
+        // strip 'squashed' pulse loci
+        CalculatePulse2BaseCache();
+        arr = p2bCache_->RemoveSquashedPulses(arr);
+    }
+
+    if (aligned || exciseSoftClips) {
+
+        if (isPulse && pulseBehavior != PulseBehavior::BASECALLS_ONLY)
+            throw std::runtime_error{
+                "BamRecord: cannot return data at all pulses when gapping and/or soft-clipping are "
+                "requested. "
+                "Use PulseBehavior::BASECALLS_ONLY instead."};
+
+        // force into genomic orientation
+        OrientTagDataAsRequested(&arr, current, Orientation::GENOMIC, impl_.IsReverseStrand());
+        current = Orientation::GENOMIC;
+
+        // clip & gapify as requested
+        ClipAndGapifyUInt8s(impl_, aligned, exciseSoftClips, &arr);
+    }
+
+    // return in the orientation requested
+    OrientTagDataAsRequested(&arr, current, orientation, impl_.IsReverseStrand());
+    return arr;
+}
+
+// Returns the record's name as stored on the underlying record.
+std::string BamRecord::FullName() const
+{
+    return impl_.Name();
+}
+
+// True if the record has the ALT_LABEL_QV tag.
+bool BamRecord::HasAltLabelQV() const
+{
+    const auto tag = BamRecordTag::ALT_LABEL_QV;
+    return impl_.HasTag(tag);
+}
+
+// True if the record has the ALT_LABEL_TAG tag.
+bool BamRecord::HasAltLabelTag() const
+{
+    const auto tag = BamRecordTag::ALT_LABEL_TAG;
+    return impl_.HasTag(tag);
+}
+
+// True if the record has the BARCODES tag.
+bool BamRecord::HasBarcodes() const
+{
+    const auto tag = BamRecordTag::BARCODES;
+    return impl_.HasTag(tag);
+}
+
+// True if the record has the BARCODE_QUALITY tag.
+bool BamRecord::HasBarcodeQuality() const
+{
+    const auto tag = BamRecordTag::BARCODE_QUALITY;
+    return impl_.HasTag(tag);
+}
+
+// True if the record has the LABEL_QV tag.
+bool BamRecord::HasLabelQV() const
+{
+    const auto tag = BamRecordTag::LABEL_QV;
+    return impl_.HasTag(tag);
+}
+
+// True if the record has the DELETION_QV tag.
+bool BamRecord::HasDeletionQV() const
+{
+    const auto tag = BamRecordTag::DELETION_QV;
+    return impl_.HasTag(tag);
+}
+
+// True if the record has the DELETION_TAG tag.
+bool BamRecord::HasDeletionTag() const
+{
+    const auto tag = BamRecordTag::DELETION_TAG;
+    return impl_.HasTag(tag);
+}
+
+// True if the record has the HOLE_NUMBER tag and its value is not null.
+bool BamRecord::HasHoleNumber() const
+{
+    const auto tag = BamRecordTag::HOLE_NUMBER;
+    if (!impl_.HasTag(tag)) return false;
+    return !impl_.TagValue(tag).IsNull();
+}
+
+// True if the record has the INSERTION_QV tag.
+bool BamRecord::HasInsertionQV() const
+{
+    const auto tag = BamRecordTag::INSERTION_QV;
+    return impl_.HasTag(tag);
+}
+
+// True if the record has the NUM_PASSES tag.
+bool BamRecord::HasNumPasses() const
+{
+    const auto tag = BamRecordTag::NUM_PASSES;
+    return impl_.HasTag(tag);
+}
+
+// Same check as HasIPD(), under a different name.
+bool BamRecord::HasPreBaseFrames() const
+{
+    const bool hasIpd = HasIPD();
+    return hasIpd;
+}
+
+// True if the record has the IPD tag.
+bool BamRecord::HasIPD() const
+{
+    const auto tag = BamRecordTag::IPD;
+    return impl_.HasTag(tag);
+}
+
+// True if the record has the CONTEXT_FLAGS tag.
+bool BamRecord::HasLocalContextFlags() const
+{
+    const auto tag = BamRecordTag::CONTEXT_FLAGS;
+    return impl_.HasTag(tag);
+}
+
+// True if the record has the MERGE_QV tag.
+bool BamRecord::HasMergeQV() const
+{
+    const auto tag = BamRecordTag::MERGE_QV;
+    return impl_.HasTag(tag);
+}
+
+// True if the record has the PULSE_MERGE_QV tag.
+bool BamRecord::HasPulseMergeQV() const
+{
+    const auto tag = BamRecordTag::PULSE_MERGE_QV;
+    return impl_.HasTag(tag);
+}
+
+// True if the record has the PKMEAN tag.
+bool BamRecord::HasPkmean() const
+{
+    const auto tag = BamRecordTag::PKMEAN;
+    return impl_.HasTag(tag);
+}
+
+// True if the record has the PKMEAN_2 tag.
+bool BamRecord::HasPkmean2() const
+{
+    const auto tag = BamRecordTag::PKMEAN_2;
+    return impl_.HasTag(tag);
+}
+
+// True if the record has the PKMID tag.
+bool BamRecord::HasPkmid() const
+{
+    const auto tag = BamRecordTag::PKMID;
+    return impl_.HasTag(tag);
+}
+
+// True if the record has the PKMID_2 tag.
+bool BamRecord::HasPkmid2() const
+{
+    const auto tag = BamRecordTag::PKMID_2;
+    return impl_.HasTag(tag);
+}
+
+// True if the record has the PRE_PULSE_FRAMES tag.
+bool BamRecord::HasPrePulseFrames() const
+{
+    const auto tag = BamRecordTag::PRE_PULSE_FRAMES;
+    return impl_.HasTag(tag);
+}
+
+// True if the record has the PULSE_CALL tag and its value is not null.
+bool BamRecord::HasPulseCall() const
+{
+    const auto tag = BamRecordTag::PULSE_CALL;
+    if (!impl_.HasTag(tag)) return false;
+    return !impl_.TagValue(tag).IsNull();
+}
+
+// True if the record has the PULSE_EXCLUSION tag.
+bool BamRecord::HasPulseExclusion() const
+{
+    const auto tag = BamRecordTag::PULSE_EXCLUSION;
+    return impl_.HasTag(tag);
+}
+
+// True if the record has the PULSE_CALL_WIDTH tag.
+bool BamRecord::HasPulseCallWidth() const
+{
+    const auto tag = BamRecordTag::PULSE_CALL_WIDTH;
+    return impl_.HasTag(tag);
+}
+
+// True if the record has the PULSE_WIDTH tag.
+bool BamRecord::HasPulseWidth() const
+{
+    const auto tag = BamRecordTag::PULSE_WIDTH;
+    return impl_.HasTag(tag);
+}
+
+// True if the record has the QUERY_END tag.
+bool BamRecord::HasQueryEnd() const
+{
+    const auto tag = BamRecordTag::QUERY_END;
+    return impl_.HasTag(tag);
+}
+
+// True if the record has the QUERY_END_FRAME_NUMBER tag.
+bool BamRecord::HasQueryEndFrameNumber() const
+{
+    const auto tag = BamRecordTag::QUERY_END_FRAME_NUMBER;
+    return impl_.HasTag(tag);
+}
+
+// True if the record has the QUERY_START tag.
+bool BamRecord::HasQueryStart() const
+{
+    const auto tag = BamRecordTag::QUERY_START;
+    return impl_.HasTag(tag);
+}
+
+// True if the record has the QUERY_START_FRAME_NUMBER tag.
+bool BamRecord::HasQueryStartFrameNumber() const
+{
+    const auto tag = BamRecordTag::QUERY_START_FRAME_NUMBER;
+    return impl_.HasTag(tag);
+}
+
+// True if the record has the READ_ACCURACY tag and its value is not null.
+bool BamRecord::HasReadAccuracy() const
+{
+    const auto tag = BamRecordTag::READ_ACCURACY;
+    if (!impl_.HasTag(tag)) return false;
+    return !impl_.TagValue(tag).IsNull();
+}
+
+// True if the record has the SCRAP_REGION_TYPE tag and its value is not null.
+bool BamRecord::HasScrapRegionType() const
+{
+    const auto tag = BamRecordTag::SCRAP_REGION_TYPE;
+    if (!impl_.HasTag(tag)) return false;
+    return !impl_.TagValue(tag).IsNull();
+}
+
+// True if the record has the SCRAP_ZMW_TYPE tag and its value is not null.
+bool BamRecord::HasScrapZmwType() const
+{
+    const auto tag = BamRecordTag::SCRAP_ZMW_TYPE;
+    if (!impl_.HasTag(tag)) return false;
+    return !impl_.TagValue(tag).IsNull();
+}
+
+// True if the record has the START_FRAME tag.
+bool BamRecord::HasStartFrame() const
+{
+    const auto tag = BamRecordTag::START_FRAME;
+    return impl_.HasTag(tag);
+}
+
+// True if the record has the SIGNAL_TO_NOISE tag.
+bool BamRecord::HasSignalToNoise() const
+{
+    const auto tag = BamRecordTag::SIGNAL_TO_NOISE;
+    return impl_.HasTag(tag);
+}
+
+// True if the record has the SUBSTITUTION_QV tag.
+bool BamRecord::HasSubstitutionQV() const
+{
+    const auto tag = BamRecordTag::SUBSTITUTION_QV;
+    return impl_.HasTag(tag);
+}
+
+// True if the record has the SUBSTITUTION_TAG tag.
+bool BamRecord::HasSubstitutionTag() const
+{
+    const auto tag = BamRecordTag::SUBSTITUTION_TAG;
+    return impl_.HasTag(tag);
+}
+
+// Returns the header associated with this record (by value).
+BamHeader BamRecord::Header() const
+{
+    return header_;
+}
+
+// Returns the hole number from the HOLE_NUMBER tag; if that tag is null, the value is parsed
+// from the record's full name instead.
+int32_t BamRecord::HoleNumber() const
+{
+    const Tag zmTag = impl_.TagValue(BamRecordTag::HOLE_NUMBER);
+    if (zmTag.IsNull()) {
+        // missing zm tag - try to pull from name
+        return HoleNumberFromName(FullName());
+    }
+    return zmTag.ToInt32();
+}
+
+// Stores 'holeNumber' in the HOLE_NUMBER tag, creating or overwriting it.
+// Returns *this to allow chaining.
+BamRecord& BamRecord::HoleNumber(const int32_t holeNumber)
+{
+    const auto tag = BamRecordTag::HOLE_NUMBER;
+    CreateOrEdit(tag, holeNumber, &impl_);
+    return *this;
+}
+
+// Mutable access to the underlying record implementation.
+BamRecordImpl& BamRecord::Impl()
+{
+    return impl_;
+}
+
+// Read-only access to the underlying record implementation.
+const BamRecordImpl& BamRecord::Impl() const
+{
+    return impl_;
+}
+
+// Fetches the INSERTION_QV tag as quality values; all options are forwarded to FetchQualities.
+QualityValues BamRecord::InsertionQV(Orientation orientation, bool aligned,
+                                     bool exciseSoftClips) const
+{
+    const auto tag = BamRecordTag::INSERTION_QV;
+    return FetchQualities(tag, orientation, aligned, exciseSoftClips);
+}
+
+// Stores 'insertionQVs' (FASTQ-encoded) in the INSERTION_QV tag, creating or overwriting it.
+// Returns *this to allow chaining.
+BamRecord& BamRecord::InsertionQV(const QualityValues& insertionQVs)
+{
+    const auto tag = BamRecordTag::INSERTION_QV;
+    CreateOrEdit(tag, insertionQVs.Fastq(), &impl_);
+    return *this;
+}
+
+// Fetches the IPD tag as frame data; all options are forwarded to FetchFrames.
+Frames BamRecord::IPD(Orientation orientation, bool aligned, bool exciseSoftClips) const
+{
+    const auto tag = BamRecordTag::IPD;
+    return FetchFrames(tag, orientation, aligned, exciseSoftClips);
+}
+
+// Stores 'frames' in the IPD tag, creating or overwriting it. With LOSSY encoding the encoded
+// frame codes are stored; with any other encoding the frame data is stored as-is.
+// Returns *this to allow chaining.
+BamRecord& BamRecord::IPD(const Frames& frames, const FrameEncodingType encoding)
+{
+    const bool storeLossless = (encoding != FrameEncodingType::LOSSY);
+    if (storeLossless) {
+        CreateOrEdit(BamRecordTag::IPD, frames.Data(), &impl_);
+    } else {
+        CreateOrEdit(BamRecordTag::IPD, frames.Encode(), &impl_);
+    }
+    return *this;
+}
+
+Frames BamRecord::IPDRaw(Orientation orientation) const
+{
+    const auto tagName = BamRecordTags::LabelFor(BamRecordTag::IPD);
+    const Tag frameTag = impl_.TagValue(tagName);
+    if (frameTag.IsNull()) return {};
+
+    Frames frames;
+
+    // lossy frame codes
+    if (frameTag.IsUInt8Array()) {
+        const auto codes = frameTag.ToUInt8Array();
+        const std::vector<uint16_t> codes16(codes.begin(), codes.end());
+        frames.Data(std::move(codes16));
+    }
+
+    // lossless frame data
+    else {
+        assert(frameTag.IsUInt16Array());
+        frames.Data(frameTag.ToUInt16Array());
+    }
+
+    // return in requested orientation
+    OrientTagDataAsRequested(&frames,
+                             Orientation::NATIVE,  // current
+                             orientation,          // requested
+                             impl_.IsReverseStrand());
+    return frames;
+}
+
+// Forwards to BamRecordImpl::IsMapped().
+bool BamRecord::IsMapped() const
+{
+    return impl_.IsMapped();
+}
+
+QualityValues BamRecord::LabelQV(Orientation orientation, bool aligned, bool exciseSoftClips,
+                                 PulseBehavior pulseBehavior) const
+{
+    return FetchQualities(BamRecordTag::LABEL_QV, orientation, aligned, exciseSoftClips,
+                          pulseBehavior);
+}
+
+BamRecord& BamRecord::LabelQV(const QualityValues& labelQVs)
+{
+    CreateOrEdit(BamRecordTag::LABEL_QV, labelQVs.Fastq(), &impl_);
+    return *this;
+}
+
+// Reads the CONTEXT_FLAGS tag as a uint8 and reinterprets it as
+// PacBio::BAM::LocalContextFlags. No null-tag check is performed here.
+LocalContextFlags BamRecord::LocalContextFlags() const
+{
+    const Tag cxTag = impl_.TagValue(BamRecordTags::LabelFor(BamRecordTag::CONTEXT_FLAGS));
+    const auto rawFlags = cxTag.ToUInt8();
+    return static_cast<PacBio::BAM::LocalContextFlags>(rawFlags);
+}
+
+// Stores flags (as uint8) in the CONTEXT_FLAGS tag; returns *this.
+BamRecord& BamRecord::LocalContextFlags(const PacBio::BAM::LocalContextFlags flags)
+{
+    const auto tag = BamRecordTag::CONTEXT_FLAGS;
+    CreateOrEdit(tag, static_cast<uint8_t>(flags), &impl_);
+    return *this;
+}
+
+BamRecord& BamRecord::Map(const int32_t referenceId, const Position refStart, const Strand strand,
+                          const Cigar& cigar, const uint8_t mappingQuality)
+{
+    impl_.Position(refStart);
+    impl_.ReferenceId(referenceId);
+    impl_.CigarData(cigar);
+    impl_.MapQuality(mappingQuality);
+    impl_.SetMapped(true);
+
+    if (strand == Strand::FORWARD)
+        impl_.SetReverseStrand(false);
+
+    else {
+        assert(strand == Strand::REVERSE);
+        impl_.SetReverseStrand(true);
+
+        // switch seq & qual
+        std::string sequence = impl_.Sequence();
+        QualityValues qualities = impl_.Qualities();
+
+        ReverseComplement(sequence);
+        Reverse(qualities);
+
+        impl_.SetSequenceAndQualities(sequence, qualities.Fastq());
+    }
+
+    // reset any cached aligned start/end
+    alignedStart_ = PacBio::BAM::UnmappedPosition;
+    alignedEnd_ = PacBio::BAM::UnmappedPosition;
+
+    return *this;
+}
+
+BamRecord BamRecord::Mapped(const BamRecord& input, const int32_t referenceId,
+                            const Position refStart, const Strand strand, const Cigar& cigar,
+                            const uint8_t mappingQuality)
+{
+    return input.Mapped(referenceId, refStart, strand, cigar, mappingQuality);
+}
+
+BamRecord BamRecord::Mapped(const int32_t referenceId, const Position refStart, const Strand strand,
+                            const Cigar& cigar, const uint8_t mappingQuality) const
+{
+    BamRecord result(*this);
+    result.Map(referenceId, refStart, strand, cigar, mappingQuality);
+    return result;
+}
+
+// Forwards to BamRecordImpl::MapQuality().
+uint8_t BamRecord::MapQuality() const
+{
+    return impl_.MapQuality();
+}
+
+// Returns the MERGE_QV tag data via FetchQualities(), passing through the
+// requested orientation / aligned / exciseSoftClips options.
+QualityValues BamRecord::MergeQV(Orientation orientation, bool aligned, bool exciseSoftClips) const
+{
+    const auto tag = BamRecordTag::MERGE_QV;
+    return FetchQualities(tag, orientation, aligned, exciseSoftClips);
+}
+
+// Writes the FASTQ string of mergeQVs to the MERGE_QV tag; returns *this.
+BamRecord& BamRecord::MergeQV(const QualityValues& mergeQVs)
+{
+    const auto tag = BamRecordTag::MERGE_QV;
+    CreateOrEdit(tag, mergeQVs.Fastq(), &impl_);
+    return *this;
+}
+
+// Returns the movie name: taken from the header's read group when the record
+// has a read group ID, otherwise the first '/'-separated field of the record
+// name. Throws std::runtime_error if splitting the name yields no fields.
+std::string BamRecord::MovieName() const
+{
+    const auto& rgId = ReadGroupId();
+    if (!rgId.empty()) return header_.ReadGroup(rgId).MovieName();
+
+    // no read group - fall back to the record name
+    const auto fields = Split(FullName(), '/');
+    if (fields.empty())
+        throw std::runtime_error{"BamRecord: has invalid name: '" + FullName() + "'"};
+    return fields[0];
+}
+
+// Sums the lengths of all DELETION operations in the record's raw CIGAR.
+size_t BamRecord::NumDeletedBases() const
+{
+    auto raw = BamRecordMemory::GetRawData(this);
+    uint32_t* ops = bam_get_cigar(raw.get());
+    const uint32_t numOps = raw->core.n_cigar;
+
+    size_t total = 0;
+    for (uint32_t idx = 0; idx != numOps; ++idx) {
+        const uint32_t op = ops[idx];
+        const auto opType = static_cast<CigarOperationType>(bam_cigar_op(op));
+        if (opType != CigarOperationType::DELETION) continue;
+        total += bam_cigar_oplen(op);
+    }
+    return total;
+}
+
+// Sums the lengths of all INSERTION operations in the record's raw CIGAR.
+size_t BamRecord::NumInsertedBases() const
+{
+    auto raw = BamRecordMemory::GetRawData(this);
+    uint32_t* ops = bam_get_cigar(raw.get());
+    const uint32_t numOps = raw->core.n_cigar;
+
+    size_t total = 0;
+    for (uint32_t idx = 0; idx != numOps; ++idx) {
+        const uint32_t op = ops[idx];
+        const auto opType = static_cast<CigarOperationType>(bam_cigar_op(op));
+        if (opType != CigarOperationType::INSERTION) continue;
+        total += bam_cigar_oplen(op);
+    }
+    return total;
+}
+
+// First element (matches) of NumMatchesAndMismatches().
+size_t BamRecord::NumMatches() const
+{
+    const auto counts = NumMatchesAndMismatches();
+    return counts.first;
+}
+
+std::pair<size_t, size_t> BamRecord::NumMatchesAndMismatches() const
+{
+    std::pair<size_t, size_t> result = std::make_pair(0, 0);
+
+    auto b = BamRecordMemory::GetRawData(this);
+    uint32_t* cigarData = bam_get_cigar(b.get());
+    for (uint32_t i = 0; i < b->core.n_cigar; ++i) {
+        const auto type = static_cast<CigarOperationType>(bam_cigar_op(cigarData[i]));
+        if (type == CigarOperationType::SEQUENCE_MATCH)
+            result.first += bam_cigar_oplen(cigarData[i]);
+        else if (type == CigarOperationType::SEQUENCE_MISMATCH)
+            result.second += bam_cigar_oplen(cigarData[i]);
+    }
+    return result;
+}
+
+// Second element (mismatches) of NumMatchesAndMismatches().
+size_t BamRecord::NumMismatches() const
+{
+    const auto counts = NumMatchesAndMismatches();
+    return counts.second;
+}
+
+// Reads the NUM_PASSES tag as an int32. No null-tag check is performed here.
+int32_t BamRecord::NumPasses() const
+{
+    const Tag npTag = impl_.TagValue(BamRecordTags::LabelFor(BamRecordTag::NUM_PASSES));
+    return npTag.ToInt32();
+}
+
+// Writes numPasses to the NUM_PASSES tag via CreateOrEdit(); returns *this.
+BamRecord& BamRecord::NumPasses(const int32_t numPasses)
+{
+    const auto tag = BamRecordTag::NUM_PASSES;
+    CreateOrEdit(tag, numPasses, &impl_);
+    return *this;
+}
+
+std::vector<float> BamRecord::Pkmean(Orientation orientation, bool aligned, bool exciseSoftClips,
+                                     PulseBehavior pulseBehavior) const
+{
+    return FetchPhotons(BamRecordTag::PKMEAN, orientation, aligned, exciseSoftClips, pulseBehavior);
+}
+
+BamRecord& BamRecord::Pkmean(const std::vector<float>& photons)
+{
+    Pkmean(EncodePhotons(photons));
+    return *this;
+}
+
+BamRecord& BamRecord::Pkmean(const std::vector<uint16_t>& encodedPhotons)
+{
+    CreateOrEdit(BamRecordTag::PKMEAN, encodedPhotons, &impl_);
+    return *this;
+}
+
+std::vector<float> BamRecord::Pkmid(Orientation orientation, bool aligned, bool exciseSoftClips,
+                                    PulseBehavior pulseBehavior) const
+{
+    return FetchPhotons(BamRecordTag::PKMID, orientation, aligned, exciseSoftClips, pulseBehavior);
+}
+
+BamRecord& BamRecord::Pkmid(const std::vector<float>& photons)
+{
+    Pkmid(EncodePhotons(photons));
+    return *this;
+}
+
+BamRecord& BamRecord::Pkmid(const std::vector<uint16_t>& encodedPhotons)
+{
+    CreateOrEdit(BamRecordTag::PKMID, encodedPhotons, &impl_);
+    return *this;
+}
+
+std::vector<float> BamRecord::Pkmean2(Orientation orientation, bool aligned, bool exciseSoftClips,
+                                      PulseBehavior pulseBehavior) const
+{
+    return FetchPhotons(BamRecordTag::PKMEAN_2, orientation, aligned, exciseSoftClips,
+                        pulseBehavior);
+}
+
+BamRecord& BamRecord::Pkmean2(const std::vector<float>& photons)
+{
+    Pkmean2(EncodePhotons(photons));
+    return *this;
+}
+
+BamRecord& BamRecord::Pkmean2(const std::vector<uint16_t>& encodedPhotons)
+{
+    CreateOrEdit(BamRecordTag::PKMEAN_2, encodedPhotons, &impl_);
+    return *this;
+}
+
+std::vector<float> BamRecord::Pkmid2(Orientation orientation, bool aligned, bool exciseSoftClips,
+                                     PulseBehavior pulseBehavior) const
+{
+    return FetchPhotons(BamRecordTag::PKMID_2, orientation, aligned, exciseSoftClips,
+                        pulseBehavior);
+}
+
+BamRecord& BamRecord::Pkmid2(const std::vector<float>& photons)
+{
+    Pkmid2(EncodePhotons(photons));
+    return *this;
+}
+
+BamRecord& BamRecord::Pkmid2(const std::vector<uint16_t>& encodedPhotons)
+{
+    CreateOrEdit(BamRecordTag::PKMID_2, encodedPhotons, &impl_);
+    return *this;
+}
+
+// Alias for the IPD() getter.
+Frames BamRecord::PreBaseFrames(Orientation orientation, bool aligned, bool exciseSoftClips) const
+{
+    Frames ipd = IPD(orientation, aligned, exciseSoftClips);
+    return ipd;
+}
+
+// Alias for the IPD() setter; returns *this.
+BamRecord& BamRecord::PreBaseFrames(const Frames& frames, const FrameEncodingType encoding)
+{
+    IPD(frames, encoding);
+    return *this;
+}
+
+Frames BamRecord::PrePulseFrames(Orientation orientation, bool aligned, bool exciseSoftClips,
+                                 PulseBehavior pulseBehavior) const
+{
+    return FetchFrames(BamRecordTag::PRE_PULSE_FRAMES, orientation, aligned, exciseSoftClips,
+                       pulseBehavior);
+}
+
+BamRecord& BamRecord::PrePulseFrames(const Frames& frames, const FrameEncodingType encoding)
+{
+    if (encoding == FrameEncodingType::LOSSY) {
+        CreateOrEdit(BamRecordTag::PRE_PULSE_FRAMES, frames.Encode(), &impl_);
+    } else {
+        CreateOrEdit(BamRecordTag::PRE_PULSE_FRAMES, frames.Data(), &impl_);
+    }
+    return *this;
+}
+
+Frames BamRecord::PulseWidthRaw(Orientation orientation, bool /* aligned */,
+                                bool /* exciseSoftClips */) const
+{
+    const auto tagName = BamRecordTags::LabelFor(BamRecordTag::PULSE_WIDTH);
+    const Tag frameTag = impl_.TagValue(tagName);
+    if (frameTag.IsNull()) return {};
+
+    Frames frames;
+
+    // lossy frame codes
+    if (frameTag.IsUInt8Array()) {
+        const auto codes = frameTag.ToUInt8Array();
+        const std::vector<uint16_t> codes16(codes.begin(), codes.end());
+        frames.Data(std::move(codes16));
+    }
+
+    // lossless frame data
+    else {
+        assert(frameTag.IsUInt16Array());
+        frames.Data(frameTag.ToUInt16Array());
+    }
+
+    // return in requested orientation
+    OrientTagDataAsRequested(&frames,
+                             Orientation::NATIVE,  // current
+                             orientation,          // requested
+                             impl_.IsReverseStrand());
+    return frames;
+}
+
+QualityValues BamRecord::PulseMergeQV(Orientation orientation, bool aligned, bool exciseSoftClips,
+                                      PulseBehavior pulseBehavior) const
+{
+    return FetchQualities(BamRecordTag::PULSE_MERGE_QV, orientation, aligned, exciseSoftClips,
+                          pulseBehavior);
+}
+
+BamRecord& BamRecord::PulseMergeQV(const QualityValues& mergeQVs)
+{
+    CreateOrEdit(BamRecordTag::PULSE_MERGE_QV, mergeQVs.Fastq(), &impl_);
+    return *this;
+}
+
+std::string BamRecord::PulseCall(Orientation orientation, bool aligned, bool exciseSoftClips,
+                                 PulseBehavior pulseBehavior) const
+{
+    return FetchBases(BamRecordTag::PULSE_CALL, orientation, aligned, exciseSoftClips,
+                      pulseBehavior);
+}
+
+BamRecord& BamRecord::PulseCall(const std::string& tags)
+{
+    CreateOrEdit(BamRecordTag::PULSE_CALL, tags, &impl_);
+    return *this;
+}
+
+Frames BamRecord::PulseCallWidth(Orientation orientation, bool aligned, bool exciseSoftClips,
+                                 PulseBehavior pulseBehavior) const
+{
+    return FetchFrames(BamRecordTag::PULSE_CALL_WIDTH, orientation, aligned, exciseSoftClips,
+                       pulseBehavior);
+}
+
+BamRecord& BamRecord::PulseCallWidth(const Frames& frames, const FrameEncodingType encoding)
+{
+    if (encoding == FrameEncodingType::LOSSY) {
+        CreateOrEdit(BamRecordTag::PULSE_CALL_WIDTH, frames.Encode(), &impl_);
+    } else {
+        CreateOrEdit(BamRecordTag::PULSE_CALL_WIDTH, frames.Data(), &impl_);
+    }
+    return *this;
+}
+
+std::vector<PacBio::BAM::PulseExclusionReason> BamRecord::PulseExclusionReason(
+    Orientation orientation, bool aligned, bool exciseSoftClips, PulseBehavior pulseBehavior) const
+{
+    std::vector<PacBio::BAM::PulseExclusionReason> reasons;
+
+    const auto reasonNums = FetchUInt8s(BamRecordTag::PULSE_EXCLUSION, orientation, aligned,
+                                        exciseSoftClips, pulseBehavior);
+
+    std::transform(
+        reasonNums.cbegin(), reasonNums.cend(), std::back_inserter(reasons),
+        [](const uint8_t num) { return static_cast<PacBio::BAM::PulseExclusionReason>(num); });
+
+    return reasons;
+}
+
+BamRecord& BamRecord::PulseExclusionReason(
+    const std::vector<PacBio::BAM::PulseExclusionReason>& reasons)
+{
+    std::vector<uint8_t> reasonNums;
+    std::transform(reasons.cbegin(), reasons.cend(), std::back_inserter(reasonNums),
+                   [](const PacBio::BAM::PulseExclusionReason& reason) {
+                       return static_cast<uint8_t>(reason);
+                   });
+
+    CreateOrEdit(BamRecordTag::PULSE_EXCLUSION, reasonNums, &impl_);
+    return *this;
+}
+
+Frames BamRecord::PulseWidth(Orientation orientation, bool aligned, bool exciseSoftClips) const
+{
+    return FetchFrames(BamRecordTag::PULSE_WIDTH, orientation, aligned, exciseSoftClips,
+                       PulseBehavior::ALL);
+}
+
+BamRecord& BamRecord::PulseWidth(const Frames& frames, const FrameEncodingType encoding)
+{
+    if (encoding == FrameEncodingType::LOSSY) {
+        CreateOrEdit(BamRecordTag::PULSE_WIDTH, frames.Encode(), &impl_);
+    } else {
+        CreateOrEdit(BamRecordTag::PULSE_WIDTH, frames.Data(), &impl_);
+    }
+    return *this;
+}
+
+// Returns the record's QUAL data via FetchQualities(), passing through the
+// requested orientation / aligned / exciseSoftClips options.
+QualityValues BamRecord::Qualities(Orientation orientation, bool aligned,
+                                   bool exciseSoftClips) const
+{
+    const auto tag = BamRecordTag::QUAL;
+    return FetchQualities(tag, orientation, aligned, exciseSoftClips);
+}
+
+Position BamRecord::QueryEnd() const
+{
+    // try 'qe' tag
+    const auto tagName = BamRecordTags::LabelFor(BamRecordTag::QUERY_END);
+    const Tag qe = impl_.TagValue(tagName);
+    if (!qe.IsNull()) return qe.ToInt32();
+
+    // tag missing, need to check movie name (fallback for non-PB BAMs, but ignore for CCS reads)
+    RecordType type;
+    try {
+        type = Type();
+    } catch (std::exception&) {
+        return 0;
+    }
+    if (type == RecordType::CCS)
+        throw std::runtime_error{"BamRecord: no query end for CCS read type"};
+    if (type == RecordType::TRANSCRIPT)
+        throw std::runtime_error{"BamRecord: no query end for transcript read type"};
+
+    // PacBio BAM, non-CCS/transcript
+    try {
+        return QueryEndFromName(FullName());
+    } catch (std::exception&) {
+        // return fallback position
+        return 0;
+    }
+}
+
+BamRecord& BamRecord::QueryEnd(const Position pos)
+{
+    CreateOrEdit(BamRecordTag::QUERY_END, static_cast<int32_t>(pos), &impl_);
+    UpdateName();
+    return *this;
+}
+
+// Returns the QUERY_END_FRAME_NUMBER tag value, or 0 when the tag is absent.
+int32_t BamRecord::QueryEndFrameNumber() const
+{
+    const Tag frameTag =
+        impl_.TagValue(BamRecordTags::LabelFor(BamRecordTag::QUERY_END_FRAME_NUMBER));
+    return frameTag.IsNull() ? 0 : frameTag.ToInt32();
+}
+
+// Writes frameNumber to the QUERY_END_FRAME_NUMBER tag; returns *this.
+BamRecord& BamRecord::QueryEndFrameNumber(const int32_t frameNumber)
+{
+    const auto tag = BamRecordTag::QUERY_END_FRAME_NUMBER;
+    CreateOrEdit(tag, frameNumber, &impl_);
+    return *this;
+}
+
+Position BamRecord::QueryStart() const
+{
+    // try 'qs' tag
+    const auto tagName = BamRecordTags::LabelFor(BamRecordTag::QUERY_START);
+    const Tag qs = impl_.TagValue(tagName);
+    if (!qs.IsNull()) return qs.ToInt32();
+
+    // tag missing, need to check movie name (fallback for non-PB BAMs, but ignore for CCS reads)
+    RecordType type;
+    try {
+        type = Type();
+    } catch (std::exception&) {
+        return 0;
+    }
+    if (type == RecordType::CCS)
+        throw std::runtime_error{"BamRecord: no query start for CCS read type"};
+    if (type == RecordType::TRANSCRIPT)
+        throw std::runtime_error{"BamRecord: no query start for transcript read type"};
+
+    // PacBio BAM, non-CCS/transcript
+    try {
+        return QueryStartFromName(FullName());
+    } catch (std::exception&) {
+        // return fallback position
+        return 0;
+    }
+}
+
+BamRecord& BamRecord::QueryStart(const Position pos)
+{
+    CreateOrEdit(BamRecordTag::QUERY_START, static_cast<int32_t>(pos), &impl_);
+    UpdateName();
+    return *this;
+}
+
+// Returns the QUERY_START_FRAME_NUMBER tag value, or 0 when the tag is absent.
+int32_t BamRecord::QueryStartFrameNumber() const
+{
+    const Tag frameTag =
+        impl_.TagValue(BamRecordTags::LabelFor(BamRecordTag::QUERY_START_FRAME_NUMBER));
+    return frameTag.IsNull() ? 0 : frameTag.ToInt32();
+}
+
+// Writes frameNumber to the QUERY_START_FRAME_NUMBER tag; returns *this.
+BamRecord& BamRecord::QueryStartFrameNumber(const int32_t frameNumber)
+{
+    const auto tag = BamRecordTag::QUERY_START_FRAME_NUMBER;
+    CreateOrEdit(tag, frameNumber, &impl_);
+    return *this;
+}
+
+// Reads the READ_ACCURACY tag as a float and wraps it in an Accuracy.
+// No null-tag check is performed here.
+Accuracy BamRecord::ReadAccuracy() const
+{
+    const Tag accuracyTag = impl_.TagValue(BamRecordTags::LabelFor(BamRecordTag::READ_ACCURACY));
+    const auto value = accuracyTag.ToFloat();
+    return {value};
+}
+
+// Stores accuracy (as float) in the READ_ACCURACY tag; returns *this.
+BamRecord& BamRecord::ReadAccuracy(const Accuracy& accuracy)
+{
+    const auto tag = BamRecordTag::READ_ACCURACY;
+    CreateOrEdit(tag, static_cast<float>(accuracy), &impl_);
+    return *this;
+}
+
+// Looks up this record's read group ID in the associated header.
+ReadGroupInfo BamRecord::ReadGroup() const
+{
+    return header_.ReadGroup(ReadGroupId());
+}
+
+// Stores rg.Id() in the READ_GROUP tag, then regenerates the record name via
+// UpdateName(). Returns *this.
+BamRecord& BamRecord::ReadGroup(const ReadGroupInfo& rg)
+{
+    const auto tag = BamRecordTag::READ_GROUP;
+    CreateOrEdit(tag, rg.Id(), &impl_);
+    UpdateName();
+    return *this;
+}
+
+// Returns the READ_GROUP tag as a string, or an empty string when absent.
+std::string BamRecord::ReadGroupId() const
+{
+    const Tag rgTag = impl_.TagValue(BamRecordTags::LabelFor(BamRecordTag::READ_GROUP));
+    if (!rgTag.IsNull()) return rgTag.ToString();
+    return {};
+}
+
+// BaseId() of this record's read group, as resolved by ReadGroup().
+std::string BamRecord::ReadGroupBaseId() const
+{
+    return ReadGroup().BaseId();
+}
+
+// Stores id in the READ_GROUP tag, then regenerates the record name via
+// UpdateName(). Returns *this.
+BamRecord& BamRecord::ReadGroupId(const std::string& id)
+{
+    const auto tag = BamRecordTag::READ_GROUP;
+    CreateOrEdit(tag, id, &impl_);
+    UpdateName();
+    return *this;
+}
+
+// Converts ReadGroupBaseId() to an integer via ReadGroupInfo::IdToInt().
+int32_t BamRecord::ReadGroupNumericId() const
+{
+    return ReadGroupInfo::IdToInt(ReadGroupBaseId());
+}
+
+// Returns htslib's bam_endpos() for this record, or UnmappedPosition when the
+// record is unmapped or its raw data is unavailable.
+Position BamRecord::ReferenceEnd() const
+{
+    if (impl_.IsMapped()) {
+        const auto raw = BamRecordMemory::GetRawData(impl_);
+        if (raw) return bam_endpos(raw.get());
+    }
+    return PacBio::BAM::UnmappedPosition;
+}
+
+// Forwards to BamRecordImpl::ReferenceId().
+int32_t BamRecord::ReferenceId() const
+{
+    return impl_.ReferenceId();
+}
+
+// Returns the header's sequence name for this record's reference ID.
+// Throws std::runtime_error if the record is unmapped.
+std::string BamRecord::ReferenceName() const
+{
+    if (!IsMapped())
+        throw std::runtime_error{"BamRecord: unmapped record has no associated reference name"};
+    return Header().SequenceName(ReferenceId());
+}
+
+// Forwards to BamRecordImpl::Position().
+Position BamRecord::ReferenceStart() const
+{
+    return impl_.Position();
+}
+
+// Invalidates the cached aligned start/end by resetting both to
+// UnmappedPosition (const overload).
+void BamRecord::ResetCachedPositions() const
+{
+    alignedStart_ = PacBio::BAM::UnmappedPosition;
+    alignedEnd_ = PacBio::BAM::UnmappedPosition;
+}
+
+// Invalidates the cached aligned start/end by resetting both to
+// UnmappedPosition (non-const overload).
+void BamRecord::ResetCachedPositions()
+{
+    alignedStart_ = PacBio::BAM::UnmappedPosition;
+    alignedEnd_ = PacBio::BAM::UnmappedPosition;
+}
+
+// Reads the SCRAP_REGION_TYPE tag as a uint8 and maps it to a
+// VirtualRegionType through VirtualRegionTypeMap::ParseChar.
+VirtualRegionType BamRecord::ScrapRegionType() const
+{
+    const Tag srTag = impl_.TagValue(BamRecordTags::LabelFor(BamRecordTag::SCRAP_REGION_TYPE));
+    return VirtualRegionTypeMap::ParseChar[srTag.ToUInt8()];
+}
+
+// Stores type (as uint8) in the SCRAP_REGION_TYPE tag; returns *this.
+BamRecord& BamRecord::ScrapRegionType(const VirtualRegionType type)
+{
+    const auto tag = BamRecordTag::SCRAP_REGION_TYPE;
+    CreateOrEdit(tag, static_cast<uint8_t>(type), &impl_);
+    return *this;
+}
+
+// Stores the given character code in the SCRAP_REGION_TYPE tag; returns *this.
+BamRecord& BamRecord::ScrapRegionType(const char type)
+{
+    const auto tag = BamRecordTag::SCRAP_REGION_TYPE;
+    CreateOrEdit(tag, type, &impl_);
+    return *this;
+}
+
+// Reads the SCRAP_ZMW_TYPE tag as a uint8 and maps it to a ZmwType through
+// ZmwTypeMap::ParseChar.
+ZmwType BamRecord::ScrapZmwType() const
+{
+    const Tag szTag = impl_.TagValue(BamRecordTags::LabelFor(BamRecordTag::SCRAP_ZMW_TYPE));
+    return ZmwTypeMap::ParseChar[szTag.ToUInt8()];
+}
+
+// Stores type (as uint8) in the SCRAP_ZMW_TYPE tag; returns *this.
+BamRecord& BamRecord::ScrapZmwType(const ZmwType type)
+{
+    const auto tag = BamRecordTag::SCRAP_ZMW_TYPE;
+    CreateOrEdit(tag, static_cast<uint8_t>(type), &impl_);
+    return *this;
+}
+
+// Stores the given character code in the SCRAP_ZMW_TYPE tag; returns *this.
+BamRecord& BamRecord::ScrapZmwType(const char type)
+{
+    const auto tag = BamRecordTag::SCRAP_ZMW_TYPE;
+    CreateOrEdit(tag, type, &impl_);
+    return *this;
+}
+
+// Returns the record's SEQ data via FetchBases(), passing through the
+// requested orientation / aligned / exciseSoftClips options.
+std::string BamRecord::Sequence(const Orientation orientation, bool aligned,
+                                bool exciseSoftClips) const
+{
+    const auto tag = BamRecordTag::SEQ;
+    return FetchBases(tag, orientation, aligned, exciseSoftClips);
+}
+
+// Reads the SIGNAL_TO_NOISE tag as a float array. No null-tag check is
+// performed here.
+std::vector<float> BamRecord::SignalToNoise() const
+{
+    const Tag snTag = impl_.TagValue(BamRecordTags::LabelFor(BamRecordTag::SIGNAL_TO_NOISE));
+    return snTag.ToFloatArray();
+}
+
+// Writes snr to the SIGNAL_TO_NOISE tag via CreateOrEdit(); returns *this.
+BamRecord& BamRecord::SignalToNoise(const std::vector<float>& snr)
+{
+    const auto tag = BamRecordTag::SIGNAL_TO_NOISE;
+    CreateOrEdit(tag, snr, &impl_);
+    return *this;
+}
+
+std::vector<uint32_t> BamRecord::StartFrame(Orientation orientation, bool aligned,
+                                            bool exciseSoftClips, PulseBehavior pulseBehavior) const
+{
+    return FetchUInt32s(BamRecordTag::START_FRAME, orientation, aligned, exciseSoftClips,
+                        pulseBehavior);
+}
+
+BamRecord& BamRecord::StartFrame(const std::vector<uint32_t>& startFrame)
+{
+    CreateOrEdit(BamRecordTag::START_FRAME, startFrame, &impl_);
+    return *this;
+}
+
+// Returns the SUBSTITUTION_QV tag data via FetchQualities(), passing through
+// the requested orientation / aligned / exciseSoftClips options.
+QualityValues BamRecord::SubstitutionQV(Orientation orientation, bool aligned,
+                                        bool exciseSoftClips) const
+{
+    const auto tag = BamRecordTag::SUBSTITUTION_QV;
+    return FetchQualities(tag, orientation, aligned, exciseSoftClips);
+}
+
+// Writes the FASTQ string of substitutionQVs to the SUBSTITUTION_QV tag;
+// returns *this.
+BamRecord& BamRecord::SubstitutionQV(const QualityValues& substitutionQVs)
+{
+    const auto tag = BamRecordTag::SUBSTITUTION_QV;
+    CreateOrEdit(tag, substitutionQVs.Fastq(), &impl_);
+    return *this;
+}
+
+// Returns the SUBSTITUTION_TAG tag data via FetchBases(), passing through the
+// requested orientation / aligned / exciseSoftClips options.
+std::string BamRecord::SubstitutionTag(Orientation orientation, bool aligned,
+                                       bool exciseSoftClips) const
+{
+    const auto tag = BamRecordTag::SUBSTITUTION_TAG;
+    return FetchBases(tag, orientation, aligned, exciseSoftClips);
+}
+
+// Writes tags to the SUBSTITUTION_TAG tag via CreateOrEdit(); returns *this.
+BamRecord& BamRecord::SubstitutionTag(const std::string& tags)
+{
+    const auto tag = BamRecordTag::SUBSTITUTION_TAG;
+    CreateOrEdit(tag, tags, &impl_);
+    return *this;
+}
+
+Data::Read BamRecord::ToRead() const
+{
+    Data::Read result{FullName(),      Sequence(),   Qualities(),
+                      SignalToNoise(), QueryStart(), QueryEnd()};
+
+    if (HasIPD()) result.IPD = IPD();
+    if (HasPulseWidth()) result.PulseWidth = PulseWidth();
+
+    if (IsMapped() && AlignedStrand() == Data::Strand::REVERSE) {
+        ReverseComplement(result.Seq);
+        Reverse(result.Qualities);
+    }
+    return result;
+}
+
+// Builds a Data::MappedRead from ToRead() plus this record's strand, reference
+// interval, CIGAR and mapping quality.
+// Throws std::runtime_error if the record is not mapped.
+Data::MappedRead BamRecord::ToMappedRead() const
+{
+    if (IsMapped()) {
+        return {ToRead(),       AlignedStrand(), ReferenceStart(),
+                ReferenceEnd(), CigarData(),     MapQuality()};
+    }
+
+    throw std::runtime_error{"BAM record '" + FullName() +
+                             "' cannot be converted to MappedRead because it is not mapped"};
+}
+
+RecordType BamRecord::Type() const
+{
+    try {
+        const auto typeName = ReadGroup().ReadType();
+        return NameToType(typeName);
+    } catch (std::exception&) {
+
+        // read group not found, peek at name to see if we're possibly
+        // CCS or TRANSCRIPT
+        //
+        const auto name = FullName();
+        if (name.find("transcript") == 0)
+            return RecordType::TRANSCRIPT;
+        else if (name.find("/ccs") != std::string::npos)
+            return RecordType::CCS;
+        else
+            return RecordType::UNKNOWN;
+    }
+}
+
+void BamRecord::UpdateName()
+{
+    std::string newName;
+    newName.reserve(100);
+
+    const auto holeNumber = (HasHoleNumber() ? std::to_string(HoleNumber()) : "?");
+    if (Type() == RecordType::TRANSCRIPT) {
+        newName = "transcript/" + holeNumber;
+    } else {
+        newName += MovieName();
+        newName += "/";
+        newName += holeNumber;
+        newName += "/";
+
+        if (Type() == RecordType::CCS)
+            newName += "ccs";
+
+        else {
+            if (HasQueryStart())
+                newName += std::to_string(QueryStart());
+            else
+                newName += "?";
+
+            newName += '_';
+
+            if (HasQueryEnd())
+                newName += std::to_string(QueryEnd());
+            else
+                newName += "?";
+        }
+    }
+    impl_.Name(newName);
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/BamRecordBuilder.cpp b/src/BamRecordBuilder.cpp

new file mode 100644 (file)

index 0000000..96d8059
--- /dev/null
+++ b/src/BamRecordBuilder.cpp
@@ -0,0 +1,396 @@
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/BamRecordBuilder.h"
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <sstream>
+#include <type_traits>
+
+#include <htslib/sam.h>
+
+#include "MemoryUtils.h"
+#include "pbbam/BamTagCodec.h"
+
+namespace PacBio {
+namespace BAM {
+
+// Compile-time checks on BamRecordBuilder's special member functions.
+static_assert(std::is_copy_constructible<BamRecordBuilder>::value,
+              "BamRecordBuilder(const BamRecordBuilder&) is not = default");
+static_assert(std::is_copy_assignable<BamRecordBuilder>::value,
+              "BamRecordBuilder& operator=(const BamRecordBuilder&) is not = default");
+
+static_assert(std::is_nothrow_move_constructible<BamRecordBuilder>::value,
+              "BamRecordBuilder(BamRecordBuilder&&) is not = noexcept");
+// Move assignment is only required to be noexcept when std::string's own move
+// assignment is, so the check compares against std::string rather than 'true'.
+static_assert(std::is_nothrow_move_assignable<BamRecordBuilder>::value ==
+                  std::is_nothrow_move_assignable<std::string>::value,
+              "");
+
+// Default constructor: starts from a cleared state and reserves capacity for
+// the variable-length fields.
+BamRecordBuilder::BamRecordBuilder()
+{
+    // zero the core data and empty all fields
+    Reset();
+
+    // reserve some room up front for the variable-length data
+    cigar_.reserve(256);
+    name_.reserve(256);
+    qualities_.reserve(2096);
+    sequence_.reserve(2096);
+}
+
+// Constructs a builder bound to the given header; otherwise identical to the
+// default constructor (cleared state, pre-reserved buffers).
+BamRecordBuilder::BamRecordBuilder(BamHeader header) : header_{std::move(header)}
+{
+    // zero the core data and empty all fields
+    Reset();
+
+    // reserve some room up front for the variable-length data
+    cigar_.reserve(256);
+    name_.reserve(256);
+    qualities_.reserve(2096);
+    sequence_.reserve(2096);
+}
+
+// Constructs a builder pre-populated from an existing record (see
+// Reset(BamRecord)), using that record's header.
+BamRecordBuilder::BamRecordBuilder(const BamRecord& prototype)
+    : header_{prototype.Header()}
+{
+    Reset(prototype);
+}
+
+// Sets the core 'bin' field; returns *this. The value is converted to the
+// field's own type.
+BamRecordBuilder& BamRecordBuilder::Bin(const uint32_t bin)
+{
+    core_.bin = static_cast<decltype(core_.bin)>(bin);
+    return *this;
+}
+
+// Creates a new record bound to the builder's header and fills it through
+// BuildInPlace().
+BamRecord BamRecordBuilder::Build() const
+{
+    BamRecord record{header_};
+    BuildInPlace(record);
+    return record;
+}
+
+bool BamRecordBuilder::BuildInPlace(BamRecord& record) const
+{
+    // initialize with basic 'core data'
+    auto recordRawData = BamRecordMemory::GetRawData(record);
+    if (!recordRawData || !recordRawData->data)
+        throw std::runtime_error{
+            "BamRecordBuilder: cannot build record, target memory is in an invalid state"};
+    recordRawData->core = core_;
+
+    // setup variable length data
+    const auto encodedTags = BamTagCodec::Encode(tags_);
+
+    const size_t nameLength = name_.size() + 1;
+    const size_t numCigarOps = cigar_.size();
+    const size_t cigarLength = numCigarOps * sizeof(uint32_t);
+    const size_t seqLength = sequence_.size();
+    const size_t qualLength = seqLength;
+    const size_t tagLength = encodedTags.size();
+    const size_t dataLength = nameLength + cigarLength + seqLength + qualLength + tagLength;
+
+    // realloc if necessary
+    uint8_t* varLengthDataBlock = recordRawData->data;
+    if (!varLengthDataBlock)
+        throw std::runtime_error{
+            "BamRecordBuilder: cannot build record, target memory is in an invalid state"};
+
+    size_t allocatedDataLength = recordRawData->m_data;
+    if (allocatedDataLength < dataLength) {
+        allocatedDataLength = dataLength;
+        kroundup32(allocatedDataLength);
+        varLengthDataBlock =
+            static_cast<uint8_t*>(realloc(varLengthDataBlock, allocatedDataLength));
+    }
+    recordRawData->data = varLengthDataBlock;
+    recordRawData->l_data = dataLength;
+    recordRawData->m_data = allocatedDataLength;
+
+    size_t index = 0;
+
+    // name
+    memcpy(&varLengthDataBlock[index], name_.c_str(), nameLength);
+    index += nameLength;
+
+    // cigar
+    if (cigarLength > 0) {
+        std::vector<uint32_t> encodedCigar(numCigarOps);
+        for (size_t i = 0; i < numCigarOps; ++i) {
+            const auto& op = cigar_.at(i);
+            encodedCigar[i] = op.Length() << BAM_CIGAR_SHIFT;
+            const auto type = static_cast<uint8_t>(op.Type());
+            if (type >= 8)
+                throw std::runtime_error{"BamRecordBuilder: invalid CIGAR op type: " +
+                                         std::to_string(type)};
+            encodedCigar[i] |= type;
+        }
+        memcpy(&varLengthDataBlock[index], &encodedCigar[0], cigarLength);
+        index += cigarLength;
+
+        // update bin after we've calculated cigar info
+        const int32_t endPosition = bam_cigar2rlen(recordRawData->core.n_cigar, &encodedCigar[0]);
+        recordRawData->core.bin = hts_reg2bin(core_.pos, endPosition, 14, 5);
+    }
+
+    // seq & qual
+    if (seqLength > 0) {
+
+        uint8_t* s = &varLengthDataBlock[index];
+        for (size_t i = 0; i < seqLength; ++i)
+            s[i >> 1] |= (seq_nt16_table[static_cast<int>(sequence_.at(i))] << ((~i & 1) << 2));
+        index += seqLength;
+
+        uint8_t* q = &varLengthDataBlock[index];
+        if (!qualities_.empty())
+            memset(q, 0xFF, seqLength);
+        else {
+            for (size_t i = 0; i < seqLength; ++i)
+                q[i] = qualities_.at(i) - 33;
+        }
+        index += seqLength;
+    }
+
+    // tags
+    if (tagLength > 0) {
+        if (encodedTags.empty())
+            throw std::runtime_error{"BamRecordBuilder: expected tags but none are present"};
+        memcpy(&varLengthDataBlock[index], &encodedTags[0], tagLength);
+        index += tagLength;
+    }
+
+    // sanity check
+    if (index != dataLength) {
+        std::ostringstream s;
+        s << "BamRecordBuilder: incorrect number of bytes written to record:\n"
+          << "  expected: " << dataLength << '\n'
+          << "  actual: " << index;
+        throw std::runtime_error{s.str()};
+    }
+    return true;
+}
+
+// Takes ownership of cigar and records its operation count in the core
+// 'n_cigar' field; returns *this.
+BamRecordBuilder& BamRecordBuilder::Cigar(PacBio::BAM::Cigar cigar)
+{
+    const auto numOps = cigar.size();  // read before the move below
+    core_.n_cigar = numOps;
+    cigar_ = std::move(cigar);
+    return *this;
+}
+
+// Replaces the whole core 'flag' field; returns *this. The value is converted
+// to the field's own type.
+BamRecordBuilder& BamRecordBuilder::Flag(const uint32_t flag)
+{
+    core_.flag = static_cast<decltype(core_.flag)>(flag);
+    return *this;
+}
+
+// Sets the core 'isize' (insert size) field; returns *this.
+BamRecordBuilder& BamRecordBuilder::InsertSize(const int32_t iSize)
+{
+    core_.isize = static_cast<decltype(core_.isize)>(iSize);
+    return *this;
+}
+
+// Sets the core 'qual' (mapping quality) field; returns *this.
+BamRecordBuilder& BamRecordBuilder::MapQuality(const uint8_t mapQual)
+{
+    core_.qual = static_cast<decltype(core_.qual)>(mapQual);
+    return *this;
+}
+
+// Sets the core 'mpos' (mate position) field; returns *this.
+BamRecordBuilder& BamRecordBuilder::MatePosition(const int32_t pos)
+{
+    core_.mpos = static_cast<decltype(core_.mpos)>(pos);
+    return *this;
+}
+
+// Sets the core 'mtid' (mate reference ID) field; returns *this.
+BamRecordBuilder& BamRecordBuilder::MateReferenceId(const int32_t id)
+{
+    core_.mtid = static_cast<decltype(core_.mtid)>(id);
+    return *this;
+}
+
+// Takes ownership of name and sets the core 'l_qname' field to its length
+// plus one for the NULL terminator; returns *this.
+BamRecordBuilder& BamRecordBuilder::Name(std::string name)
+{
+    const auto lengthWithTerminator = name.size() + 1;  // read before the move below
+    core_.l_qname = lengthWithTerminator;
+    name_ = std::move(name);
+    return *this;
+}
+
+// Sets the core 'pos' field; returns *this.
+BamRecordBuilder& BamRecordBuilder::Position(const int32_t pos)
+{
+    core_.pos = static_cast<decltype(core_.pos)>(pos);
+    return *this;
+}
+
+// Takes ownership of the qualities string (BuildInPlace() subtracts 33 from
+// each character when writing); returns *this. No core field is updated here.
+BamRecordBuilder& BamRecordBuilder::Qualities(std::string qualities)
+{
+    qualities_ = std::move(qualities);
+    return *this;
+}
+
+// Sets the core 'tid' (reference ID) field; returns *this.
+BamRecordBuilder& BamRecordBuilder::ReferenceId(const int32_t id)
+{
+    core_.tid = static_cast<decltype(core_.tid)>(id);
+    return *this;
+}
+
+void BamRecordBuilder::Reset()
+{
+    // zeroize fixed-length data
+    memset(&core_, 0, sizeof(bam1_core_t));
+    core_.l_qname = 1;  // always has a NULL-term
+
+    // reset variable-length data
+    name_.clear();
+    sequence_.clear();
+    qualities_.clear();
+    cigar_.clear();
+    tags_.clear();
+}
+
+void BamRecordBuilder::Reset(BamRecord prototype)
+{
+    // ensure clean slate
+    Reset();
+    header_ = prototype.Header();
+
+    // reset variable-length data
+    const BamRecordImpl& impl = BamRecordMemory::GetImpl(prototype);
+    name_ = impl.Name();
+    sequence_ = impl.Sequence();
+    qualities_ = impl.Qualities().Fastq();
+    cigar_ = impl.CigarData();
+    tags_ = impl.Tags();
+
+    // reset core data
+    const auto rawData = BamRecordMemory::GetRawData(prototype);
+    if (!rawData)
+        throw std::runtime_error{
+            "BamRecordBuilder: cannot build record, target memory is in an invalid state"};
+    core_ = std::move(rawData->core);
+}
+
+// Takes ownership of sequence and records its length in the core 'l_qseq'
+// field; returns *this.
+BamRecordBuilder& BamRecordBuilder::Sequence(std::string sequence)
+{
+    const auto numBases = sequence.size();  // read before the move below
+    core_.l_qseq = numBases;
+    sequence_ = std::move(sequence);
+    return *this;
+}
+
+// Sets (ok) or clears (!ok) the DUPLICATE bit of the core flag; returns *this.
+BamRecordBuilder& BamRecordBuilder::SetDuplicate(bool ok)
+{
+    core_.flag = ok ? (core_.flag | BamRecordImpl::DUPLICATE)
+                    : (core_.flag & ~BamRecordImpl::DUPLICATE);
+    return *this;
+}
+
+// Sets (ok) or clears (!ok) the FAILED_QC bit of the core flag; returns *this.
+BamRecordBuilder& BamRecordBuilder::SetFailedQC(bool ok)
+{
+    core_.flag = ok ? (core_.flag | BamRecordImpl::FAILED_QC)
+                    : (core_.flag & ~BamRecordImpl::FAILED_QC);
+    return *this;
+}
+
+// Sets (ok) or clears (!ok) the MATE_1 bit of the core flag; returns *this.
+BamRecordBuilder& BamRecordBuilder::SetFirstMate(bool ok)
+{
+    core_.flag = ok ? (core_.flag | BamRecordImpl::MATE_1)
+                    : (core_.flag & ~BamRecordImpl::MATE_1);
+    return *this;
+}
+
+// Inverse-sense flag: ok == true CLEARS the UNMAPPED bit, ok == false sets it.
+// Returns *this.
+BamRecordBuilder& BamRecordBuilder::SetMapped(bool ok)
+{
+    core_.flag = ok ? (core_.flag & ~BamRecordImpl::UNMAPPED)
+                    : (core_.flag | BamRecordImpl::UNMAPPED);
+    return *this;
+}
+
+// Inverse-sense flag: ok == true CLEARS the MATE_UNMAPPED bit, ok == false
+// sets it. Returns *this.
+BamRecordBuilder& BamRecordBuilder::SetMateMapped(bool ok)
+{
+    core_.flag = ok ? (core_.flag & ~BamRecordImpl::MATE_UNMAPPED)
+                    : (core_.flag | BamRecordImpl::MATE_UNMAPPED);
+    return *this;
+}
+
+// Sets (ok) or clears (!ok) the MATE_REVERSE_STRAND bit of the core flag;
+// returns *this.
+BamRecordBuilder& BamRecordBuilder::SetMateReverseStrand(bool ok)
+{
+    core_.flag = ok ? (core_.flag | BamRecordImpl::MATE_REVERSE_STRAND)
+                    : (core_.flag & ~BamRecordImpl::MATE_REVERSE_STRAND);
+    return *this;
+}
+
+// Sets (ok) or clears (!ok) the PAIRED bit of the core flag; returns *this.
+BamRecordBuilder& BamRecordBuilder::SetPaired(bool ok)
+{
+    core_.flag = ok ? (core_.flag | BamRecordImpl::PAIRED)
+                    : (core_.flag & ~BamRecordImpl::PAIRED);
+    return *this;
+}
+
+// Inverse-sense flag: ok == true CLEARS the SECONDARY bit, ok == false sets
+// it. Returns *this.
+BamRecordBuilder& BamRecordBuilder::SetPrimaryAlignment(bool ok)
+{
+    core_.flag = ok ? (core_.flag & ~BamRecordImpl::SECONDARY)
+                    : (core_.flag | BamRecordImpl::SECONDARY);
+    return *this;
+}
+
+// Sets (ok) or clears (!ok) the PROPER_PAIR bit of the core flag; returns *this.
+BamRecordBuilder& BamRecordBuilder::SetProperPair(bool ok)
+{
+    core_.flag = ok ? (core_.flag | BamRecordImpl::PROPER_PAIR)
+                    : (core_.flag & ~BamRecordImpl::PROPER_PAIR);
+    return *this;
+}
+
+// Sets (ok) or clears (!ok) the REVERSE_STRAND bit of the core flag;
+// returns *this.
+BamRecordBuilder& BamRecordBuilder::SetReverseStrand(bool ok)
+{
+    core_.flag = ok ? (core_.flag | BamRecordImpl::REVERSE_STRAND)
+                    : (core_.flag & ~BamRecordImpl::REVERSE_STRAND);
+    return *this;
+}
+
+// Sets (ok) or clears (!ok) the MATE_2 bit of the core flag; returns *this.
+BamRecordBuilder& BamRecordBuilder::SetSecondMate(bool ok)
+{
+    core_.flag = ok ? (core_.flag | BamRecordImpl::MATE_2)
+                    : (core_.flag & ~BamRecordImpl::MATE_2);
+    return *this;
+}
+
+// Sets (ok) or clears (!ok) the SUPPLEMENTARY bit of the core flag;
+// returns *this.
+BamRecordBuilder& BamRecordBuilder::SetSupplementaryAlignment(bool ok)
+{
+    core_.flag = ok ? (core_.flag | BamRecordImpl::SUPPLEMENTARY)
+                    : (core_.flag & ~BamRecordImpl::SUPPLEMENTARY);
+    return *this;
+}
+
+// Takes ownership of the tag collection that BuildInPlace() will encode;
+// returns *this.
+BamRecordBuilder& BamRecordBuilder::Tags(TagCollection tags)
+{
+    tags_ = std::move(tags);
+    return *this;
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/BamRecordImpl.cpp b/src/BamRecordImpl.cpp

new file mode 100644 (file)

index 0000000..1281b92
--- /dev/null
+++ b/src/BamRecordImpl.cpp
@@ -0,0 +1,820 @@
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/BamRecordImpl.h"
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <new>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+#include <htslib/hts_endian.h>
+
+#include "BamRecordTags.h"
+#include "MemoryUtils.h"
+#include "pbbam/BamTagCodec.h"
+#include "pbbam/StringUtilities.h"
+
+namespace PacBio {
+namespace BAM {
+
+namespace {
+
+// Result of DoesHtslibSupportLongCigar(), evaluated once when this file's
+// statics are initialized. When false, the CigarData() accessors in this file
+// store and read long CIGARs through the "CG" tag.
+static const bool has_native_long_cigar_support = DoesHtslibSupportLongCigar();
+
+// Decodes `len` packed CIGAR words, starting at `src`, into a Cigar.
+// Each word is split into its operation type and length with the htslib
+// bam_cigar_op / bam_cigar_oplen macros.
+Cigar FetchRawCigar(const uint32_t* const src, const uint32_t len)
+{
+    Cigar decoded;
+    decoded.reserve(len);
+    const uint32_t* const end = src + len;
+    for (const uint32_t* word = src; word != end; ++word) {
+        const auto opType = static_cast<CigarOperationType>(bam_cigar_op(*word));
+        const uint32_t opLength = bam_cigar_oplen(*word);
+        decoded.push_back(CigarOperation(opType, opLength));
+    }
+    return decoded;
+}
+
+bool HasLongCigar(const bam1_t* const b)
+{
+    auto* c = &b->core;
+
+    // if empty CIGAR or unmapped
+    if (c->n_cigar == 0 || c->tid < 0 || c->pos < 0) return false;
+
+    // if existing CIGAR doesn't look like a 'fake CIGAR'
+    const auto firstCigarOp = *(bam_get_cigar(b));
+    if (bam_cigar_op(firstCigarOp) != static_cast<uint32_t>(CigarOperationType::SOFT_CLIP) ||
+        static_cast<int32_t>(bam_cigar_oplen(firstCigarOp)) != c->l_qseq) {
+        return false;
+    }
+
+    // if CG tag missing, not expected type
+    const uint8_t* const CG = bam_aux_get(b, "CG");
+    if (CG == nullptr) return false;
+    if (CG[0] != 'B' || CG[1] != 'I') return false;
+
+    // if CG tag data is empty
+    uint32_t numElements = 0;
+    memcpy(&numElements, &CG[2], sizeof(uint32_t));
+    if (numElements == 0) return false;
+
+    // we've found long CIGAR data in the CG tag
+    return true;
+}
+
+}  // namespace
+
+// Default constructor: starts with a null d_ and lets InitializeData() create
+// and populate the underlying record.
+BamRecordImpl::BamRecordImpl() : d_{nullptr}
+{
+    InitializeData();
+    assert(d_);
+}
+
+// Copy constructor: duplicates other's record with bam_dup1 (owned through
+// HtslibRecordDeleter) and copies its cached tag offsets.
+BamRecordImpl::BamRecordImpl(const BamRecordImpl& other)
+    : d_{bam_dup1(other.d_.get()), HtslibRecordDeleter()}, tagOffsets_{other.tagOffsets_}
+{
+    assert(d_);
+}
+
+// Copy assignment: copies other's record contents (bam_copy1) and cached tag
+// offsets into this object. Self-assignment is a no-op.
+BamRecordImpl& BamRecordImpl::operator=(const BamRecordImpl& other)
+{
+    if (this == &other) {
+        assert(d_);
+        return *this;
+    }
+
+    // make sure there is a destination record to copy into
+    if (!d_) InitializeData();
+    bam_copy1(d_.get(), other.d_.get());
+    tagOffsets_ = other.tagOffsets_;
+
+    assert(d_);
+    return *this;
+}
+
+// Convenience overload: adds the tag with TagModifier::NONE.
+bool BamRecordImpl::AddTag(const std::string& tagName, const Tag& value)
+{
+    return AddTag(tagName, value, TagModifier::NONE);
+}
+
+// Convenience overload: resolves the enum to its label via
+// BamRecordTags::LabelFor and adds the tag with TagModifier::NONE.
+bool BamRecordImpl::AddTag(const BamRecordTag tag, const Tag& value)
+{
+    return AddTag(BamRecordTags::LabelFor(tag), value, TagModifier::NONE);
+}
+
+// Appends a new tag to the record.
+//
+// Returns false, leaving the record untouched, when the name is not exactly
+// 2 characters, when a tag with that name already exists, or when
+// AddTagImpl reports failure. On success the tag offset cache is refreshed.
+bool BamRecordImpl::AddTag(const std::string& tagName, const Tag& value,
+                           const TagModifier additionalModifier)
+{
+    if (tagName.size() != 2) return false;
+    if (HasTag(tagName)) return false;
+    if (!AddTagImpl(tagName, value, additionalModifier)) return false;
+
+    UpdateTagMap();
+    return true;
+}
+
+// Convenience overload: resolves the enum to its label via
+// BamRecordTags::LabelFor and forwards to the string-named overload.
+bool BamRecordImpl::AddTag(const BamRecordTag tag, const Tag& value,
+                           const TagModifier additionalModifier)
+{
+    return AddTag(BamRecordTags::LabelFor(tag), value, additionalModifier);
+}
+
+bool BamRecordImpl::AddTagImpl(const std::string& tagName, const Tag& value,
+                               const TagModifier additionalModifier)
+{
+    const auto rawData = BamTagCodec::ToRawData(value, additionalModifier);
+    if (rawData.empty()) return false;
+
+    bam_aux_append(d_.get(), tagName.c_str(), BamTagCodec::TagTypeCode(value, additionalModifier),
+                   rawData.size(), const_cast<uint8_t*>(rawData.data()));
+    return true;
+}
+
+// Returns the record's core.bin field.
+uint32_t BamRecordImpl::Bin() const { return d_->core.bin; }
+
+// Stores `bin` in the record's core.bin field; returns *this for chaining.
+BamRecordImpl& BamRecordImpl::Bin(uint32_t bin)
+{
+    d_->core.bin = bin;
+    return *this;
+}
+
+Cigar BamRecordImpl::CigarData() const
+{
+    const auto* b = d_.get();
+    if (!has_native_long_cigar_support && HasLongCigar(b)) {
+        // fetch long CIGAR from tag
+        const auto cigarTag = TagValue("CG");
+        const auto cigarTagValue = cigarTag.ToUInt32Array();
+        return FetchRawCigar(cigarTagValue.data(), cigarTagValue.size());
+    } else {
+        // fetch CIGAR from the standard location
+        return FetchRawCigar(bam_get_cigar(b), b->core.n_cigar);
+    }
+}
+
+BamRecordImpl& BamRecordImpl::CigarData(const Cigar& cigar)
+{
+    // if long CIGAR, using htslib version < 1.7, set it "manually"
+    if (!has_native_long_cigar_support && cigar.size() >= 65536) {
+        // Add the 'fake' CIGAR in normal place.
+        Cigar fake;
+        fake.emplace_back(CigarOperationType::SOFT_CLIP, SequenceLength());
+        const uint32_t alignedLength =
+            static_cast<uint32_t>(bam_cigar2rlen(d_->core.n_cigar, bam_get_cigar(d_.get())));
+        fake.emplace_back(CigarOperationType::REFERENCE_SKIP, alignedLength);
+        SetCigarData(fake);
+
+        // Add raw CIGAR data to CG tag.
+        std::vector<uint32_t> cigarData(cigar.size());
+        cigarData.reserve(cigar.size());
+        for (size_t i = 0; i < cigar.size(); ++i) {
+            const CigarOperation& op = cigar.at(i);
+            cigarData[i] = bam_cigar_gen(op.Length(), static_cast<int>(op.Type()));
+        }
+        if (HasTag("CG"))
+            EditTag("CG", Tag{cigarData});
+        else
+            AddTag("CG", Tag{cigarData});
+    }
+
+    // otherwise (v1.7+ or short CIGAR), use standard APIs
+    else {
+        if (HasTag("CG")) RemoveTag("CG");
+        SetCigarData(cigar);
+    }
+
+    return *this;
+}
+
+// Convenience overload: parses `cigarString` with Cigar::FromStdString and
+// forwards to the Cigar overload.
+BamRecordImpl& BamRecordImpl::CigarData(const std::string& cigarString)
+{
+    return CigarData(Cigar::FromStdString(cigarString));
+}
+
+// Convenience overload: edits the tag with TagModifier::NONE.
+bool BamRecordImpl::EditTag(const std::string& tagName, const Tag& newValue)
+{
+    return EditTag(tagName, newValue, TagModifier::NONE);
+}
+
+// Convenience overload: resolves the enum to its label via
+// BamRecordTags::LabelFor and edits the tag with TagModifier::NONE.
+bool BamRecordImpl::EditTag(const BamRecordTag tag, const Tag& newValue)
+{
+    return EditTag(BamRecordTags::LabelFor(tag), newValue, TagModifier::NONE);
+}
+
+// Replaces the value stored under `tagName` with `newValue`.
+//
+// Returns false, leaving the record untouched, if the existing tag could not
+// be removed (e.g. it is absent or the name is not 2 characters). Also returns
+// false if the old value was removed but the new one could not be added; in
+// that case the tag is gone from the record.
+bool BamRecordImpl::EditTag(const std::string& tagName, const Tag& newValue,
+                            const TagModifier additionalModifier)
+{
+    // try remove old value (with delayed tag map update)
+    if (!RemoveTagImpl(tagName)) return false;
+
+    // The aux data has changed at this point whether or not the re-add
+    // succeeds, so the offset cache must be refreshed on both paths. Skipping
+    // it after a failed add would leave offsets describing the removed tag.
+    const bool added = AddTagImpl(tagName, newValue, additionalModifier);
+    UpdateTagMap();
+    return added;
+}
+
+// Convenience overload: resolves the enum to its label via
+// BamRecordTags::LabelFor and forwards to the string-named overload.
+bool BamRecordImpl::EditTag(const BamRecordTag tag, const Tag& newValue,
+                            const TagModifier additionalModifier)
+{
+    return EditTag(BamRecordTags::LabelFor(tag), newValue, additionalModifier);
+}
+
+// Returns the record's core.flag field (the alignment flag bits).
+uint32_t BamRecordImpl::Flag() const { return d_->core.flag; }
+
+// Overwrites the record's core.flag field with `flag`; returns *this for chaining.
+BamRecordImpl& BamRecordImpl::Flag(uint32_t flag)
+{
+    d_->core.flag = flag;
+    return *this;
+}
+
+// Builds a BamRecordImpl holding a copy (bam_copy1) of the record in `rawData`.
+// The tag offset cache of the result is left as default-constructed.
+BamRecordImpl BamRecordImpl::FromRawData(const std::shared_ptr<bam1_t>& rawData)
+{
+    BamRecordImpl result;
+    bam_copy1(result.d_.get(), rawData.get());
+    return result;
+}
+
+// True when `tagName` is exactly 2 characters long and the record has a tag
+// with that name. Names of any other length yield false.
+bool BamRecordImpl::HasTag(const std::string& tagName) const
+{
+    // the length test must come first: TagOffset throws on other lengths
+    return (tagName.size() == 2) && (TagOffset(tagName) != -1);
+}
+
+// Convenience overload: resolves the enum to its label via
+// BamRecordTags::LabelFor and forwards to the string-named overload.
+bool BamRecordImpl::HasTag(const BamRecordTag tag) const
+{
+    return HasTag(BamRecordTags::LabelFor(tag));
+}
+
+// Replaces d_ with a freshly allocated, empty, unmapped record.
+//
+// The record gets a zero-filled 0x800-byte data buffer, unmapped position and
+// reference fields for itself and its mate, the UNMAPPED flag, a map quality of
+// 255, and an empty query name.
+//
+// Throws std::bad_alloc if either allocation fails; d_ is left null when the
+// data buffer cannot be allocated.
+void BamRecordImpl::InitializeData()
+{
+    bam1_t* const raw = bam_init1();
+    if (raw == nullptr) throw std::bad_alloc{};
+    d_.reset(raw, HtslibRecordDeleter());
+
+    constexpr uint32_t initialCapacity = 0x800;  // maybe make this value tune-able later?
+    d_->data = static_cast<uint8_t*>(calloc(initialCapacity, sizeof(uint8_t)));
+    if (d_->data == nullptr) {
+        // do not keep a record without a data buffer around
+        d_.reset();
+        throw std::bad_alloc{};
+    }
+    d_->m_data = initialCapacity;
+
+    // init unmapped
+    Position(PacBio::BAM::UnmappedPosition);
+    MatePosition(PacBio::BAM::UnmappedPosition);
+    ReferenceId(-1);
+    MateReferenceId(-1);
+    SetMapped(false);
+    MapQuality(255);
+
+    // initialized with empty qname (null term + 3 'extra nulls' for alignment)
+    d_->core.l_extranul = 3;
+    d_->core.l_qname = 4;
+    d_->l_data = 4;
+}
+
+// Returns the record's core.isize field.
+int32_t BamRecordImpl::InsertSize() const { return d_->core.isize; }
+
+// Stores `iSize` in the record's core.isize field; returns *this for chaining.
+BamRecordImpl& BamRecordImpl::InsertSize(int32_t iSize)
+{
+    d_->core.isize = iSize;
+    return *this;
+}
+
+// True when the DUPLICATE bit is set in the record's flag field.
+bool BamRecordImpl::IsDuplicate() const { return (d_->core.flag & BamRecordImpl::DUPLICATE) != 0; }
+
+// True when the FAILED_QC bit is set in the record's flag field.
+bool BamRecordImpl::IsFailedQC() const { return (d_->core.flag & BamRecordImpl::FAILED_QC) != 0; }
+
+// True when the MATE_1 bit is set in the record's flag field.
+bool BamRecordImpl::IsFirstMate() const { return (d_->core.flag & BamRecordImpl::MATE_1) != 0; }
+
+// True when the UNMAPPED bit is NOT set in the record's flag field.
+bool BamRecordImpl::IsMapped() const { return (d_->core.flag & BamRecordImpl::UNMAPPED) == 0; }
+
+// True when the MATE_UNMAPPED bit is NOT set in the record's flag field.
+bool BamRecordImpl::IsMateMapped() const
+{
+    return (d_->core.flag & BamRecordImpl::MATE_UNMAPPED) == 0;
+}
+
+// True when the MATE_REVERSE_STRAND bit is set in the record's flag field.
+bool BamRecordImpl::IsMateReverseStrand() const
+{
+    return (d_->core.flag & BamRecordImpl::MATE_REVERSE_STRAND) != 0;
+}
+
+// True when the PAIRED bit is set in the record's flag field.
+bool BamRecordImpl::IsPaired() const { return (d_->core.flag & BamRecordImpl::PAIRED) != 0; }
+
+// True when the SECONDARY bit is NOT set in the record's flag field.
+bool BamRecordImpl::IsPrimaryAlignment() const
+{
+    return (d_->core.flag & BamRecordImpl::SECONDARY) == 0;
+}
+
+// True when the PROPER_PAIR bit is set in the record's flag field.
+bool BamRecordImpl::IsProperPair() const
+{
+    return (d_->core.flag & BamRecordImpl::PROPER_PAIR) != 0;
+}
+
+// True when the REVERSE_STRAND bit is set in the record's flag field.
+bool BamRecordImpl::IsReverseStrand() const
+{
+    return (d_->core.flag & BamRecordImpl::REVERSE_STRAND) != 0;
+}
+
+// True when the MATE_2 bit is set in the record's flag field.
+bool BamRecordImpl::IsSecondMate() const { return (d_->core.flag & BamRecordImpl::MATE_2) != 0; }
+
+// True when the SUPPLEMENTARY bit is set in the record's flag field.
+bool BamRecordImpl::IsSupplementaryAlignment() const
+{
+    return (d_->core.flag & BamRecordImpl::SUPPLEMENTARY) != 0;
+}
+
+// Returns the record's core.qual field (mapping quality).
+uint8_t BamRecordImpl::MapQuality() const { return d_->core.qual; }
+
+// Stores `mapQual` in the record's core.qual field; returns *this for chaining.
+BamRecordImpl& BamRecordImpl::MapQuality(uint8_t mapQual)
+{
+    d_->core.qual = mapQual;
+    return *this;
+}
+
+// Returns the record's core.mpos field (mate position).
+PacBio::BAM::Position BamRecordImpl::MatePosition() const { return d_->core.mpos; }
+
+// Stores `pos` in the record's core.mpos field; returns *this for chaining.
+BamRecordImpl& BamRecordImpl::MatePosition(PacBio::BAM::Position pos)
+{
+    d_->core.mpos = pos;
+    return *this;
+}
+
+// Returns the record's core.mtid field (mate reference ID).
+int32_t BamRecordImpl::MateReferenceId() const { return d_->core.mtid; }
+
+// Stores `id` in the record's core.mtid field; returns *this for chaining.
+BamRecordImpl& BamRecordImpl::MateReferenceId(int32_t id)
+{
+    d_->core.mtid = id;
+    return *this;
+}
+
+// Returns the record's core.pos field.
+PacBio::BAM::Position BamRecordImpl::Position() const { return d_->core.pos; }
+
+// Stores `pos` in the record's core.pos field; returns *this for chaining.
+BamRecordImpl& BamRecordImpl::Position(PacBio::BAM::Position pos)
+{
+    d_->core.pos = pos;
+    return *this;
+}
+
+// Returns the record's core.tid field (reference ID).
+int32_t BamRecordImpl::ReferenceId() const { return d_->core.tid; }
+
+// Stores `id` in the record's core.tid field; returns *this for chaining.
+BamRecordImpl& BamRecordImpl::ReferenceId(int32_t id)
+{
+    d_->core.tid = id;
+    return *this;
+}
+
+// Sets (ok == true) or clears (ok == false) the DUPLICATE flag bit.
+BamRecordImpl& BamRecordImpl::SetDuplicate(bool ok)
+{
+    if (!ok) {
+        d_->core.flag &= ~BamRecordImpl::DUPLICATE;
+    } else {
+        d_->core.flag |= BamRecordImpl::DUPLICATE;
+    }
+    return *this;
+}
+
+// Grows the record's data buffer, if necessary, so it can hold l_data bytes.
+//
+// Callers in this file raise d_->l_data to the new content size first and then
+// call this. When the current capacity (m_data) is too small, the capacity is
+// rounded up to the next power of two with kroundup32 and the buffer is
+// reallocated.
+//
+// Throws std::bad_alloc if the buffer cannot be grown. The existing buffer and
+// m_data are left as they were in that case (l_data has already been changed
+// by the caller).
+void BamRecordImpl::MaybeReallocData()
+{
+    // from sam.h:
+    //   decltype(m_data) = uint32_t
+    //   decltype(l_data) = int
+    const auto needed = static_cast<uint32_t>(d_->l_data);
+    if (d_->m_data >= needed) return;
+
+    uint32_t newCapacity = needed;
+    kroundup32(newCapacity);
+
+    // Hold the result in a temporary: assigning realloc's return value
+    // straight to d_->data would lose (leak) the old buffer on failure.
+    auto* const grown = static_cast<uint8_t*>(realloc(d_->data, newCapacity));
+    if (grown == nullptr) throw std::bad_alloc{};
+    d_->data = grown;
+    d_->m_data = newCapacity;
+}
+
+// Returns a copy of the record's query name, read as a null-terminated string
+// from the location given by bam_get_qname.
+std::string BamRecordImpl::Name() const { return std::string(bam_get_qname(d_)); }
+
+// Replaces the record's query name with `name` and returns *this for chaining.
+//
+// The name is stored at the start of the data buffer, followed by padding
+// nulls, so everything after it (cigar, seq, qual, tags) is shifted to make
+// room or close the gap.
+BamRecordImpl& BamRecordImpl::Name(const std::string& name)
+{
+    // determine change in memory needed
+    // diffNumBytes: pos -> growing, neg -> shrinking
+    const size_t numChars = name.size() + 1;  // +1 for NULL-term
+    // pads the stored name to a multiple of 4 bytes; note this yields 4 (not 0)
+    // padding bytes when numChars is already a multiple of 4
+    const size_t numExtraNulls = 4 - (numChars % 4);
+    const size_t totalNameSize = numChars + numExtraNulls;
+
+    const int diffNumBytes = totalNameSize - d_->core.l_qname;
+    const int oldLengthData = d_->l_data;
+    d_->l_data += diffNumBytes;
+    MaybeReallocData();
+
+    // shift trailing data (cigar, seq, qual, tags) as needed
+    // the old start is read after the possible realloc (so it points into the
+    // current buffer) and before l_qname changes (so it uses the old name size)
+    const uint32_t* oldCigarStart = bam_get_cigar(d_);
+    const size_t trailingDataLength =
+        oldLengthData - (reinterpret_cast<const unsigned char*>(oldCigarStart) -
+                         reinterpret_cast<const unsigned char*>(d_->data));
+    d_->core.l_qname = totalNameSize;
+    d_->core.l_extranul = numExtraNulls;
+    // with l_qname updated, the same macro now yields the new start
+    uint32_t* newCigarStart = bam_get_cigar(d_);
+    // source and destination may overlap, hence memmove rather than memcpy
+    memmove(newCigarStart, oldCigarStart, trailingDataLength);
+
+    // fill in new name
+    memcpy(d_->data, name.c_str(), numChars);
+    memset(d_->data + numChars, '\0', numExtraNulls);
+    return *this;
+}
+
+// Returns the record's per-base quality values.
+//
+// The result is empty when the sequence length is 0 or when the first stored
+// quality byte is 0xff.
+QualityValues BamRecordImpl::Qualities() const
+{
+    QualityValues quals;
+
+    const size_t count = d_->core.l_qseq;
+    if (count == 0) return quals;
+
+    const uint8_t* const rawQuals = bam_get_qual(d_);
+    if (rawQuals[0] == 0xff) return quals;
+
+    quals.reserve(count);
+    for (size_t idx = 0; idx != count; ++idx) {
+        quals.push_back(QualityValue(rawQuals[idx]));
+    }
+    return quals;
+}
+
+// Removes the tag named `tagName` and, on success, refreshes the tag offset
+// cache. Returns whether a tag was removed.
+bool BamRecordImpl::RemoveTag(const std::string& tagName)
+{
+    if (!RemoveTagImpl(tagName)) return false;
+    UpdateTagMap();
+    return true;
+}
+
+// Convenience overload: resolves the enum to its label via
+// BamRecordTags::LabelFor and forwards to the string-named overload.
+bool BamRecordImpl::RemoveTag(const BamRecordTag tag)
+{
+    return RemoveTag(BamRecordTags::LabelFor(tag));
+}
+
+// Deletes the tag named `tagName` from the record's aux data without touching
+// the tag offset cache.
+//
+// Returns false when the name is not exactly 2 characters, when no such tag is
+// found, or when bam_aux_del reports a nonzero result.
+bool BamRecordImpl::RemoveTagImpl(const std::string& tagName)
+{
+    if (tagName.size() != 2) return false;
+
+    uint8_t* const tagData = bam_aux_get(d_.get(), tagName.c_str());
+    return (tagData != nullptr) && (bam_aux_del(d_.get(), tagData) == 0);
+}
+
+std::string BamRecordImpl::Sequence() const
+{
+    std::string result(d_->core.l_qseq, '\0');
+    static const constexpr std::array<char, 16> DnaLookup{
+        {'=', 'A', 'C', 'M', 'G', 'R', 'S', 'V', 'T', 'W', 'Y', 'H', 'K', 'D', 'B', 'N'}};
+    const uint8_t* seqData = bam_get_seq(d_);
+    for (int i = 0; i < d_->core.l_qseq; ++i)
+        result[i] = DnaLookup[bam_seqi(seqData, i)];
+    return result;
+}
+
+// Returns the record's core.l_qseq field (number of bases), converted to size_t.
+size_t BamRecordImpl::SequenceLength() const { return d_->core.l_qseq; }
+
+// Writes `cigar` into the record's standard CIGAR location.
+//
+// The CIGAR sits between the name and the sequence in the data buffer, so the
+// data after it (seq, qual, tags) is shifted to fit the new operation count
+// before the packed operations are written.
+void BamRecordImpl::SetCigarData(const Cigar& cigar)
+{
+    // determine change in memory needed
+    // diffNumBytes: pos -> growing, neg -> shrinking
+    const size_t numCigarOps = cigar.size();
+    const int diffNumCigars = numCigarOps - d_->core.n_cigar;
+    const int diffNumBytes = diffNumCigars * sizeof(uint32_t);
+    const int oldLengthData = d_->l_data;
+    d_->l_data += diffNumBytes;
+    MaybeReallocData();
+
+    // shift trailing data (seq, qual, tags) as needed
+    // the old start is read after the possible realloc (so it points into the
+    // current buffer) and before n_cigar changes (so it uses the old op count)
+    const uint8_t* oldSequenceStart = bam_get_seq(d_);
+    const size_t trailingDataLength = oldLengthData - (oldSequenceStart - d_->data);
+    d_->core.n_cigar = numCigarOps;
+    // with n_cigar updated, the same macro now yields the new start
+    uint8_t* newSequenceStart = bam_get_seq(d_);
+    // source and destination may overlap, hence memmove rather than memcpy
+    memmove(newSequenceStart, oldSequenceStart, trailingDataLength);
+
+    // fill in new CIGAR data
+    uint32_t* cigarDataStart = bam_get_cigar(d_);
+    for (size_t i = 0; i < numCigarOps; ++i) {
+        const CigarOperation& cigarOp = cigar.at(i);
+        cigarDataStart[i] = bam_cigar_gen(cigarOp.Length(), static_cast<int>(cigarOp.Type()));
+    }
+}
+
+// Sets (ok == true) or clears (ok == false) the FAILED_QC flag bit.
+BamRecordImpl& BamRecordImpl::SetFailedQC(bool ok)
+{
+    if (!ok) {
+        d_->core.flag &= ~BamRecordImpl::FAILED_QC;
+    } else {
+        d_->core.flag |= BamRecordImpl::FAILED_QC;
+    }
+    return *this;
+}
+
+// Sets (ok == true) or clears (ok == false) the MATE_1 flag bit.
+BamRecordImpl& BamRecordImpl::SetFirstMate(bool ok)
+{
+    if (!ok) {
+        d_->core.flag &= ~BamRecordImpl::MATE_1;
+    } else {
+        d_->core.flag |= BamRecordImpl::MATE_1;
+    }
+    return *this;
+}
+
+// Records whether the read is mapped. The flag field stores the opposite sense
+// (UNMAPPED), so ok == true clears the bit and ok == false sets it.
+BamRecordImpl& BamRecordImpl::SetMapped(bool ok)
+{
+    if (!ok) {
+        d_->core.flag |= BamRecordImpl::UNMAPPED;
+    } else {
+        d_->core.flag &= ~BamRecordImpl::UNMAPPED;
+    }
+    return *this;
+}
+
+// Records whether the mate is mapped. The flag field stores the opposite sense
+// (MATE_UNMAPPED), so ok == true clears the bit and ok == false sets it.
+BamRecordImpl& BamRecordImpl::SetMateMapped(bool ok)
+{
+    if (!ok) {
+        d_->core.flag |= BamRecordImpl::MATE_UNMAPPED;
+    } else {
+        d_->core.flag &= ~BamRecordImpl::MATE_UNMAPPED;
+    }
+    return *this;
+}
+
+// Sets (ok == true) or clears (ok == false) the MATE_REVERSE_STRAND flag bit.
+BamRecordImpl& BamRecordImpl::SetMateReverseStrand(bool ok)
+{
+    if (!ok) {
+        d_->core.flag &= ~BamRecordImpl::MATE_REVERSE_STRAND;
+    } else {
+        d_->core.flag |= BamRecordImpl::MATE_REVERSE_STRAND;
+    }
+    return *this;
+}
+
+// Sets (ok == true) or clears (ok == false) the PAIRED flag bit.
+BamRecordImpl& BamRecordImpl::SetPaired(bool ok)
+{
+    if (!ok) {
+        d_->core.flag &= ~BamRecordImpl::PAIRED;
+    } else {
+        d_->core.flag |= BamRecordImpl::PAIRED;
+    }
+    return *this;
+}
+
+// Records whether this is a primary alignment. The flag field stores the
+// opposite sense (SECONDARY), so ok == true clears the bit and ok == false sets it.
+BamRecordImpl& BamRecordImpl::SetPrimaryAlignment(bool ok)
+{
+    if (!ok) {
+        d_->core.flag |= BamRecordImpl::SECONDARY;
+    } else {
+        d_->core.flag &= ~BamRecordImpl::SECONDARY;
+    }
+    return *this;
+}
+
+// Sets (ok == true) or clears (ok == false) the PROPER_PAIR flag bit.
+BamRecordImpl& BamRecordImpl::SetProperPair(bool ok)
+{
+    if (!ok) {
+        d_->core.flag &= ~BamRecordImpl::PROPER_PAIR;
+    } else {
+        d_->core.flag |= BamRecordImpl::PROPER_PAIR;
+    }
+    return *this;
+}
+
+// Sets (ok == true) or clears (ok == false) the REVERSE_STRAND flag bit.
+BamRecordImpl& BamRecordImpl::SetReverseStrand(bool ok)
+{
+    if (!ok) {
+        d_->core.flag &= ~BamRecordImpl::REVERSE_STRAND;
+    } else {
+        d_->core.flag |= BamRecordImpl::REVERSE_STRAND;
+    }
+    return *this;
+}
+
+// Sets (ok == true) or clears (ok == false) the MATE_2 flag bit.
+BamRecordImpl& BamRecordImpl::SetSecondMate(bool ok)
+{
+    if (!ok) {
+        d_->core.flag &= ~BamRecordImpl::MATE_2;
+    } else {
+        d_->core.flag |= BamRecordImpl::MATE_2;
+    }
+    return *this;
+}
+
+BamRecordImpl& BamRecordImpl::SetSequenceAndQualities(const std::string& sequence,
+                                                      const std::string& qualities)
+{
+    if (!qualities.empty() && (sequence.size() != qualities.size())) {
+        std::ostringstream s;
+        s << "BamRecord: if qualities are provided, the length must match the sequence length:\n"
+          << "  seq: " << sequence.size() << '\n'
+          << "  qualities: " << qualities.size();
+        throw std::runtime_error{s.str()};
+    }
+    return SetSequenceAndQualitiesInternal(sequence.c_str(), sequence.size(), qualities.c_str(),
+                                           false);
+}
+
+// Raw-pointer overload: forwards to SetSequenceAndQualitiesInternal with
+// isPreencoded = false. No length check on `qualities` is performed here.
+BamRecordImpl& BamRecordImpl::SetSequenceAndQualities(const char* sequence,
+                                                      const size_t sequenceLength,
+                                                      const char* qualities)
+{
+    return SetSequenceAndQualitiesInternal(sequence, sequenceLength, qualities, false);
+}
+
+// Forwards to SetSequenceAndQualitiesInternal with isPreencoded = true, so
+// `encodedSequence` is copied into the record as-is rather than being encoded.
+// `rawSequenceLength` is passed on as the sequence length.
+BamRecordImpl& BamRecordImpl::SetPreencodedSequenceAndQualities(const char* encodedSequence,
+                                                                const size_t rawSequenceLength,
+                                                                const char* qualities)
+{
+    return SetSequenceAndQualitiesInternal(encodedSequence, rawSequenceLength, qualities, true);
+}
+
+BamRecordImpl& BamRecordImpl::SetSequenceAndQualitiesInternal(const char* sequence,
+                                                              const size_t sequenceLength,
+                                                              const char* qualities,
+                                                              bool isPreencoded)
+{
+    // determine change in memory needed
+    // diffNumBytes: pos -> growing, neg -> shrinking
+    const auto encodedSequenceLength = static_cast<int>((sequenceLength + 1) / 2);
+    const int oldSeqAndQualLength =
+        ((d_->core.l_qseq + 1) / 2) + d_->core.l_qseq;                       // encoded seq + qual
+    const int newSeqAndQualLength = encodedSequenceLength + sequenceLength;  // encoded seq + qual
+    const int diffNumBytes = newSeqAndQualLength - oldSeqAndQualLength;
+    const int oldLengthData = d_->l_data;
+    d_->l_data += diffNumBytes;
+    MaybeReallocData();
+
+    // shift trailing data (tags) as needed
+    const unsigned char* oldTagStart = bam_get_aux(d_);
+    const size_t trailingDataLength =
+        oldLengthData - (oldTagStart - reinterpret_cast<const unsigned char*>(d_->data));
+    d_->core.l_qseq = sequenceLength;
+    uint8_t* newTagStart = bam_get_aux(d_);
+    memmove(newTagStart, oldTagStart, trailingDataLength);
+
+    // fill in new sequence
+    uint8_t* pEncodedSequence = bam_get_seq(d_);
+    if (isPreencoded) {
+        memcpy(pEncodedSequence, sequence, encodedSequenceLength);
+    } else {
+        memset(pEncodedSequence, 0, encodedSequenceLength);
+        for (size_t i = 0; i < sequenceLength; ++i)
+            pEncodedSequence[i >> 1] |= seq_nt16_table[static_cast<int>(sequence[i])]
+                                        << ((~i & 1) << 2);
+    }
+
+    // fill in quality values
+    uint8_t* encodedQualities = bam_get_qual(d_);
+    if ((qualities == nullptr) || (strlen(qualities) == 0))
+        memset(encodedQualities, 0xff, sequenceLength);
+    else {
+        for (size_t i = 0; i < sequenceLength; ++i)
+            encodedQualities[i] = qualities[i] - 33;  // FASTQ ASCII -> int conversion
+    }
+    return *this;
+}
+
+// Sets (ok == true) or clears (ok == false) the SUPPLEMENTARY flag bit.
+BamRecordImpl& BamRecordImpl::SetSupplementaryAlignment(bool ok)
+{
+    if (!ok) {
+        d_->core.flag &= ~BamRecordImpl::SUPPLEMENTARY;
+    } else {
+        d_->core.flag |= BamRecordImpl::SUPPLEMENTARY;
+    }
+    return *this;
+}
+
+int BamRecordImpl::TagOffset(const std::string& tagName) const
+{
+    if (tagName.size() != 2)
+        throw std::runtime_error{"BamRecord: tag name (" + tagName +
+                                 ") must have 2 characters only"};
+
+    if (tagOffsets_.empty()) UpdateTagMap();
+
+    const uint16_t tagCode =
+        (static_cast<uint8_t>(tagName.at(0)) << 8) | static_cast<uint8_t>(tagName.at(1));
+    const auto found = tagOffsets_.find(tagCode);
+    return (found != tagOffsets_.cend() ? found->second : -1);
+}
+
+// Replaces all of the record's tag data with the encoded form of `tags`, then
+// rebuilds the tag offset cache. Returns *this for chaining.
+BamRecordImpl& BamRecordImpl::Tags(const TagCollection& tags)
+{
+    // convert tags to binary
+    const std::vector<uint8_t> tagData = BamTagCodec::Encode(tags);
+    const size_t numBytes = tagData.size();
+    const uint8_t* data = tagData.data();
+
+    // determine change in memory needed
+    // tag data is the last section of the buffer, so nothing needs shifting
+    uint8_t* tagStart = bam_get_aux(d_);
+    const size_t oldNumBytes = d_->l_data - (tagStart - d_->data);
+    const int diffNumBytes = numBytes - oldNumBytes;
+    d_->l_data += diffNumBytes;
+    MaybeReallocData();
+    // the buffer may have been reallocated, so the start is looked up again
+    tagStart = bam_get_aux(d_);
+
+    // fill in new tag data
+    memcpy(static_cast<void*>(tagStart), data, numBytes);
+
+    // update tag info
+    UpdateTagMap();
+    return *this;
+}
+
+// Decodes and returns all of the record's tags. The tag bytes run from
+// bam_get_aux to the end of the record's data (l_data).
+TagCollection BamRecordImpl::Tags() const
+{
+    const uint8_t* const auxBegin = bam_get_aux(d_);
+    const size_t auxLength = d_->l_data - (auxBegin - d_->data);
+    const uint8_t* const auxEnd = auxBegin + auxLength;
+
+    const std::vector<uint8_t> rawTags(auxBegin, auxEnd);
+    return BamTagCodec::Decode(rawTags);
+}
+
+// Returns the value of the tag named `tagName`.
+//
+// Returns a default-constructed Tag when the name is not exactly 2 characters,
+// when the tag is not present, or when the cached offset does not fall inside
+// the record's tag data.
+Tag BamRecordImpl::TagValue(const std::string& tagName) const
+{
+    if (tagName.size() != 2) return {};
+
+    const int offset = TagOffset(tagName);
+    if (offset == -1) return {};
+
+    bam1_t* b = d_.get();
+    uint8_t* const auxStart = bam_get_aux(b);
+    assert(auxStart);
+
+    // The offset is relative to the start of the tag data, so it has to be
+    // checked against the size of the tag data, not the whole record length.
+    const ptrdiff_t auxLength = b->l_data - (auxStart - b->data);
+    if (offset >= auxLength) return {};
+
+    // the cached offset already points past the 2-character tag name
+    return BamTagCodec::FromRawData(auxStart + offset);
+}
+
+// Convenience overload: resolves the enum to its label via
+// BamRecordTags::LabelFor and forwards to the string-named overload.
+Tag BamRecordImpl::TagValue(const BamRecordTag tag) const
+{
+    return TagValue(BamRecordTags::LabelFor(tag));
+}
+
+void BamRecordImpl::UpdateTagMap() const
+{
+    // clear out offsets, leave map structure basically intact
+    for (auto& tag : tagOffsets_)
+        tag.second = -1;
+
+    const uint8_t* tagStart = bam_get_aux(d_);
+    if (tagStart == nullptr) return;
+    const ptrdiff_t numBytes = d_->l_data - (tagStart - d_->data);
+
+    // NOTE: using a 16-bit 'code' for tag name here instead of string, to avoid
+    // a lot of string constructions & comparisons. All valid tags will be 2 chars
+    // anyway, so this should be a nice lookup mechanism.
+    //
+    uint16_t tagNameCode;
+    int64_t i = 0;
+    while (i < numBytes) {
+
+        // store (tag name code -> start offset into tag data)
+        tagNameCode = static_cast<char>(tagStart[i]) << 8 | static_cast<char>(tagStart[i + 1]);
+        i += 2;
+        tagOffsets_[tagNameCode] = i;
+
+        // skip tag contents
+        const auto tagType = static_cast<char>(tagStart[i++]);
+        switch (tagType) {
+            case 'A':
+            case 'a':
+            case 'c':
+            case 'C': {
+                i += 1;
+                break;
+            }
+            case 's':
+            case 'S': {
+                i += 2;
+                break;
+            }
+            case 'i':
+            case 'I':
+            case 'f': {
+                i += 4;
+                break;
+            }
+
+            case 'Z':
+            case 'H': {
+                // null-terminated string
+                i += strlen(reinterpret_cast<const char*>(&tagStart[i])) + 1;
+                break;
+            }
+
+            case 'B': {
+                const char subTagType = tagStart[i++];
+                size_t elementSize = 0;
+                switch (subTagType) {
+                    case 'c':
+                    case 'C':
+                        elementSize = 1;
+                        break;
+                    case 's':
+                    case 'S':
+                        elementSize = 2;
+                        break;
+                    case 'i':
+                    case 'I':
+                    case 'f':
+                        elementSize = 4;
+                        break;
+
+                    // unknown subTagType
+                    default:
+                        throw std::runtime_error{
+                            "BamRecord: unsupported array-tag-type encountered: " +
+                            std::string{1, subTagType}};
+                }
+
+                uint32_t numElements = 0;
+                memcpy(&numElements, &tagStart[i], sizeof(uint32_t));
+                i += (4 + (elementSize * numElements));
+                break;
+            }
+
+            // unknown tagType
+            default:
+                throw std::runtime_error{"BamRecord: unsupported tag-type encountered: " +
+                                         std::string{1, tagType}};
+        }
+    }
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/BamRecordTags.cpp b/src/BamRecordTags.cpp

new file mode 100644 (file)

index 0000000..731c7a1
--- /dev/null
+++ b/src/BamRecordTags.cpp
@@ -0,0 +1,66 @@
+// File Description
+/// \file BamRecordTags.cpp
+/// \brief Implements the BamRecordTags utility class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "BamRecordTags.h"
+
+#include <cassert>
+#include <unordered_map>
+
+namespace PacBio {
+namespace BAM {
+
+// clang-format off
+// Lookup table behind BamRecordTags::LabelFor and BamRecordTags::IsPulse:
+// maps each BamRecordTag enum value to its 2-character tag label and a flag
+// telling whether the tag is a 'pulse' tag.
+const BamRecordTags::TagLookupType BamRecordTags::tagLookup =
+{
+    //     enum name                          label  isPulse?
+    //     ---------                          -----  --------
+    { BamRecordTag::ALT_LABEL_QV,             {"pv", true}  },
+    { BamRecordTag::ALT_LABEL_TAG,            {"pt", true}  },
+    { BamRecordTag::BARCODE_QUALITY,          {"bq", false} },
+    { BamRecordTag::BARCODES,                 {"bc", false} },
+    { BamRecordTag::CONTEXT_FLAGS,            {"cx", false} },
+    { BamRecordTag::DELETION_QV,              {"dq", false} },
+    { BamRecordTag::DELETION_TAG,             {"dt", false} },
+    { BamRecordTag::HOLE_NUMBER,              {"zm", false} },
+    { BamRecordTag::INSERTION_QV,             {"iq", false} },
+    { BamRecordTag::IPD,                      {"ip", false} },
+    { BamRecordTag::LABEL_QV,                 {"pq", true}  },
+    { BamRecordTag::LONG_CIGAR,               {"CG", false} },
+    { BamRecordTag::MERGE_QV,                 {"mq", false} },
+    { BamRecordTag::NUM_PASSES,               {"np", false} },
+    { BamRecordTag::PKMEAN,                   {"pa", true}  },
+    { BamRecordTag::PKMEAN_2,                 {"ps", true}  },
+    { BamRecordTag::PKMID,                    {"pm", true}  },
+    { BamRecordTag::PKMID_2,                  {"pi", true}  },
+    { BamRecordTag::PRE_PULSE_FRAMES,         {"pd", true}  },
+    { BamRecordTag::PULSE_CALL,               {"pc", true}  },
+    { BamRecordTag::PULSE_CALL_WIDTH,         {"px", true}  },
+    { BamRecordTag::PULSE_EXCLUSION,          {"pe", true}  },
+    { BamRecordTag::PULSE_MERGE_QV,           {"pg", true}  },
+    { BamRecordTag::PULSE_WIDTH,              {"pw", false} }, // 'pulse' in the name; but stored per-base, not per-pulse
+    { BamRecordTag::QUERY_END,                {"qe", false} },
+    { BamRecordTag::QUERY_END_FRAME_NUMBER,   {"we", false} },
+    { BamRecordTag::QUERY_START,              {"qs", false} },
+    { BamRecordTag::QUERY_START_FRAME_NUMBER, {"ws", false} },
+    { BamRecordTag::READ_ACCURACY,            {"rq", false} },
+    { BamRecordTag::READ_GROUP,               {"RG", false} },
+    { BamRecordTag::SCRAP_REGION_TYPE,        {"sc", false} },
+    { BamRecordTag::SCRAP_ZMW_TYPE,           {"sz", false} },
+    { BamRecordTag::SIGNAL_TO_NOISE,          {"sn", false} },
+    { BamRecordTag::START_FRAME,              {"sf", true}  },
+    { BamRecordTag::SUBSTITUTION_QV,          {"sq", false} },
+    { BamRecordTag::SUBSTITUTION_TAG,         {"st", false} },
+
+    // faux tags
+    // (both share the same two-space label)
+    { BamRecordTag::SEQ,  {"  ",  false} },
+    { BamRecordTag::QUAL, {"  ", false} }
+};
+// clang-format on
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/BamRecordTags.h b/src/BamRecordTags.h

new file mode 100644 (file)

index 0000000..0b31bdc
--- /dev/null
+++ b/src/BamRecordTags.h
@@ -0,0 +1,56 @@
+// File Description
+/// \file BamRecordTags.h
+/// \brief Defines the BamRecordTags utility class.
+//
+// Author: Derek Barnett
+
+#ifndef BAMRECORDTAGS_H
+#define BAMRECORDTAGS_H
+
+#include "pbbam/Config.h"
+
+#include <cassert>
+#include <string>
+#include <unordered_map>
+
+#include <pbbam/BamRecord.h>
+#include <pbbam/BamRecordImpl.h>
+#include <pbbam/BamRecordTag.h>
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief Static lookup helpers that map a BamRecordTag enum value to its
+///        tag label and to whether it is a 'pulse' tag.
+class BamRecordTags
+{
+public:
+    // tag info
+    /// \returns the isPulse_ flag stored for \p tag in the lookup table
+    static inline bool IsPulse(const BamRecordTag tag);
+    /// \returns a copy of the label stored for \p tag in the lookup table
+    static inline std::string LabelFor(const BamRecordTag tag);
+
+private:
+    // One lookup-table entry.
+    struct BamRecordTagData
+    {
+        const std::string label_;  // tag label (2 characters for every entry in the table)
+        const bool isPulse_;       // whether the table marks this tag as a 'pulse' tag
+    };
+
+    using TagLookupType = std::unordered_map<BamRecordTag, BamRecordTagData>;
+    // declared here; the table itself is defined out of line
+    static const TagLookupType tagLookup;
+};
+
+// Returns the isPulse_ flag stored for `tag` in the lookup table.
+// A tag missing from the table trips the assert in debug builds; otherwise
+// at() throws std::out_of_range.
+inline bool BamRecordTags::IsPulse(const BamRecordTag tag)
+{
+    assert(tagLookup.count(tag) != 0);
+    const auto& entry = tagLookup.at(tag);
+    return entry.isPulse_;
+}
+
+// Returns a copy of the label stored for `tag` in the lookup table.
+// A tag missing from the table trips the assert in debug builds; otherwise
+// at() throws std::out_of_range.
+inline std::string BamRecordTags::LabelFor(const BamRecordTag tag)
+{
+    assert(tagLookup.count(tag) != 0);
+    const auto& entry = tagLookup.at(tag);
+    return entry.label_;
+}
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // BAMRECORDTAGS_H
diff --git a/src/BamRecordView.cpp b/src/BamRecordView.cpp

new file mode 100644 (file)

index 0000000..a08b7d4
--- /dev/null
+++ b/src/BamRecordView.cpp
@@ -0,0 +1,138 @@
+// File Description
+/// \file BamRecordView.cpp
+/// \brief Implements the BamRecordView class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/BamRecordView.h"
+
+namespace PacBio {
+namespace BAM {
+
+BamRecordView::BamRecordView(const BamRecord& record, const Orientation orientation,
+                             const bool aligned, const bool exciseSoftClips,
+                             const PulseBehavior pulseBehavior)
+    : record_(record)
+    , orientation_{orientation}
+    , aligned_{aligned}
+    , exciseSoftClips_{exciseSoftClips}
+    , pulseBehavior_{pulseBehavior}
+{
+}
+
+QualityValues BamRecordView::AltLabelQVs() const
+{
+    return record_.AltLabelQV(orientation_, aligned_, exciseSoftClips_, pulseBehavior_);
+}
+
+std::string BamRecordView::AltLabelTags() const
+{
+    return record_.AltLabelTag(orientation_, aligned_, exciseSoftClips_, pulseBehavior_);
+}
+
+QualityValues BamRecordView::DeletionQVs() const
+{
+    return record_.DeletionQV(orientation_, aligned_, exciseSoftClips_);
+}
+
+std::string BamRecordView::DeletionTags() const
+{
+    return record_.DeletionTag(orientation_, aligned_, exciseSoftClips_);
+}
+
+QualityValues BamRecordView::InsertionQVs() const
+{
+    return record_.InsertionQV(orientation_, aligned_, exciseSoftClips_);
+}
+
+Frames BamRecordView::IPD() const { return record_.IPD(orientation_, aligned_, exciseSoftClips_); }
+
+Frames BamRecordView::PrebaseFrames() const
+{
+    return record_.IPD(orientation_, aligned_, exciseSoftClips_);
+}
+
+QualityValues BamRecordView::LabelQVs() const
+{
+    return record_.LabelQV(orientation_, aligned_, exciseSoftClips_, pulseBehavior_);
+}
+
+QualityValues BamRecordView::MergeQVs() const
+{
+    return record_.MergeQV(orientation_, aligned_, exciseSoftClips_);
+}
+
+QualityValues BamRecordView::PulseMergeQVs() const
+{
+    return record_.PulseMergeQV(orientation_, aligned_, exciseSoftClips_, pulseBehavior_);
+}
+
+std::vector<float> BamRecordView::Pkmean() const
+{
+    return record_.Pkmean(orientation_, aligned_, exciseSoftClips_, pulseBehavior_);
+}
+
+std::vector<float> BamRecordView::Pkmid() const
+{
+    return record_.Pkmid(orientation_, aligned_, exciseSoftClips_, pulseBehavior_);
+}
+
+std::vector<float> BamRecordView::Pkmean2() const
+{
+    return record_.Pkmean2(orientation_, aligned_, exciseSoftClips_, pulseBehavior_);
+}
+
+std::vector<float> BamRecordView::Pkmid2() const
+{
+    return record_.Pkmid2(orientation_, aligned_, exciseSoftClips_, pulseBehavior_);
+}
+
+Frames BamRecordView::PrePulseFrames() const
+{
+    return record_.PrePulseFrames(orientation_, aligned_, exciseSoftClips_, pulseBehavior_);
+}
+
+std::string BamRecordView::PulseCalls() const
+{
+    return record_.PulseCall(orientation_, aligned_, exciseSoftClips_, pulseBehavior_);
+}
+
+Frames BamRecordView::PulseCallWidth() const
+{
+    return record_.PulseCallWidth(orientation_, aligned_, exciseSoftClips_, pulseBehavior_);
+}
+
+Frames BamRecordView::PulseWidths() const
+{
+    return record_.PulseWidth(orientation_, aligned_, exciseSoftClips_);
+}
+
+QualityValues BamRecordView::Qualities() const
+{
+    return record_.Qualities(orientation_, aligned_, exciseSoftClips_);
+}
+
+std::string BamRecordView::Sequence() const
+{
+    return record_.Sequence(orientation_, aligned_, exciseSoftClips_);
+}
+
+std::vector<uint32_t> BamRecordView::StartFrames() const
+{
+    return record_.StartFrame(orientation_, aligned_, exciseSoftClips_, pulseBehavior_);
+}
+
+QualityValues BamRecordView::SubstitutionQVs() const
+{
+    return record_.SubstitutionQV(orientation_, aligned_, exciseSoftClips_);
+}
+
+std::string BamRecordView::SubstitutionTags() const
+{
+    return record_.SubstitutionTag(orientation_, aligned_, exciseSoftClips_);
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/BamTagCodec.cpp b/src/BamTagCodec.cpp

new file mode 100644 (file)

index 0000000..42fa048
--- /dev/null
+++ b/src/BamTagCodec.cpp
@@ -0,0 +1,546 @@
+// File Description
+/// \file BamTagCodec.cpp
+/// \brief Implements the BamTagCodec class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/BamTagCodec.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include <htslib/kstring.h>
+
+namespace PacBio {
+namespace BAM {
+namespace {
+
+template <typename T>
+inline void appendBamValue(const T& value, kstring_t* str)
+{
+    kputsn_(reinterpret_cast<const char*>(&value), sizeof(value), str);  // raw bytes of `value`
+}
+
+template <typename T>
+inline void appendBamMultiValue(const std::vector<T>& container, kstring_t* str)
+{
+    const uint32_t n = container.size();
+    kputsn_(&n, sizeof(n), str);  // 4-byte element count first; an empty array has no payload
+    if (n > 0) kputsn_(reinterpret_cast<const char*>(container.data()), n * sizeof(T), str);
+}
+
+template <typename T>
+inline T readBamValue(const uint8_t* src, size_t& offset)
+{
+    T value;
+    memcpy(&value, &src[offset], sizeof(value));  // copy sizeof(T) bytes starting at src[offset]
+    offset += sizeof(value);                      // move the caller's cursor past those bytes
+    return value;
+}
+
+template <typename T>
+std::vector<T> readBamMultiValue(const uint8_t* src, size_t& offset)
+{
+    uint32_t numElements;
+    memcpy(&numElements, &src[offset], sizeof(uint32_t));  // leading 4-byte element count
+    offset += 4;
+
+    std::vector<T> result;
+    result.reserve(numElements);
+    for (size_t i = 0; i < numElements; ++i) {
+        const T value = readBamValue<T>(src, offset);  // also advances offset by sizeof(T)
+        result.push_back(value);
+    }
+    return result;
+}
+
+}  // anonymous namespace
+
+TagCollection BamTagCodec::Decode(const std::vector<uint8_t>& data)
+{
+    TagCollection tags;
+
+    // NOTE: not completely safe - no real bounds-checking yet on input data
+
+    const uint8_t* pData = data.data();
+    const size_t numBytes = data.size();
+    size_t i = 0;
+    while (i < numBytes) {
+
+        std::string tagName;
+        tagName.reserve(2);
+        tagName.append(1, pData[i++]);
+        tagName.append(1, pData[i++]);
+
+        const auto tagType = static_cast<char>(pData[i++]);
+        switch (tagType) {
+            case 'A':
+            case 'a': {
+                tags[tagName] = readBamValue<uint8_t>(pData, i);
+                tags[tagName].Modifier(TagModifier::ASCII_CHAR);
+                break;
+            }
+
+            case 'c':
+                tags[tagName] = readBamValue<int8_t>(pData, i);
+                break;
+            case 'C':
+                tags[tagName] = readBamValue<uint8_t>(pData, i);
+                break;
+            case 's':
+                tags[tagName] = readBamValue<int16_t>(pData, i);
+                break;
+            case 'S':
+                tags[tagName] = readBamValue<uint16_t>(pData, i);
+                break;
+            case 'i':
+                tags[tagName] = readBamValue<int32_t>(pData, i);
+                break;
+            case 'I':
+                tags[tagName] = readBamValue<uint32_t>(pData, i);
+                break;
+            case 'f':
+                tags[tagName] = readBamValue<float>(pData, i);
+                break;
+
+            case 'Z':
+            case 'H': {
+                const size_t dataLength = strlen(reinterpret_cast<const char*>(&pData[i]));
+                std::string value(reinterpret_cast<const char*>(&pData[i]), dataLength);
+                tags[tagName] = value;
+                if (tagType == 'H') tags[tagName].Modifier(TagModifier::HEX_STRING);
+                i += dataLength + 1;
+                break;
+            }
+
+            case 'B': {
+                const char subTagType = pData[i++];
+                switch (subTagType) {
+                    case 'c':
+                        tags[tagName] = readBamMultiValue<int8_t>(pData, i);
+                        break;
+                    case 'C':
+                        tags[tagName] = readBamMultiValue<uint8_t>(pData, i);
+                        break;
+                    case 's':
+                        tags[tagName] = readBamMultiValue<int16_t>(pData, i);
+                        break;
+                    case 'S':
+                        tags[tagName] = readBamMultiValue<uint16_t>(pData, i);
+                        break;
+                    case 'i':
+                        tags[tagName] = readBamMultiValue<int32_t>(pData, i);
+                        break;
+                    case 'I':
+                        tags[tagName] = readBamMultiValue<uint32_t>(pData, i);
+                        break;
+                    case 'f':
+                        tags[tagName] = readBamMultiValue<float>(pData, i);
+                        break;
+
+                    // unknown subTagType
+                    default:
+                        throw std::runtime_error{
+                            "BamTagCodec: unsupported array-tag-type encountered: " +
+                            std::string{1, subTagType}};
+                }
+                break;
+            }
+
+            // unknown tagType
+            default:
+                throw std::runtime_error{"BamTagCodec: unsupported tag-type encountered: " +
+                                         std::string{1, tagType}};
+        }
+    }
+
+    return tags;
+}
+
+std::vector<uint8_t> BamTagCodec::Encode(const TagCollection& tags)
+{
+    kstring_t str = {0, 0, nullptr};
+
+    for (const auto& tagIter : tags) {
+
+        const auto& name = tagIter.first;
+        if (name.size() != 2)
+            throw std::runtime_error{"BamTagCodec: tag name (" + name +
+                                     ") must have 2 characters only"};
+
+        const auto& tag = tagIter.second;
+        if (tag.IsNull()) continue;
+
+        // "<TAG>:"
+        kputsn_(name.c_str(), 2, &str);
+
+        // "<TYPE>:<DATA>" for printable, ASCII char
+        if (tag.HasModifier(TagModifier::ASCII_CHAR)) {
+            const char c = tag.ToAscii();
+            if (c != '\0') {
+                kputc_('A', &str);
+                kputc_(c, &str);
+                continue;
+            }
+        }
+
+        // "<TYPE>:<DATA>" for all other data
+        switch (tag.Type()) {
+            case TagDataType::INT8: {
+                kputc_('c', &str);
+                appendBamValue(tag.ToInt8(), &str);
+                break;
+            }
+            case TagDataType::UINT8: {
+                kputc_('C', &str);
+                appendBamValue(tag.ToUInt8(), &str);
+                break;
+            }
+            case TagDataType::INT16: {
+                kputc_('s', &str);
+                appendBamValue(tag.ToInt16(), &str);
+                break;
+            }
+            case TagDataType::UINT16: {
+                kputc_('S', &str);
+                appendBamValue(tag.ToUInt16(), &str);
+                break;
+            }
+            case TagDataType::INT32: {
+                kputc_('i', &str);
+                appendBamValue(tag.ToInt32(), &str);
+                break;
+            }
+            case TagDataType::UINT32: {
+                kputc_('I', &str);
+                appendBamValue(tag.ToUInt32(), &str);
+                break;
+            }
+            case TagDataType::FLOAT: {
+                kputc_('f', &str);
+                appendBamValue(tag.ToFloat(), &str);
+                break;
+            }
+
+            case TagDataType::STRING: {
+                if (tag.HasModifier(TagModifier::HEX_STRING))
+                    kputc_('H', &str);
+                else
+                    kputc_('Z', &str);
+                const auto s = tag.ToString();
+                kputsn_(s.c_str(), s.size() + 1, &str);  // this adds the null-term
+                break;
+            }
+
+            case TagDataType::INT8_ARRAY: {
+                kputc_('B', &str);
+                kputc_('c', &str);
+                appendBamMultiValue(tag.ToInt8Array(), &str);
+                break;
+            }
+            case TagDataType::UINT8_ARRAY: {
+                kputc_('B', &str);
+                kputc_('C', &str);
+                appendBamMultiValue(tag.ToUInt8Array(), &str);
+                break;
+            }
+            case TagDataType::INT16_ARRAY: {
+                kputc_('B', &str);
+                kputc_('s', &str);
+                appendBamMultiValue(tag.ToInt16Array(), &str);
+                break;
+            }
+            case TagDataType::UINT16_ARRAY: {
+                kputc_('B', &str);
+                kputc_('S', &str);
+                appendBamMultiValue(tag.ToUInt16Array(), &str);
+                break;
+            }
+            case TagDataType::INT32_ARRAY: {
+                kputc_('B', &str);
+                kputc_('i', &str);
+                appendBamMultiValue(tag.ToInt32Array(), &str);
+                break;
+            }
+            case TagDataType::UINT32_ARRAY: {
+                kputc_('B', &str);
+                kputc_('I', &str);
+                appendBamMultiValue(tag.ToUInt32Array(), &str);
+                break;
+            }
+            case TagDataType::FLOAT_ARRAY: {
+                kputc_('B', &str);
+                kputc_('f', &str);
+                appendBamMultiValue(tag.ToFloatArray(), &str);
+                break;
+            }
+
+            // unsupported tag type
+            default: {
+                free(str.s);
+                throw std::runtime_error{"BamTagCodec: unsupported tag-type encountered: " +
+                                         std::to_string(static_cast<uint16_t>(tag.Type()))};
+            }
+        }
+    }
+
+    std::vector<uint8_t> result;
+    result.resize(str.l);
+    memcpy(reinterpret_cast<char*>(result.data()), str.s, str.l);
+    free(str.s);
+    return result;
+}
+
+Tag BamTagCodec::FromRawData(uint8_t* rawData)
+{
+    size_t offset = 0;
+    const auto tagType = static_cast<char>(*rawData++);
+    switch (tagType) {
+        case 'A':
+        case 'a': {
+            Tag t{readBamValue<uint8_t>(rawData, offset)};
+            t.Modifier(TagModifier::ASCII_CHAR);
+            return t;
+        }
+
+        case 'c':
+            return {readBamValue<int8_t>(rawData, offset)};
+        case 'C':
+            return {readBamValue<uint8_t>(rawData, offset)};
+        case 's':
+            return {readBamValue<int16_t>(rawData, offset)};
+        case 'S':
+            return {readBamValue<uint16_t>(rawData, offset)};
+        case 'i':
+            return {readBamValue<int32_t>(rawData, offset)};
+        case 'I':
+            return {readBamValue<uint32_t>(rawData, offset)};
+        case 'f':
+            return {readBamValue<float>(rawData, offset)};
+
+        case 'Z':
+        case 'H': {
+            const size_t dataLength = strlen(reinterpret_cast<const char*>(&rawData[0]));
+            std::string value(reinterpret_cast<const char*>(&rawData[0]), dataLength);
+            Tag t{value};
+            if (tagType == 'H') t.Modifier(TagModifier::HEX_STRING);
+            return t;
+        }
+
+        case 'B': {
+            const char subTagType = *rawData++;
+            switch (subTagType) {
+
+                case 'c':
+                    return {readBamMultiValue<int8_t>(rawData, offset)};
+                case 'C':
+                    return {readBamMultiValue<uint8_t>(rawData, offset)};
+                case 's':
+                    return {readBamMultiValue<int16_t>(rawData, offset)};
+                case 'S':
+                    return {readBamMultiValue<uint16_t>(rawData, offset)};
+                case 'i':
+                    return {readBamMultiValue<int32_t>(rawData, offset)};
+                case 'I':
+                    return {readBamMultiValue<uint32_t>(rawData, offset)};
+                case 'f':
+                    return {readBamMultiValue<float>(rawData, offset)};
+
+                // unknown subTagType
+                default:
+                    throw std::runtime_error{
+                        "BamTagCodec: unsupported array-tag-type encountered: " +
+                        std::string{1, subTagType}};
+            }
+            break;
+        }
+
+        // unknown tagType
+        default:
+            throw std::runtime_error{"BamTagCodec: unsupported tag-type encountered: " +
+                                     std::string{1, tagType}};
+    }
+    return Tag();  // to avoid compiler warning
+}
+
+std::vector<uint8_t> BamTagCodec::ToRawData(const Tag& tag, const TagModifier& additionalModifier)
+{
+    // temp raw data destination (for use with htslib methods)
+    kstring_t str = {0, 0, nullptr};
+
+    // "<TYPE>:<DATA>" for printable, ASCII char
+    if (tag.HasModifier(TagModifier::ASCII_CHAR) || additionalModifier == TagModifier::ASCII_CHAR) {
+        const char c = tag.ToAscii();
+        if (c != '\0') kputc_(c, &str);
+    }
+
+    // for all others
+    else {
+        switch (tag.Type()) {
+
+            // single, numeric values
+            case TagDataType::INT8:
+                appendBamValue(tag.ToInt8(), &str);
+                break;
+            case TagDataType::UINT8:
+                appendBamValue(tag.ToUInt8(), &str);
+                break;
+            case TagDataType::INT16:
+                appendBamValue(tag.ToInt16(), &str);
+                break;
+            case TagDataType::UINT16:
+                appendBamValue(tag.ToUInt16(), &str);
+                break;
+            case TagDataType::INT32:
+                appendBamValue(tag.ToInt32(), &str);
+                break;
+            case TagDataType::UINT32:
+                appendBamValue(tag.ToUInt32(), &str);
+                break;
+            case TagDataType::FLOAT:
+                appendBamValue(tag.ToFloat(), &str);
+                break;
+
+            // string & hex-string values
+            case TagDataType::STRING: {
+                const auto s = tag.ToString();
+                kputsn_(s.c_str(), s.size() + 1, &str);  // this adds the null-term
+                break;
+            }
+
+            // array-type values
+            case TagDataType::INT8_ARRAY: {
+                kputc_('c', &str);
+                appendBamMultiValue(tag.ToInt8Array(), &str);
+                break;
+            }
+            case TagDataType::UINT8_ARRAY: {
+                kputc_('C', &str);
+                appendBamMultiValue(tag.ToUInt8Array(), &str);
+                break;
+            }
+            case TagDataType::INT16_ARRAY: {
+                kputc_('s', &str);
+                appendBamMultiValue(tag.ToInt16Array(), &str);
+                break;
+            }
+            case TagDataType::UINT16_ARRAY: {
+                kputc_('S', &str);
+                appendBamMultiValue(tag.ToUInt16Array(), &str);
+                break;
+            }
+            case TagDataType::INT32_ARRAY: {
+                kputc_('i', &str);
+                appendBamMultiValue(tag.ToInt32Array(), &str);
+                break;
+            }
+            case TagDataType::UINT32_ARRAY: {
+                kputc_('I', &str);
+                appendBamMultiValue(tag.ToUInt32Array(), &str);
+                break;
+            }
+            case TagDataType::FLOAT_ARRAY: {
+                kputc_('f', &str);
+                appendBamMultiValue(tag.ToFloatArray(), &str);
+                break;
+            }
+
+            // unsupported tag type
+            default: {
+                free(str.s);
+                throw std::runtime_error{"BamTagCodec: unsupported tag-type encountered: " +
+                                         std::to_string(static_cast<uint16_t>(tag.Type()))};
+            }
+        }
+    }
+
+    // store temp contents in actual destination
+    std::vector<uint8_t> result;
+    result.resize(str.l);
+    memcpy(reinterpret_cast<char*>(&result[0]), str.s, str.l);
+    free(str.s);
+    return result;
+}
+
+uint8_t BamTagCodec::TagTypeCode(const Tag& tag, const TagModifier& additionalModifier)
+{
+    if (tag.HasModifier(TagModifier::ASCII_CHAR) || additionalModifier == TagModifier::ASCII_CHAR) {
+        int64_t value = 0;
+        switch (tag.Type()) {
+            case TagDataType::INT8:
+                value = static_cast<int64_t>(tag.ToInt8());
+                break;
+            case TagDataType::UINT8:
+                value = static_cast<int64_t>(tag.ToUInt8());
+                break;
+            case TagDataType::INT16:
+                value = static_cast<int64_t>(tag.ToInt16());
+                break;
+            case TagDataType::UINT16:
+                value = static_cast<int64_t>(tag.ToUInt16());
+                break;
+            case TagDataType::INT32:
+                value = static_cast<int64_t>(tag.ToInt32());
+                break;
+            case TagDataType::UINT32:
+                value = static_cast<int64_t>(tag.ToUInt32());
+                break;
+            default:
+                // non integers not allowed
+                throw std::runtime_error{
+                    "BamTagCodec: tag-type not convertible to ASCII, tag-type: " +
+                    std::to_string(static_cast<uint16_t>(tag.Type()))};
+        }
+
+        // ensure value is in valid ASCII char range
+        if (value < 33 || value > 126)
+            throw std::runtime_error{"BamTagCodec: invalid integer value for ASCII char, value: " +
+                                     std::to_string(value)};
+
+        return static_cast<uint8_t>('A');
+    }
+
+    switch (tag.Type()) {
+        case TagDataType::INT8:
+            return static_cast<uint8_t>('c');
+        case TagDataType::UINT8:
+            return static_cast<uint8_t>('C');
+        case TagDataType::INT16:
+            return static_cast<uint8_t>('s');
+        case TagDataType::UINT16:
+            return static_cast<uint8_t>('S');
+        case TagDataType::INT32:
+            return static_cast<uint8_t>('i');
+        case TagDataType::UINT32:
+            return static_cast<uint8_t>('I');
+        case TagDataType::FLOAT:
+            return static_cast<uint8_t>('f');
+
+        case TagDataType::STRING: {
+            if (tag.HasModifier(TagModifier::HEX_STRING) ||
+                additionalModifier == TagModifier::HEX_STRING)
+                return static_cast<uint8_t>('H');
+            return static_cast<uint8_t>('Z');
+        }
+
+        case TagDataType::INT8_ARRAY:    // fall through
+        case TagDataType::UINT8_ARRAY:   // .
+        case TagDataType::INT16_ARRAY:   // .
+        case TagDataType::UINT16_ARRAY:  // .
+        case TagDataType::INT32_ARRAY:   // .
+        case TagDataType::UINT32_ARRAY:  // .
+        case TagDataType::FLOAT_ARRAY:
+            return static_cast<uint8_t>('B');
+
+        default:
+            throw std::runtime_error{"BamTagCodec: unsupported tag-type encountered: " +
+                                     std::to_string(static_cast<uint16_t>(tag.Type()))};
+    }
+    return 0;  // to avoid compiler warning
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/BamWriter.cpp b/src/BamWriter.cpp

new file mode 100644 (file)

index 0000000..3422e14
--- /dev/null
+++ b/src/BamWriter.cpp
@@ -0,0 +1,166 @@
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/BamWriter.h"
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <iostream>
+#include <thread>
+#include <type_traits>
+
+#include <htslib/bgzf.h>
+#include <htslib/hfile.h>
+#include <htslib/hts.h>
+
+#include "Autovalidate.h"
+#include "FileProducer.h"
+#include "MemoryUtils.h"
+#include "pbbam/BamFile.h"
+#include "pbbam/Unused.h"
+#include "pbbam/Validator.h"
+
+namespace PacBio {
+namespace BAM {
+
+static_assert(!std::is_copy_constructible<BamWriter>::value,
+              "BamWriter(const BamWriter&) is not = delete");
+static_assert(!std::is_copy_assignable<BamWriter>::value,
+              "BamWriter& operator=(const BamWriter&) is not = delete");
+
+class BamWriter::BamWriterPrivate
+{
+public:
+    BamWriterPrivate(const std::string& filename, const std::shared_ptr<bam_hdr_t> rawHeader,
+                     const BamWriter::CompressionLevel compressionLevel, const size_t numThreads,
+                     const BamWriter::BinCalculationMode binCalculationMode, const bool useTempFile)
+        : calculateBins_{binCalculationMode == BamWriter::BinCalculation_ON}, header_{rawHeader}
+    {
+        if (!header_) throw std::runtime_error{"BamWriter: null header provided for: " + filename};
+
+        if (useTempFile) fileProducer_ = std::make_unique<FileProducer>(filename);
+
+        // open file
+        const auto usingFilename = (fileProducer_ ? fileProducer_->TempFilename() : filename);
+        const auto mode = std::string("wb") + std::to_string(static_cast<int>(compressionLevel));
+        file_.reset(sam_open(usingFilename.c_str(), mode.c_str()));
+        if (!file_)
+            throw std::runtime_error{"BamWriter: could not open BAM file for writing: " +
+                                     usingFilename};
+
+        // if no explicit thread count given, attempt built-in check
+        size_t actualNumThreads = numThreads;
+        if (actualNumThreads == 0) {
+            actualNumThreads = std::thread::hardware_concurrency();
+
+            // if still unknown, default to single-threaded
+            if (actualNumThreads == 0) actualNumThreads = 1;
+        }
+
+        // if multithreading requested, enable it
+        if (actualNumThreads > 1) hts_set_threads(file_.get(), actualNumThreads);
+
+        // write header
+        const auto ret = sam_hdr_write(file_.get(), header_.get());
+        if (ret != 0)
+            throw std::runtime_error{"BamWriter: could not write header for file: " +
+                                     usingFilename};
+    }
+
+    void Write(const BamRecord& record)
+    {
+#if PBBAM_AUTOVALIDATE
+        Validator::Validate(record);
+#endif
+
+        const auto rawRecord = BamRecordMemory::GetRawData(record);
+
+        // (probably) store bins
+        // min_shift=14 & n_lvls=5 are BAM "magic numbers"
+        if (calculateBins_)
+            rawRecord->core.bin =
+                hts_reg2bin(rawRecord->core.pos, bam_endpos(rawRecord.get()), 14, 5);
+
+        // write record to file
+        const auto ret = sam_write1(file_.get(), header_.get(), rawRecord.get());
+        if (ret <= 0) throw std::runtime_error{"BamWriter: could not write record to file"};
+    }
+
+    void Write(const BamRecord& record, int64_t* vOffset)
+    {
+        BGZF* bgzf = file_.get()->fp.bgzf;
+        assert(bgzf);
+        assert(vOffset);
+
+        // ensure offsets up-to-date
+        const auto ret = bgzf_flush(bgzf);
+        UNUSED(ret);
+
+        // capture virtual offset where we’re about to write
+        const auto rawTell = htell(bgzf->fp);
+        const auto length = bgzf->block_offset;
+        *vOffset = (rawTell << 16) | length;
+
+        // now write data
+        Write(record);
+    }
+
+    void Write(const BamRecordImpl& recordImpl) { Write(BamRecord(recordImpl)); }
+
+    bool calculateBins_;
+    std::unique_ptr<samFile, HtslibFileDeleter> file_;
+    std::shared_ptr<bam_hdr_t> header_;
+    std::unique_ptr<FileProducer> fileProducer_;
+};
+
+BamWriter::BamWriter(const std::string& filename, const BamHeader& header,
+                     const BamWriter::CompressionLevel compressionLevel, const size_t numThreads,
+                     const BinCalculationMode binCalculationMode, const bool useTempFile)
+    : IRecordWriter()
+{
+#if PBBAM_AUTOVALIDATE
+    Validator::Validate(header);
+#endif
+    d_ = std::make_unique<BamWriterPrivate>(filename, BamHeaderMemory::MakeRawHeader(header),
+                                            compressionLevel, numThreads, binCalculationMode,
+                                            useTempFile);
+}
+
+BamWriter::BamWriter(const std::string& filename, const BamHeader& header,
+                     const BamWriter::Config& config)
+    : BamWriter{filename,
+                header,
+                config.compressionLevel,
+                config.numThreads,
+                config.binCalculationMode,
+                config.useTempFile}
+{
+}
+
+BamWriter::BamWriter(BamWriter&&) noexcept = default;
+
+BamWriter& BamWriter::operator=(BamWriter&&) noexcept = default;
+
+BamWriter::~BamWriter()
+{
+    const auto ret = d_ ? bgzf_flush(d_->file_.get()->fp.bgzf) : 0;  // d_ is null once moved-from
+    UNUSED(ret);
+}
+
+void BamWriter::TryFlush()
+{
+    if (!d_ || !d_->file_) throw std::runtime_error{"BamWriter: no open file to flush"};
+    const auto ret = bgzf_flush(d_->file_.get()->fp.bgzf);
+    if (ret != 0) throw std::runtime_error{"BamWriter: could not flush output buffer contents"};
+}
+
+void BamWriter::Write(const BamRecord& record) { d_->Write(record); }
+
+void BamWriter::Write(const BamRecord& record, int64_t* vOffset) { d_->Write(record, vOffset); }
+
+void BamWriter::Write(const BamRecordImpl& recordImpl) { d_->Write(recordImpl); }
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/BarcodeQuery.cpp b/src/BarcodeQuery.cpp

new file mode 100644 (file)

index 0000000..ae95d1f
--- /dev/null
+++ b/src/BarcodeQuery.cpp
@@ -0,0 +1,40 @@
+// File Description
+/// \file BarcodeQuery.cpp
+/// \brief Implements the BarcodeQuery class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/BarcodeQuery.h"
+
+#include <cstdint>
+
+#include "pbbam/CompositeBamReader.h"
+#include "pbbam/PbiFilterTypes.h"
+
+namespace PacBio {
+namespace BAM {
+
+class BarcodeQuery::BarcodeQueryPrivate
+{
+public:
+    BarcodeQueryPrivate(const int16_t barcode, const DataSet& dataset)
+        : reader_{PbiBarcodeFilter{barcode}, dataset}
+    {
+    }
+
+    PbiFilterCompositeBamReader<Compare::None> reader_;  // unsorted
+};
+
+BarcodeQuery::BarcodeQuery(const int16_t barcode, const DataSet& dataset)
+    : internal::IQuery(), d_{std::make_unique<BarcodeQueryPrivate>(barcode, dataset)}
+{
+}
+
+BarcodeQuery::~BarcodeQuery() = default;
+
+bool BarcodeQuery::GetNext(BamRecord& r) { return d_->reader_.GetNext(r); }
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/BgzipFastaWriter.cpp b/src/BgzipFastaWriter.cpp

new file mode 100644 (file)

index 0000000..a9df895
--- /dev/null
+++ b/src/BgzipFastaWriter.cpp
@@ -0,0 +1,53 @@
+// File Description
+/// \file BgzipFastaWriter.cpp
+/// \brief Implements the BgzipFastaWriter class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/BgzipFastaWriter.h"
+
+#include <stdexcept>
+
+#include "pbbam/BamRecord.h"
+#include "pbbam/FastqSequence.h"
+#include "pbbam/FormatUtils.h"
+
+namespace PacBio {
+namespace BAM {
+
+BgzipFastaWriter::BgzipFastaWriter(const std::string& fn) : IFastaWriter{}, writer_{fn}
+{
+    if (!FormatUtils::IsFastaFilename(fn)) {
+        throw std::runtime_error{"BgzipFastaWriter: filename '" + fn +
+                                 "' is not recognized as a FASTA file."};
+    }
+}
+
+BgzipFastaWriter::BgzipFastaWriter(const std::string& fn, const BgzipWriterConfig& config)
+    : IFastaWriter{}, writer_{fn, config}
+{
+    if (!FormatUtils::IsFastaFilename(fn)) {
+        throw std::runtime_error{"BgzipFastaWriter: filename '" + fn +
+                                 "' is not recognized as a FASTA file."};
+    }
+}
+
+void BgzipFastaWriter::TryFlush() {}
+
+void BgzipFastaWriter::Write(const BamRecordImpl& bam) { Write(bam.Name(), bam.Sequence()); }
+
+void BgzipFastaWriter::Write(const FastaSequence& fastq) { Write(fastq.Name(), fastq.Bases()); }
+
+void BgzipFastaWriter::Write(const BamRecord& bam) { Write(bam.FullName(), bam.Sequence()); }
+
+void BgzipFastaWriter::Write(const std::string& name, const std::string& bases)
+{
+    // TODO: wrap bases
+    std::string out{">" + name + '\n' + bases + '\n'};
+    writer_.Write(out);
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/BgzipFastqWriter.cpp b/src/BgzipFastqWriter.cpp

new file mode 100644 (file)

index 0000000..41744e5
--- /dev/null
+++ b/src/BgzipFastqWriter.cpp
@@ -0,0 +1,73 @@
+// File Description
+/// \file BgzipFastqWriter.cpp
+/// \brief Implements the BgzipFastqWriter class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/BgzipFastqWriter.h"
+
+#include <stdexcept>
+
+#include "pbbam/BamRecord.h"
+#include "pbbam/FastqSequence.h"
+#include "pbbam/FormatUtils.h"
+
+namespace PacBio {
+namespace BAM {
+
+BgzipFastqWriter::BgzipFastqWriter(const std::string& fn) : IFastqWriter{}, writer_{fn}
+{
+    if (!FormatUtils::IsFastqFilename(fn)) {
+        throw std::runtime_error{"BgzipFastqWriter: filename '" + fn +
+                                 "' is not recognized as a FASTQ file."};
+    }
+}
+
+BgzipFastqWriter::BgzipFastqWriter(const std::string& fn, const BgzipWriterConfig& config)
+    : IFastqWriter{}, writer_{fn, config}
+{
+    if (!FormatUtils::IsFastqFilename(fn)) {
+        throw std::runtime_error{"BgzipFastqWriter: filename '" + fn +
+                                 "' is not recognized as a FASTQ file."};
+    }
+}
+
+// No-op: the body is empty, so calling this has no effect on the output.
+// NOTE(review): presumably exists to satisfy the IFastqWriter interface - confirm.
+void BgzipFastqWriter::TryFlush() {}
+
+// Writes a FASTQ entry built from the sequence's Name(), Bases() and Qualities().
+void BgzipFastqWriter::Write(const FastqSequence& fastq)
+{
+    Write(fastq.Name(), fastq.Bases(), fastq.Qualities());
+}
+
+// Writes a FASTQ entry built from the record's FullName(), Sequence() and Qualities().
+void BgzipFastqWriter::Write(const BamRecord& bam)
+{
+    Write(bam.FullName(), bam.Sequence(), bam.Qualities());
+}
+
+// Writes a FASTQ entry built from the record's Name(), Sequence() and Qualities().
+void BgzipFastqWriter::Write(const BamRecordImpl& bam)
+{
+    Write(bam.Name(), bam.Sequence(), bam.Qualities());
+}
+
+// Converts \p quals to its FASTQ string form via Fastq() and forwards to the
+// string-based overload.
+void BgzipFastqWriter::Write(const std::string& name, const std::string& bases,
+                             const Data::QualityValues& quals)
+{
+    Write(name, bases, quals.Fastq());
+}
+
+void BgzipFastqWriter::Write(const std::string& name, const std::string& bases,
+                             const std::string& quals)
+{
+    std::string out{"@" + name + '\n' + bases + "\n+\n"};
+    if (!quals.empty())
+        out += quals;
+    else
+        out += std::string(bases.size(), '!');
+    out.push_back('\n');
+    writer_.Write(out);
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/BgzipWriter.cpp b/src/BgzipWriter.cpp

new file mode 100644 (file)

index 0000000..4429777
--- /dev/null
+++ b/src/BgzipWriter.cpp
@@ -0,0 +1,95 @@
+// File Description
+/// \file BgzipWriter.cpp
+/// \brief Implements the BgzipWriter class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/BgzipWriter.h"
+
+#include <cassert>
+#include <stdexcept>
+#include <thread>
+#include <type_traits>
+
+#include <htslib/bgzf.h>
+#include <htslib/hts.h>
+
+#include "FileProducer.h"
+#include "MemoryUtils.h"
+
+namespace PacBio {
+namespace BAM {
+
+// Compile-time guards: the build fails if BgzipWriter ever becomes
+// copy-constructible or copy-assignable.
+static_assert(!std::is_copy_constructible<BgzipWriter>::value,
+              "BgzipWriter(const BgzipWriter&) is not = delete");
+static_assert(!std::is_copy_assignable<BgzipWriter>::value,
+              "BgzipWriter& operator=(const BgzipWriter&) is not = delete");
+
+class BgzipWriter::BgzipWriterPrivate
+{
+public:
+    BgzipWriterPrivate(std::string filename,
+                       const BgzipWriterConfig& config = BgzipWriterConfig{})  // : filename_
+    {
+        if (config.UseTempFile) fileProducer_ = std::make_unique<FileProducer>(filename);
+
+        // open file
+        usingFilename_ = (fileProducer_ ? fileProducer_->TempFilename() : filename);
+        const auto mode =
+            std::string("wb") + std::to_string(static_cast<int>(config.CompressionLevel));
+        bgzf_.reset(bgzf_open(usingFilename_.c_str(), mode.c_str()));
+        if (!bgzf_) {
+            throw std::runtime_error{"BgzipWriter: could not open file for writing: " +
+                                     usingFilename_};
+        }
+
+        // if no explicit thread count given, attempt built-in check
+        size_t actualNumThreads = config.NumThreads;
+        if (actualNumThreads == 0) {
+            actualNumThreads = std::thread::hardware_concurrency();
+
+            // if still unknown, default to single-threaded
+            if (actualNumThreads == 0) actualNumThreads = 1;
+        }
+
+        // if multithreading requested, enable it
+        if (actualNumThreads > 1) bgzf_mt(bgzf_.get(), actualNumThreads, 256);
+    }
+
+    size_t Write(const void* data, size_t numBytes)
+    {
+        const int written = bgzf_write(bgzf_.get(), data, numBytes);
+        if (written < 0)
+            throw std::runtime_error{"BgzipWriter: error writing to " + usingFilename_};
+        return static_cast<size_t>(written);
+    }
+
+    std::string usingFilename_;
+    std::unique_ptr<FileProducer> fileProducer_;
+    std::unique_ptr<BGZF, HtslibBgzfDeleter> bgzf_;
+};
+
+// Creates the private implementation for \p filename, relying on its
+// default-constructed BgzipWriterConfig.
+BgzipWriter::BgzipWriter(std::string filename)
+    : d_{std::make_unique<BgzipWriterPrivate>(std::move(filename))}
+{
+}
+
+// Creates the private implementation for \p filename with an explicit \p config.
+BgzipWriter::BgzipWriter(std::string filename, const BgzipWriterConfig& config)
+    : d_{std::make_unique<BgzipWriterPrivate>(std::move(filename), config)}
+{
+}
+
+// Move operations and destructor are compiler-generated. They are defaulted
+// here in the .cpp, where BgzipWriterPrivate is a complete type.
+BgzipWriter::BgzipWriter(BgzipWriter&&) noexcept = default;
+
+BgzipWriter& BgzipWriter::operator=(BgzipWriter&&) noexcept = default;
+
+BgzipWriter::~BgzipWriter() = default;
+
+// Writes \p numBytes bytes from \p data via the private implementation and
+// returns the byte count it reports.
+size_t BgzipWriter::Write(const void* data, size_t numBytes) { return d_->Write(data, numBytes); }
+
+// Convenience overload: writes the full contents of \p data (data.size() bytes).
+size_t BgzipWriter::Write(const std::string& data) { return d_->Write(data.c_str(), data.size()); }
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/ChemistryTable.cpp b/src/ChemistryTable.cpp

new file mode 100644 (file)

index 0000000..bebb5d5
--- /dev/null
+++ b/src/ChemistryTable.cpp
@@ -0,0 +1,179 @@
+// Author: Lance Hepler
+
+#include "PbbamInternalConfig.h"
+
+#include "ChemistryTable.h"
+
+#include <cstdlib>
+#include <fstream>
+#include <map>
+
+#include "FileUtils.h"
+#include "pbbam/exception/BundleChemistryMappingException.h"
+#include "pugixml/pugixml.hpp"
+
+namespace PacBio {
+namespace BAM {
+namespace {
+
+ChemistryTable ChemistryTableFromXml(const std::string& mappingXml)
+{
+    if (!FileUtils::Exists(mappingXml))
+        throw BundleChemistryMappingException{
+            mappingXml, "SMRT_CHEMISTRY_BUNDLE_DIR defined but file not found"};
+
+    std::ifstream in(mappingXml);
+    pugi::xml_document doc;
+    const pugi::xml_parse_result loadResult = doc.load(in);
+    if (loadResult.status != pugi::status_ok)
+        throw BundleChemistryMappingException{
+            mappingXml, "unparseable XML, error code:" + std::to_string(loadResult.status)};
+
+    // parse top-level attributes
+    pugi::xml_node rootNode = doc.document_element();
+    if (rootNode == pugi::xml_node())
+        throw BundleChemistryMappingException{mappingXml, "could not fetch XML root node"};
+
+    if (std::string(rootNode.name()) != "MappingTable")
+        throw BundleChemistryMappingException{mappingXml, "MappingTable not found"};
+
+    ChemistryTable table;
+    try {
+        for (const auto& childNode : rootNode) {
+            const std::string childName = childNode.name();
+            if (childName != "Mapping") continue;
+            table.push_back({childNode.child("BindingKit").child_value(),
+                             childNode.child("SequencingKit").child_value(),
+                             childNode.child("SoftwareVersion").child_value(),
+                             childNode.child("SequencingChemistry").child_value()});
+        }
+    } catch (std::exception& e) {
+        const std::string msg = std::string{"Mapping entries unparseable - "} + e.what();
+        throw BundleChemistryMappingException{mappingXml, msg};
+    }
+    return table;
+}
+
+}  // namespace
+
+// Returns the compiled-in chemistry lookup table.
+//
+// The table is a function-local static, so it is built on the first call and
+// the same object is returned by reference on every call. Each row lists
+// BindingKit, SequencingKit, BasecallerVersion and Chemistry; some rows carry
+// an additional trailing value (e.g. "TAGT-415").
+// NOTE(review): the meaning of that trailing value is not documented here - confirm.
+const ChemistryTable& BuiltInChemistryTable()
+{
+    // clang-format off
+    static ChemistryTable builtin{
+
+        // BindingKit, SequencingKit, BasecallerVersion, Chemistry
+
+        // 3.0 ("Dromedary"): S/P1-C1/beta
+        {{"100-619-300", "100-620-000", "3.0", "S/P1-C1/beta"}},
+        {{"100-619-300", "100-620-000", "3.1", "S/P1-C1/beta"}},
+
+        // 3.1 ("Echidna"): S/P1-C1.1
+        {{"100-619-300", "100-867-300", "3.1", "S/P1-C1.1"}},
+        {{"100-619-300", "100-867-300", "3.2", "S/P1-C1.1"}},
+        {{"100-619-300", "100-867-300", "3.3", "S/P1-C1.1"}},
+
+        // 3.1.1 ("Flea"): S/P1-C1.2
+        {{"100-619-300", "100-902-100", "3.1", "S/P1-C1.2"}},
+        {{"100-619-300", "100-902-100", "3.2", "S/P1-C1.2"}},
+        {{"100-619-300", "100-902-100", "3.3", "S/P1-C1.2"}},
+        {{"100-619-300", "100-902-100", "4.0", "S/P1-C1.2"}},
+        {{"100-619-300", "100-902-100", "4.1", "S/P1-C1.2"}},
+
+        // 3.2 ("Goat"): S/P1-C1.3
+        {{"100-619-300", "100-972-200", "3.2", "S/P1-C1.3"}},
+        {{"100-619-300", "100-972-200", "3.3", "S/P1-C1.3"}},
+        {{"100-619-300", "100-972-200", "4.0", "S/P1-C1.3"}},
+        {{"100-619-300", "100-972-200", "4.1", "S/P1-C1.3"}},
+
+        // 4.0 ("Seabiscuit"); S/P2-C2
+        {{"100-862-200", "100-861-800", "4.0", "S/P2-C2"}},
+        {{"100-862-200", "100-861-800", "4.1", "S/P2-C2"}},
+        {{"100-862-200", "101-093-700", "4.1", "S/P2-C2"}},
+
+        // 5.0 ("Iguana"); S/P2-C2
+        {{"100-862-200", "100-861-800", "5.0", "S/P2-C2/5.0"}},
+        {{"100-862-200", "101-093-700", "5.0", "S/P2-C2/5.0"}},
+
+        // 5.0.1 ChemRel ("Sequel® Sequencing Plate Silwet"); S/P2-C2
+        {{"100-862-200", "101-309-500", "5.0", "S/P2-C2/5.0"}},
+        // 5.0.1 ChemRel ("Sequel® Sequencing Plate Silwet (4 rxn)"); S/P2-C2
+        {{"100-862-200", "101-309-400", "5.0", "S/P2-C2/5.0"}},
+
+        // --- SG1/16509P/PA5.0 ---
+        // 2.1 binding kit/5.1PA support with ..
+        // 5.0 ("Iguana"); S/P2-C2
+        {{"101-365-900", "100-861-800", "5.0", "S/P2-C2/5.0"}},
+        {{"101-365-900", "101-093-700", "5.0", "S/P2-C2/5.0"}},
+
+        // 5.0.1 ChemRel; Sequel® Binding Kit 2.1; S/P2-C2
+        {{"101-365-900", "101-309-500", "5.0", "S/P2-C2/5.0"}}, // Sequel® Sequencing Plate 2.1 Silwet (8 rxn)
+        {{"101-365-900", "101-309-400", "5.0", "S/P2-C2/5.0"}}, // Sequel® Sequencing Plate 2.1 Silwet (4 rxn)
+
+        // 5.0.1 ChemRel; Sequel® Binding Kit 3.0; S/P3-C3
+        {{"101-500-400", "101-427-500", "5.0", "S/P3-C3/5.0", "TAGT-415"}}, // Sequel® Sequencing Plate 3.0 (8 rxn)
+        {{"101-500-400", "101-427-800", "5.0", "S/P3-C3/5.0", "TAGT-415"}}, // Sequel® Sequencing Plate 3.0 (4 rxn)
+
+        // 5.0.1 ChemRel; Sequel® Dev Binding Kit; S/P2-C2
+        {{"101-490-800", "101-490-900", "5.0", "S/P3-C1/5.0-8M", "TAGT-416"}}, // Sequel II® Sequencing Plate (4 rxn)
+        {{"101-490-800", "101-491-000", "5.0", "S/P3-C1/5.0-8M", "TAGT-416"}}, // Sequel II® Sequencing Plate (8 rxn)
+
+        // 5.0.1 ChemRel; Sequel® Sequencing Plate 3.1 for Dynamic Loading placeholder (4 rxn)
+        {{"101-500-400", "101-646-800", "5.0", "S/P3-C3/5.0", "TAGT-415"}}, // Sequel® Sequencing Plate 3.1 for Dynamic Loading placeholder
+
+        // 5.0.1 ChemRel; Sequel® Dev Sequencing Plate Dyn Loading (4 rxn)
+        {{"101-490-800", "101-644-500", "5.0", "S/P3-C1/5.0-8M", "TAGT-418"}}, // Sequel® Dev Sequencing Plate Dyn Loading
+
+        // 5.0.1 ChemRel; Sequel® Sequencing Plate Dyn Loading (4 rxn)
+        {{"101-490-800", "101-717-100", "5.0", "S/P3-C1/5.0-8M", "TAGT-418"}}, // Sequel® Dev Sequencing Plate Dyn Loading
+
+        // 5.0.1 ChemRel; Sequel® Dev Sequencing Plate Dyn Loading (4 rxn)
+        {{"101-717-300", "101-644-500", "5.0", "S/P3-C1/5.0-8M", "TAGT-418"}}, // Sequel® Dev Sequencing Plate Dyn Loading
+        // 5.0.1 ChemRel; Sequel® Sequencing Plate Dyn Loading (4 rxn)
+        {{"101-717-300", "101-717-100", "5.0", "S/P3-C1/5.0-8M", "TAGT-418"}}, // Sequel® Dev Sequencing Plate Dyn Loading
+
+        // 5.0.1 ChemRel; Sequel® Dev Sequencing Plate Dyn Loading (4 rxn)
+        {{"101-717-400", "101-644-500", "5.0", "S/P3-C1/5.0-8M", "TAGT-418"}}, // Sequel® Dev Sequencing Plate Dyn Loading
+        // 5.0.1 ChemRel; Sequel® Sequencing Plate Dyn Loading (4 rxn)
+        {{"101-717-400", "101-717-100", "5.0", "S/P3-C1/5.0-8M", "TAGT-418"}}, // Sequel® Dev Sequencing Plate Dyn Loading
+
+        // Sequel® II Binding Kit 2.0; Sequel® II Sequencing Plate 2.0EA (4 Rxn)
+        {{"101-789-500", "101-789-300", "5.0", "S/P4-C2/5.0-8M", "TAGT-419"}},
+        // Sequel® II Binding Kit 2.0; Sequel® II Sequencing Plate 2.0 (4 Rxn)
+        {{"101-789-500", "101-826-100", "5.0", "S/P4-C2/5.0-8M", "TAGT-420"}},
+        // Sequel® II Binding Kit 2.0; Sequel® II Sequencing Plate 2.0 (4 Rxn) - QC
+        {{"101-789-500", "101-820-300", "5.0", "S/P4-C2/5.0-8M", "TAGT-420"}},
+
+        // Sequel® II Binding Kit 2.1; Sequel® II Sequencing Plate 2.0EA (4 Rxn)
+        {{"101-820-500", "101-789-300", "5.0", "S/P4.1-C2/5.0-8M", "TAGT-419"}},
+        // Sequel® II Binding Kit 2.1; Sequel® II Sequencing Plate 2.0 (4 Rxn)
+        {{"101-820-500", "101-826-100", "5.0", "S/P4.1-C2/5.0-8M", "TAGT-420"}},
+        // Sequel® II Binding Kit 2.1; Sequel® II Sequencing Plate 2.0 (4 Rxn) - QC
+        {{"101-820-500", "101-820-300", "5.0", "S/P4.1-C2/5.0-8M", "TAGT-420"}}
+    };
+    // clang-format on
+
+    return builtin;
+}
+
+const ChemistryTable& GetChemistryTableFromEnv()
+{
+    static const ChemistryTable empty{};
+    static std::map<std::string, ChemistryTable> tableCache;
+
+    std::string chemPath;
+    const char* pth = getenv("SMRT_CHEMISTRY_BUNDLE_DIR");
+    if (pth != nullptr && pth[0] != '\0')
+        chemPath = pth;
+    else
+        return empty;
+
+    auto it = tableCache.find(chemPath);
+    if (it != tableCache.end()) return it->second;
+
+    auto tbl = ChemistryTableFromXml(chemPath + "/chemistry.xml");
+    it = tableCache.emplace(std::move(chemPath), std::move(tbl)).first;
+    return it->second;
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/ChemistryTable.h b/src/ChemistryTable.h

new file mode 100644 (file)

index 0000000..18258af
--- /dev/null
+++ b/src/ChemistryTable.h
@@ -0,0 +1,24 @@
+// Author: Lance Hepler
+
+#ifndef CHEMISTRYTABLE_H
+#define CHEMISTRYTABLE_H
+
+#include "pbbam/Config.h"
+
+#include <array>
+#include <string>
+#include <vector>
+
+namespace PacBio {
+namespace BAM {
+
+// A chemistry lookup table: each row is a fixed-size array of five strings.
+using ChemistryTable = std::vector<std::array<std::string, 5>>;
+
+// Returns a reference to the table compiled into the library.
+const ChemistryTable& BuiltInChemistryTable();
+
+// Returns a reference to the table selected via the process environment.
+const ChemistryTable& GetChemistryTableFromEnv();
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // CHEMISTRYTABLE_H
diff --git a/src/Compare.cpp b/src/Compare.cpp

new file mode 100644 (file)

index 0000000..f9149bc
--- /dev/null
+++ b/src/Compare.cpp
@@ -0,0 +1,110 @@
+// File Description
+/// \file Compare.cpp
+/// \brief Implements the Compare class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/Compare.h"
+
+#include <cstddef>
+#include <functional>
+#include <unordered_map>
+
+namespace PacBio {
+namespace BAM {
+namespace {
+
+// Bundles the three textual spellings kept for one comparison type: its
+// qualified name, its symbolic operator, and its alphabetic operator.
+struct TypeAlias
+{
+    std::string name_;
+    std::string op_;
+    std::string opAlpha_;
+
+    // Every argument is optional and defaults to an empty string.
+    TypeAlias(std::string name = {}, std::string op = {}, std::string opAlpha = {})
+        : name_{std::move(name)}, op_{std::move(op)}, opAlpha_{std::move(opAlpha)}
+    {
+    }
+};
+
+// Hash functor that lets Compare::Type be used as an unordered_map key: the
+// value is cast to int and hashed with std::hash<int>.
+struct CompareTypeHash
+{
+    size_t operator()(const Compare::Type& t) const
+    {
+        return std::hash<int>()(static_cast<int>(t));
+    }
+};
+
+// clang-format off
+// Maps every accepted operator spelling to its Compare::Type. Several
+// spellings may map to the same type.
+// NOTE(review): the "&lt;" / "&gt;" entries look like XML-escaped forms of
+// '<' and '>' - confirm.
+static const std::unordered_map<std::string, Compare::Type> opToTypeMap =
+{
+    // basic operators plus some permissiveness for other representations
+    { "==",     Compare::EQUAL },
+    { "=",      Compare::EQUAL },
+    { "eq",     Compare::EQUAL },
+    { "in",     Compare::EQUAL },
+    { "!=",     Compare::NOT_EQUAL },
+    { "ne",     Compare::NOT_EQUAL },
+    { "not_in", Compare::NOT_EQUAL },
+    { "<",      Compare::LESS_THAN },
+    { "lt",     Compare::LESS_THAN },
+    { "&lt;",   Compare::LESS_THAN },
+    { "<=",     Compare::LESS_THAN_EQUAL },
+    { "lte",    Compare::LESS_THAN_EQUAL },
+    { "&lt;=",  Compare::LESS_THAN_EQUAL },
+    { ">",      Compare::GREATER_THAN },
+    { "gt",     Compare::GREATER_THAN },
+    { "&gt;",   Compare::GREATER_THAN },
+    { ">=",     Compare::GREATER_THAN_EQUAL },
+    { "gte",    Compare::GREATER_THAN_EQUAL },
+    { "&gt;=",  Compare::GREATER_THAN_EQUAL },
+    { "&",      Compare::CONTAINS },
+    { "~",      Compare::NOT_CONTAINS }
+};
+
+// Reverse lookup: for each Compare::Type, its qualified name, its symbolic
+// operator, and its alphabetic operator (in that order).
+static const std::unordered_map<Compare::Type, TypeAlias, CompareTypeHash> typeAliases =
+{
+    { Compare::EQUAL,              TypeAlias{ "Compare::EQUAL",              "==", "eq" } },
+    { Compare::NOT_EQUAL,          TypeAlias{ "Compare::NOT_EQUAL",          "!=", "ne" } },
+    { Compare::LESS_THAN,          TypeAlias{ "Compare::LESS_THAN",          "<",  "lt"  } },
+    { Compare::LESS_THAN_EQUAL,    TypeAlias{ "Compare::LESS_THAN_EQUAL",    "<=", "lte" } },
+    { Compare::GREATER_THAN,       TypeAlias{ "Compare::GREATER_THAN",       ">",  "gt"  } },
+    { Compare::GREATER_THAN_EQUAL, TypeAlias{ "Compare::GREATER_THAN_EQUAL", ">=", "gte" } },
+    { Compare::CONTAINS,           TypeAlias{ "Compare::CONTAINS",           "&",  "and" } },
+    { Compare::NOT_CONTAINS,       TypeAlias{ "Compare::NOT_CONTAINS",       "~",  "not" } }
+};
+// clang-format on
+
+}  // anonymous
+
+// Translates an operator spelling into its Compare::Type.
+// Throws std::runtime_error when \p opString is not a known spelling.
+Compare::Type Compare::TypeFromOperator(const std::string& opString)
+{
+    const auto found = opToTypeMap.find(opString);
+    if (found == opToTypeMap.cend())
+        throw std::runtime_error{"Compare: " + opString + " is not a valid comparison operator."};
+    return found->second;
+}
+
+// Returns the qualified name registered for \p type (e.g. "Compare::EQUAL").
+// Throws std::runtime_error when \p type has no registered alias.
+std::string Compare::TypeToName(const Compare::Type& type)
+{
+    const auto found = typeAliases.find(type);
+    if (found == typeAliases.cend())
+        throw std::runtime_error{"Compare: invalid comparison type encountered"};
+    return found->second.name_;
+}
+
+// Returns the operator spelling registered for \p type: the alphabetic form
+// (e.g. "eq") when \p asAlpha is true, otherwise the symbolic form (e.g. "==").
+// Throws std::runtime_error when \p type has no registered alias.
+std::string Compare::TypeToOperator(const Compare::Type& type, bool asAlpha)
+{
+    const auto found = typeAliases.find(type);
+    if (found == typeAliases.cend())
+        throw std::runtime_error{"Compare: invalid comparison type encountered"};
+
+    const TypeAlias& alias = found->second;
+    return asAlpha ? alias.opAlpha_ : alias.op_;
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/CompositeFastaReader.cpp b/src/CompositeFastaReader.cpp

new file mode 100644 (file)

index 0000000..a53daaf
--- /dev/null
+++ b/src/CompositeFastaReader.cpp
@@ -0,0 +1,42 @@
+// File Description
+/// \file CompositeFastaReader.cpp
+/// \brief Implements the CompositeFastaReader class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/CompositeFastaReader.h"
+
+namespace PacBio {
+namespace BAM {
+
+// Creates one FastaReader per entry of \p fastaFiles, queued in the given order.
+CompositeFastaReader::CompositeFastaReader(const std::vector<std::string>& fastaFiles)
+{
+    for (const auto& fn : fastaFiles)
+        readers_.emplace_back(std::make_unique<FastaReader>(fn));
+}
+
+// Delegates to the filename-list constructor using the dataset's FastaFiles().
+CompositeFastaReader::CompositeFastaReader(const DataSet& dataset)
+    : CompositeFastaReader{dataset.FastaFiles()}
+{
+}
+
+// Reads the next sequence across all queued readers.
+//
+// The reader at the front of the queue is asked first; once it reports no
+// more data it is discarded and the following reader is tried. Returns false
+// only after every reader has been used up.
+bool CompositeFastaReader::GetNext(FastaSequence& seq)
+{
+    for (; !readers_.empty(); readers_.pop_front()) {
+        if (readers_.front()->GetNext(seq)) return true;
+        // front reader is exhausted; the loop step drops it
+    }
+
+    // nothing left to read from
+    return false;
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/Config.cpp b/src/Config.cpp

new file mode 100644 (file)

index 0000000..2fd4820
--- /dev/null
+++ b/src/Config.cpp
@@ -0,0 +1,58 @@
+// File Description
+/// \file Config.cpp
+/// \brief Initializes global variable defaults.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/Config.h"
+
+#include <stdexcept>
+#include <string>
+#include <tuple>
+
+#include <htslib/hts.h>
+#include <pbcopper/data/CigarOperation.h>
+
+#include "pbbam/StringUtilities.h"
+
+namespace PacBio {
+namespace BAM {
+
+// Global htslib verbosity setting. The initial value of -1 means "default":
+// unless client code overrides it, we will set this to HTS_LOG_OFF. This
+// keeps htslib from polluting stdout/stderr on its own.
+//
+int HtslibVerbosity = -1;
+
+bool DoesHtslibSupportLongCigar()
+{
+    const std::string htsVersion = hts_version();
+
+    // remove any "-<blah>" for non-release versions
+    const auto versionBase = PacBio::BAM::Split(htsVersion, '-');
+    if (versionBase.empty())
+        throw std::runtime_error{"invalid htslib version format: " + htsVersion};
+
+    // grab major/minor version numbers
+    const auto versionParts = PacBio::BAM::Split(versionBase[0], '.');
+    if (versionParts.size() < 2)
+        throw std::runtime_error{"invalid htslib version format: " + htsVersion};
+
+    // check against v1.7
+    const int versionMajor = std::stoi(versionParts[0]);
+    const int versionMinor = std::stoi(versionParts[1]);
+    static constexpr const int v17_major = 1;
+    static constexpr const int v17_minor = 7;
+    return std::tie(versionMajor, versionMinor) >= std::tie(v17_major, v17_minor);
+}
+
+// When PBBAM_PERMISSIVE_CIGAR is defined, CIGAR auto-validation is switched
+// off as a side effect of initializing this file-scope constant: the lambda
+// runs once during static initialization and the bool only stores its result.
+#ifdef PBBAM_PERMISSIVE_CIGAR
+static const bool PermissiveCigar = []() {
+    Data::CigarOperation::DisableAutoValidation();
+    return true;
+}();
+#endif  // PBBAM_PERMISSIVE_CIGAR
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/DataSet.cpp b/src/DataSet.cpp

new file mode 100644 (file)

index 0000000..e6f4f89
--- /dev/null
+++ b/src/DataSet.cpp
@@ -0,0 +1,620 @@
+// File Description
+/// \file DataSet.cpp
+/// \brief Implements the DataSet class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/DataSet.h"
+
+#include <algorithm>
+#include <map>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include <boost/algorithm/string.hpp>
+#include <boost/icl/interval_set.hpp>
+#include <boost/lexical_cast.hpp>
+#include <boost/optional.hpp>
+
+#include "DataSetIO.h"
+#include "DataSetUtils.h"
+#include "FileUtils.h"
+#include "TimeUtils.h"
+
+#include "pbbam/internal/DataSetBaseTypes.h"
+
+namespace PacBio {
+namespace BAM {
+namespace {
+
+// File-local version string constant.
+// NOTE(review): presumably the default DataSet version - confirm where it is used.
+const std::string defaultVersion{"4.0.0"};
+
+void GetAllFiles(const ExternalResources& resources, std::vector<std::string>* result)
+{
+    for (const auto& resource : resources) {
+
+        // store this resource's path
+        result->push_back(resource.ResourceId());
+
+        // store any child indices
+        for (const auto& idx : resource.FileIndices())
+            result->push_back(idx.ResourceId());
+
+        // recurse into any other child resources
+        GetAllFiles(resource.ExternalResources(), result);
+    }
+}
+
+}  // namespace
+
+using internal::DataSetElement;
+
+// Default constructor: delegates to the type-based constructor with GENERIC.
+DataSet::DataSet() : DataSet(DataSet::GENERIC) {}
+
+// Creates an empty dataset of the requested \p type by instantiating the
+// matching dataset class, then sets its path to the current working directory.
+// Throws std::runtime_error for a type value not handled by the switch.
+DataSet::DataSet(const DataSet::TypeEnum type)
+{
+    switch (type) {
+        case DataSet::GENERIC:
+            d_ = std::make_unique<DataSetBase>();
+            break;
+        case DataSet::ALIGNMENT:
+            d_ = std::make_unique<AlignmentSet>();
+            break;
+        case DataSet::BARCODE:
+            d_ = std::make_unique<BarcodeSet>();
+            break;
+        case DataSet::CONSENSUS_ALIGNMENT:
+            d_ = std::make_unique<ConsensusAlignmentSet>();
+            break;
+        case DataSet::CONSENSUS_READ:
+            d_ = std::make_unique<ConsensusReadSet>();
+            break;
+        case DataSet::CONTIG:
+            d_ = std::make_unique<ContigSet>();
+            break;
+        case DataSet::HDF_SUBREAD:
+            d_ = std::make_unique<HdfSubreadSet>();
+            break;
+        case DataSet::REFERENCE:
+            d_ = std::make_unique<ReferenceSet>();
+            break;
+        case DataSet::SUBREAD:
+            d_ = std::make_unique<SubreadSet>();
+            break;
+        case DataSet::TRANSCRIPT:
+            d_ = std::make_unique<TranscriptSet>();
+            break;
+        case DataSet::TRANSCRIPT_ALIGNMENT:
+            d_ = std::make_unique<TranscriptAlignmentSet>();
+            break;
+        default:
+            throw std::runtime_error{"DataSet: unsupported type"};
+    }
+
+    // no source file on disk, so use the working directory as the path
+    d_->Path(FileUtils::CurrentWorkingDirectory());
+}
+
+// Builds a dataset from the BAM file's filename via DataSetIO::FromUri and
+// sets the dataset path to the current working directory.
+DataSet::DataSet(const BamFile& bamFile) : d_(DataSetIO::FromUri(bamFile.Filename()))
+{
+    d_->Path(FileUtils::CurrentWorkingDirectory());
+}
+
+DataSet::DataSet(const std::string& filename) : d_(DataSetIO::FromUri(filename))
+{
+    // for FOFN contents and raw BAM filenames, we can just use the current
+    // directory as the starting path.
+    //
+    // (any relative paths in the FOFN have already been resolved)
+    //
+    if (boost::algorithm::iends_with(filename, ".fofn") ||
+        boost::algorithm::iends_with(filename, ".bam") ||
+        boost::algorithm::iends_with(filename, ".fasta") ||
+        boost::algorithm::iends_with(filename, ".fa")) {
+        d_->Path(FileUtils::CurrentWorkingDirectory());
+    }
+
+    else {
+        if (boost::algorithm::iends_with(filename, ".xml")) d_->FromInputXml(true);
+        d_->Path(FileUtils::DirectoryName(filename));
+    }
+}
+
+// Builds a dataset from several filenames via DataSetIO::FromUris and sets
+// the dataset path to the current working directory.
+DataSet::DataSet(const std::vector<std::string>& filenames) : d_(DataSetIO::FromUris(filenames))
+{
+    d_->Path(FileUtils::CurrentWorkingDirectory());
+}
+
+// Copy constructor. The copy is made by writing \p other out as XML and
+// parsing that XML back into a new object. Path and the from-input-XML flag
+// are then copied across explicitly from \p other.
+DataSet::DataSet(const DataSet& other)
+{
+    const bool otherFromXml = other.d_->FromInputXml();
+    std::ostringstream out;
+    DataSetIO::ToStream(other.d_, out);
+    const std::string xml = out.str();
+    d_ = DataSetIO::FromXmlString(xml);
+    d_->Path(other.d_->Path());
+    d_->FromInputXml(otherFromXml);
+}
+
+// Copy assignment: builds a temporary copy of \p other and assigns that
+// temporary to *this; self-assignment is a no-op.
+// NOTE(review): this relies on a move-assignment operator being declared for
+// DataSet (not visible here); without one the inner assignment would resolve
+// back to this function - confirm.
+DataSet& DataSet::operator=(const DataSet& other)
+{
+    if (this != &other) *this = DataSet{other};
+    return *this;
+}
+
+// Merges \p other into this dataset by applying operator+= to the two
+// underlying objects held by d_. Returns *this.
+DataSet& DataSet::operator+=(const DataSet& other)
+{
+    *d_.get() += *other.d_.get();
+    return *this;
+}
+
+// Returns every file referenced by this dataset's external resources
+// (resources, their indices, and nested resources), each passed through
+// ResolvePath().
+std::vector<std::string> DataSet::AllFiles() const
+{
+    // gather the raw resource ids
+    std::vector<std::string> files;
+    GetAllFiles(ExternalResources(), &files);
+
+    // resolve each one in place
+    for (auto& path : files)
+        path = ResolvePath(path);
+    return files;
+}
+
+// Read-only access to the attribute called \p name on the underlying dataset.
+const std::string& DataSet::Attribute(const std::string& name) const { return d_->Attribute(name); }
+
+// Mutable access to the attribute called \p name on the underlying dataset.
+std::string& DataSet::Attribute(const std::string& name) { return d_->Attribute(name); }
+
+// Sets attribute \p name to \p value; returns *this so calls can be chained.
+DataSet& DataSet::Attribute(const std::string& name, const std::string& value)
+{
+    d_->Attribute(name, value);
+    return *this;
+}
+
+// Returns one BamFile per entry of BamFilenames(), constructed from that
+// filename and kept in the same order.
+std::vector<BamFile> DataSet::BamFiles() const
+{
+    const std::vector<std::string> filenames = BamFilenames();
+
+    std::vector<BamFile> bamFiles;
+    bamFiles.reserve(filenames.size());
+    for (const auto& filename : filenames)
+        bamFiles.emplace_back(filename);
+    return bamFiles;
+}
+
+std::vector<std::string> DataSet::BamFilenames() const
+{
+    std::vector<std::string> result;
+    const PacBio::BAM::ExternalResources& resources = ExternalResources();
+
+    result.reserve(resources.Size());
+    for (const ExternalResource& ext : resources) {
+
+        // only bother resolving file path if this is a BAM file
+        boost::iterator_range<std::string::const_iterator> bamFound =
+            boost::algorithm::ifind_first(ext.MetaType(), "bam");
+        if (!bamFound.empty()) {
+            const std::string fn = ResolvePath(ext.ResourceId());
+            result.emplace_back(fn);
+        }
+    }
+    return result;
+}
+
+// Read-only access to the underlying dataset's CreatedAt value.
+const std::string& DataSet::CreatedAt() const { return d_->CreatedAt(); }
+
+// Mutable access to the underlying dataset's CreatedAt value.
+std::string& DataSet::CreatedAt() { return d_->CreatedAt(); }
+
+// Sets CreatedAt to \p createdAt; returns *this so calls can be chained.
+DataSet& DataSet::CreatedAt(const std::string& createdAt)
+{
+    d_->CreatedAt(createdAt);
+    return *this;
+}
+
+// Read-only access to the underlying dataset's Extensions element.
+const PacBio::BAM::Extensions& DataSet::Extensions() const { return d_->Extensions(); }
+
+// Mutable access to the underlying dataset's Extensions element.
+PacBio::BAM::Extensions& DataSet::Extensions() { return d_->Extensions(); }
+
+// Replaces the Extensions element with \p extensions; returns *this so calls
+// can be chained.
+DataSet& DataSet::Extensions(const PacBio::BAM::Extensions& extensions)
+{
+    d_->Extensions(extensions);
+    return *this;
+}
+
+// Read-only access to the underlying dataset's ExternalResources element.
+const PacBio::BAM::ExternalResources& DataSet::ExternalResources() const
+{
+    return d_->ExternalResources();
+}
+
+// Mutable access to the underlying dataset's ExternalResources element.
+PacBio::BAM::ExternalResources& DataSet::ExternalResources() { return d_->ExternalResources(); }
+
+// Replaces the ExternalResources element with \p resources; returns *this so
+// calls can be chained.
+DataSet& DataSet::ExternalResources(const PacBio::BAM::ExternalResources& resources)
+{
+    d_->ExternalResources(resources);
+    return *this;
+}
+
+// Returns the resolved paths of the top-level external resources whose
+// MetaType contains "fasta" (case-insensitive).
+std::vector<std::string> DataSet::FastaFiles() const
+{
+    const PacBio::BAM::ExternalResources& resources = ExternalResources();
+
+    std::vector<std::string> result;
+    result.reserve(resources.Size());
+    for (const ExternalResource& ext : resources) {
+
+        // only bother resolving file path if this is a FASTA file
+        boost::iterator_range<std::string::const_iterator> fastaFound =
+            boost::algorithm::ifind_first(ext.MetaType(), "fasta");
+        if (!fastaFound.empty()) {
+            const std::string fn = ResolvePath(ext.ResourceId());
+            result.push_back(fn);
+        }
+    }
+    return result;
+}
+
+// Read-only access to the underlying dataset's Filters element.
+const PacBio::BAM::Filters& DataSet::Filters() const { return d_->Filters(); }
+
+// Mutable access to the underlying dataset's Filters element.
+PacBio::BAM::Filters& DataSet::Filters() { return d_->Filters(); }
+
+// Replaces the Filters element with \p filters; returns *this so calls can be
+// chained.
+DataSet& DataSet::Filters(const PacBio::BAM::Filters& filters)
+{
+    d_->Filters(filters);
+    return *this;
+}
+
+// Read-only access to the underlying dataset's Format value.
+const std::string& DataSet::Format() const { return d_->Format(); }
+
+// Mutable access to the underlying dataset's Format value.
+std::string& DataSet::Format() { return d_->Format(); }
+
+// Sets Format to \p format; returns *this so calls can be chained.
+DataSet& DataSet::Format(const std::string& format)
+{
+    d_->Format(format);
+    return *this;
+}
+
+// Builds a DataSet from XML text held in memory.
+//
+// \p xml is the XML document itself (it is handed to
+// DataSetIO::FromXmlString), not a filename, so there is no directory to
+// derive from it. The dataset path is therefore set to the current working
+// directory, as the other constructors without a source file on disk do.
+// The result is flagged as originating from input XML.
+DataSet DataSet::FromXml(const std::string& xml)
+{
+    DataSet result;
+    result.d_ = DataSetIO::FromXmlString(xml);
+    result.d_->Path(FileUtils::CurrentWorkingDirectory());
+    result.d_->FromInputXml(true);
+    return result;
+}
+
+std::vector<GenomicInterval> DataSet::GenomicIntervals() const
+{
+    // need to gather the contig lengths
+    std::map<std::string, int32_t> contigLengths;
+    for (const BamFile& b : BamFiles()) {
+        const BamHeader& header = b.Header();
+        const int32_t numContigs = header.NumSequences();
+        for (int32_t i = 0; i < numContigs; ++i) {
+            const std::string refName = header.SequenceName(i);
+            const int32_t refLength = boost::lexical_cast<int32_t>(header.SequenceLength(i));
+
+            const auto it = contigLengths.find(refName);
+            if (it == contigLengths.cend())
+                contigLengths.emplace(refName, refLength);
+            else if (it->second != refLength) {
+                throw std::runtime_error{
+                    "DataSet: " + refName + " occurs twice with different lengths ('" +
+                    std::to_string(it->second) + "' and '" + std::to_string(refLength) + "')"};
+            }
+        }
+    }
+
+    // with the lengths of all contigs known, we can build
+    // the minimal interval set induced by the filters
+    using intT = boost::icl::interval_set<int32_t>;
+    using intInterval = intT::interval_type;
+
+    std::map<std::string, intT> contigIntervals;
+    int32_t numFilters = 0;
+
+    for (const auto& xmlFilter : Filters()) {
+        ++numFilters;
+        boost::optional<std::string> contigName;
+
+        intT intersectedInterval{intInterval{0, std::numeric_limits<int32_t>::max()}};
+
+        for (const auto& xmlProperty : xmlFilter.Properties()) {
+            const std::string XmlName = xmlProperty.Name();
+            const std::string XmlOperator = xmlProperty.Operator();
+            const std::string XmlValue = xmlProperty.Value();
+
+            if ("rname" == XmlName) {
+                if ("=" == XmlOperator) {
+                    contigName = XmlValue;
+
+                    const auto it = contigLengths.find(XmlValue);
+                    if (it == contigLengths.cend())
+                        throw std::runtime_error{"DataSet: Could not find contig '" + XmlValue +
+                                                 "' in BAM files"};
+                    else
+                        intersectedInterval &= intInterval(0, it->second);
+                } else
+                    throw std::runtime_error{
+                        "DatSet: '" + XmlOperator +
+                        "' is an unrecognized property operator, only '=' is recognized"};
+            } else if ("tstart" == XmlName) {
+                if ((XmlOperator != "<") && (XmlOperator != "<="))
+                    throw std::runtime_error{
+                        "DataSet: tstart only supports '<' and '<=' operators"};
+
+                const int32_t end = boost::lexical_cast<int32_t>(XmlValue) + ("<=" == XmlOperator);
+                intersectedInterval &= intInterval(0, end);
+            } else if ("tend" == XmlName) {
+                if ((XmlOperator != ">") && (XmlOperator != ">="))
+                    throw std::runtime_error{"DataSet: tend only supports '>' and '>=' operators"};
+
+                const int32_t start =
+                    boost::lexical_cast<int32_t>(XmlValue) - (">=" == XmlOperator);
+                intersectedInterval &= intInterval(start, std::numeric_limits<int32_t>::max());
+            } else
+                throw std::runtime_error{"DataSet: '" + XmlName +
+                                         "' is an unrecognized filter property name"};
+        }
+
+        if (contigName)
+            contigIntervals[contigName.value()] |= intersectedInterval;
+        else
+            throw std::runtime_error{
+                "DataSet: current filter does not have a valid 'rname' attribute"};
+    }
+
+    // extract all GenomicIntervals
+    std::vector<GenomicInterval> result;
+    if (numFilters) {
+        // have some filters, only return regions passing filters
+        for (const auto& contigs : contigIntervals) {
+            const std::string& contigName = contigs.first;
+            for (const auto& i : contigs.second) {
+                // don't append empty intervals to the result
+                if (boost::icl::length(i)) result.emplace_back(contigName, i.lower(), i.upper());
+            }
+        }
+    } else {
+        // no filters, return complete list of intervals
+        for (const auto& contigs : contigLengths)
+            result.emplace_back(contigs.first, 0, contigs.second);
+    }
+
+    return result;
+}
+
+const PacBio::BAM::DataSetMetadata& DataSet::Metadata() const { return d_->Metadata(); }
+
+PacBio::BAM::DataSetMetadata& DataSet::Metadata() { return d_->Metadata(); }
+
+DataSet& DataSet::Metadata(const PacBio::BAM::DataSetMetadata& metadata)
+{
+    d_->Metadata(metadata);
+    return *this;
+}
+
+const std::string& DataSet::MetaType() const { return d_->MetaType(); }
+
+std::string& DataSet::MetaType() { return d_->MetaType(); }
+
+DataSet& DataSet::MetaType(const std::string& metatype)
+{
+    d_->MetaType(metatype);
+    return *this;
+}
+
+const std::string& DataSet::ModifiedAt() const { return d_->ModifiedAt(); }
+
+std::string& DataSet::ModifiedAt() { return d_->ModifiedAt(); }
+
+DataSet& DataSet::ModifiedAt(const std::string& modifiedAt)
+{
+    d_->ModifiedAt(modifiedAt);
+    return *this;
+}
+
+const std::string& DataSet::Name() const { return d_->Name(); }
+
+std::string& DataSet::Name() { return d_->Name(); }
+
+DataSet& DataSet::Name(const std::string& name)
+{
+    d_->Name(name);
+    return *this;
+}
+
+const NamespaceRegistry& DataSet::Namespaces() const { return d_->Namespaces(); }
+
+NamespaceRegistry& DataSet::Namespaces() { return d_->Namespaces(); }
+
+// Maps a dataset element name (e.g. "SubreadSet") to its DataSet::TypeEnum value.
+//
+// Throws std::out_of_range (from unordered_map::at) if typeName is not one of the
+// names listed below.
+DataSet::TypeEnum DataSet::NameToType(const std::string& typeName)
+{
+    // The table is const and built by its own initializer: initialization of a
+    // function-local static is thread-safe (C++11), unlike lazily filling a mutable
+    // map on the first call, which races if two threads get here at the same time.
+    static const std::unordered_map<std::string, DataSet::TypeEnum> lookup{
+        {"DataSet", DataSet::GENERIC},
+        {"AlignmentSet", DataSet::ALIGNMENT},
+        {"BarcodeSet", DataSet::BARCODE},
+        {"ConsensusAlignmentSet", DataSet::CONSENSUS_ALIGNMENT},
+        {"ConsensusReadSet", DataSet::CONSENSUS_READ},
+        {"ContigSet", DataSet::CONTIG},
+        {"HdfSubreadSet", DataSet::HDF_SUBREAD},
+        {"ReferenceSet", DataSet::REFERENCE},
+        {"SubreadSet", DataSet::SUBREAD},
+        {"TranscriptSet", DataSet::TRANSCRIPT},
+        {"TranscriptAlignmentSet", DataSet::TRANSCRIPT_ALIGNMENT},
+    };
+    return lookup.at(typeName);  // throws if unknown typename
+}
+
+const std::string& DataSet::Path() const { return d_->Path(); }
+
+// Returns one entry per external resource: its ResourceId passed through ResolvePath(),
+// in the order the resources are iterated.
+std::vector<std::string> DataSet::ResolvedResourceIds() const
+{
+    const PacBio::BAM::ExternalResources& externals = ExternalResources();
+
+    std::vector<std::string> resolved;
+    resolved.reserve(externals.Size());
+    for (const ExternalResource& resource : externals)
+        resolved.push_back(ResolvePath(resource.ResourceId()));
+    return resolved;
+}
+
+std::string DataSet::ResolvePath(const std::string& originalPath) const
+{
+    return FileUtils::ResolvedFilePath(originalPath, d_->Path());
+}
+
+const std::string& DataSet::ResourceId() const { return d_->ResourceId(); }
+
+std::string& DataSet::ResourceId() { return d_->ResourceId(); }
+
+DataSet& DataSet::ResourceId(const std::string& resourceId)
+{
+    d_->ResourceId(resourceId);
+    return *this;
+}
+
+void DataSet::Save(const std::string& outputFilename) const
+{
+    DataSetIO::ToFile(d_, outputFilename);
+}
+
+void DataSet::SaveToStream(std::ostream& out) const { DataSetIO::ToStream(d_, out); }
+
+// Collects the distinct SequencingChemistry() values of every read group found in the
+// headers of the dataset's BAM files.
+//
+// Throws std::runtime_error as soon as a BAM file that is not a PacBio BAM is encountered.
+std::set<std::string> DataSet::SequencingChemistries() const
+{
+    const std::vector<BamFile> inputs{BamFiles()};
+
+    std::set<std::string> chemistries;
+    for (const BamFile& bam : inputs) {
+        if (bam.IsPacBioBAM()) {
+            const std::vector<ReadGroupInfo> groups{bam.Header().ReadGroups()};
+            for (const ReadGroupInfo& group : groups)
+                chemistries.insert(group.SequencingChemistry());
+        } else {
+            throw std::runtime_error{
+                "DataSet: only PacBio BAMs are supported for fetching chemistry info"};
+        }
+    }
+    return chemistries;
+}
+
+const PacBio::BAM::SubDataSets& DataSet::SubDataSets() const { return d_->SubDataSets(); }
+
+PacBio::BAM::SubDataSets& DataSet::SubDataSets() { return d_->SubDataSets(); }
+
+DataSet& DataSet::SubDataSets(const PacBio::BAM::SubDataSets& subdatasets)
+{
+    d_->SubDataSets(subdatasets);
+    return *this;
+}
+
+const std::string& DataSet::Tags() const { return d_->Tags(); }
+
+std::string& DataSet::Tags() { return d_->Tags(); }
+
+DataSet& DataSet::Tags(const std::string& tags)
+{
+    d_->Tags(tags);
+    return *this;
+}
+
+const std::string& DataSet::TimeStampedName() const { return d_->TimeStampedName(); }
+
+std::string& DataSet::TimeStampedName() { return d_->TimeStampedName(); }
+
+DataSet& DataSet::TimeStampedName(const std::string& timeStampedName)
+{
+    d_->TimeStampedName(timeStampedName);
+    return *this;
+}
+
+PacBio::BAM::DataSet::TypeEnum DataSet::Type() const { return DataSet::NameToType(TypeName()); }
+
+DataSet& DataSet::Type(const DataSet::TypeEnum type)
+{
+    d_->Label(DataSet::TypeToName(type));
+    return *this;
+}
+
+std::string DataSet::TypeName() const { return d_->LocalNameLabel().to_string(); }
+
+// Returns the dataset element name (e.g. "SubreadSet") for a DataSet::TypeEnum value.
+//
+// The name/value pairs mirror the table in NameToType().
+// Throws std::runtime_error for any value that has no case below.
+std::string DataSet::TypeToName(const DataSet::TypeEnum& type)
+{
+    switch (type) {
+        case DataSet::GENERIC:
+            return "DataSet";
+        case DataSet::ALIGNMENT:
+            return "AlignmentSet";
+        case DataSet::BARCODE:
+            return "BarcodeSet";
+        case DataSet::CONSENSUS_ALIGNMENT:
+            return "ConsensusAlignmentSet";
+        case DataSet::CONSENSUS_READ:
+            return "ConsensusReadSet";
+        case DataSet::CONTIG:
+            return "ContigSet";
+        case DataSet::HDF_SUBREAD:
+            return "HdfSubreadSet";
+        case DataSet::REFERENCE:
+            return "ReferenceSet";
+        case DataSet::SUBREAD:
+            return "SubreadSet";
+        case DataSet::TRANSCRIPT:
+            return "TranscriptSet";
+        case DataSet::TRANSCRIPT_ALIGNMENT:
+            return "TranscriptAlignmentSet";
+        default:
+            // reached only for a value outside the enumerators handled above
+            throw std::runtime_error{"DataSet: unsupported dataset type"};
+    }
+}
+
+const std::string& DataSet::UniqueId() const { return d_->UniqueId(); }
+
+std::string& DataSet::UniqueId() { return d_->UniqueId(); }
+
+DataSet& DataSet::UniqueId(const std::string& uuid)
+{
+    d_->UniqueId(uuid);
+    return *this;
+}
+
+const std::string& DataSet::Version() const { return d_->Version(); }
+
+std::string& DataSet::Version() { return d_->Version(); }
+
+DataSet& DataSet::Version(const std::string& version)
+{
+    d_->Version(version);
+    return *this;
+}
+
+// Exposed timestamp utils
+
+std::string CurrentTimestamp() { return TimeUtils::ToDataSetFormat(TimeUtils::CurrentTime()); }
+
+std::string ToDataSetFormat(const std::chrono::system_clock::time_point& tp)
+{
+    return TimeUtils::ToDataSetFormat(tp);
+}
+
+std::string ToDataSetFormat(const time_t& t)
+{
+    return TimeUtils::ToDataSetFormat(std::chrono::system_clock::from_time_t(t));
+}
+
+std::string ToIso8601(const std::chrono::system_clock::time_point& tp)
+{
+    return TimeUtils::ToIso8601(tp);
+}
+
+std::string ToIso8601(const time_t& t)
+{
+    return TimeUtils::ToIso8601(std::chrono::system_clock::from_time_t(t));
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/DataSetBaseTypes.cpp b/src/DataSetBaseTypes.cpp

new file mode 100644 (file)

index 0000000..63a820b
--- /dev/null
+++ b/src/DataSetBaseTypes.cpp
@@ -0,0 +1,322 @@
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/internal/DataSetBaseTypes.h"
+
+#include <cstddef>
+
+#include <boost/algorithm/string.hpp>
+
+#include "DataSetUtils.h"
+#include "TimeUtils.h"
+#include "pbbam/DataSetTypes.h"
+#include "pbbam/Unused.h"
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// ----------------
+// BaseEntityType
+// ----------------
+
+// Constructs an element with the given label and XSD type, filling in defaults for the
+// "CreatedAt" and "Version" attributes when they are empty after base construction.
+BaseEntityType::BaseEntityType(const std::string& label, const XsdType& xsd)
+    : DataSetElement(label, xsd)
+{
+    // default CreatedAt: the current time in ISO 8601 form
+    if (CreatedAt().empty()) CreatedAt(TimeUtils::ToIso8601(TimeUtils::CurrentTime()));
+    // default Version: the library's XML_VERSION constant
+    if (Version().empty()) Version(XML_VERSION);
+}
+
+BaseEntityType::BaseEntityType(const std::string& label, const FromInputXml& fromInputXml,
+                               const XsdType& xsd)
+    : DataSetElement(label, fromInputXml, xsd)
+{
+}
+
+const std::string& BaseEntityType::CreatedAt() const { return Attribute("CreatedAt"); }
+
+std::string& BaseEntityType::CreatedAt() { return Attribute("CreatedAt"); }
+
+BaseEntityType& BaseEntityType::CreatedAt(const std::string& createdAt)
+{
+    Attribute("CreatedAt", createdAt);
+    return *this;
+}
+
+const std::string& BaseEntityType::Description() const { return Attribute("Description"); }
+
+std::string& BaseEntityType::Description() { return Attribute("Description"); }
+
+BaseEntityType& BaseEntityType::Description(const std::string& description)
+{
+    Attribute("Description", description);
+    return *this;
+}
+
+DEFINE_ACCESSORS(BaseEntityType, Extensions, Extensions)
+
+BaseEntityType& BaseEntityType::Extensions(const PacBio::BAM::Extensions& extensions)
+{
+    Extensions() = extensions;
+    return *this;
+}
+
+const std::string& BaseEntityType::Format() const { return Attribute("Format"); }
+
+std::string& BaseEntityType::Format() { return Attribute("Format"); }
+
+BaseEntityType& BaseEntityType::Format(const std::string& format)
+{
+    Attribute("Format", format);
+    return *this;
+}
+
+const std::string& BaseEntityType::ModifiedAt() const { return Attribute("ModifiedAt"); }
+
+std::string& BaseEntityType::ModifiedAt() { return Attribute("ModifiedAt"); }
+
+BaseEntityType& BaseEntityType::ModifiedAt(const std::string& modifiedAt)
+{
+    Attribute("ModifiedAt", modifiedAt);
+    return *this;
+}
+
+const std::string& BaseEntityType::Name() const { return Attribute("Name"); }
+
+std::string& BaseEntityType::Name() { return Attribute("Name"); }
+
+BaseEntityType& BaseEntityType::Name(const std::string& name)
+{
+    Attribute("Name", name);
+    return *this;
+}
+
+const std::string& BaseEntityType::ResourceId() const { return Attribute("ResourceId"); }
+
+std::string& BaseEntityType::ResourceId() { return Attribute("ResourceId"); }
+
+BaseEntityType& BaseEntityType::ResourceId(const std::string& resourceId)
+{
+    Attribute("ResourceId", resourceId);
+    return *this;
+}
+
+const std::string& BaseEntityType::Tags() const { return Attribute("Tags"); }
+
+std::string& BaseEntityType::Tags() { return Attribute("Tags"); }
+
+BaseEntityType& BaseEntityType::Tags(const std::string& tags)
+{
+    Attribute("Tags", tags);
+    return *this;
+}
+
+const std::string& BaseEntityType::Version() const { return Attribute("Version"); }
+
+std::string& BaseEntityType::Version() { return Attribute("Version"); }
+
+BaseEntityType& BaseEntityType::Version(const std::string& version)
+{
+    Attribute("Version", version);
+    return *this;
+}
+
+// ----------------
+// DataEntityType
+// ----------------
+
+DataEntityType::DataEntityType(const std::string& label, const XsdType& xsd)
+    : BaseEntityType(label, xsd)
+{
+}
+
+DataEntityType::DataEntityType(const std::string& label, const FromInputXml& fromInputXml,
+                               const XsdType& xsd)
+    : BaseEntityType(label, fromInputXml, xsd)
+{
+}
+
+const std::string& DataEntityType::Checksum() const { return ChildText("Checksum"); }
+
+std::string& DataEntityType::Checksum() { return ChildText("Checksum"); }
+
+DataEntityType& DataEntityType::Checksum(const std::string& checksum)
+{
+    ChildText("Checksum", checksum);
+    return *this;
+}
+
+const std::string& DataEntityType::EncodedValue() const { return ChildText("EncodedValue"); }
+
+std::string& DataEntityType::EncodedValue() { return ChildText("EncodedValue"); }
+
+DataEntityType& DataEntityType::EncodedValue(const std::string& encodedValue)
+{
+    ChildText("EncodedValue", encodedValue);
+    return *this;
+}
+
+const std::string& DataEntityType::MetaType() const { return Attribute("MetaType"); }
+
+std::string& DataEntityType::MetaType() { return Attribute("MetaType"); }
+
+DataEntityType& DataEntityType::MetaType(const std::string& metatype)
+{
+    Attribute("MetaType", metatype);
+    return *this;
+}
+
+const std::string& DataEntityType::SimpleValue() const { return Attribute("SimpleValue"); }
+
+std::string& DataEntityType::SimpleValue() { return Attribute("SimpleValue"); }
+
+DataEntityType& DataEntityType::SimpleValue(const std::string& simpleValue)
+{
+    Attribute("SimpleValue", simpleValue);
+    return *this;
+}
+
+const std::string& DataEntityType::TimeStampedName() const { return Attribute("TimeStampedName"); }
+
+std::string& DataEntityType::TimeStampedName() { return Attribute("TimeStampedName"); }
+
+DataEntityType& DataEntityType::TimeStampedName(const std::string& timeStampedName)
+{
+    Attribute("TimeStampedName", timeStampedName);
+    return *this;
+}
+
+const std::string& DataEntityType::UniqueId() const { return Attribute("UniqueId"); }
+
+std::string& DataEntityType::UniqueId() { return Attribute("UniqueId"); }
+
+DataEntityType& DataEntityType::UniqueId(const std::string& uuid)
+{
+    Attribute("UniqueId", uuid);
+    return *this;
+}
+
+const std::string& DataEntityType::ValueDataType() const { return Attribute("ValueDataType"); }
+
+std::string& DataEntityType::ValueDataType() { return Attribute("ValueDataType"); }
+
+DataEntityType& DataEntityType::ValueDataType(const std::string& valueDataType)
+{
+    Attribute("ValueDataType", valueDataType);
+    return *this;
+}
+
+// -----------------
+// IndexedDataType
+// -----------------
+
+IndexedDataType::IndexedDataType(const std::string& metatype, const std::string& filename,
+                                 const std::string& label, const XsdType& xsd)
+    : InputOutputDataType(metatype, filename, label, xsd)
+{
+}
+
+IndexedDataType::IndexedDataType(const std::string& metatype, const std::string& filename,
+                                 const std::string& label, const FromInputXml& fromInputXml,
+                                 const XsdType& xsd)
+    : InputOutputDataType(metatype, filename, label, fromInputXml, xsd)
+{
+}
+
+DEFINE_ACCESSORS(IndexedDataType, FileIndices, FileIndices)
+
+IndexedDataType& IndexedDataType::FileIndices(const PacBio::BAM::FileIndices& indices)
+{
+    FileIndices() = indices;
+    return *this;
+}
+
+// ---------------------
+// InputOutputDataType
+// ---------------------
+
+InputOutputDataType::InputOutputDataType(const std::string& metatype, const std::string& filename,
+                                         const std::string& label, const XsdType& xsd)
+    : StrictEntityType(metatype, label, xsd)
+{
+    ResourceId(filename);
+}
+
+InputOutputDataType::InputOutputDataType(const std::string& metatype, const std::string& filename,
+                                         const std::string& label, const FromInputXml& fromInputXml,
+                                         const XsdType& xsd)
+    : StrictEntityType(metatype, label, fromInputXml, xsd)
+{
+    ResourceId(filename);
+}
+
+// ----------------
+// StrictEntityType
+// ----------------
+
+// Constructs a new (not XML-loaded) element and populates its three required attributes:
+//   - MetaType:        the metatype string, verbatim
+//   - TimeStampedName: metatype lower-cased with '.' replaced by '_', followed by '-' and
+//                      the current time as formatted by TimeUtils::ToDataSetFormat()
+//   - UniqueId:        the result of GenerateUuid()
+StrictEntityType::StrictEntityType(const std::string& metatype, const std::string& label,
+                                   const XsdType& xsd)
+    : BaseEntityType(label, xsd)
+{
+    // MetaType
+    MetaType(metatype);
+
+    // TimeStampedName
+    std::string transformedMetatype;
+    transformedMetatype.reserve(metatype.size());
+    for (const char c : metatype) {
+        // tolower() has undefined behavior when handed a negative value (other than EOF),
+        // which a plain char can be for non-ASCII bytes - so go through unsigned char.
+        const char lowered = static_cast<char>(tolower(static_cast<unsigned char>(c)));
+        transformedMetatype.push_back((c == '.') ? '_' : lowered);
+    }
+    const std::string tsn =
+        transformedMetatype + "-" + TimeUtils::ToDataSetFormat(TimeUtils::CurrentTime());
+    TimeStampedName(tsn);
+
+    // UniqueId
+    UniqueId(GenerateUuid());
+}
+
+StrictEntityType::StrictEntityType(const std::string& /*metatype*/, const std::string& label,
+                                   const FromInputXml& fromInputXml, const XsdType& xsd)
+    : BaseEntityType(label, fromInputXml, xsd)
+{
+}
+
+const std::string& StrictEntityType::MetaType() const { return Attribute("MetaType"); }
+
+std::string& StrictEntityType::MetaType() { return Attribute("MetaType"); }
+
+StrictEntityType& StrictEntityType::MetaType(const std::string& metatype)
+{
+    Attribute("MetaType", metatype);
+    return *this;
+}
+
+const std::string& StrictEntityType::TimeStampedName() const
+{
+    return Attribute("TimeStampedName");
+}
+
+std::string& StrictEntityType::TimeStampedName() { return Attribute("TimeStampedName"); }
+
+StrictEntityType& StrictEntityType::TimeStampedName(const std::string& timeStampedName)
+{
+    Attribute("TimeStampedName", timeStampedName);
+    return *this;
+}
+
+const std::string& StrictEntityType::UniqueId() const { return Attribute("UniqueId"); }
+
+std::string& StrictEntityType::UniqueId() { return Attribute("UniqueId"); }
+
+StrictEntityType& StrictEntityType::UniqueId(const std::string& uuid)
+{
+    Attribute("UniqueId", uuid);
+    return *this;
+}
+
+}  // namespace internal
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/DataSetElement.cpp b/src/DataSetElement.cpp

new file mode 100644 (file)

index 0000000..fa8ee67
--- /dev/null
+++ b/src/DataSetElement.cpp
@@ -0,0 +1,20 @@
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/internal/DataSetElement.h"
+
+#include "DataSetUtils.h"
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+const std::string& DataSetElement::SharedNullString()
+{
+    return internal::NullObject<std::string>();
+}
+
+}  // namespace internal
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/DataSetIO.cpp b/src/DataSetIO.cpp

new file mode 100644 (file)

index 0000000..ad3d1a0
--- /dev/null
+++ b/src/DataSetIO.cpp
@@ -0,0 +1,156 @@
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "DataSetIO.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <exception>
+#include <fstream>
+#include <iostream>
+
+#include <boost/algorithm/string.hpp>
+
+#include "FileUtils.h"
+#include "FofnReader.h"
+#include "XmlReader.h"
+#include "XmlWriter.h"
+#include "pbbam/StringUtilities.h"
+
+namespace PacBio {
+namespace BAM {
+namespace {
+
+// Opens the file xmlFn and parses it with XmlReader::FromStream().
+//
+// Throws std::runtime_error if the file cannot be opened for reading.
+std::unique_ptr<DataSetBase> DataSetFromXml(const std::string& xmlFn)
+{
+    std::ifstream in(xmlFn);
+    if (!in) throw std::runtime_error{"DataSet: could not open XML file for reading: " + xmlFn};
+    return XmlReader::FromStream(in);
+}
+
+// Wraps a single BAM file in a new dataset that lists it as its external resource.
+//
+// A BAM whose header sort order is "coordinate" yields an AlignmentSet; any other sort
+// order yields a SubreadSet.
+std::unique_ptr<DataSetBase> DataSetFromBam(const std::string& bamFn)
+{
+    // peek at sort order to determine if file should be an AlignmentSet or else SubreadSet
+    const BamFile bamFile{bamFn};
+    const bool aligned = (bamFile.Header().SortOrder() == "coordinate");
+
+    std::unique_ptr<DataSetBase> dataset;
+    if (aligned)
+        dataset = std::make_unique<AlignmentSet>();
+    else
+        dataset = std::make_unique<SubreadSet>();
+
+    // reuse the BamFile built above instead of constructing a second one from the same path
+    dataset->ExternalResources().Add(ExternalResource(bamFile));
+    return dataset;
+}
+
+// Builds a ReferenceSet whose only external resource is the given FASTA filename,
+// registered under the "PacBio.ReferenceFile.ReferenceFastaFile" metatype.
+std::unique_ptr<DataSetBase> DataSetFromFasta(const std::string& fasta)
+{
+    auto referenceSet = std::make_unique<ReferenceSet>();
+    referenceSet->ExternalResources().Add(
+        ExternalResource("PacBio.ReferenceFile.ReferenceFastaFile", fasta));
+    return referenceSet;
+}
+
+// Loads a dataset from a FOFN ("file of filenames").
+//
+// Every filename read by FofnReader::Files() is passed through
+// FileUtils::ResolvedFilePath() together with the FOFN's own directory, and the resulting
+// list is handed to DataSetIO::FromUris().
+// Throws std::runtime_error if the FOFN cannot be opened for reading.
+std::unique_ptr<DataSetBase> DataSetFromFofn(const std::string& fofn)
+{
+    const auto fofnDir = FileUtils::DirectoryName(fofn);
+    std::ifstream in(fofn);
+    if (!in) throw std::runtime_error{"DataSet: could not open FOFN for reading: " + fofn};
+
+    auto filenames = FofnReader::Files(in);
+    // the lambda takes its argument by const reference: taking it by value would copy
+    // every filename once more before resolving it
+    std::transform(
+        filenames.begin(), filenames.end(), filenames.begin(),
+        [&fofnDir](const std::string& fn) { return FileUtils::ResolvedFilePath(fn, fofnDir); });
+    return DataSetIO::FromUris(filenames);
+}
+
+// Picks a loader from the (case-insensitive) filename extension:
+//   .xml -> DataSetFromXml, .bam -> DataSetFromBam, .fofn -> DataSetFromFofn,
+//   .fasta / .fa -> DataSetFromFasta.
+//
+// Throws std::runtime_error for any other extension.
+std::unique_ptr<DataSetBase> DataSetFromUri(const std::string& uri)
+{
+    // NOTE: this says URI, but we're not quite handling filenames as true URIs
+    //       basically just treating as a regular filename for now
+    namespace algo = boost::algorithm;
+
+    if (algo::iends_with(uri, ".xml")) return DataSetFromXml(uri);
+    if (algo::iends_with(uri, ".bam")) return DataSetFromBam(uri);
+    if (algo::iends_with(uri, ".fofn")) return DataSetFromFofn(uri);
+
+    const bool isFasta = algo::iends_with(uri, ".fasta") || algo::iends_with(uri, ".fa");
+    if (isFasta) return DataSetFromFasta(uri);
+
+    // none of the known extensions matched
+    throw std::runtime_error{"DataSet: unsupported extension on input file: " + uri};
+}
+
+}  // namespace
+
+// Single-URI convenience overload: forwards to FromUris() with a one-element list.
+std::unique_ptr<DataSetBase> DataSetIO::FromUri(const std::string& uri)
+{
+    const std::vector<std::string> single{uri};
+    return FromUris(single);
+}
+
+// Loads every URI in the list and combines the results into a single dataset.
+//
+// Each URI is loaded with DataSetFromUri(). When there is more than one, the second and
+// later datasets are merged into the first with operator+=, in input order, and the first
+// is returned.
+// Throws std::runtime_error if the list is empty.
+std::unique_ptr<DataSetBase> DataSetIO::FromUris(const std::vector<std::string>& uris)
+{
+    if (uris.empty()) throw std::runtime_error{"DataSet: empty input URI list"};
+
+    // one dataset per URI
+    std::vector<std::unique_ptr<DataSetBase> > loaded;
+    loaded.reserve(uris.size());
+    for (const auto& uri : uris)
+        loaded.emplace_back(DataSetFromUri(uri));
+    assert(!loaded.empty());
+
+    // take ownership of the first dataset, then fold the remaining ones into it;
+    // with a single input the loop body never runs
+    std::unique_ptr<DataSetBase> merged = std::move(loaded.front());
+    for (size_t i = 1; i < loaded.size(); ++i)
+        *merged += *loaded.at(i);
+    return merged;
+}
+
+// Parses a dataset from XML text held in memory, via XmlReader::FromStream().
+//
+// Throws std::runtime_error if the string is empty.
+std::unique_ptr<DataSetBase> DataSetIO::FromXmlString(const std::string& xml)
+{
+    if (xml.empty()) throw std::runtime_error{"DataSet: cannot load from empty XML string"};
+    std::istringstream s{xml};
+    return XmlReader::FromStream(s);
+}
+
+void DataSetIO::ToFile(const std::unique_ptr<DataSetBase>& dataset, const std::string& fn)
+{
+    DataSetIO::ToFile(*dataset, fn);
+}
+
+void DataSetIO::ToStream(const std::unique_ptr<DataSetBase>& dataset, std::ostream& out)
+{
+    DataSetIO::ToStream(*dataset, out);
+}
+
+// Writes the dataset to the file fn using XmlWriter::ToStream().
+//
+// Throws std::runtime_error if the file cannot be opened for writing.
+void DataSetIO::ToFile(DataSetBase& dataset, const std::string& fn)
+{
+    std::ofstream out(fn);
+    if (!out) throw std::runtime_error{"DataSet: could not open XML file for writing: " + fn};
+    XmlWriter::ToStream(dataset, out);
+}
+
+void DataSetIO::ToStream(DataSetBase& dataset, std::ostream& out)
+{
+    XmlWriter::ToStream(dataset, out);
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/DataSetIO.h b/src/DataSetIO.h

new file mode 100644 (file)

index 0000000..733f69b
--- /dev/null
+++ b/src/DataSetIO.h
@@ -0,0 +1,36 @@
+// Author: Derek Barnett
+
+#ifndef DATASETIO_H
+#define DATASETIO_H
+
+#include "pbbam/Config.h"
+
+#include <iosfwd>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include <pbbam/DataSet.h>
+
+namespace PacBio {
+namespace BAM {
+
+// Static helpers for creating DataSetBase objects from input files or XML text, and for
+// writing them to a file or stream.
+class DataSetIO
+{
+public:
+    // input
+
+    // Loads a dataset from a single filename.
+    static std::unique_ptr<DataSetBase> FromUri(const std::string& uri);
+    // Loads a dataset from each filename and merges them into one.
+    static std::unique_ptr<DataSetBase> FromUris(const std::vector<std::string>& uris);
+    // Parses a dataset from XML text held in memory.
+    static std::unique_ptr<DataSetBase> FromXmlString(const std::string& xml);
+
+    // output
+
+    // Each operation is offered for a dataset reference and for a unique_ptr that owns one.
+    static void ToFile(DataSetBase& dataset, const std::string& fn);
+    static void ToFile(const std::unique_ptr<DataSetBase>& dataset, const std::string& fn);
+    static void ToStream(DataSetBase& dataset, std::ostream& out);
+    static void ToStream(const std::unique_ptr<DataSetBase>& dataset, std::ostream& out);
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // DATASETIO_H
diff --git a/src/DataSetTypes.cpp b/src/DataSetTypes.cpp

new file mode 100644 (file)

index 0000000..d45391f
--- /dev/null
+++ b/src/DataSetTypes.cpp
@@ -0,0 +1,1176 @@
+// File Description
+/// \file DataSetTypes.cpp
+/// \brief Implementations for the public DataSet component classes.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/DataSetTypes.h"
+
+#include <cstddef>
+#include <set>
+#include <unordered_map>
+
+#include "pbbam/Unused.h"
+#include "pbbam/internal/DataSetBaseTypes.h"
+
+#include "DataSetIO.h"
+#include "DataSetUtils.h"
+#include "FileUtils.h"
+#include "TimeUtils.h"
+
+namespace {
+
+// clang-format off
+using ElementType = PacBio::BAM::XmlElementType;
+const std::unordered_map<std::string, PacBio::BAM::XmlElementType> elementTypeLookup
+{
+    {"DataSetMetadata",        ElementType::DATASET_METADATA},
+    {"BioSample",              ElementType::BIOSAMPLE},
+    {"BioSamples",             ElementType::BIOSAMPLES},
+    {"DNABarcode",             ElementType::DNA_BARCODE},
+    {"DNABarcodes",            ElementType::DNA_BARCODES},
+    {"ExtensionElement",       ElementType::EXTENSION},
+    {"Extensions",             ElementType::EXTENSIONS},
+    {"ExternalResource",       ElementType::EXTERNAL_RESOURCE},
+    {"ExternalResources",      ElementType::EXTERNAL_RESOURCES},
+    {"FileIndex",              ElementType::FILE_INDEX},
+    {"FileIndices",            ElementType::FILE_INDICES},
+    {"Filter",                 ElementType::FILTER},
+    {"Filters",                ElementType::FILTERS},
+    {"ParentTool",             ElementType::PARENT_TOOL},
+    {"Property",               ElementType::PROPERTY},
+    {"Properties",             ElementType::PROPERTIES},
+    {"Provenance",             ElementType::PROVENANCE},
+    {"AlignmentSet",           ElementType::ALIGNMENT_SET},
+    {"BarcodeSet",             ElementType::BARCODE_SET},
+    {"ConsensusAlignmentSet",  ElementType::CONSENSUS_ALIGNMENT_SET},
+    {"ConsensusReadSet",       ElementType::CONSENSUS_READ_SET},
+    {"ContigSet",              ElementType::CONTIG_SET},
+    {"HdfSubreadSet",          ElementType::HDF_SUBREAD_SET},
+    {"ReferenceSet",           ElementType::REFERENCE_SET},
+    {"SubreadSet",             ElementType::SUBREAD_SET},
+    {"TranscriptSet",          ElementType::TRANSCRIPT_SET},
+    {"TranscriptAlignmentSet", ElementType::TRANSCRIPT_ALIGNMENT_SET},
+    {"DataSets",               ElementType::SUBDATASETS},
+    {"DataSet",                ElementType::GENERIC_DATASET}
+};
+// clang-format on
+
+}  // namespace
+
+namespace PacBio {
+namespace BAM {
+
+// -------------------
+// AlignmentSet
+// -------------------
+
+AlignmentSet::AlignmentSet()
+    : DataSetBase("PacBio.DataSet.AlignmentSet", "AlignmentSet", XsdType::DATASETS)
+{
+}
+
+AlignmentSet::AlignmentSet(const internal::FromInputXml& fromInputXml)
+    : DataSetBase("", "AlignmentSet", fromInputXml, XsdType::DATASETS)
+{
+}
+
+// -------------------
+// BarcodeSet
+// -------------------
+
+BarcodeSet::BarcodeSet() : DataSetBase("PacBio.DataSet.BarcodeSet", "BarcodeSet", XsdType::DATASETS)
+{
+}
+
+BarcodeSet::BarcodeSet(const internal::FromInputXml& fromInputXml)
+    : DataSetBase("", "BarcodeSet", fromInputXml, XsdType::DATASETS)
+{
+}
+
+// -------------------
+// BioSample
+// -------------------
+
+BioSample::BioSample(const std::string& name) : DataSetElement("BioSample", XsdType::SAMPLE_INFO)
+{
+    Name(name);
+}
+
+BioSample::BioSample(const std::string& name, const internal::FromInputXml& fromInputXml)
+    : DataSetElement("", fromInputXml, XsdType::SAMPLE_INFO)
+{
+    Name(name);
+}
+
+DEFINE_ACCESSORS(BioSample, DNABarcodes, DNABarcodes)
+
+BioSample& BioSample::DNABarcodes(const PacBio::BAM::DNABarcodes& barcodes)
+{
+    DNABarcodes() = barcodes;
+    return *this;
+}
+
+const std::string& BioSample::Name() const { return Attribute("Name"); }
+
+std::string& BioSample::Name() { return Attribute("Name"); }
+
+BioSample& BioSample::Name(const std::string& name)
+{
+    Attribute("Name", name);
+    return *this;
+}
+
+// -------------------
+// BioSamples
+// -------------------
+
+BioSamples::BioSamples() : DataSetElement("BioSamples", XsdType::SAMPLE_INFO) {}
+
+BioSamples::BioSamples(const internal::FromInputXml& fromInputXml)
+    : DataSetElement("", fromInputXml, XsdType::SAMPLE_INFO)
+{
+}
+
+void BioSamples::Add(const BioSample& sample) { AddChild(sample); }
+
+void BioSamples::Remove(const BioSample& sample) { RemoveChild(sample); }
+
+BioSamples::iterator_type BioSamples::begin() { return BioSamples::iterator_type(this, 0); }
+
+BioSamples::const_iterator_type BioSamples::begin() const { return cbegin(); }
+
+BioSamples::const_iterator_type BioSamples::cbegin() const
+{
+    return BioSamples::const_iterator_type(this, 0);
+}
+
+BioSamples::iterator_type BioSamples::end()
+{
+    return BioSamples::iterator_type(this, NumChildren());
+}
+
+BioSamples::const_iterator_type BioSamples::end() const { return cend(); }
+
+BioSamples::const_iterator_type BioSamples::cend() const
+{
+    return BioSamples::const_iterator_type(this, NumChildren());
+}
+
+// Read-only access to the child at position index, viewed as a BioSamples::value_type.
+// Access goes through children_.at(index) and a dynamic_cast to a reference, so a child
+// of any other type results in std::bad_cast.
+const BioSamples::value_type& BioSamples::operator[](size_t index) const
+{
+    return dynamic_cast<const BioSamples::value_type&>(*(children_.at(index).get()));
+}
+
+// Mutable access to the child at position index, viewed as a BioSamples::value_type.
+// Access goes through children_.at(index) and a dynamic_cast to a reference, so a child
+// of any other type results in std::bad_cast.
+BioSamples::value_type& BioSamples::operator[](size_t index)
+{
+    return dynamic_cast<BioSamples::value_type&>(*(children_.at(index).get()));
+}
+
+// -----------------------
+// ConsensusAlignmentSet
+// -----------------------
+
+ConsensusAlignmentSet::ConsensusAlignmentSet()
+    : DataSetBase("PacBio.DataSet.ConsensusAlignmentSet", "ConsensusAlignmentSet",
+                  XsdType::DATASETS)
+{
+}
+
+ConsensusAlignmentSet::ConsensusAlignmentSet(const internal::FromInputXml& fromInputXml)
+    : DataSetBase("", "ConsensusAlignmentSet", fromInputXml, XsdType::DATASETS)
+{
+}
+
+// -------------------
+// ConsensusReadSet
+// -------------------
+
+// Default form: sets both the metatype and the "ConsensusReadSet" label.
+ConsensusReadSet::ConsensusReadSet()
+    : DataSetBase("PacBio.DataSet.ConsensusReadSet",
+                  "ConsensusReadSet",
+                  XsdType::DATASETS)
+{
+}
+
+// Input-XML form: the metatype argument is left empty.
+ConsensusReadSet::ConsensusReadSet(const internal::FromInputXml& fromInputXml)
+    : DataSetBase("",
+                  "ConsensusReadSet",
+                  fromInputXml,
+                  XsdType::DATASETS)
+{
+}
+
+// -------------------
+// ContigSet
+// -------------------
+
+// Default form: sets both the metatype and the "ContigSet" label.
+ContigSet::ContigSet()
+    : DataSetBase("PacBio.DataSet.ContigSet", "ContigSet", XsdType::DATASETS)
+{
+}
+
+// Input-XML form: the metatype argument is left empty.
+ContigSet::ContigSet(const internal::FromInputXml& fromInputXml)
+    : DataSetBase("",
+                  "ContigSet",
+                  fromInputXml,
+                  XsdType::DATASETS)
+{
+}
+
+// -------------------
+// DataSetBase
+// -------------------
+
+DataSetBase::DataSetBase()
+    : StrictEntityType("PacBio.DataSet.DataSet", "DataSet", XsdType::DATASETS)
+    , path_(FileUtils::CurrentWorkingDirectory())
+{
+}
+
+DataSetBase::DataSetBase(const internal::FromInputXml& fromInputXml)
+    : StrictEntityType("", "DataSet", fromInputXml, XsdType::DATASETS)
+    , path_(FileUtils::CurrentWorkingDirectory())
+{
+}
+
+DataSetBase::DataSetBase(const std::string& metatype, const std::string& label, const XsdType& xsd)
+    : StrictEntityType(metatype, label, xsd), path_(FileUtils::CurrentWorkingDirectory())
+{
+}
+
+DataSetBase::DataSetBase(const std::string& metatype, const std::string& label,
+                         const internal::FromInputXml& fromInputXml, const XsdType& xsd)
+    : StrictEntityType(metatype, label, fromInputXml, xsd)
+    , path_(FileUtils::CurrentWorkingDirectory())
+{
+}
+
+// Read-only access to the "ExternalResources" child element.
+const PacBio::BAM::ExternalResources& DataSetBase::ExternalResources() const
+{
+    return Child<PacBio::BAM::ExternalResources>("ExternalResources");
+}
+
+// Mutable access; a default-constructed "ExternalResources" child is added first if HasChild()
+// reports that none exists.
+PacBio::BAM::ExternalResources& DataSetBase::ExternalResources()
+{
+    const bool hasResources = HasChild("ExternalResources");
+    if (!hasResources) AddChild(PacBio::BAM::ExternalResources());
+    return Child<PacBio::BAM::ExternalResources>("ExternalResources");
+}
+
+// Assigns 'resources' over the current child element; returns *this for chaining.
+DataSetBase& DataSetBase::ExternalResources(const PacBio::BAM::ExternalResources& resources)
+{
+    auto& current = ExternalResources();
+    current = resources;
+    return *this;
+}
+
+// Const and non-const Filters() getters are generated by the macro.
+DEFINE_ACCESSORS(DataSetBase, Filters, Filters)
+
+// Assigns 'filters' over the current Filters element; returns *this for chaining.
+DataSetBase& DataSetBase::Filters(const PacBio::BAM::Filters& filters)
+{
+    auto& current = Filters();
+    current = filters;
+    return *this;
+}
+
+// Getter for the fromInputXml_ flag.
+bool DataSetBase::FromInputXml() const
+{
+    return fromInputXml_;
+}
+
+// Setter for the fromInputXml_ flag.
+void DataSetBase::FromInputXml(bool ok)
+{
+    fromInputXml_ = ok;
+}
+
+// Const and non-const Metadata() getters (child type DataSetMetadata) are generated by the macro.
+DEFINE_ACCESSORS(DataSetBase, DataSetMetadata, Metadata)
+
+// Assigns 'metadata' over the current metadata element; returns *this for chaining.
+DataSetBase& DataSetBase::Metadata(const PacBio::BAM::DataSetMetadata& metadata)
+{
+    auto& current = Metadata();
+    current = metadata;
+    return *this;
+}
+
+// Read-only view of the namespace registry held by this dataset.
+const NamespaceRegistry& DataSetBase::Namespaces() const
+{
+    return registry_;
+}
+
+// Mutable view of the namespace registry held by this dataset.
+NamespaceRegistry& DataSetBase::Namespaces()
+{
+    return registry_;
+}
+
+// Stores 'path' in path_.
+void DataSetBase::Path(const std::string& path)
+{
+    path_ = path;
+}
+
+// Returns the stored path_.
+const std::string& DataSetBase::Path() const
+{
+    return path_;
+}
+
+// Read-only access to the "DataSets" child. If the lookup throws a std::exception, the shared
+// NullObject<SubDataSets>() instance is returned instead.
+const PacBio::BAM::SubDataSets& DataSetBase::SubDataSets() const
+{
+    try {
+        return Child<PacBio::BAM::SubDataSets>("DataSets");
+    } catch (const std::exception&) {
+        return internal::NullObject<PacBio::BAM::SubDataSets>();
+    }
+}
+
+// Mutable access; a copy of the null SubDataSets object is added first if HasChild() reports
+// that no "DataSets" child exists.
+PacBio::BAM::SubDataSets& DataSetBase::SubDataSets()
+{
+    const bool hasSubDataSets = HasChild("DataSets");
+    if (!hasSubDataSets) AddChild(internal::NullObject<PacBio::BAM::SubDataSets>());
+    return Child<PacBio::BAM::SubDataSets>("DataSets");
+}
+
+// Assigns 'subdatasets' over the current "DataSets" child; returns *this for chaining.
+DataSetBase& DataSetBase::SubDataSets(const PacBio::BAM::SubDataSets& subdatasets)
+{
+    auto& current = SubDataSets();
+    current = subdatasets;
+    return *this;
+}
+
+// Returns a newly allocated copy of this dataset. The result comes from a plain 'new', so the
+// caller is responsible for deleting it (or wrapping it in a smart pointer).
+//
+// The copy is constructed as a DataSetBase. The earlier version allocated a DataSetElement and
+// static_cast the pointer to DataSetBase*, so the assignments to registry_ and path_ below wrote
+// to members that the allocated object did not contain (undefined behavior).
+//
+// NOTE(review): relies on DataSetBase being copy-constructible - confirm against the class
+// declaration.
+DataSetBase* DataSetBase::DeepCopy() const
+{
+    auto* result = new DataSetBase(*this);
+
+    // Kept explicit so the copy carries these fields regardless of how the copy constructor
+    // is defined.
+    result->registry_ = registry_;
+    result->path_ = path_;
+    return result;
+}
+
+// Merges 'other' into this dataset: metadata, external resources and filters are combined with
+// their own operator+=, and 'other' itself is appended to the sub-datasets.
+// Throws std::runtime_error if 'other' has a different label and is not a generic "DataSet".
+DataSetBase& DataSetBase::operator+=(const DataSetBase& other)
+{
+    const auto& otherLabel = other.LocalNameLabel();
+    const bool differentType = (otherLabel != LocalNameLabel());
+    const bool otherIsTyped = (otherLabel != "DataSet");
+    if (differentType && otherIsTyped)
+        throw std::runtime_error{"DataSet: cannot merge different dataset types"};
+
+    // merge each component in turn
+    Metadata() += other.Metadata();
+    ExternalResources() += other.ExternalResources();
+    Filters() += other.Filters();
+    SubDataSets() += other;
+
+    return *this;
+}
+
+std::shared_ptr<DataSetBase> DataSetBase::Create(const std::string& typeName)
+{
+    if (typeName == std::string("DataSet")) return std::make_shared<DataSetBase>();
+    if (typeName == std::string("SubreadSet")) return std::make_shared<SubreadSet>();
+    if (typeName == std::string("AlignmentSet")) return std::make_shared<AlignmentSet>();
+    if (typeName == std::string("BarcodeSet")) return std::make_shared<BarcodeSet>();
+    if (typeName == std::string("ConsensusAlignmentSet"))
+        return std::make_shared<ConsensusAlignmentSet>();
+    if (typeName == std::string("ConsensusReadSet")) return std::make_shared<ConsensusReadSet>();
+    if (typeName == std::string("ContigSet")) return std::make_shared<ContigSet>();
+    if (typeName == std::string("HdfSubreadSet")) return std::make_shared<HdfSubreadSet>();
+    if (typeName == std::string("ReferenceSet")) return std::make_shared<ReferenceSet>();
+    if (typeName == std::string("TranscriptSet")) return std::make_shared<TranscriptSet>();
+    if (typeName == std::string("TranscriptAlignmentSet"))
+        return std::make_shared<TranscriptAlignmentSet>();
+
+    // unknown typename
+    throw std::runtime_error{"DataSet: unsupported type: " + typeName};
+}
+
+std::shared_ptr<DataSetBase> DataSetBase::Create(const std::string& typeName,
+                                                 const internal::FromInputXml& fromInputXml)
+{
+    if (typeName == std::string("DataSet")) return std::make_shared<DataSetBase>(fromInputXml);
+    if (typeName == std::string("SubreadSet")) return std::make_shared<SubreadSet>(fromInputXml);
+    if (typeName == std::string("AlignmentSet"))
+        return std::make_shared<AlignmentSet>(fromInputXml);
+    if (typeName == std::string("BarcodeSet")) return std::make_shared<BarcodeSet>(fromInputXml);
+    if (typeName == std::string("ConsensusAlignmentSet"))
+        return std::make_shared<ConsensusAlignmentSet>(fromInputXml);
+    if (typeName == std::string("ConsensusReadSet"))
+        return std::make_shared<ConsensusReadSet>(fromInputXml);
+    if (typeName == std::string("ContigSet")) return std::make_shared<ContigSet>(fromInputXml);
+    if (typeName == std::string("HdfSubreadSet"))
+        return std::make_shared<HdfSubreadSet>(fromInputXml);
+    if (typeName == std::string("ReferenceSet"))
+        return std::make_shared<ReferenceSet>(fromInputXml);
+    if (typeName == std::string("TranscriptSet"))
+        return std::make_shared<TranscriptSet>(fromInputXml);
+    if (typeName == std::string("TranscriptAlignmentSet"))
+        return std::make_shared<TranscriptAlignmentSet>(fromInputXml);
+
+    // unknown typename
+    throw std::runtime_error{"DataSet: unsupported type: " + typeName};
+}
+
+// Writes this dataset to 'outputFilename' via DataSetIO::ToFile().
+void DataSetBase::Save(const std::string& outputFilename)
+{
+    const DataSetBase& self = *this;
+    DataSetIO::ToFile(self, outputFilename);
+}
+
+// Writes this dataset to 'out' via DataSetIO::ToStream().
+void DataSetBase::SaveToStream(std::ostream& out)
+{
+    const DataSetBase& self = *this;
+    DataSetIO::ToStream(self, out);
+}
+
+// -------------------
+// DataSetMetadata
+// -------------------
+
+DataSetMetadata::DataSetMetadata() : DataSetElement("DataSetMetadata", XsdType::DATASETS) {}
+
+DataSetMetadata::DataSetMetadata(const internal::FromInputXml& fromInputXml)
+    : DataSetElement("", fromInputXml, XsdType::DATASETS)
+{
+}
+
+DataSetMetadata::DataSetMetadata(const std::string& numRecords, const std::string& totalLength)
+    : DataSetElement("DataSetMetadata", XsdType::DATASETS)
+{
+    TotalLength(totalLength);
+    NumRecords(numRecords);
+}
+
+DataSetMetadata::DataSetMetadata(const std::string& numRecords, const std::string& totalLength,
+                                 const internal::FromInputXml& fromInputXml)
+    : DataSetElement("", fromInputXml, XsdType::DATASETS)
+{
+    TotalLength(totalLength);
+    NumRecords(numRecords);
+}
+
+// Const and non-const BioSamples() getters are generated by the macro.
+DEFINE_ACCESSORS(DataSetMetadata, BioSamples, BioSamples)
+
+// Assigns 'samples' over the current BioSamples element; returns *this for chaining.
+DataSetMetadata& DataSetMetadata::BioSamples(const PacBio::BAM::BioSamples& samples)
+{
+    auto& current = BioSamples();
+    current = samples;
+    return *this;
+}
+
+// Const and non-const Provenance() getters are generated by the macro.
+DEFINE_ACCESSORS(DataSetMetadata, Provenance, Provenance)
+
+// Assigns 'provenance' over the current Provenance element; returns *this for chaining.
+DataSetMetadata& DataSetMetadata::Provenance(const PacBio::BAM::Provenance& provenance)
+{
+    auto& current = Provenance();
+    current = provenance;
+    return *this;
+}
+
+namespace {
+
+// True if 'text' is non-empty and consists solely of the characters '0'-'9'.
+bool IsDecimalCounter(const std::string& text)
+{
+    if (text.empty()) return false;
+    for (const char c : text) {
+        if (c < '0' || c > '9') return false;
+    }
+    return true;
+}
+
+// Combines two counters that are stored as text. When both are plain decimal numbers the result
+// is their numeric sum; otherwise (an empty or non-numeric field, or a value too large for
+// unsigned long long) the two strings are concatenated, which is what the merge did before.
+std::string SumCounterText(const std::string& lhs, const std::string& rhs)
+{
+    if (IsDecimalCounter(lhs) && IsDecimalCounter(rhs)) {
+        try {
+            return std::to_string(std::stoull(lhs) + std::stoull(rhs));
+        } catch (const std::exception&) {
+            // out of range for unsigned long long: fall through to concatenation
+        }
+    }
+    return lhs + rhs;
+}
+
+}  // namespace
+
+// Merges the counters of 'other' into this metadata element and returns *this.
+//
+// TotalLength and NumRecords are std::strings, so the plain '+' used here previously joined the
+// digits together ("100" + "200" became "100200"). They are now added numerically.
+DataSetMetadata& DataSetMetadata::operator+=(const DataSetMetadata& other)
+{
+    const std::string totalLength = SumCounterText(TotalLength(), other.TotalLength());
+    TotalLength() = totalLength;
+
+    const std::string numRecords = SumCounterText(NumRecords(), other.NumRecords());
+    NumRecords() = numRecords;
+
+    // merge add'l
+    return *this;
+}
+
+// NumRecords: stored as the text of the "NumRecords" child.
+const std::string& DataSetMetadata::NumRecords() const
+{
+    return ChildText("NumRecords");
+}
+
+std::string& DataSetMetadata::NumRecords()
+{
+    return ChildText("NumRecords");
+}
+
+// Sets the "NumRecords" child text; returns *this for chaining.
+DataSetMetadata& DataSetMetadata::NumRecords(const std::string& numRecords)
+{
+    ChildText("NumRecords", numRecords);
+    return *this;
+}
+
+// TotalLength: stored as the text of the "TotalLength" child.
+const std::string& DataSetMetadata::TotalLength() const
+{
+    return ChildText("TotalLength");
+}
+
+std::string& DataSetMetadata::TotalLength()
+{
+    return ChildText("TotalLength");
+}
+
+// Sets the "TotalLength" child text; returns *this for chaining.
+DataSetMetadata& DataSetMetadata::TotalLength(const std::string& totalLength)
+{
+    ChildText("TotalLength", totalLength);
+    return *this;
+}
+
+// -------------------
+// DNABarcode
+// -------------------
+
+DNABarcode::DNABarcode(const std::string& name) : DataSetElement("DNABarcode", XsdType::SAMPLE_INFO)
+{
+    Name(name);
+    UniqueId(internal::GenerateUuid());
+}
+
+DNABarcode::DNABarcode(const std::string& name, const std::string& uuid)
+    : DataSetElement("DNABarcode", XsdType::SAMPLE_INFO)
+{
+    Name(name);
+    UniqueId(uuid);
+}
+
+DNABarcode::DNABarcode(const std::string& name, const internal::FromInputXml& fromInputXml)
+    : DataSetElement("", fromInputXml, XsdType::SAMPLE_INFO)
+{
+    Name(name);
+    UniqueId(internal::GenerateUuid());
+}
+
+DNABarcode::DNABarcode(const std::string& name, const std::string& uuid,
+                       const internal::FromInputXml& fromInputXml)
+    : DataSetElement("", fromInputXml, XsdType::SAMPLE_INFO)
+{
+    Name(name);
+    UniqueId(uuid);
+}
+
+const std::string& DNABarcode::Name() const { return Attribute("Name"); }
+
+std::string& DNABarcode::Name() { return Attribute("Name"); }
+
+DNABarcode& DNABarcode::Name(const std::string& name)
+{
+    Attribute("Name", name);
+    return *this;
+}
+
+const std::string& DNABarcode::UniqueId() const { return Attribute("UniqueId"); }
+
+std::string& DNABarcode::UniqueId() { return Attribute("UniqueId"); }
+
+DNABarcode& DNABarcode::UniqueId(const std::string& uuid)
+{
+    Attribute("UniqueId", uuid);
+    return *this;
+}
+
+// -------------------
+// DNABarcodes
+// -------------------
+
+DNABarcodes::DNABarcodes() : DataSetElement("DNABarcodes", XsdType::SAMPLE_INFO) {}
+
+DNABarcodes::DNABarcodes(const internal::FromInputXml& fromInputXml)
+    : DataSetElement("", fromInputXml, XsdType::SAMPLE_INFO)
+{
+}
+
+void DNABarcodes::Add(const DNABarcode& barcode) { AddChild(barcode); }
+
+void DNABarcodes::Remove(const DNABarcode& barcode) { RemoveChild(barcode); }
+
+DNABarcodes::iterator_type DNABarcodes::begin() { return DNABarcodes::iterator_type(this, 0); }
+
+DNABarcodes::const_iterator_type DNABarcodes::begin() const { return cbegin(); }
+
+DNABarcodes::const_iterator_type DNABarcodes::cbegin() const
+{
+    return DNABarcodes::const_iterator_type(this, 0);
+}
+
+DNABarcodes::iterator_type DNABarcodes::end()
+{
+    return DNABarcodes::iterator_type(this, NumChildren());
+}
+
+DNABarcodes::const_iterator_type DNABarcodes::end() const { return cend(); }
+
+DNABarcodes::const_iterator_type DNABarcodes::cend() const
+{
+    return DNABarcodes::const_iterator_type(this, NumChildren());
+}
+
+const DNABarcodes::value_type& DNABarcodes::operator[](size_t index) const
+{
+    return dynamic_cast<const DNABarcodes::value_type&>(*(children_.at(index).get()));
+}
+
+DNABarcodes::value_type& DNABarcodes::operator[](size_t index)
+{
+    return dynamic_cast<DNABarcodes::value_type&>(*(children_.at(index).get()));
+}
+
+// -------------------
+// ExtensionElement
+// -------------------
+
+// Builds an element labeled "ExtensionElement" in the BASE_DATA_MODEL namespace.
+ExtensionElement::ExtensionElement()
+    : DataSetElement("ExtensionElement", XsdType::BASE_DATA_MODEL)
+{
+}
+
+// Input-XML overload: passes an empty label plus the FromInputXml tag to the base class.
+ExtensionElement::ExtensionElement(const internal::FromInputXml& fromInputXml)
+    : DataSetElement("",
+                     fromInputXml,
+                     XsdType::BASE_DATA_MODEL)
+{
+}
+
+// -------------------
+// Extensions
+// -------------------
+
+Extensions::Extensions() : DataSetElement("Extensions", XsdType::BASE_DATA_MODEL) {}
+
+Extensions::Extensions(const internal::FromInputXml& fromInputXml)
+    : DataSetElement("", fromInputXml, XsdType::BASE_DATA_MODEL)
+{
+}
+
+Extensions::iterator_type Extensions::begin() { return Extensions::iterator_type(this, 0); }
+
+Extensions::const_iterator_type Extensions::begin() const { return cbegin(); }
+
+Extensions::const_iterator_type Extensions::cbegin() const
+{
+    return Extensions::const_iterator_type(this, 0);
+}
+
+Extensions::iterator_type Extensions::end()
+{
+    return Extensions::iterator_type(this, NumChildren());
+}
+
+Extensions::const_iterator_type Extensions::end() const { return cend(); }
+
+Extensions::const_iterator_type Extensions::cend() const
+{
+    return Extensions::const_iterator_type(this, NumChildren());
+}
+
+const Extensions::value_type& Extensions::operator[](size_t index) const
+{
+    return dynamic_cast<const Extensions::value_type&>(*(children_.at(index).get()));
+}
+
+Extensions::value_type& Extensions::operator[](size_t index)
+{
+    return dynamic_cast<Extensions::value_type&>(*(children_.at(index).get()));
+}
+
+// -------------------
+// ExternalResource
+// -------------------
+
+ExternalResource::ExternalResource(const BamFile& bamFile)
+    : IndexedDataType("PacBio.SubreadFile.SubreadBamFile", bamFile.Filename(), "ExternalResource",
+                      XsdType::BASE_DATA_MODEL)
+{
+}
+
+ExternalResource::ExternalResource(const std::string& metatype, const std::string& filename)
+    : IndexedDataType(metatype, filename, "ExternalResource", XsdType::BASE_DATA_MODEL)
+{
+}
+
+ExternalResource::ExternalResource(const std::string& metatype, const std::string& filename,
+                                   const internal::FromInputXml& fromInputXml)
+    : IndexedDataType("", filename, "ExternalResource", fromInputXml, XsdType::BASE_DATA_MODEL)
+{
+    UNUSED(metatype);
+}
+
+DEFINE_ACCESSORS(ExternalResource, ExternalResources, ExternalResources)
+
+ExternalResource& ExternalResource::ExternalResources(
+    const PacBio::BAM::ExternalResources& resources)
+{
+    ExternalResources() = resources;
+    return *this;
+}
+
+BamFile ExternalResource::ToBamFile() const { return BamFile(ResourceId()); }
+
+// -------------------
+// ExternalResources
+// -------------------
+
+// Builds an element labeled "ExternalResources" in the BASE_DATA_MODEL namespace.
+ExternalResources::ExternalResources()
+    : DataSetElement("ExternalResources",
+                     XsdType::BASE_DATA_MODEL)
+{
+}
+
+// Input-XML overload: passes an empty label plus the FromInputXml tag to the base class.
+ExternalResources::ExternalResources(const internal::FromInputXml& fromInputXml)
+    : DataSetElement("",
+                     fromInputXml,
+                     XsdType::BASE_DATA_MODEL)
+{
+}
+
+ExternalResources& ExternalResources::operator+=(const ExternalResources& other)
+{
+    // only keep unique resource ids
+    std::set<std::string> myResourceIds;
+    for (size_t i = 0; i < NumChildren(); ++i) {
+        const ExternalResource& resource = this->operator[](i);
+        myResourceIds.insert(resource.ResourceId());
+    }
+
+    std::vector<size_t> newResourceIndices;
+    const size_t numOtherResourceIds = other.Size();
+    for (size_t i = 0; i < numOtherResourceIds; ++i) {
+        const std::string& resourceId = other[i].ResourceId();
+        auto found = myResourceIds.find(resourceId);
+        if (found == myResourceIds.cend()) newResourceIndices.push_back(i);
+    }
+
+    for (size_t index : newResourceIndices)
+        Add(other[index]);
+    return *this;
+}
+
+// Adds 'ext' unless a resource with the same ResourceId is already in the list.
+void ExternalResources::Add(const ExternalResource& ext)
+{
+    // collect the ids currently held
+    std::set<std::string> knownIds;
+    for (size_t i = 0; i < NumChildren(); ++i)
+        knownIds.insert((*this)[i].ResourceId());
+
+    const bool isDuplicate = (knownIds.count(ext.ResourceId()) != 0);
+    if (!isDuplicate) AddChild(ext);
+}
+
+// Returns one BamFile per resource in this list, in iteration order. Each entry is produced by
+// ExternalResource::ToBamFile().
+std::vector<BamFile> ExternalResources::BamFiles() const
+{
+    std::vector<BamFile> result;
+
+    // Size() goes straight to reserve(): the former 'const int' temporary narrowed the count to
+    // a signed int before it was converted back to the vector's unsigned size type.
+    result.reserve(Size());
+    for (const ExternalResource& ext : *this)
+        result.push_back(ext.ToBamFile());
+    return result;
+}
+
+void ExternalResources::Remove(const ExternalResource& ext) { RemoveChild(ext); }
+
+ExternalResources::iterator_type ExternalResources::begin()
+{
+    return ExternalResources::iterator_type(this, 0);
+}
+
+ExternalResources::const_iterator_type ExternalResources::begin() const { return cbegin(); }
+
+ExternalResources::const_iterator_type ExternalResources::cbegin() const
+{
+    return ExternalResources::const_iterator_type(this, 0);
+}
+
+ExternalResources::iterator_type ExternalResources::end()
+{
+    return ExternalResources::iterator_type(this, NumChildren());
+}
+
+ExternalResources::const_iterator_type ExternalResources::end() const { return cend(); }
+
+ExternalResources::const_iterator_type ExternalResources::cend() const
+{
+    return ExternalResources::const_iterator_type(this, NumChildren());
+}
+
+const ExternalResources::value_type& ExternalResources::operator[](size_t index) const
+{
+    return dynamic_cast<const ExternalResources::value_type&>(*(children_.at(index).get()));
+}
+
+ExternalResources::value_type& ExternalResources::operator[](size_t index)
+{
+    return dynamic_cast<ExternalResources::value_type&>(*(children_.at(index).get()));
+}
+
+// -------------------
+// FileIndex
+// -------------------
+
+// Index entry with a caller-supplied metatype and filename, labeled "FileIndex".
+FileIndex::FileIndex(const std::string& metatype, const std::string& filename)
+    : InputOutputDataType(metatype,
+                          filename,
+                          "FileIndex",
+                          XsdType::BASE_DATA_MODEL)
+{
+}
+
+// Input-XML form: 'metatype' is accepted but not used; an empty metatype is forwarded.
+FileIndex::FileIndex(const std::string& metatype, const std::string& filename,
+                     const internal::FromInputXml& fromInputXml)
+    : InputOutputDataType("",
+                          filename,
+                          "FileIndex",
+                          fromInputXml,
+                          XsdType::BASE_DATA_MODEL)
+{
+    UNUSED(metatype);
+}
+
+// -------------------
+// FileIndices
+// -------------------
+
+FileIndices::FileIndices() : DataSetElement("FileIndices", XsdType::BASE_DATA_MODEL) {}
+
+FileIndices::FileIndices(const internal::FromInputXml& fromInputXml)
+    : DataSetElement("", fromInputXml, XsdType::BASE_DATA_MODEL)
+{
+}
+
+void FileIndices::Add(const FileIndex& index) { AddChild(index); }
+
+void FileIndices::Remove(const FileIndex& index) { RemoveChild(index); }
+
+FileIndices::iterator_type FileIndices::begin() { return FileIndices::iterator_type(this, 0); }
+
+FileIndices::const_iterator_type FileIndices::begin() const { return cbegin(); }
+
+FileIndices::const_iterator_type FileIndices::cbegin() const
+{
+    return FileIndices::const_iterator_type(this, 0);
+}
+
+FileIndices::iterator_type FileIndices::end()
+{
+    return FileIndices::iterator_type(this, NumChildren());
+}
+
+FileIndices::const_iterator_type FileIndices::end() const { return cend(); }
+
+FileIndices::const_iterator_type FileIndices::cend() const
+{
+    return FileIndices::const_iterator_type(this, NumChildren());
+}
+
+const FileIndices::value_type& FileIndices::operator[](size_t index) const
+{
+    return dynamic_cast<const FileIndices::value_type&>(*(children_.at(index).get()));
+}
+
+FileIndices::value_type& FileIndices::operator[](size_t index)
+{
+    return dynamic_cast<FileIndices::value_type&>(*(children_.at(index).get()));
+}
+
+// -------------------
+// Filter
+// -------------------
+
+// Builds an element labeled "Filter" in the DATASETS namespace.
+Filter::Filter()
+    : DataSetElement("Filter", XsdType::DATASETS)
+{
+}
+
+// Input-XML overload; note that the "Filter" label is passed here as well.
+Filter::Filter(const internal::FromInputXml& fromInputXml)
+    : DataSetElement("Filter",
+                     fromInputXml,
+                     XsdType::DATASETS)
+{
+}
+
+// Const and non-const Properties() getters are generated by the macro.
+DEFINE_ACCESSORS(Filter, Properties, Properties)
+
+// Assigns 'properties' over the current Properties element; returns *this for chaining.
+Filter& Filter::Properties(const PacBio::BAM::Properties& properties)
+{
+    auto& current = Properties();
+    current = properties;
+    return *this;
+}
+
+// -------------------
+// Filters
+// -------------------
+
+Filters::Filters() : DataSetElement("Filters", XsdType::DATASETS) {}
+
+Filters::Filters(const internal::FromInputXml& fromInputXml)
+    : DataSetElement("", fromInputXml, XsdType::DATASETS)
+{
+}
+
+Filters& Filters::operator+=(const Filters& other)
+{
+    for (auto& newFilter : other)
+        AddChild(newFilter);
+    return *this;
+}
+
+void Filters::Add(const Filter& filter) { AddChild(filter); }
+
+void Filters::Remove(const Filter& filter) { RemoveChild(filter); }
+
+Filters::iterator_type Filters::begin() { return Filters::iterator_type(this, 0); }
+
+Filters::const_iterator_type Filters::begin() const { return cbegin(); }
+
+Filters::const_iterator_type Filters::cbegin() const
+{
+    return Filters::const_iterator_type(this, 0);
+}
+
+Filters::iterator_type Filters::end() { return Filters::iterator_type(this, NumChildren()); }
+
+Filters::const_iterator_type Filters::end() const { return cend(); }
+
+Filters::const_iterator_type Filters::cend() const
+{
+    return Filters::const_iterator_type(this, NumChildren());
+}
+
+const Filters::value_type& Filters::operator[](size_t index) const
+{
+    return dynamic_cast<const Filters::value_type&>(*(children_.at(index).get()));
+}
+
+Filters::value_type& Filters::operator[](size_t index)
+{
+    return dynamic_cast<Filters::value_type&>(*(children_.at(index).get()));
+}
+
+// -------------------
+// HdfSubreadSet
+// -------------------
+
+// Default form: sets both the metatype and the "HdfSubreadSet" label.
+HdfSubreadSet::HdfSubreadSet()
+    : DataSetBase("PacBio.DataSet.HdfSubreadSet",
+                  "HdfSubreadSet",
+                  XsdType::DATASETS)
+{
+}
+
+// Input-XML form: the metatype argument is left empty.
+HdfSubreadSet::HdfSubreadSet(const internal::FromInputXml& fromInputXml)
+    : DataSetBase("",
+                  "HdfSubreadSet",
+                  fromInputXml,
+                  XsdType::DATASETS)
+{
+}
+
+// -------------------
+// ParentTool
+// -------------------
+
+// Builds an entity labeled "ParentTool" in the DATASETS namespace.
+ParentTool::ParentTool()
+    : BaseEntityType("ParentTool", XsdType::DATASETS)
+{
+}
+
+// Input-XML overload: passes an empty label plus the FromInputXml tag to the base class.
+ParentTool::ParentTool(const internal::FromInputXml& fromInputXml)
+    : BaseEntityType("",
+                     fromInputXml,
+                     XsdType::DATASETS)
+{
+}
+
+// -------------------
+// Properties
+// -------------------
+
+Properties::Properties() : DataSetElement("Properties", XsdType::BASE_DATA_MODEL) {}
+
+Properties::Properties(const internal::FromInputXml& fromInputXml)
+    : DataSetElement("", fromInputXml, XsdType::BASE_DATA_MODEL)
+{
+}
+
+void Properties::Add(const Property& property) { AddChild(property); }
+
+void Properties::Remove(const Property& property) { RemoveChild(property); }
+
+Properties::iterator_type Properties::begin() { return Properties::iterator_type(this, 0); }
+
+Properties::const_iterator_type Properties::begin() const { return cbegin(); }
+
+Properties::const_iterator_type Properties::cbegin() const
+{
+    return Properties::const_iterator_type(this, 0);
+}
+
+Properties::iterator_type Properties::end()
+{
+    return Properties::iterator_type(this, NumChildren());
+}
+
+Properties::const_iterator_type Properties::end() const { return cend(); }
+
+Properties::const_iterator_type Properties::cend() const
+{
+    return Properties::const_iterator_type(this, NumChildren());
+}
+
+const Properties::value_type& Properties::operator[](size_t index) const
+{
+    return dynamic_cast<const Properties::value_type&>(*(children_.at(index).get()));
+}
+
+Properties::value_type& Properties::operator[](size_t index)
+{
+    return dynamic_cast<Properties::value_type&>(*(children_.at(index).get()));
+}
+
+// -------------------
+// Property
+// -------------------
+
+Property::Property(const std::string& name, const std::string& value, const std::string& op)
+    : DataSetElement("Property", XsdType::BASE_DATA_MODEL)
+{
+    Name(name);
+    Value(value);
+    Operator(op);
+}
+
+Property::Property(const std::string& name, const std::string& value, const std::string& op,
+                   const internal::FromInputXml& fromInputXml)
+    : DataSetElement("", fromInputXml, XsdType::BASE_DATA_MODEL)
+{
+    Name(name);
+    Value(value);
+    Operator(op);
+}
+
+const std::string& Property::Name() const { return Attribute("Name"); }
+
+std::string& Property::Name() { return Attribute("Name"); }
+
+Property& Property::Name(const std::string& name)
+{
+    Attribute("Name", name);
+    return *this;
+}
+
+const std::string& Property::Operator() const { return Attribute("Operator"); }
+
+std::string& Property::Operator() { return Attribute("Operator"); }
+
+Property& Property::Operator(const std::string& op)
+{
+    Attribute("Operator", op);
+    return *this;
+}
+
+const std::string& Property::Value() const { return Attribute("Value"); }
+
+std::string& Property::Value() { return Attribute("Value"); }
+
+Property& Property::Value(const std::string& value)
+{
+    Attribute("Value", value);
+    return *this;
+}
+
+// -------------------
+// Provenance
+// -------------------
+
+Provenance::Provenance() : DataSetElement("Provenance", XsdType::DATASETS) {}
+
+Provenance::Provenance(const internal::FromInputXml& fromInputXml)
+    : DataSetElement("", fromInputXml, XsdType::DATASETS)
+{
+}
+
+DEFINE_ACCESSORS(Provenance, ParentTool, ParentTool)
+
+const std::string& Provenance::CreatedBy() const { return Attribute("CreatedBy"); }
+
+std::string& Provenance::CreatedBy() { return Attribute("CreatedBy"); }
+
+Provenance& Provenance::CreatedBy(const std::string& createdBy)
+{
+    Attribute("CreatedBy", createdBy);
+    return *this;
+}
+
+const std::string& Provenance::CommonServicesInstanceId() const
+{
+    return ChildText("CommonServicesInstanceId");
+}
+
+std::string& Provenance::CommonServicesInstanceId()
+{
+    return ChildText("CommonServicesInstanceId");
+}
+
+Provenance& Provenance::CommonServicesInstanceId(const std::string& id)
+{
+    ChildText("CommonServicesInstanceId", id);
+    return *this;
+}
+
+const std::string& Provenance::CreatorUserId() const { return ChildText("CreatorUserId"); }
+
+std::string& Provenance::CreatorUserId() { return ChildText("CreatorUserId"); }
+
+Provenance& Provenance::CreatorUserId(const std::string& id)
+{
+    ChildText("CreatorUserId", id);
+    return *this;
+}
+
+const std::string& Provenance::ParentJobId() const { return ChildText("ParentJobId"); }
+
+std::string& Provenance::ParentJobId() { return ChildText("ParentJobId"); }
+
+Provenance& Provenance::ParentJobId(const std::string& id)
+{
+    ChildText("ParentJobId", id);
+    return *this;
+}
+
+Provenance& Provenance::ParentTool(const PacBio::BAM::ParentTool& tool)
+{
+    ParentTool() = tool;
+    return *this;
+}
+
+// -------------------
+// ReferenceSet
+// -------------------
+
+// Default form: sets both the metatype and the "ReferenceSet" label.
+ReferenceSet::ReferenceSet()
+    : DataSetBase("PacBio.DataSet.ReferenceSet",
+                  "ReferenceSet",
+                  XsdType::DATASETS)
+{
+}
+
+// Input-XML form: the metatype argument is left empty.
+ReferenceSet::ReferenceSet(const internal::FromInputXml& fromInputXml)
+    : DataSetBase("",
+                  "ReferenceSet",
+                  fromInputXml,
+                  XsdType::DATASETS)
+{
+}
+
+// -------------------
+// SubDataSets
+// -------------------
+
+SubDataSets::SubDataSets() : DataSetElement("DataSets", XsdType::DATASETS) {}
+
+SubDataSets::SubDataSets(const internal::FromInputXml& fromInputXml)
+    : DataSetElement("", fromInputXml, XsdType::DATASETS)
+{
+}
+
+SubDataSets& SubDataSets::operator+=(const DataSetBase& other)
+{
+    AddChild(other);
+    return *this;
+}
+
+SubDataSets& SubDataSets::operator+=(const SubDataSets& other)
+{
+    for (auto& newSubDataset : other)
+        AddChild(newSubDataset);
+    return *this;
+}
+
+void SubDataSets::Add(const DataSetBase& subdataset) { AddChild(subdataset); }
+
+void SubDataSets::Remove(const DataSetBase& subdataset) { RemoveChild(subdataset); }
+
+SubDataSets::iterator_type SubDataSets::begin() { return SubDataSets::iterator_type(this, 0); }
+
+SubDataSets::const_iterator_type SubDataSets::begin() const { return cbegin(); }
+
+SubDataSets::const_iterator_type SubDataSets::cbegin() const
+{
+    return SubDataSets::const_iterator_type(this, 0);
+}
+
+SubDataSets::iterator_type SubDataSets::end()
+{
+    return SubDataSets::iterator_type(this, NumChildren());
+}
+
+SubDataSets::const_iterator_type SubDataSets::end() const { return cend(); }
+
+SubDataSets::const_iterator_type SubDataSets::cend() const
+{
+    return SubDataSets::const_iterator_type(this, NumChildren());
+}
+
+const SubDataSets::value_type& SubDataSets::operator[](size_t index) const
+{
+    return dynamic_cast<const SubDataSets::value_type&>(*(children_.at(index).get()));
+}
+
+SubDataSets::value_type& SubDataSets::operator[](size_t index)
+{
+    return dynamic_cast<SubDataSets::value_type&>(*(children_.at(index).get()));
+}
+
+// -------------------
+// SubreadSet
+// -------------------
+
+// Default form: sets both the metatype and the "SubreadSet" label.
+SubreadSet::SubreadSet()
+    : DataSetBase("PacBio.DataSet.SubreadSet", "SubreadSet", XsdType::DATASETS)
+{
+}
+
+// Input-XML form: the metatype argument is left empty.
+SubreadSet::SubreadSet(const internal::FromInputXml& fromInputXml)
+    : DataSetBase("",
+                  "SubreadSet",
+                  fromInputXml,
+                  XsdType::DATASETS)
+{
+}
+
+// -------------------
+// TranscriptSet
+// -------------------
+
+// Default form: sets both the metatype and the "TranscriptSet" label.
+TranscriptSet::TranscriptSet()
+    : DataSetBase("PacBio.DataSet.TranscriptSet",
+                  "TranscriptSet",
+                  XsdType::DATASETS)
+{
+}
+
+// Input-XML form: the metatype argument is left empty.
+TranscriptSet::TranscriptSet(const internal::FromInputXml& fromInputXml)
+    : DataSetBase("",
+                  "TranscriptSet",
+                  fromInputXml,
+                  XsdType::DATASETS)
+{
+}
+
+// -------------------
+// TranscriptAlignmentSet
+// -------------------
+
+// Default form: sets both the metatype and the "TranscriptAlignmentSet" label.
+TranscriptAlignmentSet::TranscriptAlignmentSet()
+    : DataSetBase("PacBio.DataSet.TranscriptAlignmentSet",
+                  "TranscriptAlignmentSet",
+                  XsdType::DATASETS)
+{
+}
+
+// Input-XML form: the metatype argument is left empty.
+TranscriptAlignmentSet::TranscriptAlignmentSet(const internal::FromInputXml& fromInputXml)
+    : DataSetBase("",
+                  "TranscriptAlignmentSet",
+                  fromInputXml,
+                  XsdType::DATASETS)
+{
+}
+
+// Maps an element name to its XmlElementType via elementTypeLookup; names without an entry
+// yield XmlElementType::GENERIC_ELEMENT.
+XmlElementType ElementTypeFromName(const std::string& name)
+{
+    const auto entry = elementTypeLookup.find(name);
+    const bool isKnown = (entry != elementTypeLookup.cend());
+    return isKnown ? entry->second : XmlElementType::GENERIC_ELEMENT;
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/DataSetUtils.h b/src/DataSetUtils.h

new file mode 100644 (file)

index 0000000..5a843d3
--- /dev/null
+++ b/src/DataSetUtils.h
@@ -0,0 +1,73 @@
+// Author: Derek Barnett
+
+#ifndef DATASETUTILS_H
+#define DATASETUTILS_H
+
+#include "pbbam/Config.h"
+
+#include <boost/uuid/random_generator.hpp>
+#include <boost/uuid/uuid_io.hpp>
+
+#include <pbbam/DataSetTypes.h>
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// Version string "3.0.1"; declared static, so each including translation unit gets its own copy.
+static const std::string XML_VERSION{"3.0.1"};
+
+// Returns a reference to a function-local static, default-constructed T. Every call for the
+// same T yields the same object.
+template <typename T>
+inline const T& NullObject()
+{
+    static const T instance;
+    return instance;
+}
+
+// DataSetMetadata specialization: the shared instance is built from two empty strings.
+template <>
+inline const PacBio::BAM::DataSetMetadata& NullObject()
+{
+    static const PacBio::BAM::DataSetMetadata instance("", "");
+    return instance;
+}
+
+// Returns the string form of a UUID drawn from a function-local static
+// boost::uuids::random_generator, which is reused across calls.
+inline std::string GenerateUuid()
+{
+    static boost::uuids::random_generator generator;
+    return boost::uuids::to_string(generator());
+}
+
+}  // namespace internal
+}  // namespace BAM
+}  // namespace PacBio
+
+#ifndef FETCH_CHILD_CONST_REF
+#define FETCH_CHILD_CONST_REF(Class, Type, Method)            \
+                                                              \
+    const PacBio::BAM::Type& Class::Method() const            \
+    {                                                         \
+        try {                                                 \
+            return Child<PacBio::BAM::Type>(#Type);           \
+        } catch (std::exception&) {                           \
+            return internal::NullObject<PacBio::BAM::Type>(); \
+        }                                                     \
+    }
+#endif
+
+#ifndef FETCH_CHILD_REF
+#define FETCH_CHILD_REF(Class, Type, Method)                                       \
+                                                                                   \
+    PacBio::BAM::Type& Class::Method()                                             \
+    {                                                                              \
+        if (!HasChild(#Type)) AddChild(internal::NullObject<PacBio::BAM::Type>()); \
+        return Child<PacBio::BAM::Type>(#Type);                                    \
+    }
+#endif
+
+#ifndef DEFINE_ACCESSORS
+// Expands to both getters for one child element: the const overload from
+// FETCH_CHILD_CONST_REF followed by the mutable overload from FETCH_CHILD_REF.
+#define DEFINE_ACCESSORS(Class, Type, Method)  \
+    FETCH_CHILD_CONST_REF(Class, Type, Method) \
+    FETCH_CHILD_REF(Class, Type, Method)
+#endif
+
+#endif  // DATASETUTILS_H
diff --git a/src/DataSetXsd.cpp b/src/DataSetXsd.cpp

new file mode 100644 (file)

index 0000000..2151fd2
--- /dev/null
+++ b/src/DataSetXsd.cpp
@@ -0,0 +1,226 @@
+// File Description
+/// \file DataSetXsd.cpp
+/// \brief Implements the XSD- and namespace-related classes for DataSetXML.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/DataSetXsd.h"
+
+#include <cassert>
+#include <type_traits>
+#include <unordered_map>
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// clang-format off
+/// Builds the built-in table mapping each XsdType to its namespace prefix and
+/// XSD URI. Types without a prefix are registered with an empty name.
+static std::map<XsdType, NamespaceInfo> DefaultRegistry()
+{
+    using Registry = std::map<XsdType, NamespaceInfo>;
+    return Registry{
+        { XsdType::NONE,                   NamespaceInfo{ "", "" } },
+        { XsdType::AUTOMATION_CONSTRAINTS, NamespaceInfo{ "",         "http://pacificbiosciences.com/PacBioAutomationConstraints.xsd" } },
+        { XsdType::BASE_DATA_MODEL,        NamespaceInfo{ "pbbase",   "http://pacificbiosciences.com/PacBioBaseDataModel.xsd" } },
+        { XsdType::COLLECTION_METADATA,    NamespaceInfo{ "pbmeta",   "http://pacificbiosciences.com/PacBioCollectionMetadata.xsd" } },
+        { XsdType::COMMON_MESSAGES,        NamespaceInfo{ "",         "http://pacificbiosciences.com/PacBioCommonMessages.xsd" } },
+        { XsdType::DATA_MODEL,             NamespaceInfo{ "pbdm",     "http://pacificbiosciences.com/PacBioDataModel.xsd" } },
+        { XsdType::DATA_STORE,             NamespaceInfo{ "",         "http://pacificbiosciences.com/PacBioDataStore.xsd" } },
+        { XsdType::DATASETS,               NamespaceInfo{ "pbds",     "http://pacificbiosciences.com/PacBioDatasets.xsd" } },
+        { XsdType::DECL_DATA,              NamespaceInfo{ "",         "http://pacificbiosciences.com/PacBioDeclData.xsd" } },
+        { XsdType::PART_NUMBERS,           NamespaceInfo{ "pbpn",     "http://pacificbiosciences.com/PacBioPartNumbers.xsd" } },
+        { XsdType::PRIMARY_METRICS,        NamespaceInfo{ "",         "http://pacificbiosciences.com/PacBioPrimaryMetrics.xsd" } },
+        { XsdType::REAGENT_KIT,            NamespaceInfo{ "pbrk",     "http://pacificbiosciences.com/PacBioReagentKit.xsd" } },
+        { XsdType::RIGHTS_AND_ROLES,       NamespaceInfo{ "",         "http://pacificbiosciences.com/PacBioRightsAndRoles.xsd" } },
+        { XsdType::SAMPLE_INFO,            NamespaceInfo{ "pbsample", "http://pacificbiosciences.com/PacBioSampleInfo.xsd" } },
+        { XsdType::SEEDING_DATA,           NamespaceInfo{ "",         "http://pacificbiosciences.com/PacBioSeedingData.xsd" } }
+    };
+}
+
+/// Lookup table from an (unprefixed) XML element label to the XSD type whose
+/// namespace the element belongs to. Labels not listed here are reported as
+/// XsdType::NONE by NamespaceRegistry::XsdForElement().
+static const auto elementRegistry = std::unordered_map<std::string, XsdType>
+{
+    // 'pbbase' elements
+    //
+    { "AutomationParameter" ,  XsdType::BASE_DATA_MODEL },
+    { "AutomationParameters" , XsdType::BASE_DATA_MODEL },
+    { "BinCount" ,             XsdType::BASE_DATA_MODEL },
+    { "BinCounts" ,            XsdType::BASE_DATA_MODEL },
+    { "BinLabel" ,             XsdType::BASE_DATA_MODEL },
+    { "BinLabels" ,            XsdType::BASE_DATA_MODEL },
+    { "BinWidth" ,             XsdType::BASE_DATA_MODEL },
+    { "ExternalResource" ,     XsdType::BASE_DATA_MODEL },
+    { "ExternalResources" ,    XsdType::BASE_DATA_MODEL },
+    { "FileIndex" ,            XsdType::BASE_DATA_MODEL },
+    { "FileIndices" ,          XsdType::BASE_DATA_MODEL },
+    { "MaxBinValue" ,          XsdType::BASE_DATA_MODEL },
+    { "MaxOutlierValue" ,      XsdType::BASE_DATA_MODEL },
+    { "MetricDescription" ,    XsdType::BASE_DATA_MODEL },
+    { "NumBins" ,              XsdType::BASE_DATA_MODEL },
+    { "Properties" ,           XsdType::BASE_DATA_MODEL },
+    { "Property" ,             XsdType::BASE_DATA_MODEL },
+    { "Sample95thPct" ,        XsdType::BASE_DATA_MODEL },
+    { "SampleMean" ,           XsdType::BASE_DATA_MODEL },
+    { "SampleMed" ,            XsdType::BASE_DATA_MODEL },
+    { "SampleSize" ,           XsdType::BASE_DATA_MODEL },
+    { "SampleStd" ,            XsdType::BASE_DATA_MODEL },
+
+    // 'pbds' elements
+    //
+    { "AdapterDimerFraction",  XsdType::DATASETS },
+    { "AlignmentSet",          XsdType::DATASETS },
+    { "BarcodeConstruction",   XsdType::DATASETS },
+    { "BarcodeSet",            XsdType::DATASETS },
+    { "ConsensusAlignmentSet", XsdType::DATASETS },
+    { "ConsensusReadSet",      XsdType::DATASETS },
+    { "Contig",                XsdType::DATASETS },
+    { "Contigs",               XsdType::DATASETS },
+    { "ContigSet",             XsdType::DATASETS },
+    { "ControlReadLenDist",    XsdType::DATASETS },
+    { "ControlReadQualDist",   XsdType::DATASETS },
+    // spelled "DataSetMetdata" before, which never matched the real element label
+    { "DataSetMetadata",       XsdType::DATASETS },
+    { "DataSet",               XsdType::DATASETS },
+    { "DataSets",              XsdType::DATASETS },
+    { "Filter",                XsdType::DATASETS },
+    { "Filters",               XsdType::DATASETS },
+    { "HdfSubreadSet",         XsdType::DATASETS },
+    { "InsertReadLenDist",     XsdType::DATASETS },
+    { "InsertReadQualDist" ,   XsdType::DATASETS },
+    { "MedianInsertDist",      XsdType::DATASETS },
+    { "NumRecords",            XsdType::DATASETS },
+    { "NumSequencingZmws",     XsdType::DATASETS },
+    { "Organism",              XsdType::DATASETS },
+    { "ParentTool",            XsdType::DATASETS },
+    { "Ploidy",                XsdType::DATASETS },
+    { "ProdDist",              XsdType::DATASETS },
+    { "Provenance",            XsdType::DATASETS },
+    { "ReadLenDist",           XsdType::DATASETS },
+    { "ReadQualDist",          XsdType::DATASETS },
+    { "ReadTypeDist",          XsdType::DATASETS },
+    { "ReferenceSet",          XsdType::DATASETS },
+    { "ShortInsertFraction",   XsdType::DATASETS },
+    { "SubreadSet",            XsdType::DATASETS },
+    { "SummaryStats",          XsdType::DATASETS },
+    { "TotalLength",           XsdType::DATASETS },
+    { "TranscriptSet",         XsdType::DATASETS },
+    { "TranscriptAlignmentSet",XsdType::DATASETS },
+
+    // 'pbmeta' elements
+    //
+    { "Automation",           XsdType::COLLECTION_METADATA },
+    { "AutomationName",       XsdType::COLLECTION_METADATA },
+    { "CellIndex",            XsdType::COLLECTION_METADATA },
+    { "CellPac",              XsdType::COLLECTION_METADATA },
+    { "CollectionFileCopy",   XsdType::COLLECTION_METADATA },
+    { "CollectionMetadata",   XsdType::COLLECTION_METADATA },
+    { "CollectionNumber",     XsdType::COLLECTION_METADATA },
+    { "CollectionPathUri",    XsdType::COLLECTION_METADATA },
+    { "Collections",          XsdType::COLLECTION_METADATA },
+    { "Concentration",        XsdType::COLLECTION_METADATA },
+    { "ConfigFileName",       XsdType::COLLECTION_METADATA },
+    { "CopyFiles",            XsdType::COLLECTION_METADATA },
+    { "InstCtrlVer",          XsdType::COLLECTION_METADATA },
+    { "MetricsVerbosity",     XsdType::COLLECTION_METADATA },
+    { "Name",                 XsdType::COLLECTION_METADATA },
+    { "OutputOptions",        XsdType::COLLECTION_METADATA },
+    { "PlateId",              XsdType::COLLECTION_METADATA },
+    { "Primary",              XsdType::COLLECTION_METADATA },
+    { "Readout",              XsdType::COLLECTION_METADATA },
+    { "ResultsFolder",        XsdType::COLLECTION_METADATA },
+    { "RunDetails",           XsdType::COLLECTION_METADATA },
+    { "RunId",                XsdType::COLLECTION_METADATA },
+    { "SampleReuseEnabled",   XsdType::COLLECTION_METADATA },
+    { "SequencingCondition",  XsdType::COLLECTION_METADATA },
+    { "SigProcVer",           XsdType::COLLECTION_METADATA },
+    { "SizeSelectionEnabled", XsdType::COLLECTION_METADATA },
+    { "StageHotstartEnabled", XsdType::COLLECTION_METADATA },
+    { "UseCount",             XsdType::COLLECTION_METADATA },
+    { "WellName",             XsdType::COLLECTION_METADATA },
+    { "WellSample",           XsdType::COLLECTION_METADATA },
+
+    // 'pbsample' elements
+    //
+    { "BioSample",         XsdType::SAMPLE_INFO },
+    { "BioSamplePointer",  XsdType::SAMPLE_INFO },
+    { "BioSamplePointers", XsdType::SAMPLE_INFO },
+    { "BioSamples",        XsdType::SAMPLE_INFO }
+};
+// clang-format on
+
+}  // namespace internal
+
+// ---------------
+// NamespaceInfo
+// ---------------
+
+// Compile-time checks that NamespaceInfo stays copyable and nothrow-movable.
+static_assert(std::is_copy_constructible<NamespaceInfo>::value,
+              "NamespaceInfo(const NamespaceInfo&) is not = default");
+static_assert(std::is_copy_assignable<NamespaceInfo>::value,
+              "NamespaceInfo& operator=(const NamespaceInfo&) is not = default");
+
+static_assert(std::is_nothrow_move_constructible<NamespaceInfo>::value,
+              "NamespaceInfo(NamespaceInfo&&) is not = noexcept");
+// Move assignment is only required to be nothrow when std::string's own move
+// assignment is nothrow on this toolchain.
+static_assert(std::is_nothrow_move_assignable<NamespaceInfo>::value ==
+                  std::is_nothrow_move_assignable<std::string>::value,
+              "");
+
+/// Stores the namespace name and URI; both are taken by value and moved in.
+NamespaceInfo::NamespaceInfo(std::string name, std::string uri)
+    : name_{std::move(name)}
+    , uri_{std::move(uri)}
+{
+}
+
+/// \returns the stored namespace name
+const std::string& NamespaceInfo::Name() const
+{
+    return name_;
+}
+
+/// \returns the stored namespace URI
+const std::string& NamespaceInfo::Uri() const
+{
+    return uri_;
+}
+
+// -------------------
+// NamespaceRegistry
+// -------------------
+
+// Compile-time checks that NamespaceRegistry stays copyable and nothrow-movable.
+static_assert(std::is_copy_constructible<NamespaceRegistry>::value,
+              "NamespaceRegistry(const NamespaceRegistry&) is not = default");
+static_assert(std::is_copy_assignable<NamespaceRegistry>::value,
+              "NamespaceRegistry& operator=(const NamespaceRegistry&) is not = default");
+
+static_assert(std::is_nothrow_move_constructible<NamespaceRegistry>::value,
+              "NamespaceRegistry(NamespaceRegistry&&) is not = noexcept");
+static_assert(std::is_nothrow_move_assignable<NamespaceRegistry>::value,
+              "NamespaceRegistry& operator=(NamespaceRegistry&&) is not = noexcept");
+
+/// Starts out populated with the built-in table from internal::DefaultRegistry().
+NamespaceRegistry::NamespaceRegistry()
+    : data_{internal::DefaultRegistry()}
+{
+}
+
+/// \returns the namespace info registered for the current default XSD type
+const NamespaceInfo& NamespaceRegistry::DefaultNamespace() const
+{
+    const XsdType defaultType = DefaultXsd();
+    return Namespace(defaultType);
+}
+
+/// \returns the current default XSD type
+XsdType NamespaceRegistry::DefaultXsd() const
+{
+    return defaultXsdType_;
+}
+
+/// \returns the namespace info registered for \p xsd. The lookup goes through
+///          data_.at(), so an unregistered type is reported by that call.
+const NamespaceInfo& NamespaceRegistry::Namespace(const XsdType& xsd) const
+{
+    const NamespaceInfo& registered = data_.at(xsd);
+    return registered;
+}
+
+/// Registers \p namespaceInfo for \p xsd, replacing any previous registration.
+void NamespaceRegistry::Register(const XsdType& xsd, const NamespaceInfo& namespaceInfo)
+{
+    NamespaceInfo& slot = data_[xsd];
+    slot = namespaceInfo;
+}
+
+/// Makes \p xsd the default XSD type.
+void NamespaceRegistry::SetDefaultXsd(const XsdType& xsd)
+{
+    defaultXsdType_ = xsd;
+}
+
+/// \returns the XSD type registered for \p elementLabel in the internal
+///          element table, or XsdType::NONE if the label is not listed
+XsdType NamespaceRegistry::XsdForElement(const std::string& elementLabel) const
+{
+    const auto& registry = internal::elementRegistry;
+    const auto match = registry.find(elementLabel);
+    if (match != registry.cend()) return match->second;
+    return XsdType::NONE;
+}
+
+/// \returns the first registered XSD type (in container order) whose
+///          namespace URI equals \p uri, or XsdType::NONE if none matches
+XsdType NamespaceRegistry::XsdForUri(const std::string& uri) const
+{
+    for (auto it = data_.cbegin(); it != data_.cend(); ++it) {
+        if (it->second.Uri() == uri) return it->first;
+    }
+    return XsdType::NONE;
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/EntireFileQuery.cpp b/src/EntireFileQuery.cpp

new file mode 100644 (file)

index 0000000..85f65cf
--- /dev/null
+++ b/src/EntireFileQuery.cpp
@@ -0,0 +1,34 @@
+// File Description
+/// \file EntireFileQuery.cpp
+/// \brief Implements the EntireFileQuery class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/EntireFileQuery.h"
+
+#include "pbbam/CompositeBamReader.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// Private implementation: owns the sequential composite reader built over
+/// the dataset.
+class EntireFileQuery::EntireFileQueryPrivate
+{
+public:
+    EntireFileQueryPrivate(const DataSet &dataset)
+        : reader_(dataset)
+    {
+    }
+
+    SequentialCompositeBamReader reader_;
+};
+
+/// Sets up a query over \p dataset.
+EntireFileQuery::EntireFileQuery(const DataSet &dataset)
+    : internal::IQuery()
+    , d_(new EntireFileQueryPrivate(dataset))
+{
+}
+
+EntireFileQuery::~EntireFileQuery() = default;
+
+/// Forwards to the underlying reader's GetNext() and returns its result.
+bool EntireFileQuery::GetNext(BamRecord &r)
+{
+    auto &reader = d_->reader_;
+    return reader.GetNext(r);
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/FaiIndex.cpp b/src/FaiIndex.cpp

new file mode 100644 (file)

index 0000000..e59def1
--- /dev/null
+++ b/src/FaiIndex.cpp
@@ -0,0 +1,139 @@
+// File Description
+/// \file FaiIndex.cpp
+/// \brief Implements the FaiIndex class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/FaiIndex.h"
+
+#include <cassert>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#include <tuple>
+#include <type_traits>
+#include <unordered_map>
+#include <vector>
+
+#include "pbbam/StringUtilities.h"
+
+namespace PacBio {
+namespace BAM {
+
+// Compile-time checks that FaiIndex is neither copy-constructible nor copy-assignable.
+static_assert(!std::is_copy_constructible<FaiIndex>::value,
+              "FaiIndex(const FaiIndex&) is not = delete");
+static_assert(!std::is_copy_assignable<FaiIndex>::value,
+              "FaiIndex& operator=(const FaiIndex&) is not = delete");
+
+/// Two entries are equal when Length, SeqOffset, NumBases, NumBytes and
+/// QualOffset all compare equal.
+bool operator==(const FaiEntry& lhs, const FaiEntry& rhs)
+{
+    const auto asTuple = [](const FaiEntry& e) {
+        return std::tie(e.Length, e.SeqOffset, e.NumBases, e.NumBytes, e.QualOffset);
+    };
+    return asTuple(lhs) == asTuple(rhs);
+}
+
+/// Writes the entry's fields tab-separated: Length, SeqOffset, NumBases,
+/// NumBytes, plus QualOffset only when it is non-negative. No name and no
+/// trailing newline are written.
+std::ostream& operator<<(std::ostream& out, const FaiEntry& entry)
+{
+    out << entry.Length << '\t';
+    out << entry.SeqOffset << '\t';
+    out << entry.NumBases << '\t';
+    out << entry.NumBytes;
+    if (entry.QualOffset >= 0) {
+        out << '\t' << entry.QualOffset;
+    }
+    return out;
+}
+
+/// Private implementation: keeps the sequence names in insertion order plus a
+/// name -> entry lookup table.
+class FaiIndex::FaiIndexPrivate
+{
+public:
+    FaiIndexPrivate() = default;
+    FaiIndexPrivate(const std::string& fn) { LoadFromFile(fn); }
+
+    /// Records \p name in input order and stores \p entry under it.
+    void Add(std::string name, FaiEntry entry)
+    {
+        names_.push_back(name);
+        data_.emplace(std::move(name), std::move(entry));
+    }
+
+    /// Reads tab-separated index lines from \p fn, adding one entry per line.
+    ///
+    /// Each line must have 5 fields, or 6 when a quality offset is present;
+    /// otherwise std::runtime_error is thrown. The stream state is not checked
+    /// after opening, so a file that cannot be read simply yields no entries.
+    void LoadFromFile(const std::string& fn)
+    {
+        std::ifstream input{fn};
+        std::string row;
+        while (std::getline(input, row)) {
+
+            std::vector<std::string> columns = Split(row, '\t');
+            const auto columnCount = columns.size();
+            const bool validCount = (columnCount == 5 || columnCount == 6);
+            if (!validCount) {
+                std::ostringstream msg;
+                msg << "FaiIndex: malformatted index line, incorrect number of fields\n"
+                    << "  expected: 5 for FASTA, or 6 for FASTQ\n"
+                    << "  observed: " << columnCount << " in line:\n"
+                    << row << '\n';
+                throw std::runtime_error{msg.str()};
+            }
+
+            // column 0 is the name; the remaining columns are numeric
+            FaiEntry parsed;
+            parsed.Length = std::stoull(columns[1]);
+            parsed.SeqOffset = std::stoull(columns[2]);
+            parsed.NumBases = std::stoul(columns[3]);
+            parsed.NumBytes = std::stoul(columns[4]);
+            if (columnCount == 6) parsed.QualOffset = std::stoll(columns[5]);
+
+            Add(std::move(columns[0]), std::move(parsed));
+        }
+    }
+
+    std::vector<std::string> names_;                  // save names in input order
+    std::unordered_map<std::string, FaiEntry> data_;  // map name -> data
+};
+
+/// Builds the index from the contents of the file \p fn.
+FaiIndex::FaiIndex(const std::string& fn)
+    : d_{std::make_unique<FaiIndexPrivate>(fn)}
+{
+}
+
+/// Builds an empty index.
+FaiIndex::FaiIndex()
+    : d_{std::make_unique<FaiIndexPrivate>()}
+{
+}
+
+FaiIndex::FaiIndex(FaiIndex&&) noexcept = default;
+
+FaiIndex& FaiIndex::operator=(FaiIndex&&) noexcept = default;
+
+FaiIndex::~FaiIndex() = default;
+
+/// Adds \p entry under \p name by forwarding both to the private implementation.
+void FaiIndex::Add(std::string name, FaiEntry entry)
+{
+    d_->Add(std::move(name), std::move(entry));
+}
+
+/// \returns the entry stored under \p name
+/// \throws std::runtime_error if no entry with that name exists
+const FaiEntry& FaiIndex::Entry(const std::string& name) const
+{
+    const auto& table = d_->data_;
+    const auto match = table.find(name);
+    if (match != table.cend()) return match->second;
+    throw std::runtime_error{"FaiIndex: could not find entry for sequence name: " + name};
+}
+
+/// \returns the entry for the name at position \p row (input order). The name
+///          is fetched with names_.at(), which rejects an out-of-range row.
+const FaiEntry& FaiIndex::Entry(const uint32_t row) const
+{
+    return Entry(d_->names_.at(row));
+}
+
+/// \returns true if an entry is stored under \p name
+bool FaiIndex::HasEntry(const std::string& name) const
+{
+    const auto& table = d_->data_;
+    return table.find(name) != table.cend();
+}
+
+/// \returns the sequence names, in the order they were added
+const std::vector<std::string>& FaiIndex::Names() const
+{
+    return d_->names_;
+}
+
+/// Writes the index to the file \p fn; the stream state is not checked.
+void FaiIndex::Save(const std::string& fn) const
+{
+    std::ofstream file{fn};
+    Save(file);
+}
+
+/// Writes one line per name, in input order: the name, a tab, the entry's
+/// fields, then a newline.
+void FaiIndex::Save(std::ostream& out) const
+{
+    for (const auto& name : d_->names_)
+        out << name << '\t' << Entry(name) << '\n';
+}
+
+}  // namespace BAM
+}  // namespace PacBio
+\ No newline at end of file
diff --git a/src/FaiZmwChunker.cpp b/src/FaiZmwChunker.cpp

new file mode 100644 (file)

index 0000000..062b6f0
--- /dev/null
+++ b/src/FaiZmwChunker.cpp
@@ -0,0 +1,119 @@
+// File Description
+/// \file FaiZmwChunker.cpp
+/// \brief Implements the FaiZmwChunker class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "FaiZmwChunker.h"
+
+#include <cassert>
+
+#include <algorithm>
+#include <numeric>
+#include <stdexcept>
+#include <string>
+
+namespace PacBio {
+namespace BAM {
+namespace {
+
+/// Parses the hole number out of a record name: the integer between the first
+/// '/' and the next '/' (or the end of the name when there is no second '/').
+///
+/// \throws std::runtime_error if \p name contains no '/'. std::stoi reports
+///         its own errors for text that is not a number or is out of range.
+int32_t HoleNumber(const std::string& name)
+{
+    const auto firstSlash = name.find('/');
+    if (firstSlash == std::string::npos)
+        throw std::runtime_error{"FaiZmwChunker: could not parse hole number from name: " + name};
+
+    const auto numberBegin = firstSlash + 1;
+    auto numberEnd = name.find('/', numberBegin);
+    if (numberEnd == std::string::npos) numberEnd = name.size();
+
+    // extract exactly the digits; the length must not include the closing '/'
+    return std::stoi(name.substr(numberBegin, numberEnd - numberBegin));
+}
+
+}  // namespace
+
+/// Bins the records of \p index into at most \p numChunks chunks of whole ZMWs.
+///
+/// Consecutive names that share a hole number are treated as one ZMW. The ZMWs
+/// are spread as evenly as possible over min(numChunks, number of ZMWs)
+/// chunks; any remainder goes one-each to the leading chunks. An empty index
+/// yields no chunks.
+///
+/// \throws std::runtime_error if \p numChunks is zero, or if a name has no
+///         hole number to parse
+FaiZmwChunker::FaiZmwChunker(const FaiIndex& index, const size_t numChunks)
+{
+    // zero chunks is error
+    if (numChunks == 0)
+        throw std::runtime_error{"FaiZmwChunker: requested chunk count must be greater than zero"};
+
+    // empty index is not (?), but quick return
+    const auto& names = index.Names();
+    if (names.empty()) return;
+
+    // tease apart unique ZMWs
+    int32_t currentHoleNumber = -1;
+    std::vector<FaiZmwChunk> rawChunks;
+    for (const auto& name : names) {
+        const int32_t holeNumber = HoleNumber(name);
+        // The first record always opens a chunk, even if its hole number equals
+        // the -1 sentinel, so back() is never called on an empty vector.
+        if (rawChunks.empty() || holeNumber != currentHoleNumber) {
+            rawChunks.emplace_back(FaiZmwChunk{name, index.Entry(name).SeqOffset, 1, 1});
+            currentHoleNumber = holeNumber;
+        } else
+            ++rawChunks.back().NumRecords;
+    }
+
+    // no empty chunks (e.g. reduce the requested number, if small ZMW input)
+    const size_t actualNumChunks = std::min(numChunks, rawChunks.size());
+
+    // determine how many ZMWs should land in each chunk, spread roughly evenly
+    // (kept in size_t so large inputs are not narrowed to int)
+    const size_t minimum = rawChunks.size() / actualNumChunks;
+    const size_t modulo = rawChunks.size() % actualNumChunks;
+    std::vector<size_t> chunkCounts(actualNumChunks, minimum);
+    for (size_t i = 0; i < modulo; ++i)
+        ++chunkCounts.at(i);
+
+    // collate zmw data into larger chunks
+    size_t begin = 0;
+    size_t end = 0;
+    for (const auto n : chunkCounts) {
+
+        // shift end down for this chunk
+        end += n;
+        assert(end <= rawChunks.size());
+
+        // add data for this chunk: start from its first ZMW, then fold in the rest
+        FaiZmwChunk result = rawChunks.at(begin);
+        result.NumZmws = n;
+        for (size_t j = begin + 1; j < end; ++j)
+            result.NumRecords += rawChunks.at(j).NumRecords;
+        chunks_.emplace_back(std::move(result));
+
+        // slide to next chunk
+        begin = end;
+    }
+}
+
+/// Loads a FaiIndex from \p filename and delegates to the index-based constructor.
+FaiZmwChunker::FaiZmwChunker(const std::string& filename, const size_t numChunks)
+    : FaiZmwChunker{FaiIndex{filename}, numChunks}
+{
+}
+
+FaiZmwChunker::FaiZmwChunker(const FaiZmwChunker&) = default;
+
+FaiZmwChunker::FaiZmwChunker(FaiZmwChunker&&) noexcept = default;
+
+FaiZmwChunker& FaiZmwChunker::operator=(const FaiZmwChunker&) = default;
+
+FaiZmwChunker& FaiZmwChunker::operator=(FaiZmwChunker&&) noexcept = default;
+
+FaiZmwChunker::~FaiZmwChunker() = default;
+
+/// \returns the chunk at position \p chunk (bounds-checked via at())
+const FaiZmwChunk& FaiZmwChunker::Chunk(size_t chunk) const
+{
+    return chunks_.at(chunk);
+}
+
+/// \returns the number of chunks actually produced
+size_t FaiZmwChunker::NumChunks() const
+{
+    return chunks_.size();
+}
+
+// size_t FaiZmwChunker::MaxChunkSize(size_t index) const
+// {
+//     const auto max = std::max_element(
+//         chunks_.cbegin(), chunks_.cend(),
+//         [](const ZmwChunk& lhs, const ZmwChunk& rhs) { return lhs.NumZmws < rhs.NumZmws; });
+
+//     if (max == chunks_.cend()) return 0;
+//     return (*max).NumZmws;
+// }
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/FaiZmwChunker.h b/src/FaiZmwChunker.h

new file mode 100644 (file)

index 0000000..5c52e03
--- /dev/null
+++ b/src/FaiZmwChunker.h
@@ -0,0 +1,84 @@
+// File Description
+/// \file FaiZmwChunker.h
+/// \brief Defines the FaiZmwChunker class.
+//
+// Author: Derek Barnett
+
+#ifndef FAIZMWCHUNKER_H
+#define FAIZMWCHUNKER_H
+
+#include "pbbam/Config.h"
+
+#include <string>
+#include <vector>
+
+#include "pbbam/FaiIndex.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// Describes one chunk produced by FaiZmwChunker: where it starts and how many
+/// records and ZMWs it covers.
+struct FaiZmwChunk
+{
+    /// Name of first entry
+    std::string FirstSeqName;
+
+    /// File offset to the sequence of the chunk's first entry.
+    uint64_t FirstSeqOffset;
+
+    /// Total number of records in chunk.
+    size_t NumRecords;
+
+    /// Number of unique ZMWs
+    size_t NumZmws;
+};
+
+///
+/// \brief The FaiZmwChunker takes a FAI index and bins unique ZMW hole numbers
+///        into chunks.
+///
+class FaiZmwChunker
+{
+public:
+    ///
+    /// \brief Construct a new FaiZmwChunker
+    ///
+    /// \param index        FAI index
+    /// \param numChunks    desired number of chunks
+    ///
+    /// Actual chunk count may be smaller than the requested number, if the input
+    /// size is smaller.
+    ///
+    FaiZmwChunker(const FaiIndex& index, const size_t numChunks);
+
+    ///
+    /// \brief Construct a new FaiZmwChunker
+    ///
+    /// \param filename     FAI filename
+    /// \param numChunks    desired number of chunks
+    ///
+    /// Actual chunk count may be smaller than the requested number, if the input
+    /// size is smaller.
+    ///
+    FaiZmwChunker(const std::string& filename, const size_t numChunks);
+
+    /// Copyable and movable; the special members are declared here and
+    /// defined out of line.
+    FaiZmwChunker(const FaiZmwChunker&);
+    FaiZmwChunker(FaiZmwChunker&&) noexcept;
+    FaiZmwChunker& operator=(const FaiZmwChunker&);
+    FaiZmwChunker& operator=(FaiZmwChunker&&) noexcept;
+    ~FaiZmwChunker();
+
+public:
+    ///
+    /// \param chunk    zero-based chunk position
+    /// \returns the chunk stored at that position
+    ///
+    const FaiZmwChunk& Chunk(size_t chunk) const;
+
+    ///
+    /// \returns the number of chunks held
+    ///
+    size_t NumChunks() const;
+
+    // size_t MaxChunkSize(size_t index) const;
+
+private:
+    /// Chunks in the order they were produced by the constructor.
+    std::vector<FaiZmwChunk> chunks_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // FAIZMWCHUNKER_H
diff --git a/src/FastaCache.cpp b/src/FastaCache.cpp

new file mode 100644 (file)

index 0000000..0eb50b2
--- /dev/null
+++ b/src/FastaCache.cpp
@@ -0,0 +1,61 @@
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/FastaCache.h"
+
+#include <stdexcept>
+
+#include "pbbam/FastaReader.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// Reads every sequence of \p filename into the cache and records, per name,
+/// its position in the cache. emplace() keeps the first position when a name
+/// occurs more than once.
+FastaCacheData::FastaCacheData(const std::string& filename) : cache_{FastaReader::ReadAll(filename)}
+{
+    size_t position = 0;
+    for (const auto& sequence : cache_) {
+        lookup_.emplace(sequence.Name(), position);
+        ++position;
+    }
+}
+
+/// \returns the bases of sequence \p name from \p begin up to (not including)
+///          \p end, as produced by std::string::substr(begin, end - begin)
+/// \throws std::runtime_error if \p name is unknown or if \p begin > \p end
+std::string FastaCacheData::Subsequence(const std::string& name, size_t begin, size_t end) const
+{
+    const auto entry = lookup_.find(name);
+    if (entry == lookup_.cend()) {
+        throw std::runtime_error{"Could not find '" + name +
+                                 "' in FastaCacheData::Subsequence()"};
+    }
+
+    if (begin > end) throw std::runtime_error{"begin > end in FastaCacheData::Subsequence"};
+
+    const std::string& bases = cache_[entry->second].Bases();
+    return bases.substr(begin, end - begin);
+}
+
+/// \returns a copy of every cached sequence name, in cache order
+std::vector<std::string> FastaCacheData::Names() const
+{
+    std::vector<std::string> names;
+    names.reserve(cache_.size());
+    for (auto it = cache_.cbegin(); it != cache_.cend(); ++it)
+        names.push_back(it->Name());
+    return names;
+}
+
+/// \returns the number of bases stored for sequence \p name
+/// \throws std::runtime_error if \p name is unknown
+size_t FastaCacheData::SequenceLength(const std::string& name) const
+{
+    const auto entry = lookup_.find(name);
+    if (entry != lookup_.cend()) return cache_[entry->second].Bases().size();
+
+    throw std::runtime_error{"Could not find '" + name +
+                             "' in FastaCacheData::SequenceLength()"};
+}
+
+/// Creates a shared FastaCacheData loaded from \p filename.
+FastaCache MakeFastaCache(const std::string& filename)
+{
+    FastaCache cache = std::make_shared<FastaCacheData>(filename);
+    return cache;
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/FastaReader.cpp b/src/FastaReader.cpp

new file mode 100644 (file)

index 0000000..5bfca33
--- /dev/null
+++ b/src/FastaReader.cpp
@@ -0,0 +1,79 @@
+// File Description
+/// \file FastaReader.cpp
+/// \brief Implements the FastaReader class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/FastaReader.h"
+
+#include <algorithm>
+#include <cassert>
+#include <stdexcept>
+#include <string>
+#include <type_traits>
+#include <vector>
+
+#include "pbbam/FormatUtils.h"
+
+#include "KSeqReader.h"
+
+namespace PacBio {
+namespace BAM {
+
+// Compile-time checks that FastaReader is neither copy-constructible nor copy-assignable.
+static_assert(!std::is_copy_constructible<FastaReader>::value,
+              "FastaReader(const FastaReader&) is not = delete");
+static_assert(!std::is_copy_assignable<FastaReader>::value,
+              "FastaReader& operator=(const FastaReader&) is not = delete");
+
+/// Private implementation: wraps a KSeqReader opened on a FASTA filename.
+class FastaReader::FastaReaderPrivate
+{
+public:
+    /// \throws std::runtime_error if \p fn is not recognized as a FASTA filename
+    explicit FastaReaderPrivate(const std::string& fn)
+    {
+        // the filename is validated before the reader is created
+        const bool looksLikeFasta = FormatUtils::IsFastaFilename(fn);
+        if (!looksLikeFasta) {
+            throw std::runtime_error{"FastaReader: filename '" + fn +
+                                     "' is not recognized as a FASTA file."};
+        }
+        reader_ = std::make_unique<KSeqReader>(fn);
+    }
+
+    /// Stores the next sequence in \p record.
+    /// \returns false when nothing was read (not an error, could be EOF)
+    bool GetNext(FastaSequence& record)
+    {
+        if (reader_->ReadNext()) {
+            record = FastaSequence{reader_->Name(), reader_->Bases()};
+            return true;
+        }
+        return false;
+    }
+
+    std::unique_ptr<KSeqReader> reader_;
+};
+
+/// Opens the FASTA file \p fn for reading.
+FastaReader::FastaReader(const std::string& fn)
+    : internal::QueryBase<FastaSequence>{}
+    , d_{std::make_unique<FastaReaderPrivate>(fn)}
+{
+}
+
+FastaReader::FastaReader(FastaReader&&) noexcept = default;
+
+FastaReader& FastaReader::operator=(FastaReader&&) noexcept = default;
+
+FastaReader::~FastaReader() = default;
+
+/// Forwards to the private implementation's GetNext() and returns its result.
+bool FastaReader::GetNext(FastaSequence& record)
+{
+    return d_->GetNext(record);
+}
+
+/// Reads every sequence of \p fn into a vector, in file order.
+std::vector<FastaSequence> FastaReader::ReadAll(const std::string& fn)
+{
+    std::vector<FastaSequence> sequences;
+    sequences.reserve(256);  // initial capacity only; grows as needed
+    FastaReader reader{fn};
+    for (const auto& seq : reader) {
+        sequences.push_back(seq);
+    }
+    return sequences;
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/FastaSequence.cpp b/src/FastaSequence.cpp

new file mode 100644 (file)

index 0000000..9be2696
--- /dev/null
+++ b/src/FastaSequence.cpp
@@ -0,0 +1,61 @@
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/FastaSequence.h"
+
+#include <cstdio>
+
+#include <cassert>
+#include <exception>
+#include <tuple>
+#include <type_traits>
+
+#include <boost/algorithm/string.hpp>
+
+namespace PacBio {
+namespace BAM {
+
+// Compile-time checks that FastaSequence stays copyable and nothrow-movable.
+static_assert(std::is_copy_constructible<FastaSequence>::value,
+              "FastaSequence(const FastaSequence&) is not = default");
+static_assert(std::is_copy_assignable<FastaSequence>::value,
+              "FastaSequence& operator=(const FastaSequence&) is not = default");
+
+static_assert(std::is_nothrow_move_constructible<FastaSequence>::value,
+              "FastaSequence(FastaSequence&&) is not = noexcept");
+// Move assignment is only required to be nothrow when std::string's own move
+// assignment is nothrow on this toolchain.
+static_assert(std::is_nothrow_move_assignable<FastaSequence>::value ==
+                  std::is_nothrow_move_assignable<std::string>::value,
+              "");
+
+/// Stores \p name and \p bases, then trims both with boost::algorithm::trim.
+FastaSequence::FastaSequence(std::string name, std::string bases)
+    : name_{std::move(name)}
+    , bases_{std::move(bases)}
+{
+    boost::algorithm::trim(name_);
+    boost::algorithm::trim(bases_);
+}
+
+/// \returns the stored bases
+const std::string& FastaSequence::Bases() const
+{
+    return bases_;
+}
+
+/// Replaces the stored bases with \p bases as given (no trimming here).
+/// \returns *this, to allow chaining
+FastaSequence& FastaSequence::Bases(std::string bases)
+{
+    bases_ = std::move(bases);
+    return *this;
+}
+
+/// \returns the stored name
+const std::string& FastaSequence::Name() const
+{
+    return name_;
+}
+
+/// Replaces the stored name with \p name as given (no trimming here).
+/// \returns *this, to allow chaining
+FastaSequence& FastaSequence::Name(std::string name)
+{
+    name_ = std::move(name);
+    return *this;
+}
+
+/// Sequences are equal when both their names and their bases are equal.
+bool FastaSequence::operator==(const FastaSequence& other) const
+{
+    return (name_ == other.name_) && (bases_ == other.bases_);
+}
+
+/// Negation of operator==.
+bool FastaSequence::operator!=(const FastaSequence& other) const
+{
+    return !operator==(other);
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/FastaSequenceQuery.cpp b/src/FastaSequenceQuery.cpp

new file mode 100644 (file)

index 0000000..e865c3d
--- /dev/null
+++ b/src/FastaSequenceQuery.cpp
@@ -0,0 +1,34 @@
+// File Description
+/// \file FastaSequenceQuery.cpp
+/// \brief Implements the FastaSequenceQuery class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/FastaSequenceQuery.h"
+
+#include "pbbam/CompositeFastaReader.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// Private implementation: owns the composite FASTA reader built over the dataset.
+class FastaSequenceQuery::FastaSequenceQueryPrivate
+{
+public:
+    FastaSequenceQueryPrivate(const DataSet& dataset)
+        : reader_{dataset}
+    {
+    }
+
+    CompositeFastaReader reader_;
+};
+
+/// Sets up a query over \p dataset.
+FastaSequenceQuery::FastaSequenceQuery(const DataSet& dataset)
+    : internal::QueryBase<FastaSequence>()
+    , d_{std::make_unique<FastaSequenceQueryPrivate>(dataset)}
+{
+}
+
+FastaSequenceQuery::~FastaSequenceQuery() = default;
+
+/// Forwards to the underlying reader's GetNext() and returns its result.
+bool FastaSequenceQuery::GetNext(FastaSequence& seq)
+{
+    auto& reader = d_->reader_;
+    return reader.GetNext(seq);
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/FastaWriter.cpp b/src/FastaWriter.cpp

new file mode 100644 (file)

index 0000000..4761527
--- /dev/null
+++ b/src/FastaWriter.cpp
@@ -0,0 +1,45 @@
+// File Description
+/// \file FastaWriter.cpp
+/// \brief Implements the FastaWriter class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/FastaWriter.h"
+
+#include <stdexcept>
+
+#include "pbbam/BamRecord.h"
+#include "pbbam/FastqSequence.h"
+#include "pbbam/FormatUtils.h"
+namespace PacBio {
+namespace BAM {
+
+/// Opens \p fn for writing FASTA records.
+///
+/// \throws std::runtime_error if \p fn is not recognized as a FASTA filename,
+///         or if the file cannot be opened for writing
+FastaWriter::FastaWriter(const std::string& fn) : IFastaWriter{}
+{
+    if (!FormatUtils::IsFastaFilename(fn)) {
+        // reported under this class' own name (the prefix used to say "FastaReader")
+        throw std::runtime_error{"FastaWriter: filename '" + fn +
+                                 "' is not recognized as a FASTA file."};
+    }
+
+    file_.open(fn);
+    if (!file_) throw std::runtime_error{"FastaWriter: could not open file for writing: " + fn};
+}
+
+/// Flushes the output file stream.
+void FastaWriter::TryFlush()
+{
+    file_.flush();
+}
+
+/// Writes the record's Name() and Sequence().
+void FastaWriter::Write(const BamRecordImpl& bam)
+{
+    Write(bam.Name(), bam.Sequence());
+}
+
+/// Writes the sequence's Name() and Bases().
+void FastaWriter::Write(const FastaSequence& fastq)
+{
+    Write(fastq.Name(), fastq.Bases());
+}
+
+/// Writes the record's FullName() and Sequence().
+void FastaWriter::Write(const BamRecord& bam)
+{
+    Write(bam.FullName(), bam.Sequence());
+}
+
+/// Writes one record: a ">name" header line followed by the bases on a single
+/// line (bases are not wrapped).
+void FastaWriter::Write(const std::string& name, const std::string& bases)
+{
+    // TODO: wrap bases
+    file_ << ">" << name << '\n';
+    file_ << bases << '\n';
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/FastqReader.cpp b/src/FastqReader.cpp

new file mode 100644 (file)

index 0000000..8762edd
--- /dev/null
+++ b/src/FastqReader.cpp
@@ -0,0 +1,76 @@
+// File Description
+/// \file FastqReader.cpp
+/// \brief Implements the FastqReader class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/FastqReader.h"
+
+#include <cassert>
+#include <stdexcept>
+#include <type_traits>
+
+#include "pbbam/FormatUtils.h"
+
+#include "KSeqReader.h"
+
+namespace PacBio {
+namespace BAM {
+
+static_assert(!std::is_copy_constructible<FastqReader>::value,
+              "FastqReader(const FastqReader&) is not = delete");  // copying must stay disabled
+static_assert(!std::is_copy_assignable<FastqReader>::value,
+              "FastqReader& operator=(const FastqReader&) is not = delete");  // ditto for assignment
+
+class FastqReader::FastqReaderPrivate
+{
+public:
+    // Rejects non-FASTQ filenames (std::runtime_error), then creates the KSeqReader for 'fn'.
+    explicit FastqReaderPrivate(const std::string& fn)
+    {
+        const bool recognized = FormatUtils::IsFastqFilename(fn);
+        if (!recognized) {
+            throw std::runtime_error{"FastqReader: filename '" + fn +
+                                     "' is not recognized as a FASTQ file."};
+        }
+        reader_ = std::make_unique<KSeqReader>(fn);
+    }
+
+    // Copies the next entry into 'record'; returns false when ReadNext() reports no entry.
+    bool GetNext(FastqSequence& record)
+    {
+        if (!reader_->ReadNext()) return false;  // not error, could be EOF
+        record = FastqSequence{reader_->Name(), reader_->Bases(), reader_->Qualities()};
+        return true;
+    }
+
+    std::unique_ptr<KSeqReader> reader_;
+};
+
+FastqReader::FastqReader(const std::string& fn)
+    : internal::QueryBase<FastqSequence>{}, d_{std::make_unique<FastqReaderPrivate>(fn)}
+{  // d_ is built from 'fn' above; FastqReaderPrivate's constructor validates the filename
+}
+
+FastqReader::FastqReader(FastqReader&&) noexcept = default;  // move-only: copies are disabled
+
+FastqReader& FastqReader::operator=(FastqReader&&) noexcept = default;
+
+FastqReader::~FastqReader() = default;
+
+bool FastqReader::GetNext(FastqSequence& record) { return d_->GetNext(record); }  // forwards to d_
+
+std::vector<FastqSequence> FastqReader::ReadAll(const std::string& fn)
+{
+    FastqReader reader{fn};
+    std::vector<FastqSequence> sequences;
+    sequences.reserve(256);  // initial capacity only, not a limit on the record count
+    for (const auto& sequence : reader)
+        sequences.push_back(sequence);
+    return sequences;
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/FastqSequence.cpp b/src/FastqSequence.cpp

new file mode 100644 (file)

index 0000000..c70efe5
--- /dev/null
+++ b/src/FastqSequence.cpp
@@ -0,0 +1,55 @@
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/FastqSequence.h"
+
+#include <cassert>
+#include <cstdio>
+#include <exception>
+#include <tuple>
+#include <type_traits>
+
+namespace PacBio {
+namespace BAM {
+
+static_assert(std::is_copy_constructible<FastqSequence>::value,
+              "FastqSequence(const FastqSequence&) is not = default");
+static_assert(std::is_copy_assignable<FastqSequence>::value,
+              "FastqSequence& operator=(const FastqSequence&) is not = default");
+// Move construction must not throw; move assignment only has to match the FastaSequence base.
+static_assert(std::is_nothrow_move_constructible<FastqSequence>::value,
+              "FastqSequence(FastqSequence&&) is not = noexcept");
+static_assert(std::is_nothrow_move_assignable<FastqSequence>::value ==
+                  std::is_nothrow_move_assignable<FastaSequence>::value,
+              "");
+
+FastqSequence::FastqSequence(std::string name, std::string bases, Data::QualityValues qualities)
+    : FastaSequence{std::move(name), std::move(bases)}, qualities_{std::move(qualities)}
+{  // name and bases go to the FastaSequence base; all three arguments are moved, not copied
+}
+
+FastqSequence::FastqSequence(std::string name, std::string bases, std::string qualities)
+    : FastaSequence{std::move(name), std::move(bases)}
+    , qualities_{Data::QualityValues::FromFastq(qualities)}
+{  // 'qualities' is passed through QualityValues::FromFastq() to build qualities_
+}
+
+const Data::QualityValues& FastqSequence::Qualities() const { return qualities_; }
+// Setter: replaces the stored qualities and returns *this.
+FastqSequence& FastqSequence::Qualities(QualityValues quals)
+{
+    qualities_ = std::move(quals);
+    return *this;
+}
+
+bool FastqSequence::operator==(const FastqSequence& other) const
+{
+    // equal only when name, bases, and qualities all match
+    return Name() == other.Name() && Bases() == other.Bases() && qualities_ == other.qualities_;
+}
+
+bool FastqSequence::operator!=(const FastqSequence& other) const { return !(*this == other); }
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/FastqWriter.cpp b/src/FastqWriter.cpp

new file mode 100644 (file)

index 0000000..529fa98
--- /dev/null
+++ b/src/FastqWriter.cpp
@@ -0,0 +1,70 @@
+// File Description
+/// \file FastqWriter.cpp
+/// \brief Implements the FastqWriter class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/FastqWriter.h"
+
+#include <stdexcept>
+
+#include "pbbam/BamRecord.h"
+#include "pbbam/FastqSequence.h"
+#include "pbbam/FormatUtils.h"
+#include "pbbam/QualityValues.h"
+
+namespace PacBio {
+namespace BAM {
+
+// Opens 'fn' for writing; throws std::runtime_error on a non-FASTQ filename or open failure.
+FastqWriter::FastqWriter(const std::string& fn) : IFastqWriter{}
+{
+    if (!FormatUtils::IsFastqFilename(fn)) {
+        throw std::runtime_error{"FastqWriter: filename '" + fn +
+                                 "' is not recognized as a FASTQ file."};
+    }
+    file_.open(fn);
+    if (!file_) throw std::runtime_error{"FastqWriter: could not open file for writing: " + fn};
+}
+
+void FastqWriter::TryFlush() { file_.flush(); }  // calls flush() on the output file_
+
+void FastqWriter::Write(const FastqSequence& fastq)
+{  // forwards name, bases, and qualities to the (name, bases, Data::QualityValues) overload
+    Write(fastq.Name(), fastq.Bases(), fastq.Qualities());
+}
+
+void FastqWriter::Write(const BamRecord& bam)
+{  // uses FullName() as the record name
+    Write(bam.FullName(), bam.Sequence(), bam.Qualities());
+}
+
+void FastqWriter::Write(const BamRecordImpl& bam)
+{  // uses Name() as the record name
+    Write(bam.Name(), bam.Sequence(), bam.Qualities());
+}
+
+void FastqWriter::Write(const std::string& name, const std::string& bases,
+                        const Data::QualityValues& quals)
+{  // converts via quals.Fastq(), then defers to the std::string overload
+    Write(name, bases, quals.Fastq());
+}
+
+void FastqWriter::Write(const std::string& name, const std::string& bases, const std::string& quals)
+{
+    // Record layout: "@name", bases, "+", then the quality string, one per line.
+    file_ << "@" << name << '\n' << bases << '\n' << "+\n";
+
+    // With no qualities supplied, emit one '!' per base so the quality line still
+    // has the same length as the sequence line.
+    if (quals.empty())
+        file_ << std::string(bases.size(), '!');
+    else
+        file_ << quals;
+    file_ << '\n';
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/FileProducer.cpp b/src/FileProducer.cpp

new file mode 100644 (file)

index 0000000..ccdc218
--- /dev/null
+++ b/src/FileProducer.cpp
@@ -0,0 +1,43 @@
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "FileProducer.h"
+
+#include <cstdio>
+
+#include <stdexcept>
+
+namespace PacBio {
+namespace BAM {
+
+FileProducer::FileProducer(std::string targetFilename)
+    : FileProducer(targetFilename, targetFilename + ".tmp")
+{  // delegates to the two-argument constructor with "<target>.tmp" as the temp filename
+}
+
+FileProducer::FileProducer(std::string targetFilename, std::string tempFilename)
+    : targetFilename_{std::move(targetFilename)}, tempFilename_{std::move(tempFilename)}
+{
+    // an empty target name can never be written to
+    if (targetFilename_.empty())
+        throw std::runtime_error{"FileProducer error: cannot write to file with empty name"};
+
+    // A target of "-" means writing to stdout, so renaming is overridden. Mirroring "-"
+    // into the temp filename keeps the interface consistent for derived classes, which
+    // actually operate on the temp filename.
+    const bool toStdout = (targetFilename_ == "-");
+    if (toStdout) tempFilename_ = "-";
+}
+
+FileProducer::~FileProducer()
+{
+    // Skip renaming when writing to stdout or while an exception is live: either propagating
+    // (stack unwinding, where current_exception() is still null) or currently being handled.
+    const bool live = std::uncaught_exception() || (std::current_exception() != nullptr);
+    if (!live && (tempFilename_ != "-"))
+        std::rename(tempFilename_.c_str(), targetFilename_.c_str());
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/FileProducer.h b/src/FileProducer.h

new file mode 100644 (file)

index 0000000..2562350
--- /dev/null
+++ b/src/FileProducer.h
@@ -0,0 +1,56 @@
+// Author: Derek Barnett
+
+#ifndef FILEPRODUCER_H
+#define FILEPRODUCER_H
+
+#include "pbbam/Config.h"
+
+#include <cstdio>
+#include <string>
+
+namespace PacBio {
+namespace BAM {
+
+// The FileProducer class provides functionality for working with a temp
+// file until successful destruction of a FileProducer-derived class.
+//
+// Derived classes should be sure to flush/close the temp file, and the
+// FileProducer's destructor will ensure that the temp file will be renamed to
+// the target filename.
+//
+// If destruction is triggered by an exception, no renaming will occur.
+//
+class FileProducer
+{
+public:
+    FileProducer() = delete;
+
+    // Initializes FileProducer with specified target filename. Temp filename is
+    // set to target filename plus ".tmp" suffix.
+    explicit FileProducer(std::string targetFilename);
+
+    // Initializes FileProducer with specified target filename & explicit temp
+    // filename. Throws std::runtime_error if the target filename is empty.
+    FileProducer(std::string targetFilename, std::string tempFilename);
+
+    // Renames temp file to target filename.
+    //
+    // Derived classes should ensure that data is flushed and file handle closed
+    // before or during their destructor.
+    //
+    // Renaming will not occur if there is a 'live' exception being thrown.
+    // No renaming is attempted either when the temp filename is "-" (stdout).
+    virtual ~FileProducer();
+
+    const std::string& TargetFilename() const { return targetFilename_; }
+    const std::string& TempFilename() const { return tempFilename_; }
+
+private:
+    std::string targetFilename_;
+    std::string tempFilename_;  // forced to "-" by the constructor when the target is "-"
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // FILEPRODUCER_H
diff --git a/src/FileUtils.cpp b/src/FileUtils.cpp

new file mode 100644 (file)

index 0000000..8f3d848
--- /dev/null
+++ b/src/FileUtils.cpp
@@ -0,0 +1,203 @@
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "FileUtils.h"
+
+#include <sys/stat.h>
+#include <unistd.h>
+#include <cassert>
+#include <cstddef>
+#include <exception>
+#include <fstream>
+#include <iostream>
+#include <memory>
+
+#include <boost/algorithm/string.hpp>
+
+#include "pbbam/StringUtilities.h"
+
+namespace PacBio {
+namespace BAM {
+
+namespace {
+
+// Strips a leading "file://" scheme from a URI/filepath; anything else is returned unchanged.
+// Throws std::runtime_error if the scheme appears anywhere other than the very beginning.
+static std::string removeFileUriScheme(const std::string& uri)
+{
+    assert(!uri.empty());
+
+    const std::string scheme{"file://"};
+    const auto schemePos = uri.find(scheme);
+    if (schemePos == std::string::npos) return uri;  // no scheme, nothing to strip
+
+    // the scheme is only valid as a prefix
+    if (schemePos != 0)
+        throw std::runtime_error{"FileUtils: malformed URI, scheme is not at beginning"};
+    return uri.substr(scheme.size());
+}
+
+#ifdef PBBAM_WIN_FILEPATHS
+
+// Drops a leading drive specifier ("C:") if present; otherwise returns 'filePath' unchanged.
+static std::string removeDiskName(const std::string& filePath)
+{
+    if (filePath.size() < 2) return filePath;
+    // cast first: passing a negative plain char to isalpha() is undefined behavior
+    const bool driveLetter = (isalpha(static_cast<unsigned char>(filePath.at(0))) != 0);
+    return (driveLetter && (filePath.at(1) == ':')) ? filePath.substr(2) : filePath;
+}
+
+static const char native_pathSeparator = '\\';  // used when PBBAM_WIN_FILEPATHS is defined
+
+// Windows flavor: true if 'filePath' starts with a backslash, optionally after a drive name.
+static bool native_pathIsAbsolute(const std::string& filePath)
+{
+    assert(!filePath.empty());
+    // if starts with single backslash or double backslash
+    if (boost::algorithm::starts_with(filePath, "\\")) return true;
+
+    // if starts with single or double-dots -> not absolute
+    if (boost::algorithm::starts_with(filePath, ".")) return false;
+
+    // if starts with disk drive name and colon ("C:\foo\bar.txt")
+    // strip the drive name and check to see if the remaining path is absolute
+    if (filePath.size() >= 2) {
+        const auto firstChar = static_cast<unsigned char>(filePath.at(0));  // for isalpha()
+        if ((isalpha(firstChar) != 0) && (filePath.at(1) == ':'))
+            return native_pathIsAbsolute(removeDiskName(filePath));
+    }
+
+    // otherwise, likely relative
+    return false;
+}
+
+static std::string native_resolvedFilePath(const std::string& filePath, const std::string& from)
+{
+    // strip file:// scheme if present
+    auto schemeLess = removeFileUriScheme(filePath);
+
+    // if empty or already absolute path, just return it
+    // upfront empty check simplifies further parsing logic
+    if (schemeLess.empty() || native_pathIsAbsolute(schemeLess)) return schemeLess;
+
+    // else make relative from the provided 'from' directory
+    //
+    // first pop disk name, then any leading single-dot '.'
+    //
+    // since we're prepending the 'from' directory, we can remove
+    // any leading '.' + separator from our file path. this may just mean that
+    // we pop it off to add it right back (when from == '.'), but this
+    // keeps it consistent with other 'from' parent directories
+    //
+    schemeLess = removeDiskName(schemeLess);
+
+    const bool thisDirAtStart = (schemeLess.find(".") == 0);
+    if (thisDirAtStart) {
+        if (schemeLess.find(native_pathSeparator) == 1) schemeLess = schemeLess.substr(2);
+    }
+    return from + native_pathSeparator + schemeLess;
+}
+
+#else  // else for non-Windows systems
+
+static const char native_pathSeparator = '/';
+// at(0) throws std::out_of_range on an empty path; native_resolvedFilePath() checks empty() first
+static bool native_pathIsAbsolute(const std::string& filePath) { return filePath.at(0) == '/'; }
+
+static std::string native_resolvedFilePath(const std::string& filePath, const std::string& from)
+{
+    // Resolves 'filePath' against the directory 'from':
+    //   - a "file://" scheme is stripped first
+    //     (an empty 'filePath' trips the assert in removeFileUriScheme() in debug builds)
+    //   - an empty or already-absolute path is then returned as-is
+    //   - anything else is returned as from + separator + path
+    std::string path = removeFileUriScheme(filePath);
+
+    // the empty check comes first: native_pathIsAbsolute() reads the first character
+    if (path.empty()) return path;
+    if (native_pathIsAbsolute(path)) return path;
+
+    // Relative path: drop a leading "./" before prepending 'from'. When from == "." this
+    // just pops it off to add it right back, but it keeps the result consistent with
+    // other 'from' parent directories.
+    const bool thisDirAtStart =
+        (path.size() >= 2) && (path[0] == '.') && (path[1] == native_pathSeparator);
+    if (thisDirAtStart) path = path.substr(2);
+
+    return from + native_pathSeparator + path;
+}
+
+#endif  // PBBAM_WIN_FILEPATHS
+
+}  // anonymous
+
+// Returns the current working directory; throws std::runtime_error if getcwd() fails.
+// see http://stackoverflow.com/questions/2869594/how-return-a-stdstring-from-cs-getcwd-function
+std::string FileUtils::CurrentWorkingDirectory()
+{
+    const size_t chunkSize = 1024;
+    const size_t maxNumChunks = 20;
+    // stack-based buffer for 'normal' case
+    char buffer[chunkSize];
+    if (getcwd(buffer, sizeof(buffer)) != nullptr) return std::string(buffer);
+
+    // if error is not ERANGE, then it's not a problem of too-long name... something else happened
+    if (errno != ERANGE)
+        throw std::runtime_error{"FileUtils: could not determine current working directory path"};
+
+    // long path - use heap, trying progressively longer buffers
+    for (size_t chunks = 2; chunks < maxNumChunks; ++chunks) {
+        std::unique_ptr<char[]> cwd(new char[chunkSize * chunks]);  // array form => delete[]
+        if (getcwd(cwd.get(), chunkSize * chunks) != nullptr) return std::string(cwd.get());
+
+        // if error is not ERANGE, then it's not a problem of too-long name... something else happened
+        if (errno != ERANGE)
+            throw std::runtime_error{
+                "FileUtils: could not determine current working directory path"};
+    }
+
+    // crazy long path name
+    throw std::runtime_error{
+        "FileUtils: could not determine current working directory - extremely long path"};
+}
+
+std::string FileUtils::DirectoryName(const std::string& file)
+{
+    // everything before the last separator; "." (current directory) if there is none
+    const auto sepPos = file.rfind(Separator());
+    return (sepPos == std::string::npos) ? std::string{"."} : file.substr(0, sepPos);
+}
+
+bool FileUtils::Exists(const char* fn)
+{
+    struct stat buf;  // only the return code is used; the filled-in fields are ignored
+    return (stat(fn, &buf) != -1);
+}
+
+std::chrono::system_clock::time_point FileUtils::LastModified(const char* fn)
+{
+    struct stat s;
+    if (stat(fn, &s) != 0)
+        throw std::runtime_error{"FileUtils: could not get timestamp for file: " + std::string{fn}};
+    return std::chrono::system_clock::from_time_t(s.st_mtime);  // st_mtime as a time_point
+}
+
+std::string FileUtils::ResolvedFilePath(const std::string& filePath, const std::string& from)
+{  // defers to the platform-specific helper selected by PBBAM_WIN_FILEPATHS
+    return native_resolvedFilePath(filePath, from);
+}
+
+constexpr char FileUtils::Separator() { return native_pathSeparator; }  // NOTE(review): constexpr implies inline, so this definition is only visible in this .cpp - confirm no other file calls it
+
+off_t FileUtils::Size(const char* fn)
+{
+    struct stat s;
+    if (stat(fn, &s) != 0)
+        throw std::runtime_error{"FileUtils: could not determine size of file: " + std::string{fn}};
+    return s.st_size;  // st_size as reported by stat()
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/FileUtils.h b/src/FileUtils.h

new file mode 100644 (file)

index 0000000..dc9d2dd
--- /dev/null
+++ b/src/FileUtils.h
@@ -0,0 +1,108 @@
+// Author: Derek Barnett
+
+#ifndef FILEUTILS_H
+#define FILEUTILS_H
+
+#include "pbbam/Config.h"
+
+#include <chrono>
+#include <string>
+
+namespace PacBio {
+namespace BAM {
+
+class FileUtils
+{
+public:
+    /// \returns application's current working directory
+    static std::string CurrentWorkingDirectory();
+
+    /// Parses a filepath for the directory name for a file.
+    ///
+    /// Essentially this method strips the filename from the string provided (/path/to/file => /path/to).
+    /// If only a filename is provided, then "." is returned to indicate the current directory.
+    ///
+    /// \param[in] file name of file (can be just a filename or path/to/filename)
+    /// \returns file's directory name
+    ///
+    static std::string DirectoryName(const std::string& file);
+
+    /// Check for existence of a file.
+    ///
+    /// \param[in] fn full path to file
+    /// \returns true if file exists (i.e. stat() on the path succeeds)
+    ///
+    static bool Exists(const char* fn);
+
+    /// Check for existence of a file.
+    ///
+    /// \param[in] fn full path to file
+    /// \returns true if file exists (i.e. stat() on the path succeeds)
+    ///
+    static bool Exists(const std::string& fn);
+
+    /// Check "last modified" timestamp for a file.
+    ///
+    /// \param[in] fn full path to file
+    /// \returns time of last modification
+    /// \throws runtime_error if file info can't be accessed
+    ///
+    static std::chrono::system_clock::time_point LastModified(const char* fn);
+
+    /// Check "last modified" timestamp for a file.
+    ///
+    /// \param[in] fn full path to file
+    /// \returns time of last modification
+    /// \throws runtime_error if file info can't be accessed
+    ///
+    static std::chrono::system_clock::time_point LastModified(const std::string& fn);
+
+    /// Resolves input file path using optional starting directory.
+    ///
+    /// \verbatim
+    ///   /absolute/path/to/file.txt   => /absolute/path/to/file.txt
+    ///   ../relative/path/to/file.txt => <from>/../relative/path/to/file.txt
+    ///   file.txt                     => <from>/file.txt
+    /// \endverbatim
+    ///
+    /// \note This method will strip any URI scheme as well ("file://") so that the result is immediately ready for I/O operations.
+    ///
+    /// \param[in] filePath file path to be resolved
+    /// \param[in] from     optional starting directory (useful if not same as application's working directory)
+    /// \returns resolved file path
+    ///
+    static std::string ResolvedFilePath(const std::string& filePath, const std::string& from = ".");
+
+    /// \returns native path separator
+    constexpr static char Separator();
+
+    /// Check size of file.
+    ///
+    /// \param[in] fn full path to file
+    /// \returns file size in bytes
+    /// \throws runtime_error if file info can't be accessed
+    ///
+    static off_t Size(const char* fn);
+
+    /// Check size of file.
+    ///
+    /// \param[in] fn full path to file
+    /// \returns file size in bytes
+    /// \throws runtime_error if file info can't be accessed
+    ///
+    static off_t Size(const std::string& fn);
+};
+
+inline bool FileUtils::Exists(const std::string& fn) { return FileUtils::Exists(fn.c_str()); }
+// The std::string overloads here simply forward to their const char* counterparts.
+inline std::chrono::system_clock::time_point FileUtils::LastModified(const std::string& fn)
+{
+    return FileUtils::LastModified(fn.c_str());
+}
+
+inline off_t FileUtils::Size(const std::string& fn) { return FileUtils::Size(fn.c_str()); }
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // FILEUTILS_H
diff --git a/src/FofnReader.cpp b/src/FofnReader.cpp

new file mode 100644 (file)

index 0000000..a8ae80d
--- /dev/null
+++ b/src/FofnReader.cpp
@@ -0,0 +1,22 @@
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "FofnReader.h"
+
+#include <iostream>
+
+namespace PacBio {
+namespace BAM {
+
+std::vector<std::string> FofnReader::Files(std::istream& in)
+{
+    // one entry per input line, kept verbatim (blank lines are not skipped)
+    std::vector<std::string> filenames;
+    for (std::string line; std::getline(in, line);)
+        filenames.push_back(line);
+    return filenames;
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/FofnReader.h b/src/FofnReader.h

new file mode 100644 (file)

index 0000000..2b2d441
--- /dev/null
+++ b/src/FofnReader.h
@@ -0,0 +1,26 @@
+// Author: Derek Barnett
+
+#ifndef FOFNREADER_H
+#define FOFNREADER_H
+
+#include "pbbam/Config.h"
+
+#include <iosfwd>
+#include <string>
+#include <vector>
+
+#include <pbbam/DataSet.h>
+
+namespace PacBio {
+namespace BAM {
+
+class FofnReader
+{
+public:
+    static std::vector<std::string> Files(std::istream& in);  // one entry per line read from 'in'
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // FOFNREADER_H
diff --git a/src/FormatUtils.cpp b/src/FormatUtils.cpp

new file mode 100644 (file)

index 0000000..7b33e33
--- /dev/null
+++ b/src/FormatUtils.cpp
@@ -0,0 +1,61 @@
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/FormatUtils.h"
+
+#include <algorithm>
+
+#include <boost/algorithm/string.hpp>
+
+#include "MemoryUtils.h"
+
+namespace PacBio {
+namespace BAM {
+
+const std::vector<std::string>& FormatUtils::BedExtensions()
+{  // the list is a function-local static; entries carry no leading '.'
+    static const std::vector<std::string> extensions{"bed", "bed.gz"};
+    return extensions;
+}
+
+const std::vector<std::string>& FormatUtils::FastaExtensions()
+{  // the list is a function-local static; entries carry no leading '.'
+    static const std::vector<std::string> extensions{"fa", "fasta", "fa.gz", "fasta.gz"};
+    return extensions;
+}
+
+const std::vector<std::string>& FormatUtils::FastqExtensions()
+{  // the list is a function-local static; entries carry no leading '.'
+    static const std::vector<std::string> extensions{"fq", "fastq", "fq.gz", "fastq.gz"};
+    return extensions;
+}
+
+HtslibCompression FormatUtils::CompressionType(BGZF* bgzf)
+{  // casts the result of bgzf_compression() to HtslibCompression
+    return static_cast<HtslibCompression>(bgzf_compression(bgzf));
+}
+
+HtslibCompression FormatUtils::CompressionType(const std::string& fn)
+{  // opens 'fn' read-only; the handle is released via HtslibBgzfDeleter when bgzf leaves scope
+    std::unique_ptr<BGZF, HtslibBgzfDeleter> bgzf(bgzf_open(fn.c_str(), "rb"));
+    if (bgzf == nullptr) throw std::runtime_error{"could not check compression level for: " + fn};
+    return CompressionType(bgzf.get());
+}
+
+bool FormatUtils::IsBedFilename(const std::string& fn) { return IsFormat(fn, BedExtensions()); }
+// The three Is*Filename() checks differ only in the extension list passed to IsFormat().
+bool FormatUtils::IsFastaFilename(const std::string& fn) { return IsFormat(fn, FastaExtensions()); }
+
+bool FormatUtils::IsFastqFilename(const std::string& fn) { return IsFormat(fn, FastqExtensions()); }
+
+bool FormatUtils::IsFormat(const std::string& fn, const std::vector<std::string>& extensions)
+{
+    // true if 'fn' ends (case-insensitively) with any entry of 'extensions'
+    // NOTE(review): no '.' is required before the match, so e.g. "sofa" ends with "fa" - confirm
+    const auto ends = [&fn](const std::string& e) { return boost::algorithm::iends_with(fn, e); };
+    return std::find_if(extensions.cbegin(), extensions.cend(), ends) != extensions.cend();
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/GenomicIntervalQuery.cpp b/src/GenomicIntervalQuery.cpp

new file mode 100644 (file)

index 0000000..78ecef6
--- /dev/null
+++ b/src/GenomicIntervalQuery.cpp
@@ -0,0 +1,70 @@
+// File Description
+/// \file GenomicIntervalQuery.cpp
+/// \brief Implements the GenomicIntervalQuery class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/GenomicIntervalQuery.h"
+
+#include "pbbam/CompositeBamReader.h"
+#include "pbbam/DataSet.h"
+#include "pbbam/GenomicInterval.h"
+
+namespace PacBio {
+namespace BAM {
+
+class GenomicIntervalQuery::GenomicIntervalQueryPrivate
+{
+public:  // thin holder: each constructor only forwards its arguments to reader_
+    GenomicIntervalQueryPrivate(const DataSet& dataset, const BaiIndexCache& cache)
+        : reader_{dataset, cache}
+    {
+    }
+
+    GenomicIntervalQueryPrivate(const GenomicInterval& interval, const DataSet& dataset,
+                                const BaiIndexCache& cache)
+        : reader_{interval, dataset, cache}
+    {
+    }
+
+    GenomicIntervalCompositeBamReader reader_;
+};
+
+GenomicIntervalQuery::GenomicIntervalQuery(const DataSet& dataset)
+    : GenomicIntervalQuery(dataset, MakeBaiIndexCache(dataset))
+{  // delegates to the (dataset, cache) overload with a cache from MakeBaiIndexCache(dataset)
+}
+
+GenomicIntervalQuery::GenomicIntervalQuery(const DataSet& dataset, const BaiIndexCache& cache)
+    : internal::IQuery(), d_{std::make_unique<GenomicIntervalQueryPrivate>(dataset, cache)}
+{  // d_ is built from 'dataset' and 'cache' in the initializer list
+}
+
+GenomicIntervalQuery::GenomicIntervalQuery(const GenomicInterval& interval, const DataSet& dataset)
+    : GenomicIntervalQuery(interval, dataset, MakeBaiIndexCache(dataset))
+{  // delegates to the (interval, dataset, cache) overload with a freshly made cache
+}
+
+GenomicIntervalQuery::GenomicIntervalQuery(const GenomicInterval& interval, const DataSet& dataset,
+                                           const BaiIndexCache& cache)
+    : internal::IQuery()
+    , d_{std::make_unique<GenomicIntervalQueryPrivate>(interval, dataset, cache)}
+{  // d_ is built from 'interval', 'dataset' and 'cache' in the initializer list
+}
+
+GenomicIntervalQuery::~GenomicIntervalQuery() = default;
+// Delegates to the reader_ held by the private implementation and returns its result.
+bool GenomicIntervalQuery::GetNext(BamRecord& r) { return d_->reader_.GetNext(r); }
+
+GenomicIntervalQuery& GenomicIntervalQuery::Interval(const GenomicInterval& interval)
+{  // setter: passes 'interval' on to the reader and returns *this
+    d_->reader_.Interval(interval);
+    return *this;
+}
+// Getter: returns whatever the reader's own Interval() returns.
+const GenomicInterval& GenomicIntervalQuery::Interval() const { return d_->reader_.Interval(); }
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/IFastaWriter.cpp b/src/IFastaWriter.cpp

new file mode 100644 (file)

index 0000000..dd0c194
--- /dev/null
+++ b/src/IFastaWriter.cpp
@@ -0,0 +1,19 @@
+// File Description
+/// \file IFastaWriter.cpp
+/// \brief Implements the IFastaWriter class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/IFastaWriter.h"
+
+namespace PacBio {
+namespace BAM {
+
+IFastaWriter::IFastaWriter() = default;  // defaulted out of line, in this .cpp
+
+IFastaWriter::~IFastaWriter() = default;  // defaulted out of line, in this .cpp
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/IFastqWriter.cpp b/src/IFastqWriter.cpp

new file mode 100644 (file)

index 0000000..c294cdb
--- /dev/null
+++ b/src/IFastqWriter.cpp
@@ -0,0 +1,19 @@
+// File Description
+/// \file IFastqWriter.cpp
+/// \brief Implements the IFastqWriter class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/IFastqWriter.h"
+
+namespace PacBio {
+namespace BAM {
+
+IFastqWriter::IFastqWriter() = default;  // defaulted out of line, in this .cpp
+
+IFastqWriter::~IFastqWriter() = default;  // defaulted out of line, in this .cpp
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/IRecordWriter.cpp b/src/IRecordWriter.cpp

new file mode 100644 (file)

index 0000000..5931f78
--- /dev/null
+++ b/src/IRecordWriter.cpp
@@ -0,0 +1,19 @@
+// File Description
+/// \file IRecordWriter.cpp
+/// \brief Implements the IRecordWriter class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/IRecordWriter.h"
+
+namespace PacBio {
+namespace BAM {
+
+IRecordWriter::IRecordWriter() = default;  // defaulted out of line, in this .cpp
+
+IRecordWriter::~IRecordWriter() = default;  // defaulted out of line, in this .cpp
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/IndexedBamWriter.cpp b/src/IndexedBamWriter.cpp

new file mode 100644 (file)

index 0000000..89de02b
--- /dev/null
+++ b/src/IndexedBamWriter.cpp
@@ -0,0 +1,1032 @@
+// File Description
+/// \file IndexedBamWriter.cpp
+/// \brief Implements the IndexedBamWriter class
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/IndexedBamWriter.h"
+
+#include <sys/stat.h>
+
+#include <algorithm>
+#include <array>
+#include <atomic>
+#include <cassert>
+#include <chrono>
+#include <condition_variable>
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <map>
+#include <memory>
+#include <mutex>
+#include <stdexcept>
+#include <string>
+#include <thread>
+#include <tuple>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include <htslib/bgzf.h>
+#include <htslib/hfile.h>
+#include <htslib/hts.h>
+#include <pbcopper/utility/Deleters.h>
+#include <boost/numeric/conversion/cast.hpp>
+
+#include "pbbam/BamHeader.h"
+#include "pbbam/BamRecord.h"
+#include "pbbam/BamRecordImpl.h"
+#include "pbbam/BamWriter.h"
+#include "pbbam/PbiRawData.h"
+#include "pbbam/RecordType.h"
+#include "pbbam/Unused.h"
+#include "pbbam/Validator.h"
+
+#include "FileProducer.h"
+#include "MemoryUtils.h"
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// Writes \p length bytes from \p data to \p fp using bgzf_write().
+//
+// \throws std::runtime_error if bgzf_write() reports an error (negative return)
+//         or reports fewer bytes written than requested
+//
+void bgzf_write_safe2(BGZF* fp, const void* data, size_t length)
+{
+    const auto ret = bgzf_write(fp, data, length);
+
+    // a negative value is an htslib error; anything other than 'length' is a short write
+    if (ret < 0L || static_cast<size_t>(ret) != length)
+        throw std::runtime_error{
+            "IndexedBamWriter: bgzf_write() failed or wrote too few bytes. Out of disk space?"};
+}
+
+// One entry of the BAM's GZI index, as loaded by PbiBuilder2::LoadGzi().
+//
+// Fields are declared in the order LoadGzi() reads them from the .gzi file.
+//
+// NOTE(review): judging by WriteVirtualOffsets(), vAddress is presumably the
+//               compressed file offset of a BGZF block & uAddress the matching
+//               uncompressed offset - confirm against htslib's GZI layout.
+//
+struct GzIndexEntry
+{
+    int64_t vAddress;  // shifted left 16 bits to form the upper part of a virtual offset
+    int64_t uAddress;  // subtracted from a record's uncompressed file position
+};
+
+// Byte-swaps every element of \p data in place, dispatching on sizeof(T).
+//
+// Single-byte elements are left untouched. Element widths other than 1, 2, 4,
+// or 8 bytes cause a std::runtime_error.
+//
+template <typename T>
+inline void SwapEndianness2(std::vector<T>& data)
+{
+    constexpr const size_t width = sizeof(T);
+    switch (width) {
+        case 1:
+            return;  // nothing to swap for single bytes
+        case 2:
+            for (auto& element : data)
+                ed_swap_2p(&element);
+            return;
+        case 4:
+            for (auto& element : data)
+                ed_swap_4p(&element);
+            return;
+        case 8:
+            for (auto& element : data)
+                ed_swap_8p(&element);
+            return;
+        default:
+            throw std::runtime_error{"IndexedBamWriter: unsupported element size: " +
+                                     std::to_string(width)};
+    }
+}
+
+// Writes the raw contents of \p data to \p fp.
+//
+// NOTE: when fp->is_be is set, the elements of \p data are byte-swapped in place
+//       before writing, so the caller's vector is left modified.
+//
+template <typename T>
+inline void WriteBgzfVector2(BGZF* fp, std::vector<T>& data)
+{
+    assert(fp);
+
+    const bool needsSwap = fp->is_be;
+    if (needsSwap) SwapEndianness2(data);
+
+    const size_t numBytes = data.size() * sizeof(T);
+    bgzf_write_safe2(fp, data.data(), numBytes);
+}
+
+// Location of one flushed chunk of a single PBI field within the temp file.
+//
+// Recorded by PbiBuilder2::WriteToTempFile() & consumed by
+// PbiBuilder2::LoadFieldBlockFromTempFile().
+//
+struct PbiFieldBlock2
+{
+    int64_t pos_;  // file position of block start
+    size_t n_;     // number of entries in block
+};
+
+// In-memory staging buffer for one PBI column, plus the list of chunks of that
+// column which have already been written out to the temp file.
+//
+template <typename T>
+class PbiField2
+{
+    constexpr static const size_t ElementSize = sizeof(T);
+
+public:
+    // \param maxBufferSize  buffer budget in bytes; converted to an element count
+    explicit PbiField2(size_t maxBufferSize) : maxElementCount_{maxBufferSize / ElementSize}
+    {
+        buffer_.reserve(maxElementCount_);
+    }
+
+    void Add(T value) { buffer_.push_back(value); }
+
+    // Compares with >= (not ==) so that a budget smaller than one element, i.e. a
+    // capacity of zero, still reports "full" once data is present. With == the
+    // buffer would never be flushed automatically and would grow without bound.
+    bool IsFull() const { return buffer_.size() >= maxElementCount_; }
+
+    size_t maxElementCount_;              // flush threshold, in elements
+    std::vector<T> buffer_;               // values not yet written to the temp file
+    std::vector<PbiFieldBlock2> blocks_;  // chunks already written to the temp file
+};
+
+// Tracks, per reference ID, the range of index rows [beginRow, endRow) that map
+// to it, while checking that records arrive in coordinate-sorted order.
+//
+// AddRecord() returns false as soon as an ordering violation is seen; the caller
+// (PbiBuilder2::AddReferenceData) then discards the builder.
+//
+class PbiReferenceDataBuilder2
+{
+public:
+    using ReferenceRows = std::pair<int32_t, int32_t>;  // [startRow, endRow)
+
+    explicit PbiReferenceDataBuilder2(const size_t numReferenceSequences)
+    {
+        // initialize with number of references we expect to see
+        //
+        // we can add more later, but want to ensure known references have an entry
+        // even if no records are observed mapping to it
+        //
+        for (size_t i = 0; i < numReferenceSequences; ++i)
+            rawReferenceEntries_[i] = PbiReferenceEntry(i);
+
+        // also create an "unmapped" entry
+        rawReferenceEntries_[PbiReferenceEntry::UNMAPPED_ID] = PbiReferenceEntry{};
+    }
+
+    // Registers \p record as occupying index row \p rowNumber.
+    //
+    // \returns true if the record is consistent with coordinate-sorted order so
+    //          far, false otherwise
+    //
+    // NOTE(review): lookups below use std::map::at(), which throws
+    //               std::out_of_range for a tId that was not registered in the
+    //               constructor - despite the "can add more later" remark there.
+    //
+    bool AddRecord(const BamRecord& record, const int32_t rowNumber)
+    {
+        // fetch ref ID & pos for record
+        const int32_t tId = record.ReferenceId();
+        const int32_t pos = record.ReferenceStart();
+
+        // sanity checks to protect against non-coordinate-sorted BAMs
+        if (lastRefId_ != tId || (lastRefId_ >= 0 && tId < 0)) {
+            if (tId >= 0) {
+
+                // if we've already seen unmapped reads, but our current tId is valid
+                //
+                // error: unmapped reads should all be at the end (can stop checking refs)
+                //
+                PbiReferenceEntry& unmappedEntry =
+                    rawReferenceEntries_.at(PbiReferenceEntry::UNMAPPED_ID);
+                if (unmappedEntry.beginRow_ != PbiReferenceEntry::UNSET_ROW) return false;
+
+                // if we've already seen data for this new tId
+                // (remember we're coming from another tId)
+                //
+                // error: refs are out of order (can stop checking refs)
+                //
+                PbiReferenceEntry& currentEntry =
+                    rawReferenceEntries_.at(static_cast<uint32_t>(tId));
+                if (currentEntry.beginRow_ != PbiReferenceEntry::UNSET_ROW) return false;
+            }
+            lastRefId_ = tId;
+        } else if (tId >= 0 && lastPos_ > pos)
+            return false;  // error: positions out of order
+
+        // update row numbers
+        // (a negative tId is looked up via its uint32_t cast)
+        PbiReferenceEntry& entry = rawReferenceEntries_.at(static_cast<uint32_t>(tId));
+        if (entry.beginRow_ == PbiReferenceEntry::UNSET_ROW) entry.beginRow_ = rowNumber;
+        entry.endRow_ = rowNumber + 1;
+
+        // update pos (for sorting check next go-round)
+        lastPos_ = pos;
+        return true;
+    }
+
+    // Returns all entries collected so far, in the map's key order.
+    PbiRawReferenceData Result() const
+    {
+        // PbiReferenceEntries will be sorted thanks to std::map
+        // tId will be at end since we're sorting on the uint cast of -1
+        PbiRawReferenceData result;
+        result.entries_.reserve(rawReferenceEntries_.size());
+        for (const auto& entry : rawReferenceEntries_)
+            result.entries_.push_back(entry.second);
+        return result;
+    }
+
+    // Writes the entry count, followed by (tId, beginRow, endRow) for each entry,
+    // as 4-byte values to \p bgzf. Values are byte-swapped first if bgzf->is_be.
+    void WriteData(BGZF* bgzf)
+    {
+        const auto refData = Result();
+
+        // num_refs
+        uint32_t numRefs = refData.entries_.size();
+        if (bgzf->is_be) numRefs = ed_swap_4(numRefs);
+        bgzf_write_safe2(bgzf, &numRefs, 4);
+
+        // reference entries
+        numRefs = refData.entries_.size();  // need to reset after maybe endian-swapping
+        for (size_t i = 0; i < numRefs; ++i) {
+            auto& entry = refData.entries_[i];
+            auto tId = entry.tId_;
+            auto beginRow = entry.beginRow_;
+            auto endRow = entry.endRow_;
+            if (bgzf->is_be) {
+                tId = ed_swap_4(tId);
+                beginRow = ed_swap_4(beginRow);
+                endRow = ed_swap_4(endRow);
+            }
+            bgzf_write_safe2(bgzf, &tId, 4);
+            bgzf_write_safe2(bgzf, &beginRow, 4);
+            bgzf_write_safe2(bgzf, &endRow, 4);
+        }
+    }
+
+private:
+    int32_t lastRefId_ = -1;  // reference ID of the previous record
+    Position lastPos_ = -1;   // reference start of the previous record
+    std::map<uint32_t, PbiReferenceEntry> rawReferenceEntries_;  // keyed by uint32_t-cast tId
+};
+
+// TODO: come back to refseqs, sorting, etc
+class PbiBuilder2
+{
+    enum class FlushMode
+    {
+        FORCE,
+        NO_FORCE
+    };
+
+public:
+    // Opens the temp file ("<pbiFilename>.build") that field buffers are spilled
+    // to, & sizes every field buffer from \p fileBufferSize (a byte budget that
+    // is applied to each field separately).
+    //
+    // \throws std::runtime_error if the temp file cannot be opened
+    //
+    // NOTE: tempFile_ is opened from tempFilename_ in the initializer list; this
+    //       relies on tempFilename_ being declared (& so initialized) first.
+    // NOTE: refDataBuilder_ is not created here (see TODO below).
+    //
+    PbiBuilder2(const std::string& bamFilename, const std::string& pbiFilename,
+                const PbiBuilder::CompressionLevel compressionLevel, const size_t numThreads,
+                const size_t fileBufferSize)
+        //                const size_t numReferenceSequences = 0
+        //                const bool isCoordinateSorted = false
+        : bamFilename_{bamFilename},
+          pbiFilename_{pbiFilename},
+          tempFilename_{pbiFilename + ".build"},
+          tempFile_{std::fopen(tempFilename_.c_str(), "w+b")},
+          compressionLevel_{compressionLevel},
+          numThreads_{numThreads},
+          rgIdField_{fileBufferSize},
+          qStartField_{fileBufferSize},
+          qEndField_{fileBufferSize},
+          holeNumField_{fileBufferSize},
+          readQualField_{fileBufferSize},
+          ctxtField_{fileBufferSize},
+          fileOffsetField_{fileBufferSize},
+          tIdField_{fileBufferSize},
+          tStartField_{fileBufferSize},
+          tEndField_{fileBufferSize},
+          aStartField_{fileBufferSize},
+          aEndField_{fileBufferSize},
+          revStrandField_{fileBufferSize},
+          nMField_{fileBufferSize},
+          nMMField_{fileBufferSize},
+          mapQualField_{fileBufferSize},
+          bcForwardField_{fileBufferSize},
+          bcReverseField_{fileBufferSize},
+          bcQualField_{fileBufferSize}
+    {
+        if (!tempFile_)
+            throw std::runtime_error{"IndexedBamWriter: could not open temp file: " +
+                                     tempFilename_};
+
+        // TODO: setup for ref data building
+    }
+
+    // Adds one record's data to the index being built.
+    //
+    // \param b        record being written to the BAM
+    // \param uOffset  value stored in the file-offset column for this record; it
+    //                 is rewritten into a virtual offset in WriteVirtualOffsets()
+    //
+    void AddRecord(const BamRecord& b, const int64_t uOffset)
+    {
+        // ensure updated data (necessary?)
+        PacBio::BAM::BamRecordMemory::UpdateRecordTags(b);
+        b.ResetCachedPositions();
+
+        // store record data & maybe flush to temp file
+        AddBasicData(b, uOffset);
+        AddMappedData(b);
+        AddBarcodeData(b);
+        AddReferenceData(b, currentRow_);
+        FlushBuffers(FlushMode::NO_FORCE);
+
+        // row counter doubles as the record count written to the PBI header
+        ++currentRow_;
+    }
+
+    // Buffers the "basic" columns for one record: read group ID, query start/end,
+    // hole number, read accuracy, local context flags, & the file offset.
+    void AddBasicData(const BamRecord& b, const int64_t uOffset)
+    {
+        // read group ID: taken from the record, or derived from movie name & read
+        // type when the record carries none; the hex string is parsed as a number
+        auto rgIdString = b.ReadGroupBaseId();
+        if (rgIdString.empty()) rgIdString = MakeReadGroupId(b.MovieName(), ToString(b.Type()));
+        const int32_t readGroupId = std::stoul(rgIdString, nullptr, 16);
+
+        // query start/end: CCS & transcript records span [0, sequence length)
+        int32_t queryStart = 0;
+        int32_t queryEnd = 0;
+        if (IsCcsOrTranscript(b.Type())) {
+            queryEnd = b.Impl().SequenceLength();
+        } else {
+            queryStart = b.QueryStart();
+            queryEnd = b.QueryEnd();
+        }
+
+        // add'l data, with fallbacks for records lacking the tag
+        const int32_t holeNumber = (b.HasHoleNumber() ? b.HoleNumber() : 0);
+        float accuracy = 0.0F;
+        if (b.HasReadAccuracy()) accuracy = boost::numeric_cast<float>(b.ReadAccuracy());
+        const uint8_t contextFlags = (b.HasLocalContextFlags()
+                                          ? b.LocalContextFlags()
+                                          : LocalContextFlags::NO_LOCAL_CONTEXT);
+
+        // store
+        rgIdField_.Add(readGroupId);
+        qStartField_.Add(queryStart);
+        qEndField_.Add(queryEnd);
+        holeNumField_.Add(holeNumber);
+        ctxtField_.Add(contextFlags);
+        readQualField_.Add(accuracy);
+        fileOffsetField_.Add(uOffset);
+    }
+
+    // Buffers the "mapped" columns for one record. Values are buffered for every
+    // record; hasMappedData_ is only raised once a non-negative reference ID is seen.
+    void AddMappedData(const BamRecord& b)
+    {
+        // alignment position
+        const auto refId = b.ReferenceId();
+        const auto refStart = static_cast<uint32_t>(b.ReferenceStart());
+        const auto refEnd = static_cast<uint32_t>(b.ReferenceEnd());
+        const auto alignedStart = static_cast<uint32_t>(b.AlignedStart());
+        const auto alignedEnd = static_cast<uint32_t>(b.AlignedEnd());
+        const uint8_t reverseStrand = (b.AlignedStrand() == Strand::REVERSE ? 1 : 0);
+
+        // alignment quality
+        const auto matchCounts = b.NumMatchesAndMismatches();
+        const auto numMatches = static_cast<uint32_t>(matchCounts.first);
+        const auto numMismatches = static_cast<uint32_t>(matchCounts.second);
+        const auto mapQ = b.MapQuality();
+
+        if (refId >= 0) hasMappedData_ = true;
+
+        // store
+        tIdField_.Add(refId);
+        tStartField_.Add(refStart);
+        tEndField_.Add(refEnd);
+        aStartField_.Add(alignedStart);
+        aEndField_.Add(alignedEnd);
+        revStrandField_.Add(reverseStrand);
+        nMField_.Add(numMatches);
+        nMMField_.Add(numMismatches);
+        mapQualField_.Add(mapQ);
+    }
+
+    // Buffers the barcode columns for one record. Records without both barcode
+    // tags are stored with the 'missing' value (-1) in all three columns.
+    void AddBarcodeData(const BamRecord& b)
+    {
+        // start from the 'missing' value
+        int16_t forward = -1;
+        int16_t reverse = -1;
+        int8_t quality = -1;
+
+        // barcode data is only used when both barcodes & quality are present
+        const bool hasBarcodeTags = b.HasBarcodes() && b.HasBarcodeQuality();
+        if (hasBarcodeTags) {
+            // fetch data from record
+            const auto barcodes = b.Barcodes();
+            forward = barcodes.first;
+            reverse = barcodes.second;
+            quality = static_cast<int8_t>(b.BarcodeQuality());
+
+            // double-check: only when ALL three values are less than zero is the
+            // record treated as 'missing'; otherwise the index gets a barcode section
+            const bool allNegative = (forward < 0) && (reverse < 0) && (quality < 0);
+            if (allNegative) {
+                forward = -1;
+                reverse = -1;
+                quality = -1;
+            } else {
+                hasBarcodeData_ = true;
+            }
+        }
+
+        // store
+        bcForwardField_.Add(forward);
+        bcReverseField_.Add(reverse);
+        bcQualField_.Add(quality);
+    }
+
+    // Feeds the record to the reference-data builder, if there is one. The builder
+    // is dropped as soon as it reports a record that breaks sorted order.
+    void AddReferenceData(const BamRecord& b, const uint32_t currentRow)
+    {
+        // nothing to do unless reference data is being tracked
+        if (!refDataBuilder_) return;
+
+        const bool stillSorted = refDataBuilder_->AddRecord(b, currentRow);
+        if (stillSorted) return;
+
+        refDataBuilder_.reset();
+    }
+
+    // Finalizes the index: spills any buffered data to the temp file, opens the
+    // PBI file, writes its header, copies the columns over from the temp file,
+    // then deletes the temp file. Calls after a successful Close() are no-ops.
+    //
+    // NOTE(review): tempFile_ is still open when remove() is called, & the temp
+    //               file is left behind if any step above throws - confirm this
+    //               is acceptable on all target platforms.
+    //
+    void Close()
+    {
+        if (isClosed_) return;
+
+        FlushBuffers(FlushMode::FORCE);
+
+        OpenPbiFile();
+        WritePbiHeader();
+        WriteFromTempFile();
+
+        remove(tempFilename_.c_str());
+        isClosed_ = true;
+    }
+
+    // Opens the output PBI file for BGZF writing at the configured compression
+    // level, enabling multithreaded compression when more than one thread is used.
+    //
+    // \throws std::runtime_error if the file cannot be opened
+    //
+    void OpenPbiFile()
+    {
+        // htslib mode string: "wb" followed by the numeric compression level
+        std::string mode{"wb"};
+        mode += std::to_string(static_cast<int>(compressionLevel_));
+
+        pbiFile_.reset(bgzf_open(pbiFilename_.c_str(), mode.c_str()));
+        if (!pbiFile_)
+            throw std::runtime_error{"IndexedBamWriter: could not open output PBI file"};
+
+        // zero means "pick for me": ask the system, & fall back to one thread
+        // if the system cannot say either
+        size_t threadCount = numThreads_;
+        if (threadCount == 0) threadCount = std::thread::hardware_concurrency();
+        if (threadCount == 0) threadCount = 1;
+
+        // single-threaded needs no extra setup
+        if (threadCount > 1) bgzf_mt(pbiFile_.get(), threadCount, 256);
+    }
+
+    // Spills \p field's buffer to the temp file & empties it, but only if the
+    // buffer is full or \p force is set.
+    template <typename T>
+    void MaybeFlushBuffer(PbiField2<T>& field, bool force)
+    {
+        // replace with lambda, in FlushBuffer(), once PPA can use C++14 ?
+        const bool shouldFlush = field.IsFull() || force;
+        if (!shouldFlush) return;
+
+        WriteToTempFile(field);
+        field.buffer_.clear();
+    }
+
+    // Offers every field buffer the chance to spill to the temp file.
+    //
+    // With FlushMode::NO_FORCE only full buffers are written; with
+    // FlushMode::FORCE every buffer is (empty ones are skipped by WriteToTempFile).
+    //
+    void FlushBuffers(FlushMode mode)
+    {
+        const auto force = (mode == FlushMode::FORCE);
+
+        // basic data
+        MaybeFlushBuffer(rgIdField_, force);
+        MaybeFlushBuffer(qStartField_, force);
+        MaybeFlushBuffer(qEndField_, force);
+        MaybeFlushBuffer(holeNumField_, force);
+        MaybeFlushBuffer(readQualField_, force);
+        MaybeFlushBuffer(ctxtField_, force);
+        MaybeFlushBuffer(fileOffsetField_, force);
+
+        // mapped data
+        MaybeFlushBuffer(tIdField_, force);
+        MaybeFlushBuffer(tStartField_, force);
+        MaybeFlushBuffer(tEndField_, force);
+        MaybeFlushBuffer(aStartField_, force);
+        MaybeFlushBuffer(aEndField_, force);
+        MaybeFlushBuffer(revStrandField_, force);
+        MaybeFlushBuffer(nMField_, force);
+        MaybeFlushBuffer(nMMField_, force);
+        MaybeFlushBuffer(mapQualField_, force);
+
+        // barcode data
+        MaybeFlushBuffer(bcForwardField_, force);
+        MaybeFlushBuffer(bcReverseField_, force);
+        MaybeFlushBuffer(bcQualField_, force);
+    }
+
+    // Reads one previously-spilled block back from the temp file into
+    // \p field's buffer, replacing whatever the buffer held before.
+    //
+    // \throws std::runtime_error if seeking fails or fewer elements than
+    //         recorded for the block could be read
+    //
+    template <typename T>
+    void LoadFieldBlockFromTempFile(PbiField2<T>& field, const PbiFieldBlock2& block)
+    {
+        auto* const fp = tempFile_.get();
+
+        // jump to where the block was written
+        if (std::fseek(fp, block.pos_, SEEK_SET) != 0)
+            throw std::runtime_error{"IndexedBamWriter: could not seek in temp file: " +
+                                     tempFilename_ + ", offset: " + std::to_string(block.pos_)};
+
+        // size the buffer for the block, then fill it from the file
+        field.buffer_.assign(block.n_, 0);
+        const auto numRead = std::fread(field.buffer_.data(), sizeof(T), block.n_, fp);
+        if (numRead == block.n_) return;
+
+        throw std::runtime_error{
+            "IndexedBamWriter: could not read expected element count from temp file: " +
+            tempFilename_};
+    }
+
+    // Copies every spilled block of \p field from the temp file to the PBI file,
+    // in the order the blocks were recorded. field.buffer_ is reused as scratch
+    // space for each block.
+    template <typename T>
+    void WriteField(PbiField2<T>& field)
+    {
+        for (const auto& block : field.blocks_) {
+            LoadFieldBlockFromTempFile(field, block);
+            WriteBgzfVector2(pbiFile_.get(), field.buffer_);
+        }
+    }
+
+    // Writes all index sections after the header. Basic data & virtual offsets are
+    // always written; the mapped, reference, & barcode sections only when the
+    // matching flag/builder is set - mirroring the flags set in WritePbiHeader().
+    void WriteFromTempFile()
+    {
+        // load from temp file, in PBI format order, and write to index
+
+        WriteField(rgIdField_);
+        WriteField(qStartField_);
+        WriteField(qEndField_);
+        WriteField(holeNumField_);
+        WriteField(readQualField_);
+        WriteField(ctxtField_);
+
+        // file offsets need transforming via the GZI index, so are handled separately
+        WriteVirtualOffsets();
+
+        if (hasMappedData_) {
+            WriteField(tIdField_);
+            WriteField(tStartField_);
+            WriteField(tEndField_);
+            WriteField(aStartField_);
+            WriteField(aEndField_);
+            WriteField(revStrandField_);
+            WriteField(nMField_);
+            WriteField(nMMField_);
+            WriteField(mapQualField_);
+        }
+
+        if (refDataBuilder_) WriteReferenceData();
+
+        if (hasBarcodeData_) {
+            WriteField(bcForwardField_);
+            WriteField(bcReverseField_);
+            WriteField(bcQualField_);
+        }
+    }
+
+    // Appends \p field's buffered elements to the temp file & records where they
+    // were written, so the block can be read back when the index is finalized.
+    // An empty buffer is skipped.
+    //
+    // \throws std::runtime_error if the temp file position cannot be determined
+    //         or not all elements could be written
+    //
+    template <typename T>
+    void WriteToTempFile(PbiField2<T>& field)
+    {
+        if (field.buffer_.empty()) return;
+
+        // remember where this block starts
+        const auto pos = std::ftell(tempFile_.get());
+        if (pos < 0)
+            throw std::runtime_error{
+                "IndexedBamWriter: could not determine position in temp file: " + tempFilename_};
+
+        // a short write must not be recorded as a (truncated) block: the index
+        // would silently lose records
+        const auto numElements =
+            std::fwrite(field.buffer_.data(), sizeof(T), field.buffer_.size(), tempFile_.get());
+        if (numElements != field.buffer_.size())
+            throw std::runtime_error{
+                "IndexedBamWriter: could not write expected element count to temp file: " +
+                tempFilename_ + ". Out of disk space?"};
+
+        field.blocks_.emplace_back(PbiFieldBlock2{pos, numElements});
+    }
+
+    // Writes the PBI header: 4-byte magic, 4-byte version, 2-byte section flags,
+    // 4-byte record count, then 18 zeroed reserved bytes. Multi-byte values are
+    // byte-swapped first if bgzf->is_be.
+    void WritePbiHeader()
+    {
+        BGZF* bgzf = pbiFile_.get();
+
+        // 'magic' string
+        static constexpr const std::array<char, 4> magic{{'P', 'B', 'I', '\1'}};
+        bgzf_write_safe2(bgzf, magic.data(), 4);
+
+        // sections present - must match what WriteFromTempFile() emits
+        PbiFile::Sections sections = PbiFile::BASIC;
+        if (hasMappedData_) sections |= PbiFile::MAPPED;
+        if (hasBarcodeData_) sections |= PbiFile::BARCODE;
+        if (refDataBuilder_) sections |= PbiFile::REFERENCE;
+
+        // version, pbi_flags, & n_reads
+        auto version = static_cast<uint32_t>(PbiFile::CurrentVersion);
+        uint16_t pbi_flags = sections;
+        auto numReads = currentRow_;  // one row was counted per AddRecord() call
+        if (bgzf->is_be) {
+            version = ed_swap_4(version);
+            pbi_flags = ed_swap_2(pbi_flags);
+            numReads = ed_swap_4(numReads);
+        }
+        bgzf_write_safe2(bgzf, &version, 4);
+        bgzf_write_safe2(bgzf, &pbi_flags, 2);
+        bgzf_write_safe2(bgzf, &numReads, 4);
+
+        // reserved space
+        char reserved[18];
+        memset(reserved, 0, 18);
+        bgzf_write_safe2(bgzf, reserved, 18);
+    }
+
+    // Writes the reference section. Dereferences refDataBuilder_ unconditionally;
+    // the caller (WriteFromTempFile) checks that it is set.
+    void WriteReferenceData() { refDataBuilder_->WriteData(pbiFile_.get()); }
+
+    // Loads the contents of "<bamFilename>.gzi": a 64-bit entry count followed by
+    // that many pairs of 64-bit values, read into GzIndexEntry::vAddress &
+    // GzIndexEntry::uAddress (in that order). Values are byte-swapped on
+    // big-endian hosts.
+    //
+    // \returns entries in file order
+    // \throws std::runtime_error if the file cannot be opened or is too short
+    //
+    std::vector<GzIndexEntry> LoadGzi()
+    {
+        //
+        // Open GZI file & load its contents. About to use for offset transformation.
+        //
+
+        const std::string gziFn{bamFilename_ + ".gzi"};
+        std::unique_ptr<FILE, Utility::FileDeleter> gziFile{fopen(gziFn.c_str(), "rb")};
+        if (!gziFile) throw std::runtime_error{"IndexedBamWriter: could not open gzi file"};
+
+        const bool isBigEndian = ed_is_big();
+
+        // NOTE: ed_swap_8() returns the swapped value rather than modifying its
+        //       argument, so the in-place (pointer) variant is used throughout.
+        uint64_t numElements = 0;
+        if (fread(&numElements, sizeof(numElements), 1, gziFile.get()) < 1)
+            throw std::runtime_error{"IndexedBamWriter: could not read from gziFile"};
+        if (isBigEndian) ed_swap_8p(&numElements);
+
+        std::vector<GzIndexEntry> result;
+        result.reserve(numElements);
+
+        // loop counter matches the width of the on-disk count
+        for (uint64_t i = 0; i < numElements; ++i) {
+            GzIndexEntry entry;
+            if (fread(&entry.vAddress, sizeof(entry.vAddress), 1, gziFile.get()) < 1)
+                throw std::runtime_error{"IndexedBamWriter: could not read from gziFile"};
+            if (fread(&entry.uAddress, sizeof(entry.uAddress), 1, gziFile.get()) < 1)
+                throw std::runtime_error{"IndexedBamWriter: could not read from gziFile"};
+            if (isBigEndian) {
+                ed_swap_8p(&entry.vAddress);
+                ed_swap_8p(&entry.uAddress);
+            }
+            result.push_back(entry);
+        }
+        return result;
+    }
+
+    // Writes the file-offset column, converting each stored uncompressed position
+    // into a virtual offset using the GZI index:
+    //
+    //     (entry.vAddress << 16) | (position - entry.uAddress)
+    //
+    // where 'entry' is the last index entry whose uAddress is <= position (or the
+    // first entry, if none is).
+    //
+    // \throws std::runtime_error if the GZI file is missing, unreadable, or empty
+    //
+    // NOTE(review): the single forward-moving cursor 'k' assumes positions never
+    //               decrease from one record to the next - confirm with caller.
+    //
+    void WriteVirtualOffsets()
+    {
+        auto index = LoadGzi();
+        if (index.empty()) throw std::runtime_error{"IndexedBamWriter: empty GZI file"};
+        std::sort(index.begin(), index.end(),
+                  [](const GzIndexEntry& lhs, const GzIndexEntry& rhs) -> bool {
+                      return lhs.uAddress < rhs.uAddress;
+                  });
+
+        // index cursor, shared across all blocks of the column
+        size_t k = 0;
+        for (const auto& block : fileOffsetField_.blocks_) {
+            LoadFieldBlockFromTempFile(fileOffsetField_, block);
+
+            // transform offsets from GZI
+            for (size_t j = 0; j < fileOffsetField_.buffer_.size(); ++j) {
+                // advance while the next entry still starts at or before this position
+                while ((k < index.size() - 1) && (static_cast<uint64_t>(index.at(k + 1).uAddress) <=
+                                                  fileOffsetField_.buffer_[j])) {
+                    ++k;
+                }
+                const GzIndexEntry& e = index.at(k);
+                const int64_t uOffset = fileOffsetField_.buffer_[j] - e.uAddress;
+                const auto result = ((e.vAddress << 16) | uOffset);
+                fileOffsetField_.buffer_[j] = result;
+            }
+            WriteBgzfVector2(pbiFile_.get(), fileOffsetField_.buffer_);
+        }
+    }
+
+private:
+    // file info
+    std::string bamFilename_;
+    std::string pbiFilename_;
+    std::string tempFilename_;
+    std::unique_ptr<FILE, Utility::FileDeleter> tempFile_;
+    std::unique_ptr<BGZF, HtslibBgzfDeleter> pbiFile_;
+    PbiBuilder::CompressionLevel compressionLevel_;
+    size_t numThreads_;
+
+    // PBI field buffers
+    PbiField2<int32_t> rgIdField_;
+    PbiField2<int32_t> qStartField_;
+    PbiField2<int32_t> qEndField_;
+    PbiField2<int32_t> holeNumField_;
+    PbiField2<float> readQualField_;
+    PbiField2<uint8_t> ctxtField_;
+    PbiField2<uint64_t> fileOffsetField_;
+    PbiField2<int32_t> tIdField_;
+    PbiField2<uint32_t> tStartField_;
+    PbiField2<uint32_t> tEndField_;
+    PbiField2<uint32_t> aStartField_;
+    PbiField2<uint32_t> aEndField_;
+    PbiField2<uint8_t> revStrandField_;
+    PbiField2<uint32_t> nMField_;
+    PbiField2<uint32_t> nMMField_;
+    PbiField2<uint8_t> mapQualField_;
+    PbiField2<int16_t> bcForwardField_;
+    PbiField2<int16_t> bcReverseField_;
+    PbiField2<int8_t> bcQualField_;
+
+    // reference data
+    std::unique_ptr<PbiReferenceDataBuilder2> refDataBuilder_;
+
+    // tracking data
+    uint32_t currentRow_ = 0;
+    bool isClosed_ = false;
+    bool hasBarcodeData_ = false;
+    bool hasMappedData_ = false;
+};
+
+}  // namespace internal
+
+class IndexedBamWriter::IndexedBamWriterPrivate2  //: public internal::FileProducer
+{
+public:
+    // Opens the output BAM (& writes its header), starts the GZI thread that
+    // trails the BAM writer, then sets up the PBI builder.
+    //
+    // \throws std::runtime_error if the BAM or the PBI temp file cannot be set up
+    //
+    IndexedBamWriterPrivate2(const std::string& outputFilename, std::shared_ptr<bam_hdr_t> header,
+                             const BamWriter::CompressionLevel bamCompressionLevel,
+                             const size_t numBamThreads,
+                             const PbiBuilder::CompressionLevel pbiCompressionLevel,
+                             const size_t numPbiThreads, const size_t numGziThreads,
+                             const size_t tempFileBufferSize)
+        : bamFilename_{outputFilename}, header_{header}
+    {
+        OpenBam(bamCompressionLevel, numBamThreads);
+        OpenGzi(numGziThreads);
+
+        // The GZI thread is running from here on. If PBI setup throws, that thread
+        // has to be stopped & joined before the exception leaves the constructor:
+        // destroying a still-joinable std::thread member calls std::terminate.
+        try {
+            OpenPbi(pbiCompressionLevel, numPbiThreads, tempFileBufferSize);
+        } catch (...) {
+            CloseGzi();
+            throw;
+        }
+
+        isOpen_ = true;
+    }
+
+    // Closes the writer if it is still open. Any exception from Close() is
+    // swallowed here, so a failure while finalizing goes unreported on this path.
+    ~IndexedBamWriterPrivate2() noexcept
+    {
+        if (isOpen_) {
+            try {
+                Close();
+            } catch (...) {
+                // swallow any exceptions & remain no-throw from dtor
+            }
+        }
+    }
+
+    // Finishes all output files, then removes the intermediate "<bam>.gzi" file.
+    //
+    // The GZI thread is only told to finish (in CloseGzi) after the BAM has been
+    // flushed & released, & the PBI builder reads the .gzi file it dumps - hence
+    // the fixed order below.
+    //
+    void Close()
+    {
+        // NOTE: keep this order of closing ( BAM -> GZI -> PBI )
+        CloseBam();
+        CloseGzi();
+        ClosePbi();
+
+        remove(std::string{bamFilename_ + ".gzi"}.c_str());
+        isOpen_ = false;
+    }
+
+    // Flushes the BAM's BGZF stream, then releases the file handle. The result of
+    // the flush is deliberately not acted upon.
+    void CloseBam()
+    {
+        const auto ret = bgzf_flush(bam_.get()->fp.bgzf);
+        UNUSED(ret);
+        bam_.reset();
+    }
+
+    // Signals the GZI thread that writing is finished (via done_) & waits for it
+    // to exit.
+    void CloseGzi()
+    {
+        done_ = true;
+        gziThread_.join();
+
+        // TODO: remove GZI file, leaving now for debugging
+        // NOTE(review): Close() already removes the .gzi file after the PBI is
+        //               built, so this TODO looks stale.
+    }
+
+    // Finalizes the PBI by closing the builder.
+    void ClosePbi() { builder_->Close(); }
+
+    // Opens the output BAM, writes & flushes its header, & initializes the
+    // running uncompressed file position to the size of the header.
+    //
+    // \param compressionLevel  appended (as a number) to the "wb" htslib mode string
+    // \param numThreads        writer thread count; 0 means "use hardware concurrency"
+    //
+    // \throws std::runtime_error if the header is null, the file cannot be opened,
+    //         or the header cannot be written or flushed
+    //
+    void OpenBam(const BamWriter::CompressionLevel compressionLevel, const size_t numThreads)
+    {
+        if (!header_)
+            throw std::runtime_error{"IndexedBamWriter: null header provided for output file: " +
+                                     bamFilename_};
+
+        // open output BAM
+        const auto& usingFilename = bamFilename_;
+        const auto mode = std::string("wb") + std::to_string(static_cast<int>(compressionLevel));
+        bam_.reset(sam_open(usingFilename.c_str(), mode.c_str()));
+        if (!bam_)
+            throw std::runtime_error{"IndexedBamWriter: could not open file for writing: " +
+                                     usingFilename};
+
+        // maybe set multithreaded writing
+        size_t actualNumThreads = numThreads;
+        if (actualNumThreads == 0) {
+            actualNumThreads = std::thread::hardware_concurrency();
+
+            // if still unknown, default to single-threaded
+            if (actualNumThreads == 0) actualNumThreads = 1;
+        }
+        if (actualNumThreads > 1) hts_set_threads(bam_.get(), actualNumThreads);
+
+        // write header
+        auto ret = sam_hdr_write(bam_.get(), header_.get());
+        if (ret != 0)
+            throw std::runtime_error{"IndexedBamWriter: could not write header to file: " +
+                                     usingFilename};
+
+        // flush the header before any record is written; a failure here must not
+        // go unnoticed, as record offsets are tracked relative to the header's end
+        ret = bgzf_flush(bam_.get()->fp.bgzf);
+        if (ret != 0)
+            throw std::runtime_error{"IndexedBamWriter: could not flush header to file: " +
+                                     usingFilename};
+
+        // store file positions after header
+        //
+        // 12 fixed bytes plus the header text, then per reference: 8 fixed bytes
+        // plus its NUL-terminated name
+        auto headerLength = [](const bam_hdr_t* hdr) -> size_t {
+            const size_t textHeader = 12 + hdr->l_text;
+            size_t refHeader = 0;
+            for (int i = 0; i < hdr->n_targets; ++i) {
+                char* n = hdr->target_name[i];
+                refHeader += (8 + (strlen(n) + 1));
+            }
+            return textHeader + refHeader;
+        };
+        uncompressedFilePos_ = headerLength(header_.get());
+    }
+
+    // Starts the GZI thread, which runs RunGziThread() on this object.
+    //
+    // \param numThreads  thread count handed to RunGziThread(); 0 means "use
+    //                    hardware concurrency", falling back to 1 if unknown
+    //
+    void OpenGzi(size_t numThreads)
+    {
+        size_t threadCount = numThreads;
+        if (threadCount == 0) threadCount = std::thread::hardware_concurrency();
+        if (threadCount == 0) threadCount = 1;  // still unknown: single-threaded
+
+        gziThread_ = std::thread{&IndexedBamWriterPrivate2::RunGziThread, this, threadCount};
+    }
+
+    // Creates the PBI builder. The index is written to "<bamFilename>.pbi";
+    // \p fileBufferSize is passed through as the builder's per-field buffer size.
+    void OpenPbi(const PbiBuilder::CompressionLevel compressionLevel, const size_t numThreads,
+                 const size_t fileBufferSize)
+    {
+        builder_ = std::make_unique<internal::PbiBuilder2>(
+            bamFilename_, bamFilename_ + ".pbi", compressionLevel, numThreads, fileBufferSize);
+    }
+
+    // Body of the GZI thread (started in OpenGzi).
+    //
+    // Failures are reported by setting gziStatus_ & returning, rather than by
+    // throwing.
+    //
+    // NOTE(review): initBgzf below does throw std::runtime_error if the BAM cannot
+    //               be opened for reading; nothing in this function catches it,
+    //               so on this thread it would presumably end in std::terminate.
+    //
+    void RunGziThread(size_t numThreads)
+    {
+        //
+        // This thread is the GZI index-enabled reader that trails the writer
+        // thread(s). It checks for changes in the output BAM's file size &
+        // reads whatever data is available. When writing is complete, it reads
+        // anything that might remain & dumps the GZI index contents to disk.
+        // This index is used downstream to generate records' "virtual offsets".
+        //
+
+        const auto& bamFilename = bamFilename_;
+        std::unique_ptr<BGZF, HtslibBgzfDeleter> bgzf;
+
+        struct stat st;
+        int ret = 0;
+        int64_t lastFileSize = 0;
+        int64_t numBytesRead = 0;
+
+        // opens the BAM for reading, with index building enabled
+        auto initBgzf = [&bgzf, &bamFilename, numThreads]() {
+            bgzf.reset(bgzf_open(bamFilename.c_str(), "rb"));
+            if (!bgzf)
+                throw std::runtime_error{
+                    "IndexedBamWriter: could not open BAM for 'toy train' reading"};
+            bgzf_index_build_init(bgzf.get());
+            if (numThreads > 1) bgzf_mt(bgzf.get(), numThreads, 256);
+        };
+
+        // main thread loop
+        while (true) {
+            // Quit if writer thread(s) are finished.
+            if (done_) break;
+
+            if (stat(bamFilename.c_str(), &st) != 0) {
+                gziStatus_ = GziStatus::MISC_ERROR;
+                return;
+            }
+            // no growth since last check: wait a bit, then poll again
+            if (st.st_size > lastFileSize) {
+                lastFileSize = st.st_size;
+            } else {
+                std::this_thread::sleep_for(std::chrono::milliseconds(100));
+                continue;
+            }
+
+            // Don't read unless we can guarantee we won't catch up to the end of the file.
+            // Otherwise htslib will think the file has been truncated and throw errors.
+            // This is a touch tricky because we're reading in multi-thread mode so htslib
+            // will speculatively start grabbing blocks.  So we're going to stay *well* behind
+            // This needs be made more robust.
+            //
+            // Note: It's worth noting that bgzf->block_clength might only be the length of the
+            // compressed *payload*, meaning if there is any other header/metadata/etc on disk
+            // in the actual file, our estimation of our trailing distance might be off.  If
+            // this ever starts throwing exceptions we'll have to look more in to this...
+            while (lastFileSize - numBytesRead > 100 * BGZF_MAX_BLOCK_SIZE) {
+                // Open BAM reader if not already open.  Need to make sure we don't open it
+                // until we've already established our trailing distance.
+                if (!bgzf) initBgzf();
+
+                auto result = bgzf_read_block(bgzf.get());
+                if (result != 0) {
+                    gziStatus_ = GziStatus::IO_ERROR;
+                    return;
+                }
+                // an empty block here means we caught up with the writer after all
+                if (bgzf->block_length == 0) {
+                    gziStatus_ = GziStatus::TRAIL_ERROR;
+                    return;
+                }
+                numBytesRead += bgzf->block_clength;
+            }
+
+            // Only update if things have appreciably fallen behind
+            if (lastFileSize - numBytesRead > 1.10 * maxTrailingDistance_)
+                maxTrailingDistance_ = lastFileSize - numBytesRead;
+        }
+
+        // Try to open BAM if it wasn't opened in main loop.
+        if (!bgzf) initBgzf();
+
+        // Read any remaining data.
+        while (true) {
+            auto result = bgzf_read_block(bgzf.get());
+            if (result != 0) {
+                gziStatus_ = GziStatus::IO_ERROR;
+                return;
+            }
+            if (bgzf->block_length == 0) break;
+        }
+
+        // Dump GZI contents to disk.
+        const std::string gziFn{bamFilename_ + ".gzi"};
+        ret = bgzf_index_dump(bgzf.get(), gziFn.c_str(), nullptr);
+        if (ret != 0) gziStatus_ = GziStatus::GZI_ERROR;
+    }
+
+    void Write(const BamRecord& record)
+    {
+        // Indexes and writes one record; throws std::runtime_error if the BAM write fails or
+        // (once) if the gzi thread has recorded an error.
+// TODO: add API to auto-skip this without special compile flag
+#if PBBAM_AUTOVALIDATE
+        Validator::Validate(record);
+#endif
+        // Add the record & its offset to the index builder.
+        // NOTE: This is the record's position as if it were _uncompressed_. We
+        //       will return with GZI data later to transform it into BAM
+        //       "virtual offset".
+        builder_->AddRecord(record, uncompressedFilePos_);
+
+        const auto rawRecord = BamRecordMemory::GetRawData(record);
+
+        // update bin
+        // min_shift=14 & n_lvls=5 are BAM "magic numbers"
+        rawRecord->core.bin = hts_reg2bin(rawRecord->core.pos, bam_endpos(rawRecord.get()), 14, 5);
+
+        // write record to file
+        const auto ret = sam_write1(bam_.get(), header_.get(), rawRecord.get());
+        if (ret <= 0) throw std::runtime_error{"IndexedBamWriter: could not write record to BAM"};
+
+        // update file position
+        auto recordLength = [](bam1_t* b) {
+            auto* c = &b->core;
+
+            static constexpr size_t fixedLength = 36;
+            const size_t qnameLength = (c->l_qname - c->l_extranul);
+
+            // TODO: long CIGAR handling... sigh...
+
+            size_t remainingLength = 0;
+            if (c->n_cigar <= 0xffff)
+                remainingLength = (b->l_data - c->l_qname);
+            else {
+                const size_t cigarEnd = ((uint8_t*)bam_get_cigar(b) - b->data) + (c->n_cigar * 4);
+                remainingLength = 8 + (b->l_data - cigarEnd) + 4 + (4 * c->n_cigar);
+            }
+
+            return fixedLength + qnameLength + remainingLength;
+        };
+        uncompressedFilePos_ += recordLength(rawRecord.get());
+
+        // Surface any error recorded by the gzi thread, since it's not set up to throw
+        // without terminating the program. Flag it DEAD first so it is reported only once.
+        const auto gstatus = gziStatus_.load();
+        if (gstatus != GziStatus::GOOD) {
+            gziStatus_.store(GziStatus::DEAD);
+            if (gstatus == GziStatus::IO_ERROR)
+                throw std::runtime_error(
+                    "IndexedBamWriter: error in gzi thread reading from BAM file " + bamFilename_);
+            if (gstatus == GziStatus::TRAIL_ERROR)
+                throw std::runtime_error(
+                    "IndexedBamWriter: gzi reader thread failed to properly trail when reading " +
+                    bamFilename_);
+            if (gstatus == GziStatus::GZI_ERROR)
+                throw std::runtime_error(
+                    "IndexedBamWriter: could not dump GZI contents for indexing " + bamFilename_);
+            if (gstatus == GziStatus::MISC_ERROR)
+                throw std::runtime_error("IndexedBamWriter: error computing index file for " +
+                                         bamFilename_);
+        }
+    }
+
+    size_t MaxReaderLag() const { return maxTrailingDistance_; }
+
+private:
+    std::string bamFilename_;
+
+    std::shared_ptr<bam_hdr_t> header_;
+    std::unique_ptr<samFile, HtslibFileDeleter> bam_;
+    std::unique_ptr<internal::PbiBuilder2> builder_;
+
+    // used as a type of error return code for the gziThread, so
+    // that errors are delayed until at least the bam file is
+    // safely written to disk
+    enum class GziStatus
+    {
+        GOOD,
+        IO_ERROR,
+        TRAIL_ERROR,
+        GZI_ERROR,
+        MISC_ERROR,
+        // There was an error, but we've bubbled up the
+        // information already
+        DEAD
+    };
+    std::atomic<GziStatus> gziStatus_{GziStatus::GOOD};
+    std::thread gziThread_;
+
+    bool isOpen_ = false;
+
+    std::atomic<bool> done_{false};
+    std::atomic<size_t> maxTrailingDistance_{0};
+
+    int64_t uncompressedFilePos_ = 0;
+};
+
+static_assert(!std::is_copy_constructible<IndexedBamWriter>::value,
+              "IndexedBamWriter(const IndexedBamWriter&) is not = delete");
+static_assert(!std::is_copy_assignable<IndexedBamWriter>::value,
+              "IndexedBamWriter& operator=(const IndexedBamWriter&) is not = delete");
+
+IndexedBamWriter::IndexedBamWriter(const std::string& outputFilename, const BamHeader& header,
+                                   const BamWriter::CompressionLevel bamCompressionLevel,
+                                   const size_t numBamThreads,
+                                   const PbiBuilder::CompressionLevel pbiCompressionLevel,
+                                   const size_t numPbiThreads, const size_t numGziThreads,
+                                   const size_t tempFileBufferSize)
+    : IRecordWriter(), d_{nullptr}
+{
+    if (tempFileBufferSize % 8 != 0)
+        throw std::runtime_error{"IndexedBamWriter: invalid buffer size for PBI builder (" +
+                                 std::to_string(tempFileBufferSize) +
+                                 "). Must be a multiple of 8."};
+
+#if PBBAM_AUTOVALIDATE
+    Validator::Validate(header);
+#endif
+    d_ = std::make_unique<IndexedBamWriterPrivate2>(
+        outputFilename, BamHeaderMemory::MakeRawHeader(header), bamCompressionLevel, numBamThreads,
+        pbiCompressionLevel, numPbiThreads, numGziThreads, tempFileBufferSize);
+}
+
+IndexedBamWriter::IndexedBamWriter(IndexedBamWriter&&) noexcept = default;
+
+IndexedBamWriter& IndexedBamWriter::operator=(IndexedBamWriter&&) noexcept = default;
+
+IndexedBamWriter::~IndexedBamWriter() = default;
+
+void IndexedBamWriter::TryFlush() {}  // ignore
+
+void IndexedBamWriter::Write(const BamRecord& record) { d_->Write(record); }
+
+void IndexedBamWriter::Write(const BamRecordImpl& record) { d_->Write(BamRecord{record}); }
+
+size_t IndexedBamWriter::MaxReaderLag() const { return d_->MaxReaderLag(); }
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/IndexedFastaReader.cpp b/src/IndexedFastaReader.cpp

new file mode 100644 (file)

index 0000000..0d6b6f0
--- /dev/null
+++ b/src/IndexedFastaReader.cpp
@@ -0,0 +1,208 @@
+// File Description
+/// \file IndexedFastaReader.cpp
+/// \brief Implements the IndexedFastaReader class.
+//
+// Author: David Alexander
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/IndexedFastaReader.h"
+
+#include <memory>
+#include <sstream>
+#include <stdexcept>
+
+#include <htslib/faidx.h>
+#include <pbcopper/utility/Deleters.h>
+
+#include "pbbam/BamRecord.h"
+#include "pbbam/GenomicInterval.h"
+#include "pbbam/Orientation.h"
+#include "pbbam/StringUtilities.h"
+
+#include "MemoryUtils.h"
+#include "SequenceUtils.h"
+
+namespace PacBio {
+namespace BAM {
+
+namespace {
+
+void ClipAndGapify(std::string& subseq, const Cigar& cigar, bool exciseSoftClips)
+{
+    // Walks the CIGAR and edits 'subseq' in place: '-' runs are inserted for insertions and
+    // for retained soft clips, '*' runs for padding; other non-clip ops only advance the cursor.
+    const auto insertRun = [&subseq](const size_t pos, const size_t count, const char fill) {
+        subseq.reserve(subseq.size() + count);
+        subseq.insert(pos, count, fill);
+    };
+
+    size_t cursor = 0;
+    for (const auto& op : cigar) {
+        const auto kind = op.Type();
+        const auto span = op.Length();
+
+        // hard clips contribute nothing
+        if (kind == CigarOperationType::HARD_CLIP) continue;
+
+        // soft clips: dropped when excised, otherwise shown as a gap run
+        if (kind == CigarOperationType::SOFT_CLIP) {
+            if (exciseSoftClips) continue;
+            insertRun(cursor, span, '-');
+            cursor += span;
+            continue;
+        }
+
+        // insertions become gaps, padding becomes '*'
+        if (kind == CigarOperationType::INSERTION)
+            insertRun(cursor, span, '-');
+        else if (kind == CigarOperationType::PADDING)
+            insertRun(cursor, span, '*');
+
+        // every non-clipping operation advances the cursor
+        cursor += span;
+    }
+}
+
+}  // namespace
+
+class IndexedFastaReader::IndexedFastaReaderPrivate
+{
+public:
+    IndexedFastaReaderPrivate(std::string filename)
+        : fastaFilename_{std::move(filename)}, faiFilename_{fastaFilename_ + ".fai"}
+    {
+        handle_.reset(fai_load(fastaFilename_.c_str()));
+        if (handle_ == nullptr) {
+            throw std::runtime_error{
+                "IndexedFastaReader: could not open index file (*.fai) for FASTA file: " +
+                fastaFilename_};
+        }
+    }
+
+    std::string fastaFilename_;
+    std::string faiFilename_;
+    std::unique_ptr<faidx_t, HtslibFastaIndexDeleter> handle_;
+};
+
+IndexedFastaReader::IndexedFastaReader(std::string filename)
+    : d_{std::make_unique<IndexedFastaReaderPrivate>(std::move(filename))}
+{
+}
+
+IndexedFastaReader::IndexedFastaReader(const IndexedFastaReader& other)
+    : d_{std::make_unique<IndexedFastaReaderPrivate>(other.d_->fastaFilename_)}
+{
+}
+
+IndexedFastaReader::IndexedFastaReader(IndexedFastaReader&&) noexcept = default;
+
+IndexedFastaReader& IndexedFastaReader::operator=(const IndexedFastaReader& rhs)
+{
+    if (this != &rhs) *this = IndexedFastaReader{rhs};
+    return *this;
+}
+
+IndexedFastaReader& IndexedFastaReader::operator=(IndexedFastaReader&&) noexcept = default;
+
+IndexedFastaReader::~IndexedFastaReader() = default;
+
+std::string IndexedFastaReader::Subsequence(const std::string& id, Position begin,
+                                            Position end) const
+{
+    assert(begin <= end);
+    // Returns the bases of 'id' in the half-open interval [begin, end); throws std::runtime_error
+    // if htslib cannot fetch the region. Empty intervals are answered here because a call to
+    // faidx_fetch_seq will *always* return a sequence consisting of at least one base.
+    if (begin == end) return std::string{};
+
+    int len = 0;  // out-parameter for htslib; initialized like the region-string overload
+    // NOTE: htslib appears to treat "end" as inclusive in faidx_fetch_seq, whereas it
+    //       considers it exclusive in the region spec in fai_fetch; hence "end - 1" below.
+    //       (Flagged by the original author as still needing verification.)
+    const std::unique_ptr<char, Utility::FreeDeleter> rawSeq{
+        faidx_fetch_seq(d_->handle_.get(), id.c_str(), begin, end - 1, &len)};
+    if (rawSeq == nullptr) {
+        std::ostringstream s;
+        s << "IndexedFastaReader: could not fetch sequence from region: " << id << " [" << begin
+          << ", " << end << ") in FASTA file: " << d_->fastaFilename_;
+        throw std::runtime_error{s.str()};
+    }
+    return RemoveAllWhitespace(rawSeq.get());
+}
+
+std::string IndexedFastaReader::Subsequence(const Data::GenomicInterval& interval) const
+{
+    return Subsequence(interval.Name(), interval.Start(), interval.Stop());
+}
+
+std::string IndexedFastaReader::Subsequence(const char* htslibRegion) const
+{
+    int len = 0;
+    const std::unique_ptr<char, Utility::FreeDeleter> rawSeq(
+        fai_fetch(d_->handle_.get(), htslibRegion, &len));
+    if (rawSeq == nullptr) {
+        throw std::runtime_error{"IndexedFastaReader: could not fetch sequence from region: " +
+                                 std::string{htslibRegion} + " in FASTA file: " +
+                                 d_->fastaFilename_};
+    }
+    return RemoveAllWhitespace(rawSeq.get());
+}
+
+std::string IndexedFastaReader::ReferenceSubsequence(const BamRecord& bamRecord,
+                                                     const Orientation orientation,
+                                                     const bool gapped,
+                                                     const bool exciseSoftClips) const
+{
+    std::string subseq = Subsequence(bamRecord.ReferenceName(), bamRecord.ReferenceStart(),
+                                     bamRecord.ReferenceEnd());
+
+    if (bamRecord.Impl().IsMapped() && gapped)
+        ClipAndGapify(subseq, bamRecord.Impl().CigarData(), exciseSoftClips);
+
+    const auto reverse =
+        (orientation != Orientation::GENOMIC) && bamRecord.Impl().IsReverseStrand();
+    if (reverse) ReverseComplementCaseSens(subseq);
+
+    return subseq;
+}
+
+int IndexedFastaReader::NumSequences() const { return faidx_nseq(d_->handle_.get()); }
+
+std::vector<std::string> IndexedFastaReader::Names() const
+{
+    std::vector<std::string> names;
+    names.reserve(NumSequences());
+    for (int i = 0; i < NumSequences(); ++i)
+        names.emplace_back(faidx_iseq(d_->handle_.get(), i));
+    return names;
+}
+
+std::string IndexedFastaReader::Name(const size_t idx) const
+{
+    if (NumSequences() < 0 || idx >= static_cast<size_t>(NumSequences())) {  // no int wrap-around
+        std::ostringstream s;
+        s << "IndexedFastaReader: cannot fetch sequence name. Index (" << idx
+          << ") is larger than the number of sequences: (" << NumSequences()
+          << ") in FASTA file: " << d_->fastaFilename_;
+        throw std::runtime_error{s.str()};
+    }
+    return {faidx_iseq(d_->handle_.get(), static_cast<int>(idx))};
+}
+
+bool IndexedFastaReader::HasSequence(const std::string& name) const
+{
+    return (faidx_has_seq(d_->handle_.get(), name.c_str()) != 0);
+}
+
+int IndexedFastaReader::SequenceLength(const std::string& name) const
+{
+    const auto len = faidx_seq_len(d_->handle_.get(), name.c_str());
+    if (len < 0) {
+        throw std::runtime_error{"IndexedFastaReader: could not determine sequence length of " +
+                                 name + " in FASTA file: " + d_->fastaFilename_};
+    }
+    return len;
+}
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/IndexedFastqBgzfReader.cpp b/src/IndexedFastqBgzfReader.cpp

new file mode 100644 (file)

index 0000000..f363d17
--- /dev/null
+++ b/src/IndexedFastqBgzfReader.cpp
@@ -0,0 +1,141 @@
+// File Description
+/// \file IndexedFastqBgzfReader.cpp
+/// \brief Implements the IndexedFastqBgzfReader class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "IndexedFastqBgzfReader.h"
+
+#include <algorithm>
+#include <sstream>
+#include <stdexcept>
+
+namespace PacBio {
+namespace BAM {
+
+IndexedFastqBgzfReader::IndexedFastqBgzfReader(std::string filename)
+    : IndexedFastqReaderImpl{std::move(filename)}
+    , file_{bgzf_open(fastqFilename_.c_str(), "r")}
+    , seq_{kseq_init(file_.get())}
+{
+    // check BGZF file handle
+    if (file_ == nullptr) {
+        std::ostringstream msg;
+        msg << "IndexedFastqBgzfReader: could not open file for reading\n"
+            << "  FASTQ file: " << fastqFilename_ << '\n';
+        throw std::runtime_error{msg.str()};
+    }
+
+    // check kseq sequence handle
+    assert(seq_ != nullptr);
+
+    // load BGZF index data (*.gzi)
+    const auto result = bgzf_index_load(file_.get(), fastqFilename_.c_str(), ".gzi");
+    if (result != 0) {
+        std::ostringstream msg;
+        msg << "IndexedFastqBgzfReader: could not load bgzf index data\n"
+            << "  FASTQ file: " << fastqFilename_ << '\n'
+            << "  index file: " << fastqFilename_ << ".gzi\n";
+        throw std::runtime_error{msg.str()};
+    }
+}
+
+int IndexedFastqBgzfReader::FetchRecord()
+{
+    // NOTE: This is kseq_read without the name/comment scan. kseq_read assumes it starts at the
+    //       next sequence's name, but after seeking via FAI the handle already points to the
+    //       first base. Returns the sequence length, or -2 on a missing/mismatched quality string.
+
+    int c;
+    kseq_t* seq = seq_.get();
+    kstream_t* ks = seq->f;
+    seq_->comment.l = seq_->seq.l = seq_->qual.l = 0; /* reset all members */
+    if (seq_->seq.s == 0) { /* we can do this in the loop below, but that is slower */
+        seq_->seq.m = 256;
+        seq_->seq.s = (char*)malloc(seq_->seq.m);
+    }
+    while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') {
+        if (c == '\n') continue;        /* skip empty lines */
+        seq_->seq.s[seq_->seq.l++] = c; /* this is safe: we always have enough space for 1 char */
+        ks_getuntil2(ks, KS_SEP_LINE, &seq_->seq, 0, 1); /* read the rest of the line */
+    }
+
+    if (c == '>' || c == '@') seq_->last_char = c; /* the first header char has been read */
+    if (seq_->seq.l + 1 >=
+        seq_->seq.m) { /* seq_->seq.s[seq_->seq.l] below may be out of boundary */
+        seq_->seq.m = seq_->seq.l + 2;
+        kroundup32(seq_->seq.m); /* rounded to the next closest 2^k */
+        seq_->seq.s = (char*)realloc(seq_->seq.s, seq_->seq.m);
+    }
+    seq_->seq.s[seq_->seq.l] = 0;     /* null terminated string */
+    if (c != '+') return seq_->seq.l; /* no '+' separator line: bases only (FASTA) */
+    if (seq_->qual.m < seq_->seq.m) { /* allocate memory for qual in case insufficient */
+        seq_->qual.m = seq_->seq.m;
+        seq_->qual.s = (char*)realloc(seq_->qual.s, seq_->qual.m);
+    }
+
+    while ((c = ks_getc(ks)) != -1 && c != '\n')
+        ;                   /* skip the rest of '+' line */
+    if (c == -1) return -2; /* error: no quality string */
+    while (ks_getuntil2(ks, KS_SEP_LINE, &seq_->qual, 0, 1) >= 0 && seq_->qual.l < seq_->seq.l)
+        ;                /* keep appending quality lines until qual is as long as seq */
+    seq_->last_char = 0; /* we have not come to the next header line */
+
+    if (seq_->seq.l != seq_->qual.l) return -2; /* error: qual string is of a different length */
+    return seq_->seq.l;
+}
+
+std::pair<std::string, QualityValues> IndexedFastqBgzfReader::Subsequence(const std::string& id,
+                                                                          Position start,
+                                                                          Position end)
+{
+    // check requested region is valid
+    const auto& entry = index_.Entry(id);
+    const int64_t available = static_cast<int64_t>(entry.Length) - start;
+    const int64_t requested = end - start;
+    const int64_t length = std::min(available, requested);
+    if ((start < 0) || (end < 0) || (length < 0)) {
+        std::ostringstream msg;
+        msg << "IndexedFastqBgzfReader: invalid subsequence region requested from\n"
+            << "  FASTQ file: " << fastqFilename_ << '\n'
+            << "  requested region: " << id << ':' << start << '-' << end << '\n'
+            << "  sequence length:  " << entry.Length << '\n';
+        throw std::runtime_error{msg.str()};
+    }
+
+    // quick out if nothing needed
+    if (length == 0) return {};
+
+    // seek to sequence 'id' & reset kseq handle
+    auto result = bgzf_useek(file_.get(), entry.SeqOffset, SEEK_SET);
+    if (result != 0) {
+        std::ostringstream msg;
+        msg << "IndexedFastqBgzfReader: could not seek to requested region\n"
+            << "  FASTQ file: " << fastqFilename_ << '\n'
+            << "  requested region: " << id << ':' << start << '-' << end << '\n';
+        throw std::runtime_error{msg.str()};
+    }
+    ks_rewind(seq_->f);
+
+    // read (entire) sequence
+    result = FetchRecord();
+    if (result < 0) {
+        std::ostringstream msg;
+        msg << "IndexedFastqBgzfReader: error reading from\n"
+            << "  FASTQ file: " << fastqFilename_ << '\n'
+            << "  requested region: " << id << ':' << start << '-' << end << '\n'
+            << "  reason: likely truncated quality string\n";
+        throw std::runtime_error{msg.str()};
+    }
+
+    // trim to region bounds and return
+    const std::string seq{seq_->seq.s, seq_->seq.l};
+    const std::string quals{seq_->qual.s, seq_->qual.l};
+    return std::make_pair(seq.substr(start, length),
+                          QualityValues::FromFastq(quals.substr(start, length)));
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/IndexedFastqBgzfReader.h b/src/IndexedFastqBgzfReader.h

new file mode 100644 (file)

index 0000000..3ec294e
--- /dev/null
+++ b/src/IndexedFastqBgzfReader.h
@@ -0,0 +1,52 @@
+// File Description
+/// \file IndexedFastqBgzfReader.h
+/// \brief Defines the IndexedFastqBgzfReader class.
+//
+// Author: Derek Barnett
+
+#ifndef INDEXEDFASTQBGZFREADER_H
+#define INDEXEDFASTQBGZFREADER_H
+
+#include "pbbam/Config.h"
+
+#include "IndexedFastqReaderImpl.h"
+
+#include <memory>
+
+#include <htslib/kseq.h>
+
+#include "MemoryUtils.h"
+
+namespace PacBio {
+namespace BAM {
+
+class IndexedFastqBgzfReader final : public IndexedFastqReaderImpl
+{
+public:
+    IndexedFastqBgzfReader(std::string filename);
+
+    std::pair<std::string, QualityValues> Subsequence(const std::string& id, Position start,
+                                                      Position end) final;
+
+private:
+    int FetchRecord();
+
+    // specialize kseq_t for BGZF handle
+    KSEQ_INIT(BGZF*, bgzf_read);
+    struct KSeqDeleter
+    {
+        void operator()(kseq_t* seq) const
+        {
+            // 'seq' is a by-value copy, so there is nothing to reset after destroying it.
+            if (seq) kseq_destroy(seq);
+        }
+    };
+
+    std::unique_ptr<BGZF, HtslibBgzfDeleter> file_;  // must precede seq_ (built from file_.get())
+    std::unique_ptr<kseq_t, KSeqDeleter> seq_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // INDEXEDFASTQBGZFREADER_H
+\ No newline at end of file
diff --git a/src/IndexedFastqReader.cpp b/src/IndexedFastqReader.cpp

new file mode 100644 (file)

index 0000000..d59cdad
--- /dev/null
+++ b/src/IndexedFastqReader.cpp
@@ -0,0 +1,198 @@
+// File Description
+/// \file IndexedFastqReader.cpp
+/// \brief Implements the IndexedFastqReader class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/IndexedFastqReader.h"
+
+#include <sstream>
+#include <stdexcept>
+#include <utility>
+
+#include "pbbam/BamRecord.h"
+#include "pbbam/FaiIndex.h"
+#include "pbbam/FormatUtils.h"
+#include "pbbam/GenomicInterval.h"
+
+#include "IndexedFastqBgzfReader.h"
+#include "IndexedFastqReaderImpl.h"
+#include "IndexedFastqTextReader.h"
+#include "SequenceUtils.h"
+
+namespace PacBio {
+namespace BAM {
+
+namespace {
+
+void ClipAndGapify(std::pair<std::string, QualityValues>& seqQual, const Cigar& cigar,
+                   bool exciseSoftClips)
+{
+    // Applies 'cigar' to a fetched (bases, qualities) pair in place: gap/padding characters go
+    // into the bases and a null quality goes into the qualities at the same index. Qualities
+    // are edited in their FASTQ-encoded form and converted back at the end.
+    auto& subseq = seqQual.first;
+    auto subQual = seqQual.second.Fastq();
+
+    const char nullQual = QualityValue{0}.Fastq();
+    // Insert 'count' copies of 'base' / nullQual at 'pos'; each buffer reserves from its own size.
+    const auto insertRun = [&](const size_t pos, const size_t count, const char base) {
+        subseq.reserve(subseq.size() + count);
+        subseq.insert(pos, count, base);
+        subQual.reserve(subQual.size() + count);
+        subQual.insert(pos, count, nullQual);
+    };
+
+    size_t seqIndex = 0;
+    for (const auto& op : cigar) {
+        const auto type = op.Type();
+        const auto opLength = op.Length();
+
+        // do nothing for hard clips
+        if (type == CigarOperationType::HARD_CLIP) continue;
+
+        // maybe remove soft clips
+        if (type == CigarOperationType::SOFT_CLIP) {
+            if (!exciseSoftClips) {
+                insertRun(seqIndex, opLength, '-');
+                seqIndex += opLength;
+            }
+        }
+
+        // for non-clipping operations
+        else {
+            // maybe add gaps/padding
+            if (type == CigarOperationType::INSERTION)
+                insertRun(seqIndex, opLength, '-');
+            else if (type == CigarOperationType::PADDING)
+                insertRun(seqIndex, opLength, '*');
+
+            // update index
+            seqIndex += opLength;
+        }
+    }
+
+    seqQual.second = QualityValues::FromFastq(subQual);
+}
+
+std::unique_ptr<IndexedFastqReaderImpl> MakeReaderImpl(std::string filename)
+{
+    // validate extension
+    if (!FormatUtils::IsFastqFilename(filename)) {
+        throw std::runtime_error{"IndexedFastqReader: filename '" + filename +
+                                 "' is not recognized as a FASTQ file."};
+    }
+
+    // determine subsequence "loader" from compression type: plain-text, bgzf, or unsupported
+    const auto compressionType = FormatUtils::CompressionType(filename);
+    switch (compressionType) {
+
+        case HtslibCompression::NONE:
+            return std::make_unique<IndexedFastqTextReader>(std::move(filename));
+        case HtslibCompression::BGZIP:
+            return std::make_unique<IndexedFastqBgzfReader>(std::move(filename));
+
+        case HtslibCompression::GZIP: {
+            std::ostringstream msg;
+            msg << "IndexedFastqReader: random-access is not supported for plain gzipped "
+                   "file "
+                << filename << "\n\n"
+                << "Compressed files must be bgzipped, with accompanying *.gzi "
+                   "index.\n\n"
+                << "To keep the original gzipped file unchanged:\n"
+                << "  $ gunzip -c " << filename << " > <unzipped_file>\n"
+                << "or discard the gzipped file:\n"
+                << "  $ gunzip " << filename << '\n'
+                << '\n'
+                << "Re-compress & create *.gzi index:\n"
+                << "  $ bgzip --index <unzipped_file>\n\n";
+            throw std::runtime_error{msg.str()};
+        }
+        default:
+            assert(false);  // should never get here, the way htslib currently determines type
+            throw std::runtime_error{
+                "IndexedFastqReader: could not determine compression type for file: " + filename};
+    }
+}
+
+}  // namespace
+
+IndexedFastqReader::IndexedFastqReader(std::string filename)
+    : d_{MakeReaderImpl(std::move(filename))}
+{
+}
+
+IndexedFastqReader::IndexedFastqReader(const IndexedFastqReader& other)
+    : d_{MakeReaderImpl(other.d_->fastqFilename_)}  // reopen the FASTQ itself, not its *.fai
+{
+}
+
+IndexedFastqReader::IndexedFastqReader(IndexedFastqReader&&) noexcept = default;
+
+IndexedFastqReader& IndexedFastqReader::operator=(const IndexedFastqReader& rhs)
+{
+    if (this != &rhs) *this = IndexedFastqReader{rhs};
+    return *this;
+}
+
+IndexedFastqReader& IndexedFastqReader::operator=(IndexedFastqReader&&) noexcept = default;
+
+IndexedFastqReader::~IndexedFastqReader() = default;
+
+bool IndexedFastqReader::HasSequence(const std::string& name) const
+{
+    return d_->index_.HasEntry(name);
+}
+
+std::vector<std::string> IndexedFastqReader::Names() const { return d_->index_.Names(); }
+
+std::string IndexedFastqReader::Name(const size_t idx) const { return d_->index_.Names().at(idx); }
+
+int IndexedFastqReader::NumSequences() const { return d_->index_.Names().size(); }
+
+std::pair<std::string, QualityValues> IndexedFastqReader::ReferenceSubsequence(
+    const BamRecord& bamRecord, const Orientation orientation, const bool gapped,
+    const bool exciseSoftClips)
+{
+    // fetch raw data for record's region
+    auto seqQual = Subsequence(bamRecord.ReferenceName(), bamRecord.ReferenceStart(),
+                               bamRecord.ReferenceEnd());
+
+    // maybe clip/gapify
+    if (bamRecord.Impl().IsMapped() && gapped) {
+        ClipAndGapify(seqQual, bamRecord.Impl().CigarData(), exciseSoftClips);
+    }
+
+    // maybe reverse
+    const auto reverse =
+        (orientation != Orientation::GENOMIC) && bamRecord.Impl().IsReverseStrand();
+    if (reverse) {
+        ReverseComplementCaseSens(seqQual.first);
+        Reverse(seqQual.second);
+    }
+
+    return seqQual;
+}
+
+int IndexedFastqReader::SequenceLength(const std::string& name) const
+{
+    const auto& entry = d_->index_.Entry(name);
+    return static_cast<int>(entry.Length);
+}
+
+std::pair<std::string, QualityValues> IndexedFastqReader::Subsequence(const std::string& id,
+                                                                      Position start, Position end)
+{
+    return d_->Subsequence(id, start, end);
+}
+
+std::pair<std::string, QualityValues> IndexedFastqReader::Subsequence(
+    const GenomicInterval& interval)
+{
+    return Subsequence(interval.Name(), interval.Start(), interval.Stop());
+}
+
+}  // namespace BAM
+}  // namespace PacBio
+\ No newline at end of file
diff --git a/src/IndexedFastqReaderImpl.cpp b/src/IndexedFastqReaderImpl.cpp

new file mode 100644 (file)

index 0000000..3b4a2ef
--- /dev/null
+++ b/src/IndexedFastqReaderImpl.cpp
@@ -0,0 +1,24 @@
+// File Description
+/// \file IndexedFastqReaderImpl.cpp
+/// \brief Implements the IndexedFastqReaderImpl class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "IndexedFastqReaderImpl.h"
+
+namespace PacBio {
+namespace BAM {
+
+IndexedFastqReaderImpl::IndexedFastqReaderImpl(std::string filename)
+    : fastqFilename_{std::move(filename)}
+    , faiFilename_{fastqFilename_ + ".fai"}
+    , index_{faiFilename_}
+{
+}
+
+IndexedFastqReaderImpl::~IndexedFastqReaderImpl() = default;
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/IndexedFastqReaderImpl.h b/src/IndexedFastqReaderImpl.h

new file mode 100644 (file)

index 0000000..da1e616
--- /dev/null
+++ b/src/IndexedFastqReaderImpl.h
@@ -0,0 +1,43 @@
+// File Description
+/// \file IndexedFastqReaderImpl.h
+/// \brief Defines the IndexedFastqReaderImpl class.
+//
+// Author: Derek Barnett
+
+#ifndef INDEXEDFASTQREADERIMPL_H
+#define INDEXEDFASTQREADERIMPL_H
+
+#include "pbbam/Config.h"
+
+#include <string>
+#include <utility>
+
+#include <pbcopper/data/Position.h>
+#include <pbcopper/data/QualityValues.h>
+
+#include "pbbam/FaiIndex.h"
+
+namespace PacBio {
+namespace BAM {
+
+class IndexedFastqReaderImpl
+{
+public:
+    virtual ~IndexedFastqReaderImpl();
+
+    virtual std::pair<std::string, Data::QualityValues> Subsequence(const std::string& id,
+                                                                    Data::Position start,
+                                                                    Data::Position end) = 0;
+
+    std::string fastqFilename_;
+    std::string faiFilename_;
+    FaiIndex index_;
+
+protected:
+    IndexedFastqReaderImpl(std::string filename);
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // INDEXEDFASTQREADERIMPL_H
+\ No newline at end of file
diff --git a/src/IndexedFastqTextReader.cpp b/src/IndexedFastqTextReader.cpp

new file mode 100644 (file)

index 0000000..4b4e23f
--- /dev/null
+++ b/src/IndexedFastqTextReader.cpp
@@ -0,0 +1,142 @@
+// File Description
+/// \file IndexedFastqTextReader.cpp
+/// \brief Implements the IndexedFastqTextReader class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "IndexedFastqTextReader.h"
+
+#include <unistd.h>
+#include <cassert>
+#include <cstdio>
+
+#include <iostream>
+#include <memory>
+#include <sstream>
+#include <stdexcept>
+
+namespace PacBio {
+namespace BAM {
+
+IndexedFastqTextReader::IndexedFastqTextReader(std::string filename)
+    : IndexedFastqReaderImpl{std::move(filename)}
+    , file_{fopen(fastqFilename_.c_str(), "r")}
+    , seq_{kseq_init(file_.get())}
+{
+    // check file handle
+    if (file_ == nullptr) {
+        std::ostringstream msg;
+        msg << "IndexedFastqTextReader: could not open file for reading\n"
+            << "  FASTQ file: " << fastqFilename_ << '\n';
+        throw std::runtime_error{msg.str()};
+    }
+
+    // check kseq sequence handle
+    assert(seq_ != nullptr);
+}
+
+/// \brief Reads one record's bases (and qualities, if present) starting at the
+///        current stream position, storing them in seq_.
+///
+/// \returns the sequence length on success, or -2 if the quality string is
+///          missing or its length differs from the sequence length
+///
+int IndexedFastqTextReader::FetchRecord()
+{
+    // NOTE: kseq_read assumes it is at the beginning of "next" sequence's name.
+    //       However, here the file handle already points to the first base after
+    //       seeking using FAI. So this is kseq_read without the name/comment scan.
+
+    // NOTE(review): the malloc/realloc results below are used without a null check.
+
+    int c;
+    kseq_t* seq = seq_.get();
+    kstream_t* ks = seq->f;
+    seq_->comment.l = seq_->seq.l = seq_->qual.l = 0; /* reset all members */
+    if (seq_->seq.s == 0) { /* we can do this in the loop below, but that is slower */
+        seq_->seq.m = 256;
+        seq_->seq.s = (char*)malloc(seq_->seq.m);
+    }
+    // accumulate sequence lines until EOF or the first character of a marker line
+    while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') {
+        if (c == '\n') continue;        /* skip empty lines */
+        seq_->seq.s[seq_->seq.l++] = c; /* this is safe: we always have enough space for 1 char */
+        ks_getuntil2(ks, KS_SEP_LINE, &seq_->seq, 0, 1); /* read the rest of the line */
+    }
+
+    if (c == '>' || c == '@') seq_->last_char = c; /* the first header char has been read */
+    if (seq_->seq.l + 1 >=
+        seq_->seq.m) { /* seq_->seq.s[seq_->seq.l] below may be out of boundary */
+        seq_->seq.m = seq_->seq.l + 2;
+        kroundup32(seq_->seq.m); /* rounded to the next closest 2^k */
+        seq_->seq.s = (char*)realloc(seq_->seq.s, seq_->seq.m);
+    }
+    seq_->seq.s[seq_->seq.l] = 0; /* null terminated string */
+
+    // no '+' separator line: return with the quality buffer left empty
+    if (c != '+') return seq_->seq.l; /* FASTA */
+    if (seq_->qual.m < seq_->seq.m) { /* allocate memory for qual in case insufficient */
+        seq_->qual.m = seq_->seq.m;
+        seq_->qual.s = (char*)realloc(seq_->qual.s, seq_->qual.m);
+    }
+
+    while ((c = ks_getc(ks)) != -1 && c != '\n')
+        ;                   /* skip the rest of '+' line */
+    if (c == -1) return -2; /* error: no quality string */
+    // append quality lines until as many qualities as bases have been read
+    while (ks_getuntil2(ks, KS_SEP_LINE, &seq_->qual, 0, 1) >= 0 && seq_->qual.l < seq_->seq.l)
+        ;
+
+    seq_->last_char = 0; /* we have not come to the next header line */
+
+    if (seq_->seq.l != seq_->qual.l) return -2; /* error: qual string is of a different length */
+    return seq_->seq.l;
+}
+
+// Adapter that gives std::fread the (handle, buffer, length) parameter order.
+// Returns the number of bytes read.
+int IndexedFastqTextReader::ReadFromFile(FILE* fp, void* data, size_t length)
+{
+    const auto bytesRead = std::fread(data, sizeof(uint8_t), length, fp);
+    return static_cast<int>(bytesRead);
+}
+
+std::pair<std::string, Data::QualityValues> IndexedFastqTextReader::Subsequence(
+    const std::string& id, Data::Position start, Data::Position end)
+{
+    // check requested region is valid
+    const auto& entry = index_.Entry(id);
+    const int64_t available = static_cast<int64_t>(entry.Length) - start;
+    const int64_t requested = end - start;
+    const int64_t length = std::min(available, requested);
+    if ((start < 0) || (end < 0) || (length < 0)) {
+        std::ostringstream msg;
+        msg << "IndexedFastqTextReader: invalid subsequence region requested from\n"
+            << "  FASTQ file: " << fastqFilename_ << '\n'
+            << "  requested region: " << id << ':' << start << '-' << end << '\n'
+            << "  sequence length:  " << entry.Length << '\n';
+        throw std::runtime_error{msg.str()};
+    }
+
+    // quick out if nothing needed
+    if (length == 0) return {};
+
+    // seek to sequence 'id' & reset kseq handle
+    auto result = fseek(file_.get(), entry.SeqOffset, SEEK_SET);
+    if (result != 0) {
+        std::ostringstream msg;
+        msg << "IndexedFastqTextReader: could not seek to requested region\n"
+            << "  FASTQ file: " << fastqFilename_ << '\n'
+            << "  requested region: " << id << ':' << start << '-' << end << '\n';
+        throw std::runtime_error{msg.str()};
+    }
+    ks_rewind(seq_->f);
+
+    // read (entire) sequence
+    result = FetchRecord();
+    if (result < 0) {
+        std::ostringstream msg;
+        msg << "IndexedFastqTextReader: error reading from\n"
+            << "  FASTQ file: " << fastqFilename_ << '\n'
+            << "  requested region: " << id << ':' << start << '-' << end << '\n'
+            << "  reason: likely truncated quality string\n";
+        throw std::runtime_error{msg.str()};
+    }
+
+    // trim to region bounds and return
+    const std::string seq{seq_->seq.s, seq_->seq.l};
+    const std::string quals{seq_->qual.s, seq_->qual.l};
+    return std::make_pair(seq.substr(start, length),
+                          Data::QualityValues::FromFastq(quals.substr(start, length)));
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/IndexedFastqTextReader.h b/src/IndexedFastqTextReader.h

new file mode 100644 (file)

index 0000000..b77992e
--- /dev/null
+++ b/src/IndexedFastqTextReader.h
@@ -0,0 +1,59 @@
+// File Description
+/// \file IndexedFastqTextReader.h
+/// \brief Defines the IndexedFastqTextReader class.
+//
+// Author: Derek Barnett
+
+#ifndef INDEXEDFASTQTEXTREADER_H
+#define INDEXEDFASTQTEXTREADER_H
+
+#include "pbbam/Config.h"
+
+#include "IndexedFastqReaderImpl.h"
+
+#include <cstdio>
+
+#include <memory>
+
+#include <htslib/kseq.h>
+#include <pbcopper/utility/Deleters.h>
+
+namespace PacBio {
+namespace BAM {
+
+class IndexedFastqTextReader final : public IndexedFastqReaderImpl
+{
+public:
+    IndexedFastqTextReader(std::string filename);
+
+    std::pair<std::string, Data::QualityValues> Subsequence(const std::string& id,
+                                                            Data::Position start,
+                                                            Data::Position end) final;
+
+private:
+    int FetchRecord();
+
+    // kseq needs a '__read' function with this signature, so fread does not work
+    // in this case. gzread/bgzf_read match but we want better seek performance
+    // than gzstream and are specifically not using indexed BGZF
+    static int ReadFromFile(FILE* fp, void* data, size_t length);
+
+    // specialize kseq_t for FILE handle
+    KSEQ_INIT(FILE*, ReadFromFile)
+    struct KSeqDeleter
+    {
+        void operator()(kseq_t* seq) const
+        {
+            if (seq) kseq_destroy(seq);
+            seq = nullptr;
+        }
+    };
+
+    std::unique_ptr<FILE, Utility::FileDeleter> file_;
+    std::unique_ptr<kseq_t, KSeqDeleter> seq_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // INDEXEDFASTQTEXTREADER_H
+\ No newline at end of file
diff --git a/src/KSeqReader.cpp b/src/KSeqReader.cpp

new file mode 100644 (file)

index 0000000..4caf6ed
--- /dev/null
+++ b/src/KSeqReader.cpp
@@ -0,0 +1,54 @@
+// File Description
+/// \file KSeqReader.cpp
+/// \brief Implements the KSeqReader class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "KSeqReader.h"
+
+#include <iostream>
+#include <stdexcept>
+#include <vector>
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief Opens \p fn through zlib and creates a kseq handle on it.
+///
+/// \param[in] fn  input filename
+///
+/// \throws std::runtime_error if the file cannot be opened or the kseq handle
+///         cannot be created
+///
+KSeqReader::KSeqReader(const std::string& fn) : fp_{gzopen(fn.c_str(), "r")}
+{
+    // check file handle before handing it to kseq
+    // (message built by concatenation: this file does not include <sstream>)
+    if (fp_ == nullptr) {
+        throw std::runtime_error{"KSeqReader: could not open file for reading\n  file: " + fn +
+                                 '\n'};
+    }
+
+    // create kseq sequence handle; checked at runtime (not via assert) so that
+    // release builds fail here rather than dereferencing a null handle later
+    seq_.reset(kseq_init(fp_.get()));
+    if (seq_ == nullptr) {
+        throw std::runtime_error{"KSeqReader: could not create sequence handle\n  file: " + fn +
+                                 '\n'};
+    }
+}
+
+// Move construction, move assignment & destruction are defined out-of-line
+// here and simply defaulted.
+KSeqReader::KSeqReader(KSeqReader&&) noexcept = default;
+
+KSeqReader& KSeqReader::operator=(KSeqReader&&) noexcept = default;
+
+KSeqReader::~KSeqReader() = default;
+
+// Returns a copy of the bases currently held in the kseq sequence buffer.
+std::string KSeqReader::Bases() const
+{
+    const auto& bases = seq_->seq;
+    return std::string{bases.s, bases.l};
+}
+
+// Returns a copy of the name currently held in the kseq name buffer.
+std::string KSeqReader::Name() const
+{
+    const auto& name = seq_->name;
+    return std::string{name.s, name.l};
+}
+
+// Returns a copy of the quality string currently held in the kseq quality buffer.
+std::string KSeqReader::Qualities() const
+{
+    const auto& quals = seq_->qual;
+    return std::string{quals.s, quals.l};
+}
+
+/// \brief Advances to the next record in the file.
+///
+/// \returns true if a record was read, false at end-of-file
+///
+/// \throws std::runtime_error if kseq_read reports an error code (for example
+///         a truncated quality string) rather than a record or EOF
+///
+bool KSeqReader::ReadNext()
+{
+    const auto result = kseq_read(seq_.get());
+    if (result >= 0) return true;    // record read
+    if (result == -1) return false;  // EOF
+
+    // Any other negative value is an error code. Reporting it as a successful
+    // read would hand the caller a malformed record, and a persistent error
+    // would make a 'while (ReadNext())' loop spin forever.
+    throw std::runtime_error{"KSeqReader: error reading record, kseq_read returned " +
+                             std::to_string(result)};
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/KSeqReader.h b/src/KSeqReader.h

new file mode 100644 (file)

index 0000000..9594ae6
--- /dev/null
+++ b/src/KSeqReader.h
@@ -0,0 +1,59 @@
+// File Description
+/// \file KSeqReader.h
+/// \brief Defines the KSeqReader class.
+//
+// Author: Derek Barnett
+
+#ifndef KSEQREADER_H
+#define KSEQREADER_H
+
+#include "pbbam/Config.h"
+
+#include <memory>
+#include <string>
+
+#include <htslib/kseq.h>
+#include <zlib.h>
+
+#include "MemoryUtils.h"
+
+namespace PacBio {
+namespace BAM {
+
+class KSeqReader
+{
+public:
+    explicit KSeqReader(const std::string& fn);
+
+    KSeqReader(const KSeqReader&) = delete;
+    KSeqReader(KSeqReader&&) noexcept;
+    KSeqReader& operator=(const KSeqReader&) = delete;
+    KSeqReader& operator=(KSeqReader&&) noexcept;
+    virtual ~KSeqReader();
+
+    std::string Name() const;
+    std::string Bases() const;
+    std::string Qualities() const;
+
+    bool ReadNext();
+
+private:
+    // specialize kseq for gzstream (handles all of our types)
+    KSEQ_INIT(gzFile, gzread)
+    struct KSeqDeleter
+    {
+        void operator()(kseq_t* seq) const
+        {
+            if (seq) kseq_destroy(seq);
+            seq = nullptr;
+        }
+    };
+
+    std::unique_ptr<gzFile_s, GzFileDeleter> fp_;
+    std::unique_ptr<kseq_t, KSeqDeleter> seq_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // KSEQREADER_H
diff --git a/src/MD5.cpp b/src/MD5.cpp

new file mode 100644 (file)

index 0000000..6be55a2
--- /dev/null
+++ b/src/MD5.cpp
@@ -0,0 +1,54 @@
+// File Description
+/// \file MD5.cpp
+/// \brief Implements basic MD5 hash utilities
+//
+// Author: Brett Bowman
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/MD5.h"
+
+#include <stdexcept>
+
+#include <htslib/hts.h>
+
+namespace PacBio {
+namespace BAM {
+
+class Md5ContextHelper
+{
+public:
+    Md5ContextHelper() : data_(hts_md5_init())
+    {
+        if (data_ == nullptr) throw std::runtime_error{"MD5: could not initialize context"};
+    }
+
+    ~Md5ContextHelper() { hts_md5_destroy(data_); }
+
+    std::string Encoded(const std::string& str)
+    {
+        hts_md5_update(data_, reinterpret_cast<void*>(const_cast<char*>(str.c_str())), str.size());
+
+        unsigned char digest[16];
+        hts_md5_final(digest, data_);
+
+        char hexdigest[33];  // leave space for null-term
+        hts_md5_hex(hexdigest, digest);
+
+        return std::string{hexdigest, 32};
+    }
+
+private:
+    hts_md5_context* data_;
+};
+
+/// \brief Computes the MD5 hash of \p str.
+///
+/// \returns the digest as a 32-digit hexadecimal string
+///
+std::string MD5Hash(const std::string& str) { return Md5ContextHelper{}.Encoded(str); }
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/MemoryUtils.cpp b/src/MemoryUtils.cpp

new file mode 100644 (file)

index 0000000..9be40a8
--- /dev/null
+++ b/src/MemoryUtils.cpp
@@ -0,0 +1,40 @@
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "MemoryUtils.h"
+
+#include <cstdlib>
+#include <cstring>
+#include <string>
+
+namespace PacBio {
+namespace BAM {
+
+// Converts a raw htslib header into a BamHeader.
+//
+// A null header is an error (std::runtime_error). A header with no text
+// yields a default-constructed BamHeader.
+BamHeader BamHeaderMemory::FromRawData(bam_hdr_t* hdr)
+{
+    if (hdr == nullptr) throw std::runtime_error{"MemoryUtils: null BAM header"};
+
+    const bool hasText = (hdr->text != nullptr) && (hdr->l_text != 0);
+    if (!hasText) return BamHeader();
+
+    // normal case: build from the SAM text
+    return BamHeader(std::string(hdr->text, hdr->l_text));
+}
+
+std::shared_ptr<bam_hdr_t> BamHeaderMemory::MakeRawHeader(const BamHeader& header)
+{
+    const std::string text = header.ToSam();
+    std::shared_ptr<bam_hdr_t> rawData(sam_hdr_parse(text.size(), text.c_str()),
+                                       HtslibHeaderDeleter());
+    rawData->ignore_sam_err = 0;
+    rawData->cigar_tab = nullptr;
+    rawData->l_text = text.size();
+    rawData->text = static_cast<char*>(calloc(rawData->l_text + 1, 1));
+    memcpy(rawData->text, text.c_str(), rawData->l_text);
+    return rawData;
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/MemoryUtils.h b/src/MemoryUtils.h

new file mode 100644 (file)

index 0000000..1679f76
--- /dev/null
+++ b/src/MemoryUtils.h
@@ -0,0 +1,151 @@
+// Author: Derek Barnett
+
+#ifndef MEMORYUTILS_H
+#define MEMORYUTILS_H
+
+#include "pbbam/Config.h"
+
+#include <cstdio>
+#include <memory>
+
+#include <htslib/bgzf.h>
+#include <htslib/faidx.h>
+#include <htslib/sam.h>
+#include <zlib.h>
+
+#include "pbbam/BamHeader.h"
+#include "pbbam/BamRecord.h"
+#include "pbbam/BamRecordImpl.h"
+
+namespace PacBio {
+namespace BAM {
+
+class BamHeader;
+
+// intended for use with std::shared_ptr<T>, std::unique_ptr<T>, etc
+
+// Closes a zlib file handle; a null handle is ignored.
+struct GzFileDeleter
+{
+    void operator()(gzFile fp) const
+    {
+        if (fp != nullptr) gzclose(fp);
+    }
+};
+
+// Closes a BGZF handle; a null handle is ignored.
+struct HtslibBgzfDeleter
+{
+    void operator()(BGZF* bgzf) const
+    {
+        if (bgzf != nullptr) bgzf_close(bgzf);
+    }
+};
+
+// Destroys an htslib FASTA index; a null pointer is ignored.
+struct HtslibFastaIndexDeleter
+{
+    void operator()(faidx_t* fai) const
+    {
+        if (fai != nullptr) fai_destroy(fai);
+    }
+};
+
+// Closes an htslib SAM/BAM file handle; a null handle is ignored.
+struct HtslibFileDeleter
+{
+    void operator()(samFile* file) const
+    {
+        if (file != nullptr) sam_close(file);
+    }
+};
+
+// Destroys an htslib BAM header; a null pointer is ignored.
+struct HtslibHeaderDeleter
+{
+    void operator()(bam_hdr_t* hdr) const
+    {
+        if (hdr != nullptr) bam_hdr_destroy(hdr);
+    }
+};
+
+// Destroys an htslib index; a null pointer is ignored.
+struct HtslibIndexDeleter
+{
+    void operator()(hts_idx_t* index) const
+    {
+        if (index != nullptr) hts_idx_destroy(index);
+    }
+};
+
+// Destroys an htslib iterator; a null pointer is ignored.
+struct HtslibIteratorDeleter
+{
+    void operator()(hts_itr_t* iter) const
+    {
+        if (iter != nullptr) hts_itr_destroy(iter);
+    }
+};
+
+// Destroys an htslib BAM record; a null pointer is ignored.
+struct HtslibRecordDeleter
+{
+    void operator()(bam1_t* b) const
+    {
+        if (b != nullptr) bam_destroy1(b);
+    }
+};
+
+// Static helpers for converting between BamHeader and the raw htslib header type.
+class BamHeaderMemory
+{
+public:
+    // raw htslib header -> BamHeader
+    static BamHeader FromRawData(bam_hdr_t* header);
+    // BamHeader -> raw htslib header
+    static std::shared_ptr<bam_hdr_t> MakeRawHeader(const BamHeader& header);
+};
+
+// Static helpers that reach into BamRecord / BamRecordImpl to expose the
+// implementation object, the raw htslib record, and the tag-map update hook.
+class BamRecordMemory
+{
+public:
+    // access to a record's implementation object
+    static const BamRecordImpl& GetImpl(const BamRecord& r);
+    static const BamRecordImpl& GetImpl(const BamRecord* r);
+
+    // access to the raw htslib record held by a record or its implementation
+    static std::shared_ptr<bam1_t> GetRawData(const BamRecord& r);
+    static std::shared_ptr<bam1_t> GetRawData(const BamRecord* r);
+    static std::shared_ptr<bam1_t> GetRawData(const BamRecordImpl& impl);
+    static std::shared_ptr<bam1_t> GetRawData(const BamRecordImpl* impl);
+
+    // invokes UpdateTagMap() on the record's implementation
+    static void UpdateRecordTags(const BamRecord& r);
+    static void UpdateRecordTags(const BamRecordImpl& r);
+};
+
+// Returns the implementation object of the given record.
+inline const BamRecordImpl& BamRecordMemory::GetImpl(const BamRecord& r)
+{
+    return r.impl_;
+}
+
+// Pointer overload: returns the implementation object of the pointed-to record.
+inline const BamRecordImpl& BamRecordMemory::GetImpl(const BamRecord* r)
+{
+    return r->impl_;
+}
+
+// Returns the raw htslib record held by the record's implementation.
+inline std::shared_ptr<bam1_t> BamRecordMemory::GetRawData(const BamRecord& r)
+{
+    return r.impl_.d_;
+}
+
+// Pointer overload: returns the raw htslib record held by the pointed-to
+// record's implementation.
+inline std::shared_ptr<bam1_t> BamRecordMemory::GetRawData(const BamRecord* r)
+{
+    return r->impl_.d_;
+}
+
+// Returns the raw htslib record held by the implementation object.
+inline std::shared_ptr<bam1_t> BamRecordMemory::GetRawData(const BamRecordImpl& impl)
+{
+    const auto& rawRecord = impl.d_;
+    return rawRecord;
+}
+
+// Pointer overload: returns the raw htslib record held by the pointed-to
+// implementation object.
+inline std::shared_ptr<bam1_t> BamRecordMemory::GetRawData(const BamRecordImpl* impl)
+{
+    const auto& rawRecord = impl->d_;
+    return rawRecord;
+}
+
+// Invokes UpdateTagMap() on the record's implementation.
+inline void BamRecordMemory::UpdateRecordTags(const BamRecord& r)
+{
+    r.impl_.UpdateTagMap();
+}
+
+// Invokes UpdateTagMap() on the implementation object.
+inline void BamRecordMemory::UpdateRecordTags(const BamRecordImpl& r)
+{
+    r.UpdateTagMap();
+}
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // MEMORYUTILS_H
diff --git a/src/PbbamInternalConfig.h b/src/PbbamInternalConfig.h

new file mode 100644 (file)

index 0000000..340b97d
--- /dev/null
+++ b/src/PbbamInternalConfig.h
@@ -0,0 +1,18 @@
+// File Description
+/// \file PbbamInternalConfig.h
+/// \brief Defines internal macros for symbol visibility
+//
+// Author: Derek Barnett
+
+#ifndef PBBAMINTERNALCONFIG_H
+#define PBBAMINTERNALCONFIG_H
+
+#if defined(WIN32)
+#define PBBAM_EXPORT __declspec(dllexport)
+#else
+#define PBBAM_EXPORT __attribute__((visibility("default")))
+#endif
+
+#include "pbbam/Config.h"
+
+#endif  // PBBAMINTERNALCONFIG_H
diff --git a/src/PbiBuilder.cpp b/src/PbiBuilder.cpp

new file mode 100644 (file)

index 0000000..3f710b4
--- /dev/null
+++ b/src/PbiBuilder.cpp
@@ -0,0 +1,662 @@
+// File Description
+/// \file PbiBuilder.cpp
+/// \brief Implements the PbiBuilder class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/PbiBuilder.h"
+
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <memory>
+#include <stdexcept>
+#include <thread>
+#include <tuple>
+
+#include <htslib/bgzf.h>
+#include <htslib/hfile.h>
+#include <htslib/hts.h>
+
+#include <boost/numeric/conversion/cast.hpp>
+
+#include <pbcopper/utility/Deleters.h>
+
+#include "MemoryUtils.h"
+#include "pbbam/BamRecord.h"
+#include "pbbam/BamRecordImpl.h"
+#include "pbbam/PbiRawData.h"
+#include "pbbam/RecordType.h"
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+template <typename T>
+inline void SwapEndianness(std::vector<T>& data)
+{
+    const size_t elementSize = sizeof(T);
+    const size_t numReads = data.size();
+    switch (elementSize) {
+        case 1:
+            break;  // no swapping necessary
+        case 2:
+            for (size_t i = 0; i < numReads; ++i)
+                ed_swap_2p(&data[i]);
+            break;
+        case 4:
+            for (size_t i = 0; i < numReads; ++i)
+                ed_swap_4p(&data[i]);
+            break;
+        case 8:
+            for (size_t i = 0; i < numReads; ++i)
+                ed_swap_8p(&data[i]);
+            break;
+        default:
+            throw std::runtime_error{"PbiBuilder: unsupported element size (" +
+                                     std::to_string(elementSize) + ")"};
+    }
+}
+
+// Writes 'length' bytes from 'data' to 'fp', turning a negative return value
+// from bgzf_write into a std::runtime_error.
+void bgzf_write_safe(BGZF* fp, const void* data, size_t length)
+{
+    if (bgzf_write(fp, data, length) >= 0L) return;
+
+    throw std::runtime_error{
+        "PbiBuilder: non-zero returned from bgzf_write(). Out of disk space?"};
+}
+
+// Writes the raw contents of 'data' to 'fp'.
+//
+// If the handle is flagged big-endian, the elements are byte-swapped first.
+// The swap happens in place, so the caller's vector is modified.
+//
+// Throws std::runtime_error if the write fails.
+template <typename T>
+inline void WriteBgzfVector(BGZF* fp, std::vector<T>& data)
+{
+    assert(fp);
+    if (fp->is_be) SwapEndianness(data);
+
+    // data() is well-defined on an empty vector, whereas &data[0] is not
+    bgzf_write_safe(fp, data.data(), data.size() * sizeof(T));
+}
+
+// Describes one block of a field's values by start position and entry count.
+struct PbiFieldBlock
+{
+    int64_t pos_;  // file position of block start
+    size_t n_;     // number of entries in block
+};
+
+template <typename T>
+class PbiField
+{
+    constexpr static const size_t ElementSize = sizeof(T);
+
+public:
+    PbiField(size_t maxBufferSize) : maxElementCount_{maxBufferSize / ElementSize}
+    {
+        buffer_.reserve(maxElementCount_);
+    }
+
+    void Add(T value) { buffer_.push_back(value); }
+    bool IsFull() const { return buffer_.size() == maxElementCount_; }
+
+    size_t maxElementCount_;
+    std::vector<T> buffer_;
+    std::vector<PbiFieldBlock> blocks_;
+};
+// --------------------------
+// PbiReferenceDataBuilder
+// --------------------------
+
+// Tracks, per reference ID, the range of rows whose records map to that
+// reference, while checking that records arrive in coordinate-sorted order.
+class PbiReferenceDataBuilder
+{
+public:
+    using ReferenceRows = std::pair<int32_t, int32_t>;  // [startRow, endRow)
+
+    // creates one entry per expected reference, plus an "unmapped" entry
+    explicit PbiReferenceDataBuilder(const size_t numReferenceSequences);
+
+    // returns false if the record breaks coordinate-sorted order
+    bool AddRecord(const BamRecord& record, const int32_t rowNumber);
+
+    // the collected entries, ordered by map key
+    PbiRawReferenceData Result() const;
+
+    // writes the entry count followed by each entry's tId, beginRow & endRow
+    void WriteData(BGZF* bgzf);
+
+private:
+    int32_t lastRefId_ = -1;  // reference ID of the previous record
+    Position lastPos_ = -1;   // reference start of the previous record
+    std::map<uint32_t, PbiReferenceEntry> rawReferenceEntries_;
+};
+
+// Seeds the entry map with one entry per expected reference, so that known
+// references have an entry even when no record maps to them, and adds an
+// entry for unmapped records.
+PbiReferenceDataBuilder::PbiReferenceDataBuilder(const size_t numReferenceSequences)
+{
+    size_t refId = 0;
+    while (refId < numReferenceSequences) {
+        rawReferenceEntries_[refId] = PbiReferenceEntry(refId);
+        ++refId;
+    }
+
+    // the "unmapped" entry
+    rawReferenceEntries_[PbiReferenceEntry::UNMAPPED_ID] = PbiReferenceEntry{};
+}
+
+// Registers a record's row against its reference ID.
+//
+// Returns false as soon as the record shows the input is not coordinate-sorted:
+// a mapped record after unmapped ones, a reference ID that was already seen
+// earlier, or a start position lower than the previous record's on the same
+// reference. Returns true otherwise, after updating the entry's row range.
+//
+// NOTE(review): the lookups use std::map::at(), which throws std::out_of_range
+//               for a reference ID that has no entry - confirm callers never
+//               pass an ID beyond the count given to the constructor.
+bool PbiReferenceDataBuilder::AddRecord(const BamRecord& record, const int32_t rowNumber)
+{
+    // fetch ref ID & pos for record
+    const int32_t tId = record.ReferenceId();
+    const int32_t pos = record.ReferenceStart();
+
+    // sanity checks to protect against non-coordinate-sorted BAMs
+    if (lastRefId_ != tId || (lastRefId_ >= 0 && tId < 0)) {
+        if (tId >= 0) {
+
+            // if we've already seen unmapped reads, but our current tId is valid
+            //
+            // error: unmapped reads should all be at the end (can stop checking refs)
+            //
+            PbiReferenceEntry& unmappedEntry =
+                rawReferenceEntries_.at(PbiReferenceEntry::UNMAPPED_ID);
+            if (unmappedEntry.beginRow_ != PbiReferenceEntry::UNSET_ROW) return false;
+
+            // if we've already seen data for this new tId
+            // (remember we're coming from another tId)
+            //
+            // error: refs are out of order (can stop checking refs)
+            //
+            PbiReferenceEntry& currentEntry = rawReferenceEntries_.at(static_cast<uint32_t>(tId));
+            if (currentEntry.beginRow_ != PbiReferenceEntry::UNSET_ROW) return false;
+        }
+        lastRefId_ = tId;
+    } else if (tId >= 0 && lastPos_ > pos)
+        return false;  // error: positions out of order
+
+    // update row numbers
+    // (a negative tId casts to the same key as the "unmapped" entry, assuming
+    //  UNMAPPED_ID is the uint cast of -1 - TODO confirm)
+    PbiReferenceEntry& entry = rawReferenceEntries_.at(static_cast<uint32_t>(tId));
+    if (entry.beginRow_ == PbiReferenceEntry::UNSET_ROW) entry.beginRow_ = rowNumber;
+    entry.endRow_ = rowNumber + 1;
+
+    // update pos (for sorting check next go-round)
+    lastPos_ = pos;
+    return true;
+}
+
+// Returns the collected reference entries in map-key order.
+PbiRawReferenceData PbiReferenceDataBuilder::Result() const
+{
+    // std::map keeps the entries sorted by key; the unmapped entry lands at
+    // the end since we're sorting on the uint cast of -1
+    PbiRawReferenceData refData;
+    auto& entries = refData.entries_;
+    entries.reserve(rawReferenceEntries_.size());
+    for (auto it = rawReferenceEntries_.cbegin(); it != rawReferenceEntries_.cend(); ++it)
+        entries.push_back(it->second);
+    return refData;
+}
+
+void PbiReferenceDataBuilder::WriteData(BGZF* bgzf)
+{
+    const auto refData = Result();
+
+    // num_refs
+    uint32_t numRefs = refData.entries_.size();
+    if (bgzf->is_be) numRefs = ed_swap_4(numRefs);
+    internal::bgzf_write_safe(bgzf, &numRefs, 4);
+
+    // reference entries
+    numRefs = refData.entries_.size();  // need to reset after maybe endian-swapping
+    for (size_t i = 0; i < numRefs; ++i) {
+        auto& entry = refData.entries_[i];
+        auto tId = entry.tId_;
+        auto beginRow = entry.beginRow_;
+        auto endRow = entry.endRow_;
+        if (bgzf->is_be) {
+            tId = ed_swap_4(tId);
+            beginRow = ed_swap_4(beginRow);
+            endRow = ed_swap_4(endRow);
+        }
+        internal::bgzf_write_safe(bgzf, &tId, 4);
+        internal::bgzf_write_safe(bgzf, &beginRow, 4);
+        internal::bgzf_write_safe(bgzf, &endRow, 4);
+    }
+}
+
+}  // namespace internal
+
+// --------------------------------------------
+// PbiBuilderPrivate - builder implementation
+// --------------------------------------------
+
+// TODO: Come back to refseqs, sorting, etc
+
+// TODO: We **NEED** to sync this up with the builder in IndexedBamWriter. They
+//       differ slightly but should be shareable.
+
+class PbiBuilder::PbiBuilderPrivate
+{
+    // Selects whether FlushBuffers() writes every field buffer unconditionally
+    // (FORCE) or only those that report full (NO_FORCE).
+    enum class FlushMode
+    {
+        FORCE,
+        NO_FORCE
+    };
+
+    // TODO: Make this tweak-able, a la IndexedBamWriter's buffers
+    constexpr static const size_t MaxBufferSize = 0x10000;
+
+public:
+    // Opens the temp file "<pbiFilename>.build" in "w+b" mode and sizes every
+    // field buffer to MaxBufferSize bytes. A reference-data builder is created
+    // only when the input is flagged coordinate-sorted and has at least one
+    // reference sequence.
+    //
+    // Throws std::runtime_error if the temp file cannot be opened.
+    PbiBuilderPrivate(const std::string& pbiFilename, const size_t numReferenceSequences,
+                      const bool isCoordinateSorted,
+                      const PbiBuilder::CompressionLevel compressionLevel, const size_t numThreads)
+        : pbiFilename_{pbiFilename}
+        , tempFilename_{pbiFilename + ".build"}
+        , tempFile_{std::fopen(tempFilename_.c_str(), "w+b")}
+        , compressionLevel_{compressionLevel}
+        , numThreads_{numThreads}
+        , rgIdField_{MaxBufferSize}
+        , qStartField_{MaxBufferSize}
+        , qEndField_{MaxBufferSize}
+        , holeNumField_{MaxBufferSize}
+        , readQualField_{MaxBufferSize}
+        , ctxtField_{MaxBufferSize}
+        , fileOffsetField_{MaxBufferSize}
+        , tIdField_{MaxBufferSize}
+        , tStartField_{MaxBufferSize}
+        , tEndField_{MaxBufferSize}
+        , aStartField_{MaxBufferSize}
+        , aEndField_{MaxBufferSize}
+        , revStrandField_{MaxBufferSize}
+        , nMField_{MaxBufferSize}
+        , nMMField_{MaxBufferSize}
+        , mapQualField_{MaxBufferSize}
+        , bcForwardField_{MaxBufferSize}
+        , bcReverseField_{MaxBufferSize}
+        , bcQualField_{MaxBufferSize}
+    {
+        if (!tempFile_)
+            throw std::runtime_error{"PbiBuilder: could not open temp file: " + tempFilename_};
+
+        // reference data is only tracked for coordinate-sorted input
+        if (isCoordinateSorted && numReferenceSequences > 0)
+            refDataBuilder_ =
+                std::make_unique<internal::PbiReferenceDataBuilder>(numReferenceSequences);
+    }
+
+    ~PbiBuilderPrivate() noexcept
+    {
+        if (!isClosed_) {
+            try {
+                Close();
+            } catch (...) {
+                // swallow any exceptions & remain no-throw from dtor
+            }
+        }
+    }
+
+    // Indexes one record: refreshes the record's tag map and cached positions,
+    // appends its values to the field buffers, feeds the reference-data
+    // builder, flushes any buffers that report full, then advances the row
+    // counter.
+    //
+    // uOffset is passed through to AddBasicData(), which stores it in the
+    // file-offset field.
+    void AddRecord(const BamRecord& b, const int64_t uOffset)
+    {
+        // ensure updated data (necessary?)
+        PacBio::BAM::BamRecordMemory::UpdateRecordTags(b);
+        b.ResetCachedPositions();
+
+        // store record data & maybe flush to temp file
+        AddBasicData(b, uOffset);
+        AddMappedData(b);
+        AddBarcodeData(b);
+        AddReferenceData(b, currentRow_);
+        FlushBuffers(FlushMode::NO_FORCE);
+
+        ++currentRow_;
+    }
+
+    void AddBasicData(const BamRecord& b, const int64_t uOffset)
+    {
+        // read group ID
+        const auto rgId = [&b]() -> int32_t {
+            auto rgIdString = b.ReadGroupBaseId();
+            if (rgIdString.empty()) rgIdString = MakeReadGroupId(b.MovieName(), ToString(b.Type()));
+            return std::stoul(rgIdString, nullptr, 16);
+        }();
+
+        // query start/end
+        const auto isCcsOrTranscript = (IsCcsOrTranscript(b.Type()));
+        const int32_t qStart = (isCcsOrTranscript ? 0 : b.QueryStart());
+        const int32_t qEnd = (isCcsOrTranscript ? b.Impl().SequenceLength() : b.QueryEnd());
+
+        // add'l data
+        const int32_t holeNum = (b.HasHoleNumber() ? b.HoleNumber() : 0);
+        const float readAccuracy =
+            (b.HasReadAccuracy() ? boost::numeric_cast<float>(b.ReadAccuracy()) : 0.0F);
+        const uint8_t ctxt = (b.HasLocalContextFlags() ? b.LocalContextFlags()
+                                                       : LocalContextFlags::NO_LOCAL_CONTEXT);
+
+        // store
+        rgIdField_.Add(rgId);
+        qStartField_.Add(qStart);
+        qEndField_.Add(qEnd);
+        holeNumField_.Add(holeNum);
+        ctxtField_.Add(ctxt);
+        readQualField_.Add(readAccuracy);
+        fileOffsetField_.Add(uOffset);
+    }
+
+    void AddMappedData(const BamRecord& b)
+    {
+        // alignment position
+        const auto tId = b.ReferenceId();
+        const auto tStart = static_cast<uint32_t>(b.ReferenceStart());
+        const auto tEnd = static_cast<uint32_t>(b.ReferenceEnd());
+        const auto aStart = static_cast<uint32_t>(b.AlignedStart());
+        const auto aEnd = static_cast<uint32_t>(b.AlignedEnd());
+        const auto isReverseStrand = [&b]() -> uint8_t {
+            return (b.AlignedStrand() == Strand::REVERSE ? 1 : 0);
+        }();
+
+        // alignment quality
+        const auto matchData = b.NumMatchesAndMismatches();
+        const auto nM = static_cast<uint32_t>(matchData.first);
+        const auto nMM = static_cast<uint32_t>(matchData.second);
+        const auto mapQuality = b.MapQuality();
+
+        if (tId >= 0) hasMappedData_ = true;
+
+        // store
+        tIdField_.Add(tId);
+        tStartField_.Add(tStart);
+        tEndField_.Add(tEnd);
+        aStartField_.Add(aStart);
+        aEndField_.Add(aEnd);
+        revStrandField_.Add(isReverseStrand);
+        nMField_.Add(nM);
+        nMMField_.Add(nMM);
+        mapQualField_.Add(mapQuality);
+    }
+
+    void AddBarcodeData(const BamRecord& b)
+    {
+        // initialize w/ 'missing' value
+        int16_t bcForward = -1;
+        int16_t bcReverse = -1;
+        int8_t bcQuality = -1;
+
+        // check for any barcode data (both required)
+        if (b.HasBarcodes() && b.HasBarcodeQuality()) {
+            // fetch data from record
+            std::tie(bcForward, bcReverse) = b.Barcodes();
+            bcQuality = static_cast<int8_t>(b.BarcodeQuality());
+
+            // double-check & reset to 'missing' value if any less than zero
+            if (bcForward < 0 && bcReverse < 0 && bcQuality < 0) {
+                bcForward = -1;
+                bcReverse = -1;
+                bcQuality = -1;
+            } else
+                hasBarcodeData_ = true;
+        }
+
+        // store
+        bcForwardField_.Add(bcForward);
+        bcReverseField_.Add(bcReverse);
+        bcQualField_.Add(bcQuality);
+    }
+
+    // Feeds the record to the reference-data builder, if one is active. The
+    // builder only exists while the coordinate-sorted hint still holds; it is
+    // dropped as soon as a record turns out to be out of order.
+    void AddReferenceData(const BamRecord& b, const uint32_t currentRow)
+    {
+        if (!refDataBuilder_) return;
+
+        const bool stillSorted = refDataBuilder_->AddRecord(b, currentRow);
+        if (!stillSorted) refDataBuilder_.reset();
+    }
+
+    // Finalizes the index: force-flushes all field buffers, then runs
+    // OpenPbiFile(), WritePbiHeader() and WriteFromTempFile(), and finally
+    // removes the temp file. Does nothing if already closed.
+    //
+    // NOTE(review): isClosed_ is only set after every step has succeeded, so
+    //               if a step throws, the destructor will call Close() again -
+    //               confirm that re-running these steps is safe.
+    void Close()
+    {
+        if (isClosed_) return;
+
+        FlushBuffers(FlushMode::FORCE);
+
+        OpenPbiFile();
+        WritePbiHeader();
+        WriteFromTempFile();
+
+        // the return value of remove() is not checked
+        remove(tempFilename_.c_str());
+        isClosed_ = true;
+    }
+
+    void OpenPbiFile()
+    {
+        // open file handle
+        const auto mode = std::string("wb") + std::to_string(static_cast<int>(compressionLevel_));
+        pbiFile_.reset(bgzf_open(pbiFilename_.c_str(), mode.c_str()));
+        if (pbiFile_ == nullptr)
+            throw std::runtime_error{"PbiBuilder: could not open file for writing: " +
+                                     pbiFilename_};
+
+        // if no explicit thread count given, attempt built-in check
+        size_t actualNumThreads = numThreads_;
+        if (actualNumThreads == 0) {
+            actualNumThreads = std::thread::hardware_concurrency();
+
+            // if still unknown, default to single-threaded
+            if (actualNumThreads == 0) actualNumThreads = 1;
+        }
+
+        // if multithreading requested, enable it
+        if (actualNumThreads > 1) bgzf_mt(pbiFile_.get(), actualNumThreads, 256);
+    }
+
+    // Writes the field's buffered values to the temp file and empties the
+    // buffer, but only if the buffer reports full or 'force' is set.
+    template <typename T>
+    void MaybeFlushBuffer(internal::PbiField<T>& field, bool force)
+    {
+        const bool shouldFlush = force || field.IsFull();
+        if (!shouldFlush) return;
+
+        WriteToTempFile(field);
+        field.buffer_.clear();
+    }
+
+    void FlushBuffers(FlushMode mode)
+    {
+        const auto force = (mode == FlushMode::FORCE);
+
+        MaybeFlushBuffer(rgIdField_, force);
+        MaybeFlushBuffer(qStartField_, force);
+        MaybeFlushBuffer(qEndField_, force);
+        MaybeFlushBuffer(holeNumField_, force);
+        MaybeFlushBuffer(readQualField_, force);
+        MaybeFlushBuffer(ctxtField_, force);
+        MaybeFlushBuffer(fileOffsetField_, force);
+
+        MaybeFlushBuffer(tIdField_, force);
+        MaybeFlushBuffer(tStartField_, force);
+        MaybeFlushBuffer(tEndField_, force);
+        MaybeFlushBuffer(aStartField_, force);
+        MaybeFlushBuffer(aEndField_, force);
+        MaybeFlushBuffer(revStrandField_, force);
+        MaybeFlushBuffer(nMField_, force);
+        MaybeFlushBuffer(nMMField_, force);
+        MaybeFlushBuffer(mapQualField_, force);
+
+        MaybeFlushBuffer(bcForwardField_, force);
+        MaybeFlushBuffer(bcReverseField_, force);
+        MaybeFlushBuffer(bcQualField_, force);
+    }
+
+    template <typename T>
+    void LoadFieldBlockFromTempFile(internal::PbiField<T>& field,
+                                    const internal::PbiFieldBlock& block)
+    {
+        // seek to block begin
+        const auto ret = std::fseek(tempFile_.get(), block.pos_, SEEK_SET);
+        if (ret != 0)
+            throw std::runtime_error{"PbiBuilder: could not seek in temp file: " + tempFilename_ +
+                                     ", offset: " + std::to_string(block.pos_)};
+
+        // read block elements
+        field.buffer_.assign(block.n_, 0);
+        const auto numElements =
+            std::fread(field.buffer_.data(), sizeof(T), block.n_, tempFile_.get());
+
+        if (numElements != block.n_)
+            throw std::runtime_error{"PbiBuilder: could not read element count from temp file: " +
+                                     tempFilename_};
+    }
+
+    /// Copies all of \p field's recorded blocks, in the order they were recorded,
+    /// from the temp file into the PBI output. field.buffer_ is reused as the
+    /// staging area and is overwritten by each block.
+    template <typename T>
+    void WriteField(internal::PbiField<T>& field)
+    {
+        BGZF* const out = pbiFile_.get();
+        for (const auto& storedBlock : field.blocks_) {
+            LoadFieldBlockFromTempFile(field, storedBlock);
+            internal::WriteBgzfVector(out, field.buffer_);
+        }
+    }
+
+    void WriteFromTempFile()
+    {
+        // load from temp file, in PBI format order, and write to index
+
+        WriteField(rgIdField_);
+        WriteField(qStartField_);
+        WriteField(qEndField_);
+        WriteField(holeNumField_);
+        WriteField(readQualField_);
+        WriteField(ctxtField_);
+        WriteField(fileOffsetField_);
+
+        if (hasMappedData_) {
+            WriteField(tIdField_);
+            WriteField(tStartField_);
+            WriteField(tEndField_);
+            WriteField(aStartField_);
+            WriteField(aEndField_);
+            WriteField(revStrandField_);
+            WriteField(nMField_);
+            WriteField(nMMField_);
+            WriteField(mapQualField_);
+        }
+
+        if (refDataBuilder_) WriteReferenceData();
+
+        if (hasBarcodeData_) {
+            WriteField(bcForwardField_);
+            WriteField(bcReverseField_);
+            WriteField(bcQualField_);
+        }
+    }
+
+    /// Appends the buffered values of \p field to the temp file and records the
+    /// block (file position + element count) so it can be read back later.
+    /// Does nothing if the buffer is empty.
+    ///
+    /// \throws std::runtime_error if the current file position cannot be
+    ///         determined, or if not all elements could be written
+    template <typename T>
+    void WriteToTempFile(internal::PbiField<T>& field)
+    {
+        if (field.buffer_.empty()) return;
+
+        // ftell() reports failure as a negative value; a block recorded at such a
+        // position could never be read back
+        const auto pos = std::ftell(tempFile_.get());
+        if (pos < 0)
+            throw std::runtime_error{"PbiBuilder: could not determine position in temp file: " +
+                                     tempFilename_};
+
+        // a short write (e.g. disk full) would silently drop index entries, so
+        // treat it as an error rather than recording a truncated block
+        const auto numElements =
+            std::fwrite(field.buffer_.data(), sizeof(T), field.buffer_.size(), tempFile_.get());
+        if (numElements != field.buffer_.size())
+            throw std::runtime_error{"PbiBuilder: could not write all elements to temp file: " +
+                                     tempFilename_};
+
+        field.blocks_.emplace_back(internal::PbiFieldBlock{pos, numElements});
+    }
+
+    void WritePbiHeader()
+    {
+        BGZF* bgzf = pbiFile_.get();
+
+        // 'magic' string
+        static constexpr const std::array<char, 4> magic{{'P', 'B', 'I', '\1'}};
+        internal::bgzf_write_safe(bgzf, magic.data(), 4);
+
+        PbiFile::Sections sections = PbiFile::BASIC;
+        if (hasMappedData_) sections |= PbiFile::MAPPED;
+        if (hasBarcodeData_) sections |= PbiFile::BARCODE;
+        if (refDataBuilder_) sections |= PbiFile::REFERENCE;
+
+        // version, pbi_flags, & n_reads
+        auto version = static_cast<uint32_t>(PbiFile::CurrentVersion);
+        uint16_t pbi_flags = sections;
+        auto numReads = currentRow_;
+        if (bgzf->is_be) {
+            version = ed_swap_4(version);
+            pbi_flags = ed_swap_2(pbi_flags);
+            numReads = ed_swap_4(numReads);
+        }
+        internal::bgzf_write_safe(bgzf, &version, 4);
+        internal::bgzf_write_safe(bgzf, &pbi_flags, 2);
+        internal::bgzf_write_safe(bgzf, &numReads, 4);
+
+        // reserved space
+        char reserved[18];
+        memset(reserved, 0, 18);
+        internal::bgzf_write_safe(bgzf, reserved, 18);
+    }
+
+    /// Has the reference-data builder write its data to the PBI output.
+    /// Dereferences refDataBuilder_ unconditionally, so the caller must have
+    /// checked that it is non-null.
+    void WriteReferenceData()
+    {
+        BGZF* const out = pbiFile_.get();
+        refDataBuilder_->WriteData(out);
+    }
+
+private:
+    // file info
+    std::string bamFilename_;
+    std::string pbiFilename_;   // output path; opened for writing in OpenPbiFile()
+    std::string tempFilename_;  // scratch file path; removed at the end of Close()
+    std::unique_ptr<FILE, Utility::FileDeleter> tempFile_;  // scratch file holding flushed blocks
+    std::unique_ptr<BGZF, HtslibBgzfDeleter> pbiFile_;      // BGZF handle of the output index
+    PbiBuilder::CompressionLevel compressionLevel_;  // appended to the "wb" open mode
+    size_t numThreads_;  // 0 = use std::thread::hardware_concurrency()
+
+    // PBI field buffers (values are buffered, then flushed to the temp file in blocks)
+    internal::PbiField<int32_t> rgIdField_;
+    internal::PbiField<int32_t> qStartField_;
+    internal::PbiField<int32_t> qEndField_;
+    internal::PbiField<int32_t> holeNumField_;
+    internal::PbiField<float> readQualField_;
+    internal::PbiField<uint8_t> ctxtField_;
+    internal::PbiField<uint64_t> fileOffsetField_;
+    internal::PbiField<int32_t> tIdField_;
+    internal::PbiField<uint32_t> tStartField_;
+    internal::PbiField<uint32_t> tEndField_;
+    internal::PbiField<uint32_t> aStartField_;
+    internal::PbiField<uint32_t> aEndField_;
+    internal::PbiField<uint8_t> revStrandField_;
+    internal::PbiField<uint32_t> nMField_;
+    internal::PbiField<uint32_t> nMMField_;
+    internal::PbiField<uint8_t> mapQualField_;
+    internal::PbiField<int16_t> bcForwardField_;
+    internal::PbiField<int16_t> bcReverseField_;
+    internal::PbiField<int8_t> bcQualField_;
+
+    // reference data (null when not tracked, or after an unsorted record was seen)
+    std::unique_ptr<internal::PbiReferenceDataBuilder> refDataBuilder_;
+
+    // tracking data
+    uint32_t currentRow_ = 0;      // written to the header as the read count
+    bool isClosed_ = false;        // set once Close() has completed
+    bool hasBarcodeData_ = false;  // enables the BARCODE section (header flag & fields)
+    bool hasMappedData_ = false;   // enables the MAPPED section (header flag & fields)
+};
+
+// --------------------------------------------
+// PbiBuilder - builder API
+// --------------------------------------------
+
+/// Creates a builder for \p pbiFilename with no reference sequences and without
+/// the coordinate-sorted hint; delegates to the five-argument constructor.
+PbiBuilder::PbiBuilder(const std::string& pbiFilename, const CompressionLevel compressionLevel,
+                       const size_t numThreads)
+    : PbiBuilder{pbiFilename,
+                 0,      // numReferenceSequences
+                 false,  // isCoordinateSorted
+                 compressionLevel, numThreads}
+{
+}
+
+/// Creates a builder for \p pbiFilename; the coordinate-sorted hint is switched
+/// on exactly when \p numReferenceSequences is non-zero. Delegates to the
+/// five-argument constructor.
+PbiBuilder::PbiBuilder(const std::string& pbiFilename, const size_t numReferenceSequences,
+                       const CompressionLevel compressionLevel, const size_t numThreads)
+    : PbiBuilder{pbiFilename,
+                 numReferenceSequences,
+                 (numReferenceSequences > 0),  // isCoordinateSorted
+                 compressionLevel, numThreads}
+{
+}
+
+PbiBuilder::PbiBuilder(const std::string& pbiFilename, const size_t numReferenceSequences,
+                       const bool isCoordinateSorted, const CompressionLevel compressionLevel,
+                       const size_t numThreads)
+    : d_{std::make_unique<PbiBuilderPrivate>(pbiFilename, numReferenceSequences, isCoordinateSorted,
+                                             compressionLevel, numThreads)}
+{
+}
+
+/// Defaulted here, in the source file, where PbiBuilderPrivate is a complete type.
+PbiBuilder::~PbiBuilder() noexcept = default;
+
+/// Adds \p record, located at virtual file offset \p vOffset, to the index.
+/// Thin forwarder to the private implementation.
+void PbiBuilder::AddRecord(const BamRecord& record, const int64_t vOffset)
+{
+    auto& impl = *d_;
+    impl.AddRecord(record, vOffset);
+}
+
+/// Finalizes the index file. Thin forwarder to the private implementation,
+/// whose Close() ignores repeated calls.
+void PbiBuilder::Close()
+{
+    d_->Close();
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/PbiFile.cpp b/src/PbiFile.cpp

new file mode 100644 (file)

index 0000000..fe5fdc0
--- /dev/null
+++ b/src/PbiFile.cpp
@@ -0,0 +1,37 @@
+// File Description
+/// \file PbiFile.cpp
+/// \brief Implements the PbiFile methods.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/PbiFile.h"
+
+#include <cstddef>
+#include <cstdint>
+
+#include "pbbam/BamFile.h"
+#include "pbbam/BamReader.h"
+#include "pbbam/PbiBuilder.h"
+
+namespace PacBio {
+namespace BAM {
+
+void PbiFile::CreateFrom(const BamFile& bamFile,
+                         const PbiBuilder::CompressionLevel compressionLevel,
+                         const size_t numThreads)
+{
+    PbiBuilder builder(bamFile.PacBioIndexFilename(), bamFile.Header().Sequences().size(),
+                       compressionLevel, numThreads);
+    BamReader reader(bamFile);
+    BamRecord b;
+    int64_t offset = reader.VirtualTell();
+    while (reader.GetNext(b)) {
+        builder.AddRecord(b, offset);
+        offset = reader.VirtualTell();
+    }
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/PbiFilter.cpp b/src/PbiFilter.cpp

new file mode 100644 (file)

index 0000000..5364f77
--- /dev/null
+++ b/src/PbiFilter.cpp
@@ -0,0 +1,497 @@
+// File Description
+/// \file PbiFilter.cpp
+/// \brief Implements the PbiFilter class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/PbiFilter.h"
+
+#include <algorithm>
+#include <cctype>
+#include <cstdint>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <unordered_map>
+
+#include <boost/algorithm/string/case_conv.hpp>
+#include <boost/algorithm/string/trim.hpp>
+#include <boost/numeric/conversion/cast.hpp>
+
+#include "FileUtils.h"
+#include "pbbam/PbiFilterTypes.h"
+#include "pbbam/StringUtilities.h"
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// clang-format off
+enum class BuiltIn
+{
+    AlignedEndFilter
+  , AlignedLengthFilter
+  , AlignedStartFilter
+  , AlignedStrandFilter
+  , BarcodeFilter
+  , BarcodeForwardFilter
+  , BarcodeQualityFilter
+  , BarcodeReverseFilter
+  , BarcodesFilter
+  , IdentityFilter
+  , LocalContextFilter
+  , MovieNameFilter
+  , NumDeletedBasesFilter
+  , NumInsertedBasesFilter
+  , NumMatchesFilter
+  , NumMismatchesFilter
+  , QIdFilter
+  , QueryEndFilter
+  , QueryLengthFilter
+  , QueryNameFilter
+  , QueryNamesFromFileFilter
+  , QueryStartFilter
+  , ReadAccuracyFilter
+  , ReadGroupFilter
+  , ReferenceEndFilter
+  , ReferenceIdFilter
+  , ReferenceNameFilter
+  , ReferenceStartFilter
+  , ZmwFilter
+};
+
+static const std::unordered_map<std::string, BuiltIn> builtInLookup =
+{
+    // property name   built-in filter
+    { "ae",            BuiltIn::AlignedEndFilter },
+    { "aend",          BuiltIn::AlignedEndFilter },
+    { "alignedlength", BuiltIn::AlignedLengthFilter },
+    { "as",            BuiltIn::AlignedStartFilter },
+    { "astart",        BuiltIn::AlignedStartFilter },
+    { "readstart",     BuiltIn::AlignedStartFilter },
+    { "bc",            BuiltIn::BarcodeFilter },
+    { "barcode",       BuiltIn::BarcodeFilter },
+    { "bcf",           BuiltIn::BarcodeForwardFilter },
+    { "bq",            BuiltIn::BarcodeQualityFilter },
+    { "bcq",           BuiltIn::BarcodeQualityFilter },
+    { "bcr",           BuiltIn::BarcodeReverseFilter },
+    { "accuracy",      BuiltIn::IdentityFilter },
+    { "identity",      BuiltIn::IdentityFilter },
+    { "cx",            BuiltIn::LocalContextFilter },
+    { "movie",         BuiltIn::MovieNameFilter },
+    { "qid",           BuiltIn::QIdFilter },
+    { "qe",            BuiltIn::QueryEndFilter },
+    { "qend",          BuiltIn::QueryEndFilter },
+    { "length",        BuiltIn::QueryLengthFilter },
+    { "querylength",   BuiltIn::QueryLengthFilter },
+    { "qname",         BuiltIn::QueryNameFilter },
+    { "qname_file",    BuiltIn::QueryNamesFromFileFilter },
+    { "qs",            BuiltIn::QueryStartFilter },
+    { "qstart",        BuiltIn::QueryStartFilter },
+    { "rq",            BuiltIn::ReadAccuracyFilter },
+    { "te",            BuiltIn::ReferenceEndFilter },
+    { "tend",          BuiltIn::ReferenceEndFilter },
+    { "rname",         BuiltIn::ReferenceNameFilter },
+    { "ts",            BuiltIn::ReferenceStartFilter },
+    { "tstart",        BuiltIn::ReferenceStartFilter },
+    { "pos",           BuiltIn::ReferenceStartFilter },
+    { "zm",            BuiltIn::ZmwFilter },
+    { "zmw",           BuiltIn::ZmwFilter }
+};
+
+static const std::unordered_map<std::string, LocalContextFlags> contextFlagNames =
+{
+    { "NO_LOCAL_CONTEXT",   LocalContextFlags::NO_LOCAL_CONTEXT },
+    { "ADAPTER_BEFORE",     LocalContextFlags::ADAPTER_BEFORE },
+    { "ADAPTER_AFTER",      LocalContextFlags::ADAPTER_AFTER },
+    { "BARCODE_BEFORE",     LocalContextFlags::BARCODE_BEFORE },
+    { "BARCODE_AFTER",      LocalContextFlags::BARCODE_AFTER },
+    { "FORWARD_PASS",       LocalContextFlags::FORWARD_PASS },
+    { "REVERSE_PASS",       LocalContextFlags::REVERSE_PASS },
+    { "ADAPTER_BEFORE_BAD", LocalContextFlags::ADAPTER_BEFORE_BAD},
+    { "ADAPTER_AFTER_BAD",  LocalContextFlags::ADAPTER_AFTER_BAD}
+};
+// clang-format on
+
+// helper methods (for handling maybe-list strings)
+/// Returns true if \p value starts with one of '[', '(', '{' and ends with one
+/// of ']', ')', '}'. The two brackets are not required to be of the same kind.
+///
+/// \throws std::out_of_range if \p value is empty
+static inline bool isBracketed(const std::string& value)
+{
+    static const std::string openBrackets = "[({";
+    static const std::string closeBrackets = "])}";
+
+    const char first = value.at(0);
+    if (openBrackets.find(first) == std::string::npos) return false;
+
+    const char last = value.at(value.length() - 1);
+    return closeBrackets.find(last) != std::string::npos;
+}
+
+/// Returns true if \p value contains at least one comma.
+static inline bool isList(const std::string& value)
+{
+    const auto commaPos = value.find(',');
+    return commaPos != std::string::npos;
+}
+
+static PbiFilter CreateBarcodeFilter(std::string value, const Compare::Type compareType)
+{
+    if (value.empty()) throw std::runtime_error{"PbiFilter: empty value for barcode filter property"};
+
+    if (isBracketed(value)) {
+        value.erase(0, 1);
+        value.pop_back();
+    }
+
+    if (isList(value)) {
+        std::vector<std::string> barcodes = Split(value, ',');
+        if (barcodes.size() != 2) throw std::runtime_error{"PbiFilter: only 2 barcode values expected"};
+        return PbiBarcodesFilter{boost::numeric_cast<int16_t>(std::stoi(barcodes.at(0))),
+                                 boost::numeric_cast<int16_t>(std::stoi(barcodes.at(1))),
+                                 compareType};
+    } else
+        return PbiBarcodeFilter{boost::numeric_cast<int16_t>(stoi(value)), compareType};
+}
+
+static PbiFilter CreateBarcodeForwardFilter(std::string value, const Compare::Type compareType)
+{
+    if (value.empty()) throw std::runtime_error{"PbiFilter: empty value for barcode_forward filter property"};
+
+    if (isBracketed(value)) {
+        value.erase(0, 1);
+        value.pop_back();
+    }
+
+    if (isList(value)) {
+        std::vector<std::string> tokens = Split(value, ',');
+        std::vector<int16_t> barcodes;
+        barcodes.reserve(tokens.size());
+        for (const auto& t : tokens)
+            barcodes.push_back(boost::numeric_cast<int16_t>(stoi(t)));
+        return PbiBarcodeForwardFilter{std::move(barcodes)};
+    } else
+        return PbiBarcodeForwardFilter{boost::numeric_cast<int16_t>(std::stoi(value)), compareType};
+}
+
+static PbiFilter CreateBarcodeReverseFilter(std::string value, const Compare::Type compareType)
+{
+    if (value.empty()) throw std::runtime_error{"PbiFilter: empty value for barcode_reverse filter property"};
+
+    if (isBracketed(value)) {
+        value.erase(0, 1);
+        value.pop_back();
+    }
+
+    if (isList(value)) {
+        std::vector<std::string> tokens = Split(value, ',');
+        std::vector<int16_t> barcodes;
+        barcodes.reserve(tokens.size());
+        for (const auto& t : tokens)
+            barcodes.push_back(boost::numeric_cast<int16_t>(std::stoi(t)));
+        return PbiBarcodeReverseFilter{std::move(barcodes)};
+    } else
+        return PbiBarcodeReverseFilter{boost::numeric_cast<int16_t>(stoi(value)), compareType};
+}
+
+static PbiFilter CreateLocalContextFilter(const std::string& value, const Compare::Type compareType)
+{
+    if (value.empty()) throw std::runtime_error{"PbiFilter: empty value for local context filter property"};
+
+    LocalContextFlags filterValue = LocalContextFlags::NO_LOCAL_CONTEXT;
+
+    // if raw integer
+    if (isdigit(value.at(0))) filterValue = static_cast<LocalContextFlags>(stoi(value));
+
+    // else interpret as flag names
+    else {
+        std::vector<std::string> tokens = Split(value, '|');
+        for (std::string& token : tokens) {
+            boost::algorithm::trim(token);  // trim whitespace
+            filterValue = (filterValue | contextFlagNames.at(token));
+        }
+    }
+
+    return PbiFilter{PbiLocalContextFilter{filterValue, compareType}};
+}
+
+static PbiFilter CreateMovieNameFilter(std::string value, const Compare::Type compareType)
+{
+    if (value.empty()) throw std::runtime_error{"PbiFilter: empty value for movie property"};
+
+    if (isBracketed(value)) {
+        value.erase(0, 1);
+        value.pop_back();
+    }
+
+    if (isList(value)) {
+
+        if (compareType != Compare::EQUAL && compareType != Compare::NOT_EQUAL)
+            throw std::runtime_error{"PbiFilter: unsupported compare type on movie property"};
+
+        std::vector<std::string> tokens = Split(value, ',');
+        return PbiMovieNameFilter{std::move(tokens), compareType};
+    } else
+        return PbiMovieNameFilter{value, compareType};
+}
+
+static PbiFilter CreateQIdFilter(std::string value, const Compare::Type compareType)
+{
+    if (value.empty()) throw std::runtime_error{"PbiFilter: empty value for qid property"};
+
+    if (isBracketed(value)) {
+        value.erase(0, 1);
+        value.pop_back();
+    }
+
+    if (isList(value)) {
+
+        if (compareType != Compare::EQUAL && compareType != Compare::NOT_EQUAL)
+            throw std::runtime_error{"PbiFilter: unsupported compare type on qid property"};
+
+        std::vector<int32_t> rgIds;
+        for (const auto& t : Split(value, ','))
+            rgIds.push_back(static_cast<int32_t>(std::stoul(t)));
+        return PbiReadGroupFilter{rgIds, compareType};
+    } else {
+        const auto n = static_cast<int32_t>(std::stoul(value));
+        return PbiReadGroupFilter{n, compareType};
+    }
+
+}
+
+static PbiFilter CreateQueryNamesFilterFromFile(const std::string& value, const DataSet& dataset, const Compare::Type compareType)
+{
+    if (compareType != Compare::EQUAL && compareType != Compare::NOT_EQUAL)
+        throw std::runtime_error{"PbiFilter: unsupported compare type on query name property"};
+
+    // resolve file from dataset, value
+    const std::string resolvedFilename = dataset.ResolvePath(value);
+    std::vector<std::string> whitelist;
+    std::string fn;
+    std::ifstream in(resolvedFilename);
+    while (std::getline(in, fn))
+        whitelist.push_back(fn);
+    return PbiQueryNameFilter{whitelist, compareType};
+}
+
+static PbiFilter CreateQueryNameFilter(std::string value, const DataSet& dataset, const Compare::Type compareType)
+{
+    if (value.empty()) throw std::runtime_error{"PbiFilter: empty value for query name property"};
+
+    // try possible filename first
+    const std::string resolvedFilename = dataset.ResolvePath(value);
+    if (FileUtils::Exists(value))
+        return CreateQueryNamesFilterFromFile(value, dataset, compareType);
+
+    // otherwise "normal" qname (single, or list)
+
+    if (isBracketed(value)) {
+        value.erase(0, 1);
+        value.pop_back();
+    }
+
+    if (isList(value)) {
+
+        if (compareType != Compare::EQUAL && compareType != Compare::NOT_EQUAL)
+            throw std::runtime_error{"PbiFilter: unsupported compare type on query name property"};
+
+        std::vector<std::string> tokens = Split(value, ',');
+        return PbiQueryNameFilter{std::move(tokens), compareType};
+    } else
+        return PbiQueryNameFilter{value, compareType};
+}
+
+static PbiFilter CreateReadGroupFilter(std::string value, const Compare::Type compareType)
+{
+    if (value.empty()) throw std::runtime_error{"PbiFilter: empty value for read group property"};
+
+    if (isBracketed(value)) {
+        value.erase(0, 1);
+        value.pop_back();
+    }
+
+    if (isList(value)) {
+
+        if (compareType != Compare::EQUAL && compareType != Compare::NOT_EQUAL)
+            throw std::runtime_error{"PbiFilter: unsupported compare type on read group property"};
+
+        std::vector<std::string> tokens = Split(value, ',');
+        return PbiReadGroupFilter{std::move(tokens), compareType};
+    } else
+        return PbiReadGroupFilter{value, compareType};
+}
+
+static PbiFilter CreateReferenceIdFilter(std::string value, const Compare::Type compareType)
+{
+    if (value.empty()) throw std::runtime_error{"PbiFilter: empty value for reference ID property"};
+
+    if (isBracketed(value)) {
+        value.erase(0, 1);
+        value.pop_back();
+    }
+
+    if (isList(value)) {
+
+        if (compareType != Compare::EQUAL && compareType != Compare::NOT_EQUAL)
+            throw std::runtime_error{"PbiFilter: unsupported compare type on reference name ID property"};
+
+        std::vector<std::string> tokens = Split(value, ',');
+        std::vector<int32_t> ids;
+        ids.reserve(tokens.size());
+        for (const auto& t : tokens)
+            ids.push_back(boost::numeric_cast<int32_t>(stoi(t)));
+        return PbiReferenceIdFilter{std::move(ids), compareType};
+    } else
+        return PbiReferenceIdFilter{boost::numeric_cast<int32_t>(stoi(value)), compareType};
+}
+
+static PbiFilter CreateReferenceNameFilter(std::string value, const Compare::Type compareType)
+{
+    if (value.empty()) throw std::runtime_error{"PbiFilter: empty value for reference name property"};
+
+    if (isBracketed(value)) {
+        value.erase(0, 1);
+        value.pop_back();
+    }
+
+    if (isList(value)) {
+
+        if (compareType != Compare::EQUAL && compareType != Compare::NOT_EQUAL)
+            throw std::runtime_error{"PbiFilter: unsupported compare type on reference name property"};
+
+        std::vector<std::string> tokens = Split(value, ',');
+        return PbiReferenceNameFilter{std::move(tokens), compareType};
+    } else
+        return PbiReferenceNameFilter{value, compareType};
+}
+
+static PbiFilter CreateZmwFilter(std::string value, const Compare::Type compareType)
+{
+    if (value.empty()) throw std::runtime_error{"PbiFilter: empty value for ZMW filter property"};
+
+    if (isBracketed(value)) {
+        value.erase(0, 1);
+        value.pop_back();
+    }
+
+    if (isList(value)) {
+        std::vector<std::string> tokens = Split(value, ',');
+        std::vector<int32_t> zmws;
+        zmws.reserve(tokens.size());
+        for (const auto& t : tokens)
+            zmws.push_back(boost::numeric_cast<int32_t>(stoi(t)));
+        return PbiZmwFilter{std::move(zmws)};
+    } else
+        return PbiZmwFilter{boost::numeric_cast<int32_t>(stoi(value)), compareType};
+}
+
+static PbiFilter CreateZmwModuloFilter(const Property& property)
+{
+    if (!property.HasAttribute("Modulo") || !property.HasAttribute("Hash") ||
+        property.Name() != "zm")
+    {
+        throw std::runtime_error{"PbiFilter: modulo filter is not supported on property: " + property.Name()};
+    }
+
+    const auto hashType = property.Attribute("Hash");
+    const FilterHash hash = [&hashType]()
+    {
+        if (boost::algorithm::to_lower_copy(hashType) == "uint32cast")
+            return FilterHash::UNSIGNED_LONG_CAST;
+        if (boost::algorithm::to_lower_copy(hashType) == "boosthashcombine")
+            return FilterHash::BOOST_HASH_COMBINE;
+        throw std::runtime_error{"PbiFilter: unsuppoerted hash type: " + hashType};
+    }();
+
+    const uint32_t denom = std::stoul(property.Attribute("Modulo"));
+    const uint32_t value = std::stoul(property.Value());
+
+    return PbiZmwModuloFilter{ denom, value, hash, Compare::EQUAL };
+}
+
+static PbiFilter FromDataSetProperty(const Property& property, const DataSet& dataset)
+{
+    try {
+        const std::string& value = property.Value();
+
+        if (property.Name() == "zm" && property.HasAttribute("Modulo"))
+            return CreateZmwModuloFilter(property);
+
+        const Compare::Type compareType = Compare::TypeFromOperator(property.Operator());
+        const BuiltIn builtInCode =
+            builtInLookup.at(boost::algorithm::to_lower_copy(property.Name()));
+
+        // clang-format off
+        switch (builtInCode) {
+
+            // single-value filters
+            case BuiltIn::AlignedEndFilter     : return PbiAlignedEndFilter{ static_cast<uint32_t>(std::stoul(value)), compareType };
+            case BuiltIn::AlignedLengthFilter  : return PbiAlignedLengthFilter{ static_cast<uint32_t>(std::stoul(value)), compareType };
+            case BuiltIn::AlignedStartFilter   : return PbiAlignedStartFilter{ static_cast<uint32_t>(std::stoul(value)), compareType };
+            case BuiltIn::BarcodeQualityFilter : return PbiBarcodeQualityFilter{ static_cast<uint8_t>(std::stoul(value)), compareType };
+            case BuiltIn::IdentityFilter       : return PbiIdentityFilter{ std::stof(value), compareType };
+            case BuiltIn::QueryEndFilter       : return PbiQueryEndFilter{ std::stoi(value), compareType };
+            case BuiltIn::QueryLengthFilter    : return PbiQueryLengthFilter{ std::stoi(value), compareType };
+            case BuiltIn::QueryStartFilter     : return PbiQueryStartFilter{ std::stoi(value), compareType };
+            case BuiltIn::ReadAccuracyFilter   : return PbiReadAccuracyFilter{ std::stof(value), compareType };
+            case BuiltIn::ReferenceEndFilter   : return PbiReferenceEndFilter{ static_cast<uint32_t>(std::stoul(value)), compareType };
+            case BuiltIn::ReferenceStartFilter : return PbiReferenceStartFilter{ static_cast<uint32_t>(std::stoul(value)), compareType };
+
+            // (maybe) list-value filters
+            case BuiltIn::BarcodeFilter        : return CreateBarcodeFilter(value, compareType);
+            case BuiltIn::BarcodeForwardFilter : return CreateBarcodeForwardFilter(value, compareType);
+            case BuiltIn::BarcodeReverseFilter : return CreateBarcodeReverseFilter(value, compareType);
+            case BuiltIn::LocalContextFilter   : return CreateLocalContextFilter(value, compareType);
+            case BuiltIn::MovieNameFilter      : return CreateMovieNameFilter(value, compareType);
+            case BuiltIn::QIdFilter            : return CreateQIdFilter(value, compareType);
+            case BuiltIn::QueryNameFilter      : return CreateQueryNameFilter(value, dataset, compareType);
+            case BuiltIn::ReadGroupFilter      : return CreateReadGroupFilter(value, compareType);
+            case BuiltIn::ReferenceIdFilter    : return CreateReferenceIdFilter(value, compareType);
+            case BuiltIn::ReferenceNameFilter  : return CreateReferenceNameFilter(value, compareType);
+            case BuiltIn::ZmwFilter            : return CreateZmwFilter(value, compareType);
+
+            // other built-ins
+            case BuiltIn::QueryNamesFromFileFilter : return CreateQueryNamesFilterFromFile(value, dataset, compareType);
+
+            default :
+            throw std::runtime_error{"PbiFilter: invalid built-in filter requested"};
+        }
+        // clang-format on
+
+        // unreachable
+        return PbiFilter{};
+
+    } catch (std::exception& e) {
+        std::ostringstream s;
+        s << "PbiFilter: could not create filter from XML Property element:\n"
+          << "  Name:     " << property.Name() << '\n'
+          << "  Value:    " << property.Value() << '\n'
+          << "  Operator: " << property.Operator() << '\n'
+          << "  reason:   " << e.what() << '\n';
+        throw std::runtime_error{s.str()};
+    }
+}
+
+}  // namespace internal
+
+/// Builds the filter described by \p dataset.
+///
+/// Each of the dataset's filters becomes a default-constructed PbiFilter
+/// holding one sub-filter per property; those per-filter results are then
+/// combined in a UNION filter.
+///
+/// \throws std::runtime_error if any property cannot be converted
+PbiFilter PbiFilter::FromDataSet(const DataSet& dataset)
+{
+    PbiFilter combined{PbiFilter::UNION};
+    for (const auto& xmlFilter : dataset.Filters()) {
+
+        // gather this filter's properties ...
+        PbiFilter fromProperties;
+        for (const auto& xmlProperty : xmlFilter.Properties()) {
+            fromProperties.Add(internal::FromDataSetProperty(xmlProperty, dataset));
+        }
+
+        // ... and add them, as one unit, to the union
+        combined.Add(fromProperties);
+    }
+    return combined;
+}
+
+/// Returns an INTERSECT-type filter holding all of \p filters.
+PbiFilter PbiFilter::Intersection(std::vector<PbiFilter> filters)
+{
+    PbiFilter intersection{PbiFilter::INTERSECT};
+    intersection.Add(std::move(filters));
+    return intersection;
+}
+
+/// Returns a UNION-type filter holding all of \p filters.
+PbiFilter PbiFilter::Union(std::vector<PbiFilter> filters)
+{
+    PbiFilter unionFilter{PbiFilter::UNION};
+    unionFilter.Add(std::move(filters));
+    return unionFilter;
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/PbiFilterQuery.cpp b/src/PbiFilterQuery.cpp

new file mode 100644 (file)

index 0000000..37d655f
--- /dev/null
+++ b/src/PbiFilterQuery.cpp
@@ -0,0 +1,58 @@
+// File Description
+/// \file PbiFilterQuery.cpp
+/// \brief Implements the PbiFilterQuery class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/PbiFilterQuery.h"
+
+#include <iostream>
+
+#include "pbbam/CompositeBamReader.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// Private implementation of PbiFilterQuery: owns the composite reader that
+/// performs the filtered iteration.
+class PbiFilterQuery::PbiFilterQueryPrivate
+{
+public:
+    /// Constructs the reader from the filter, dataset and index cache.
+    PbiFilterQueryPrivate(const PbiFilter& filter, const DataSet& dataset,
+                          const PbiIndexCache& cache)
+        : reader_{filter, dataset, cache}
+    {
+    }
+
+    // Compare::None -> records are not sorted across files
+    PbiFilterCompositeBamReader<Compare::None> reader_;
+};
+
+/// Creates a query using the filter described by \p dataset itself, and a
+/// freshly made index cache.
+PbiFilterQuery::PbiFilterQuery(const DataSet& dataset)
+    : PbiFilterQuery{PbiFilter::FromDataSet(dataset),  // filter
+                     dataset,
+                     MakePbiIndexCache(dataset)}       // cache
+{
+}
+
+/// Creates a query using the filter described by \p dataset itself, and the
+/// caller-provided index \p cache.
+PbiFilterQuery::PbiFilterQuery(const DataSet& dataset, const PbiIndexCache& cache)
+    : PbiFilterQuery{PbiFilter::FromDataSet(dataset),  // filter
+                     dataset, cache}
+{
+}
+
+/// Creates a query applying \p filter to \p dataset, with a freshly made index
+/// cache.
+PbiFilterQuery::PbiFilterQuery(const PbiFilter& filter, const DataSet& dataset)
+    : PbiFilterQuery{filter, dataset,
+                     MakePbiIndexCache(dataset)}  // cache
+{
+}
+
+/// Creates a query applying \p filter to \p dataset, using the caller-provided
+/// index \p cache. All other constructors delegate here.
+PbiFilterQuery::PbiFilterQuery(const PbiFilter& filter, const DataSet& dataset,
+                               const PbiIndexCache& cache)
+    : internal::IQuery()
+    , d_{std::make_unique<PbiFilterQueryPrivate>(filter, dataset, cache)}
+{
+}
+
+/// Defaulted here, in the source file, where PbiFilterQueryPrivate is a complete type.
+PbiFilterQuery::~PbiFilterQuery() = default;
+
+/// Fetches the next record from the underlying reader into \p r and returns
+/// that reader's result.
+bool PbiFilterQuery::GetNext(BamRecord& r)
+{
+    auto& reader = d_->reader_;
+    return reader.GetNext(r);
+}
+
+/// Returns the read count reported by the underlying reader.
+uint32_t PbiFilterQuery::NumReads() const
+{
+    const auto& reader = d_->reader_;
+    return reader.NumReads();
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/PbiFilterTypes.cpp b/src/PbiFilterTypes.cpp

new file mode 100644 (file)

index 0000000..6cba0b1
--- /dev/null
+++ b/src/PbiFilterTypes.cpp
@@ -0,0 +1,522 @@
+// File Description
+/// \file PbiFilterTypes.cpp
+/// \brief Implements the built-in PBI filters.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/PbiFilterTypes.h"
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <sstream>
+#include <string>
+#include <unordered_map>
+
+#include <boost/algorithm/string.hpp>
+
+#include "pbbam/StringUtilities.h"
+
+namespace PacBio {
+namespace BAM {
+namespace {
+
+template <typename T>  // T: element type shared by the start/end coordinates and the threshold value
+IndexList readLengthHelper(const std::vector<T>& start, const std::vector<T>& end, const T& value,
+                           const Compare::Type cmp)
+{
+    assert(start.size() == end.size());  // start/end are indexed in parallel below
+
+    auto result = IndexList{};  // indices i for which (end[i] - start[i]) <cmp> value holds
+    const auto numElements = start.size();
+    for (size_t i = 0; i < numElements; ++i) {
+        const auto readLength = end[i] - start[i];
+        bool keep = false;
+        switch (cmp) {
+            case Compare::EQUAL:
+                keep = (readLength == value);
+                break;
+            case Compare::NOT_EQUAL:
+                keep = (readLength != value);
+                break;
+            case Compare::LESS_THAN:
+                keep = (readLength < value);
+                break;
+            case Compare::LESS_THAN_EQUAL:
+                keep = (readLength <= value);
+                break;
+            case Compare::GREATER_THAN:
+                keep = (readLength > value);
+                break;
+            case Compare::GREATER_THAN_EQUAL:
+                keep = (readLength >= value);
+                break;
+            default:
+                assert(false);  // stops debug builds; with NDEBUG the throw below reports the error
+                throw std::runtime_error{
+                    "PbiFilter: read length filter encountered unknown compare type: " +
+                    Compare::TypeToName(cmp)};
+        }
+
+        if (keep) result.push_back(i);
+    }
+    return result;
+}
+
+PbiFilter filterFromMovieName(const std::string& movieName, bool includeCcs)  // builds a read-group-based filter for one movie name
+{
+    //
+    // All transcript-type reads (movieName == "transcript") have the same
+    // read group ID. Calculate it once & create filters from that ID.
+    //
+    if (movieName == "transcript") {
+        static const auto transcriptRgId = MakeReadGroupId("transcript", "TRANSCRIPT");  // function-local static: computed on first use only
+        return PbiFilter{PbiReadGroupFilter{transcriptRgId}};
+    }
+
+    //
+    // For all other movie names, we can't determine read type up front, so we'll match
+    // on any rgIds from a candidate list (one read group filter per candidate read type).
+    //
+    auto filter = PbiFilter{PbiFilter::UNION};
+    filter.Add({PbiReadGroupFilter{MakeReadGroupId(movieName, "POLYMERASE")},
+                PbiReadGroupFilter{MakeReadGroupId(movieName, "HQREGION")},
+                PbiReadGroupFilter{MakeReadGroupId(movieName, "SUBREAD")},
+                PbiReadGroupFilter{MakeReadGroupId(movieName, "SCRAP")},
+                PbiReadGroupFilter{MakeReadGroupId(movieName, "UNKNOWN")}});
+    if (includeCcs) filter.Add(PbiReadGroupFilter{MakeReadGroupId(movieName, "CCS")});  // CCS candidate is opt-in
+
+    return filter;
+}
+
+}  // namespace
+
+// PbiAlignedLengthFilter: Accepts() tests a row's aligned length (aEnd - aStart)
+
+bool PbiAlignedLengthFilter::Accepts(const PbiRawData& idx, const size_t row) const
+{
+    const auto& mappedData = idx.MappedData();
+    const auto& aEnd = mappedData.aEnd_.at(row);  // at(): bounds-checked access for the requested row
+    const auto& aStart = mappedData.aStart_.at(row);
+    const auto aLength = aEnd - aStart;
+    return CompareHelper(aLength);  // the comparison itself is done by CompareHelper (not defined in this file)
+}
+
+// PbiIdentityFilter: Accepts() tests identity = 1 - (nMM + nDel + nIns) / (qEnd - qStart)
+
+bool PbiIdentityFilter::Accepts(const PbiRawData& idx, const size_t row) const
+{
+    const auto& mappedData = idx.MappedData();
+    const auto& nMM = mappedData.nMM_.at(row);
+    const auto& nIndels = mappedData.NumDeletedAndInsertedBasesAt(row);  // pair: first = deleted, second = inserted (per the two lines below)
+    const auto& nDel = nIndels.first;
+    const auto& nIns = nIndels.second;
+
+    const auto& basicData = idx.BasicData();
+    const auto& qStart = basicData.qStart_.at(row);
+    const auto& qEnd = basicData.qEnd_.at(row);
+
+    const float readLength = qEnd - qStart;  // NOTE(review): qEnd == qStart makes the division below non-finite — confirm such rows cannot occur
+    const float nonMatches = nMM + nDel + nIns;
+    const float identity = 1.0f - (nonMatches / readLength);
+
+    return CompareHelper(identity);  // the comparison itself is done by CompareHelper (not defined in this file)
+}
+
+// PbiMovieNameFilter: constructors translate movie name(s) into a composite filter via filterFromMovieName()
+
+PbiMovieNameFilter::PbiMovieNameFilter(const std::string& movieName, const Compare::Type cmp)
+    : compositeFilter_{filterFromMovieName(movieName, true)}  // include CCS
+    , cmp_{cmp}
+{
+}
+
+PbiMovieNameFilter::PbiMovieNameFilter(const std::vector<std::string>& movieNames,
+                                       const Compare::Type cmp)
+    : compositeFilter_{PbiFilter::UNION}, cmp_{cmp}  // UNION composite: one sub-filter is added per movie name
+{
+    for (const auto& movieName : movieNames)
+        compositeFilter_.Add(filterFromMovieName(movieName, true));  // include CCS
+}
+
+// PbiQueryLengthFilter: Accepts() tests a row's query length (qEnd - qStart)
+
+bool PbiQueryLengthFilter::Accepts(const PbiRawData& idx, const size_t row) const
+{
+    const auto& basicData = idx.BasicData();
+    const auto& qStart = basicData.qStart_.at(row);  // at(): bounds-checked access for the requested row
+    const auto& qEnd = basicData.qEnd_.at(row);
+    const auto readLength = qEnd - qStart;
+    return CompareHelper(readLength);  // the comparison itself is done by CompareHelper (not defined in this file)
+}
+
+// PbiQueryNameFilter
+
+struct PbiQueryNameFilter::PbiQueryNameFilterPrivate  // private implementation: rgId -> ZMW -> query-interval lookup
+{
+public:
+    using QueryInterval = std::pair<int32_t, int32_t>;  // (queryStart, queryEnd)
+    using QueryIntervals = std::set<QueryInterval>;
+    using ZmwLookup = std::unordered_map<int32_t, QueryIntervals>;  // keyed by ZMW hole number
+    using ZmwLookupPtr = std::shared_ptr<ZmwLookup>;  // may be shared by more than one rgId
+    using RgIdLookup = std::unordered_map<int32_t, ZmwLookupPtr>;  // keyed by numeric read group ID
+
+    PbiQueryNameFilterPrivate(const std::vector<std::string>& queryNames,
+                              const Compare::Type cmp = Compare::EQUAL)
+        : cmp_{cmp}
+    {
+        for (const auto& queryName : queryNames) {
+
+            if (queryName.find("transcript/") == 0)  // name starts with "transcript/"
+                HandleName(queryName, RecordType::TRANSCRIPT);
+            else if (queryName.find("/ccs") != std::string::npos)  // "/ccs" anywhere in the name
+                HandleName(queryName, RecordType::CCS);
+            else
+                HandleName(queryName, RecordType::UNKNOWN);
+        }
+    }
+
+    PbiQueryNameFilterPrivate(const std::unique_ptr<PbiQueryNameFilterPrivate>& other)  // copy-from-pointer, used by the owner's copy constructor
+    {
+        if (other) {
+            lookup_ = other->lookup_;  // copies the shared_ptr values, so ZmwLookup tables are shared with 'other'
+            cmp_ = other->cmp_;
+        }  // NOTE(review): cmp_ is left unset when 'other' is null — confirm callers never pass null
+    }
+
+    bool Accepts(const PbiRawData& idx, const size_t row) const
+    {
+        const auto& basicData = idx.BasicData();
+
+        const bool found = [&]() {  // immediately-invoked lambda: allows early returns while keeping 'found' const
+            // see if row's RGID known
+            const auto& rgId = basicData.rgId_.at(row);
+            const auto rgFound = lookup_.find(rgId);
+            if (rgFound == lookup_.end()) return false;
+
+            // see if row's ZMW known
+            const auto& zmwPtr = rgFound->second;
+            const auto zmw = basicData.holeNumber_.at(row);
+            const auto zmwFound = zmwPtr->find(zmw);
+            if (zmwFound == zmwPtr->end()) return false;
+
+            // see if row's QueryStart/QueryEnd known
+            // CCS names already covered in lookup construction phase
+            const auto& queryIntervals = zmwFound->second;
+            const auto qStart = basicData.qStart_.at(row);
+            const auto qEnd = basicData.qEnd_.at(row);
+            const auto queryInterval = std::make_pair(qStart, qEnd);
+            return (queryIntervals.find(queryInterval) != queryIntervals.end());
+        }();
+
+        if (cmp_ == Compare::EQUAL || cmp_ == Compare::CONTAINS)  // whitelist semantics
+            return found;
+        else if (cmp_ == Compare::NOT_EQUAL || cmp_ == Compare::NOT_CONTAINS)  // blacklist semantics
+            return !found;
+        else
+            throw std::runtime_error{"PbiFilter: unsupported compare type on query name filter"};
+    }
+
+    std::vector<int32_t> CandidateRgIds(const std::string& movieName, const RecordType type)  // numeric rgIds a name of this type could belong to
+    {
+        if (type == RecordType::CCS)
+            return {ReadGroupInfo::IdToInt(MakeReadGroupId(movieName, "CCS"))};
+
+        if (type == RecordType::TRANSCRIPT)
+            return {ReadGroupInfo::IdToInt(MakeReadGroupId(movieName, "TRANSCRIPT"))};
+
+        // we can't know for sure from QNAME alone, so return one candidate per remaining read type
+        return {ReadGroupInfo::IdToInt(MakeReadGroupId(movieName, "POLYMERASE")),
+                ReadGroupInfo::IdToInt(MakeReadGroupId(movieName, "HQREGION")),
+                ReadGroupInfo::IdToInt(MakeReadGroupId(movieName, "SUBREAD")),
+                ReadGroupInfo::IdToInt(MakeReadGroupId(movieName, "SCRAP")),
+                ReadGroupInfo::IdToInt(MakeReadGroupId(movieName, "UNKNOWN")),
+                ReadGroupInfo::IdToInt(MakeReadGroupId(movieName, "ZMW"))};
+    }
+
+    void HandleName(const std::string& queryName, const RecordType type)  // parses one QNAME and records it in lookup_
+    {
+        // split name into main parts ('/'-separated)
+        const auto nameParts = Split(queryName, '/');
+
+        // verify syntax: 2 parts for CCS/transcript names, 3 parts otherwise
+        if (IsCcsOrTranscript(type)) {
+            if (nameParts.size() != 2) {
+                const auto typeName = (type == RecordType::CCS) ? "CCS" : "transcript";
+                throw std::runtime_error{"PbiQueryNameFilter: requested QNAME (" + queryName +
+                                         ") is not valid for PacBio " + typeName +
+                                         " reads. See spec for details."};
+            }
+        } else {
+            if (nameParts.size() != 3) {
+                throw std::runtime_error{"PbiQueryNameFilter: requested QNAME (" + queryName +
+                                         ") is not a valid PacBio BAM QNAME. See spec for details"};
+            }
+        }
+
+        // generate candidate read group IDs from movie name & record type, then
+        // add to lookup table
+        const auto zmwPtr = UpdateRgLookup(CandidateRgIds(nameParts.at(0), type));
+
+        // add qStart/qEnd interval to zmw lookup
+        const auto zmw = std::stoi(nameParts.at(1));  // std::stoi throws if the part is not a number
+        if (IsCcsOrTranscript(type))
+            UpdateZmwQueryIntervals(zmwPtr.get(), zmw, -1, -1);  // these names carry no interval part; (-1, -1) is stored instead
+        else {
+            const auto queryIntervalParts = Split(nameParts.at(2), '_');  // third part is "<start>_<end>"
+            if (queryIntervalParts.size() != 2) {
+                throw std::runtime_error{"PbiQueryNameFilter: requested QNAME (" + queryName +
+                                         ") is not a valid PacBio BAM QNAME. See spec for details"};
+            }
+            UpdateZmwQueryIntervals(zmwPtr.get(), zmw, std::stoi(queryIntervalParts.at(0)),
+                                    std::stoi(queryIntervalParts.at(1)));
+        }
+    }
+
+    ZmwLookupPtr UpdateRgLookup(std::vector<int32_t>&& rgIds)  // returns the ZMW table shared by all of rgIds, creating it if needed
+    {
+        assert(!rgIds.empty());
+
+        ZmwLookupPtr zmwPtr;
+
+        const auto rgFound = lookup_.find(rgIds.front());  // only the first candidate is probed; the asserts below check the rest
+        if (rgFound == lookup_.end()) {
+            zmwPtr = std::make_shared<ZmwLookup>();
+            for (const auto& rg : rgIds) {
+                assert(lookup_.find(rg) == lookup_.end());
+                lookup_.emplace(rg, zmwPtr);  // every candidate rgId maps to the same table
+            }
+        } else {
+#ifndef NDEBUG
+            for (const auto& rg : rgIds)
+                assert(lookup_.find(rg) != lookup_.end());
+#endif
+            zmwPtr = rgFound->second;
+        }
+        return zmwPtr;
+    }
+
+    // add QS/QE pair to ZMW lookup (creates the ZMW's interval set on first use)
+    void UpdateZmwQueryIntervals(ZmwLookup* const zmwPtr, const int32_t zmw,
+                                 const int32_t queryStart, const int32_t queryEnd)
+    {
+        const auto zmwFound = zmwPtr->find(zmw);
+        if (zmwFound == zmwPtr->end()) zmwPtr->emplace(zmw, QueryIntervals{});
+        auto& queryIntervals = zmwPtr->at(zmw);
+        queryIntervals.emplace(std::make_pair(queryStart, queryEnd));  // std::set: duplicate intervals are stored once
+    }
+
+private:
+    RgIdLookup lookup_;  // numeric rgId -> shared ZMW table
+    Compare::Type cmp_;  // compare type evaluated in Accepts()
+};
+
+PbiQueryNameFilter::PbiQueryNameFilter(const std::string& qname, const Compare::Type cmp)
+    : d_{std::make_unique<PbiQueryNameFilter::PbiQueryNameFilterPrivate>(
+          std::vector<std::string>{1, qname}, cmp)}  // {1, qname}: 1 is not convertible to std::string, so this is the (count, value) constructor
+{
+}
+
+PbiQueryNameFilter::PbiQueryNameFilter(const std::vector<std::string>& queryNames,
+                                       const Compare::Type cmp)
+    : d_{std::make_unique<PbiQueryNameFilter::PbiQueryNameFilterPrivate>(queryNames, cmp)}
+{
+}
+
+PbiQueryNameFilter::PbiQueryNameFilter(const PbiQueryNameFilter& other)  // copy: new private object built from other's
+    : d_{std::make_unique<PbiQueryNameFilter::PbiQueryNameFilterPrivate>(other.d_)}
+{
+}
+
+PbiQueryNameFilter::~PbiQueryNameFilter() = default;  // defaulted in this file, after PbiQueryNameFilterPrivate is fully defined
+
+bool PbiQueryNameFilter::Accepts(const PbiRawData& idx, const size_t row) const
+{
+    return d_->Accepts(idx, row);  // forwards to the private implementation
+}
+
+// PbiReadGroupFilter
+
+PbiReadGroupFilter::PbiReadGroupFilter(const std::vector<int32_t>& rgIds, const Compare::Type cmp)
+    : cmp_{cmp}
+{
+    if (cmp_ == Compare::EQUAL)  // equality is normalized to whitelist membership
+        cmp_ = Compare::CONTAINS;
+    else if (cmp_ == Compare::NOT_EQUAL)  // inequality is normalized to blacklist membership
+        cmp_ = Compare::NOT_CONTAINS;
+
+    if (cmp_ != Compare::CONTAINS && cmp_ != Compare::NOT_CONTAINS) {
+        throw std::runtime_error{
+            "PbiFilter: unsupported compare type (" + Compare::TypeToName(cmp) +
+            ") for this property. "
+            "Read group filter can only compare equality or presence in whitelist/blacklist."};
+    }
+
+    // Add RG ID & empty filter if not present. The empty filter will work for
+    // non-barcoded IDs that match the expected number(s).
+    //
+    for (const auto& rgId : rgIds) {
+        const auto found = lookup_.find(rgId);
+        if (found == lookup_.cend()) lookup_.emplace(rgId, boost::none);  // boost::none == no barcode restriction
+    }
+}
+
+PbiReadGroupFilter::PbiReadGroupFilter(const int32_t rgId, const Compare::Type cmp)
+    : PbiReadGroupFilter{std::vector<int32_t>{rgId}, cmp}  // single-ID convenience: delegates to the vector<int32_t> overload
+{
+}
+
+PbiReadGroupFilter::PbiReadGroupFilter(const std::vector<ReadGroupInfo>& readGroups,
+                                       const Compare::Type cmp)
+    : cmp_{cmp}
+{
+    if (cmp_ == Compare::EQUAL)  // equality is normalized to whitelist membership
+        cmp_ = Compare::CONTAINS;
+    else if (cmp_ == Compare::NOT_EQUAL)  // inequality is normalized to blacklist membership
+        cmp_ = Compare::NOT_CONTAINS;
+
+    if (cmp_ != Compare::CONTAINS && cmp_ != Compare::NOT_CONTAINS) {
+        throw std::runtime_error{
+            "PbiFilter: unsupported compare type (" + Compare::TypeToName(cmp) +
+            ") for this property. "
+            "Read group filter can only compare equality or presence in whitelist/blacklist."};
+    }
+
+    for (const auto& rg : readGroups) {
+        // Add RG base ID with no filter if not present. The empty filter will
+        // work for non-barcoded IDs. We'll add to it if the base read group ID
+        // also has barcode labels, so that any barcode pair whitelisted for this
+        // read group filter will be a match.
+        //
+        const auto idNum = ReadGroupInfo::IdToInt(rg.BaseId());
+        const auto found = lookup_.find(idNum);
+        if (found == lookup_.cend()) lookup_.emplace(idNum, boost::none);
+
+        // Maybe add barcodes to base ID
+        const auto barcodes = rg.Barcodes();
+        if (barcodes) {
+            const auto bcFor = static_cast<int16_t>(barcodes->first);
+            const auto bcRev = static_cast<int16_t>(barcodes->second);
+            auto& idBarcodes = lookup_.at(idNum);  // entry is guaranteed to exist after the emplace above
+            if (!idBarcodes) idBarcodes = std::vector<std::pair<int16_t, int16_t>>{};  // first barcode pair for this ID: create the list
+            idBarcodes->push_back(std::make_pair(bcFor, bcRev));
+        }
+    }
+}
+
+PbiReadGroupFilter::PbiReadGroupFilter(const ReadGroupInfo& rg, const Compare::Type cmp)
+    : PbiReadGroupFilter{std::vector<ReadGroupInfo>{rg}, cmp}  // single-read-group convenience: delegates to the vector overload
+{
+}
+
+/// \brief Creates a read group filter from read group ID strings.
+///
+/// Each ID string is converted to a ReadGroupInfo; the filter is then built by
+/// the std::vector<ReadGroupInfo> constructor and assigned to this object, so
+/// compare-type validation (and its std::runtime_error) happens there.
+///
+/// \param[in] rgIds  read group ID strings
+/// \param[in] cmp    compare type
+///
+PbiReadGroupFilter::PbiReadGroupFilter(const std::vector<std::string>& rgIds,
+                                       const Compare::Type cmp)
+{
+    std::vector<ReadGroupInfo> readGroups;
+    readGroups.reserve(rgIds.size());  // one allocation for the whole list
+    for (const auto& rgId : rgIds)     // by reference: no per-element string copy
+        readGroups.push_back(rgId);
+    *this = PbiReadGroupFilter{readGroups, cmp};
+}
+
+PbiReadGroupFilter::PbiReadGroupFilter(const std::string& rgId, const Compare::Type cmp)
+    : PbiReadGroupFilter{ReadGroupInfo{rgId}, cmp}  // single-ID-string convenience: delegates to the ReadGroupInfo overload
+{
+}
+
+bool PbiReadGroupFilter::Accepts(const PbiRawData& idx, const size_t row) const
+{
+    const auto accepted = [this](const PbiRawData& index, const size_t i) {  // invoked immediately with (idx, row)
+        // Check that read group base ID is found.
+        const auto rowRgId = index.BasicData().rgId_.at(i);
+        const auto foundAt = lookup_.find(rowRgId);
+        if (foundAt == lookup_.cend()) return false;
+
+        // Read group's base ID is found, check for filtered barcodes.
+        //
+        // For non-barcoded read groups, the filter is empty. This is
+        // essentially a no-op for allowing all candidate rows.
+        //
+        const auto& barcodes = foundAt->second;
+        if (!barcodes) return true;
+
+        // Return success on first match, otherwise no match found.
+        for (const auto bcPair : *barcodes) {
+            if (index.BarcodeData().bcForward_.at(i) == bcPair.first &&
+                index.BarcodeData().bcReverse_.at(i) == bcPair.second) {
+                return true;
+            }
+        }
+        return false;
+    }(idx, row);
+
+    assert(cmp_ == Compare::CONTAINS || cmp_ == Compare::NOT_CONTAINS);  // the only values the constructors in this file leave in cmp_
+    return (cmp_ == Compare::CONTAINS ? accepted : !accepted);  // NOT_CONTAINS inverts the match
+}
+
+// PbiReferenceNameFilter
+
+PbiReferenceNameFilter::PbiReferenceNameFilter(std::string rname, Compare::Type cmp)  // single reference name
+    : rname_{std::move(rname)}, cmp_{cmp}
+{
+    Validate();  // throws std::runtime_error for an unsupported compare type
+}
+
+PbiReferenceNameFilter::PbiReferenceNameFilter(std::vector<std::string> rnames,
+                                               const Compare::Type cmp)
+    : rnameWhitelist_{std::move(rnames)}, cmp_{cmp}  // multiple names: stored in rnameWhitelist_
+{
+    Validate();  // throws std::runtime_error for an unsupported compare type
+}
+
+bool PbiReferenceNameFilter::Accepts(const PbiRawData& idx, const size_t row) const
+{
+    if (!initialized_) Initialize(idx);  // sub-filter is built on the first call, from this index
+    return subFilter_.Accepts(idx, row);
+}
+
+void PbiReferenceNameFilter::Initialize(const PbiRawData& idx) const  // resolves reference name(s) to IDs and builds subFilter_
+{
+
+    // fetch BAM header info associated with this index
+    const auto pbiFilename = idx.Filename();
+    const auto bamFilename = pbiFilename.substr(0, pbiFilename.length() - 4);  // drops the last 4 characters — presumably the ".pbi" suffix; TODO confirm
+    const BamFile bamFile{bamFilename};
+
+    // single-value
+    if (rnameWhitelist_ == boost::none) {
+        const auto tId = bamFile.ReferenceId(rname_);
+        subFilter_ = PbiReferenceIdFilter{tId, cmp_};
+    }
+
+    // multi-value (whitelist/blacklist)
+    else {
+        std::vector<int32_t> ids;
+        for (const auto& rname : rnameWhitelist_.get())
+            ids.push_back(bamFile.ReferenceId(rname));
+        subFilter_ = PbiReferenceIdFilter{std::move(ids), cmp_};
+    }
+    initialized_ = true;  // Accepts() checks this flag, so the lookup above runs once per filter
+}
+
+void PbiReferenceNameFilter::Validate() const  // throws std::runtime_error unless cmp_ is one of the four supported types
+{
+    // double-check valid compare type
+    const bool compareTypeOk = [&]() {
+        if (cmp_ == Compare::EQUAL) return true;
+        if (cmp_ == Compare::NOT_EQUAL) return true;
+        if (cmp_ == Compare::CONTAINS) return true;
+        if (cmp_ == Compare::NOT_CONTAINS) return true;
+        return false;
+    }();
+    if (!compareTypeOk) {
+        throw std::runtime_error{
+            "PbiFilter: unsupported compare type (" + Compare::TypeToName(cmp_) +
+            ") for this property. "
+            "Reference name filter can only compare equality or presence in whitelist/blacklist."};
+    }
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/PbiIndexIO.cpp b/src/PbiIndexIO.cpp

new file mode 100644 (file)

index 0000000..2afdcba
--- /dev/null
+++ b/src/PbiIndexIO.cpp
@@ -0,0 +1,401 @@
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "PbiIndexIO.h"
+
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <sstream>
+#include <stdexcept>
+#include <vector>
+
+#include <boost/algorithm/string.hpp>
+
+#include <pbcopper/utility/MoveAppend.h>
+
+#include "MemoryUtils.h"
+#include "pbbam/BamFile.h"
+#include "pbbam/BamRecord.h"
+#include "pbbam/EntireFileQuery.h"
+#include "pbbam/PbiBuilder.h"
+
+namespace PacBio {
+namespace BAM {
+namespace {
+
+void CheckContainer(const std::string& container, const size_t expected, const size_t observed)  // size check for one named PBI field
+{
+    if (observed != expected) {
+        std::ostringstream msg;
+        msg << "PBI index error: expected " << expected << " records in " << container
+            << " field, but found " << observed << " instead";
+        throw std::runtime_error{msg.str()};  // message names the field and both counts
+    }
+}
+
+/// \brief Verifies that every barcode column holds exactly \p numReads entries.
+///
+/// \param[in] barcodeData  barcode columns to check
+/// \param[in] numReads     expected number of entries per column
+///
+/// \throws std::runtime_error (from CheckContainer) naming the first column
+///         whose size does not match
+///
+void CheckExpectedSize(const PbiRawBarcodeData& barcodeData, const size_t numReads)
+{
+    CheckContainer("BarcodeData.bc_forward", numReads, barcodeData.bcForward_.size());
+    CheckContainer("BarcodeData.bc_reverse", numReads, barcodeData.bcReverse_.size());
+    // bc_qual is validated against its own column (not bc_reverse's)
+    CheckContainer("BarcodeData.bc_qual", numReads, barcodeData.bcQual_.size());
+}
+
+void CheckExpectedSize(const PbiRawBasicData& basicData, const size_t numReads)  // every basic column must hold numReads entries
+{
+    CheckContainer("BasicData.rgId", numReads, basicData.rgId_.size());  // CheckContainer throws on the first mismatch
+    CheckContainer("BasicData.qStart", numReads, basicData.qStart_.size());
+    CheckContainer("BasicData.qEnd", numReads, basicData.qEnd_.size());
+    CheckContainer("BasicData.holeNumber", numReads, basicData.holeNumber_.size());
+    CheckContainer("BasicData.readQual", numReads, basicData.readQual_.size());
+    CheckContainer("BasicData.ctxt_flag", numReads, basicData.ctxtFlag_.size());
+    CheckContainer("BasicData.fileOffset", numReads, basicData.fileOffset_.size());
+}
+
+void CheckExpectedSize(const PbiRawMappedData& mappedData, const size_t numReads)  // every mapped column must hold numReads entries
+{
+    CheckContainer("MappedData.tId", numReads, mappedData.tId_.size());  // CheckContainer throws on the first mismatch
+    CheckContainer("MappedData.tStart", numReads, mappedData.tStart_.size());
+    CheckContainer("MappedData.tEnd", numReads, mappedData.tEnd_.size());
+    CheckContainer("MappedData.aStart", numReads, mappedData.aStart_.size());
+    CheckContainer("MappedData.aEnd", numReads, mappedData.aEnd_.size());
+    CheckContainer("MappedData.revStrand", numReads, mappedData.revStrand_.size());
+    CheckContainer("MappedData.nM", numReads, mappedData.nM_.size());
+    CheckContainer("MappedData.nMM", numReads, mappedData.nMM_.size());
+    CheckContainer("MappedData.mapQV", numReads, mappedData.mapQV_.size());
+}
+
+}  // namespace
+
+PbiRawData PbiIndexIO::Load(const std::string& pbiFilename)  // convenience overload: loads into a fresh PbiRawData and returns it
+{
+    PbiRawData rawData;
+    Load(rawData, pbiFilename);  // the two-argument overload does the actual work
+    return rawData;
+}
+
+void PbiIndexIO::Load(PbiRawData& rawData, const std::string& filename)
+{
+    // open file for reading
+    if (!boost::algorithm::iends_with(filename, ".pbi"))
+        throw std::runtime_error{"unsupported file extension on " + filename};
+    std::unique_ptr<BGZF, HtslibBgzfDeleter> bgzf(bgzf_open(filename.c_str(), "rb"));
+    auto* fp = bgzf.get();
+    if (fp == nullptr)
+        throw std::runtime_error{"could not open PBI file: " + filename + "for reading"};
+
+    // load data
+    LoadHeader(rawData, fp);
+    const auto numReads = rawData.NumReads();
+    if (numReads > 0) {
+        LoadBasicData(rawData.BasicData(), numReads, fp);
+        if (rawData.HasMappedData()) LoadMappedData(rawData.MappedData(), numReads, fp);
+        if (rawData.HasReferenceData()) LoadReferenceData(rawData.ReferenceData(), fp);
+        if (rawData.HasBarcodeData()) LoadBarcodeData(rawData.BarcodeData(), numReads, fp);
+    }
+}
+
+void PbiIndexIO::LoadFromDataSet(PbiRawData& aggregateData, const DataSet& dataset)  // concatenates the PBI data of every BAM file in dataset
+{
+    aggregateData.NumReads(0);
+    aggregateData.FileSections(PbiFile::BASIC | PbiFile::MAPPED | PbiFile::BARCODE);  // aggregate always carries these three sections
+    aggregateData.Version(PbiFile::CurrentVersion);
+
+    const auto bamFiles = dataset.BamFiles();
+    uint16_t fileNumber = 0;  // position of the current file in bamFiles; recorded per row in fileNumber_
+    for (const auto& bamFile : bamFiles) {
+        PbiRawData currentPbi{bamFile.PacBioIndexFilename()};
+        const auto currentPbiCount = currentPbi.NumReads();
+
+        // read count: running total across files
+        aggregateData.NumReads(aggregateData.NumReads() + currentPbiCount);
+
+        // BasicData: columns are moved out of currentPbi and appended
+        auto& aggregateBasicData = aggregateData.BasicData();
+        auto& currentBasicData = currentPbi.BasicData();
+        Utility::MoveAppend(std::move(currentBasicData.rgId_), aggregateBasicData.rgId_);
+        Utility::MoveAppend(std::move(currentBasicData.qStart_), aggregateBasicData.qStart_);
+        Utility::MoveAppend(std::move(currentBasicData.qEnd_), aggregateBasicData.qEnd_);
+        Utility::MoveAppend(std::move(currentBasicData.holeNumber_),
+                            aggregateBasicData.holeNumber_);
+        Utility::MoveAppend(std::move(currentBasicData.readQual_), aggregateBasicData.readQual_);
+        Utility::MoveAppend(std::move(currentBasicData.ctxtFlag_), aggregateBasicData.ctxtFlag_);
+        Utility::MoveAppend(std::move(currentBasicData.fileOffset_),
+                            aggregateBasicData.fileOffset_);
+        Utility::MoveAppend(std::vector<uint16_t>(currentPbiCount, fileNumber),
+                            aggregateBasicData.fileNumber_);
+
+        // BarcodeData: real columns if present, otherwise -1 placeholders (one per read)
+        auto& aggregateBarcodeData = aggregateData.BarcodeData();
+        if (currentPbi.HasBarcodeData()) {
+            auto& currentBarcodeData = currentPbi.BarcodeData();
+            Utility::MoveAppend(std::move(currentBarcodeData.bcForward_),
+                                aggregateBarcodeData.bcForward_);
+            Utility::MoveAppend(std::move(currentBarcodeData.bcReverse_),
+                                aggregateBarcodeData.bcReverse_);
+            Utility::MoveAppend(std::move(currentBarcodeData.bcQual_),
+                                aggregateBarcodeData.bcQual_);
+        } else {
+            Utility::MoveAppend(std::vector<int16_t>(currentPbiCount, -1),
+                                aggregateBarcodeData.bcForward_);
+            Utility::MoveAppend(std::vector<int16_t>(currentPbiCount, -1),
+                                aggregateBarcodeData.bcReverse_);
+            Utility::MoveAppend(std::vector<int8_t>(currentPbiCount, -1),
+                                aggregateBarcodeData.bcQual_);
+        }
+
+        // MappedData: real columns if present, otherwise placeholder values (one per read)
+        auto& aggregateMappedData = aggregateData.MappedData();
+        if (currentPbi.HasMappedData()) {
+            auto& currentMappedData = currentPbi.MappedData();
+            Utility::MoveAppend(std::move(currentMappedData.tId_), aggregateMappedData.tId_);
+            Utility::MoveAppend(std::move(currentMappedData.tStart_), aggregateMappedData.tStart_);
+            Utility::MoveAppend(std::move(currentMappedData.tEnd_), aggregateMappedData.tEnd_);
+            Utility::MoveAppend(std::move(currentMappedData.aStart_), aggregateMappedData.aStart_);
+            Utility::MoveAppend(std::move(currentMappedData.aEnd_), aggregateMappedData.aEnd_);
+            Utility::MoveAppend(std::move(currentMappedData.revStrand_),
+                                aggregateMappedData.revStrand_);
+            Utility::MoveAppend(std::move(currentMappedData.nM_), aggregateMappedData.nM_);
+            Utility::MoveAppend(std::move(currentMappedData.nMM_), aggregateMappedData.nMM_);
+            Utility::MoveAppend(std::move(currentMappedData.mapQV_), aggregateMappedData.mapQV_);
+        } else {
+            Utility::MoveAppend(std::vector<int32_t>(currentPbiCount, -1),
+                                aggregateMappedData.tId_);
+            Utility::MoveAppend(std::vector<uint32_t>(currentPbiCount, UnmappedPosition),
+                                aggregateMappedData.tStart_);
+            Utility::MoveAppend(std::vector<uint32_t>(currentPbiCount, UnmappedPosition),
+                                aggregateMappedData.tEnd_);
+            Utility::MoveAppend(std::vector<uint32_t>(currentPbiCount, UnmappedPosition),
+                                aggregateMappedData.aStart_);
+            Utility::MoveAppend(std::vector<uint32_t>(currentPbiCount, UnmappedPosition),
+                                aggregateMappedData.aEnd_);
+            Utility::MoveAppend(std::vector<uint8_t>(currentPbiCount, 0),
+                                aggregateMappedData.revStrand_);
+            Utility::MoveAppend(std::vector<uint32_t>(currentPbiCount, 0), aggregateMappedData.nM_);
+            Utility::MoveAppend(std::vector<uint32_t>(currentPbiCount, 0),
+                                aggregateMappedData.nMM_);
+            Utility::MoveAppend(std::vector<uint8_t>(currentPbiCount, 255),
+                                aggregateMappedData.mapQV_);
+        }
+
+        ++fileNumber;  // NOTE(review): uint16_t counter — assumes a dataset never holds more than 65535 BAM files; confirm
+    }
+}
+
+void PbiIndexIO::LoadBarcodeData(PbiRawBarcodeData& barcodeData, const uint32_t numReads, BGZF* fp)
+{
+    // read from file: columns are read in this order (forward, reverse, qual)
+    LoadBgzfVector(fp, barcodeData.bcForward_, numReads);
+    LoadBgzfVector(fp, barcodeData.bcReverse_, numReads);
+    LoadBgzfVector(fp, barcodeData.bcQual_, numReads);
+
+    // validate: CheckExpectedSize throws std::runtime_error on a column size mismatch
+    CheckExpectedSize(barcodeData, numReads);
+}
+
+void PbiIndexIO::LoadHeader(PbiRawData& index, BGZF* fp)
+{
+    // 'magic' string
+    char magic[4];
+    auto bytesRead = bgzf_read(fp, magic, 4);
+    if (bytesRead != 4 || strncmp(magic, "PBI\1", 4))
+        throw std::runtime_error{"expected PBI file, found unknown format instead"};
+
+    // version, pbi_flags, & n_reads
+    uint32_t version;
+    uint16_t sections;
+    uint32_t numReads;
+    bytesRead = bgzf_read(fp, &version, sizeof(version));
+    bytesRead = bgzf_read(fp, &sections, sizeof(sections));
+    bytesRead = bgzf_read(fp, &numReads, sizeof(numReads));
+    if (fp->is_be) {
+        version = ed_swap_4(version);
+        sections = ed_swap_2(sections);
+        numReads = ed_swap_4(numReads);
+    }
+
+    index.Version(PbiFile::VersionEnum(version));
+    index.FileSections(sections);
+    index.NumReads(numReads);
+
+    // skip reserved section
+    size_t reservedLength = 18;
+    // adjust depending on version
+    char reserved[18];
+    bytesRead = bgzf_read(fp, &reserved, reservedLength);
+}
+
+void PbiIndexIO::LoadMappedData(PbiRawMappedData& mappedData, const uint32_t numReads, BGZF* fp)
+{
+    // read from file: columns are read in this order (matches WriteMappedData)
+    LoadBgzfVector(fp, mappedData.tId_, numReads);
+    LoadBgzfVector(fp, mappedData.tStart_, numReads);
+    LoadBgzfVector(fp, mappedData.tEnd_, numReads);
+    LoadBgzfVector(fp, mappedData.aStart_, numReads);
+    LoadBgzfVector(fp, mappedData.aEnd_, numReads);
+    LoadBgzfVector(fp, mappedData.revStrand_, numReads);
+    LoadBgzfVector(fp, mappedData.nM_, numReads);
+    LoadBgzfVector(fp, mappedData.nMM_, numReads);
+    LoadBgzfVector(fp, mappedData.mapQV_, numReads);
+
+    // validate: CheckExpectedSize throws std::runtime_error on a column size mismatch
+    CheckExpectedSize(mappedData, numReads);
+}
+
+void PbiIndexIO::LoadReferenceData(PbiRawReferenceData& referenceData, BGZF* fp)
+{
+    assert(sizeof(PbiReferenceEntry::ID) == 4);
+    assert(sizeof(PbiReferenceEntry::Row) == 4);
+
+    // num refs
+    uint32_t numRefs;
+    auto ret = bgzf_read(fp, &numRefs, 4);
+    if (fp->is_be) numRefs = ed_swap_4(numRefs);
+
+    // reference entries
+    referenceData.entries_.clear();
+    referenceData.entries_.resize(numRefs);
+    for (auto& entry : referenceData.entries_) {
+        ret = bgzf_read(fp, &entry.tId_, 4);
+        ret = bgzf_read(fp, &entry.beginRow_, 4);
+        ret = bgzf_read(fp, &entry.endRow_, 4);
+        if (fp->is_be) {
+            entry.tId_ = ed_swap_4(entry.tId_);
+            entry.beginRow_ = ed_swap_4(entry.beginRow_);
+            entry.endRow_ = ed_swap_4(entry.endRow_);
+        }
+    }
+    UNUSED(ret);
+}
+
+void PbiIndexIO::LoadBasicData(PbiRawBasicData& basicData, const uint32_t numReads, BGZF* fp)
+{
+    // read from file: columns are read in this order (matches WriteBasicData)
+    LoadBgzfVector(fp, basicData.rgId_, numReads);
+    LoadBgzfVector(fp, basicData.qStart_, numReads);
+    LoadBgzfVector(fp, basicData.qEnd_, numReads);
+    LoadBgzfVector(fp, basicData.holeNumber_, numReads);
+    LoadBgzfVector(fp, basicData.readQual_, numReads);
+    LoadBgzfVector(fp, basicData.ctxtFlag_, numReads);
+    LoadBgzfVector(fp, basicData.fileOffset_, numReads);
+
+    // validate: CheckExpectedSize throws std::runtime_error on a column size mismatch
+    CheckExpectedSize(basicData, numReads);
+}
+
+void PbiIndexIO::Save(const PbiRawData& index, const std::string& filename)
+{
+    std::unique_ptr<BGZF, HtslibBgzfDeleter> bgzf(bgzf_open(filename.c_str(), "wb"));
+    auto* fp = bgzf.get();
+    if (fp == nullptr)
+        throw std::runtime_error{"could not open PBI file: " + filename + "for writing"};
+
+    WriteHeader(index, fp);
+    const auto numReads = index.NumReads();
+    if (numReads > 0) {
+        WriteBasicData(index.BasicData(), numReads, fp);
+
+        if (index.HasMappedData()) WriteMappedData(index.MappedData(), numReads, fp);
+        if (index.HasReferenceData()) WriteReferenceData(index.ReferenceData(), fp);
+        if (index.HasBarcodeData()) WriteBarcodeData(index.BarcodeData(), numReads, fp);
+    }
+}
+
+void PbiIndexIO::WriteBarcodeData(const PbiRawBarcodeData& barcodeData, const uint32_t numReads,
+                                  BGZF* fp)
+{
+    // validate: throws std::runtime_error before anything is written if a column size is wrong
+    CheckExpectedSize(barcodeData, numReads);
+
+    // write to file: same column order as LoadBarcodeData reads
+    WriteBgzfVector(fp, barcodeData.bcForward_);
+    WriteBgzfVector(fp, barcodeData.bcReverse_);
+    WriteBgzfVector(fp, barcodeData.bcQual_);
+}
+
+void PbiIndexIO::WriteHeader(const PbiRawData& index, BGZF* fp)
+{
+    // 'magic' string
+    constexpr static const std::array<char, 4> magic{{'P', 'B', 'I', '\1'}};
+    auto ret = bgzf_write(fp, magic.data(), 4);
+
+    // version, pbi_flags, & n_reads
+    auto version = static_cast<uint32_t>(index.Version());
+    uint16_t pbi_flags = index.FileSections();
+    auto numReads = static_cast<uint16_t>(index.NumReads());
+    if (fp->is_be) {
+        version = ed_swap_4(version);
+        pbi_flags = ed_swap_2(pbi_flags);
+        numReads = ed_swap_4(numReads);
+    }
+    ret = bgzf_write(fp, &version, 4);
+    ret = bgzf_write(fp, &pbi_flags, 2);
+    ret = bgzf_write(fp, &numReads, 4);
+
+    // reserved space
+    char reserved[18];
+    memset(reserved, 0, 18);
+    ret = bgzf_write(fp, reserved, 18);
+    UNUSED(ret);
+}
+
+void PbiIndexIO::WriteMappedData(const PbiRawMappedData& mappedData, const uint32_t numReads,
+                                 BGZF* fp)
+{
+    // validate: throws std::runtime_error before anything is written if a column size is wrong
+    CheckExpectedSize(mappedData, numReads);
+
+    // write to file: same column order as LoadMappedData reads
+    WriteBgzfVector(fp, mappedData.tId_);
+    WriteBgzfVector(fp, mappedData.tStart_);
+    WriteBgzfVector(fp, mappedData.tEnd_);
+    WriteBgzfVector(fp, mappedData.aStart_);
+    WriteBgzfVector(fp, mappedData.aEnd_);
+    WriteBgzfVector(fp, mappedData.revStrand_);
+    WriteBgzfVector(fp, mappedData.nM_);
+    WriteBgzfVector(fp, mappedData.nMM_);
+    WriteBgzfVector(fp, mappedData.mapQV_);
+}
+
+void PbiIndexIO::WriteReferenceData(const PbiRawReferenceData& referenceData, BGZF* fp)
+{
+    // num_refs
+    auto numRefs = referenceData.entries_.size();
+    if (fp->is_be) numRefs = ed_swap_4(numRefs);
+    auto ret = bgzf_write(fp, &numRefs, 4);
+
+    // reference entries
+    for (const auto& entry : referenceData.entries_) {
+        auto tId = entry.tId_;
+        auto beginRow = entry.beginRow_;
+        auto endRow = entry.endRow_;
+        if (fp->is_be) {
+            tId = ed_swap_4(tId);
+            beginRow = ed_swap_4(beginRow);
+            endRow = ed_swap_4(endRow);
+        }
+        ret = bgzf_write(fp, &tId, 4);
+        ret = bgzf_write(fp, &beginRow, 4);
+        ret = bgzf_write(fp, &endRow, 4);
+    }
+    UNUSED(ret);
+}
+
+void PbiIndexIO::WriteBasicData(const PbiRawBasicData& basicData, const uint32_t numReads, BGZF* fp)
+{
+    // validate: throws std::runtime_error before anything is written if a column size is wrong
+    CheckExpectedSize(basicData, numReads);
+
+    // write to file: same column order as LoadBasicData reads
+    WriteBgzfVector(fp, basicData.rgId_);
+    WriteBgzfVector(fp, basicData.qStart_);
+    WriteBgzfVector(fp, basicData.qEnd_);
+    WriteBgzfVector(fp, basicData.holeNumber_);
+    WriteBgzfVector(fp, basicData.readQual_);
+    WriteBgzfVector(fp, basicData.ctxtFlag_);
+    WriteBgzfVector(fp, basicData.fileOffset_);
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/PbiIndexIO.h b/src/PbiIndexIO.h

new file mode 100644 (file)

index 0000000..cedfc63
--- /dev/null
+++ b/src/PbiIndexIO.h
@@ -0,0 +1,115 @@
+// Author: Derek Barnett
+
+#ifndef PBIINDEXIO_H
+#define PBIINDEXIO_H
+
+#include "pbbam/Config.h"
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include <htslib/bgzf.h>
+#include <htslib/sam.h>
+
+#include "pbbam/BamFile.h"
+#include "pbbam/DataSet.h"
+#include "pbbam/PbiFile.h"
+#include "pbbam/PbiRawData.h"
+#include "pbbam/Unused.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// Collection of static helpers that read and write PBI index data.
+///
+/// Every member is static; the class only groups the entry points and the
+/// per-section / per-column helpers under one name.
+///
+class PbiIndexIO
+{
+public:
+    // top-level entry points
+    //
+    // Load/Save work on a PBI filename; LoadFromDataSet fills one aggregate
+    // PbiRawData object from a DataSet.
+    static PbiRawData Load(const std::string& filename);
+    static void Load(PbiRawData& rawData, const std::string& filename);
+    static void LoadFromDataSet(PbiRawData& aggregateData, const DataSet& dataset);
+    static void Save(const PbiRawData& rawData, const std::string& filename);
+
+    // per-component load
+    //
+    // Each function reads one section of the index from an open BGZF handle
+    // into the matching raw-data object.
+    static void LoadBarcodeData(PbiRawBarcodeData& barcodeData, const uint32_t numReads, BGZF* fp);
+    static void LoadHeader(PbiRawData& index, BGZF* fp);
+    static void LoadMappedData(PbiRawMappedData& mappedData, const uint32_t numReads, BGZF* fp);
+    static void LoadReferenceData(PbiRawReferenceData& referenceData, BGZF* fp);
+    static void LoadBasicData(PbiRawBasicData& basicData, const uint32_t numReads, BGZF* fp);
+
+    // per-data-field load
+    //
+    // Reads numReads elements of type T into data (defined below).
+    template <typename T>
+    static void LoadBgzfVector(BGZF* fp, std::vector<T>& data, const uint32_t numReads);
+
+    // per-component write
+    //
+    // Each function writes one section of the index to an open BGZF handle.
+    static void WriteBarcodeData(const PbiRawBarcodeData& barcodeData, const uint32_t numReads,
+                                 BGZF* fp);
+    static void WriteHeader(const PbiRawData& index, BGZF* fp);
+    static void WriteMappedData(const PbiRawMappedData& mappedData, const uint32_t numReads,
+                                BGZF* fp);
+    static void WriteReferenceData(const PbiRawReferenceData& referenceData, BGZF* fp);
+    static void WriteBasicData(const PbiRawBasicData& subreadData, const uint32_t numReads,
+                               BGZF* fp);
+
+    // per-data-field write
+    //
+    // Writes every element of data (defined below).
+    template <typename T>
+    static void WriteBgzfVector(BGZF* fp, const std::vector<T>& data);
+
+private:
+    // helper functions
+    //
+    // Byte-swaps every element of data in place (defined below).
+    template <typename T>
+    static void SwapEndianness(std::vector<T>& data);
+};
+
+/// Reads \p numReads elements of type T from \p fp into \p data.
+///
+/// \p data is resized to \p numReads and then filled straight from the file.
+/// The elements are byte-swapped afterwards when fp->is_be is set.
+///
+/// \param[in]  fp        BGZF handle to read from (must not be null)
+/// \param[out] data      destination vector
+/// \param[in]  numReads  number of elements to read
+///
+/// \note The number of bytes actually read is not checked.
+///
+template <typename T>
+inline void PbiIndexIO::LoadBgzfVector(BGZF* fp, std::vector<T>& data, const uint32_t numReads)
+{
+    assert(fp);
+    data.resize(numReads);
+    // data() is well-defined on an empty vector (numReads == 0); &data[0] is not
+    auto ret = bgzf_read(fp, data.data(), numReads * sizeof(T));
+    if (fp->is_be) SwapEndianness(data);
+    UNUSED(ret);
+}
+
+template <typename T>
+inline void PbiIndexIO::SwapEndianness(std::vector<T>& data)
+{
+    const auto elementSize = sizeof(T);
+    const auto numReads = data.size();
+    switch (elementSize) {
+        case 1:
+            break;  // no swapping necessary
+        case 2:
+            for (size_t i = 0; i < numReads; ++i)
+                ed_swap_2p(&data[i]);
+            break;
+        case 4:
+            for (size_t i = 0; i < numReads; ++i)
+                ed_swap_4p(&data[i]);
+            break;
+        case 8:
+            for (size_t i = 0; i < numReads; ++i)
+                ed_swap_8p(&data[i]);
+            break;
+        default:
+            throw std::runtime_error{"unsupported element size"};
+    }
+}
+
+/// Writes every element of \p data to \p fp.
+///
+/// When fp->is_be is set, a byte-swapped copy is written so that the caller's
+/// data stays untouched; otherwise the data is written directly, without an
+/// intermediate copy.
+///
+/// \param[in] fp    BGZF handle to write to (must not be null)
+/// \param[in] data  elements to write
+///
+/// \note The result of bgzf_write is not checked.
+///
+template <typename T>
+inline void PbiIndexIO::WriteBgzfVector(BGZF* fp, const std::vector<T>& data)
+{
+    assert(fp);
+
+    // data() is well-defined on an empty vector; &v[0] is not
+    const void* bytes = data.data();
+
+    // only pay for a copy when the bytes really have to be swapped
+    std::vector<T> swapped;
+    if (fp->is_be) {
+        swapped = data;
+        SwapEndianness(swapped);
+        bytes = swapped.data();
+    }
+
+    auto ret = bgzf_write(fp, bytes, data.size() * sizeof(T));
+    UNUSED(ret);
+}
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // PBIINDEXIO_H
diff --git a/src/PbiIndexedBamReader.cpp b/src/PbiIndexedBamReader.cpp

new file mode 100644 (file)

index 0000000..7cc6ab2
--- /dev/null
+++ b/src/PbiIndexedBamReader.cpp
@@ -0,0 +1,183 @@
+// File Description
+/// \file PbiIndexedBamReader.cpp
+/// \brief Implements the PbiIndexedBamReader class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/PbiIndexedBamReader.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <iostream>
+
+#include <htslib/bgzf.h>
+
+namespace PacBio {
+namespace BAM {
+
+class PbiIndexedBamReader::PbiIndexedBamReaderPrivate
+{
+public:
+    explicit PbiIndexedBamReaderPrivate(BamFile file, const std::shared_ptr<PbiRawData>& index)
+        : file_{std::move(file)}, index_{index}, currentBlockReadCount_{0}, numMatchingReads_{0}
+    {
+    }
+
+    void ApplyOffsets()
+    {
+        const auto& fileOffsets = index_->BasicData().fileOffset_;
+        for (IndexResultBlock& block : blocks_)
+            block.virtualOffset_ = fileOffsets.at(block.firstIndex_);
+    }
+
+    void Filter(const PbiFilter filter)
+    {
+        // store request & reset counters
+        filter_ = std::move(filter);
+        currentBlockReadCount_ = 0;
+        blocks_.clear();
+        numMatchingReads_ = 0;
+
+        // find blocks of reads passing filter criteria
+        const auto totalReads = index_->NumReads();
+        if (totalReads == 0) {  // empty PBI - no reads to use
+            return;
+        } else if (filter_.IsEmpty()) {  // empty filter - use all reads
+            numMatchingReads_ = totalReads;
+            blocks_.emplace_back(0, totalReads);
+        } else {
+            IndexList indices;
+            indices.reserve(totalReads);
+            const auto& idx = *index_;
+            for (size_t i = 0; i < totalReads; ++i) {
+                if (filter_.Accepts(idx, i)) {
+                    indices.push_back(i);
+                    ++numMatchingReads_;
+                }
+            }
+            blocks_ = MergedIndexBlocks(std::move(indices));
+        }
+
+        // apply offsets
+        ApplyOffsets();
+    }
+
+    IndexResultBlocks MergedIndexBlocks(IndexList indices) const
+    {
+        if (indices.empty()) return {};
+
+        std::sort(indices.begin(), indices.end());
+        auto newEndIter = std::unique(indices.begin(), indices.end());
+        auto numIndices = std::distance(indices.begin(), newEndIter);
+        auto result = IndexResultBlocks{IndexResultBlock{indices.at(0), 1}};
+        for (auto i = 1; i < numIndices; ++i) {
+            if (indices.at(i) == indices.at(i - 1) + 1)
+                ++result.back().numReads_;
+            else
+                result.emplace_back(indices.at(i), 1);
+        }
+        return result;
+    }
+
+    int ReadRawData(BGZF* bgzf, bam1_t* b)
+    {
+        // no data to fetch, return false
+        if (blocks_.empty()) return -1;  // "EOF"
+
+        // if on new block, seek to its first record
+        if (currentBlockReadCount_ == 0) {
+            const auto seekResult = bgzf_seek(bgzf, blocks_.at(0).virtualOffset_, SEEK_SET);
+            if (seekResult == -1)
+                throw std::runtime_error{"PbiIndexedBamReader: could not seek in BAM file"};
+        }
+
+        // read next record
+        const auto result = bam_read1(bgzf, b);
+
+        // update counters. if block finished, pop & reset
+        ++currentBlockReadCount_;
+        if (currentBlockReadCount_ == blocks_.at(0).numReads_) {
+            blocks_.pop_front();
+            currentBlockReadCount_ = 0;
+        }
+
+        return result;
+    }
+
+    BamFile file_;
+    PbiFilter filter_;
+    std::shared_ptr<PbiRawData> index_;
+    IndexResultBlocks blocks_;
+    size_t currentBlockReadCount_;
+    uint32_t numMatchingReads_;
+};
+
+PbiIndexedBamReader::PbiIndexedBamReader(PbiFilter filter, const std::string& filename)
+    : PbiIndexedBamReader{std::move(filter), BamFile{filename}}
+{
+}
+
+PbiIndexedBamReader::PbiIndexedBamReader(PbiFilter filter, const std::string& filename,
+                                         const std::shared_ptr<PbiRawData>& index)
+    : PbiIndexedBamReader{std::move(filter), BamFile{filename}, index}
+{
+}
+
+// Sets the reader up through the BamFile-only constructor, then applies
+// \p filter via Filter().
+PbiIndexedBamReader::PbiIndexedBamReader(PbiFilter filter, BamFile bamFile)
+    : PbiIndexedBamReader{std::move(bamFile)}
+{
+    Filter(std::move(filter));
+}
+
+// Sets the reader up through the (BamFile, index) constructor, using the
+// caller-supplied index, then applies \p filter via Filter().
+PbiIndexedBamReader::PbiIndexedBamReader(PbiFilter filter, BamFile bamFile,
+                                         const std::shared_ptr<PbiRawData>& index)
+    : PbiIndexedBamReader{std::move(bamFile), index}
+{
+    Filter(std::move(filter));
+}
+
+PbiIndexedBamReader::PbiIndexedBamReader(const std::string& bamFilename)
+    : PbiIndexedBamReader{BamFile{bamFilename}}
+{
+}
+
+PbiIndexedBamReader::PbiIndexedBamReader(const std::string& bamFilename,
+                                         const std::shared_ptr<PbiRawData>& index)
+    : PbiIndexedBamReader{BamFile{bamFilename}, index}
+{
+}
+
+// Builds the reader from a BAM file alone. The index is obtained by building
+// a one-file index cache with MakePbiIndexCache() and taking its first entry.
+// No filter is applied here.
+PbiIndexedBamReader::PbiIndexedBamReader(BamFile bamFile) : BamReader{bamFile.Filename()}
+{
+    auto indexCache = MakePbiIndexCache(bamFile);
+    // bamFile is moved from only here, after its last use above
+    d_ = std::make_unique<PbiIndexedBamReaderPrivate>(std::move(bamFile), indexCache->at(0));
+}
+
+// Builds the reader from a BAM file and a caller-supplied index; nothing is
+// loaded from disk by this constructor itself and no filter is applied.
+PbiIndexedBamReader::PbiIndexedBamReader(BamFile bamFile, const std::shared_ptr<PbiRawData>& index)
+    : BamReader{bamFile.Filename()}
+    , d_{std::make_unique<PbiIndexedBamReaderPrivate>(std::move(bamFile), index)}
+{
+}
+
+PbiIndexedBamReader::~PbiIndexedBamReader() = default;
+
+const BamFile& PbiIndexedBamReader::File() const { return d_->file_; }
+
+const PbiFilter& PbiIndexedBamReader::Filter() const { return d_->filter_; }
+
+PbiIndexedBamReader& PbiIndexedBamReader::Filter(PbiFilter filter)
+{
+    d_->Filter(std::move(filter));
+    return *this;
+}
+
+const IndexResultBlocks& PbiIndexedBamReader::IndexBlocks() const { return d_->blocks_; }
+
+uint32_t PbiIndexedBamReader::NumReads() const { return d_->numMatchingReads_; }
+
+int PbiIndexedBamReader::ReadRawData(BGZF* bgzf, bam1_t* b) { return d_->ReadRawData(bgzf, b); }
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/PbiRawData.cpp b/src/PbiRawData.cpp

new file mode 100644 (file)

index 0000000..381bb51
--- /dev/null
+++ b/src/PbiRawData.cpp
@@ -0,0 +1,352 @@
+// File Description
+/// \file PbiRawData.cpp
+/// \brief Implements the classes used for working with raw PBI data.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/PbiRawData.h"
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <map>
+#include <tuple>
+#include <type_traits>
+
+#include <boost/numeric/conversion/cast.hpp>
+
+#include "PbiIndexIO.h"
+#include "pbbam/BamFile.h"
+#include "pbbam/BamRecord.h"
+#include "pbbam/RecordType.h"
+
+namespace PacBio {
+namespace BAM {
+
+// ----------------------------------
+// PbiRawBarcodeData implementation
+// ----------------------------------
+
+static_assert(std::is_copy_constructible<PbiRawBarcodeData>::value,
+              "PbiRawBarcodeData(const PbiRawBarcodeData&) is not = default");
+static_assert(std::is_copy_assignable<PbiRawBarcodeData>::value,
+              "PbiRawBarcodeData& operator=(const PbiRawBarcodeData&) is not = default");
+
+static_assert(std::is_nothrow_move_constructible<PbiRawBarcodeData>::value,
+              "PbiRawBarcodeData(PbiRawBarcodeData&&) is not = noexcept");
+static_assert(std::is_nothrow_move_assignable<PbiRawBarcodeData>::value,
+              "PbiRawBarcodeData& operator=(PbiRawBarcodeData&&) is not = noexcept");
+
+// Reserves capacity for \p numReads entries in every barcode column.
+// No elements are added.
+PbiRawBarcodeData::PbiRawBarcodeData(uint32_t numReads)
+{
+    bcForward_.reserve(numReads);
+    bcReverse_.reserve(numReads);
+    bcQual_.reserve(numReads);
+}
+
+void PbiRawBarcodeData::AddRecord(const BamRecord& b)
+{
+    // check for any barcode data (both required)
+    if (b.HasBarcodes() && b.HasBarcodeQuality()) {
+
+        // fetch data from record
+        int16_t bcForward;
+        int16_t bcReverse;
+        std::tie(bcForward, bcReverse) = b.Barcodes();
+
+        const auto bcQuality = boost::numeric_cast<int8_t>(b.BarcodeQuality());
+
+        // only store actual data if all values >= 0
+        if (bcForward >= 0 && bcReverse >= 0 && bcQuality >= 0) {
+            bcForward_.push_back(bcForward);
+            bcReverse_.push_back(bcReverse);
+            bcQual_.push_back(bcQuality);
+            return;
+        }
+    }
+
+    // if we get here, at least one value is either missing or is -1
+    bcForward_.push_back(-1);
+    bcReverse_.push_back(-1);
+    bcQual_.push_back(-1);
+}
+
+// ----------------------------------
+// PbiRawMappedData implementation
+// ----------------------------------
+
+static_assert(std::is_copy_constructible<PbiRawMappedData>::value,
+              "PbiRawMappedData(const PbiRawMappedData&) is not = default");
+static_assert(std::is_copy_assignable<PbiRawMappedData>::value,
+              "PbiRawMappedData& operator=(const PbiRawMappedData&) is not = default");
+
+static_assert(std::is_nothrow_move_constructible<PbiRawMappedData>::value,
+              "PbiRawMappedData(PbiRawMappedData&&) is not = noexcept");
+static_assert(std::is_nothrow_move_assignable<PbiRawMappedData>::value,
+              "PbiRawMappedData& operator=(PbiRawMappedData&&) is not = noexcept");
+
+// Reserves capacity for \p numReads entries in every mapped-data column.
+// No elements are added.
+PbiRawMappedData::PbiRawMappedData(uint32_t numReads)
+{
+    tId_.reserve(numReads);
+    tStart_.reserve(numReads);
+    tEnd_.reserve(numReads);
+    aStart_.reserve(numReads);
+    aEnd_.reserve(numReads);
+    revStrand_.reserve(numReads);
+    nM_.reserve(numReads);
+    nMM_.reserve(numReads);
+    mapQV_.reserve(numReads);
+}
+
+// Appends one row of mapping data, taken from record \p b, to every column.
+//
+// NOTE(review): the accessors used here presumably expect a mapped record -
+// confirm against callers.
+void PbiRawMappedData::AddRecord(const BamRecord& b)
+{
+    tId_.push_back(b.ReferenceId());
+    tStart_.push_back(b.ReferenceStart());
+    tEnd_.push_back(b.ReferenceEnd());
+    aStart_.push_back(b.AlignedStart());
+    aEnd_.push_back(b.AlignedEnd());
+    // strand is stored as a flag: 1 for reverse, 0 otherwise
+    revStrand_.push_back((b.AlignedStrand() == Strand::REVERSE ? 1 : 0));
+    mapQV_.push_back(b.MapQuality());
+
+    // computed once; .first goes to nM_, .second to nMM_
+    const auto matchesAndMismatches = b.NumMatchesAndMismatches();
+    nM_.push_back(matchesAndMismatches.first);
+    nMM_.push_back(matchesAndMismatches.second);
+}
+
+// Returns the deletion count for the record at \p recordIndex, i.e. the first
+// member of NumDeletedAndInsertedBasesAt().
+uint32_t PbiRawMappedData::NumDeletedBasesAt(size_t recordIndex) const
+{
+    const auto deletedAndInserted = NumDeletedAndInsertedBasesAt(recordIndex);
+    return deletedAndInserted.first;
+}
+
+std::pair<uint32_t, uint32_t> PbiRawMappedData::NumDeletedAndInsertedBasesAt(
+    size_t recordIndex) const
+{
+    const auto aStart = aStart_.at(recordIndex);
+    const auto aEnd = aEnd_.at(recordIndex);
+    const auto tStart = tStart_.at(recordIndex);
+    const auto tEnd = tEnd_.at(recordIndex);
+    const auto nM = nM_.at(recordIndex);
+    const auto nMM = nMM_.at(recordIndex);
+
+    const auto numIns = (aEnd - aStart - nM - nMM);
+    const auto numDel = (tEnd - tStart - nM - nMM);
+    return {numDel, numIns};
+}
+
+// Returns the insertion count for the record at \p recordIndex, i.e. the
+// second member of NumDeletedAndInsertedBasesAt().
+uint32_t PbiRawMappedData::NumInsertedBasesAt(size_t recordIndex) const
+{
+    const auto deletedAndInserted = NumDeletedAndInsertedBasesAt(recordIndex);
+    return deletedAndInserted.second;
+}
+
+// ------------------------------------
+// PbiReferenceEntry implementation
+// ------------------------------------
+
+static_assert(std::is_copy_constructible<PbiReferenceEntry>::value,
+              "PbiReferenceEntry(const PbiReferenceEntry&) is not = default");
+static_assert(std::is_copy_assignable<PbiReferenceEntry>::value,
+              "PbiReferenceEntry& operator=(const PbiReferenceEntry&) is not = default");
+
+static_assert(std::is_nothrow_move_constructible<PbiReferenceEntry>::value,
+              "PbiReferenceEntry(PbiReferenceEntry&&) is not = noexcept");
+static_assert(std::is_nothrow_move_assignable<PbiReferenceEntry>::value,
+              "PbiReferenceEntry& operator=(PbiReferenceEntry&&) is not = noexcept");
+
+const PbiReferenceEntry::ID PbiReferenceEntry::UNMAPPED_ID = static_cast<PbiReferenceEntry::ID>(-1);
+const PbiReferenceEntry::Row PbiReferenceEntry::UNSET_ROW = static_cast<PbiReferenceEntry::Row>(-1);
+
+PbiReferenceEntry::PbiReferenceEntry() : PbiReferenceEntry{UNMAPPED_ID, UNSET_ROW, UNSET_ROW} {}
+
+PbiReferenceEntry::PbiReferenceEntry(ID id) : PbiReferenceEntry{id, UNSET_ROW, UNSET_ROW} {}
+
+PbiReferenceEntry::PbiReferenceEntry(ID id, Row beginRow, Row endRow)
+    : tId_{id}, beginRow_{beginRow}, endRow_{endRow}
+{
+}
+
+// ------------------------------------
+// PbiRawReferenceData implementation
+// ------------------------------------
+
+static_assert(std::is_copy_constructible<PbiRawReferenceData>::value,
+              "PbiRawReferenceData(const PbiRawReferenceData&) is not = default");
+static_assert(std::is_copy_assignable<PbiRawReferenceData>::value,
+              "PbiRawReferenceData& operator=(const PbiRawReferenceData&) is not = default");
+
+static_assert(std::is_nothrow_move_constructible<PbiRawReferenceData>::value,
+              "PbiRawReferenceData(PbiRawReferenceData&&) is not = noexcept");
+static_assert(std::is_nothrow_move_assignable<PbiRawReferenceData>::value,
+              "PbiRawReferenceData& operator=(PbiRawReferenceData&&) is not = noexcept");
+
+PbiRawReferenceData::PbiRawReferenceData(uint32_t numRefs) { entries_.reserve(numRefs); }
+
+// ----------------------------------
+// PbiRawBasicData implementation
+// ----------------------------------
+
+static_assert(std::is_copy_constructible<PbiRawBasicData>::value,
+              "PbiRawBasicData(const PbiRawBasicData&) is not = default");
+static_assert(std::is_copy_assignable<PbiRawBasicData>::value,
+              "PbiRawBasicData& operator=(const PbiRawBasicData&) is not = default");
+
+static_assert(std::is_nothrow_move_constructible<PbiRawBasicData>::value,
+              "PbiRawBasicData(PbiRawBasicData&&) is not = noexcept");
+static_assert(std::is_nothrow_move_assignable<PbiRawBasicData>::value,
+              "PbiRawBasicData& operator=(PbiRawBasicData&&) is not = noexcept");
+
+// Reserves capacity for \p numReads entries in every basic-data column,
+// including fileNumber_. No elements are added.
+PbiRawBasicData::PbiRawBasicData(uint32_t numReads)
+{
+    rgId_.reserve(numReads);
+    qStart_.reserve(numReads);
+    qEnd_.reserve(numReads);
+    holeNumber_.reserve(numReads);
+    readQual_.reserve(numReads);
+    ctxtFlag_.reserve(numReads);
+    fileOffset_.reserve(numReads);
+    fileNumber_.reserve(numReads);
+}
+
+// Appends one row of basic data for record \p b to every column.
+//
+// \p offset is stored unchanged as the record's file offset.
+void PbiRawBasicData::AddRecord(const BamRecord& b, int64_t offset)
+{
+    // read group ID
+    //
+    // Use the record's base read group ID; when that is empty, build one from
+    // the movie name and record type. The ID string is parsed as hexadecimal
+    // and the result is converted to a signed 32-bit value for storage.
+    auto rgId = b.ReadGroupBaseId();
+    if (rgId.empty()) rgId = MakeReadGroupId(b.MovieName(), ToString(b.Type()));
+    const auto rawid = std::stoul(rgId, nullptr, 16);
+    const auto id = static_cast<int32_t>(rawid);
+    rgId_.push_back(id);
+
+    // query start/end
+    //
+    // Types accepted by IsCcsOrTranscript() get the range [0, sequence length);
+    // every other type uses the record's own query start/end.
+    if (IsCcsOrTranscript(b.Type())) {
+        qStart_.push_back(0);
+        qEnd_.push_back(b.Impl().SequenceLength());
+    } else {
+        qStart_.push_back(b.QueryStart());
+        qEnd_.push_back(b.QueryEnd());
+    }
+
+    // add'l basic data
+    //
+    // Missing values fall back to 0, 0.0f and NO_LOCAL_CONTEXT respectively.
+    holeNumber_.push_back(b.HasHoleNumber() ? b.HoleNumber() : 0);
+    readQual_.push_back(b.HasReadAccuracy() ? static_cast<float>(b.ReadAccuracy()) : 0.0f);
+    ctxtFlag_.push_back(b.HasLocalContextFlags() ? b.LocalContextFlags()
+                                                 : LocalContextFlags::NO_LOCAL_CONTEXT);
+
+    // virtual offset of record start
+    fileOffset_.push_back(offset);
+
+    // default file number
+    fileNumber_.push_back(0);
+}
+
+// ----------------------------------
+// PbiRawData implementation
+// ----------------------------------
+
+static_assert(std::is_copy_constructible<PbiRawData>::value,
+              "PbiRawData(const PbiRawData&) is not = default");
+static_assert(std::is_copy_assignable<PbiRawData>::value,
+              "PbiRawData& operator=(const PbiRawData&) is not = default");
+
+static_assert(std::is_nothrow_move_constructible<PbiRawData>::value,
+              "PbiRawData(PbiRawData&&) is not = noexcept");
+static_assert(std::is_nothrow_move_assignable<PbiRawData>::value ==
+                  std::is_nothrow_move_assignable<std::string>::value,
+              "");
+
+// Stores \p pbiFilename and immediately loads the index contents from that
+// file through PbiIndexIO::Load().
+PbiRawData::PbiRawData(std::string pbiFilename) : filename_{std::move(pbiFilename)}
+{
+    PbiIndexIO::Load(*this, filename_);
+}
+
+// Builds one aggregate index from \p dataset through
+// PbiIndexIO::LoadFromDataSet(). The section flags start out as
+// BASIC | MAPPED | BARCODE before loading.
+PbiRawData::PbiRawData(const DataSet& dataset)
+    : sections_{PbiFile::BASIC | PbiFile::MAPPED | PbiFile::BARCODE}
+{
+    PbiIndexIO::LoadFromDataSet(*this, dataset);
+}
+
+const PbiRawBarcodeData& PbiRawData::BarcodeData() const { return barcodeData_; }
+
+PbiRawBarcodeData& PbiRawData::BarcodeData() { return barcodeData_; }
+
+const PbiRawBasicData& PbiRawData::BasicData() const { return basicData_; }
+
+PbiRawBasicData& PbiRawData::BasicData() { return basicData_; }
+
+std::string PbiRawData::Filename() const { return filename_; }
+
+PbiFile::Sections PbiRawData::FileSections() const { return sections_; }
+
+PbiRawData& PbiRawData::FileSections(PbiFile::Sections sections)
+{
+    sections_ = sections;
+    return *this;
+}
+
+bool PbiRawData::HasBarcodeData() const { return HasSection(PbiFile::BARCODE); }
+
+bool PbiRawData::HasMappedData() const { return HasSection(PbiFile::MAPPED); }
+
+bool PbiRawData::HasReferenceData() const { return HasSection(PbiFile::REFERENCE); }
+
+bool PbiRawData::HasSection(const PbiFile::Section section) const
+{
+    return (sections_ & section) != 0;
+}
+
+uint32_t PbiRawData::NumReads() const { return numReads_; }
+
+PbiRawData& PbiRawData::NumReads(uint32_t num)
+{
+    numReads_ = num;
+    return *this;
+}
+
+const PbiRawMappedData& PbiRawData::MappedData() const { return mappedData_; }
+
+PbiRawMappedData& PbiRawData::MappedData() { return mappedData_; }
+
+const PbiRawReferenceData& PbiRawData::ReferenceData() const { return referenceData_; }
+
+PbiRawReferenceData& PbiRawData::ReferenceData() { return referenceData_; }
+
+PbiFile::VersionEnum PbiRawData::Version() const { return version_; }
+
+PbiRawData& PbiRawData::Version(PbiFile::VersionEnum version)
+{
+    version_ = version;
+    return *this;
+}
+
+// Two entries are equal when reference ID, begin row and end row all match.
+bool PbiReferenceEntry::operator==(const PbiReferenceEntry& other) const
+{
+    return tId_ == other.tId_ && beginRow_ == other.beginRow_ && endRow_ == other.endRow_;
+}
+
+// PBI index caching
+
+PbiIndexCache MakePbiIndexCache(const DataSet& dataset)
+{
+    return MakePbiIndexCache(dataset.BamFiles());
+}
+
+// Loads the PBI index that belongs to each BAM file in \p bamFiles and returns
+// them as a shared cache, in the same order as the input files.
+PbiIndexCache MakePbiIndexCache(const std::vector<BamFile>& bamFiles)
+{
+    PbiIndexCache cache = std::make_shared<std::vector<std::shared_ptr<PbiRawData>>>();
+    for (const auto& bamFile : bamFiles)
+        cache->push_back(std::make_shared<PbiRawData>(bamFile.PacBioIndexFilename()));
+    return cache;
+}
+
+// Single-file convenience overload: wraps \p bamFile in a one-element vector
+// and forwards to the vector overload.
+PbiIndexCache MakePbiIndexCache(const BamFile& bamFile)
+{
+    return MakePbiIndexCache(std::vector<BamFile>{bamFile});
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/ProgramInfo.cpp b/src/ProgramInfo.cpp

new file mode 100644 (file)

index 0000000..74700b2
--- /dev/null
+++ b/src/ProgramInfo.cpp
@@ -0,0 +1,157 @@
+// File Description
+/// \file ProgramInfo.cpp
+/// \brief Implements the ProgramInfo class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/ProgramInfo.h"
+
+#include <cassert>
+#include <sstream>
+#include <type_traits>
+
+#include "pbbam/SamTagCodec.h"
+#include "pbbam/StringUtilities.h"
+
+namespace PacBio {
+namespace BAM {
+namespace {
+
+const std::string ProgramInfoTokenID{"ID"};
+const std::string ProgramInfoTokenCL{"CL"};
+const std::string ProgramInfoTokenDS{"DS"};
+const std::string ProgramInfoTokenPN{"PN"};
+const std::string ProgramInfoTokenPP{"PP"};
+const std::string ProgramInfoTokenVN{"VN"};
+
+}  // anonymous
+
+static_assert(std::is_copy_constructible<ProgramInfo>::value,
+              "ProgramInfo(const ProgramInfo&) is not = default");
+static_assert(std::is_copy_assignable<ProgramInfo>::value,
+              "ProgramInfo& operator=(const ProgramInfo&) is not = default");
+
+static_assert(std::is_nothrow_move_constructible<ProgramInfo>::value,
+              "ProgramInfo(ProgramInfo&&) is not = noexcept");
+static_assert(std::is_nothrow_move_assignable<ProgramInfo>::value ==
+                  std::is_nothrow_move_assignable<std::string>::value,
+              "");
+
+ProgramInfo::ProgramInfo(std::string id) : id_{std::move(id)} {}
+
+std::string ProgramInfo::CommandLine() const { return commandLine_; }
+
+ProgramInfo& ProgramInfo::CommandLine(std::string cmd)
+{
+    commandLine_ = std::move(cmd);
+    return *this;
+}
+
+std::map<std::string, std::string> ProgramInfo::CustomTags() const { return custom_; }
+
+ProgramInfo& ProgramInfo::CustomTags(std::map<std::string, std::string> custom)
+{
+    custom_ = std::move(custom);
+    return *this;
+}
+
+std::string ProgramInfo::Description() const { return description_; }
+
+ProgramInfo& ProgramInfo::Description(std::string description)
+{
+    description_ = std::move(description);
+    return *this;
+}
+
+ProgramInfo ProgramInfo::FromSam(const std::string& sam)
+{
+    // pop off '@PG\t', then split rest of line into tokens
+    const auto tokens = Split(sam.substr(4), '\t');
+    if (tokens.empty()) return {};
+
+    ProgramInfo prog;
+    std::map<std::string, std::string> custom;
+
+    // iterate over tokens
+    for (const auto& token : tokens) {
+        const auto tokenTag = token.substr(0, 2);
+        auto tokenValue = token.substr(3);
+
+        // set program contents
+        // clang-format off
+        if      (tokenTag == ProgramInfoTokenID) prog.Id(std::move(tokenValue));
+        else if (tokenTag == ProgramInfoTokenCL) prog.CommandLine(std::move(tokenValue));
+        else if (tokenTag == ProgramInfoTokenDS) prog.Description(std::move(tokenValue));
+        else if (tokenTag == ProgramInfoTokenPN) prog.Name(std::move(tokenValue));
+        else if (tokenTag == ProgramInfoTokenPP) prog.PreviousProgramId(std::move(tokenValue));
+        else if (tokenTag == ProgramInfoTokenVN) prog.Version(std::move(tokenValue));
+        // clang-format on
+
+        // otherwise, "custom" tag
+        else
+            custom[tokenTag] = std::move(tokenValue);
+    }
+
+    prog.CustomTags(custom);
+    return prog;
+}
+
+std::string ProgramInfo::Id() const { return id_; }
+
+ProgramInfo& ProgramInfo::Id(std::string id)
+{
+    id_ = std::move(id);
+    return *this;
+}
+
+bool ProgramInfo::IsValid() const { return !id_.empty(); }
+
+std::string ProgramInfo::Name() const { return name_; }
+
+ProgramInfo& ProgramInfo::Name(std::string name)
+{
+    name_ = std::move(name);
+    return *this;
+}
+
+std::string ProgramInfo::PreviousProgramId() const { return previousProgramId_; }
+
+ProgramInfo& ProgramInfo::PreviousProgramId(std::string id)
+{
+    previousProgramId_ = std::move(id);
+    return *this;
+}
+
+std::string ProgramInfo::ToSam(const ProgramInfo& prog) { return prog.ToSam(); }
+
+std::string ProgramInfo::ToSam() const
+{
+    std::ostringstream out;
+    out << "@PG" << MakeSamTag(ProgramInfoTokenID, id_);
+
+    // clang-format off
+    if (!name_.empty())              out << MakeSamTag(ProgramInfoTokenPN, name_);
+    if (!version_.empty())           out << MakeSamTag(ProgramInfoTokenVN, version_);
+    if (!description_.empty())       out << MakeSamTag(ProgramInfoTokenDS, description_);
+    if (!previousProgramId_.empty()) out << MakeSamTag(ProgramInfoTokenPP, previousProgramId_);
+    if (!commandLine_.empty())       out << MakeSamTag(ProgramInfoTokenCL, commandLine_);
+    // clang-format on
+
+    // append any custom tags
+    for (const auto& attribute : custom_)
+        out << MakeSamTag(attribute.first, attribute.second);
+    return out.str();
+}
+
+std::string ProgramInfo::Version() const { return version_; }
+
+ProgramInfo& ProgramInfo::Version(std::string version)
+{
+    version_ = std::move(version);
+    return *this;
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/Pulse2BaseCache.h b/src/Pulse2BaseCache.h

new file mode 100644 (file)

index 0000000..3b60825
--- /dev/null
+++ b/src/Pulse2BaseCache.h
@@ -0,0 +1,117 @@
+// Author: Derek Barnett
+
+#ifndef PULSE2BASECACHE_H
+#define PULSE2BASECACHE_H
+
+#include "pbbam/Config.h"
+
+#include <cassert>
+#include <cctype>
+#include <cstddef>
+#include <string>
+#include <type_traits>
+
+#include <boost/dynamic_bitset.hpp>
+
+namespace PacBio {
+namespace BAM {
+
+class Pulse2BaseCache
+{
+public:
+    /// \brief Creates a Pulse2BaseCache from pulseCall data ('pc' tag)
+    ///
+    /// Computes & stores cache of basecalled vs. squashed pulse positions for
+    /// later masking of pulse data.
+    ///
+    /// \param pulseCalls[in]   string contents of 'pc' tag
+    ///
+    explicit Pulse2BaseCache(const std::string& pulseCalls) : data_(pulseCalls.size())
+    {
+        // basecalled pulse -> data[i] == 1
+        // squashed pulse   -> data[i] == 0
+        //
+        const auto numPulses = pulseCalls.size();
+        for (size_t i = 0; i < numPulses; ++i)
+            data_[i] = std::isupper(pulseCalls.at(i));
+    }
+
+    Pulse2BaseCache() = delete;
+    Pulse2BaseCache(const Pulse2BaseCache&) = default;
+    Pulse2BaseCache(Pulse2BaseCache&&) noexcept(
+        std::is_nothrow_move_constructible<boost::dynamic_bitset<>>::value);
+    Pulse2BaseCache& operator=(const Pulse2BaseCache&) = default;
+    Pulse2BaseCache& operator=(Pulse2BaseCache&&) noexcept(
+        std::is_nothrow_move_assignable<boost::dynamic_bitset<>>::value);
+    ~Pulse2BaseCache() = default;
+
+    ///
+    /// \brief FindFirst
+    /// \return
+    ///
+    size_t FindFirst() const { return data_.find_first(); }
+
+    ///
+    /// \brief FindNext
+    /// \param from
+    /// \return
+    ///
+    size_t FindNext(size_t from) const { return data_.find_next(from); }
+
+    ///
+    /// \brief IsBasecallAt
+    /// \param pos
+    /// \return
+    ///
+    bool IsBasecallAt(const size_t pos) const { return data_[pos]; }
+
+    /// \returns the total number of pulses (basecalled & squashed)
+    ///
+    size_t NumPulses() const { return data_.size(); }
+
+    /// \returns the total number of basecalled pulses
+    ///
+    size_t NumBases() const { return data_.count(); }
+
+    /// \brief Removes squashed pulse positions from input data.
+    ///
+    /// \param[in]  Contents of any per-pulse tag.
+    /// \returns    Input \p pulseData less all squashed pulses
+    ///
+    template <typename T>
+    T RemoveSquashedPulses(const T& pulseData) const
+    {
+        const auto numPulses = pulseData.size();
+        assert(numPulses == data_.size());
+
+        // The reserve() below overshoots the required space, but numPulses is cheap
+        // to compute, and by definition will be sufficient to hold the result. Thus
+        // we only ever need to do one allocation.
+        //
+        T result;
+        result.reserve(numPulses);
+
+        // Only include data at positions that match our cached pulse data.
+        //
+        size_t inputIndex = 0;
+        for (size_t i = 0; i < numPulses; ++i) {
+            if (data_[i]) result.push_back(pulseData.at(inputIndex));
+            ++inputIndex;
+        }
+        return result;
+    }
+
+private:
+    boost::dynamic_bitset<> data_;
+};
+
+inline Pulse2BaseCache::Pulse2BaseCache(Pulse2BaseCache&&) noexcept(
+    std::is_nothrow_move_constructible<boost::dynamic_bitset<>>::value) = default;
+
+inline Pulse2BaseCache& Pulse2BaseCache::operator=(Pulse2BaseCache&&) noexcept(
+    std::is_nothrow_move_assignable<boost::dynamic_bitset<>>::value) = default;
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // PULSE2BASECACHE_H
diff --git a/src/QNameQuery.cpp b/src/QNameQuery.cpp

new file mode 100644 (file)

index 0000000..493e215
--- /dev/null
+++ b/src/QNameQuery.cpp
@@ -0,0 +1,74 @@
+// File Description
+/// \file QNameQuery.cpp
+/// \brief Implements the QNameQuery class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/QNameQuery.h"
+
+#include <cassert>
+
+#include <boost/optional.hpp>
+
+#include "pbbam/CompositeBamReader.h"
+
+namespace PacBio {
+namespace BAM {
+
+// Private implementation of QNameQuery: yields runs of consecutive records that
+// share the same full name, reading sequentially from the dataset.
+class QNameQuery::QNameQueryPrivate
+{
+public:
+    explicit QNameQueryPrivate(const DataSet& dataset)
+        : reader_{std::make_unique<SequentialCompositeBamReader>(dataset)}, nextRecord_(boost::none)
+    {
+    }
+
+    /// Replaces the contents of \p records with the next group of same-named records.
+    /// \returns false once the reader is exhausted and no record was collected
+    bool GetNext(std::vector<BamRecord>& records)
+    {
+        records.clear();
+
+        std::string groupRecordName;
+
+        // A previous call may have read one record past the end of its group; that
+        // record opens this group. It is moved out of the optional, not copied.
+        if (nextRecord_.is_initialized()) {
+            groupRecordName = nextRecord_->FullName();
+            records.push_back(std::move(*nextRecord_));
+            nextRecord_ = boost::none;
+        }
+
+        BamRecord record;
+        while (reader_->GetNext(record)) {
+            if (records.empty()) {
+                groupRecordName = record.FullName();
+            } else if (record.FullName() != groupRecordName) {
+                // First record of the following group: stash it for the next call.
+                nextRecord_ = record;
+                return true;
+            }
+            records.push_back(record);
+        }
+        return !records.empty();
+    }
+
+private:
+    std::unique_ptr<SequentialCompositeBamReader> reader_;
+    boost::optional<BamRecord> nextRecord_;  // look-ahead record read by the previous call, if any
+};
+
+QNameQuery::QNameQuery(const DataSet& dataset)
+    : internal::IGroupQuery(), d_{std::make_unique<QNameQueryPrivate>(dataset)}
+{
+}
+
+QNameQuery::~QNameQuery() = default;
+
+bool QNameQuery::GetNext(std::vector<BamRecord>& records) { return d_->GetNext(records); }
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/ReadAccuracyQuery.cpp b/src/ReadAccuracyQuery.cpp

new file mode 100644 (file)

index 0000000..f258529
--- /dev/null
+++ b/src/ReadAccuracyQuery.cpp
@@ -0,0 +1,43 @@
+// File Description
+/// \file ReadAccuracyQuery.cpp
+/// \brief Implements the ReadAccuracyQuery class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/ReadAccuracyQuery.h"
+
+#include "pbbam/CompositeBamReader.h"
+#include "pbbam/PbiFilterTypes.h"
+
+namespace PacBio {
+namespace BAM {
+
+class ReadAccuracyQuery::ReadAccuracyQueryPrivate  // held by ReadAccuracyQuery through d_
+{
+public:
+    ReadAccuracyQueryPrivate(const Accuracy accuracy, const Compare::Type compareType,
+                             const DataSet& dataset)
+        : reader_{PbiReadAccuracyFilter{accuracy, compareType}, dataset}
+    {
+    }
+    // Accessed directly by ReadAccuracyQuery::GetNext() and ReadAccuracyQuery::NumReads().
+    PbiFilterCompositeBamReader<Compare::None> reader_;  // unsorted
+};
+
+ReadAccuracyQuery::ReadAccuracyQuery(const Accuracy accuracy, const Compare::Type compareType,
+                                     const DataSet& dataset)
+    : internal::IQuery()
+    , d_{std::make_unique<ReadAccuracyQueryPrivate>(accuracy, compareType, dataset)}
+{
+}
+
+ReadAccuracyQuery::~ReadAccuracyQuery() = default;
+
+bool ReadAccuracyQuery::GetNext(BamRecord& r) { return d_->reader_.GetNext(r); }
+
+uint32_t ReadAccuracyQuery::NumReads() const { return d_->reader_.NumReads(); }
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/ReadGroupInfo.cpp b/src/ReadGroupInfo.cpp

new file mode 100644 (file)

index 0000000..791b5e1
--- /dev/null
+++ b/src/ReadGroupInfo.cpp
@@ -0,0 +1,920 @@
+// File Description
+/// \file ReadGroupInfo.cpp
+/// \brief Implements the ReadGroupInfo class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/ReadGroupInfo.h"
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <iomanip>
+#include <set>
+#include <sstream>
+#include <stdexcept>
+#include <tuple>
+#include <type_traits>
+#include <unordered_map>
+
+#include <boost/algorithm/cxx14/equal.hpp>
+#include <boost/algorithm/string.hpp>
+
+#include "ChemistryTable.h"
+#include "pbbam/MD5.h"
+#include "pbbam/SamTagCodec.h"
+#include "pbbam/StringUtilities.h"
+
+namespace PacBio {
+namespace BAM {
+namespace {
+
+static const std::string sam_ID{"ID"};  // two-letter tags of an @RG header line (see FromSam)
+static const std::string sam_CN{"CN"};
+static const std::string sam_DS{"DS"};
+static const std::string sam_DT{"DT"};
+static const std::string sam_FO{"FO"};
+static const std::string sam_KS{"KS"};
+static const std::string sam_LB{"LB"};
+static const std::string sam_PG{"PG"};
+static const std::string sam_PI{"PI"};
+static const std::string sam_PL{"PL"};
+static const std::string sam_PM{"PM"};
+static const std::string sam_PU{"PU"};
+static const std::string sam_SM{"SM"};
+static const std::string sam_BC{"BC"};
+
+static const std::string feature_DQ{"DeletionQV"};  // base-feature keys of the description string
+static const std::string feature_DT{"DeletionTag"};
+static const std::string feature_IQ{"InsertionQV"};
+static const std::string feature_MQ{"MergeQV"};
+static const std::string feature_SQ{"SubstitutionQV"};
+static const std::string feature_ST{"SubstitutionTag"};
+static const std::string feature_IP{"Ipd"};
+static const std::string feature_PW{"PulseWidth"};
+static const std::string feature_PM{"PkMid"};
+static const std::string feature_PA{"PkMean"};
+static const std::string feature_PI{"PkMid2"};
+static const std::string feature_PS{"PkMean2"};
+static const std::string feature_LT{"Label"};  // NOTE(review): in neither lookup table below - confirm
+static const std::string feature_PQ{"LabelQV"};
+static const std::string feature_PT{"AltLabel"};
+static const std::string feature_PV{"AltLabelQV"};
+static const std::string feature_PG{"PulseMergeQV"};
+static const std::string feature_PC{"PulseCall"};
+static const std::string feature_PD{"PrePulseFrames"};
+static const std::string feature_PX{"PulseCallWidth"};
+static const std::string feature_SF{"StartFrame"};
+static const std::string feature_PE{"PulseExclusion"};
+
+static const std::string token_RT{"READTYPE"};  // upper-case keys of the description string
+static const std::string token_BK{"BINDINGKIT"};
+static const std::string token_SK{"SEQUENCINGKIT"};
+static const std::string token_BV{"BASECALLERVERSION"};
+static const std::string token_FR{"FRAMERATEHZ"};
+static const std::string token_CT{"CONTROL"};
+
+static const std::string token_BF{"BarcodeFile"};  // barcode keys of the description string
+static const std::string token_BH{"BarcodeHash"};
+static const std::string token_BC{"BarcodeCount"};
+static const std::string token_BM{"BarcodeMode"};
+static const std::string token_BQ{"BarcodeQuality"};
+
+static const std::string codec_RAW{"Frames"};  // codec names: the part after ':' in Ipd/PulseWidth keys
+static const std::string codec_V1{"CodecV1"};
+
+static const std::string barcodemode_NONE{"None"};  // values of the BarcodeMode key
+static const std::string barcodemode_SYM{"Symmetric"};
+static const std::string barcodemode_ASYM{"Asymmetric"};
+static const std::string barcodemode_TAIL{"Tailed"};
+
+static const std::string barcodequal_NONE{"None"};  // values of the BarcodeQuality key
+static const std::string barcodequal_SCORE{"Score"};
+static const std::string barcodequal_PROB{"Probability"};
+
+static const std::string platformModelType_ASTRO{"ASTRO"};  // values of the PM header tag
+static const std::string platformModelType_RS{"RS"};
+static const std::string platformModelType_SEQUEL{"SEQUEL"};
+static const std::string platformModelType_SEQUELII{"SEQUELII"};
+
+// clang-format off
+// Returns the description-string name of a base feature.
+// Throws std::runtime_error for a feature that has no name listed here.
+std::string BaseFeatureName(const BaseFeature& feature)
+{
+    switch (feature) {
+        case BaseFeature::DELETION_QV:      return feature_DQ;
+        case BaseFeature::DELETION_TAG:     return feature_DT;
+        case BaseFeature::INSERTION_QV:     return feature_IQ;
+        case BaseFeature::MERGE_QV:         return feature_MQ;
+        case BaseFeature::SUBSTITUTION_QV:  return feature_SQ;
+        case BaseFeature::SUBSTITUTION_TAG: return feature_ST;
+        case BaseFeature::IPD:              return feature_IP;
+        case BaseFeature::PULSE_WIDTH:      return feature_PW;
+        case BaseFeature::PKMID:            return feature_PM;
+        case BaseFeature::PKMEAN:           return feature_PA;
+        case BaseFeature::PKMID2:           return feature_PI;
+        case BaseFeature::PKMEAN2:          return feature_PS;
+        case BaseFeature::LABEL_QV:         return feature_PQ;
+        case BaseFeature::ALT_LABEL:        return feature_PT;
+        case BaseFeature::ALT_LABEL_QV:     return feature_PV;
+        case BaseFeature::PULSE_MERGE_QV:   return feature_PG;
+        case BaseFeature::PULSE_CALL:       return feature_PC;
+        case BaseFeature::PRE_PULSE_FRAMES: return feature_PD;
+        case BaseFeature::PULSE_CALL_WIDTH: return feature_PX;
+        case BaseFeature::START_FRAME:      return feature_SF;
+        case BaseFeature::PULSE_EXCLUSION:  return feature_PE;
+        default:                            break;  // any other value falls through to the throw
+    }
+    throw std::runtime_error{ "ReadGroupInfo: unrecognized base feature" };
+}
+
+// Returns the textual name of a frame codec.
+// Throws std::runtime_error for a codec that has no name listed here.
+std::string FrameCodecName(const FrameCodec& codec)
+{
+    switch (codec) {
+        case FrameCodec::RAW: return codec_RAW;
+        case FrameCodec::V1:  return codec_V1;
+        default:              break;  // any other value falls through to the throw
+    }
+    throw std::runtime_error{ "ReadGroupInfo: unrecognized frame codec" };
+}
+
+// Returns the textual name of a barcode mode.
+// Throws std::runtime_error for a mode that has no name listed here.
+std::string BarcodeModeName(const BarcodeModeType& mode)
+{
+    switch (mode) {
+        case BarcodeModeType::NONE:       return barcodemode_NONE;
+        case BarcodeModeType::SYMMETRIC:  return barcodemode_SYM;
+        case BarcodeModeType::ASYMMETRIC: return barcodemode_ASYM;
+        case BarcodeModeType::TAILED:     return barcodemode_TAIL;
+        default:                          break;  // any other value falls through to the throw
+    }
+    throw std::runtime_error{ "ReadGroupInfo: unrecognized barcode mode type" };
+}
+
+// Returns the textual name of a barcode quality type.
+// Throws std::runtime_error for a type that has no name listed here.
+std::string BarcodeQualityName(const BarcodeQualityType& type)
+{
+    switch (type) {
+        case BarcodeQualityType::NONE:        return barcodequal_NONE;
+        case BarcodeQualityType::SCORE:       return barcodequal_SCORE;
+        case BarcodeQualityType::PROBABILITY: return barcodequal_PROB;
+        default:                              break;  // any other value falls through to the throw
+    }
+    throw std::runtime_error{ "ReadGroupInfo: unrecognized barcode quality type" };
+}
+
+// Returns the textual name of a platform model.
+// Throws std::runtime_error for a model that has no name listed here.
+std::string PlatformModelName(const PlatformModelType& type)
+{
+    switch (type) {
+        case PlatformModelType::ASTRO:    return platformModelType_ASTRO;
+        case PlatformModelType::RS:       return platformModelType_RS;
+        case PlatformModelType::SEQUEL:   return platformModelType_SEQUEL;
+        case PlatformModelType::SEQUELII: return platformModelType_SEQUELII;
+        default:                          break;  // any other value falls through to the throw
+    }
+    throw std::runtime_error{ "ReadGroupInfo: unrecognized platform model type" };
+}
+
+static const std::map<std::string, BaseFeature> nameToFeature
+{
+    { feature_DQ, BaseFeature::DELETION_QV },
+    { feature_DT, BaseFeature::DELETION_TAG },
+    { feature_IQ, BaseFeature::INSERTION_QV },
+    { feature_MQ, BaseFeature::MERGE_QV },
+    { feature_SQ, BaseFeature::SUBSTITUTION_QV },
+    { feature_ST, BaseFeature::SUBSTITUTION_TAG },
+    { feature_IP, BaseFeature::IPD },
+    { feature_PW, BaseFeature::PULSE_WIDTH },
+    { feature_PM, BaseFeature::PKMID },
+    { feature_PA, BaseFeature::PKMEAN },
+    { feature_PI, BaseFeature::PKMID2 },
+    { feature_PS, BaseFeature::PKMEAN2 },
+    { feature_PQ, BaseFeature::LABEL_QV },
+    { feature_PT, BaseFeature::ALT_LABEL },
+    { feature_PV, BaseFeature::ALT_LABEL_QV },
+    { feature_PC, BaseFeature::PULSE_CALL },
+    { feature_PG, BaseFeature::PULSE_MERGE_QV },
+    { feature_PD, BaseFeature::PRE_PULSE_FRAMES },
+    { feature_PX, BaseFeature::PULSE_CALL_WIDTH },
+    { feature_SF, BaseFeature::START_FRAME },
+    { feature_PE, BaseFeature::PULSE_EXCLUSION }
+};
+
+static const std::map<std::string, FrameCodec> nameToCodec
+{
+    { codec_RAW, FrameCodec::RAW },
+    { codec_V1,  FrameCodec::V1 }
+};
+
+static const std::map<std::string, BarcodeModeType> nameToBarcodeMode
+{
+    { barcodemode_NONE, BarcodeModeType::NONE },
+    { barcodemode_SYM,  BarcodeModeType::SYMMETRIC },
+    { barcodemode_ASYM, BarcodeModeType::ASYMMETRIC },
+    { barcodemode_TAIL, BarcodeModeType::TAILED }
+};
+
+static const std::map<std::string, BarcodeQualityType> nameToBarcodeQuality
+{
+    { barcodequal_NONE,  BarcodeQualityType::NONE },
+    { barcodequal_SCORE, BarcodeQualityType::SCORE },
+    { barcodequal_PROB,  BarcodeQualityType::PROBABILITY }
+};
+
+static const std::map<std::string, PlatformModelType> nameToPlatformModel
+{
+    { platformModelType_ASTRO,    PlatformModelType::ASTRO },
+    { platformModelType_RS,       PlatformModelType::RS },
+    { platformModelType_SEQUEL,   PlatformModelType::SEQUEL },
+    { platformModelType_SEQUELII, PlatformModelType::SEQUELII }
+};
+// clang-format on
+
+// True when \p name starts with "Barcode". compare() inspects only the first 7 characters.
+bool IsLikelyBarcodeKey(const std::string& name) { return name.compare(0, 7, "Barcode") == 0; }
+
+bool IsBaseFeature(const std::string& name)
+{
+    return nameToFeature.find(name) != nameToFeature.cend();
+}
+
+BaseFeature BaseFeatureFromName(const std::string& name) { return nameToFeature.at(name); }
+
+FrameCodec FrameCodecFromName(const std::string& name) { return nameToCodec.at(name); }
+
+BarcodeModeType BarcodeModeFromName(const std::string& name) { return nameToBarcodeMode.at(name); }
+
+BarcodeQualityType BarcodeQualityFromName(const std::string& name)
+{
+    return nameToBarcodeQuality.at(name);
+}
+
+// Looks up the platform model for a textual name. The name is taken by const
+// reference, like the sibling *FromName helpers, so a lookup makes no copy.
+// Throws std::out_of_range (from map::at) when the name is unknown.
+PlatformModelType PlatformModelFromName(const std::string& name)
+{
+    return nameToPlatformModel.at(name);
+}
+
+}  // namespace
+
+static_assert(std::is_copy_constructible<ReadGroupInfo>::value,
+              "ReadGroupInfo(const ReadGroupInfo&) is not = default");
+static_assert(std::is_copy_assignable<ReadGroupInfo>::value,
+              "ReadGroupInfo& operator=(const ReadGroupInfo&) is not = default");
+
+static_assert(std::is_nothrow_move_constructible<ReadGroupInfo>::value,
+              "ReadGroupInfo(ReadGroupInfo&&) is not = noexcept");
+static_assert(std::is_nothrow_move_assignable<ReadGroupInfo>::value ==
+                  std::is_nothrow_move_assignable<std::string>::value,
+              "");
+
+// Builds a barcoded read group whose full ID is "<baseId>/<forward>--<reverse>".
+ReadGroupInfo::ReadGroupInfo(std::string baseId, std::pair<uint16_t, uint16_t> barcodes)
+{
+    // Compose the full ID first, while baseId is still intact (it is moved from below).
+    id_ = baseId + '/' + std::to_string(barcodes.first) + "--" + std::to_string(barcodes.second);
+    baseId_ = std::move(baseId);
+    barcodes_ = std::move(barcodes);
+}
+
+ReadGroupInfo::ReadGroupInfo() : readType_{"UNKNOWN"} {}
+
+ReadGroupInfo::ReadGroupInfo(std::string id) : readType_{"UNKNOWN"} { Id(std::move(id)); }
+
+ReadGroupInfo::ReadGroupInfo(std::string movieName, std::string readType)
+    : ReadGroupInfo{std::move(movieName), std::move(readType), PlatformModelType::SEQUEL}
+{
+}
+
+ReadGroupInfo::ReadGroupInfo(std::string movieName, std::string readType,
+                             std::pair<uint16_t, uint16_t> barcodes)
+    : ReadGroupInfo{std::move(movieName), std::move(readType), PlatformModelType::SEQUEL,
+                    std::move(barcodes)}
+{
+}
+
+// Builds a non-barcoded read group for a movie / read-type pair on the given platform.
+// The ID is derived from the two strings before they are moved into their members.
+ReadGroupInfo::ReadGroupInfo(std::string movieName, std::string readType,
+                             PlatformModelType platform)
+    : platformModel_{platform}
+{
+    Id(movieName, readType);
+    movieName_ = std::move(movieName);
+    readType_ = std::move(readType);
+}
+
+// Barcoded variant of the (movieName, readType, platform) constructor.
+//
+// Delegates to the (baseId, barcodes) constructor for the ID-related members, then
+// records the remaining arguments. The movie name and read type must be stored here
+// too, as the non-barcoded overload does; the delegate only sees the ID derived
+// from them, so without these assignments both values would be lost.
+ReadGroupInfo::ReadGroupInfo(std::string movieName, std::string readType,
+                             PlatformModelType platform, std::pair<uint16_t, uint16_t> barcodes)
+    : ReadGroupInfo{MakeReadGroupId(movieName, readType), std::move(barcodes)}
+{
+    platformModel_ = platform;
+    movieName_ = std::move(movieName);
+    readType_ = std::move(readType);
+}
+
+bool ReadGroupInfo::operator==(const ReadGroupInfo& other) const  // member-wise equality
+{
+    const auto lhsFields = std::tie(  // std::tie builds a tuple of references; nothing is copied
+        id_, sequencingCenter_, date_, flowOrder_, keySequence_, library_, programs_,
+        platformModel_, predictedInsertSize_, movieName_, sample_, readType_, bindingKit_,
+        sequencingKit_, basecallerVersion_, frameRateHz_, control_, ipdCodec_, pulseWidthCodec_,
+        hasBarcodeData_, barcodeFile_, barcodeHash_, barcodeCount_, barcodeMode_, barcodeQuality_);
+
+    const auto rhsFields = std::tie(  // same members, same order, taken from the other object
+        other.id_, other.sequencingCenter_, other.date_, other.flowOrder_, other.keySequence_,
+        other.library_, other.programs_, other.platformModel_, other.predictedInsertSize_,
+        other.movieName_, other.sample_, other.readType_, other.bindingKit_, other.sequencingKit_,
+        other.basecallerVersion_, other.frameRateHz_, other.control_, other.ipdCodec_,
+        other.pulseWidthCodec_, other.hasBarcodeData_, other.barcodeFile_, other.barcodeHash_,
+        other.barcodeCount_, other.barcodeMode_, other.barcodeQuality_);
+
+    return lhsFields == rhsFields &&  // then the base-feature and custom-tag containers
+           boost::algorithm::equal(features_.cbegin(), features_.cend(), other.features_.cbegin(),
+                                   other.features_.cend()) &&
+           boost::algorithm::equal(custom_.cbegin(), custom_.cend(), other.custom_.cbegin(),
+                                   other.custom_.cend());
+}
+
+bool ReadGroupInfo::operator<(const ReadGroupInfo& other) const { return id_ < other.id_; }
+
+size_t ReadGroupInfo::BarcodeCount() const
+{
+    if (!hasBarcodeData_)
+        throw std::runtime_error{
+            "ReadGroupInfo: barcode count requested but barcode data is missing"};
+    return barcodeCount_;
+}
+
+ReadGroupInfo& ReadGroupInfo::BarcodeData(std::string barcodeFile, std::string barcodeHash,
+                                          size_t barcodeCount, BarcodeModeType barcodeMode,
+                                          BarcodeQualityType barcodeQuality)
+{
+    barcodeFile_ = std::move(barcodeFile);
+    barcodeHash_ = std::move(barcodeHash);
+    barcodeCount_ = barcodeCount;
+    barcodeMode_ = barcodeMode;
+    barcodeQuality_ = barcodeQuality;
+    hasBarcodeData_ = true;
+    return *this;
+}
+
+std::string ReadGroupInfo::BarcodeFile() const
+{
+    if (!hasBarcodeData_)
+        throw std::runtime_error{
+            "ReadGroupInfo: barcode file requested but barcode data is missing"};
+    return barcodeFile_;
+}
+
+std::string ReadGroupInfo::BarcodeHash() const
+{
+    if (!hasBarcodeData_)
+        throw std::runtime_error{
+            "ReadGroupInfo: barcode hash requested but barcode data is missing"};
+    return barcodeHash_;
+}
+
+BarcodeModeType ReadGroupInfo::BarcodeMode() const
+{
+    if (!hasBarcodeData_)
+        throw std::runtime_error{
+            "ReadGroupInfo: barcode mode requested but barcode data is missing"};
+    return barcodeMode_;
+}
+
+BarcodeQualityType ReadGroupInfo::BarcodeQuality() const
+{
+    if (!hasBarcodeData_)
+        throw std::runtime_error{
+            "ReadGroupInfo: barcode quality requested but barcode data is missing"};
+    return barcodeQuality_;
+}
+
+// Forward barcode of this read group, or an empty optional when no barcodes are set.
+boost::optional<uint16_t> ReadGroupInfo::BarcodeForward() const
+{
+    // make_optional(false, ...) produces an optional that holds no value.
+    if (!barcodes_) return boost::make_optional(false, uint16_t{0});
+    return barcodes_->first;
+}
+
+// Reverse barcode of this read group, or an empty optional when no barcodes are set.
+boost::optional<uint16_t> ReadGroupInfo::BarcodeReverse() const
+{
+    // make_optional(false, ...) produces an optional that holds no value.
+    if (!barcodes_) return boost::make_optional(false, uint16_t{0});
+    return barcodes_->second;
+}
+
+boost::optional<std::pair<uint16_t, uint16_t>> ReadGroupInfo::Barcodes() const { return barcodes_; }
+
+std::string ReadGroupInfo::BasecallerVersion() const { return basecallerVersion_; }
+
+ReadGroupInfo& ReadGroupInfo::BasecallerVersion(std::string versionNumber)
+{
+    if (basecallerVersion_ != versionNumber) {
+        basecallerVersion_ = std::move(versionNumber);
+        sequencingChemistry_.clear();  // reset cached chemistry name
+    }
+    return *this;
+}
+
+// Returns the tag stored for \p feature, or an empty string when none is stored.
+std::string ReadGroupInfo::BaseFeatureTag(BaseFeature feature) const
+{
+    const auto found = features_.find(feature);
+    return (found != features_.end()) ? found->second : std::string{};
+}
+
+ReadGroupInfo& ReadGroupInfo::BaseFeatureTag(BaseFeature feature, std::string tag)
+{
+    features_[feature] = std::move(tag);
+    return *this;
+}
+
+std::string ReadGroupInfo::BaseId() const { return baseId_; }
+
+std::string ReadGroupInfo::BindingKit() const { return bindingKit_; }
+
+ReadGroupInfo& ReadGroupInfo::BindingKit(std::string kitNumber)
+{
+    if (bindingKit_ != kitNumber) {
+        bindingKit_ = std::move(kitNumber);
+        sequencingChemistry_.clear();  // reset cached chemistry name
+    }
+    return *this;
+}
+
+ReadGroupInfo& ReadGroupInfo::ClearBarcodeData()  // empties barcode file/hash, marks data absent
+{
+    barcodeFile_.clear();
+    barcodeHash_.clear();
+    hasBarcodeData_ = false;  // NOTE(review): count/mode/quality keep their old values here,
+    return *this;             // yet operator== still compares them - confirm that is intended
+}
+
+ReadGroupInfo& ReadGroupInfo::ClearBaseFeatures()
+{
+    features_.clear();
+    return *this;
+}
+
+bool ReadGroupInfo::Control() const { return control_; }
+
+ReadGroupInfo& ReadGroupInfo::Control(bool ctrl)
+{
+    control_ = ctrl;
+    return *this;
+}
+
+std::map<std::string, std::string> ReadGroupInfo::CustomTags() const { return custom_; }
+
+ReadGroupInfo& ReadGroupInfo::CustomTags(std::map<std::string, std::string> custom)
+{
+    custom_ = std::move(custom);
+    return *this;
+}
+
+std::string ReadGroupInfo::Date() const { return date_; }
+
+ReadGroupInfo& ReadGroupInfo::Date(std::string date)
+{
+    date_ = std::move(date);
+    return *this;
+}
+
+// Stores one barcode-related description entry in the matching member. Keys other
+// than the five barcode tokens are ignored. The count is parsed with std::stoul and
+// the mode/quality names are looked up, so malformed values may throw.
+void ReadGroupInfo::DecodeBarcodeKey(const std::string& key, std::string value)
+{
+    if (key == token_BF) {
+        barcodeFile_ = std::move(value);
+        return;
+    }
+    if (key == token_BH) {
+        barcodeHash_ = std::move(value);
+        return;
+    }
+    if (key == token_BC) {
+        barcodeCount_ = std::stoul(value);
+        return;
+    }
+    if (key == token_BM) {
+        barcodeMode_ = BarcodeModeFromName(value);
+        return;
+    }
+    if (key == token_BQ) barcodeQuality_ = BarcodeQualityFromName(value);
+}
+
+// Handles description keys of the form "<feature>:<codec>" for Ipd and PulseWidth:
+// records the codec and stores \p value as that feature's tag. Keys that do not
+// split into exactly two parts, or that name another feature, are ignored.
+void ReadGroupInfo::DecodeFrameCodecKey(const std::string& key, std::string value)
+{
+    const auto parts = Split(key, ':');
+    if (parts.size() != 2) return;
+
+    const auto& featureName = parts[0];
+    const auto& codecName = parts[1];
+    if (featureName == feature_IP) {
+        ipdCodec_ = FrameCodecFromName(codecName);
+        features_[BaseFeature::IPD] = std::move(value);
+    } else if (featureName == feature_PW) {
+        pulseWidthCodec_ = FrameCodecFromName(codecName);
+        features_[BaseFeature::PULSE_WIDTH] = std::move(value);
+    }
+}
+
+void ReadGroupInfo::DecodeSamDescription(const std::string& description)
+{  // Parses the ';'-separated KEY=VALUE description text into this object's members.
+    const auto tokens = Split(description, ';');
+    if (tokens.empty()) return;  // note: this early exit leaves hasBarcodeData_ untouched
+
+    // route each KEY=VALUE entry to the member it describes
+    for (const auto& token : tokens) {
+
+        const auto foundEqual = token.find('=');
+        if (foundEqual == std::string::npos) continue;  // entry without '=': skipped
+
+        const auto key = token.substr(0, foundEqual);
+        auto value = token.substr(foundEqual + 1);
+
+        // fixed upper-case keys ('mandatory' items)
+        // clang-format off
+        if      (key == token_RT) readType_ = std::move(value);
+        else if (key == token_BK) bindingKit_ = std::move(value);
+        else if (key == token_BV) basecallerVersion_ = std::move(value);
+        else if (key == token_SK) sequencingKit_ = std::move(value);
+        else if (key == token_FR) frameRateHz_ = std::move(value);
+        else if (key == token_CT) control_ = (value == "TRUE");
+        // clang-format on
+
+        // base features: the key is a known feature name, the value is its tag
+        else if (IsBaseFeature(key))
+            features_[BaseFeatureFromName(key)] = std::move(value);
+
+        // barcode data: any key starting with "Barcode"
+        else if (IsLikelyBarcodeKey(key))
+            DecodeBarcodeKey(key, std::move(value));
+
+        // everything else is offered to the frame codec parser ("<feature>:<codec>" keys)
+        else
+            DecodeFrameCodecKey(key, std::move(value));
+    }
+
+    hasBarcodeData_ = !barcodeFile_.empty();  // barcode data counts as present iff a file was given
+}
+
+// Serializes this read group's fields into the ';'-separated "KEY=VALUE"
+// description string (the counterpart of DecodeSamDescription()).
+//
+// Layout: READTYPE first, then every base feature with a non-empty tag (Ipd and
+// PulseWidth carry a ":<codec>" suffix on the key), then the non-empty kit /
+// version / frame-rate fields, CONTROL=TRUE when the control flag is set, and
+// finally the five barcode fields when barcode data is present.
+std::string ReadGroupInfo::EncodeSamDescription() const
+{
+    constexpr static const char SEP = ';';
+    constexpr static const char COLON = ':';
+    constexpr static const char EQ = '=';
+
+    std::string result{token_RT + EQ + readType_};
+
+    for (const auto& feature : features_) {
+        std::string featureName = BaseFeatureName(feature.first);
+        if (featureName.empty() || feature.second.empty()) continue;
+
+        // The two frame-based features name their codec as part of the key.
+        if (featureName == feature_IP) {
+            featureName.push_back(COLON);
+            featureName.append(FrameCodecName(ipdCodec_));
+        } else if (featureName == feature_PW) {
+            featureName.push_back(COLON);
+            featureName.append(FrameCodecName(pulseWidthCodec_));
+        }
+        result.append(SEP + featureName + EQ + feature.second);
+    }
+
+    // clang-format off
+    if (!bindingKit_.empty())        result.append(SEP + token_BK + EQ + bindingKit_);
+    if (!sequencingKit_.empty())     result.append(SEP + token_SK + EQ + sequencingKit_);
+    if (!basecallerVersion_.empty()) result.append(SEP + token_BV + EQ + basecallerVersion_);
+    if (!frameRateHz_.empty())       result.append(SEP + token_FR + EQ + frameRateHz_);
+    // The key is written only when the flag is set, so its value is always "TRUE".
+    if (control_)                    result.append(SEP + token_CT + EQ + "TRUE");
+    // clang-format on
+
+    if (hasBarcodeData_) {
+        const std::string barcodeData{SEP + token_BF + EQ + barcodeFile_ + SEP + token_BH + EQ +
+                                      barcodeHash_ + SEP + token_BC + EQ +
+                                      std::to_string(barcodeCount_) + SEP + token_BM + EQ +
+                                      BarcodeModeName(barcodeMode_) + SEP + token_BQ + EQ +
+                                      BarcodeQualityName(barcodeQuality_)};
+        result.append(barcodeData);
+    }
+
+    return result;
+}
+
+std::string ReadGroupInfo::FlowOrder() const { return flowOrder_; }
+
+ReadGroupInfo& ReadGroupInfo::FlowOrder(std::string order)
+{
+    flowOrder_ = std::move(order);
+    return *this;
+}
+
+std::string ReadGroupInfo::FrameRateHz() const { return frameRateHz_; }
+
+ReadGroupInfo& ReadGroupInfo::FrameRateHz(std::string frameRateHz)
+{
+    frameRateHz_ = std::move(frameRateHz);
+    return *this;
+}
+
+// Parses one "@RG" header line into a ReadGroupInfo.
+//
+// Known two-letter tags are routed to their setters. PL is dropped (always "PACBIO"
+// for us) and any other tag is kept as a custom tag. A line shorter than the
+// "@RG\t" prefix yields a default-constructed object, and fields too short to hold
+// a "XX:" prefix are skipped, so std::string::substr cannot throw
+// std::out_of_range on truncated input.
+ReadGroupInfo ReadGroupInfo::FromSam(const std::string& sam)
+{
+    constexpr size_t prefixLength = 4;     // "@RG\t"
+    constexpr size_t tagPrefixLength = 3;  // two-letter tag plus ':'
+    if (sam.size() < prefixLength) return {};
+
+    // pop off '@RG\t', then split rest of line into tokens
+    const auto tokens = Split(sam.substr(prefixLength), '\t');
+    if (tokens.empty()) return {};
+
+    ReadGroupInfo rg;
+    std::map<std::string, std::string> custom;
+
+    for (const auto& token : tokens) {
+        if (token.size() < tagPrefixLength) continue;  // empty or truncated field
+
+        const auto tokenTag = token.substr(0, 2);
+        auto tokenValue = token.substr(tagPrefixLength);
+
+        // set read group info
+        // clang-format off
+        if      (tokenTag == sam_ID) rg.Id(std::move(tokenValue));
+        else if (tokenTag == sam_CN) rg.SequencingCenter(std::move(tokenValue));
+        else if (tokenTag == sam_DT) rg.Date(std::move(tokenValue));
+        else if (tokenTag == sam_FO) rg.FlowOrder(std::move(tokenValue));
+        else if (tokenTag == sam_KS) rg.KeySequence(std::move(tokenValue));
+        else if (tokenTag == sam_LB) rg.Library(std::move(tokenValue));
+        else if (tokenTag == sam_PG) rg.Programs(std::move(tokenValue));
+        else if (tokenTag == sam_PI) rg.PredictedInsertSize(std::move(tokenValue));
+        else if (tokenTag == sam_PU) rg.MovieName(std::move(tokenValue));
+        else if (tokenTag == sam_SM) rg.Sample(std::move(tokenValue));
+        else if (tokenTag == sam_DS) rg.DecodeSamDescription(tokenValue);
+        else if (tokenTag == sam_PM) rg.PlatformModel(PlatformModelFromName(tokenValue));
+        // clang-format on
+
+        // if not platform name (always "PACBIO" for us), store as a custom tag
+        else if (tokenTag != sam_PL)
+            custom[tokenTag] = std::move(tokenValue);
+    }
+    rg.CustomTags(std::move(custom));
+
+    return rg;
+}
+
+// Returns the part of \p id before its first '/', or the whole ID when it has no '/'.
+std::string ReadGroupInfo::GetBaseId(const std::string& id)
+{
+    // find() gives npos when there is no '/', and substr(0, npos) is the full string.
+    return id.substr(0, id.find('/'));
+}
+
+bool ReadGroupInfo::HasBarcodeData() const { return hasBarcodeData_; }
+
+bool ReadGroupInfo::HasBaseFeature(BaseFeature feature) const
+{
+    return features_.find(feature) != features_.end();
+}
+
+std::string ReadGroupInfo::Id() const { return id_; }
+
+ReadGroupInfo& ReadGroupInfo::Id(const std::string& movieName, const std::string& readType)
+{
+    return Id(MakeReadGroupId(movieName, readType));
+}
+
+// Sets the read group ID and refreshes the base ID and barcode pair derived from it.
+//
+// An ID containing '/' must have the form {RGID_STRING}/{bcForward}--{bcReverse};
+// the two barcode numbers are parsed and cached. Without a '/', the barcodes are
+// cleared and the base ID equals the full ID.
+//
+// Throws std::runtime_error if the barcode part is malformed, or if a barcode does
+// not fit in uint16_t (rather than letting the narrowing cast truncate it silently).
+ReadGroupInfo& ReadGroupInfo::Id(std::string id)
+{
+    barcodes_.reset();
+
+    // maybe parse for barcode labels
+    const auto slashAt = id.find('/');
+    if (slashAt != std::string::npos) {
+        // one place for the message, so every failure path reports the offending ID
+        const auto malformed = [&id]() {
+            return std::runtime_error{
+                "ReadGroupInfo: could not fetch barcodes from malformatted read group ID: " + id +
+                " Must be in the form: {RGID_STRING}/{bcForward}--{bcReverse}"};
+        };
+
+        // splitting "fwd--rev" on '-' is expected to give three tokens (fwd, "", rev)
+        const auto tokens = Split(id.substr(slashAt + 1), '-');
+        if (tokens.size() != 3) throw malformed();
+
+        unsigned long forward = 0;
+        unsigned long reverse = 0;
+        try {
+            forward = std::stoul(tokens[0]);
+            reverse = std::stoul(tokens[2]);
+        } catch (const std::exception&) {
+            // catch here so we can give more informative message
+            throw malformed();
+        }
+        if (forward > UINT16_MAX || reverse > UINT16_MAX) throw malformed();
+
+        barcodes_ = std::pair<uint16_t, uint16_t>(static_cast<uint16_t>(forward),
+                                                  static_cast<uint16_t>(reverse));
+    }
+
+    baseId_ = id.substr(0, slashAt);
+    id_ = std::move(id);
+    return *this;
+}
+
+// Interprets the base part of a read group ID as a hexadecimal number.
+// std::stoul throws when no conversion can be performed.
+int32_t ReadGroupInfo::IdToInt(const std::string& rgId)
+{
+    // Narrow to 32 bits unsigned first, then convert to the signed result type.
+    const auto asUnsigned = static_cast<uint32_t>(std::stoul(GetBaseId(rgId), nullptr, 16));
+    return static_cast<int32_t>(asUnsigned);
+}
+
+// Formats \p id as hexadecimal text, left-padded with '0' to at least 8 characters.
+std::string ReadGroupInfo::IntToId(const int32_t id)
+{
+    std::ostringstream out;
+    out << std::hex << std::setw(8) << std::setfill('0') << id;
+    return out.str();
+}
+
+FrameCodec ReadGroupInfo::IpdCodec() const { return ipdCodec_; }
+
+// Selects the IPD frame codec and registers the IPD base feature under \p tag
+// ("ip" when \p tag is empty).
+ReadGroupInfo& ReadGroupInfo::IpdCodec(FrameCodec codec, std::string tag)
+{
+    ipdCodec_ = codec;
+
+    // keep the base features map in step with the codec choice
+    if (tag.empty()) tag = "ip";
+    return BaseFeatureTag(BaseFeature::IPD, std::move(tag));
+}
+
+bool ReadGroupInfo::IsValid() const { return !id_.empty(); }
+
+std::string ReadGroupInfo::KeySequence() const { return keySequence_; }
+
+ReadGroupInfo& ReadGroupInfo::KeySequence(std::string sequence)
+{
+    keySequence_ = std::move(sequence);
+    return *this;
+}
+
+std::string ReadGroupInfo::Library() const { return library_; }
+
+ReadGroupInfo& ReadGroupInfo::Library(std::string library)
+{
+    library_ = std::move(library);
+    return *this;
+}
+
+std::string ReadGroupInfo::MovieName() const { return movieName_; }
+
+ReadGroupInfo& ReadGroupInfo::MovieName(std::string movieName)
+{
+    movieName_ = std::move(movieName);
+    return *this;
+}
+
+std::string ReadGroupInfo::Platform() const { return std::string("PACBIO"); }
+
+PlatformModelType ReadGroupInfo::PlatformModel() const { return platformModel_; }
+
+ReadGroupInfo& ReadGroupInfo::PlatformModel(PlatformModelType platform)
+{
+    platformModel_ = platform;
+    return *this;
+}
+
+std::string ReadGroupInfo::PredictedInsertSize() const { return predictedInsertSize_; }
+
+ReadGroupInfo& ReadGroupInfo::PredictedInsertSize(std::string size)
+{
+    predictedInsertSize_ = std::move(size);
+    return *this;
+}
+
+std::string ReadGroupInfo::Programs() const { return programs_; }
+
+ReadGroupInfo& ReadGroupInfo::Programs(std::string programs)
+{
+    programs_ = std::move(programs);
+    return *this;
+}
+
+FrameCodec ReadGroupInfo::PulseWidthCodec() const { return pulseWidthCodec_; }
+
+// Selects the pulse-width frame codec and registers the pulse-width base feature
+// under \p tag ("pw" when \p tag is empty).
+ReadGroupInfo& ReadGroupInfo::PulseWidthCodec(FrameCodec codec, std::string tag)
+{
+    pulseWidthCodec_ = codec;
+
+    // keep the base features map in step with the codec choice
+    if (tag.empty()) tag = "pw";
+    return BaseFeatureTag(BaseFeature::PULSE_WIDTH, std::move(tag));
+}
+
+/// \returns a copy of the stored read type string
+std::string ReadGroupInfo::ReadType() const
+{
+    return readType_;
+}
+
+/// Replaces the read type; returns *this to allow call chaining.
+ReadGroupInfo& ReadGroupInfo::ReadType(std::string type)
+{
+    readType_ = std::move(type);
+    return *this;
+}
+
+/// Removes \p feature from the feature map; a no-op if it is not present.
+ReadGroupInfo& ReadGroupInfo::RemoveBaseFeature(BaseFeature feature)
+{
+    const auto found = features_.find(feature);
+    if (found == features_.end()) return *this;
+
+    features_.erase(found);
+    return *this;
+}
+
+/// \returns a copy of the sample name (written by ToSam() under sam_SM)
+std::string ReadGroupInfo::Sample() const
+{
+    return sample_;
+}
+
+/// Replaces the sample name; returns *this to allow call chaining.
+ReadGroupInfo& ReadGroupInfo::Sample(std::string sample)
+{
+    sample_ = std::move(sample);
+    return *this;
+}
+
+/// \returns a copy of the sequencing center (written by ToSam() under sam_CN)
+std::string ReadGroupInfo::SequencingCenter() const
+{
+    return sequencingCenter_;
+}
+
+/// Replaces the sequencing center; returns *this to allow call chaining.
+ReadGroupInfo& ReadGroupInfo::SequencingCenter(std::string center)
+{
+    sequencingCenter_ = std::move(center);
+    return *this;
+}
+
+/// Returns the chemistry name, resolving it from the binding kit, sequencing
+/// kit and basecaller version on first use and caching the result.
+///
+/// Propagates whatever SequencingChemistryFromTriple() throws.
+std::string ReadGroupInfo::SequencingChemistry() const
+{
+    // an empty cache means "not resolved yet"
+    if (sequencingChemistry_.empty()) {
+        sequencingChemistry_ =
+            SequencingChemistryFromTriple(BindingKit(), SequencingKit(), BasecallerVersion());
+    }
+    return sequencingChemistry_;
+}
+
+/// Looks up the chemistry name for a (binding kit, sequencing kit, basecaller
+/// version) triple. Only the first two '.'-separated fields of the version
+/// take part in the match.
+///
+/// The table from GetChemistryTableFromEnv() is consulted first, so it
+/// overrides BuiltInChemistryTable().
+///
+/// \throws std::runtime_error if the version has fewer than two fields
+/// \throws InvalidSequencingChemistryException if no table row matches
+std::string ReadGroupInfo::SequencingChemistryFromTriple(const std::string& bindingKit,
+                                                         const std::string& sequencingKit,
+                                                         const std::string& basecallerVersion)
+{
+    const auto versionFields = Split(basecallerVersion, '.');
+    if (versionFields.size() < 2) {
+        throw std::runtime_error{"ReadGroupInfo: basecaller version is too short: " +
+                                 basecallerVersion};
+    }
+    const std::string majorMinor = versionFields[0] + '.' + versionFields[1];
+
+    // row layout: [0] binding kit, [1] sequencing kit, [2] version, [3] chemistry name
+    const auto matches = [&](const auto& row) {
+        return bindingKit == row[0] && sequencingKit == row[1] && majorMinor == row[2];
+    };
+
+    // updated table (empty if none was provided) takes precedence
+    for (const auto& row : GetChemistryTableFromEnv())
+        if (matches(row)) return row[3];
+
+    for (const auto& row : BuiltInChemistryTable())
+        if (matches(row)) return row[3];
+
+    throw InvalidSequencingChemistryException{bindingKit, sequencingKit, basecallerVersion};
+}
+
+/// \returns a copy of the stored sequencing kit number
+std::string ReadGroupInfo::SequencingKit() const
+{
+    return sequencingKit_;
+}
+
+/// Replaces the sequencing kit number. When the value actually changes, the
+/// cached chemistry name is cleared so it gets resolved again on next request.
+ReadGroupInfo& ReadGroupInfo::SequencingKit(std::string kitNumber)
+{
+    if (sequencingKit_ == kitNumber) return *this;  // unchanged: keep the cache
+
+    sequencingKit_ = std::move(kitNumber);
+    sequencingChemistry_.clear();
+    return *this;
+}
+
+/// Static convenience wrapper around rg.ToSam().
+std::string ReadGroupInfo::ToSam(const ReadGroupInfo& rg)
+{
+    return rg.ToSam();
+}
+
+/// Builds the "@RG" header line for this read group.
+///
+/// The ID and platform tags are always written, the description only when it
+/// is non-empty, and each remaining string field only when set. The platform
+/// model tag is always written; custom tags come last.
+std::string ReadGroupInfo::ToSam() const
+{
+    std::ostringstream sam;
+    sam << "@RG" << MakeSamTag(sam_ID, id_) << MakeSamTag(sam_PL, Platform());
+
+    const auto description = EncodeSamDescription();
+    if (!description.empty()) sam << MakeSamTag(sam_DS, description);
+
+    // optional string fields: emitted only when non-empty, in this fixed order
+    const auto appendIfSet = [&sam](const std::string& tag, const std::string& value) {
+        if (!value.empty()) sam << MakeSamTag(tag, value);
+    };
+    appendIfSet(sam_CN, sequencingCenter_);
+    appendIfSet(sam_DT, date_);
+    appendIfSet(sam_FO, flowOrder_);
+    appendIfSet(sam_KS, keySequence_);
+    appendIfSet(sam_LB, library_);
+    appendIfSet(sam_PG, programs_);
+    appendIfSet(sam_PI, predictedInsertSize_);
+    appendIfSet(sam_PU, movieName_);
+    appendIfSet(sam_SM, sample_);
+
+    // barcode pair is written as "<first>--<second>"
+    if (barcodes_) {
+        sam << '\t' << sam_BC << ':' << barcodes_->first << "--" << barcodes_->second;
+    }
+
+    sam << MakeSamTag(sam_PM, PlatformModelName(platformModel_));
+
+    // custom tags go last
+    for (const auto& attribute : custom_) {
+        sam << MakeSamTag(attribute.first, attribute.second);
+    }
+
+    return sam.str();
+}
+
+/// Derives a read group ID: the first 8 characters of
+/// MD5Hash("<movieName>//<readType>").
+std::string MakeReadGroupId(const std::string& movieName, const std::string& readType)
+{
+    const std::string hashInput = movieName + "//" + readType;
+    return MD5Hash(hashInput).substr(0, 8);
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/RecordType.cpp b/src/RecordType.cpp

new file mode 100644 (file)

index 0000000..02b29e6
--- /dev/null
+++ b/src/RecordType.cpp
@@ -0,0 +1,45 @@
+// File Description
+/// \file RecordType.cpp
+/// \brief Implements the RecordType-related methods
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/RecordType.h"
+
+#include <map>
+#include <stdexcept>
+
+namespace PacBio {
+namespace BAM {
+
+/// \returns true only for RecordType::CCS and RecordType::TRANSCRIPT
+bool IsCcsOrTranscript(const RecordType type)
+{
+    switch (type) {
+        case RecordType::CCS:
+        case RecordType::TRANSCRIPT:
+            return true;
+        default:
+            return false;
+    }
+}
+
+/// Converts a RecordType to its upper-case name.
+///
+/// \throws std::runtime_error if \p type has no entry in the name table
+std::string ToString(const RecordType type)
+{
+    // clang-format off
+    static const std::map<RecordType, std::string> names{
+        { RecordType::ZMW,        "ZMW" },
+        { RecordType::HQREGION,   "HQREGION" },
+        { RecordType::SUBREAD,    "SUBREAD" },
+        { RecordType::CCS,        "CCS" },
+        { RecordType::SCRAP,      "SCRAP" },
+        { RecordType::TRANSCRIPT, "TRANSCRIPT" },
+        { RecordType::UNKNOWN,    "UNKNOWN" }
+    };
+    // clang-format on
+
+    const auto found = names.find(type);
+    if (found == names.cend()) throw std::runtime_error{"BamRecordType: unknown type"};
+    return found->second;
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/SamTagCodec.cpp b/src/SamTagCodec.cpp

new file mode 100644 (file)

index 0000000..47bda7e
--- /dev/null
+++ b/src/SamTagCodec.cpp
@@ -0,0 +1,281 @@
+// File Description
+/// \file SamTagCodec.cpp
+/// \brief Implements the SamTagCodec class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/SamTagCodec.h"
+
+#include <cstdint>
+#include <limits>
+#include <string>
+
+#include <boost/lexical_cast.hpp>
+
+#include "pbbam/StringUtilities.h"
+
+namespace PacBio {
+namespace BAM {
+namespace {
+
+// Parses the element list of a SAM float array, e.g. ",1.5,2.5".
+//
+// Every element is expected to be preceded by exactly one separator
+// character, so each parse starts one past the current cursor. strtof()
+// moves the cursor to the first character it did not consume.
+std::vector<float> readFloatSamMultiValue(const std::string& data)
+{
+    std::vector<float> values;
+    const char* const stop = data.c_str() + data.length();
+    char* cursor = const_cast<char*>(data.c_str());  // never written through
+    while (cursor + 1 < stop) {
+        char* next = nullptr;
+        const float parsed = strtof(cursor + 1, &next);
+        values.emplace_back(parsed);
+        cursor = next;
+    }
+    return values;
+}
+
+// Parses the element list of a SAM signed-integer array, e.g. ",-3,4".
+//
+// Every element is expected to be preceded by exactly one separator
+// character. Values are read with strtol() in base 0 (so "0x"/"0" prefixes
+// are honoured) and then converted to T.
+template <typename T>
+std::vector<T> readSignedSamMultiValue(const std::string& data)
+{
+    std::vector<T> values;
+    const char* const stop = data.c_str() + data.length();
+    char* cursor = const_cast<char*>(data.c_str());  // never written through
+    while (cursor + 1 < stop) {
+        char* next = nullptr;
+        const long parsed = strtol(cursor + 1, &next, 0);
+        values.emplace_back(parsed);
+        cursor = next;
+    }
+    return values;
+}
+
+// Parses the element list of a SAM unsigned-integer array, e.g. ",3,4".
+//
+// Every element is expected to be preceded by exactly one separator
+// character. Values are read with strtoul() in base 0 (so "0x"/"0" prefixes
+// are honoured) and then converted to T.
+template <typename T>
+std::vector<T> readUnsignedSamMultiValue(const std::string& data)
+{
+    std::vector<T> values;
+    const char* const stop = data.c_str() + data.length();
+    char* cursor = const_cast<char*>(data.c_str());  // never written through
+    while (cursor + 1 < stop) {
+        char* next = nullptr;
+        const unsigned long parsed = strtoul(cursor + 1, &next, 0);
+        values.emplace_back(parsed);
+        cursor = next;
+    }
+    return values;
+}
+
+}  // namespace
+
+/// Decodes a tab-separated list of SAM tags ("TT:t:VALUE") into a TagCollection.
+///
+/// Tokens shorter than 6 characters are silently skipped. Integer tags are
+/// stored in the narrowest 8/16/32-bit type that holds the value; values
+/// starting with '-' use signed types, all others unsigned types.
+///
+/// \throws std::runtime_error on an unsupported tag type or array element type
+/// \note values that cannot be parsed are reported by boost::lexical_cast
+TagCollection SamTagCodec::Decode(const std::string& tagString)
+{
+    TagCollection tags;
+
+    const auto tokens = Split(tagString, '\t');
+    for (const auto& token : tokens) {
+        if (token.size() < 6)  // TT:t:X
+            continue;
+
+        const auto name = token.substr(0, 2);
+        const auto type = token.at(3);
+        const auto remainder = token.substr(5);
+        if (remainder.empty()) throw std::runtime_error{"SamTagCodec: malformatted tag: " + token};
+
+        switch (type) {
+
+            // technically only 'A' is allowed in SAM chars,
+            // but we'll be a little permissive
+            case 'A':
+            case 'a': {
+                // The modifier must be the constructor's second argument. It used to sit
+                // inside the static_cast's parentheses, where the comma operator
+                // discarded the character and cast the modifier to char instead.
+                tags[name] = Tag(static_cast<int8_t>(remainder[0]), TagModifier::ASCII_CHAR);
+                break;
+            }
+
+            // technically only 'i' is allowed in SAM ints, but we'll be a little
+            // permissive since SAM might be a bit more "user-edited" than BAM
+            case 'c':
+            case 'C':
+            case 's':
+            case 'S':
+            case 'i':
+            case 'I': {
+                // check out boost::numeric cast for these conversions
+
+                // negative value (force signed int)
+                if (remainder[0] == '-') {
+                    const auto x = boost::lexical_cast<int32_t>(remainder);
+                    if (x >= std::numeric_limits<int8_t>::min())
+                        tags[name] = static_cast<int8_t>(x);
+                    else if (x >= std::numeric_limits<int16_t>::min())
+                        tags[name] = static_cast<int16_t>(x);
+                    else
+                        tags[name] = x;
+                }
+
+                // unsigned int
+                else {
+                    const auto x = boost::lexical_cast<uint32_t>(remainder);
+                    if (x <= std::numeric_limits<uint8_t>::max())
+                        tags[name] = static_cast<uint8_t>(x);
+                    else if (x <= std::numeric_limits<uint16_t>::max())
+                        tags[name] = static_cast<uint16_t>(x);
+                    else
+                        tags[name] = x;
+                }
+                break;
+            }
+
+            case 'f': {
+                tags[name] = boost::lexical_cast<float>(remainder);
+                break;
+            }
+
+            case 'Z': {
+                tags[name] = remainder;
+                break;
+            }
+
+            case 'H': {
+                tags[name] = Tag(remainder, TagModifier::HEX_STRING);
+                break;
+            }
+
+            case 'B': {
+                // first char is the element type; the rest is ",v1,v2,..."
+                const auto elementType = remainder[0];
+                const auto arrayData = remainder.substr(1);
+                switch (elementType) {
+                    case 'c':
+                        tags[name] = readSignedSamMultiValue<int8_t>(arrayData);
+                        break;
+                    case 'C':
+                        tags[name] = readUnsignedSamMultiValue<uint8_t>(arrayData);
+                        break;
+                    case 's':
+                        tags[name] = readSignedSamMultiValue<int16_t>(arrayData);
+                        break;
+                    case 'S':
+                        tags[name] = readUnsignedSamMultiValue<uint16_t>(arrayData);
+                        break;
+                    case 'i':
+                        tags[name] = readSignedSamMultiValue<int32_t>(arrayData);
+                        break;
+                    case 'I':
+                        tags[name] = readUnsignedSamMultiValue<uint32_t>(arrayData);
+                        break;
+                    case 'f':
+                        tags[name] = readFloatSamMultiValue(arrayData);
+                        break;
+                    default:
+                        // parentheses, not braces: std::string{1, c} would select the
+                        // initializer-list constructor and yield "\x01" followed by c
+                        throw std::runtime_error{
+                            "SamTagCodec: unsupported array-tag-type encountered: " +
+                            std::string(1, elementType)};
+                }
+                break;
+            }
+
+            // unsupported SAM tag type
+            default:
+                // parentheses, not braces (see the array case above)
+                throw std::runtime_error{"SamTagCodec: unsupported tag-type encountered: " +
+                                         std::string(1, type)};
+        }
+    }
+
+    return tags;
+}
+
+/// Encodes a single tag as SAM text: "<name>:<type>:<data>".
+///
+/// A null tag yields an empty string. Every integer width is written with
+/// SAM type 'i'; strings use 'Z', or 'H' when flagged HEX_STRING; arrays use
+/// "B:<elemtype>" followed by ",value" per element.
+///
+/// \throws std::runtime_error if \p name is not exactly 2 characters long, or
+///         if the tag holds an unsupported data type
+std::string SamTagCodec::Encode(const std::string& name, const PacBio::BAM::Tag& tag)
+{
+    if (name.size() != 2) throw std::runtime_error{"SamTagCodec: malformatted tag name: " + name};
+    if (tag.IsNull()) return {};
+
+    std::ostringstream sam;
+    sam << name << ':';
+
+    // ASCII-flagged tags become "A:<char>"; a NUL char falls through to the
+    // generic encoding below
+    if (tag.HasModifier(TagModifier::ASCII_CHAR)) {
+        const auto asciiChar = tag.ToAscii();
+        if (asciiChar != '\0') {
+            sam << "A:" << asciiChar;
+            return sam.str();
+        }
+    }
+
+    // writes "<header>,v1,v2,..."
+    const auto writeArray = [&sam](const char* header, const auto& elements) {
+        sam << header;
+        for (const auto element : elements)
+            sam << ',' << +element;  // unary plus: 8-bit values print as numbers, not chars
+    };
+
+    switch (tag.Type()) {
+        // scalars
+        case TagDataType::INT8:
+            sam << "i:" << static_cast<int32_t>(tag.ToInt8());
+            break;
+        case TagDataType::UINT8:
+            sam << "i:" << static_cast<int32_t>(tag.ToUInt8());
+            break;
+        case TagDataType::INT16:
+            sam << "i:" << tag.ToInt16();
+            break;
+        case TagDataType::UINT16:
+            sam << "i:" << tag.ToUInt16();
+            break;
+        case TagDataType::INT32:
+            sam << "i:" << tag.ToInt32();
+            break;
+        case TagDataType::UINT32:
+            sam << "i:" << tag.ToUInt32();
+            break;
+        case TagDataType::FLOAT:
+            sam << "f:" << tag.ToFloat();
+            break;
+        case TagDataType::STRING: {
+            const char typeCode = tag.HasModifier(TagModifier::HEX_STRING) ? 'H' : 'Z';
+            sam << typeCode << ':' << tag.ToString();
+            break;
+        }
+
+        // arrays
+        case TagDataType::INT8_ARRAY:
+            writeArray("B:c", tag.ToInt8Array());
+            break;
+        case TagDataType::UINT8_ARRAY:
+            writeArray("B:C", tag.ToUInt8Array());
+            break;
+        case TagDataType::INT16_ARRAY:
+            writeArray("B:s", tag.ToInt16Array());
+            break;
+        case TagDataType::UINT16_ARRAY:
+            writeArray("B:S", tag.ToUInt16Array());
+            break;
+        case TagDataType::INT32_ARRAY:
+            writeArray("B:i", tag.ToInt32Array());
+            break;
+        case TagDataType::UINT32_ARRAY:
+            writeArray("B:I", tag.ToUInt32Array());
+            break;
+        case TagDataType::FLOAT_ARRAY:
+            writeArray("B:f", tag.ToFloatArray());
+            break;
+
+        default:
+            throw std::runtime_error{"SamTagCodec: unsupported tag-type encountered: " +
+                                     std::to_string(static_cast<uint16_t>(tag.Type()))};
+    }
+    return sam.str();
+}
+
+/// Encodes every tag in \p tags and joins the results with tabs.
+///
+/// Tags that encode to an empty string (null tags) are skipped entirely, so
+/// the output never contains doubled or trailing tab separators.
+std::string SamTagCodec::Encode(const TagCollection& tags)
+{
+    std::ostringstream result;
+    bool first = true;  // tracked explicitly; avoids copying result.str() every iteration
+    for (const auto& tagIter : tags) {
+        const std::string encoded = Encode(tagIter.first, tagIter.second);
+        if (encoded.empty()) continue;
+
+        if (!first) result << '\t';
+        result << encoded;
+        first = false;
+    }
+    return result.str();
+}
+
+/// Formats one SAM header field, including its leading tab: "\t<tag>:<value>".
+std::string MakeSamTag(std::string tag, std::string value)
+{
+    std::string field;
+    field.reserve(tag.size() + value.size() + 2);  // tab + tag + colon + value
+    field += '\t';
+    field += tag;
+    field += ':';
+    field += value;
+    return field;
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/SamWriter.cpp b/src/SamWriter.cpp

new file mode 100644 (file)

index 0000000..e3a78b9
--- /dev/null
+++ b/src/SamWriter.cpp
@@ -0,0 +1,116 @@
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/SamWriter.h"
+
+#include <cassert>
+#include <memory>
+#include <string>
+#include <type_traits>
+
+#include <htslib/hfile.h>
+#include <htslib/sam.h>
+
+#include "Autovalidate.h"
+#include "FileProducer.h"
+#include "MemoryUtils.h"
+#include "pbbam/Validator.h"
+
+namespace PacBio {
+namespace BAM {
+
+// Private implementation of SamWriter: owns the htslib file handle and the
+// raw header that every record is written against.
+class SamWriter::SamWriterPrivate : public FileProducer
+{
+public:
+    // Opens the file named by TempFilename() for writing and writes the header.
+    //
+    // Throws std::runtime_error if the header is null, the file cannot be
+    // opened, or the header cannot be written.
+    SamWriterPrivate(std::string filename, const std::shared_ptr<bam_hdr_t> rawHeader)
+        : FileProducer{std::move(filename)}, header_{rawHeader}
+    {
+        if (header_ == nullptr) throw std::runtime_error{"SamWriter: null header provided"};
+
+        // open file
+        const auto& targetFilename = TempFilename();
+        file_.reset(sam_open(targetFilename.c_str(), "w"));
+        if (file_ == nullptr) {
+            throw std::runtime_error{"SamWriter: could not open file for writing: " +
+                                     targetFilename};
+        }
+
+        // write header
+        if (sam_hdr_write(file_.get(), header_.get()) != 0) {
+            throw std::runtime_error{"SamWriter: could not write header to file: " +
+                                     targetFilename};
+        }
+    }
+
+    // Writes one record as SAM text. The record is taken by value because it
+    // may be modified (bin number, CIGAR location) before being written.
+    //
+    // Throws std::runtime_error if htslib reports that nothing was written.
+    void Write(BamRecord record)
+    {
+#if PBBAM_AUTOVALIDATE
+        Validator::Validate(record);
+#endif
+
+        const auto raw = BamRecordMemory::GetRawData(record);
+
+        // store bin number
+        // min_shift=14 & n_lvls=5 are SAM/BAM "magic numbers"
+        raw->core.bin = hts_reg2bin(raw->core.pos, bam_endpos(raw.get()), 14, 5);
+
+        // Maybe adjust location of long CIGAR (>65535 ops) data, depending on the
+        // runtime htslib version.
+        //
+        // SAM formatting in htslib versions previous to 1.7 is unaware of the new
+        // long CIGAR implementation ("CG" tag). So we need to move that back to the
+        // "standard" field so that SAM output is correct. Versions >=1.7 properly
+        // display long CIGARs.
+        //
+        // This transform will become unnecessary when we drop support for htslib pre-v1.7.
+        //
+        static const bool nativeLongCigar = DoesHtslibSupportLongCigar();
+        const auto cigar = record.CigarData();
+        const bool needsCigarFixup = !nativeLongCigar && cigar.size() > 65535;
+        if (needsCigarFixup) {
+            if (record.Impl().HasTag("CG")) record.Impl().RemoveTag("CG");
+            record.Impl().SetCigarData(cigar);
+        }
+
+        // write record to file
+        const int written = sam_write1(file_.get(), header_.get(), raw.get());
+        if (written <= 0) throw std::runtime_error{"SamWriter: could not write record"};
+    }
+
+    std::unique_ptr<samFile, HtslibFileDeleter> file_;  // released via HtslibFileDeleter
+    std::shared_ptr<bam_hdr_t> header_;
+};
+
+static_assert(!std::is_copy_constructible<SamWriter>::value,  // compile-time: no copy ctor
+              "SamWriter(const SamWriter&) is not = delete");
+static_assert(!std::is_copy_assignable<SamWriter>::value,  // compile-time: no copy assignment
+              "SamWriter& operator=(const SamWriter&) is not = delete");
+
+SamWriter::SamWriter(std::string filename, const BamHeader& header)
+    : IRecordWriter()
+    , d_{std::make_unique<SamWriterPrivate>(std::move(filename),  // opens file, writes header
+                                            BamHeaderMemory::MakeRawHeader(header))}
+{
+#if PBBAM_AUTOVALIDATE
+    Validator::Validate(header);  // NOTE(review): runs only after the header was written above
+#endif
+}
+
+SamWriter::SamWriter(SamWriter&&) noexcept = default;  // defined here, where SamWriterPrivate is complete
+
+SamWriter& SamWriter::operator=(SamWriter&&) noexcept = default;  // same reason as above
+
+SamWriter::~SamWriter() = default;  // same reason as above
+
+/// Flushes buffered output to the underlying file.
+///
+/// \throws std::runtime_error if there is no open stream or the flush fails
+void SamWriter::TryFlush()
+{
+    // This used to throw whenever the stream pointer was non-null (i.e. for
+    // every open file) and never flushed. hflush() returns 0 on success and
+    // EOF on failure.
+    hFILE* const stream = d_->file_->fp.hfile;
+    if (stream == nullptr || hflush(stream) != 0)
+        throw std::runtime_error{"SamWriter: could not flush output buffer contents"};
+}
+
+/// Writes \p record; the implementation works on its own copy of the record.
+void SamWriter::Write(const BamRecord& record)
+{
+    d_->Write(record);
+}
+
+/// Wraps \p recordImpl in a BamRecord and writes that.
+void SamWriter::Write(const BamRecordImpl& recordImpl)
+{
+    const BamRecord record{recordImpl};
+    Write(record);
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/SequenceInfo.cpp b/src/SequenceInfo.cpp

new file mode 100644 (file)

index 0000000..519e55e
--- /dev/null
+++ b/src/SequenceInfo.cpp
@@ -0,0 +1,180 @@
+// File Description
+/// \file SequenceInfo.cpp
+/// \brief Implements the SequenceInfo class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/SequenceInfo.h"
+
+#include <cassert>
+#include <cstdint>
+#include <limits>
+#include <sstream>
+#include <tuple>
+#include <type_traits>
+
+#include "pbbam/SamTagCodec.h"
+#include "pbbam/StringUtilities.h"
+
+namespace PacBio {
+namespace BAM {
+namespace {
+
+const std::string token_SN{"SN"};  // @SQ field <-> SequenceInfo::Name()
+const std::string token_LN{"LN"};  // @SQ field <-> SequenceInfo::Length()
+const std::string token_AS{"AS"};  // @SQ field <-> SequenceInfo::AssemblyId()
+const std::string token_M5{"M5"};  // @SQ field <-> SequenceInfo::Checksum()
+const std::string token_SP{"SP"};  // @SQ field <-> SequenceInfo::Species()
+const std::string token_UR{"UR"};  // @SQ field <-> SequenceInfo::Uri()
+
+}  // anonymous
+
+static_assert(std::is_copy_constructible<SequenceInfo>::value,  // compile-time: copyable
+              "SequenceInfo(const SequenceInfo&) is not = default");
+static_assert(std::is_copy_assignable<SequenceInfo>::value,  // compile-time: copy-assignable
+              "SequenceInfo& operator=(const SequenceInfo&) is not = default");
+
+static_assert(std::is_nothrow_move_constructible<SequenceInfo>::value,  // move ctor is noexcept
+              "SequenceInfo(SequenceInfo&&) is not = noexcept");
+static_assert(std::is_nothrow_move_assignable<SequenceInfo>::value ==  // only as strong as
+                  std::is_nothrow_move_assignable<std::string>::value,  // std::string's guarantee
+              "");
+
+/// Creates a sequence entry from its name and its length (kept as a string).
+SequenceInfo::SequenceInfo(std::string name, std::string length)
+    : name_{std::move(name)}, length_{std::move(length)}
+{
+}
+
+/// Two entries are equal when all standard fields and all custom tags match.
+bool SequenceInfo::operator==(const SequenceInfo& other) const
+{
+    // clang-format off
+    return assemblyId_ == other.assemblyId_ &&
+           checksum_   == other.checksum_   &&
+           length_     == other.length_     &&
+           name_       == other.name_       &&
+           species_    == other.species_    &&
+           uri_        == other.uri_        &&
+           custom_     == other.custom_;
+    // clang-format on
+}
+
+/// Negation of operator==.
+bool SequenceInfo::operator!=(const SequenceInfo& other) const
+{
+    const bool same = (*this == other);
+    return !same;
+}
+
+/// \returns a copy of the assembly ID (the "AS" field)
+std::string SequenceInfo::AssemblyId() const
+{
+    return assemblyId_;
+}
+
+/// Replaces the assembly ID; returns *this to allow call chaining.
+SequenceInfo& SequenceInfo::AssemblyId(std::string id)
+{
+    assemblyId_ = std::move(id);
+    return *this;
+}
+
+/// \returns a copy of the checksum (the "M5" field)
+std::string SequenceInfo::Checksum() const
+{
+    return checksum_;
+}
+
+/// Replaces the checksum; returns *this to allow call chaining.
+SequenceInfo& SequenceInfo::Checksum(std::string checksum)
+{
+    checksum_ = std::move(checksum);
+    return *this;
+}
+
+/// \returns a copy of the non-standard tag -> value map
+std::map<std::string, std::string> SequenceInfo::CustomTags() const
+{
+    return custom_;
+}
+
+/// Replaces all custom tags; returns *this to allow call chaining.
+SequenceInfo& SequenceInfo::CustomTags(std::map<std::string, std::string> custom)
+{
+    custom_ = std::move(custom);
+    return *this;
+}
+
+/// Parses an "@SQ" header line into a SequenceInfo.
+///
+/// The first 4 characters ("@SQ" plus tab) are dropped; the rest is split on
+/// tabs into "TT:VALUE" fields. Fields with a known tag set the matching
+/// attribute, all others are collected as custom tags.
+///
+/// \throws std::out_of_range (from std::string::substr) if \p sam is shorter
+///         than 4 characters or a field is shorter than 3 characters
+SequenceInfo SequenceInfo::FromSam(const std::string& sam)
+{
+    SequenceInfo result;
+    std::map<std::string, std::string> customTags;
+
+    for (const auto& field : Split(sam.substr(4), '\t')) {
+        const auto tag = field.substr(0, 2);
+        auto value = field.substr(3);  // skips the ':' after the tag
+
+        // clang-format off
+        if      (tag == token_SN) result.Name(std::move(value));
+        else if (tag == token_LN) result.Length(std::move(value));
+        else if (tag == token_AS) result.AssemblyId(std::move(value));
+        else if (tag == token_M5) result.Checksum(std::move(value));
+        else if (tag == token_SP) result.Species(std::move(value));
+        else if (tag == token_UR) result.Uri(std::move(value));
+        else                      customTags[tag] = std::move(value);  // "custom" tag
+        // clang-format on
+    }
+
+    result.CustomTags(std::move(customTags));
+    return result;
+}
+
+/// A sequence entry is valid when it has a name and its length parses to a
+/// value in [0, INT32_MAX].
+bool SequenceInfo::IsValid() const
+{
+    if (name_.empty()) return false;
+
+    // Parse as long long rather than long: 'long' is only 32 bits wide on some
+    // platforms (e.g. 64-bit Windows), where it cannot reveal an int32_t
+    // overflow. strtoll() also clamps out-of-range input instead of invoking
+    // undefined behaviour like atol().
+    const long long len = strtoll(length_.c_str(), nullptr, 10);
+    return len >= 0 && len <= std::numeric_limits<int32_t>::max();
+}
+
+/// \returns a copy of the sequence length string (the "LN" field)
+std::string SequenceInfo::Length() const
+{
+    return length_;
+}
+
+/// Replaces the sequence length; returns *this to allow call chaining.
+SequenceInfo& SequenceInfo::Length(std::string length)
+{
+    length_ = std::move(length);
+    return *this;
+}
+
+/// \returns a copy of the sequence name (the "SN" field)
+std::string SequenceInfo::Name() const
+{
+    return name_;
+}
+
+/// Replaces the sequence name; returns *this to allow call chaining.
+SequenceInfo& SequenceInfo::Name(std::string name)
+{
+    name_ = std::move(name);
+    return *this;
+}
+
+/// \returns a copy of the species (the "SP" field)
+std::string SequenceInfo::Species() const
+{
+    return species_;
+}
+
+/// Replaces the species; returns *this to allow call chaining.
+SequenceInfo& SequenceInfo::Species(std::string species)
+{
+    species_ = std::move(species);
+    return *this;
+}
+
+/// Static convenience wrapper around seq.ToSam().
+std::string SequenceInfo::ToSam(const SequenceInfo& seq) { return seq.ToSam(); }
+
+/// Builds the "@SQ" header line for this sequence.
+///
+/// The name is always written; the remaining standard fields only when
+/// non-empty. Custom tags come last.
+std::string SequenceInfo::ToSam() const
+{
+    std::ostringstream out;
+    out << "@SQ" << MakeSamTag(token_SN, name_);
+
+    // clang-format off
+    if (!length_.empty())     out << MakeSamTag(token_LN, length_);
+    if (!assemblyId_.empty()) out << MakeSamTag(token_AS, assemblyId_);
+    if (!checksum_.empty())   out << MakeSamTag(token_M5, checksum_);
+    if (!species_.empty())    out << MakeSamTag(token_SP, species_);
+    if (!uri_.empty())        out << MakeSamTag(token_UR, uri_);
+    // clang-format on
+
+    // Append any custom tags. custom_ is const in this method, so its entries
+    // can only be copied; no std::move here, which would silently copy anyway.
+    for (const auto& attribute : custom_)
+        out << MakeSamTag(attribute.first, attribute.second);
+
+    return out.str();
+}
+
+/// \returns a copy of the URI (the "UR" field)
+std::string SequenceInfo::Uri() const
+{
+    return uri_;
+}
+
+/// Replaces the URI; returns *this to allow call chaining.
+SequenceInfo& SequenceInfo::Uri(std::string uri)
+{
+    uri_ = std::move(uri);
+    return *this;
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/SequenceUtils.h b/src/SequenceUtils.h

new file mode 100644 (file)

index 0000000..8bad6ca
--- /dev/null
+++ b/src/SequenceUtils.h
@@ -0,0 +1,126 @@
+// Author: Derek Barnett
+
+#ifndef SEQUENCEUTILS_H
+#define SEQUENCEUTILS_H
+
+#include "pbbam/Config.h"
+
+#include <algorithm>
+#include <array>
+#include <cctype>
+#include <cstdint>
+#include <string>
+
+#include "pbbam/StringUtilities.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// Returns the IUPAC complement of a nucleotide code.
+///
+/// Upper- and lower-case input both yield the UPPER-case complement; '*' and
+/// '-' map to themselves. Every other character yields 0 (NUL).
+inline char Complement(const char character)
+{
+    // clang-format off
+    switch (character) {
+        case 'A': case 'a': return 'T';
+        case 'B': case 'b': return 'V';
+        case 'C': case 'c': return 'G';
+        case 'D': case 'd': return 'H';
+        case 'G': case 'g': return 'C';
+        case 'H': case 'h': return 'D';
+        case 'K': case 'k': return 'M';
+        case 'M': case 'm': return 'K';
+        case 'N': case 'n': return 'N';
+        case 'R': case 'r': return 'Y';
+        case 'S': case 's': return 'S';
+        case 'T': case 't': return 'A';
+        case 'U': case 'u': return 'A';
+        case 'V': case 'v': return 'B';
+        case 'W': case 'w': return 'W';
+        case 'Y': case 'y': return 'R';
+        case '*':           return '*';
+        case '-':           return '-';
+        default:            return 0;
+    }
+    // clang-format on
+}
+
+/// Reverses the elements of \p input in place.
+template <typename T>
+void Reverse(T& input)
+{
+    auto first = input.begin();
+    auto last = input.end();
+    std::reverse(first, last);
+}
+
+/// Reverses \p input in place when \p reverse is true, then hands it back.
+///
+/// For an lvalue argument the return type is a reference to that same
+/// object. For an rvalue argument the result is returned by value and is
+/// moved (not copied) out of \p input.
+template <typename T>
+T MaybeReverse(T&& input, bool reverse)
+{
+    if (reverse) std::reverse(input.begin(), input.end());
+    // forward, so an rvalue argument is moved into the result; a plain
+    // `return input;` treats the named parameter as an lvalue and copies it
+    return std::forward<T>(input);
+}
+
+/// Returns a reversed copy of \p input; the argument itself is not modified.
+template <typename T>
+T Reversed(const T& input)
+{
+    T copy(input);
+    std::reverse(copy.begin(), copy.end());
+    return copy;
+}
+/// Reverse-complements \p seq in place: each base is replaced by
+/// Complement(base), then the whole string is reversed.
+inline void ReverseComplement(std::string& seq)
+{
+    for (char& base : seq)
+        base = Complement(base);
+    std::reverse(seq.begin(), seq.end());
+}
+
+/// Takes ownership of \p seq and returns it, reverse-complemented when
+/// \p reverse is true and unchanged otherwise.
+inline std::string MaybeReverseComplement(std::string&& seq, bool reverse)
+{
+    std::string result{std::move(seq)};
+    if (reverse) ReverseComplement(result);
+    return result;
+}
+
+/// Reverse-complements a DNA sequence in place, preserving case.
+///
+/// A/C/G/T/U (both cases) are complemented within their case ('U'/'u' map to
+/// 'A'/'a'); 'N', ' ', '*' and '-' map to themselves. Every other byte,
+/// including any byte outside the 7-bit ASCII range, becomes the value 4.
+///
+/// NOTE(review): lower-case 'n' maps to 4 while 'N' maps to 'N'; this looks
+/// unintended but is kept unchanged here. Confirm the intended mapping.
+inline void ReverseComplementCaseSens(std::string& seq)
+{
+    constexpr const static int8_t rc_table[128] = {
+        4,  4, 4, 4, 4, 4, 4,  4,  4, 4, 4, 4, 4,  4,  4, 4,  4,  4, 4, 4,   4, 4,   4, 4, 4, 4,
+        4,  4, 4, 4, 4, 4, 32, 4,  4, 4, 4, 4, 4,  4,  4, 4,  42, 4, 4, 45,  4, 4,   4, 4, 4, 4,
+        4,  4, 4, 4, 4, 4, 4,  4,  4, 4, 4, 4, 4,  84, 4, 71, 4,  4, 4, 67,  4, 4,   4, 4, 4, 4,
+        78, 4, 4, 4, 4, 4, 65, 65, 4, 4, 4, 4, 4,  4,  4, 4,  4,  4, 4, 116, 4, 103, 4, 4, 4, 99,
+        4,  4, 4, 4, 4, 4, 4,  4,  4, 4, 4, 4, 97, 97, 4, 4,  4,  4, 4, 4,   4, 4,   4, 4};
+    constexpr char unmapped = 4;  // same value the table uses for unknown characters
+
+    std::string reverseCompl(seq.length(), 'N');
+    auto out = reverseCompl.rbegin();  // fill back-to-front to get the reversal
+    for (const char base : seq) {
+        // Index via unsigned char: a byte >= 0x80 must not become a negative
+        // (out-of-bounds) table index.
+        const auto code = static_cast<unsigned char>(base);
+        *out++ = (code < 128) ? static_cast<char>(rc_table[code]) : unmapped;
+    }
+    seq = std::move(reverseCompl);
+}
+
+/// Takes ownership of \p seq and returns it, case-sensitively
+/// reverse-complemented when \p reverse is true and unchanged otherwise.
+inline std::string MaybeReverseComplementCaseSens(std::string&& seq, bool reverse)
+{
+    std::string result{std::move(seq)};
+    if (reverse) ReverseComplementCaseSens(result);
+    return result;
+}
+
+/// Returns a reverse-complemented copy of \p input; the argument is not modified.
+inline std::string ReverseComplemented(const std::string& input)
+{
+    std::string copy(input);
+    ReverseComplement(copy);
+    return copy;
+}
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // SEQUENCEUTILS_H
diff --git a/src/StringUtilities.cpp b/src/StringUtilities.cpp

new file mode 100644 (file)

index 0000000..622d73c
--- /dev/null
+++ b/src/StringUtilities.cpp
@@ -0,0 +1,48 @@
+// File Description
+/// \file StringUtilities.cpp
+/// \brief Implements the string utility functions (Join, Split, RemoveAllWhitespace).
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/StringUtilities.h"
+
+#include <algorithm>
+#include <sstream>
+
+namespace PacBio {
+namespace BAM {
+
+/// Concatenates \p tokens, inserting \p delim between consecutive elements.
+/// An empty input yields an empty string.
+std::string Join(const std::vector<std::string>& tokens, const char delim)
+{
+    std::string joined;
+    auto it = tokens.cbegin();
+    const auto last = tokens.cend();
+    if (it == last) return joined;
+
+    // first token goes in bare; every later one is preceded by the delimiter
+    joined += *it;
+    for (++it; it != last; ++it) {
+        joined += delim;
+        joined += *it;
+    }
+    return joined;
+}
+
+/// Splits \p line at every occurrence of \p delim.
+///
+/// Empty fields between delimiters are kept, but an empty field after a
+/// trailing delimiter is not reported ("a,b," -> {"a", "b"}), and an empty
+/// input yields no tokens.
+std::vector<std::string> Split(const std::string& line, const char delim)
+{
+    std::vector<std::string> tokens;
+    std::string::size_type start = 0;
+    while (start < line.size()) {
+        const auto stop = line.find(delim, start);
+        if (stop == std::string::npos) {
+            tokens.push_back(line.substr(start));  // last field, no trailing delimiter
+            break;
+        }
+        tokens.push_back(line.substr(start, stop - start));
+        start = stop + 1;
+    }
+    return tokens;
+}
+
+/// Returns \p input with every character for which std::isspace() is true removed.
+std::string RemoveAllWhitespace(std::string input)
+{
+    // std::isspace() requires a value representable as unsigned char (or EOF);
+    // passing a negative plain char (any byte >= 0x80 where char is signed) is
+    // undefined behaviour, hence the cast.
+    const auto isWhitespace = [](const char c) {
+        return std::isspace(static_cast<unsigned char>(c)) != 0;
+    };
+    input.erase(std::remove_if(input.begin(), input.end(), isWhitespace), input.end());
+    return input;
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/SubreadLengthQuery.cpp b/src/SubreadLengthQuery.cpp

new file mode 100644 (file)

index 0000000..4eb35a9
--- /dev/null
+++ b/src/SubreadLengthQuery.cpp
@@ -0,0 +1,45 @@
+// File Description
+/// \file SubreadLengthQuery.cpp
+/// \brief Implements the SubreadLengthQuery class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/SubreadLengthQuery.h"
+
+#include <cstdint>
+
+#include "pbbam/CompositeBamReader.h"
+#include "pbbam/PbiFilterTypes.h"
+
+namespace PacBio {
+namespace BAM {
+
+class SubreadLengthQuery::SubreadLengthQueryPrivate  // pimpl: holds the filtered reader
+{
+public:
+    SubreadLengthQueryPrivate(const int32_t length, const Compare::Type compareType,
+                              const DataSet& dataset)
+        : reader_(PbiQueryLengthFilter(length, compareType), dataset)  // filter on query length
+    {
+    }
+
+    PbiFilterCompositeBamReader<Compare::None> reader_;  // unsorted
+};
+
+SubreadLengthQuery::SubreadLengthQuery(const int32_t length, const Compare::Type compareType,
+                                       const DataSet& dataset)
+    : internal::IQuery()
+    , d_{std::make_unique<SubreadLengthQueryPrivate>(length, compareType, dataset)}  // all state lives in the pimpl
+{
+}
+
+// defined here, where SubreadLengthQueryPrivate is a complete type
+SubreadLengthQuery::~SubreadLengthQuery() = default;
+
+/// Forwards to the underlying reader's GetNext().
+bool SubreadLengthQuery::GetNext(BamRecord& r)
+{
+    auto& reader = d_->reader_;
+    return reader.GetNext(r);
+}
+
+/// Forwards to the underlying reader's NumReads().
+uint32_t SubreadLengthQuery::NumReads() const
+{
+    const auto& reader = d_->reader_;
+    return reader.NumReads();
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/Tag.cpp b/src/Tag.cpp

new file mode 100644 (file)

index 0000000..d0c322e
--- /dev/null
+++ b/src/Tag.cpp
@@ -0,0 +1,401 @@
+// File Description
+/// \file Tag.cpp
+/// \brief Implements the Tag class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/Tag.h"
+
+#include <cassert>
+#include <iostream>
+#include <type_traits>
+
+#include <boost/numeric/conversion/cast.hpp>
+
+namespace PacBio {
+namespace BAM {
+namespace {
+
+template <typename T>
+bool InAsciiRange(const T x)
+{
+    return (x >= 33 && x <= 126);  // '!' through '~': printable, per SAM 'A' tag type (127 is DEL)
+}
+
+struct AsciiConvertVisitor : public boost::static_visitor<char>
+{
+    // integer alternatives: Helper() range-checks the value, then narrows it to char
+    char operator()(const int8_t& x) const { return Helper(x); }
+    char operator()(const uint8_t& x) const { return Helper(x); }
+    char operator()(const int16_t& x) const { return Helper(x); }
+    char operator()(const uint16_t& x) const { return Helper(x); }
+    char operator()(const int32_t& x) const { return Helper(x); }
+    char operator()(const uint32_t& x) const { return Helper(x); }
+
+    // any other stored type always throws
+    template <typename T>
+    char operator()(const T&) const
+    {
+        throw std::runtime_error{"Tag: cannot convert to ASCII"};
+        return 0;  // never reached; presumably kept to avoid missing-return warnings
+    }
+
+private:
+    template <typename T>
+    char Helper(const T& x) const
+    {
+        if (!InAsciiRange(x)) throw std::runtime_error{"Tag: char is outside valid ASCII range"};
+        return static_cast<char>(x);
+    }
+};
+
+template <typename DesiredType>
+struct NumericConvertVisitor : public boost::static_visitor<DesiredType>
+{
+    // integer alternatives: boost::numeric_cast is range-checked (throws if x does not fit)
+    DesiredType operator()(const int8_t& x) const { return boost::numeric_cast<DesiredType>(x); }
+    DesiredType operator()(const uint8_t& x) const { return boost::numeric_cast<DesiredType>(x); }
+    DesiredType operator()(const int16_t& x) const { return boost::numeric_cast<DesiredType>(x); }
+    DesiredType operator()(const uint16_t& x) const { return boost::numeric_cast<DesiredType>(x); }
+    DesiredType operator()(const int32_t& x) const { return boost::numeric_cast<DesiredType>(x); }
+    DesiredType operator()(const uint32_t& x) const { return boost::numeric_cast<DesiredType>(x); }
+
+    // any other stored type always throws; the message names both types via typeid
+    template <typename T>
+    DesiredType operator()(const T& t) const
+    {
+        const std::string from = typeid(t).name();
+        const std::string to = typeid(DesiredType).name();
+        const std::string msg = "Tag: cannot convert type " + from + " to " + to;
+        throw std::runtime_error(msg);
+        return 0;  // never reached
+    }
+};
+
+using ToInt8ConvertVisitor = NumericConvertVisitor<int8_t>;
+using ToUInt8ConvertVisitor = NumericConvertVisitor<uint8_t>;
+using ToInt16ConvertVisitor = NumericConvertVisitor<int16_t>;
+using ToUInt16ConvertVisitor = NumericConvertVisitor<uint16_t>;
+using ToInt32ConvertVisitor = NumericConvertVisitor<int32_t>;
+using ToUInt32ConvertVisitor = NumericConvertVisitor<uint32_t>;
+
+struct IsEqualVisitor : public boost::static_visitor<bool>
+{
+    template <typename T, typename U>
+    bool operator()(const T&, const U&) const
+    {
+        // maybe allow conversions down the road?
+        // but for now, just fail if types are different
+        return false;
+    }
+
+    bool operator()(const boost::blank&, const boost::blank&) const { return true; }
+
+    template <typename T>
+    bool operator()(const T& lhs, const T& rhs) const
+    {
+        return lhs == rhs;
+    }
+};
+
+struct TypenameVisitor : public boost::static_visitor<std::string>
+{
+    std::string operator()(const boost::blank&) const { return "none"; }
+    std::string operator()(const int8_t&) const { return "int8_t"; }
+    std::string operator()(const uint8_t&) const { return "uint8_t"; }
+    std::string operator()(const int16_t&) const { return "int16_t"; }
+    std::string operator()(const uint16_t&) const { return "uint16_t"; }
+    std::string operator()(const int32_t&) const { return "int32_t"; }
+    std::string operator()(const uint32_t&) const { return "uint32_t"; }
+    std::string operator()(const float&) const { return "float"; }
+    std::string operator()(const std::string&) const { return "string"; }
+    std::string operator()(const std::vector<int8_t>&) const { return "vector<int8_t>"; }
+    std::string operator()(const std::vector<uint8_t>&) const { return "vector<uint8_t>"; }
+    std::string operator()(const std::vector<int16_t>&) const { return "vector<int16_t>"; }
+    std::string operator()(const std::vector<uint16_t>&) const { return "vector<uint16_t>"; }
+    std::string operator()(const std::vector<int32_t>&) const { return "vector<int32_t>"; }
+    std::string operator()(const std::vector<uint32_t>&) const { return "vector<uint32_t>"; }
+    std::string operator()(const std::vector<float>&) const { return "vector<float>"; }
+};
+
+}  // anonymous
+
+static_assert(std::is_copy_constructible<Tag>::value, "Tag(const Tag&) is not = default");
+static_assert(std::is_copy_assignable<Tag>::value, "Tag& operator=(const Tag&) is not = default");
+
+static_assert(std::is_nothrow_move_constructible<Tag>::value, "Tag(Tag&&) is not = noexcept");
+static_assert(std::is_nothrow_move_assignable<Tag>::value ==
+                  std::is_nothrow_move_assignable<std::string>::value,
+              "");
+
+Tag::Tag(int8_t value) : data_{value} {}
+Tag::Tag(uint8_t value) : data_{value} {}
+Tag::Tag(int16_t value) : data_{value} {}
+Tag::Tag(uint16_t value) : data_{value} {}
+Tag::Tag(int32_t value) : data_{value} {}
+Tag::Tag(uint32_t value) : data_{value} {}
+Tag::Tag(float value) : data_{value} {}
+Tag::Tag(std::string value) : data_{std::move(value)} {}
+Tag::Tag(std::vector<int8_t> value) : data_{std::move(value)} {}
+Tag::Tag(std::vector<uint8_t> value) : data_{std::move(value)} {}
+Tag::Tag(std::vector<int16_t> value) : data_{std::move(value)} {}
+Tag::Tag(std::vector<uint16_t> value) : data_{std::move(value)} {}
+Tag::Tag(std::vector<int32_t> value) : data_{std::move(value)} {}
+Tag::Tag(std::vector<uint32_t> value) : data_{std::move(value)} {}
+Tag::Tag(std::vector<float> value) : data_{std::move(value)} {}
+
+Tag::Tag(int8_t value, const TagModifier mod) : data_{value}, modifier_(mod)
+{
+    if (mod == TagModifier::HEX_STRING)
+        throw std::runtime_error{
+            "Tag: HEX_STRING is not a valid tag modifier for int8_t data. "
+            "It is intended for string-type data only."};
+}
+
+Tag::Tag(std::string value, TagModifier mod) : data_{std::move(value)}, modifier_{mod}
+{
+    if (mod == TagModifier::ASCII_CHAR)
+        throw std::runtime_error{
+            "Tag: ASCII_CHAR is not a valid tag modifier for string-type data. "
+            "To construct an ASCII char tag, use a single-quoted value (e.g. 'X' instead of "
+            "\"X\")"};
+}
+
+Tag& Tag::operator=(boost::blank value)
+{
+    data_ = value;
+    return *this;
+}
+
+Tag& Tag::operator=(int8_t value)
+{
+    data_ = value;
+    return *this;
+}
+
+Tag& Tag::operator=(uint8_t value)
+{
+    data_ = value;
+    return *this;
+}
+
+Tag& Tag::operator=(int16_t value)
+{
+    data_ = value;
+    return *this;
+}
+
+Tag& Tag::operator=(uint16_t value)
+{
+    data_ = value;
+    return *this;
+}
+
+Tag& Tag::operator=(int32_t value)
+{
+    data_ = value;
+    return *this;
+}
+
+Tag& Tag::operator=(uint32_t value)
+{
+    data_ = value;
+    return *this;
+}
+
+Tag& Tag::operator=(float value)
+{
+    data_ = value;
+    return *this;
+}
+
+Tag& Tag::operator=(std::string value)
+{
+    data_ = std::move(value);
+    return *this;
+}
+
+Tag& Tag::operator=(std::vector<int8_t> value)
+{
+    data_ = std::move(value);
+    return *this;
+}
+
+Tag& Tag::operator=(std::vector<uint8_t> value)
+{
+    data_ = std::move(value);
+    return *this;
+}
+
+Tag& Tag::operator=(std::vector<int16_t> value)
+{
+    data_ = std::move(value);
+    return *this;
+}
+
+Tag& Tag::operator=(std::vector<uint16_t> value)
+{
+    data_ = std::move(value);
+    return *this;
+}
+
+Tag& Tag::operator=(std::vector<int32_t> value)
+{
+    data_ = std::move(value);
+    return *this;
+}
+
+Tag& Tag::operator=(std::vector<uint32_t> value)
+{
+    data_ = std::move(value);
+    return *this;
+}
+
+Tag& Tag::operator=(std::vector<float> value)
+{
+    data_ = std::move(value);
+    return *this;
+}
+
+bool Tag::operator==(const Tag& other) const
+{
+    return boost::apply_visitor(IsEqualVisitor(), data_, other.data_) &&
+           (modifier_ == other.modifier_);
+}
+
+bool Tag::operator!=(const Tag& other) const { return !(*this == other); }
+
+bool Tag::HasModifier(const TagModifier m) const
+{
+    // a Tag holds a single modifier (for now at least), so this is a plain equality test
+    return modifier_ == m;
+}
+
+bool Tag::IsNull() const { return Type() == TagDataType::INVALID; }
+
+bool Tag::IsInt8() const { return Type() == TagDataType::INT8; }
+
+bool Tag::IsUInt8() const { return Type() == TagDataType::UINT8; }
+
+bool Tag::IsInt16() const { return Type() == TagDataType::INT16; }
+
+bool Tag::IsUInt16() const { return Type() == TagDataType::UINT16; }
+
+bool Tag::IsInt32() const { return Type() == TagDataType::INT32; }
+
+bool Tag::IsUInt32() const { return Type() == TagDataType::UINT32; }
+
+bool Tag::IsFloat() const { return Type() == TagDataType::FLOAT; }
+
+bool Tag::IsString() const { return Type() == TagDataType::STRING; }
+
+bool Tag::IsHexString() const { return IsString() && modifier_ == TagModifier::HEX_STRING; }
+
+bool Tag::IsInt8Array() const { return Type() == TagDataType::INT8_ARRAY; }
+
+bool Tag::IsUInt8Array() const { return Type() == TagDataType::UINT8_ARRAY; }
+
+bool Tag::IsInt16Array() const { return Type() == TagDataType::INT16_ARRAY; }
+
+bool Tag::IsUInt16Array() const { return Type() == TagDataType::UINT16_ARRAY; }
+
+bool Tag::IsInt32Array() const { return Type() == TagDataType::INT32_ARRAY; }
+
+bool Tag::IsUInt32Array() const { return Type() == TagDataType::UINT32_ARRAY; }
+
+bool Tag::IsFloatArray() const { return Type() == TagDataType::FLOAT_ARRAY; }
+
+bool Tag::IsSignedInt() const { return IsInt8() || IsInt16() || IsInt32(); }
+
+bool Tag::IsUnsignedInt() const { return IsUInt8() || IsUInt16() || IsUInt32(); }
+
+bool Tag::IsIntegral() const { return IsSignedInt() || IsUnsignedInt(); }
+
+bool Tag::IsNumeric() const { return IsIntegral() || IsFloat(); }
+
+bool Tag::IsSignedArray() const { return IsInt8Array() || IsInt16Array() || IsInt32Array(); }
+
+bool Tag::IsUnsignedArray() const { return IsUInt8Array() || IsUInt16Array() || IsUInt32Array(); }
+
+bool Tag::IsIntegralArray() const { return IsSignedArray() || IsUnsignedArray(); }
+
+bool Tag::IsArray() const { return IsIntegralArray() || IsFloatArray(); }
+
+TagModifier Tag::Modifier() const { return modifier_; }
+
+Tag& Tag::Modifier(const TagModifier m)
+{
+    modifier_ = m;
+    return *this;
+}
+
+char Tag::ToAscii() const { return boost::apply_visitor(AsciiConvertVisitor(), data_); }
+
+int8_t Tag::ToInt8() const
+{
+    if (IsInt8()) return boost::get<int8_t>(data_);
+    return boost::apply_visitor(ToInt8ConvertVisitor(), data_);
+}
+
+uint8_t Tag::ToUInt8() const
+{
+    if (IsUInt8()) return boost::get<uint8_t>(data_);
+    return boost::apply_visitor(ToUInt8ConvertVisitor(), data_);
+}
+
+int16_t Tag::ToInt16() const
+{
+    if (IsInt16()) return boost::get<int16_t>(data_);
+    return boost::apply_visitor(ToInt16ConvertVisitor(), data_);
+}
+
+uint16_t Tag::ToUInt16() const
+{
+    if (IsUInt16()) return boost::get<uint16_t>(data_);
+    return boost::apply_visitor(ToUInt16ConvertVisitor(), data_);
+}
+
+int32_t Tag::ToInt32() const
+{
+    if (IsInt32()) return boost::get<int32_t>(data_);
+    return boost::apply_visitor(ToInt32ConvertVisitor(), data_);
+}
+
+uint32_t Tag::ToUInt32() const
+{
+    if (IsUInt32()) return boost::get<uint32_t>(data_);
+    return boost::apply_visitor(ToUInt32ConvertVisitor(), data_);
+}
+
+float Tag::ToFloat() const { return boost::get<float>(data_); }
+
+std::string Tag::ToString() const { return boost::get<std::string>(data_); }
+
+std::vector<int8_t> Tag::ToInt8Array() const { return boost::get<std::vector<int8_t> >(data_); }
+
+std::vector<uint8_t> Tag::ToUInt8Array() const { return boost::get<std::vector<uint8_t> >(data_); }
+
+std::vector<int16_t> Tag::ToInt16Array() const { return boost::get<std::vector<int16_t> >(data_); }
+
+std::vector<uint16_t> Tag::ToUInt16Array() const
+{
+    return boost::get<std::vector<uint16_t> >(data_);
+}
+
+std::vector<int32_t> Tag::ToInt32Array() const { return boost::get<std::vector<int32_t> >(data_); }
+
+std::vector<uint32_t> Tag::ToUInt32Array() const
+{
+    return boost::get<std::vector<uint32_t> >(data_);
+}
+
+std::vector<float> Tag::ToFloatArray() const { return boost::get<std::vector<float> >(data_); }
+
+TagDataType Tag::Type() const { return TagDataType(data_.which()); }
+
+std::string Tag::Typename() const { return boost::apply_visitor(TypenameVisitor(), data_); }
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/TagCollection.cpp b/src/TagCollection.cpp

new file mode 100644 (file)

index 0000000..feab76d
--- /dev/null
+++ b/src/TagCollection.cpp
@@ -0,0 +1,17 @@
+// File Description
+/// \file TagCollection.cpp
+/// \brief Implements the TagCollection class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/TagCollection.h"
+
+namespace PacBio {
+namespace BAM {
+
+bool TagCollection::Contains(const std::string& name) const { return count(name) != 0; }
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/TextFileReader.cpp b/src/TextFileReader.cpp

new file mode 100644 (file)

index 0000000..fa76e66
--- /dev/null
+++ b/src/TextFileReader.cpp
@@ -0,0 +1,115 @@
+// File Description
+/// \file TextFileReader.cpp
+/// \brief Implements the TextFileReader class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/TextFileReader.h"
+
+#include <cassert>
+
+#include <stdexcept>
+#include <type_traits>
+
+#include <htslib/bgzf.h>
+#include <htslib/kstring.h>
+
+#include "MemoryUtils.h"
+
+namespace PacBio {
+namespace BAM {
+
+static_assert(!std::is_copy_constructible<TextFileReader>::value,
+              "TextFileReader(const TextFileReader&) is not = delete");
+static_assert(!std::is_copy_assignable<TextFileReader>::value,
+              "TextFileReader& operator=(const TextFileReader&) is not = delete");
+
+class TextFileReader::TextFileReaderPrivate
+{
+public:
+    TextFileReaderPrivate(std::string filename) : filename_{std::move(filename)}, k_{0, 0, nullptr}
+    {
+        // mode "ru" : read & supply plain output
+        bgzf_.reset(bgzf_open(filename_.c_str(), "ru"));
+        if (bgzf_.get() == nullptr)
+            throw std::runtime_error("TextFileReader - could not open zipped file: " + filename_ +
+                                     " for reading");
+
+        // pre-fetch first line, so line_ always holds the next line to hand out
+        GetNext();
+    }
+
+    ~TextFileReaderPrivate() { free(k_.s); }  // release the kstring buffer used by GetNext()
+
+    void GetNext()
+    {
+        line_.clear();
+
+        // loop until a non-empty line is stored in line_, or EOF/error ends the search
+        while (line_.empty()) {
+            const int result = bgzf_getline(bgzf_.get(), '\n', &k_);
+
+            // found data: copy the k_.l bytes of the line out of the kstring buffer
+            if (result > 0) {
+                line_ = std::string{k_.s, k_.l};
+                return;
+            }
+
+            // empty line, try again
+            else if (result == 0)
+                continue;
+
+            // EOF (no error, but line_ stays empty, which stops the next TextFileReader iteration)
+            else if (result == -1)
+                return;
+
+            // any other value is reported as an htslib error
+            else {
+                throw std::runtime_error("TextFileReader - could not read from text file: " +
+                                         filename_ + "\nreason: htslib error code " +
+                                         std::to_string(result));
+            }
+        }
+    }
+
+    std::string filename_;  // path given at construction; used in error messages
+    std::string line_;      // pre-fetched next line; empty once EOF has been reached
+    std::unique_ptr<BGZF, HtslibBgzfDeleter> bgzf_;  // can handle plain text or gzipped
+    kstring_t k_;  // line buffer passed to bgzf_getline; freed in the destructor
+};
+
+TextFileReader::TextFileReader(std::string filename)
+    : PacBio::BAM::internal::QueryBase<std::string>{}
+    , d_{std::make_unique<TextFileReaderPrivate>(std::move(filename))}
+{
+}
+
+TextFileReader::TextFileReader(TextFileReader&&) noexcept = default;
+
+TextFileReader& TextFileReader::operator=(TextFileReader&&) noexcept = default;
+
+TextFileReader::~TextFileReader() = default;
+
+bool TextFileReader::GetNext(std::string& line)
+{
+    if (d_->line_.empty()) return false;
+
+    line = d_->line_;
+    d_->GetNext();
+    return true;
+}
+
+std::vector<std::string> TextFileReader::ReadAll(const std::string& fn)
+{
+    std::vector<std::string> result;
+    result.reserve(256);
+    TextFileReader reader{fn};
+    for (const auto& seq : reader)
+        result.emplace_back(seq);
+    return result;
+}
+
+}  // namespace BAM
+}  // namespace PacBio
+\ No newline at end of file
diff --git a/src/TextFileWriter.cpp b/src/TextFileWriter.cpp

new file mode 100644 (file)

index 0000000..7c97432
--- /dev/null
+++ b/src/TextFileWriter.cpp
@@ -0,0 +1,88 @@
+
+// File Description
+/// \file TextFileWriter.cpp
+/// \brief Implements the TextFileWriter class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/TextFileWriter.h"
+
+#include <cassert>
+
+#include <fstream>
+#include <iostream>
+#include <type_traits>
+
+#include <boost/algorithm/string.hpp>
+
+#include "FileProducer.h"
+#include "MemoryUtils.h"
+
+namespace PacBio {
+namespace BAM {
+
+static_assert(!std::is_copy_constructible<TextFileWriter>::value,
+              "TextFileWriter(const TextFileWriter&) is not = delete");
+static_assert(!std::is_copy_assignable<TextFileWriter>::value,
+              "TextFileWriter& operator=(const TextFileWriter&) is not = delete");
+
+class TextFileWriter::TextFileWriterPrivate : public FileProducer
+{
+public:
+    TextFileWriterPrivate(const std::string& filename) : FileProducer{filename}
+    {
+        isZipped_ = boost::algorithm::iends_with(filename, ".gz");  // case-insensitive suffix test
+
+        if (isZipped_) {
+            // open the temp file for gzipped text (htslib mode "wg")
+            bgzf_.reset(bgzf_open(TempFilename().c_str(), "wg"));
+            if (bgzf_.get() == nullptr) {
+                throw std::runtime_error("TextFileWriter - could not open file: " + filename +
+                                         " for writing");
+            }
+        } else {
+            // open the temp file for plain text
+            out_.open(TempFilename());
+            if (!out_) {
+                throw std::runtime_error("TextFileWriter - could not open file: " + filename +
+                                         " for writing");
+            }
+        }
+    }
+
+    void Write(const std::string& line)  // writes line plus a trailing '\n'
+    {
+        if (isZipped_) {
+            const size_t length = line.size();
+            ssize_t written = bgzf_write(bgzf_.get(), line.c_str(), length);
+            written += bgzf_write(bgzf_.get(), "\n", 1);
+            if (written != static_cast<ssize_t>(length + 1))  // both writes must be complete
+                throw std::runtime_error("TextFileWriter - error writing to file: " +
+                                         TargetFilename());
+        } else {
+            out_ << line << '\n';  // NOTE(review): stream state is not checked on this branch
+        }
+    }
+
+    bool isZipped_ = false;                          // true when the filename ends in ".gz"
+    std::unique_ptr<BGZF, HtslibBgzfDeleter> bgzf_;  // used only when isZipped_
+    std::ofstream out_;                              // used only when !isZipped_
+};
+
+TextFileWriter::TextFileWriter(const std::string& filename)
+    : d_{std::make_unique<TextFileWriterPrivate>(filename)}
+{
+}
+
+TextFileWriter::TextFileWriter(TextFileWriter&&) noexcept = default;
+
+TextFileWriter& TextFileWriter::operator=(TextFileWriter&&) noexcept = default;
+
+TextFileWriter::~TextFileWriter() = default;
+
+void TextFileWriter::Write(const std::string& line) { d_->Write(line); }
+
+}  // namespace BAM
+}  // namespace PacBio
+\ No newline at end of file
diff --git a/src/TimeUtils.h b/src/TimeUtils.h

new file mode 100644 (file)

index 0000000..270afd5
--- /dev/null
+++ b/src/TimeUtils.h
@@ -0,0 +1,73 @@
+// Author: Derek Barnett
+
+#ifndef TIMEUTILS_H
+#define TIMEUTILS_H
+
+#include "pbbam/Config.h"
+
+#include <cassert>
+#include <chrono>
+#include <ctime>
+#include <stdexcept>
+#include <string>
+
+namespace PacBio {
+namespace BAM {
+
+class TimeUtils
+{
+public:
+    /// \returns \p tp as a UTC ISO 8601 string, "YYYY-MM-DDTHH:MM:SS[.mmm]Z". The fraction is
+    ///          written only for a non-zero millisecond remainder, zero-padded to 3 digits.
+    static std::string ToIso8601(const std::chrono::system_clock::time_point& tp)
+    {
+        // split into whole seconds + millisecond remainder
+        const time_t ttime_t = std::chrono::system_clock::to_time_t(tp);
+        const auto tp_sec = std::chrono::system_clock::from_time_t(ttime_t);
+        const auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(tp - tp_sec);
+        const std::tm* ttm = gmtime(&ttime_t);  // static obj: no free needed, may not be thread-safe
+
+        // format output
+        constexpr static const char date_time_format[] = "%FT%T";
+        char date_time_str[50];
+        strftime(date_time_str, sizeof(date_time_str), date_time_format, ttm);
+        std::string result(date_time_str);
+        if (ms.count() > 0) {
+            // pad so that e.g. 5 ms is written as ".005" (a bare ".5" would read as 500 ms)
+            const std::string frac = std::to_string(ms.count());
+            result.append(".").append(frac.size() < 3 ? 3 - frac.size() : 0, '0').append(frac);
+        }
+        result.append("Z");
+        return result;
+    }
+
+    /// \returns \p tp (UTC) formatted as "yymmdd_HHMMSS", followed directly by the millisecond
+    ///          remainder (unpadded, as before) when that remainder is non-zero.
+    static std::string ToDataSetFormat(const std::chrono::system_clock::time_point& tp)
+    {
+        // split into whole seconds + millisecond remainder
+        const time_t ttime_t = std::chrono::system_clock::to_time_t(tp);
+        const auto tp_sec = std::chrono::system_clock::from_time_t(ttime_t);
+        const auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(tp - tp_sec);
+        const std::tm* ttm = gmtime(&ttime_t);  // static obj: no free needed, may not be thread-safe
+
+        // format output
+        constexpr static const char date_time_format[] = "%y%m%d_%H%M%S";
+        char date_time_str[50];
+        strftime(date_time_str, sizeof(date_time_str), date_time_format, ttm);
+        std::string result(date_time_str);
+        if (ms.count() > 0) result.append(std::to_string(ms.count()));
+        return result;
+    }
+
+    /// \returns the current std::chrono::system_clock time.
+    static std::chrono::system_clock::time_point CurrentTime()
+    {
+        return std::chrono::system_clock::now();
+    }
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // TIMEUTILS_H
diff --git a/src/ValidationErrors.cpp b/src/ValidationErrors.cpp

new file mode 100644 (file)

index 0000000..5e88885
--- /dev/null
+++ b/src/ValidationErrors.cpp
@@ -0,0 +1,74 @@
+// File Description
+/// \file ValidationErrors.cpp
+/// \brief Implements the ValidationErrors class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/exception/ValidationException.h"
+
+#include <cstddef>
+#include <sstream>
+
+#include "ValidationErrors.h"
+#include "pbbam/StringUtilities.h"
+
+namespace PacBio {
+namespace BAM {
+
+const size_t ValidationErrors::MAX;
+
+ValidationErrors::ValidationErrors(const size_t maxNumErrors)
+    : maxNumErrors_{maxNumErrors}, currentNumErrors_{0}
+{
+    if (maxNumErrors_ == 0) maxNumErrors_ = ValidationErrors::MAX;
+}
+
+void ValidationErrors::AddFileError(const std::string& fn, std::string details)
+{
+    fileErrors_[fn].push_back(std::move(details));
+    OnErrorAdded();
+}
+
+void ValidationErrors::AddReadGroupError(const std::string& rg, std::string details)
+{
+    readGroupErrors_[rg].push_back(std::move(details));
+    OnErrorAdded();
+}
+
+void ValidationErrors::AddRecordError(const std::string& name, std::string details)
+{
+    recordErrors_[name].push_back(std::move(details));
+    OnErrorAdded();
+}
+
+void ValidationErrors::AddTagLengthError(const std::string& name, const std::string& tagLabel,
+                                         const std::string& tagName, const size_t observed,
+                                         const size_t expected)
+{
+    // format
+    std::ostringstream s;
+    s << tagLabel << " tag (" << tagName << ") length: " << observed
+      << ", does not match expected length: " << expected;
+    AddRecordError(name, s.str());
+}
+
+bool ValidationErrors::IsEmpty() const { return currentNumErrors_ == 0; }
+
+size_t ValidationErrors::MaxNumErrors() const { return maxNumErrors_; }
+
+void ValidationErrors::OnErrorAdded()
+{
+    ++currentNumErrors_;
+    if (currentNumErrors_ == maxNumErrors_) ThrowErrors();
+}
+
+void ValidationErrors::ThrowErrors()
+{
+    throw ValidationException{std::move(fileErrors_), std::move(readGroupErrors_),
+                              std::move(recordErrors_)};
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/ValidationErrors.h b/src/ValidationErrors.h

new file mode 100644 (file)

index 0000000..2700196
--- /dev/null
+++ b/src/ValidationErrors.h
@@ -0,0 +1,65 @@
+// File Description
+/// \file ValidationErrors.h
+/// \brief Defines the ValidationErrors class.
+//
+// Author: Derek Barnett
+
+#ifndef VALIDATIONERRORS_H
+#define VALIDATIONERRORS_H
+
+#include "pbbam/Config.h"
+
+#include <cstddef>
+#include <limits>
+#include <map>
+#include <string>
+#include <vector>
+
+namespace PacBio {
+namespace BAM {
+
+/// The ValidationErrors class catches error messages accumulated during
+/// validation (see Validator).
+///
+/// Convenience methods are provided for different BAM components, to help
+/// format the displayed output.
+///
+/// A maximum number of errors can be provided at construction, and this class
+/// will automatically throw a ValidationException whenever that count is reached.
+/// Otherwise, the Validator will check IsEmpty() and call ThrowErrors() if it is false.
+///
+class ValidationErrors
+{
+public:
+    typedef std::vector<std::string> ErrorList;         // messages recorded under one key
+    typedef std::map<std::string, ErrorList> ErrorMap;  // key (file, read group, record) -> list
+
+    static const size_t MAX = std::numeric_limits<size_t>::max();  // default limit
+
+    explicit ValidationErrors(const size_t maxNumErrors = ValidationErrors::MAX);  // 0 means MAX
+
+    void AddFileError(const std::string& fn, std::string details);
+    void AddReadGroupError(const std::string& rg, std::string details);
+    void AddRecordError(const std::string& name, std::string details);
+    void AddTagLengthError(const std::string& name, const std::string& tagLabel,
+                           const std::string& tagName, const size_t observed,
+                           const size_t expected);  // formats a message, then adds a record error
+
+    bool IsEmpty() const;         // true while no error has been added
+    size_t MaxNumErrors() const;  // limit in effect (after the 0 -> MAX substitution)
+    void ThrowErrors();           // throws ValidationException, moving the stored maps into it
+
+private:
+    size_t maxNumErrors_;
+    size_t currentNumErrors_;  // total across all three maps
+    ErrorMap fileErrors_;
+    ErrorMap readGroupErrors_;
+    ErrorMap recordErrors_;
+
+    void OnErrorAdded();  // bumps the count; calls ThrowErrors() when it equals maxNumErrors_
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // VALIDATIONERRORS_H
diff --git a/src/ValidationException.cpp b/src/ValidationException.cpp

new file mode 100644 (file)

index 0000000..99bcfd1
--- /dev/null
+++ b/src/ValidationException.cpp
@@ -0,0 +1,74 @@
+// File Description
+/// \file ValidationException.cpp
+/// \brief Implements the ValidationException class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/exception/ValidationException.h"
+
+namespace PacBio {
+namespace BAM {
+
+ValidationException::ValidationException(ErrorMap fileErrors, ErrorMap readGroupErrors,
+                                         ErrorMap recordErrors)
+    : std::runtime_error{""}
+    , fileErrors_{std::move(fileErrors)}
+    , readGroupErrors_{std::move(readGroupErrors)}
+    , recordErrors_{std::move(recordErrors)}
+{
+    FormatMessage();
+}
+
+const ValidationException::ErrorMap& ValidationException::FileErrors() const { return fileErrors_; }
+
+const ValidationException::ErrorMap& ValidationException::ReadGroupErrors() const
+{
+    return readGroupErrors_;
+}
+
+const ValidationException::ErrorMap& ValidationException::RecordErrors() const
+{
+    return recordErrors_;
+}
+
+const char* ValidationException::what() const noexcept { return msg_.c_str(); }
+
+/// Builds msg_, the text returned by what(), from the three error maps.
+///
+/// Output starts with "Validation failed:", then lists file, read group, and record errors
+/// in that order: one "In <kind> (<key>)" header per map entry followed by that entry's
+/// messages, one per indented line. An empty map contributes nothing, so the loops need no
+/// emptiness guard.
+void ValidationException::FormatMessage()
+{
+    std::ostringstream s;
+    s << "Validation failed:\n";
+
+    // file errors
+    for (const auto& fileError : fileErrors_) {
+        s << "  In file (" << fileError.first << ") : \n";
+        for (const auto& e : fileError.second)
+            s << "    " << e << '\n';
+    }
+
+    // read group errors
+    for (const auto& rgError : readGroupErrors_) {
+        s << "  In read group (" << rgError.first << ") :\n";
+        for (const auto& e : rgError.second)
+            s << "    " << e << '\n';
+    }
+
+    // record errors (must walk recordErrors_, not readGroupErrors_)
+    for (const auto& recordError : recordErrors_) {
+        s << "  In record (" << recordError.first << ") : \n";
+        for (const auto& e : recordError.second)
+            s << "    " << e << '\n';
+    }
+
+    msg_ = s.str();
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/Validator.cpp b/src/Validator.cpp

new file mode 100644 (file)

index 0000000..be638dc
--- /dev/null
+++ b/src/Validator.cpp
@@ -0,0 +1,406 @@
+// File Description
+/// \file Validator.cpp
+/// \brief Implements the Validator class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/Validator.h"
+
+#include <cstddef>
+#include <iostream>
+#include <map>
+#include <set>
+#include <sstream>
+#include <stdexcept>
+#include <vector>
+
+#include <boost/algorithm/string.hpp>
+#include <boost/core/ignore_unused.hpp>
+
+#include "ValidationErrors.h"
+#include "Version.h"
+#include "pbbam/BamFile.h"
+#include "pbbam/BamHeader.h"
+#include "pbbam/BamRecord.h"
+#include "pbbam/EntireFileQuery.h"
+#include "pbbam/ReadGroupInfo.h"
+
+namespace PacBio {
+namespace BAM {
+namespace {
+
+/// \brief Ordering functor that forwards to boost::ilexicographical_compare.
+///
+/// Used as the comparator of AcceptedSortOrders.
+struct ilexcompare_wrapper
+{
+    bool operator()(const std::string& left, const std::string& right) const
+    {
+        const bool leftSortsFirst = boost::ilexicographical_compare(left, right);
+        return leftSortsFirst;
+    }
+};
+
+// clang-format off
+// Sort-order values that ValidateHeader() accepts for header.SortOrder().
+// The set is ordered with ilexcompare_wrapper (boost::ilexicographical_compare),
+// not the default std::less.
+static const std::set<std::string, ilexcompare_wrapper> AcceptedSortOrders
+{
+    "unknown",
+    "unsorted",
+    "queryname",
+    "coordinate"
+};
+
+// Read-type strings that ValidateReadGroup() accepts for rg.ReadType().
+// Uses the default std::set comparator, so lookups are exact string matches.
+static const std::set<std::string> AcceptedReadTypes
+{
+    "POLYMERASE",
+    "HQREGION",
+    "SUBREAD",
+    "CCS",
+    "SCRAP",
+    "UNKNOWN"
+};
+// clang-format on
+
+static void ValidateReadGroup(const ReadGroupInfo& rg, std::unique_ptr<ValidationErrors>& errors)
+{
+    const std::string id = rg.Id();
+
+    // has required fields
+    if (id.empty()) errors->AddReadGroupError(id, "missing ID");
+    if (rg.MovieName().empty()) errors->AddReadGroupError(id, "missing movie name (PU tag)");
+    // 3.0.2 adds required RG:PM - do not check for now, we'll add version-aware
+    // validation down the road
+
+    // description tag has required components
+    if (rg.ReadType().empty()) errors->AddReadGroupError(id, "missing READTYPE in description");
+    if (rg.BindingKit().empty()) errors->AddReadGroupError(id, "missing BINDINGKIT in description");
+    if (rg.SequencingKit().empty())
+        errors->AddReadGroupError(id, "missing SEQUENCINGKIT in description");
+    if (rg.BasecallerVersion().empty())
+        errors->AddReadGroupError(id, "missing BASECALLERVERSION in description");
+    if (rg.FrameRateHz().empty())
+        errors->AddReadGroupError(id, "missing FRAMERATEHZ in description");
+
+    // stored ID matches expected ID (as calculated from movie & type)
+    if (!id.empty()) {
+        const auto expectedId = MakeReadGroupId(rg.MovieName(), rg.ReadType());
+        if (expectedId != id) {
+            const std::string msg{"stored ID: " + id + " does not match computed ID: " +
+                                  expectedId};
+            errors->AddReadGroupError(id, std::move(msg));
+        }
+    }
+
+    // valid read type
+    if (!rg.ReadType().empty()) {
+        if (AcceptedReadTypes.find(rg.ReadType()) == AcceptedReadTypes.cend())
+            errors->AddReadGroupError(id, "read type: " + rg.ReadType() + " is unknown");
+    }
+
+    // valid read chemistry (binding, sequencing, chemistry)
+    if (!rg.BindingKit().empty() && !rg.SequencingKit().empty() &&
+        !rg.BasecallerVersion().empty()) {
+        try {
+            auto chem = rg.SequencingChemistry();
+            boost::ignore_unused(chem);
+        } catch (std::exception& e) {
+            errors->AddReadGroupError(id, e.what());
+        }
+    }
+
+    // frame rate convertable to floating point
+    if (!rg.FrameRateHz().empty()) {
+        try {
+            const float frameRate = std::stof(rg.FrameRateHz());
+            boost::ignore_unused(frameRate);
+        } catch (std::exception& e) {
+            errors->AddReadGroupError(id, e.what());
+        }
+    }
+}
+
+static void ValidateHeader(const BamHeader& header, const std::string& filename,
+                           std::unique_ptr<ValidationErrors>& errors)
+{
+    const std::string& fn = filename;
+
+    // SAM/BAM version
+    try {
+        Version v(header.Version());
+        boost::ignore_unused(v);
+    } catch (std::exception& e) {
+        errors->AddFileError(fn, std::string{"SAM version (@HD:VN) failed: "} + e.what());
+    }
+
+    // sort order
+    const std::string sortOrder = header.SortOrder();
+    if (AcceptedSortOrders.find(sortOrder) == AcceptedSortOrders.end())
+        errors->AddFileError(fn, std::string{"unknown sort order: "} + sortOrder);
+
+    // PacBio version
+    try {
+        const Version v{header.PacBioBamVersion()};
+        const Version minimum{3, 0, 1};
+        if (v < minimum) {
+
+            std::string msg{"PacBioBAM version (@HD:pb) "};
+            msg += v.ToString();
+            msg += " is older than the minimum supported version (" + minimum.ToString() + ")";
+            errors->AddFileError(fn, std::move(msg));
+        }
+    } catch (std::exception& e) {
+        errors->AddFileError(
+            fn, std::string{"PacBioBAM version (@HD:pb) failed to parse: "} + e.what());
+    }
+
+    // sequences?
+
+    // read groups
+    for (const ReadGroupInfo& rg : header.ReadGroups())
+        ValidateReadGroup(rg, errors);
+}
+
+static void ValidateMetadata(const BamFile& file, std::unique_ptr<ValidationErrors>& errors)
+{
+    // filename
+    const std::string fn{file.Filename()};
+    if (fn == "-") {
+        errors->AddFileError(fn,
+                             "validation not is available for streamed BAM. Please "
+                             "write to a file and run validation on it.");
+        errors->ThrowErrors();  // quit early
+    }
+    if (boost::algorithm::ends_with(fn, ".bam") || boost::algorithm::ends_with(fn, ".bam.tmp")) {
+        errors->AddFileError(fn, "non-standard file extension");
+    }
+
+    // EOF
+    if (!file.HasEOF()) errors->AddFileError(fn, "missing end-of-file marker");
+
+    // has PBI
+    if (!file.PacBioIndexExists()) errors->AddFileError(fn, "missing PBI file");
+
+    // header
+    ValidateHeader(file.Header(), file.Filename(), errors);
+}
+
+/// \brief Checks the fields specific to a mapped record.
+///
+/// Reports a record error when the reference start or the reference ID is
+/// negative.
+void ValidateMappedRecord(const BamRecord& b, std::unique_ptr<ValidationErrors>& errors)
+{
+    const std::string recordName{b.FullName()};
+
+    if (b.ReferenceStart() < 0) {
+        errors->AddRecordError(recordName, "mapped record position is invalid");
+    }
+    if (b.ReferenceId() < 0) {
+        errors->AddRecordError(recordName, "mapped record reference ID is invalid");
+    }
+
+    // no further mapped-record checks are implemented yet
+}
+
+/// \brief Checks that queryStart precedes queryEnd.
+///
+/// Records for which IsCcsOrTranscript(b.Type()) is true are skipped.
+void ValidateRecordCore(const BamRecord& b, std::unique_ptr<ValidationErrors>& errors)
+{
+    if (IsCcsOrTranscript(b.Type())) return;
+
+    const auto queryBegin = b.QueryStart();
+    const auto queryFinish = b.QueryEnd();
+    if (queryBegin < queryFinish) return;
+
+    errors->AddRecordError(b.FullName(), "queryStart (qs) should be < queryEnd (qe)");
+}
+
+/// \brief Checks that the record's read group can be obtained.
+///
+/// b.ReadGroup() is called only to see whether it throws; any std::exception
+/// is turned into a record error carrying the exception's message.
+void ValidateRecordReadGroup(const BamRecord& b, std::unique_ptr<ValidationErrors>& errors)
+{
+    try {
+        auto readGroup = b.ReadGroup();
+        boost::ignore_unused(readGroup);
+    } catch (const std::exception& ex) {
+        errors->AddRecordError(b.FullName(), ex.what());
+    }
+}
+
+void ValidateRecordRequiredTags(const BamRecord& b, std::unique_ptr<ValidationErrors>& errors)
+{
+    const auto name = b.FullName();
+    const auto isCcsOrTranscript = IsCcsOrTranscript(b.Type());
+    if (!isCcsOrTranscript) {
+        // qe/qs
+        const bool hasQueryStart = b.HasQueryStart();
+        const bool hasQueryEnd = b.HasQueryEnd();
+        if (hasQueryStart && hasQueryEnd) {
+            const auto qStart = b.QueryStart();
+            const auto qEnd = b.QueryEnd();
+            if (qStart >= qEnd)
+                errors->AddRecordError(name, "queryStart (qs) should be < queryEnd (qe)");
+        } else {
+            if (!hasQueryStart) errors->AddRecordError(name, "missing tag: qs (queryStart)");
+            if (!hasQueryEnd) errors->AddRecordError(name, "missing tag: qe (queryEnd)");
+        }
+    }
+
+    // zm
+    if (!b.HasHoleNumber()) errors->AddRecordError(name, "missing tag: zm (ZMW hole number)");
+
+    // np
+    if (!b.HasNumPasses())
+        errors->AddRecordError(name, "missing tag: np (num passes)");
+    else {
+        const auto numPasses = b.NumPasses();
+        if (!isCcsOrTranscript && numPasses != 1)
+            errors->AddRecordError(name, "np (numPasses) tag for non-CCS records should be 1");
+    }
+
+    // rq
+    if (!b.HasReadAccuracy()) errors->AddRecordError(name, "missing tag: rq (read accuracy)");
+
+    // sn
+    if (!b.HasSignalToNoise())
+        errors->AddRecordError(name, "missing tag: sn (signal-to-noise ratio)");
+}
+
+void ValidateRecordTagLengths(const BamRecord& b, std::unique_ptr<ValidationErrors>& errors)
+{
+    const auto name = b.FullName();
+    const size_t expectedLength =
+        (IsCcsOrTranscript(b.Type()) ? b.Sequence().size() : (b.QueryEnd() - b.QueryStart()));
+
+    // check "per-base"-type data lengths are compatible
+    if (b.Sequence().size() != expectedLength)
+        errors->AddRecordError(name, "sequence length does not match expected length");
+
+    if (b.HasDeletionQV()) {
+        if (b.DeletionQV().size() != expectedLength)
+            errors->AddTagLengthError(name, "DeletionQV", "dq", b.DeletionQV().size(),
+                                      expectedLength);
+    }
+    if (b.HasDeletionTag()) {
+        if (b.DeletionTag().size() != expectedLength)
+            errors->AddTagLengthError(name, "DeletionTag", "dt", b.DeletionTag().size(),
+                                      expectedLength);
+    }
+    if (b.HasInsertionQV()) {
+        if (b.InsertionQV().size() != expectedLength)
+            errors->AddTagLengthError(name, "InsertionQV", "iq", b.InsertionQV().size(),
+                                      expectedLength);
+    }
+    if (b.HasMergeQV()) {
+        if (b.MergeQV().size() != expectedLength)
+            errors->AddTagLengthError(name, "MergeQV", "mq", b.MergeQV().size(), expectedLength);
+    }
+    if (b.HasSubstitutionQV()) {
+        if (b.SubstitutionQV().size() != expectedLength)
+            errors->AddTagLengthError(name, "SubstitutionQV", "sq", b.SubstitutionQV().size(),
+                                      expectedLength);
+    }
+    if (b.HasSubstitutionTag()) {
+        if (b.SubstitutionTag().size() != expectedLength)
+            errors->AddTagLengthError(name, "SubstitutionTag", "st", b.SubstitutionTag().size(),
+                                      expectedLength);
+    }
+    if (b.HasIPD()) {
+        if (b.IPD().size() != expectedLength)
+            errors->AddTagLengthError(name, "IPD", "ip", b.IPD().size(), expectedLength);
+    }
+
+    // NOTE: disabling "internal" tag checks for now, only production tags
+}
+
+/// \brief Checks the fields specific to an unmapped record.
+///
+/// Reports a record error when the reference start or the reference ID is
+/// anything other than -1.
+void ValidateUnmappedRecord(const BamRecord& b, std::unique_ptr<ValidationErrors>& errors)
+{
+    const std::string recordName{b.FullName()};
+
+    if (b.ReferenceStart() != -1) {
+        errors->AddRecordError(recordName, "unmapped record has a position");
+    }
+    if (b.ReferenceId() != -1) {
+        errors->AddRecordError(recordName, "unmapped record has a reference ID");
+    }
+}
+
+/// \brief Runs every per-record check on \p b.
+///
+/// The common checks run first (core, read group, required tags, tag lengths),
+/// followed by the mapped or unmapped checks depending on b.IsMapped().
+static void ValidateRecord(const BamRecord& b, std::unique_ptr<ValidationErrors>& errors)
+{
+    ValidateRecordCore(b, errors);
+    ValidateRecordReadGroup(b, errors);
+    ValidateRecordRequiredTags(b, errors);
+    ValidateRecordTagLengths(b, errors);
+
+    if (!b.IsMapped()) {
+        ValidateUnmappedRecord(b, errors);
+        return;
+    }
+    ValidateMappedRecord(b, errors);
+}
+
+}  // namespace
+
+/// \brief Non-throwing validity check for a BAM file.
+///
+/// \param file        file to check
+/// \param entireFile  true: run ValidateEntireFile(); false: run ValidateFileMetadata()
+/// \returns false if validation (run with maxErrors = 1) threw a std::exception,
+///          true otherwise
+bool Validator::IsValid(const BamFile& file, const bool entireFile)
+{
+    try {
+        if (!entireFile) {
+            ValidateFileMetadata(file, 1);
+        } else {
+            ValidateEntireFile(file, 1);
+        }
+    } catch (const std::exception&) {
+        return false;
+    }
+    return true;
+}
+
+/// \brief Non-throwing validity check for a BAM header.
+/// \returns false if Validate(header, 1) threw a std::exception, true otherwise
+bool Validator::IsValid(const BamHeader& header)
+{
+    try {
+        Validate(header, 1);
+    } catch (const std::exception&) {
+        return false;
+    }
+    return true;
+}
+
+/// \brief Non-throwing validity check for a single BAM record.
+/// \returns false if Validate(record, 1) threw a std::exception, true otherwise
+bool Validator::IsValid(const BamRecord& record)
+{
+    try {
+        Validate(record, 1);
+    } catch (const std::exception&) {
+        return false;
+    }
+    return true;
+}
+
+/// \brief Non-throwing validity check for a read group.
+/// \returns false if Validate(rg, 1) threw a std::exception, true otherwise
+bool Validator::IsValid(const ReadGroupInfo& rg)
+{
+    try {
+        Validate(rg, 1);
+    } catch (const std::exception&) {
+        return false;
+    }
+    return true;
+}
+
+/// \brief Validates a header that is not tied to a file.
+///
+/// File-level errors are recorded under the placeholder name "unknown". If any
+/// error was collected, errors->ThrowErrors() is called.
+///
+/// \param header     header to validate
+/// \param maxErrors  limit handed to the ValidationErrors collector
+void Validator::Validate(const BamHeader& header, const size_t maxErrors)
+{
+    auto collected = std::make_unique<ValidationErrors>(maxErrors);
+    ValidateHeader(header, "unknown", collected);
+
+    if (collected->IsEmpty()) return;
+    collected->ThrowErrors();
+}
+
+/// \brief Validates a single read group.
+///
+/// If any error was collected, errors->ThrowErrors() is called.
+///
+/// \param rg         read group to validate
+/// \param maxErrors  limit handed to the ValidationErrors collector
+void Validator::Validate(const ReadGroupInfo& rg, const size_t maxErrors)
+{
+    auto collected = std::make_unique<ValidationErrors>(maxErrors);
+    ValidateReadGroup(rg, collected);
+
+    if (collected->IsEmpty()) return;
+    collected->ThrowErrors();
+}
+
+/// \brief Validates a single record.
+///
+/// If any error was collected, errors->ThrowErrors() is called.
+///
+/// \param b          record to validate
+/// \param maxErrors  limit handed to the ValidationErrors collector
+void Validator::Validate(const BamRecord& b, const size_t maxErrors)
+{
+    auto collected = std::make_unique<ValidationErrors>(maxErrors);
+    ValidateRecord(b, collected);
+
+    if (collected->IsEmpty()) return;
+    collected->ThrowErrors();
+}
+
+/// \brief Validates the file metadata and then every record in the file.
+///
+/// Records are read through an EntireFileQuery. If any error was collected,
+/// errors->ThrowErrors() is called.
+///
+/// \param file       file to validate
+/// \param maxErrors  limit handed to the ValidationErrors collector
+void Validator::ValidateEntireFile(const BamFile& file, const size_t maxErrors)
+{
+    auto collected = std::make_unique<ValidationErrors>(maxErrors);
+
+    // file-level checks first
+    ValidateMetadata(file, collected);
+
+    // then each record, in file order
+    EntireFileQuery allRecords(file);
+    for (const BamRecord& rec : allRecords) {
+        ValidateRecord(rec, collected);
+    }
+
+    if (collected->IsEmpty()) return;
+    collected->ThrowErrors();
+}
+
+/// \brief Validates only the file-level metadata (no records are read here).
+///
+/// If any error was collected, errors->ThrowErrors() is called.
+///
+/// \param file       file to validate
+/// \param maxErrors  limit handed to the ValidationErrors collector
+void Validator::ValidateFileMetadata(const BamFile& file, const size_t maxErrors)
+{
+    auto collected = std::make_unique<ValidationErrors>(maxErrors);
+    ValidateMetadata(file, collected);
+
+    if (collected->IsEmpty()) return;
+    collected->ThrowErrors();
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/Version.cpp b/src/Version.cpp

new file mode 100644 (file)

index 0000000..c1af408
--- /dev/null
+++ b/src/Version.cpp
@@ -0,0 +1,51 @@
+// File Description
+/// \file Version.cpp
+/// \brief Implements the Version class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "Version.h"
+
+#include <sstream>
+#include <stdexcept>
+
+#include "pbbam/StringUtilities.h"
+
+namespace PacBio {
+namespace BAM {
+
+// Definitions of the static version constants declared in Version.h.
+// NOTE(review): Validator.cpp hard-codes its own minimum of 3.0.1 rather than
+// using Version::Minimum; keep the two in sync.
+const Version Version::Current = Version(3, 0, 7);
+const Version Version::Minimum = Version(3, 0, 1);
+
+// string must be "<major>.<minor>.<version>"
+Version::Version(const std::string& v) : major_{0}, minor_{0}, revision_{0}
+{
+    // parse string
+    try {
+        const auto fields = Split(v, '.');
+        const auto numFields = fields.size();
+        if (numFields == 0) throw std::runtime_error{"Version: empty string"};
+        major_ = std::stoi(fields.at(0));
+        if (numFields > 1) {
+            minor_ = std::stoi(fields.at(1));
+            if (numFields > 2) revision_ = std::stoi(fields.at(2));
+        }
+    } catch (std::exception&) {
+        throw std::runtime_error{"Version: could not parse: " + v};
+    }
+
+    // ensure valid numbers
+    Check();
+}
+
+/// \returns the version formatted as "<major>.<minor>.<revision>"
+std::string Version::ToString() const
+{
+    std::ostringstream formatted;
+    formatted << major_;
+    formatted << '.' << minor_;
+    formatted << '.' << revision_;
+    return formatted.str();
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/Version.h b/src/Version.h

new file mode 100644 (file)

index 0000000..278259e
--- /dev/null
+++ b/src/Version.h
@@ -0,0 +1,134 @@
+// File Description
+/// \file Version.h
+/// \brief Defines the Version class.
+//
+// Author: Derek Barnett
+
+#ifndef PACBIOBAM_VERSION_H
+#define PACBIOBAM_VERSION_H
+
+#include "pbbam/Config.h"
+
+#include <ostream>
+#include <stdexcept>
+#include <string>
+#include <tuple>
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief Three-component version number (major, minor, revision).
+///
+/// Components are plain ints. Every constructor or setter that changes a
+/// component calls Check(), which throws std::runtime_error if any component
+/// is negative.
+class Version
+{
+public:
+    // Constants defined in Version.cpp.
+    static const Version Current;
+    static const Version Minimum;
+
+    // Default version is 0.0.0 (from the member initializers below).
+    constexpr Version() = default;
+
+    // Throws std::runtime_error if any component is negative.
+    Version(int major, int minor, int revision);
+
+    // Parses a '.'-separated string such as "<major>.<minor>.<revision>";
+    // throws std::runtime_error if it cannot be parsed.
+    explicit Version(const std::string& v);
+
+    Version(const Version&) = default;
+    Version(Version&&) noexcept = default;
+    Version& operator=(const Version&) = default;
+    Version& operator=(Version&&) noexcept = default;
+    ~Version() = default;
+
+    // Comparisons are lexicographic over (major, minor, revision).
+    bool operator==(const Version& other) const;
+    bool operator!=(const Version& other) const;
+    bool operator<(const Version& other) const;
+    bool operator<=(const Version& other) const;
+    bool operator>(const Version& other) const;
+    bool operator>=(const Version& other) const;
+
+    // Both produce "<major>.<minor>.<revision>".
+    std::string ToString() const;
+    explicit operator std::string() const;
+
+    // Getters.
+    int Major() const;
+    int Minor() const;
+    int Revision() const;
+
+    // Setters; each returns *this so calls can be chained, and each throws
+    // std::runtime_error if the resulting version has a negative component.
+    Version& Major(int major);
+    Version& Minor(int minor);
+    Version& Revision(int revision);
+
+private:
+    // Throws std::runtime_error if any component is negative.
+    void Check() const;
+
+    int major_ = 0;
+    int minor_ = 0;
+    int revision_ = 0;
+};
+
+/// \brief Streams \p version using its ToString() representation.
+inline std::ostream& operator<<(std::ostream& out, const Version& version)
+{
+    return out << version.ToString();
+}
+
+/// \brief Builds a version from its three components.
+/// \throws std::runtime_error (from Check()) if any component is negative
+inline Version::Version(int major, int minor, int revision)
+    : major_(major)
+    , minor_(minor)
+    , revision_(revision)
+{
+    Check();
+}
+
+/// \returns true when all three components are equal
+inline bool Version::operator==(const Version& other) const
+{
+    return major_ == other.major_ && minor_ == other.minor_ && revision_ == other.revision_;
+}
+
+/// \returns the negation of operator==
+inline bool Version::operator!=(const Version& other) const
+{
+    const bool same = (*this == other);
+    return !same;
+}
+
+/// \returns true when this version sorts before \p other, comparing major,
+///          then minor, then revision
+inline bool Version::operator<(const Version& other) const
+{
+    if (major_ != other.major_) return major_ < other.major_;
+    if (minor_ != other.minor_) return minor_ < other.minor_;
+    return revision_ < other.revision_;
+}
+/// \returns true unless \p other sorts before this version
+inline bool Version::operator<=(const Version& other) const
+{
+    return !(other < *this);
+}
+
+/// \returns true when \p other sorts before this version
+inline bool Version::operator>(const Version& other) const
+{
+    return other < *this;
+}
+
+/// \returns true unless this version sorts before \p other
+inline bool Version::operator>=(const Version& other) const
+{
+    const bool sortsBefore = (*this < other);
+    return !sortsBefore;
+}
+
+/// \brief Explicit conversion to std::string; same text as ToString().
+inline Version::operator std::string() const
+{
+    return ToString();
+}
+
+/// \brief Enforces that no component is negative.
+/// \throws std::runtime_error if major, minor or revision is below zero
+inline void Version::Check() const
+{
+    const bool anyNegative = (major_ < 0) || (minor_ < 0) || (revision_ < 0);
+    if (anyNegative) {
+        throw std::runtime_error{"version cannot contain negative numbers"};
+    }
+}
+
+/// \returns the major component
+inline int Version::Major() const
+{
+    return major_;
+}
+
+/// \brief Sets the major component, then re-validates.
+///
+/// The new value is stored before Check() runs, so it remains stored if
+/// Check() throws.
+/// \returns *this
+inline Version& Version::Major(int major)
+{
+    this->major_ = major;
+    this->Check();
+    return *this;
+}
+
+/// \returns the minor component
+inline int Version::Minor() const
+{
+    return minor_;
+}
+
+/// \brief Sets the minor component, then re-validates.
+///
+/// The new value is stored before Check() runs, so it remains stored if
+/// Check() throws.
+/// \returns *this
+inline Version& Version::Minor(int minor)
+{
+    this->minor_ = minor;
+    this->Check();
+    return *this;
+}
+
+/// \returns the revision component
+inline int Version::Revision() const
+{
+    return revision_;
+}
+
+/// \brief Sets the revision component, then re-validates.
+///
+/// The new value is stored before Check() runs, so it remains stored if
+/// Check() throws.
+/// \returns *this
+inline Version& Version::Revision(int revision)
+{
+    this->revision_ = revision;
+    this->Check();
+    return *this;
+}
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // PACBIOBAM_VERSION_H
diff --git a/src/VirtualRegion.cpp b/src/VirtualRegion.cpp

new file mode 100644 (file)

index 0000000..3a437ce
--- /dev/null
+++ b/src/VirtualRegion.cpp
@@ -0,0 +1,53 @@
+// File Description
+/// \file VirtualRegion.cpp
+/// \brief Implements the VirtualRegion class.
+//
+// Author: Armin Töpfer
+
+#include "PbbamInternalConfig.h"
+
+#include <cassert>
+#include <tuple>
+#include <type_traits>
+
+#include "pbbam/virtual/VirtualRegion.h"
+
+namespace PacBio {
+namespace BAM {
+
+// Compile-time guards: VirtualRegion must stay copyable, and its move
+// operations must stay noexcept.
+// NOTE(review): the traits test copy-constructibility/assignability, which is
+// broader than the "= default" wording of the first two messages.
+static_assert(std::is_copy_constructible<VirtualRegion>::value,
+              "VirtualRegion(const VirtualRegion&) is not = default");
+static_assert(std::is_copy_assignable<VirtualRegion>::value,
+              "VirtualRegion& operator=(const VirtualRegion&) is not = default");
+
+static_assert(std::is_nothrow_move_constructible<VirtualRegion>::value,
+              "VirtualRegion(VirtualRegion&&) is not = noexcept");
+static_assert(std::is_nothrow_move_assignable<VirtualRegion>::value,
+              "VirtualRegion& operator=(VirtualRegion&&) is not = noexcept");
+
+/// \brief Creates a region from its type, boundaries and score.
+///
+/// cxTag is value-initialized; barcodeLeft and barcodeRight are not set here.
+VirtualRegion::VirtualRegion(const VirtualRegionType type_, const int beginPos_, const int endPos_,
+                             const int score_)
+    : type{type_}
+    , beginPos{beginPos_}
+    , endPos{endPos_}
+    , cxTag{}
+    , score{score_}
+{
+}
+
+/// \brief Creates a region that also carries local-context flags and a
+///        barcode pair.
+///
+/// Every argument is copied straight into the member of the same name
+/// (without the trailing underscore).
+VirtualRegion::VirtualRegion(const VirtualRegionType type_, const int beginPos_, const int endPos_,
+                             const LocalContextFlags cxTag_, const int barcodeLeft_,
+                             const int barcodeRight_, const int score_)
+    : type{type_}
+    , beginPos{beginPos_}
+    , endPos{endPos_}
+    , cxTag{cxTag_}
+    , barcodeLeft{barcodeLeft_}
+    , barcodeRight{barcodeRight_}
+    , score{score_}
+{
+}
+
+/// \brief Equality on type, beginPos and endPos only.
+///
+/// cxTag, the barcodes and the score do not take part in the comparison.
+bool VirtualRegion::operator==(const VirtualRegion& v1) const
+{
+    return type == v1.type && beginPos == v1.beginPos && endPos == v1.endPos;
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/VirtualRegionTypeMap.cpp b/src/VirtualRegionTypeMap.cpp

new file mode 100644 (file)

index 0000000..99c99db
--- /dev/null
+++ b/src/VirtualRegionTypeMap.cpp
@@ -0,0 +1,22 @@
+// File Description
+/// \file VirtualRegionTypeMap.cpp
+/// \brief Implements the VirtualRegionTypeMap class.
+//
+// Author: Armin Töpfer
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/virtual/VirtualRegionTypeMap.h"
+
+namespace PacBio {
+namespace BAM {
+
+// Lookup table from a single-character code to its VirtualRegionType.
+// The map is non-const, so it can be modified at runtime by any code with
+// access to it.
+std::map<char, VirtualRegionType> VirtualRegionTypeMap::ParseChar{
+    {'A', VirtualRegionType::ADAPTER},
+    {'B', VirtualRegionType::BARCODE},
+    {'H', VirtualRegionType::HQREGION},
+    {'F', VirtualRegionType::FILTERED},
+    {'L', VirtualRegionType::LQREGION}};
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/VirtualStitching.h b/src/VirtualStitching.h

new file mode 100644 (file)

index 0000000..0f47002
--- /dev/null
+++ b/src/VirtualStitching.h
@@ -0,0 +1,70 @@
+// File Description
+/// \file VirtualStitching.h
+/// \brief Utilities for virtual ZMW stitching.
+//
+// Author: Derek Barnett
+
+#ifndef VIRTUALSTITCHING_H
+#define VIRTUALSTITCHING_H
+
+#include "pbbam/Config.h"
+
+#include <deque>
+#include <string>
+#include <utility>
+
+#include <boost/optional.hpp>
+
+#include <pbbam/DataSet.h>
+
+namespace PacBio {
+namespace BAM {
+
+// Queue of (primary BAM filename, scraps BAM filename) pairs, as filled by
+// SourcesFromDataset().
+using StitchingSources = std::deque<std::pair<std::string, std::string>>;
+
+inline boost::optional<std::string> ScrapsFileId(const ExternalResource& resource)
+{
+    const auto& childResources = resource.ExternalResources();
+    for (const auto& childResource : childResources) {
+        const auto& childMetatype = childResource.MetaType();
+        if (childMetatype == "PacBio.SubreadFile.ScrapsBamFile" ||
+            childMetatype == "PacBio.SubreadFile.HqScrapsBamFile") {
+            return childResource.ResourceId();
+        }
+    }
+    return boost::none;
+}
+
+inline StitchingSources SourcesFromDataset(const DataSet& dataset)
+{
+    StitchingSources sources;
+
+    const ExternalResources& resources = dataset.ExternalResources();
+    for (const ExternalResource& resource : resources) {
+
+        boost::optional<std::string> primaryId;
+        boost::optional<std::string> scrapsId;
+
+        // if resource is possible "primary" BAM, store & look for associated scraps
+        const auto& metatype = resource.MetaType();
+        if (metatype == "PacBio.SubreadFile.SubreadBamFile" ||
+            metatype == "PacBio.SubreadFile.HqRegionBamFile") {
+            primaryId = resource.ResourceId();
+            scrapsId = ScrapsFileId(resource);
+        }
+
+        // if found, resolve paths & store
+        if (primaryId && scrapsId) {
+            std::string primaryFn = dataset.ResolvePath(primaryId.get());
+            std::string scrapsFn = dataset.ResolvePath(scrapsId.get());
+            sources.emplace_back(std::make_pair(primaryFn, scrapsFn));
+        }
+    }
+
+    return sources;
+}
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // VIRTUALSTITCHING_H
diff --git a/src/VirtualZmwBamRecord.cpp b/src/VirtualZmwBamRecord.cpp

new file mode 100644 (file)

index 0000000..192e14f
--- /dev/null
+++ b/src/VirtualZmwBamRecord.cpp
@@ -0,0 +1,297 @@
+// File Description
+/// \file VirtualZmwBamRecord.cpp
+/// \brief Implements the VirtualZmwBamRecord class.
+//
+// Author: Armin Töpfer
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/virtual/VirtualZmwBamRecord.h"
+
+#include <cassert>
+#include <cstdint>
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#include <tuple>
+#include <type_traits>
+#include <vector>
+
+#include <pbcopper/utility/MoveAppend.h>
+
+#include "pbbam/virtual/VirtualRegionType.h"
+#include "pbbam/virtual/VirtualRegionTypeMap.h"
+
+namespace PacBio {
+namespace BAM {
+
+// Compile-time guards: VirtualZmwBamRecord must stay copyable, and its move
+// operations must stay noexcept.
+// NOTE(review): the traits test copy-constructibility/assignability, which is
+// broader than the "= default" wording of the first two messages.
+static_assert(std::is_copy_constructible<VirtualZmwBamRecord>::value,
+              "VirtualZmwBamRecord(const VirtualZmwBamRecord&) is not = default");
+static_assert(std::is_copy_assignable<VirtualZmwBamRecord>::value,
+              "VirtualZmwBamRecord& operator=(const VirtualZmwBamRecord&) is not = default");
+
+static_assert(std::is_nothrow_move_constructible<VirtualZmwBamRecord>::value,
+              "VirtualZmwBamRecord(VirtualZmwBamRecord&&) is not = noexcept");
+static_assert(std::is_nothrow_move_assignable<VirtualZmwBamRecord>::value,
+              "VirtualZmwBamRecord& operator=(VirtualZmwBamRecord&&) is not = noexcept");
+
+VirtualZmwBamRecord::VirtualZmwBamRecord(std::vector<BamRecord> unorderedSources,
+                                         const BamHeader& header)
+    : BamRecord{header}, sources_{std::move(unorderedSources)}
+{
+    // Sort sources by queryStart,queryEnd
+    std::sort(sources_.begin(), sources_.end(), [](const BamRecord& l1, const BamRecord& l2) {
+        const auto l1_qStart = l1.QueryStart();
+        const auto l1_qEnd = l1.QueryEnd();
+        const auto l2_qStart = l2.QueryStart();
+        const auto l2_qEnd = l2.QueryEnd();
+
+        return std::tie(l1_qStart, l1_qEnd) < std::tie(l2_qStart, l2_qEnd);
+    });
+
+    StitchSources();
+}
+
+/// \returns true if virtualRegionsMap_ has an entry for \p regionType
+bool VirtualZmwBamRecord::HasVirtualRegionType(const VirtualRegionType regionType) const
+{
+    return virtualRegionsMap_.count(regionType) != 0;
+}
+
+/// \brief Decodes this record's raw IPD values as 8-bit codes.
+///
+/// The raw IPD data for \p orientation is copied element by element into a
+/// std::vector<uint8_t> and handed to Frames::Decode().
+Frames VirtualZmwBamRecord::IPDV1Frames(Orientation orientation) const
+{
+    const auto rawIpd = IPDRaw(orientation);
+    const auto& rawValues = rawIpd.Data();
+    const std::vector<uint8_t> codes(rawValues.begin(), rawValues.end());
+    return Frames::Decode(codes);
+}
+
+/// \brief Concatenates the data of all records in sources_ into this record.
+///
+/// Steps:
+///   1. Append every source's sequence, qualities and optional per-base/pulse
+///      tags into local buffers, in sources_ order. While iterating, record a
+///      VirtualRegion for each source that has a scrap region type or local
+///      context flags, and copy barcodes, barcode quality, read accuracy and
+///      scrap ZMW type from the first source that provides them.
+///   2. Set this record's read group, num passes, SNR, hole number, query
+///      interval and name.
+///   3. Store every non-empty buffer on this record.
+///   4. Derive the HQREGION entries of virtualRegionsMap_ from the LQREGION
+///      entries.
+///
+/// NOTE(review): sources_[0] and sources_[size()-1] are read unconditionally,
+/// so this assumes sources_ is non-empty.
+///
+/// \throws std::runtime_error if sources disagree on scrap ZMW type, or if a
+///         single LQREGION touches neither end of the stitched sequence
+void VirtualZmwBamRecord::StitchSources()
+{
+    const auto& firstRecord = sources_[0];
+    const auto& lastRecord = sources_[sources_.size() - 1];
+
+    // accumulators for the stitched data
+    std::string sequence;
+    std::string deletionTag;
+    std::string substitutionTag;
+    std::string alternativeLabelTag;
+    std::string pulseCall;
+
+    QualityValues qualities;
+    QualityValues deletionQv;
+    QualityValues insertionQv;
+    QualityValues mergeQv;
+    QualityValues pulseMergeQv;
+    QualityValues substitutionQv;
+    QualityValues labelQv;
+    QualityValues alternativeLabelQv;
+
+    Frames ipd;
+    Frames pw;
+    Frames pd;
+    Frames px;
+    std::vector<float> pa;
+    std::vector<float> pm;
+    std::vector<uint32_t> sf;
+    std::vector<PacBio::BAM::PulseExclusionReason> pe;
+
+    // initialize capacity: one slot per position of the overall query interval
+    const auto stitchedSize = lastRecord.QueryEnd() - firstRecord.QueryStart();
+    sequence.reserve(stitchedSize);
+    deletionTag.reserve(stitchedSize);
+    substitutionTag.reserve(stitchedSize);
+    alternativeLabelTag.reserve(stitchedSize);
+    pulseCall.reserve(stitchedSize);
+    qualities.reserve(stitchedSize);
+    deletionQv.reserve(stitchedSize);
+    insertionQv.reserve(stitchedSize);
+    mergeQv.reserve(stitchedSize);
+    pulseMergeQv.reserve(stitchedSize);
+    substitutionQv.reserve(stitchedSize);
+    labelQv.reserve(stitchedSize);
+    alternativeLabelQv.reserve(stitchedSize);
+    ipd.DataRaw().reserve(stitchedSize);
+    pw.DataRaw().reserve(stitchedSize);
+    pd.DataRaw().reserve(stitchedSize);
+    px.DataRaw().reserve(stitchedSize);
+    pa.reserve(stitchedSize);
+    pm.reserve(stitchedSize);
+    sf.reserve(stitchedSize);
+    pe.reserve(stitchedSize);
+
+    // Stitch using tmp vars: append each source's data in sorted order
+    for (auto& b : sources_) {
+        sequence.append(b.Sequence());
+
+        Utility::MoveAppend(b.Qualities(), qualities);
+
+        if (b.HasDeletionQV()) Utility::MoveAppend(std::move(b.DeletionQV()), deletionQv);
+
+        if (b.HasInsertionQV()) Utility::MoveAppend(std::move(b.InsertionQV()), insertionQv);
+
+        if (b.HasMergeQV()) Utility::MoveAppend(std::move(b.MergeQV()), mergeQv);
+
+        if (b.HasPulseMergeQV()) Utility::MoveAppend(std::move(b.PulseMergeQV()), pulseMergeQv);
+
+        if (b.HasSubstitutionQV())
+            Utility::MoveAppend(std::move(b.SubstitutionQV()), substitutionQv);
+
+        if (b.HasLabelQV()) Utility::MoveAppend(std::move(b.LabelQV()), labelQv);
+
+        if (b.HasAltLabelQV()) Utility::MoveAppend(std::move(b.AltLabelQV()), alternativeLabelQv);
+
+        if (b.HasDeletionTag()) deletionTag.append(std::move(b.DeletionTag()));
+
+        if (b.HasSubstitutionTag()) substitutionTag.append(std::move(b.SubstitutionTag()));
+
+        if (b.HasAltLabelTag()) alternativeLabelTag.append(std::move(b.AltLabelTag()));
+
+        if (b.HasPulseCall()) pulseCall.append(std::move(b.PulseCall()));
+
+        if (b.HasIPD()) Utility::MoveAppend(b.IPDRaw().DataRaw(), ipd.DataRaw());
+
+        if (b.HasPulseWidth()) Utility::MoveAppend(b.PulseWidthRaw().DataRaw(), pw.DataRaw());
+
+        if (b.HasPulseCallWidth()) Utility::MoveAppend(b.PulseCallWidth().DataRaw(), px.DataRaw());
+
+        if (b.HasPrePulseFrames()) Utility::MoveAppend(b.PrePulseFrames().DataRaw(), pd.DataRaw());
+
+        // NOTE(review): Pkmid/Pkmid2 both land in pm and Pkmean/Pkmean2 both
+        // land in pa; a source carrying both variants would contribute twice
+        // to the same buffer - confirm this is intended.
+        if (b.HasPkmid()) Utility::MoveAppend(b.Pkmid(), pm);
+
+        if (b.HasPkmean()) Utility::MoveAppend(b.Pkmean(), pa);
+
+        if (b.HasPkmid2()) Utility::MoveAppend(b.Pkmid2(), pm);
+
+        if (b.HasPkmean2()) Utility::MoveAppend(b.Pkmean2(), pa);
+
+        if (b.HasPulseExclusion()) Utility::MoveAppend(b.PulseExclusionReason(), pe);
+
+        if (b.HasStartFrame()) Utility::MoveAppend(b.StartFrame(), sf);
+
+        // a source with a scrap region type adds a region of that type
+        // spanning its query interval
+        if (b.HasScrapRegionType()) {
+            const VirtualRegionType regionType = b.ScrapRegionType();
+
+            if (!HasVirtualRegionType(regionType))
+                virtualRegionsMap_[regionType] = std::vector<VirtualRegion>{};
+
+            virtualRegionsMap_[regionType].emplace_back(regionType, b.QueryStart(), b.QueryEnd());
+        }
+
+        // a source with local context flags adds a SUBREAD region; barcodes
+        // default to (-1, -1) when the source has none
+        if (b.HasLocalContextFlags()) {
+            std::pair<int, int> barcodes{-1, -1};
+            if (b.HasBarcodes()) barcodes = b.Barcodes();
+
+            static constexpr const auto regionType = VirtualRegionType::SUBREAD;
+            if (!HasVirtualRegionType(regionType))
+                virtualRegionsMap_[regionType] = std::vector<VirtualRegion>{};
+
+            virtualRegionsMap_[regionType].emplace_back(regionType, b.QueryStart(), b.QueryEnd(),
+                                                        b.LocalContextFlags(), barcodes.first,
+                                                        barcodes.second);
+        }
+
+        // record-wide values: taken from the first source that has them
+        if (b.HasBarcodes() && !this->HasBarcodes()) this->Barcodes(b.Barcodes());
+
+        if (b.HasBarcodeQuality() && !this->HasBarcodeQuality())
+            this->BarcodeQuality(b.BarcodeQuality());
+
+        if (b.HasReadAccuracy() && !this->HasReadAccuracy()) this->ReadAccuracy(b.ReadAccuracy());
+
+        // scrap ZMW type must be identical across all sources that carry one
+        if (b.HasScrapZmwType()) {
+            if (!this->HasScrapZmwType())
+                this->ScrapZmwType(b.ScrapZmwType());
+            else if (this->ScrapZmwType() != b.ScrapZmwType())
+                throw std::runtime_error{"VirtualZmwBamRecord: scrap types do not match"};
+        }
+    }
+
+    // ReadGroup: the first read group of this record's header
+    // NOTE(review): assumes the header contains at least one read group.
+    this->ReadGroup(this->header_.ReadGroups()[0]);
+
+    this->NumPasses(1);
+
+    // All records should contain the same SNR and hole number
+    if (firstRecord.HasSignalToNoise()) this->SignalToNoise(firstRecord.SignalToNoise());
+    this->HoleNumber(firstRecord.HoleNumber());
+
+    // Query interval spans from the first source's start to the last source's end
+    this->QueryStart(firstRecord.QueryStart());
+    this->QueryEnd(lastRecord.QueryEnd());
+    this->UpdateName();
+
+    // qualities are stored only when they line up with the sequence
+    const std::string qualitiesStr = qualities.Fastq();
+    if (sequence.size() == qualitiesStr.size())
+        this->Impl().SetSequenceAndQualities(sequence, qualitiesStr);
+    else
+        this->Impl().SetSequenceAndQualities(sequence);
+
+    // Tags as strings
+    if (!deletionTag.empty()) this->DeletionTag(deletionTag);
+    if (!substitutionTag.empty()) this->SubstitutionTag(substitutionTag);
+    if (!alternativeLabelTag.empty()) this->AltLabelTag(alternativeLabelTag);
+    if (!pulseCall.empty()) this->PulseCall(pulseCall);
+
+    // QVs
+    if (!deletionQv.empty()) this->DeletionQV(deletionQv);
+    if (!insertionQv.empty()) this->InsertionQV(insertionQv);
+    if (!mergeQv.empty()) this->MergeQV(mergeQv);
+    if (!pulseMergeQv.empty()) this->PulseMergeQV(pulseMergeQv);
+    if (!substitutionQv.empty()) this->SubstitutionQV(substitutionQv);
+    if (!labelQv.empty()) this->LabelQV(labelQv);
+    if (!alternativeLabelQv.empty()) this->AltLabelQV(alternativeLabelQv);
+
+    // PulseExclusionReason
+    if (!pe.empty()) this->PulseExclusionReason(pe);
+
+    // Frame data (stored lossless) and the float-valued pkmean/pkmid buffers
+    if (!ipd.Data().empty()) this->IPD(ipd, FrameEncodingType::LOSSLESS);
+    if (!pw.Data().empty()) this->PulseWidth(pw, FrameEncodingType::LOSSLESS);
+    if (!pa.empty()) this->Pkmean(pa);
+    if (!pm.empty()) this->Pkmid(pm);
+    if (!pd.Data().empty()) this->PrePulseFrames(pd, FrameEncodingType::LOSSLESS);
+    if (!px.Data().empty()) this->PulseCallWidth(px, FrameEncodingType::LOSSLESS);
+
+    // 32 bit arrays
+    if (!sf.empty()) this->StartFrame(sf);
+
+    // Determine HQREGION based on LQREGIONs
+    if (HasVirtualRegionType(VirtualRegionType::LQREGION)) {
+        if (virtualRegionsMap_[VirtualRegionType::LQREGION].size() == 1) {
+            // exactly one LQ region: the HQ region is whatever lies on its
+            // other side, provided the LQ region touches one end
+            const auto lq = virtualRegionsMap_[VirtualRegionType::LQREGION][0];
+            if (lq.beginPos == 0)
+                virtualRegionsMap_[VirtualRegionType::HQREGION].emplace_back(
+                    VirtualRegionType::HQREGION, lq.endPos, sequence.size());
+            else if (lq.endPos == static_cast<int>(sequence.size()))
+                virtualRegionsMap_[VirtualRegionType::HQREGION].emplace_back(
+                    VirtualRegionType::HQREGION, 0, lq.beginPos);
+            else
+                throw std::runtime_error{"VirtualZmwBamRecord: unknown HQREGION"};
+        } else {
+            // several LQ regions: every non-empty gap before an LQ region
+            // becomes an HQ region; nothing is added after the last LQ region
+            int beginPos = 0;
+            for (const auto& lqregion : virtualRegionsMap_[VirtualRegionType::LQREGION]) {
+                if (lqregion.beginPos - beginPos > 0)
+                    virtualRegionsMap_[VirtualRegionType::HQREGION].emplace_back(
+                        VirtualRegionType::HQREGION, beginPos, lqregion.beginPos);
+                beginPos = lqregion.endPos;
+            }
+        }
+    } else {
+        // no LQ region at all: the whole stitched sequence is HQ
+        virtualRegionsMap_[VirtualRegionType::HQREGION].emplace_back(VirtualRegionType::HQREGION, 0,
+                                                                     sequence.size());
+    }
+}
+
+/// \returns a copy of the complete region-type -> regions map
+auto VirtualZmwBamRecord::VirtualRegionsMap() const
+    -> std::map<VirtualRegionType, std::vector<VirtualRegion>>
+{
+    return virtualRegionsMap_;
+}
+
+/// \returns a copy of the regions stored for \p regionType, or an empty vector
+///          if the map has no entry for that type
+std::vector<VirtualRegion> VirtualZmwBamRecord::VirtualRegionsTable(
+    const VirtualRegionType regionType) const
+{
+    const auto found = virtualRegionsMap_.find(regionType);
+    if (found == virtualRegionsMap_.cend()) {
+        return {};
+    }
+    return found->second;
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/VirtualZmwCompositeReader.cpp b/src/VirtualZmwCompositeReader.cpp

new file mode 100644 (file)

index 0000000..5bb53d2
--- /dev/null
+++ b/src/VirtualZmwCompositeReader.cpp
@@ -0,0 +1,73 @@
+// File Description
+/// \file VirtualZmwCompositeReader.cpp
+/// \brief Implements the VirtualZmwCompositeReader class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "VirtualZmwCompositeReader.h"
+
+#include <boost/algorithm/string.hpp>
+
+namespace PacBio {
+namespace BAM {
+
+VirtualZmwCompositeReader::VirtualZmwCompositeReader(const DataSet& dataset)
+    : currentReader_(nullptr), filter_(PbiFilter::FromDataSet(dataset))
+{
+    sources_ = SourcesFromDataset(dataset);
+    OpenNextReader();
+}
+
+/// \returns true while a reader is open and that reader still has data
+bool VirtualZmwCompositeReader::HasNext()
+{
+    if (!currentReader_) return false;
+    return currentReader_->HasNext();
+}
+
+/// \brief Fetches the next stitched record from the current reader, moving on
+///        to the next source once the current one is used up.
+///
+/// \returns the next stitched polymerase read
+/// \throws std::runtime_error if no reader is active
+///
+VirtualZmwBamRecord VirtualZmwCompositeReader::Next()
+{
+    if (currentReader_) {
+        // deliberately non-const: a const local cannot be moved into the return
+        // value, so it would be copied whenever copy elision does not apply
+        auto result = currentReader_->Next();
+        if (!currentReader_->HasNext()) OpenNextReader();
+        return result;
+    }
+
+    // no reader active
+    throw std::runtime_error{
+        "VirtualZmwCompositeReader: "
+        "no readers active, make sure you use "
+        "VirtualZmwCompositeReader::HasNext before "
+        "requesting next record"};
+}
+
+/// \brief Fetches the next group of unstitched records from the current
+///        reader, moving on to the next source once the current one is used up.
+///
+/// \returns the records delivered by the current reader's NextRaw()
+/// \throws std::runtime_error if no reader is active
+///
+std::vector<BamRecord> VirtualZmwCompositeReader::NextRaw()
+{
+    if (currentReader_) {
+        // deliberately non-const: a const local cannot be moved into the return
+        // value, so the whole vector would be copied whenever copy elision does not apply
+        auto result = currentReader_->NextRaw();
+        if (!currentReader_->HasNext()) OpenNextReader();
+        return result;
+    }
+
+    // no reader active
+    throw std::runtime_error{
+        "VirtualZmwCompositeReader: "
+        "no readers active, make sure you use "
+        "VirtualZmwCompositeReader::HasNext before "
+        "requesting next group of records"};
+}
+
+/// \brief Advances to the next source pair that has data.
+///
+/// Sources are taken from the front of the queue and discarded once tried.
+/// On return, currentReader_ either refers to a reader whose HasNext() is
+/// true, or is null because no remaining source had any data.
+///
+void VirtualZmwCompositeReader::OpenNextReader()
+{
+    currentReader_.reset(nullptr);
+
+    // find next source pair with data
+    while (!sources_.empty()) {
+        const auto nextSource = sources_.front();
+        sources_.pop_front();
+
+        currentReader_ =
+            std::make_unique<VirtualZmwReader>(nextSource.first, nextSource.second, filter_);
+        if (currentReader_->HasNext()) return;
+    }
+
+    // Every remaining source was empty. Drop the last (exhausted) reader so that
+    // Next()/NextRaw() throw "no readers active" instead of reading past its end.
+    currentReader_.reset(nullptr);
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/VirtualZmwCompositeReader.h b/src/VirtualZmwCompositeReader.h

new file mode 100644 (file)

index 0000000..18533f6
--- /dev/null
+++ b/src/VirtualZmwCompositeReader.h
@@ -0,0 +1,79 @@
+// File Description
+/// \file VirtualZmwCompositeReader.h
+/// \brief Defines the VirtualZmwCompositeReader class.
+//
+// Author: Derek Barnett
+
+#ifndef VIRTUALZMWCOMPOSITEREADER_H
+#define VIRTUALZMWCOMPOSITEREADER_H
+
+#include "pbbam/Config.h"
+
+#include <deque>
+#include <memory>
+#include <string>
+#include <utility>
+
+#include <pbbam/DataSet.h>
+#include <pbbam/PbiFilter.h>
+
+#include "VirtualStitching.h"
+#include "VirtualZmwReader.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The VirtualZmwCompositeReader provides an interface for
+///        re-stitching "virtual" polymerase reads from their constituent parts,
+///        across multiple %BAM resources from a DataSet.
+///
+/// This class is essentially a DataSet-aware wrapper around
+/// VirtualZmwReader, enabling multiple resources as input. See that
+/// class's documentation for more info.
+///
+class PBBAM_EXPORT VirtualZmwCompositeReader
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    explicit VirtualZmwCompositeReader(const DataSet& dataset);  // opens the first source with data
+
+    VirtualZmwCompositeReader() = delete;
+    VirtualZmwCompositeReader(const VirtualZmwCompositeReader&) = delete;
+    VirtualZmwCompositeReader(VirtualZmwCompositeReader&&) = delete;
+    VirtualZmwCompositeReader& operator=(const VirtualZmwCompositeReader&) = delete;
+    VirtualZmwCompositeReader& operator=(VirtualZmwCompositeReader&&) = delete;
+    ~VirtualZmwCompositeReader() = default;
+
+    /// \}
+
+    /// \name Stitched Record Reading
+    /// \{
+
+    /// \returns true if more ZMWs/files are available for reading.
+    bool HasNext();
+
+    /// \returns the next stitched polymerase read (throws std::runtime_error if no reader is active)
+    VirtualZmwBamRecord Next();
+
+    /// \returns the next set of reads that belong to one ZMW from one %BAM
+    ///          resource (a primary %BAM and/or its scraps file). This enables
+    ///          stitching records in a distinct thread.
+    ///          Throws std::runtime_error if no reader is active.
+    std::vector<BamRecord> NextRaw();
+
+    /// \}
+
+private:
+    StitchingSources sources_;  // sources not yet opened; consumed from the front
+    std::unique_ptr<VirtualZmwReader> currentReader_;  // reader for the source being consumed
+    PbiFilter filter_;  // built from the DataSet, handed to every reader that gets opened
+
+    void OpenNextReader();  // advances currentReader_ to the next source that has data
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // VIRTUALZMWCOMPOSITEREADER_H
diff --git a/src/VirtualZmwReader.cpp b/src/VirtualZmwReader.cpp

new file mode 100644 (file)

index 0000000..9dd6fa4
--- /dev/null
+++ b/src/VirtualZmwReader.cpp
@@ -0,0 +1,108 @@
+// File Description
+/// \file VirtualZmwReader.cpp
+/// \brief Implements the VirtualZmwReader class.
+//
+// Author: Armin Töpfer
+
+#include "PbbamInternalConfig.h"
+
+#include "VirtualZmwReader.h"
+
+#include <stdexcept>
+
+#include "pbbam/ReadGroupInfo.h"
+
+namespace PacBio {
+namespace BAM {
+
+VirtualZmwReader::VirtualZmwReader(const std::string& primaryBamFilepath,
+                                   const std::string& scrapsBamFilepath)
+    : VirtualZmwReader(primaryBamFilepath, scrapsBamFilepath, PbiFilter{})  // delegate, default filter
+{
+}
+
+VirtualZmwReader::VirtualZmwReader(const std::string& primaryBamFilepath,
+                                   const std::string& scrapsBamFilepath, const PbiFilter& filter)
+{
+    primaryBamFile_ = std::make_unique<BamFile>(primaryBamFilepath);
+    scrapsBamFile_ = std::make_unique<BamFile>(scrapsBamFilepath);
+
+    if (filter.IsEmpty()) {
+        primaryQuery_ = std::make_unique<EntireFileQuery>(*primaryBamFile_);
+        scrapsQuery_ = std::make_unique<EntireFileQuery>(*scrapsBamFile_);
+    } else {
+        primaryQuery_ = std::make_unique<PbiFilterQuery>(filter, *primaryBamFile_);
+        scrapsQuery_ = std::make_unique<PbiFilterQuery>(filter, *scrapsBamFile_);
+    }
+
+    primaryIt_ = (primaryQuery_->begin());
+    scrapsIt_ = (scrapsQuery_->begin());
+
+    stitchedHeader_ = std::make_unique<BamHeader>(primaryBamFile_->Header().ToSam());
+
+    // update stitched read group in header
+    auto readGroups = stitchedHeader_->ReadGroups();
+    if (readGroups.empty())
+        throw std::runtime_error{
+            "VirtualZmwReader: no read groups in header of the primary BAM file: " +
+            primaryBamFilepath};
+    readGroups[0].ReadType("POLYMERASE");
+    readGroups[0].Id(readGroups[0].MovieName(), "POLYMERASE");
+    if (readGroups.size() > 1) {
+        std::vector<ReadGroupInfo> singleGroup;
+        singleGroup.emplace_back(std::move(readGroups[0]));
+        readGroups = std::move(singleGroup);
+        stitchedHeader_->ClearReadGroups();
+    }
+    stitchedHeader_->ReadGroups(readGroups);
+}
+
+VirtualZmwReader::~VirtualZmwReader() = default;  // compiler-generated; no manual cleanup here
+
+/// \returns true until both the primary and the scraps iterator have reached
+///          the end of their query
+bool VirtualZmwReader::HasNext()
+{
+    if (primaryIt_ != primaryQuery_->end()) return true;
+    return scrapsIt_ != scrapsQuery_->end();
+}
+
+// Builds one virtual record from the next group of raw records (see NextRaw). Not thread safe.
+VirtualZmwBamRecord VirtualZmwReader::Next()
+{
+    return VirtualZmwBamRecord{NextRaw(), *stitchedHeader_};  // pairs the records with the stitched header
+}
+
+std::vector<BamRecord> VirtualZmwReader::NextRaw()
+{
+    std::vector<BamRecord> bamRecordVec;
+
+    // Current hole number, the smallest of scraps and primary.
+    // It can be that the next ZMW is scrap only.
+    int currentHoleNumber;
+    if (primaryIt_ == primaryQuery_->end())
+        currentHoleNumber = (*scrapsIt_).HoleNumber();
+    else if (scrapsIt_ == scrapsQuery_->end())
+        currentHoleNumber = (*primaryIt_).HoleNumber();
+    else
+        currentHoleNumber = std::min((*primaryIt_).HoleNumber(), (*scrapsIt_).HoleNumber());
+
+    // collect subreads or hqregions
+    while (primaryIt_ != primaryQuery_->end() && currentHoleNumber == (*primaryIt_).HoleNumber()) {
+        bamRecordVec.push_back(*primaryIt_++);
+    }
+
+    // collect scraps
+    while (scrapsIt_ != scrapsQuery_->end() && currentHoleNumber == (*scrapsIt_).HoleNumber()) {
+        bamRecordVec.push_back(*scrapsIt_++);
+    }
+
+    return bamRecordVec;
+}
+
+/// \returns the header of the primary %BAM file
+BamHeader VirtualZmwReader::PrimaryHeader() const
+{
+    return primaryBamFile_->Header();
+}
+
+/// \returns the header of the scraps %BAM file
+BamHeader VirtualZmwReader::ScrapsHeader() const
+{
+    return scrapsBamFile_->Header();
+}
+
+/// \returns a copy of the header prepared for stitched records
+BamHeader VirtualZmwReader::StitchedHeader() const
+{
+    return *stitchedHeader_;
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/VirtualZmwReader.h b/src/VirtualZmwReader.h

new file mode 100644 (file)

index 0000000..acd6a77
--- /dev/null
+++ b/src/VirtualZmwReader.h
@@ -0,0 +1,90 @@
+// File Description
+/// \file VirtualZmwReader.h
+/// \brief Defines the VirtualZmwReader class.
+//
+// Author: Armin Töpfer
+
+#ifndef VIRTUALZMWREADER_H
+#define VIRTUALZMWREADER_H
+
+#include "pbbam/Config.h"
+
+#include <memory>
+
+#include "pbbam/BamFile.h"
+#include "pbbam/BamRecord.h"
+#include "pbbam/EntireFileQuery.h"
+#include "pbbam/PbiFilter.h"
+#include "pbbam/PbiFilterQuery.h"
+#include "pbbam/virtual/VirtualZmwBamRecord.h"
+
+namespace PacBio {
+namespace BAM {
+
+class VirtualZmwReader
+{
+public:
+    /// \brief Creates a reader that will operate on a primary %BAM file (e.g.
+    ///        subread data) and a scraps file, consuming all reads.
+    ///
+    /// \param[in] primaryBamFilepath hqregion.bam or subreads.bam file path
+    /// \param[in] scrapsBamFilepath  scraps.bam file path
+    ///
+    VirtualZmwReader(const std::string& primaryBamFilepath, const std::string& scrapsBamFilepath);
+
+    /// \brief Creates a reader that will operate on a primary %BAM file (e.g.
+    ///        subread data) and a scraps file, respecting the provided PBI
+    ///        filter.
+    ///
+    /// \note All %BAM files must have a corresponding ".pbi" index file to use
+    ///       the filter. You may need to call BamFile::EnsurePacBioIndexExists
+    ///       before constructing the reader.
+    ///
+    /// \param[in] primaryBamFilepath hqregion.bam or subreads.bam file path
+    /// \param[in] scrapsBamFilepath  scraps.bam file path
+    /// \param[in] filter PBI filter criteria
+    ///
+    VirtualZmwReader(const std::string& primaryBamFilepath, const std::string& scrapsBamFilepath,
+                     const PbiFilter& filter);
+
+    VirtualZmwReader() = delete;
+    VirtualZmwReader(const VirtualZmwReader&) = delete;
+    VirtualZmwReader(VirtualZmwReader&&) = delete;
+    VirtualZmwReader& operator=(const VirtualZmwReader&) = delete;
+    VirtualZmwReader& operator=(VirtualZmwReader&&) = delete;
+    ~VirtualZmwReader();
+
+    /// \returns the BamHeader associated with this reader's "primary" %BAM file
+    BamHeader PrimaryHeader() const;
+
+    /// \returns the BamHeader associated with this reader's "scraps" %BAM file
+    BamHeader ScrapsHeader() const;
+
+    /// \returns the BamHeader associated with the newly stitched %BAM data
+    BamHeader StitchedHeader() const;
+
+    /// \returns true if more ZMWs are available for reading.
+    bool HasNext();
+
+    /// \returns the next stitched polymerase read
+    VirtualZmwBamRecord Next();
+
+    /// \returns the next set of reads that belong to one ZMW.
+    ///          This enables stitching records in a distinct thread.
+    ///
+    std::vector<BamRecord> NextRaw();
+
+private:
+    std::unique_ptr<BamFile> primaryBamFile_;  // primary (hqregion/subreads) file
+    std::unique_ptr<BamFile> scrapsBamFile_;   // scraps file
+    std::unique_ptr<internal::IQuery> primaryQuery_;  // query over the primary file
+    std::unique_ptr<internal::IQuery> scrapsQuery_;   // query over the scraps file
+    internal::IQuery::iterator primaryIt_;  // current position in primaryQuery_
+    internal::IQuery::iterator scrapsIt_;   // current position in scrapsQuery_
+    std::unique_ptr<BamHeader> stitchedHeader_;  // header handed to every stitched record
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // VIRTUALZMWREADER_H
diff --git a/src/WhitelistedZmwReadStitcher.cpp b/src/WhitelistedZmwReadStitcher.cpp

new file mode 100644 (file)

index 0000000..b8cf411
--- /dev/null
+++ b/src/WhitelistedZmwReadStitcher.cpp
@@ -0,0 +1,139 @@
+// File Description
+/// \file WhitelistedZmwReadStitcher.cpp
+/// \brief Implements the WhitelistedZmwReadStitcher class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/virtual/WhitelistedZmwReadStitcher.h"
+
+#include <cassert>
+#include <cstdint>
+
+#include "VirtualZmwReader.h"
+#include "pbbam/PbiIndexedBamReader.h"
+
+namespace PacBio {
+namespace BAM {
+
+class WhitelistedZmwReadStitcher::WhitelistedZmwReadStitcherPrivate
+{
+public:
+    WhitelistedZmwReadStitcherPrivate(const std::vector<int32_t>& zmwWhitelist,
+                                      const std::string& primaryBamFilePath,
+                                      const std::string& scrapsBamFilePath)
+        : primaryBamFile_{std::make_unique<BamFile>(primaryBamFilePath)}
+        , scrapsBamFile_{std::make_unique<BamFile>(scrapsBamFilePath)}
+        , primaryReader_{std::make_unique<PbiIndexedBamReader>(*primaryBamFile_)}
+        , scrapsReader_{std::make_unique<PbiIndexedBamReader>(*scrapsBamFile_)}
+    {
+        // setup new header for stitched data
+        polyHeader_ = std::make_unique<BamHeader>(primaryBamFile_->Header().ToSam());
+        auto readGroups = polyHeader_->ReadGroups();
+        if (readGroups.empty())
+            throw std::runtime_error{
+                "WhitelistedZmwStitcher: no read groups in header of the primary BAM file: " +
+                primaryBamFilePath};
+        readGroups[0].ReadType("POLYMERASE");
+        readGroups[0].Id(readGroups[0].MovieName(), "POLYMERASE");
+        if (readGroups.size() > 1) {
+            std::vector<ReadGroupInfo> singleGroup;
+            singleGroup.emplace_back(std::move(readGroups[0]));
+            readGroups = std::move(singleGroup);
+            polyHeader_->ClearReadGroups();
+        }
+        polyHeader_->ReadGroups(readGroups);
+
+        // remove ZMWs up front, that are not found in either file
+        PreFilterZmws(zmwWhitelist);
+    }
+
+    bool HasNext() const { return !zmwWhitelist_.empty(); }
+
+    VirtualZmwBamRecord Next()
+    {
+        auto bamRecordVec = NextRaw();
+        return {std::move(bamRecordVec), *polyHeader_};
+    }
+
+    std::vector<BamRecord> NextRaw()
+    {
+        std::vector<BamRecord> result;
+        if (!HasNext()) return result;
+
+        const auto& zmw = zmwWhitelist_.front();
+        primaryReader_->Filter(PbiZmwFilter{zmw});
+        scrapsReader_->Filter(PbiZmwFilter{zmw});
+
+        BamRecord record;
+        while (primaryReader_->GetNext(record))
+            result.push_back(record);
+        while (scrapsReader_->GetNext(record))
+            result.push_back(record);
+
+        zmwWhitelist_.pop_front();
+        return result;
+    }
+
+    BamHeader PrimaryHeader() const { return primaryBamFile_->Header(); }
+
+    BamHeader ScrapsHeader() const { return scrapsBamFile_->Header(); }
+
+private:
+    std::unique_ptr<BamFile> primaryBamFile_;
+    std::unique_ptr<BamFile> scrapsBamFile_;
+    std::unique_ptr<PbiIndexedBamReader> primaryReader_;
+    std::unique_ptr<PbiIndexedBamReader> scrapsReader_;
+    std::unique_ptr<BamHeader> polyHeader_;
+    std::deque<int32_t> zmwWhitelist_;
+
+    void PreFilterZmws(const std::vector<int32_t>& zmwWhitelist)
+    {
+        // fetch input ZMWs
+        const PbiRawData primaryIndex{primaryBamFile_->PacBioIndexFilename()};
+        const PbiRawData scrapsIndex{scrapsBamFile_->PacBioIndexFilename()};
+        const auto& primaryZmws = primaryIndex.BasicData().holeNumber_;
+        const auto& scrapsZmws = scrapsIndex.BasicData().holeNumber_;
+
+        // toss them all into a set (for uniqueness & lookup here soon)
+        std::set<int32_t> inputZmws;
+        for (const auto& zmw : primaryZmws)
+            inputZmws.insert(zmw);
+        for (const auto& zmw : scrapsZmws)
+            inputZmws.insert(zmw);
+
+        // check our requested whitelist against files' ZMWs, keep if found
+        const auto inputEnd = inputZmws.cend();
+        for (const int32_t zmw : zmwWhitelist) {
+            if (inputZmws.find(zmw) != inputEnd) zmwWhitelist_.push_back(zmw);
+        }
+    }
+};
+
+// --------------------------------
+// WhitelistedZmwReadStitcher implementation
+// --------------------------------
+
+WhitelistedZmwReadStitcher::WhitelistedZmwReadStitcher(const std::vector<int32_t>& zmwWhitelist,
+                                                       const std::string& primaryBamFilePath,
+                                                       const std::string& scrapsBamFilePath)
+    : d_{std::make_unique<WhitelistedZmwReadStitcherPrivate>(zmwWhitelist, primaryBamFilePath,
+                                                             scrapsBamFilePath)}  // all setup happens in the private class
+{
+}
+
+WhitelistedZmwReadStitcher::~WhitelistedZmwReadStitcher() = default;  // defined after the private class above
+
+/// \returns the private implementation's HasNext() result
+bool WhitelistedZmwReadStitcher::HasNext() const
+{
+    return d_->HasNext();
+}
+
+/// \returns the private implementation's Next() result
+VirtualZmwBamRecord WhitelistedZmwReadStitcher::Next()
+{
+    return d_->Next();
+}
+
+/// \returns the private implementation's NextRaw() result
+std::vector<BamRecord> WhitelistedZmwReadStitcher::NextRaw()
+{
+    return d_->NextRaw();
+}
+
+/// \returns the private implementation's PrimaryHeader() result
+BamHeader WhitelistedZmwReadStitcher::PrimaryHeader() const
+{
+    return d_->PrimaryHeader();
+}
+
+/// \returns the private implementation's ScrapsHeader() result
+BamHeader WhitelistedZmwReadStitcher::ScrapsHeader() const
+{
+    return d_->ScrapsHeader();
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/XmlReader.cpp b/src/XmlReader.cpp

new file mode 100644 (file)

index 0000000..b2b1317
--- /dev/null
+++ b/src/XmlReader.cpp
@@ -0,0 +1,236 @@
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "XmlReader.h"
+
+#include <cassert>
+#include <fstream>
+#include <iostream>
+#include <memory>
+#include <typeinfo>
+#include <vector>
+
+#include "pbbam/StringUtilities.h"
+#include "pugixml/pugixml.hpp"
+
+using DataSetElement = PacBio::BAM::internal::DataSetElement;
+using FromInputXml = PacBio::BAM::internal::FromInputXml;
+
+namespace PacBio {
+namespace BAM {
+namespace {
+
+std::unique_ptr<DataSetBase> MakeDataSetBase(const pugi::xml_node& xmlNode)
+{
+    const FromInputXml fromInputXml{};
+    std::string name = xmlNode.name();
+    const auto foundColon = name.find(':');
+    if (foundColon != std::string::npos) {
+        name = name.substr(foundColon + 1);
+    }
+
+    const auto type = ElementTypeFromName(name);
+    switch (type) {
+        case XmlElementType::ALIGNMENT_SET:
+            return std::make_unique<AlignmentSet>(fromInputXml);
+        case XmlElementType::BARCODE_SET:
+            return std::make_unique<BarcodeSet>(fromInputXml);
+        case XmlElementType::CONSENSUS_ALIGNMENT_SET:
+            return std::make_unique<ConsensusAlignmentSet>(fromInputXml);
+        case XmlElementType::CONSENSUS_READ_SET:
+            return std::make_unique<ConsensusReadSet>(fromInputXml);
+        case XmlElementType::CONTIG_SET:
+            return std::make_unique<ContigSet>(fromInputXml);
+        case XmlElementType::HDF_SUBREAD_SET:
+            return std::make_unique<HdfSubreadSet>(fromInputXml);
+        case XmlElementType::REFERENCE_SET:
+            return std::make_unique<ReferenceSet>(fromInputXml);
+        case XmlElementType::SUBREAD_SET:
+            return std::make_unique<SubreadSet>(fromInputXml);
+        case XmlElementType::TRANSCRIPT_SET:
+            return std::make_unique<TranscriptSet>(fromInputXml);
+        case XmlElementType::TRANSCRIPT_ALIGNMENT_SET:
+            return std::make_unique<TranscriptAlignmentSet>(fromInputXml);
+        case XmlElementType::GENERIC_DATASET:
+            return std::make_unique<DataSetBase>(fromInputXml);
+        default:
+            // unreachable
+            throw std::runtime_error{"XmlReader: unknown data set label: " + name};
+    }
+}
+
+std::shared_ptr<DataSetElement> MakeElement(const pugi::xml_node& xmlNode)
+{
+    std::string name = xmlNode.name();
+    const auto foundColon = name.find(':');
+    if (foundColon != std::string::npos) {
+        name = name.substr(foundColon + 1);
+    }
+
+    const FromInputXml fromInputXml{};
+    const auto type = ElementTypeFromName(name);
+    switch (type) {
+        case XmlElementType::DATASET_METADATA:
+            return std::make_shared<DataSetMetadata>(fromInputXml);
+        case XmlElementType::BIOSAMPLE:
+            return std::make_shared<BioSample>("", fromInputXml);
+        case XmlElementType::BIOSAMPLES:
+            return std::make_shared<BioSamples>(fromInputXml);
+        case XmlElementType::DNA_BARCODE:
+            return std::make_shared<DNABarcode>("", fromInputXml);
+        case XmlElementType::DNA_BARCODES:
+            return std::make_shared<DNABarcodes>(fromInputXml);
+        case XmlElementType::EXTENSION:
+            return std::make_shared<ExtensionElement>(fromInputXml);
+        case XmlElementType::EXTENSIONS:
+            return std::make_shared<Extensions>(fromInputXml);
+        case XmlElementType::EXTERNAL_RESOURCE:
+            return std::make_shared<ExternalResource>("", "", fromInputXml);
+        case XmlElementType::EXTERNAL_RESOURCES:
+            return std::make_shared<ExternalResources>(fromInputXml);
+        case XmlElementType::FILE_INDEX:
+            return std::make_shared<FileIndex>("", "", fromInputXml);
+        case XmlElementType::FILE_INDICES:
+            return std::make_shared<FileIndices>(fromInputXml);
+        case XmlElementType::FILTER:
+            return std::make_shared<Filter>(fromInputXml);
+        case XmlElementType::FILTERS:
+            return std::make_shared<Filters>(fromInputXml);
+        case XmlElementType::PARENT_TOOL:
+            return std::make_shared<ParentTool>(fromInputXml);
+        case XmlElementType::PROPERTY:
+            return std::make_shared<Property>("", "", "", fromInputXml);
+        case XmlElementType::PROPERTIES:
+            return std::make_shared<Properties>(fromInputXml);
+        case XmlElementType::PROVENANCE:
+            return std::make_shared<Provenance>(fromInputXml);
+        case XmlElementType::ALIGNMENT_SET:
+            return std::make_shared<AlignmentSet>(fromInputXml);
+        case XmlElementType::BARCODE_SET:
+            return std::make_shared<BarcodeSet>(fromInputXml);
+        case XmlElementType::CONSENSUS_ALIGNMENT_SET:
+            return std::make_shared<ConsensusAlignmentSet>(fromInputXml);
+        case XmlElementType::CONSENSUS_READ_SET:
+            return std::make_shared<ConsensusReadSet>(fromInputXml);
+        case XmlElementType::CONTIG_SET:
+            return std::make_shared<ContigSet>(fromInputXml);
+        case XmlElementType::HDF_SUBREAD_SET:
+            return std::make_shared<HdfSubreadSet>(fromInputXml);
+        case XmlElementType::SUBREAD_SET:
+            return std::make_shared<SubreadSet>(fromInputXml);
+        case XmlElementType::REFERENCE_SET:
+            return std::make_shared<ReferenceSet>(fromInputXml);
+        case XmlElementType::TRANSCRIPT_SET:
+            return std::make_shared<TranscriptSet>(fromInputXml);
+        case XmlElementType::TRANSCRIPT_ALIGNMENT_SET:
+            return std::make_shared<TranscriptAlignmentSet>(fromInputXml);
+        case XmlElementType::SUBDATASETS:
+            return std::make_shared<SubDataSets>(fromInputXml);
+        case XmlElementType::GENERIC_DATASET:
+            return std::make_shared<DataSetBase>(fromInputXml);
+        case XmlElementType::GENERIC_ELEMENT:
+            return std::make_shared<DataSetElement>(name, fromInputXml);
+        default:
+            // unreachable
+            throw std::runtime_error{"XmlReader: unknown data element label: " + name};
+    }
+}
+
+void UpdateRegistry(const std::string& attributeName, const std::string& attributeValue,
+                    NamespaceRegistry& registry)
+{
+    std::vector<std::string> nameParts = Split(attributeName, ':');
+    assert(!nameParts.empty());
+    if (nameParts.size() > 2)
+        throw std::runtime_error{"XmlReader: malformed xmlns attribute: " + attributeName};
+
+    const bool isDefault = (nameParts.size() == 1);
+    const XsdType xsd = registry.XsdForUri(attributeValue);
+
+    if (isDefault)
+        registry.SetDefaultXsd(xsd);
+    else {
+        assert(nameParts.size() == 2);
+        const std::string& name = nameParts.at(1);
+        const std::string& uri = attributeValue;
+        NamespaceInfo namespaceInfo(name, uri);
+        registry.Register(xsd, namespaceInfo);
+    }
+}
+
+void FromXml(const pugi::xml_node& xmlNode, DataSetElement& parent)
+{
+    // ignore non-named XML nodes
+    //
+    // pugi::xml separates XML parts into more node types than we use
+    //
+    const std::string label = xmlNode.name();
+    if (label.empty()) return;
+
+    auto e = MakeElement(xmlNode);
+    e->Label(xmlNode.name());
+    e->Text(xmlNode.text().get());
+
+    // iterate attributes
+    auto attrIter = xmlNode.attributes_begin();
+    auto attrEnd = xmlNode.attributes_end();
+    for (; attrIter != attrEnd; ++attrIter)
+        e->Attribute(attrIter->name(), attrIter->value());
+
+    // iterate children, recursively building up subtree
+    auto childIter = xmlNode.begin();
+    auto childEnd = xmlNode.end();
+    for (; childIter != childEnd; ++childIter) {
+        pugi::xml_node childNode = *childIter;
+        FromXml(childNode, *e.get());
+    }
+
+    parent.AddChild(e);
+}
+
+}  // namespace
+
+std::unique_ptr<DataSetBase> XmlReader::FromStream(std::istream& in)
+{
+    pugi::xml_document doc;
+    const pugi::xml_parse_result loadResult = doc.load(in);
+    if (loadResult.status != pugi::status_ok)
+        throw std::runtime_error{"XmlReader: could not read XML file, error code:" +
+                                 std::to_string(loadResult.status)};
+
+    // parse top-level attributes
+    pugi::xml_node rootNode = doc.document_element();
+    if (rootNode == pugi::xml_node())
+        throw std::runtime_error{"XmlReader: could not fetch XML root node"};
+
+    // create dataset matching type strings
+    auto dataset = MakeDataSetBase(rootNode);
+    dataset->Label(rootNode.name());
+
+    // iterate attributes, capture namespace info
+    const std::string xmlnsPrefix("xmlns");
+    auto attrIter = rootNode.attributes_begin();
+    auto attrEnd = rootNode.attributes_end();
+    for (; attrIter != attrEnd; ++attrIter) {
+        const std::string name = attrIter->name();
+        const std::string value = attrIter->value();
+        dataset->Attribute(name, value);
+        if (name.find(xmlnsPrefix) == 0) {
+            UpdateRegistry(name, value, dataset->Namespaces());
+        }
+    }
+
+    // iterate children, recursively building up subtree
+    auto childIter = rootNode.begin();
+    auto childEnd = rootNode.end();
+    for (; childIter != childEnd; ++childIter) {
+        pugi::xml_node childNode = *childIter;
+        FromXml(childNode, *dataset.get());
+    }
+
+    return dataset;
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/XmlReader.h b/src/XmlReader.h

new file mode 100644 (file)

index 0000000..352dc40
--- /dev/null
+++ b/src/XmlReader.h
@@ -0,0 +1,25 @@
+// Author: Derek Barnett
+
+#ifndef XMLREADER_H
+#define XMLREADER_H
+
+#include "pbbam/Config.h"
+
+#include <iosfwd>
+#include <memory>
+
+#include "pbbam/DataSet.h"
+
+namespace PacBio {
+namespace BAM {
+
+class XmlReader
+{
+public:
+    static std::unique_ptr<DataSetBase> FromStream(std::istream& in);  // parses dataset XML read from 'in'
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // XMLREADER_H
diff --git a/src/XmlWriter.cpp b/src/XmlWriter.cpp

new file mode 100644 (file)

index 0000000..24fbd69
--- /dev/null
+++ b/src/XmlWriter.cpp
@@ -0,0 +1,167 @@
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "XmlWriter.h"
+
+#include <cstddef>
+#include <fstream>
+#include <iostream>
+#include <map>
+
+#include "pbbam/DataSet.h"
+
+#include "FileUtils.h"
+#include "pugixml/pugixml.hpp"
+
+using DataSetElement = PacBio::BAM::internal::DataSetElement;
+
+namespace PacBio {
+namespace BAM {
+namespace {
+
+// Returns the part of 'input' before its first colon, or an empty string if
+// there is no colon or the colon is the first character.
+//
+std::string Prefix(const std::string& input)
+{
+    const auto colonPos = input.find(':');
+    const bool hasPrefix = (colonPos != std::string::npos) && (colonPos != 0);
+    return hasPrefix ? input.substr(0, colonPos) : std::string{};
+}
+
+std::string OutputName(const DataSetElement& node, const NamespaceRegistry& registry)
+{
+    // if from input XML, respect the namespaces given
+    if (node.IsVerbatimLabel()) return node.QualifiedNameLabel();
+
+    // otherwise, probably user-generated
+    else {
+        // if no namespace prefix, prepend the appropriate one & return
+        if (node.PrefixLabel().empty()) {
+            static const std::string colon = ":";
+            auto xsdType = node.Xsd();
+            if (xsdType == XsdType::NONE)
+                xsdType = registry.XsdForElement(node.LocalNameLabel().to_string());
+            return registry.Namespace(xsdType).Name() + colon + node.LocalNameLabel().to_string();
+        }
+        // otherwise, has prefix - return full name
+        else
+            return node.QualifiedNameLabel();
+    }
+}
+
+void ToXml(const DataSetElement& node, const NamespaceRegistry& registry,
+           std::map<XsdType, std::string>& xsdPrefixesUsed, pugi::xml_node& parentXml,
+           const DataSetBase& dataset)
+{
+    // create child of parent, w/ label & text
+    const auto label = OutputName(node, registry);
+    if (label.empty()) return;  // error?
+    auto xmlNode = parentXml.append_child(label.c_str());
+
+    if (!node.Text().empty()) xmlNode.text().set(node.Text().c_str());
+
+    // store XSD type for later
+    const auto prefix = Prefix(label);
+    if (!prefix.empty()) xsdPrefixesUsed[node.Xsd()] = prefix;
+
+    // add attributes
+    for (const auto& attribute : node.Attributes()) {
+        const auto& name = attribute.first;
+        if (name.empty()) continue;
+
+        auto attr = xmlNode.append_attribute(name.c_str());
+        std::string value = attribute.second.c_str();
+        // "absolutize" any paths, except relative paths from verbatim input XML
+        if (!dataset.FromInputXml() && name == "ResourceId")
+            value = FileUtils::ResolvedFilePath(value, dataset.Path());
+        attr.set_value(value.c_str());
+    }
+
+    // additional stuff later? (e.g. comments)
+
+    // iterate children, recursively building up subtree
+    for (const auto& child : node.Children())
+        ToXml(*child, registry, xsdPrefixesUsed, xmlNode, dataset);
+}
+
+}  // namespace
+
+void XmlWriter::ToStream(const DataSetBase& dataset, std::ostream& out)
+{
+    pugi::xml_document doc;
+
+    const auto& registry = dataset.Namespaces();
+
+    // create top-level dataset XML node
+    const auto label = OutputName(dataset, registry);
+    if (label.empty()) throw std::runtime_error{"XmlReader: could not convert dataset node to XML"};
+    auto root = doc.append_child(label.c_str());
+
+    const auto& text = dataset.Text();
+    if (!text.empty()) root.text().set(text.c_str());
+
+    // add top-level attributes
+    for (const auto& attribute : dataset.Attributes()) {
+        const auto& name = attribute.first;
+        const auto& value = attribute.second;
+        if (name.empty()) continue;
+        auto attr = root.append_attribute(name.c_str());
+        attr.set_value(value.c_str());
+    }
+
+    std::map<XsdType, std::string> xsdPrefixesUsed;
+    xsdPrefixesUsed[dataset.Xsd()] = Prefix(label);
+
+    // iterate children, recursively building up subtree
+    for (const auto& child : dataset.Children())
+        ToXml(*child, registry, xsdPrefixesUsed, root, dataset);
+
+    // write XML to stream
+    auto decl = doc.prepend_child(pugi::node_declaration);
+    decl.append_attribute("version") = "1.0";
+    decl.append_attribute("encoding") = "utf-8";
+
+    // add XSD namespace attributes
+    auto xmlnsDefaultAttribute = root.attribute("xmlns");
+    if (xmlnsDefaultAttribute.empty()) {
+        xmlnsDefaultAttribute = root.append_attribute("xmlns");
+        xmlnsDefaultAttribute.set_value(registry.DefaultNamespace().Uri().c_str());
+    }
+    auto xsiAttribute = root.attribute("xmlns:xsi");
+    if (xsiAttribute.empty()) {
+        xsiAttribute = root.append_attribute("xmlns:xsi");
+        xsiAttribute.set_value("http://www.w3.org/2001/XMLSchema-instance");
+    }
+    auto xsiSchemaLocationAttribute = root.attribute("xsi:schemaLocation");
+    if (xsiSchemaLocationAttribute.empty()) {
+        xsiSchemaLocationAttribute = root.append_attribute("xsi:schemaLocation");
+        xsiSchemaLocationAttribute.set_value(registry.DefaultNamespace().Uri().c_str());
+    }
+
+    static const std::string xmlnsPrefix = "xmlns:";
+    for (const auto prefixIter : xsdPrefixesUsed) {
+        const auto& xsdType = prefixIter.first;
+        const auto& prefix = prefixIter.second;
+        if (xsdType == XsdType::NONE || prefix.empty()) continue;
+
+        const auto& nsInfo = registry.Namespace(xsdType);
+        assert(nsInfo.Name() == prefix);
+        const auto xmlnsName = xmlnsPrefix + prefix;
+        auto xmlnsAttribute = root.attribute(xmlnsName.c_str());
+        if (xmlnsAttribute.empty()) {
+            xmlnsAttribute = root.append_attribute(xmlnsName.c_str());
+            xmlnsAttribute.set_value(nsInfo.Uri().c_str());
+        }
+    }
+
+    // "no escapes" to allow explicit ">" "<" comparison operators in filter parameters
+    // we may remove this if/when comparison is separated from the value
+    doc.save(out, "\t", pugi::format_default | pugi::format_no_escapes, pugi::encoding_utf8);
+}
+
+/// \brief Convenience overload for a dataset held by unique_ptr.
+///
+/// \param[in]  dataset  dataset to write; must not be null
+/// \param[out] out      destination stream
+/// \throws std::runtime_error if \p dataset is null
+///
+void XmlWriter::ToStream(const std::unique_ptr<DataSetBase>& dataset, std::ostream& out)
+{
+    // dereferencing a null unique_ptr is undefined behavior; report it instead
+    if (!dataset) throw std::runtime_error{"XmlWriter: cannot write a null dataset"};
+    ToStream(*dataset, out);
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/XmlWriter.h b/src/XmlWriter.h

new file mode 100644 (file)

index 0000000..5a3f481
--- /dev/null
+++ b/src/XmlWriter.h
@@ -0,0 +1,26 @@
+// Author: Derek Barnett
+
+#ifndef XMLWRITER_H
+#define XMLWRITER_H
+
+#include "pbbam/Config.h"
+
+#include <iosfwd>
+#include <memory>
+
+namespace PacBio {
+namespace BAM {
+
+class DataSetBase;
+
+class XmlWriter
+{
+public:
+    static void ToStream(const DataSetBase& dataset, std::ostream& out);
+    static void ToStream(const std::unique_ptr<DataSetBase>& dataset, std::ostream& out);
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // XMLWRITER_H
diff --git a/src/ZmwChunkedFastaReader.cpp b/src/ZmwChunkedFastaReader.cpp

new file mode 100644 (file)

index 0000000..be061d1
--- /dev/null
+++ b/src/ZmwChunkedFastaReader.cpp
@@ -0,0 +1,146 @@
+// File Description
+/// \file ZmwChunkedFastaReader.cpp
+/// \brief Implements the ZmwChunkedFastaReader class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/ZmwChunkedFastaReader.h"
+
+#include <cstdio>
+
+#include <algorithm>
+#include <cassert>
+#include <stdexcept>
+#include <string>
+#include <type_traits>
+#include <vector>
+
+#include <htslib/kseq.h>
+
+#include "pbbam/FaiIndex.h"
+#include "pbbam/FormatUtils.h"
+
+#include "MemoryUtils.h"
+#include "ZmwChunkedFastxBgzfReader.h"
+#include "ZmwChunkedFastxReaderImpl.h"
+#include "ZmwChunkedFastxTextReader.h"
+
+namespace PacBio {
+namespace BAM {
+namespace {
+
+std::unique_ptr<ZmwChunkedFastxReaderImpl> MakeFastaReaderImpl(std::string filename,
+                                                               const size_t numChunks)
+{
+    // validate extension
+    if (!FormatUtils::IsFastaFilename(filename)) {
+        throw std::runtime_error{"ZmwChunkedFastaReader: filename '" + filename +
+                                 "' is not recognized as a FASTA file."};
+    }
+
+    // determine subsequence "loader" from compression type: plain-text, bgzf, or unsupported
+    const auto compressionType = FormatUtils::CompressionType(filename);
+    switch (compressionType) {
+
+        case HtslibCompression::NONE:
+            return std::make_unique<ZmwChunkedFastxTextReader>(std::move(filename), numChunks);
+        case HtslibCompression::BGZIP:
+            return std::make_unique<ZmwChunkedFastxBgzfReader>(std::move(filename), numChunks);
+
+        case HtslibCompression::GZIP: {
+            std::ostringstream msg;
+            msg << "ZmwChunkedFastaReader: random-access is not supported for plain gzipped "
+                   "file "
+                << filename << "\n\n"
+                << "Compressed files must be bgzipped, with accompanying *.gzi "
+                   "index.\n\n"
+                << "To keep the original gzipped file unchanged:\n"
+                << "  $ gunzip -c " << filename << " > <unzipped_file>\n"
+                << "or discard the gzipped file:\n"
+                << "  $ gunzip " << filename << '\n'
+                << '\n'
+                << "Re-compress & create *.gzi index:\n"
+                << "  $ bgzip --index <unzipped_file>\n\n";
+            throw std::runtime_error{msg.str()};
+        }
+        default:
+            assert(false);  // should never get here, the way htslib currently determines type
+            throw std::runtime_error{
+                "ZmwChunkedFastaReader: could not determine compression type for file: " +
+                filename};
+    }
+}
+
+}  // namespace
+
+/// Private implementation: owns the underlying reader and tracks progress in the current chunk.
+class ZmwChunkedFastaReader::ZmwChunkedFastaReaderPrivate
+{
+public:
+    explicit ZmwChunkedFastaReaderPrivate(const std::string& fn, const size_t numChunks)
+        : reader_{MakeFastaReaderImpl(fn, numChunks)}  // 'fn' is const: copied, cannot be moved
+    {
+        assert(reader_->chunker_.NumChunks() != 0);
+        Chunk(0);
+    }
+
+    void Chunk(size_t chunkId)
+    {
+        const auto& chunk = reader_->chunker_.Chunk(chunkId);
+        remaining = chunk.NumRecords;
+        reader_->Seek(chunk.FirstSeqOffset);
+        currentChunkId_ = chunkId;
+        firstRecord = true;
+    }
+
+    bool GetNext(FastaSequence& record)
+    {
+        if (remaining == 0) return false;
+        record = reader_->ReadNextFasta(firstRecord);  // first record: name is not parsed
+        if (firstRecord) {
+            record.Name(reader_->chunker_.Chunk(currentChunkId_).FirstSeqName);
+            firstRecord = false;
+        }
+        --remaining;
+        return true;
+    }
+
+    std::unique_ptr<ZmwChunkedFastxReaderImpl> reader_;
+    size_t currentChunkId_ = 0;
+    bool firstRecord = true;  // true until the current chunk's first record has been read
+    size_t remaining = 0;     // records still to be returned from the current chunk
+};
+
+static_assert(!std::is_copy_constructible<ZmwChunkedFastaReader>::value,
+              "ZmwChunkedFastaReader(const ZmwChunkedFastaReader&) is not = delete");
+static_assert(!std::is_copy_assignable<ZmwChunkedFastaReader>::value,
+              "ZmwChunkedFastaReader& operator=(const ZmwChunkedFastaReader&) is not = delete");
+
+ZmwChunkedFastaReader::ZmwChunkedFastaReader(const std::string& fn, const size_t numChunks)
+    : internal::QueryBase<FastaSequence>{}
+    , d_{std::make_unique<ZmwChunkedFastaReaderPrivate>(fn, numChunks)}
+{
+}
+
+ZmwChunkedFastaReader::ZmwChunkedFastaReader(ZmwChunkedFastaReader&&) noexcept = default;
+
+ZmwChunkedFastaReader& ZmwChunkedFastaReader::operator=(ZmwChunkedFastaReader&&) noexcept = default;
+
+ZmwChunkedFastaReader::~ZmwChunkedFastaReader() = default;
+
+size_t ZmwChunkedFastaReader::NumChunks() const { return d_->reader_->chunker_.NumChunks(); }
+
+ZmwChunkedFastaReader& ZmwChunkedFastaReader::Chunk(size_t chunkId)
+{
+    d_->Chunk(chunkId);
+    return *this;
+}
+
+size_t ZmwChunkedFastaReader::Chunk() const { return d_->currentChunkId_; }
+
+bool ZmwChunkedFastaReader::GetNext(FastaSequence& record) { return d_->GetNext(record); }
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/ZmwChunkedFastqReader.cpp b/src/ZmwChunkedFastqReader.cpp

new file mode 100644 (file)

index 0000000..3bbcf2b
--- /dev/null
+++ b/src/ZmwChunkedFastqReader.cpp
@@ -0,0 +1,146 @@
+// File Description
+/// \file ZmwChunkedFastqReader.cpp
+/// \brief Implements the ZmwChunkedFastqReader class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/ZmwChunkedFastqReader.h"
+
+#include <cstdio>
+
+#include <algorithm>
+#include <cassert>
+#include <stdexcept>
+#include <string>
+#include <type_traits>
+#include <vector>
+
+#include <htslib/kseq.h>
+
+#include "pbbam/FaiIndex.h"
+#include "pbbam/FormatUtils.h"
+
+#include "MemoryUtils.h"
+#include "ZmwChunkedFastxBgzfReader.h"
+#include "ZmwChunkedFastxReaderImpl.h"
+#include "ZmwChunkedFastxTextReader.h"
+
+namespace PacBio {
+namespace BAM {
+namespace {
+
+/// Returns a text or BGZF chunked reader for a FASTQ file; throws std::runtime_error otherwise.
+std::unique_ptr<ZmwChunkedFastxReaderImpl> MakeFastqReaderImpl(std::string filename,
+                                                               const size_t numChunks)
+{
+    // validate extension
+    if (!FormatUtils::IsFastqFilename(filename)) {
+        throw std::runtime_error{"ZmwChunkedFastqReader: filename '" + filename +
+                                 "' is not recognized as a FASTQ file."};
+    }
+
+    // determine subsequence "loader" from compression type: plain-text, bgzf, or unsupported
+    const auto compressionType = FormatUtils::CompressionType(filename);
+    switch (compressionType) {
+        case HtslibCompression::NONE:
+            return std::make_unique<ZmwChunkedFastxTextReader>(std::move(filename), numChunks);
+        case HtslibCompression::BGZIP:
+            return std::make_unique<ZmwChunkedFastxBgzfReader>(std::move(filename), numChunks);
+
+        case HtslibCompression::GZIP: {
+            std::ostringstream msg;
+            msg << "ZmwChunkedFastqReader: random-access is not supported for plain gzipped "
+                   "file "
+                << filename << "\n\n"
+                << "Compressed files must be bgzipped, with accompanying *.gzi "
+                   "index.\n\n"
+                << "To keep the original gzipped file unchanged:\n"
+                << "  $ gunzip -c " << filename << " > <unzipped_file>\n"
+                << "or discard the gzipped file:\n"
+                << "  $ gunzip " << filename << '\n'
+                << '\n'
+                << "Re-compress & create *.gzi index:\n"
+                << "  $ bgzip --index <unzipped_file>\n\n";
+            throw std::runtime_error{msg.str()};
+        }
+        default:
+            assert(false);  // should never get here, the way htslib currently determines type
+            throw std::runtime_error{
+                "ZmwChunkedFastqReader: could not determine compression type for file: " +
+                filename};
+    }
+}
+
+}  // namespace
+
+/// Private implementation: owns the underlying reader and tracks progress in the current chunk.
+class ZmwChunkedFastqReader::ZmwChunkedFastqReaderPrivate
+{
+public:
+    explicit ZmwChunkedFastqReaderPrivate(const std::string& fn, const size_t numChunks)
+        : reader_{MakeFastqReaderImpl(fn, numChunks)}  // 'fn' is const: copied, cannot be moved
+    {
+        assert(reader_->chunker_.NumChunks() != 0);
+        Chunk(0);
+    }
+
+    void Chunk(size_t chunkId)
+    {
+        const auto& chunk = reader_->chunker_.Chunk(chunkId);
+        remaining = chunk.NumRecords;
+        reader_->Seek(chunk.FirstSeqOffset);
+        currentChunkId_ = chunkId;
+        firstRecord = true;
+    }
+
+    bool GetNext(FastqSequence& record)
+    {
+        if (remaining == 0) return false;
+        record = reader_->ReadNextFastq(firstRecord);  // first record: name is not parsed
+        if (firstRecord) {
+            record.Name(reader_->chunker_.Chunk(currentChunkId_).FirstSeqName);
+            firstRecord = false;
+        }
+        --remaining;
+        return true;
+    }
+
+    std::unique_ptr<ZmwChunkedFastxReaderImpl> reader_;
+    size_t currentChunkId_ = 0;
+    bool firstRecord = true;  // true until the current chunk's first record has been read
+    size_t remaining = 0;     // records still to be returned from the current chunk
+};
+
+static_assert(!std::is_copy_constructible<ZmwChunkedFastqReader>::value,
+              "ZmwChunkedFastqReader(const ZmwChunkedFastqReader&) is not = delete");
+static_assert(!std::is_copy_assignable<ZmwChunkedFastqReader>::value,
+              "ZmwChunkedFastqReader& operator=(const ZmwChunkedFastqReader&) is not = delete");
+
+ZmwChunkedFastqReader::ZmwChunkedFastqReader(const std::string& fn, const size_t numChunks)
+    : internal::QueryBase<FastqSequence>{}
+    , d_{std::make_unique<ZmwChunkedFastqReaderPrivate>(fn, numChunks)}
+{
+}
+
+ZmwChunkedFastqReader::ZmwChunkedFastqReader(ZmwChunkedFastqReader&&) noexcept = default;
+
+ZmwChunkedFastqReader& ZmwChunkedFastqReader::operator=(ZmwChunkedFastqReader&&) noexcept = default;
+
+ZmwChunkedFastqReader::~ZmwChunkedFastqReader() = default;
+
+size_t ZmwChunkedFastqReader::NumChunks() const { return d_->reader_->chunker_.NumChunks(); }
+
+ZmwChunkedFastqReader& ZmwChunkedFastqReader::Chunk(size_t chunkId)
+{
+    d_->Chunk(chunkId);
+    return *this;
+}
+
+size_t ZmwChunkedFastqReader::Chunk() const { return d_->currentChunkId_; }
+
+bool ZmwChunkedFastqReader::GetNext(FastqSequence& record) { return d_->GetNext(record); }
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/ZmwChunkedFastxBgzfReader.cpp b/src/ZmwChunkedFastxBgzfReader.cpp

new file mode 100644 (file)

index 0000000..7f81f18
--- /dev/null
+++ b/src/ZmwChunkedFastxBgzfReader.cpp
@@ -0,0 +1,157 @@
+// File Description
+/// \file ZmwChunkedFastxBgzfReader.cpp
+/// \brief Implements the ZmwChunkedFastxBgzfReader class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "ZmwChunkedFastxBgzfReader.h"
+
+#include <algorithm>
+#include <sstream>
+#include <stdexcept>
+
+namespace PacBio {
+namespace BAM {
+
+ZmwChunkedFastxBgzfReader::ZmwChunkedFastxBgzfReader(std::string filename, const size_t numChunks)
+    : ZmwChunkedFastxReaderImpl{std::move(filename), numChunks}
+    , file_{bgzf_open(fastxFilename_.c_str(), "r")}
+    , seq_{kseq_init(file_.get())}
+{
+    // check BGZF file handle
+    if (file_ == nullptr) {
+        std::ostringstream msg;
+        msg << "ZmwChunkedFastxBgzfReader: could not open file for reading\n"
+            << "  file: " << fastxFilename_ << '\n';
+        throw std::runtime_error{msg.str()};
+    }
+
+    // check kseq sequence handle
+    assert(seq_ != nullptr);
+
+    // load BGZF index data (*.gzi)
+    const auto result = bgzf_index_load(file_.get(), fastxFilename_.c_str(), ".gzi");
+    if (result != 0) {
+        std::ostringstream msg;
+        msg << "ZmwChunkedFastxBgzfReader: could not load bgzf index data\n"
+            << "  file: " << fastxFilename_ << '\n'
+            << "  index file: " << fastxFilename_ << ".gzi\n";
+        throw std::runtime_error{msg.str()};
+    }
+}
+
+int ZmwChunkedFastxBgzfReader::FetchRecord(bool skipName)
+{
+    // NOTE: kseq_read assumes it is at the beginning of "next" sequence's name.
+    //       However, here the file handle may already point to the first base after
+    //       seeking using FAI. So we optionally load the name.
+
+    int c;
+    kseq_t* seq = seq_.get();
+    kstream_t* ks = seq->f;
+    seq_->comment.l = seq_->seq.l = seq_->qual.l = 0; /* reset all members */
+
+    if (!skipName) {
+
+        if (seq->last_char == 0) { /* then jump to the next header line */
+            while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@')
+                ;
+            if (c == -1) return -1; /* end of file */
+            seq->last_char = c;
+        } /* else: the first header char has been read in the previous call */
+
+        if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1;         /* normal exit: EOF */
+        if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */
+    }
+
+    if (seq_->seq.s == 0) { /* we can do this in the loop below, but that is slower */
+        seq_->seq.m = 256;
+        seq_->seq.s = (char*)malloc(seq_->seq.m);
+    }
+    while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') {
+        if (c == '\n') continue;        /* skip empty lines */
+        seq_->seq.s[seq_->seq.l++] = c; /* this is safe: we always have enough space for 1 char */
+        ks_getuntil2(ks, KS_SEP_LINE, &seq_->seq, 0, 1); /* read the rest of the line */
+    }
+
+    if (c == '>' || c == '@') seq_->last_char = c; /* the first header char has been read */
+    if (seq_->seq.l + 1 >=
+        seq_->seq.m) { /* seq_->seq.s[seq_->seq.l] below may be out of boundary */
+        seq_->seq.m = seq_->seq.l + 2;
+        kroundup32(seq_->seq.m); /* rounded to the next closest 2^k */
+        seq_->seq.s = (char*)realloc(seq_->seq.s, seq_->seq.m);
+    }
+    seq_->seq.s[seq_->seq.l] = 0; /* null terminated string */
+
+    if (c != '+') return seq_->seq.l; /* FASTA */
+    if (seq_->qual.m < seq_->seq.m) { /* allocate memory for qual in case insufficient */
+        seq_->qual.m = seq_->seq.m;
+        seq_->qual.s = (char*)realloc(seq_->qual.s, seq_->qual.m);
+    }
+
+    while ((c = ks_getc(ks)) != -1 && c != '\n')
+        ;                   /* skip the rest of '+' line */
+    if (c == -1) return -2; /* error: no quality string */
+    while (ks_getuntil2(ks, KS_SEP_LINE, &seq_->qual, 0, 1) >= 0 && seq_->qual.l < seq_->seq.l)
+        ;
+
+    seq_->last_char = 0; /* we have not come to the next header line */
+
+    if (seq_->seq.l != seq_->qual.l) return -2; /* error: qual string is of a different length */
+    return seq_->seq.l;
+}
+
+FastaSequence ZmwChunkedFastxBgzfReader::ReadNextFasta(bool skipName)
+{
+    // read sequence (FetchRecord returns a negative value on EOF or a malformed record)
+    const auto result = FetchRecord(skipName);
+    if (result < 0) {
+        std::ostringstream msg;
+        msg << "ZmwChunkedFastxBgzfReader: error reading from\n"
+            << "  file: " << fastxFilename_ << '\n'
+            << "  reason: likely truncated quality string\n";
+        throw std::runtime_error{msg.str()};
+    }
+
+    // return FASTA (name is left empty when the header was skipped)
+    std::string name = (skipName ? "" : std::string{seq_->name.s, seq_->name.l});
+    std::string bases{seq_->seq.s, seq_->seq.l};
+    return FastaSequence{std::move(name), std::move(bases)};
+}
+
+FastqSequence ZmwChunkedFastxBgzfReader::ReadNextFastq(bool skipName)
+{
+    // read sequence
+    const auto result = FetchRecord(skipName);
+    if (result < 0) {
+        std::ostringstream msg;
+        msg << "ZmwChunkedFastxBgzfReader: error reading from\n"
+            << "  file: " << fastxFilename_ << '\n'
+            << "  reason: likely truncated quality string\n";
+        throw std::runtime_error{msg.str()};
+    }
+
+    // return FASTQ
+    std::string name = (skipName ? "" : std::string{seq_->name.s, seq_->name.l});
+    std::string bases{seq_->seq.s, seq_->seq.l};
+    QualityValues quals{std::string{seq_->qual.s, seq_->qual.l}};
+    return FastqSequence{std::move(name), std::move(bases), std::move(quals)};
+}
+
+void ZmwChunkedFastxBgzfReader::Seek(uint64_t pos)
+{
+    // seek to offset 'pos' (a file position, not a sequence id) & reset kseq handle
+    auto result = bgzf_useek(file_.get(), pos, SEEK_SET);
+    if (result != 0) {
+        std::ostringstream msg;
+        msg << "ZmwChunkedFastxBgzfReader: could not seek to requested pos: " << pos << '\n'
+            << "  in file: " << fastxFilename_;
+        throw std::runtime_error{msg.str()};
+    }
+    ks_rewind(seq_->f);
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/ZmwChunkedFastxBgzfReader.h b/src/ZmwChunkedFastxBgzfReader.h

new file mode 100644 (file)

index 0000000..5426bd9
--- /dev/null
+++ b/src/ZmwChunkedFastxBgzfReader.h
@@ -0,0 +1,53 @@
+// File Description
+/// \file ZmwChunkedFastxBgzfReader.h
+/// \brief Defines the ZmwChunkedFastxBgzfReader class.
+//
+// Author: Derek Barnett
+
+#ifndef ZMWCHUNKEDFASTXBGZFREADER_H
+#define ZMWCHUNKEDFASTXBGZFREADER_H
+
+#include "pbbam/Config.h"
+
+#include "ZmwChunkedFastxReaderImpl.h"
+
+#include <memory>
+
+#include <htslib/kseq.h>
+
+#include "MemoryUtils.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// Chunked FASTA/FASTQ reader implementation that reads through a kseq stream on a BGZF handle.
+class ZmwChunkedFastxBgzfReader final : public ZmwChunkedFastxReaderImpl
+{
+public:
+    ZmwChunkedFastxBgzfReader(std::string filename, const size_t numChunks);
+
+    void Seek(uint64_t pos) final;
+    FastaSequence ReadNextFasta(bool skipName) final;
+    FastqSequence ReadNextFastq(bool skipName) final;
+
+private:
+    int FetchRecord(bool skipName);  // skipName=true: the header line is not parsed
+
+    // specialize kseq_t for BGZF handle
+    KSEQ_INIT(BGZF*, bgzf_read);
+    struct KSeqDeleter
+    {
+        void operator()(kseq_t* seq) const
+        {
+            if (seq) kseq_destroy(seq);
+        }
+    };
+
+    std::unique_ptr<BGZF, HtslibBgzfDeleter> file_;
+    std::unique_ptr<kseq_t, KSeqDeleter> seq_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // ZMWCHUNKEDFASTXBGZFREADER_H
+\ No newline at end of file
diff --git a/src/ZmwChunkedFastxReaderImpl.cpp b/src/ZmwChunkedFastxReaderImpl.cpp

new file mode 100644 (file)

index 0000000..fa57596
--- /dev/null
+++ b/src/ZmwChunkedFastxReaderImpl.cpp
@@ -0,0 +1,25 @@
+// File Description
+/// \file ZmwChunkedFastxReaderImpl.cpp
+/// \brief Implements the ZmwChunkedFastxReaderImpl class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "ZmwChunkedFastxReaderImpl.h"
+
+namespace PacBio {
+namespace BAM {
+
+ZmwChunkedFastxReaderImpl::ZmwChunkedFastxReaderImpl(std::string filename, const size_t numChunks)
+    : fastxFilename_{std::move(filename)}
+    , faiFilename_{fastxFilename_ + ".fai"}
+    , index_{faiFilename_}
+    , chunker_{index_, numChunks}
+{
+}
+
+ZmwChunkedFastxReaderImpl::~ZmwChunkedFastxReaderImpl() = default;
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/ZmwChunkedFastxReaderImpl.h b/src/ZmwChunkedFastxReaderImpl.h

new file mode 100644 (file)

index 0000000..c722b01
--- /dev/null
+++ b/src/ZmwChunkedFastxReaderImpl.h
@@ -0,0 +1,45 @@
+// File Description
+/// \file ZmwChunkedFastxReaderImpl.h
+/// \brief Defines the ZmwChunkedFastxReaderImpl class.
+//
+// Author: Derek Barnett
+
+#ifndef ZMWCHUNKEDFASTXREADERIMPL_H
+#define ZMWCHUNKEDFASTXREADERIMPL_H
+
+#include "pbbam/Config.h"
+
+#include <string>
+#include <utility>
+
+#include "pbbam/FaiIndex.h"
+#include "pbbam/FastaSequence.h"
+#include "pbbam/FastqSequence.h"
+
+#include "FaiZmwChunker.h"
+
+namespace PacBio {
+namespace BAM {
+
+class ZmwChunkedFastxReaderImpl
+{
+public:
+    virtual ~ZmwChunkedFastxReaderImpl();
+
+    virtual void Seek(uint64_t pos) = 0;
+    virtual FastaSequence ReadNextFasta(bool skipName) = 0;
+    virtual FastqSequence ReadNextFastq(bool skipName) = 0;
+
+    std::string fastxFilename_;
+    std::string faiFilename_;
+    FaiIndex index_;
+    FaiZmwChunker chunker_;
+
+protected:
+    ZmwChunkedFastxReaderImpl(std::string fastxFilename, const size_t numChunks);
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // ZMWCHUNKEDFASTXREADERIMPL_H
+\ No newline at end of file
diff --git a/src/ZmwChunkedFastxTextReader.cpp b/src/ZmwChunkedFastxTextReader.cpp

new file mode 100644 (file)

index 0000000..6b219f3
--- /dev/null
+++ b/src/ZmwChunkedFastxTextReader.cpp
@@ -0,0 +1,157 @@
+// File Description
+/// \file ZmwChunkedFastxTextReader.cpp
+/// \brief Implements the ZmwChunkedFastxTextReader class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "ZmwChunkedFastxTextReader.h"
+
+#include <unistd.h>
+#include <cassert>
+#include <cstdio>
+
+#include <iostream>
+#include <memory>
+#include <sstream>
+#include <stdexcept>
+
+namespace PacBio {
+namespace BAM {
+
+ZmwChunkedFastxTextReader::ZmwChunkedFastxTextReader(std::string filename, const size_t numChunks)
+    : ZmwChunkedFastxReaderImpl{std::move(filename), numChunks}
+    , file_{fopen(fastxFilename_.c_str(), "r")}
+    , seq_{kseq_init(file_.get())}
+{
+    // check file handle
+    if (file_ == nullptr) {
+        std::ostringstream msg;
+        msg << "ZmwChunkedFastxTextReader: could not open file for reading\n"
+            << "  file: " << fastxFilename_ << '\n';
+        throw std::runtime_error{msg.str()};
+    }
+
+    // check kseq sequence handle
+    assert(seq_ != nullptr);
+}
+
+int ZmwChunkedFastxTextReader::FetchRecord(bool skipName)
+{
+    // NOTE: kseq_read assumes it is at the beginning of "next" sequence's name.
+    //       However, here the file handle may already point to the first base after
+    //       seeking using FAI. So we optionally load the name.
+
+    int c;
+    kseq_t* seq = seq_.get();
+    kstream_t* ks = seq->f;
+    seq_->comment.l = seq_->seq.l = seq_->qual.l = 0; /* reset all members */
+
+    if (!skipName) {
+
+        if (seq->last_char == 0) { /* then jump to the next header line */
+            while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@')
+                ;
+            if (c == -1) return -1; /* end of file */
+            seq->last_char = c;
+        } /* else: the first header char has been read in the previous call */
+
+        if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1;         /* normal exit: EOF */
+        if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */
+    }
+
+    if (seq_->seq.s == 0) { /* we can do this in the loop below, but that is slower */
+        seq_->seq.m = 256;
+        seq_->seq.s = (char*)malloc(seq_->seq.m);
+    }
+    while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') {
+        if (c == '\n') continue;        /* skip empty lines */
+        seq_->seq.s[seq_->seq.l++] = c; /* this is safe: we always have enough space for 1 char */
+        ks_getuntil2(ks, KS_SEP_LINE, &seq_->seq, 0, 1); /* read the rest of the line */
+    }
+
+    if (c == '>' || c == '@') seq_->last_char = c; /* the first header char has been read */
+    if (seq_->seq.l + 1 >=
+        seq_->seq.m) { /* seq_->seq.s[seq_->seq.l] below may be out of boundary */
+        seq_->seq.m = seq_->seq.l + 2;
+        kroundup32(seq_->seq.m); /* rounded to the next closest 2^k */
+        seq_->seq.s = (char*)realloc(seq_->seq.s, seq_->seq.m);
+    }
+    seq_->seq.s[seq_->seq.l] = 0; /* null terminated string */
+
+    if (c != '+') return seq_->seq.l; /* FASTA */
+    if (seq_->qual.m < seq_->seq.m) { /* allocate memory for qual in case insufficient */
+        seq_->qual.m = seq_->seq.m;
+        seq_->qual.s = (char*)realloc(seq_->qual.s, seq_->qual.m);
+    }
+
+    while ((c = ks_getc(ks)) != -1 && c != '\n')
+        ;                   /* skip the rest of '+' line */
+    if (c == -1) return -2; /* error: no quality string */
+    while (ks_getuntil2(ks, KS_SEP_LINE, &seq_->qual, 0, 1) >= 0 && seq_->qual.l < seq_->seq.l)
+        ;
+
+    seq_->last_char = 0; /* we have not come to the next header line */
+
+    if (seq_->seq.l != seq_->qual.l) return -2; /* error: qual string is of a different length */
+    return seq_->seq.l;
+}
+
+int ZmwChunkedFastxTextReader::ReadFromFile(FILE* fp, void* data, size_t length)
+{
+    return static_cast<int>(std::fread(data, sizeof(uint8_t), length, fp));
+}
+
+FastaSequence ZmwChunkedFastxTextReader::ReadNextFasta(bool skipName)
+{
+    // read sequence (FetchRecord returns a negative value on EOF or a malformed record)
+    const auto result = FetchRecord(skipName);
+    if (result < 0) {
+        std::ostringstream msg;
+        msg << "ZmwChunkedFastxTextReader: error reading from\n"
+            << "  file: " << fastxFilename_ << '\n'
+            << "  reason: likely truncated quality string\n";
+        throw std::runtime_error{msg.str()};
+    }
+
+    // return FASTA (name is left empty when the header was skipped)
+    std::string name = (skipName ? "" : std::string{seq_->name.s, seq_->name.l});
+    std::string bases{seq_->seq.s, seq_->seq.l};
+    return FastaSequence{std::move(name), std::move(bases)};
+}
+
+FastqSequence ZmwChunkedFastxTextReader::ReadNextFastq(bool skipName)
+{
+    // read sequence
+    const auto result = FetchRecord(skipName);
+    if (result < 0) {
+        std::ostringstream msg;
+        msg << "ZmwChunkedFastxTextReader: error reading from\n"
+            << "  file: " << fastxFilename_ << '\n'
+            << "  reason: likely truncated quality string\n";
+        throw std::runtime_error{msg.str()};
+    }
+
+    // return FASTQ
+    std::string name = (skipName ? "" : std::string{seq_->name.s, seq_->name.l});
+    std::string bases{seq_->seq.s, seq_->seq.l};
+    Data::QualityValues quals{std::string{seq_->qual.s, seq_->qual.l}};
+    return FastqSequence{std::move(name), std::move(bases), std::move(quals)};
+}
+
+/// Moves the file position to byte offset \p pos; throws std::runtime_error if the seek fails.
+void ZmwChunkedFastxTextReader::Seek(uint64_t pos)
+{
+    // fseeko takes off_t, so large offsets are not narrowed to 'long' as they are with fseek
+    if (fseeko(file_.get(), static_cast<off_t>(pos), SEEK_SET) != 0) {
+        std::ostringstream msg;
+        msg << "ZmwChunkedFastxTextReader: could not seek to requested pos: " << pos << '\n'
+            << "  in file: " << fastxFilename_;
+        throw std::runtime_error{msg.str()};
+    }
+    ks_rewind(seq_->f);  // reset kseq handle so reading resumes from the new position
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/ZmwChunkedFastxTextReader.h b/src/ZmwChunkedFastxTextReader.h

new file mode 100644 (file)

index 0000000..6e91ada
--- /dev/null
+++ b/src/ZmwChunkedFastxTextReader.h
@@ -0,0 +1,59 @@
+// File Description
+/// \file ZmwChunkedFastxTextReader.h
+/// \brief Defines the ZmwChunkedFastxTextReader class.
+//
+// Author: Derek Barnett
+
+#ifndef ZMWCHUNKEDFASTXTEXTREADER_H
+#define ZMWCHUNKEDFASTXTEXTREADER_H
+
+#include "pbbam/Config.h"
+
+#include "ZmwChunkedFastxReaderImpl.h"
+
+#include <cstdio>
+
+#include <memory>
+
+#include <htslib/kseq.h>
+#include <pbcopper/utility/Deleters.h>
+
+namespace PacBio {
+namespace BAM {
+
+/// Chunked FASTA/FASTQ reader implementation that reads through a kseq stream on a FILE handle.
+class ZmwChunkedFastxTextReader final : public ZmwChunkedFastxReaderImpl
+{
+public:
+    ZmwChunkedFastxTextReader(std::string filename, const size_t numChunks);
+
+    void Seek(uint64_t pos) final;
+    FastaSequence ReadNextFasta(bool skipName) final;
+    FastqSequence ReadNextFastq(bool skipName) final;
+
+private:
+    int FetchRecord(bool skipName);  // skipName=true: the header line is not parsed
+
+    // kseq needs a '__read' function with this signature, so fread does not work
+    // in this case. gzread/bgzf_read match but we want better seek performance
+    // than gzstream and are specifically not using indexed BGZF
+    static int ReadFromFile(FILE* fp, void* data, size_t length);
+
+    // specialize kseq_t for FILE handle
+    KSEQ_INIT(FILE*, ReadFromFile)
+    struct KSeqDeleter
+    {
+        void operator()(kseq_t* seq) const
+        {
+            if (seq) kseq_destroy(seq);
+        }
+    };
+
+    std::unique_ptr<FILE, Utility::FileDeleter> file_;
+    std::unique_ptr<kseq_t, KSeqDeleter> seq_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // ZMWCHUNKEDFASTXTEXTREADER_H
+\ No newline at end of file
diff --git a/src/ZmwGroupQuery.cpp b/src/ZmwGroupQuery.cpp

new file mode 100644 (file)

index 0000000..8f89a01
--- /dev/null
+++ b/src/ZmwGroupQuery.cpp
@@ -0,0 +1,78 @@
+// File Description
+/// \file ZmwGroupQuery.cpp
+/// \brief Implements the ZmwGroupQuery class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/ZmwGroupQuery.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <deque>
+
+#include "MemoryUtils.h"
+#include "pbbam/BamRecord.h"
+#include "pbbam/CompositeBamReader.h"
+#include "pbbam/PbiFilterTypes.h"
+
+namespace PacBio {
+namespace BAM {
+
+class ZmwGroupQuery::ZmwGroupQueryPrivate
+{
+    using ReaderType = PbiFilterCompositeBamReader<Compare::Zmw>;
+
+public:
+    ZmwGroupQueryPrivate(const std::vector<int32_t>& zmwWhitelist, const DataSet& dataset)
+        : whitelist_(zmwWhitelist.cbegin(), zmwWhitelist.cend())
+    {
+        std::sort(whitelist_.begin(), whitelist_.end());
+        whitelist_.erase(std::unique(whitelist_.begin(), whitelist_.end()), whitelist_.end());
+
+        if (!whitelist_.empty()) {
+            reader_ = std::make_unique<ReaderType>(PbiZmwFilter{whitelist_.front()}, dataset);
+            whitelist_.pop_front();
+        }
+    }
+
+    bool GetNext(std::vector<BamRecord>& records)
+    {
+        records.clear();
+        if (!reader_) return false;
+
+        // get all records matching ZMW
+        BamRecord r;
+        while (reader_->GetNext(r))
+            records.push_back(r);
+
+        // set next ZMW (if any left)
+        if (!whitelist_.empty()) {
+            reader_->Filter(PbiZmwFilter{whitelist_.front()});
+            whitelist_.pop_front();
+        }
+
+        // otherwise destroy reader, next iteration will return false
+        else
+            reader_.reset();
+
+        return true;
+    }
+
+private:
+    std::deque<int32_t> whitelist_;
+    std::unique_ptr<ReaderType> reader_;
+};
+
+ZmwGroupQuery::ZmwGroupQuery(const std::vector<int32_t>& zmwWhitelist, const DataSet& dataset)
+    : internal::IGroupQuery(), d_{std::make_unique<ZmwGroupQueryPrivate>(zmwWhitelist, dataset)}
+{
+}
+
+ZmwGroupQuery::~ZmwGroupQuery() = default;
+
+bool ZmwGroupQuery::GetNext(std::vector<BamRecord>& records) { return d_->GetNext(records); }
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/ZmwQuery.cpp b/src/ZmwQuery.cpp

new file mode 100644 (file)

index 0000000..6048629
--- /dev/null
+++ b/src/ZmwQuery.cpp
@@ -0,0 +1,40 @@
+// File Description
+/// \file ZmwQuery.cpp
+/// \brief Implements the ZmwQuery class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/ZmwQuery.h"
+
+#include <cstdint>
+
+#include "pbbam/CompositeBamReader.h"
+#include "pbbam/PbiFilterTypes.h"
+
+namespace PacBio {
+namespace BAM {
+
+class ZmwQuery::ZmwQueryPrivate
+{
+public:
+    ZmwQueryPrivate(std::vector<int32_t> zmwWhitelist, const DataSet& dataset)
+        : reader_{PbiZmwFilter{std::move(zmwWhitelist)}, dataset}
+    {
+    }
+
+    PbiFilterCompositeBamReader<Compare::Zmw> reader_;
+};
+
+ZmwQuery::ZmwQuery(std::vector<int32_t> zmwWhitelist, const DataSet& dataset)
+    : internal::IQuery(), d_{std::make_unique<ZmwQueryPrivate>(std::move(zmwWhitelist), dataset)}
+{  // the by-value whitelist is a sink parameter: moved into the impl instead of copied again
+}
+
+ZmwQuery::~ZmwQuery() = default;
+
+bool ZmwQuery::GetNext(BamRecord& r) { return d_->reader_.GetNext(r); }
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/ZmwReadStitcher.cpp b/src/ZmwReadStitcher.cpp

new file mode 100644 (file)

index 0000000..5d018e7
--- /dev/null
+++ b/src/ZmwReadStitcher.cpp
@@ -0,0 +1,139 @@
+// File Description
+/// \file ZmwReadStitcher.cpp
+/// \brief Implements the ZmwReadStitcher class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/virtual/ZmwReadStitcher.h"
+
+#include <deque>
+#include <stdexcept>
+#include <utility>
+
+#include "VirtualStitching.h"
+#include "VirtualZmwReader.h"
+#include "pbbam/DataSet.h"
+#include "pbbam/EntireFileQuery.h"
+#include "pbbam/PbiFilter.h"
+#include "pbbam/PbiFilterQuery.h"
+
+namespace PacBio {
+namespace BAM {
+
+// Private implementation of ZmwReadStitcher.
+//
+// Keeps a queue of (primary BAM, scraps BAM) path pairs and at most one active
+// VirtualZmwReader. Whenever the active reader runs out of records, the next
+// source pair that has data is opened. Once every source is exhausted there is
+// no active reader any more.
+class ZmwReadStitcher::ZmwReadStitcherPrivate
+{
+public:
+    // Single primary/scraps pair with an explicit filter.
+    ZmwReadStitcherPrivate(std::string primaryBamFilePath, std::string scrapsBamFilePath,
+                           PbiFilter filter)
+        : filter_{std::move(filter)}
+    {
+        sources_.push_back({std::move(primaryBamFilePath), std::move(scrapsBamFilePath)});
+        OpenNextReader();
+    }
+
+    // Sources and filter are both derived from the dataset.
+    explicit ZmwReadStitcherPrivate(const DataSet& dataset)
+        : filter_{PbiFilter::FromDataSet(dataset)}
+    {
+        sources_ = SourcesFromDataset(dataset);
+        OpenNextReader();
+    }
+
+    // True while an active reader exists and still has records.
+    bool HasNext() const { return (currentReader_ && currentReader_->HasNext()); }
+
+    // Returns the next stitched record and advances to the next source if the
+    // active reader is now exhausted.
+    // Throws std::runtime_error if no reader is active.
+    VirtualZmwBamRecord Next()
+    {
+        if (!currentReader_) {
+            throw std::runtime_error{
+                "ZmwReadStitcher: "
+                "no readers active, make sure you use "
+                "ZmwReadStitcher::HasNext before "
+                "requesting next record"};
+        }
+
+        // non-const, so the return can move if copy elision is not applied
+        auto result = currentReader_->Next();
+        if (!currentReader_->HasNext()) OpenNextReader();
+        return result;
+    }
+
+    // Returns the next group of raw records and advances to the next source if
+    // the active reader is now exhausted.
+    // Throws std::runtime_error if no reader is active.
+    std::vector<BamRecord> NextRaw()
+    {
+        if (!currentReader_) {
+            throw std::runtime_error{
+                "ZmwReadStitcher: "
+                "no readers active, make sure you use "
+                "ZmwReadStitcher::HasNext before "
+                "requesting next group of records"};
+        }
+
+        // non-const, so the return can move if copy elision is not applied
+        auto result = currentReader_->NextRaw();
+        if (!currentReader_->HasNext()) OpenNextReader();
+        return result;
+    }
+
+    // Header accessors forward to the active reader.
+    // Throw std::runtime_error if no reader is active (e.g. after the last
+    // record of the last source has been consumed).
+    BamHeader PrimaryHeader() const { return ActiveReader().PrimaryHeader(); }
+
+    BamHeader ScrapsHeader() const { return ActiveReader().ScrapsHeader(); }
+
+    BamHeader StitchedHeader() const { return ActiveReader().StitchedHeader(); }
+
+private:
+    StitchingSources sources_;
+    std::unique_ptr<VirtualZmwReader> currentReader_;
+    PbiFilter filter_;
+
+    // Checked access to the active reader; replaces a null dereference with
+    // an exception.
+    VirtualZmwReader& ActiveReader() const
+    {
+        if (!currentReader_) {
+            throw std::runtime_error{
+                "ZmwReadStitcher: "
+                "no readers active, cannot provide header"};
+        }
+        return *currentReader_;
+    }
+
+    // Drops the active reader, then opens queued sources one by one until one
+    // reports data or the queue is empty.
+    void OpenNextReader()
+    {
+        currentReader_.reset(nullptr);
+
+        // find next source pair with data
+        while (!sources_.empty()) {
+            // the queue entry is popped right away, so steal its strings
+            const auto nextSource = std::move(sources_.front());
+            sources_.pop_front();
+
+            currentReader_ =
+                std::make_unique<VirtualZmwReader>(nextSource.first, nextSource.second, filter_);
+            if (currentReader_->HasNext()) return;
+        }
+    }
+};
+
+// --------------------------------
+// ZmwReadStitcher implementation
+// --------------------------------
+
+// Unfiltered variant: delegates to the filtered constructor with an empty PbiFilter.
+ZmwReadStitcher::ZmwReadStitcher(std::string primaryBamFilePath, std::string scrapsBamFilePath)
+    : ZmwReadStitcher{std::move(primaryBamFilePath), std::move(scrapsBamFilePath), PbiFilter{}}
+{
+}
+
+// Filtered variant: all three arguments are moved into the private implementation.
+ZmwReadStitcher::ZmwReadStitcher(std::string primaryBamFilePath, std::string scrapsBamFilePath,
+                                 PbiFilter filter)
+    : d_{std::make_unique<ZmwReadStitcherPrivate>(
+          std::move(primaryBamFilePath), std::move(scrapsBamFilePath), std::move(filter))}
+{
+}
+
+// Dataset variant: the private implementation derives sources and filter itself.
+ZmwReadStitcher::ZmwReadStitcher(const DataSet& dataset)
+    : d_{std::make_unique<ZmwReadStitcherPrivate>(dataset)}
+{
+}
+
+ZmwReadStitcher::~ZmwReadStitcher() = default;
+
+// The iteration calls below forward straight to the private implementation.
+bool ZmwReadStitcher::HasNext()
+{
+    return d_->HasNext();
+}
+
+VirtualZmwBamRecord ZmwReadStitcher::Next()
+{
+    return d_->Next();
+}
+
+std::vector<BamRecord> ZmwReadStitcher::NextRaw()
+{
+    return d_->NextRaw();
+}
+
+// Each header accessor returns a DeepCopy() of the implementation's header.
+BamHeader ZmwReadStitcher::PrimaryHeader() const
+{
+    return d_->PrimaryHeader().DeepCopy();
+}
+
+BamHeader ZmwReadStitcher::ScrapsHeader() const
+{
+    return d_->ScrapsHeader().DeepCopy();
+}
+
+BamHeader ZmwReadStitcher::StitchedHeader() const
+{
+    return d_->StitchedHeader().DeepCopy();
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/ZmwTypeMap.cpp b/src/ZmwTypeMap.cpp

new file mode 100644 (file)

index 0000000..4c4b5b5
--- /dev/null
+++ b/src/ZmwTypeMap.cpp
@@ -0,0 +1,25 @@
+// File Description
+/// \file ZmwTypeMap.cpp
+/// \brief Implements the ZmwTypeMap class.
+//
+// Author: Armin Töpfer
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/ZmwTypeMap.h"
+
+namespace PacBio {
+namespace BAM {
+
+// clang-format off
+// Lookup table from a single-character ZMW type code to its ZmwType value.
+// Only the four codes listed here are present; the map is not const, so a
+// lookup via operator[] with an unknown code would insert a new entry.
+std::map<char, ZmwType> ZmwTypeMap::ParseChar
+{
+    { 'C' , ZmwType::CONTROL   },
+    { 'M' , ZmwType::MALFORMED },
+    { 'N' , ZmwType::NORMAL    },
+    { 'S' , ZmwType::SENTINEL  }
+};
+// clang-format on
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/bed/BedReader.cpp b/src/bed/BedReader.cpp

new file mode 100644 (file)

index 0000000..2f4391e
--- /dev/null
+++ b/src/bed/BedReader.cpp
@@ -0,0 +1,117 @@
+// File Description
+/// \file BedReader.cpp
+/// \brief Implements the BedReader class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/bed/BedReader.h"
+
+#include <cassert>
+
+#include <sstream>
+#include <stdexcept>
+#include <type_traits>
+
+#include <boost/algorithm/string.hpp>
+#include <boost/optional.hpp>
+
+#include "pbbam/FormatUtils.h"
+#include "pbbam/StringUtilities.h"
+#include "pbbam/TextFileReader.h"
+
+namespace PacBio {
+namespace BAM {
+
+// BedReader is intended to be move-only; fail the build if that ever changes.
+static_assert(!std::is_copy_constructible<BedReader>::value,
+              "BedReader(const BedReader&) is not = delete");
+static_assert(!std::is_copy_assignable<BedReader>::value,
+              "BedReader& operator=(const BedReader&) is not = delete");
+
+// Private implementation of BedReader.
+//
+// Reads a BED file line by line and always keeps one parsed interval
+// pre-fetched in interval_ (boost::none once the input is exhausted).
+class BedReader::BedReaderPrivate
+{
+public:
+    // Validates the filename, opens the text reader and pre-fetches the first
+    // interval. Throws std::runtime_error if the name is not a BED filename.
+    explicit BedReaderPrivate(const std::string& fn)
+    {
+        // validate extension
+        if (!FormatUtils::IsBedFilename(fn)) {
+            throw std::runtime_error{"BedReader ERROR: filename '" + fn +
+                                     "' is not recognized as a BED file."};
+        }
+
+        // open file stream
+        //
+        // std::make_unique never yields a null pointer (allocation failure
+        // throws std::bad_alloc), so no null check is needed here.
+        // NOTE(review): a failure to open the file has to surface from the
+        // TextFileReader constructor itself - confirm that it throws.
+        reader_ = std::make_unique<TextFileReader>(fn);
+
+        // pre-fetch first record
+        GetNext();
+    }
+
+    // Replaces interval_ with the next parsed line, or boost::none at end of input.
+    void GetNext()
+    {
+        interval_ = boost::none;
+        std::string line;
+        if (reader_->GetNext(line)) interval_ = ParseInterval(std::move(line));
+    }
+
+    // Parses one tab-separated BED line: field 0 is the name, fields 1 and 2
+    // are start and end; any further fields are ignored.
+    // Throws std::runtime_error for fewer than 3 fields. std::stoi may throw
+    // std::invalid_argument or std::out_of_range for bad coordinates.
+    GenomicInterval ParseInterval(std::string line)
+    {
+        // trim any trailing whitespace
+        boost::trim_right(line);
+
+        // split into token fields
+        const auto fields = PacBio::BAM::Split(line, '\t');
+        if (fields.size() < 3) {
+            std::ostringstream msg;
+            msg << "BedReader ERROR: invalid BED record. Line:\n"
+                << line << '\n'
+                << "has less than 3 fields.";
+            throw std::runtime_error{msg.str()};
+        }
+
+        // convert fields into interval
+        const Position start = std::stoi(fields[1]);
+        const Position end = std::stoi(fields[2]);
+        return {fields[0], start, end};
+    }
+
+    std::unique_ptr<TextFileReader> reader_;
+    boost::optional<GenomicInterval> interval_;
+};
+
+// Opens the given BED file through the private implementation.
+BedReader::BedReader(const std::string& fn)
+    : internal::QueryBase<GenomicInterval>{}
+    , d_{std::make_unique<BedReaderPrivate>(fn)}
+{
+}
+
+BedReader::BedReader(BedReader&&) noexcept = default;
+
+BedReader& BedReader::operator=(BedReader&&) noexcept = default;
+
+BedReader::~BedReader() = default;
+
+// Copies the pre-fetched interval into 'interval', then pre-fetches the next
+// one. Returns false (leaving 'interval' untouched) when nothing is pre-fetched.
+bool BedReader::GetNext(GenomicInterval& interval)
+{
+    const auto& prefetched = d_->interval_;
+    if (prefetched) {
+        interval = *prefetched;
+        d_->GetNext();
+        return true;
+    }
+    return false;
+}
+
+// Convenience helper: opens 'fn' and returns every interval it yields, in order.
+std::vector<GenomicInterval> BedReader::ReadAll(const std::string& fn)
+{
+    BedReader bedFile{fn};
+
+    std::vector<GenomicInterval> intervals;
+    intervals.reserve(256);
+    for (const auto& interval : bedFile) {
+        intervals.emplace_back(interval);
+    }
+    return intervals;
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/bed/BedWriter.cpp b/src/bed/BedWriter.cpp

new file mode 100644 (file)

index 0000000..c74919d
--- /dev/null
+++ b/src/bed/BedWriter.cpp
@@ -0,0 +1,55 @@
+// File Description
+/// \file BedWriter.cpp
+/// \brief Implements the BedWriter class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/bed/BedWriter.h"
+
+#include <cassert>
+
+#include <sstream>
+#include <type_traits>
+
+#include "pbbam/GenomicInterval.h"
+#include "pbbam/TextFileWriter.h"
+
+namespace PacBio {
+namespace BAM {
+
+// BedWriter is intended to be move-only; fail the build if that ever changes.
+static_assert(!std::is_copy_constructible<BedWriter>::value,
+              "BedWriter(const BedWriter&) is not = delete");
+static_assert(!std::is_copy_assignable<BedWriter>::value,
+              "BedWriter& operator=(const BedWriter&) is not = delete");
+
+// Private implementation of BedWriter: formats an interval as
+// "name<TAB>start<TAB>stop" and hands the text to the TextFileWriter.
+class BedWriter::BedWriterPrivate
+{
+public:
+    explicit BedWriterPrivate(const std::string& filename)
+        : writer_{filename}
+    {
+    }
+
+    void Write(const GenomicInterval& interval)
+    {
+        // the stream is reused across calls; drop the previous line first
+        line_.str("");
+
+        line_ << interval.Name();
+        line_ << '\t' << interval.Start();
+        line_ << '\t' << interval.Stop();
+
+        writer_.Write(line_.str());
+    }
+
+private:
+    std::ostringstream line_;
+    TextFileWriter writer_;
+};
+
+// Opens the output file through the private implementation.
+BedWriter::BedWriter(const std::string& fn)
+    : d_{std::make_unique<BedWriterPrivate>(fn)}
+{
+}
+
+BedWriter::BedWriter(BedWriter&&) noexcept = default;
+
+BedWriter& BedWriter::operator=(BedWriter&&) noexcept = default;
+
+BedWriter::~BedWriter() = default;
+
+// Forwards the interval to the private implementation.
+void BedWriter::Write(const GenomicInterval& interval)
+{
+    d_->Write(interval);
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/ccs/CCSPbiBuilder.cpp b/src/ccs/CCSPbiBuilder.cpp

new file mode 100644 (file)

index 0000000..db45d8d
--- /dev/null
+++ b/src/ccs/CCSPbiBuilder.cpp
@@ -0,0 +1,330 @@
+// File Description
+/// \file CCSPbiBuilder.cpp
+/// \brief Implements the CCSPbiBuilder class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/ccs/CCSPbiBuilder.h"
+
+#include <cstdio>
+#include <cstdlib>
+
+#include <stdexcept>
+#include <thread>
+#include <vector>
+
+#include <htslib/bgzf.h>
+#include <pbcopper/utility/Deleters.h>
+
+#include "pbbam/PbiBuilder.h"
+#include "pbbam/PbiFile.h"
+#include "pbbam/ccs/CCSHeader.h"
+#include "pbbam/ccs/CCSRecord.h"
+
+#include "MemoryUtils.h"
+
+namespace PacBio {
+namespace CCS {
+namespace internal {
+
+// Byte-swaps every element of 'data' in place, picking the swap routine by
+// sizeof(T). One-byte elements are left alone; sizes other than 1, 2, 4 or 8
+// raise std::runtime_error.
+template <typename T>
+inline void SwapEndianness(std::vector<T>& data)
+{
+    const size_t width = sizeof(T);
+
+    if (width == 1) return;  // no swapping necessary
+
+    if (width == 2) {
+        for (auto& element : data)
+            ed_swap_2p(&element);
+        return;
+    }
+    if (width == 4) {
+        for (auto& element : data)
+            ed_swap_4p(&element);
+        return;
+    }
+    if (width == 8) {
+        for (auto& element : data)
+            ed_swap_8p(&element);
+        return;
+    }
+
+    throw std::runtime_error{"CCSPbiBuilder: unsupported element size (" +
+                             std::to_string(width) + ")"};
+}
+
+// Wrapper around bgzf_write() that converts a negative return value into a
+// std::runtime_error.
+void bgzf_write_safe(BGZF* fp, const void* data, size_t length)
+{
+    const auto bytesWritten = bgzf_write(fp, data, length);
+    if (bytesWritten >= 0L) return;
+
+    throw std::runtime_error{
+        "CCSPbiBuilder: non-zero returned from bgzf_write(). Out of disk space?"};
+}
+
+// Writes the raw contents of 'data' to the BGZF stream. When fp->is_be is
+// set, the elements are byte-swapped in place first (which is why 'data' is
+// taken by non-const reference).
+// An empty vector is a no-op; this also avoids indexing element 0 of an empty
+// vector, which is undefined behaviour.
+template <typename T>
+inline void WriteBgzfVector(BGZF* fp, std::vector<T>& data)
+{
+    assert(fp);
+    if (data.empty()) return;
+    if (fp->is_be) SwapEndianness(data);
+    bgzf_write_safe(fp, data.data(), data.size() * sizeof(T));
+}
+
+// Location of one flushed chunk of a field inside the temp file.
+struct PbiFieldBlock
+{
+    int64_t pos_;  // file position of block start
+    size_t n_;     // number of entries in block
+};
+
+// In-memory buffer for one PBI field, plus the list of blocks already
+// flushed for it.
+template <typename T>
+class PbiField
+{
+    constexpr static const size_t ElementSize = sizeof(T);
+
+public:
+    // maxBufferSize is given in bytes; the element capacity is derived from it.
+    explicit PbiField(size_t maxBufferSize) : maxElementCount_{maxBufferSize / ElementSize}
+    {
+        buffer_.reserve(maxElementCount_);
+    }
+
+    void Add(T value) { buffer_.push_back(value); }
+
+    // '>=' rather than '==': if the element capacity works out to 0 (buffer
+    // size smaller than one element), the field reports full after every Add()
+    // instead of growing without ever being flushed.
+    bool IsFull() const { return buffer_.size() >= maxElementCount_; }
+
+    size_t maxElementCount_;
+    std::vector<T> buffer_;
+    std::vector<PbiFieldBlock> blocks_;
+};
+
+}  // namespace internal
+
+// Private implementation of CCSPbiBuilder.
+//
+// Records are buffered per PBI field. Full buffers are spilled to a temp file
+// ("<pbiFilename>.build"), and the position/length of every spilled block is
+// remembered. Close() then opens the BGZF output, writes the PBI header and
+// replays the spilled blocks field by field.
+class CCSPbiBuilder::CCSPbiBuilderPrivate
+{
+    enum class FlushMode
+    {
+        FORCE,
+        NO_FORCE
+    };
+
+    // TODO: Make this tweak-able, a la IndexedBamWriter's buffers
+    constexpr static const size_t MaxBufferSize = 0x10000;
+
+public:
+    // Opens the temp file and computes the read group ID for the movie.
+    // Throws std::runtime_error if the temp file cannot be opened.
+    CCSPbiBuilderPrivate(const std::string& pbiFilename, const std::string& movieName,
+                         const CCSPbiBuilderConfig& config)
+        : pbiFilename_{pbiFilename}
+        , tempFilename_{pbiFilename + ".build"}
+        , tempFile_{std::fopen(tempFilename_.c_str(), "w+b")}
+        , compressionLevel_{config.CompressionLevel}
+        , numThreads_{config.NumThreads}
+        , rgIdField_{MaxBufferSize}
+        , qStartField_{MaxBufferSize}
+        , qEndField_{MaxBufferSize}
+        , holeNumField_{MaxBufferSize}
+        , readQualField_{MaxBufferSize}
+        , ctxtField_{MaxBufferSize}
+        , fileOffsetField_{MaxBufferSize}
+    {
+        // std::fopen reports failure with a null pointer; catch it here rather
+        // than passing a null FILE* to ftell/fwrite later on
+        if (!tempFile_)
+            throw std::runtime_error{"CCSPbiBuilder: could not open temp file for writing: " +
+                                     tempFilename_};
+
+        movieName_ = movieName;
+        rgId_ = BAM::ReadGroupInfo::IdToInt(BAM::MakeReadGroupId(movieName, "SUBREAD"));
+    }
+
+    // Appends one row to every field buffer, then flushes any buffer that is full.
+    void AddRecord(const CCSRecord& record)
+    {
+        rgIdField_.Add(rgId_);
+        qStartField_.Add(record.QueryStart);
+        qEndField_.Add(record.QueryEnd);
+        holeNumField_.Add(record.HoleNumber);
+        ctxtField_.Add(record.LocalContextFlags);
+        readQualField_.Add(record.Accuracy);
+        fileOffsetField_.Add(-1);
+
+        FlushBuffers(FlushMode::NO_FORCE);
+        ++currentRow_;
+    }
+
+    // Writes the final index and removes the temp file. Calling it again is a no-op.
+    void Close()
+    {
+        if (isClosed_) return;
+
+        FlushBuffers(FlushMode::FORCE);
+
+        OpenPbiFile();
+        WritePbiHeader();
+        WriteFromTempFile();
+
+        // Release both handles now: the BGZF deleter runs at Close() rather
+        // than at destruction, and the temp file is no longer open when it is
+        // removed.
+        pbiFile_.reset();
+        tempFile_.reset();
+
+        std::remove(tempFilename_.c_str());
+        isClosed_ = true;
+    }
+
+    // Spills 'field' to the temp file if it is full, or unconditionally when
+    // 'force' is set, and clears its in-memory buffer afterwards.
+    template <typename T>
+    void MaybeFlushBuffer(internal::PbiField<T>& field, bool force)
+    {
+        // replace with lambda, in FlushBuffer(), once PPA can use C++14 ?
+        if (field.IsFull() || force) {
+            WriteToTempFile(field);
+            field.buffer_.clear();
+        }
+    }
+
+    void FlushBuffers(FlushMode mode)
+    {
+        const auto force = (mode == FlushMode::FORCE);
+        MaybeFlushBuffer(rgIdField_, force);
+        MaybeFlushBuffer(qStartField_, force);
+        MaybeFlushBuffer(qEndField_, force);
+        MaybeFlushBuffer(holeNumField_, force);
+        MaybeFlushBuffer(readQualField_, force);
+        MaybeFlushBuffer(ctxtField_, force);
+        MaybeFlushBuffer(fileOffsetField_, force);
+    }
+
+    // Appends the field's buffer to the temp file and records where it went.
+    // Throws std::runtime_error if the temp file is not open, its position
+    // cannot be determined, or fewer elements than requested were written.
+    template <typename T>
+    void WriteToTempFile(internal::PbiField<T>& field)
+    {
+        if (field.buffer_.empty()) return;
+
+        // the temp file is released by Close()
+        if (!tempFile_)
+            throw std::runtime_error{"CCSPbiBuilder: temp file is not open: " + tempFilename_};
+
+        const auto pos = std::ftell(tempFile_.get());
+        if (pos < 0)
+            throw std::runtime_error{"CCSPbiBuilder: could not get position in temp file: " +
+                                     tempFilename_};
+
+        const auto numElements =
+            std::fwrite(field.buffer_.data(), sizeof(T), field.buffer_.size(), tempFile_.get());
+
+        // a short write would otherwise produce a silently truncated index
+        if (numElements != field.buffer_.size())
+            throw std::runtime_error{"CCSPbiBuilder: could not write to temp file: " +
+                                     tempFilename_ + ". Out of disk space?"};
+
+        field.blocks_.emplace_back(internal::PbiFieldBlock{pos, numElements});
+    }
+
+    // Opens the BGZF output with the configured compression level and enables
+    // multithreaded compression when more than one thread is to be used.
+    void OpenPbiFile()
+    {
+        // open file handle
+        const auto mode = std::string("wb") + std::to_string(static_cast<int>(compressionLevel_));
+        pbiFile_.reset(bgzf_open(pbiFilename_.c_str(), mode.c_str()));
+        if (pbiFile_ == nullptr)
+            throw std::runtime_error{"CCSPbiBuilder: could not open file for writing: " +
+                                     pbiFilename_};
+
+        // if no explicit thread count given, attempt built-in check
+        size_t actualNumThreads = numThreads_;
+        if (actualNumThreads == 0) {
+            actualNumThreads = std::thread::hardware_concurrency();
+
+            // if still unknown, default to single-threaded
+            if (actualNumThreads == 0) actualNumThreads = 1;
+        }
+
+        // if multithreading requested, enable it
+        if (actualNumThreads > 1) bgzf_mt(pbiFile_.get(), actualNumThreads, 256);
+    }
+
+    // Writes the header: 4-byte magic, version, flags, read count, then 18
+    // zero bytes of reserved space. Multi-byte values are byte-swapped first
+    // when bgzf->is_be is set.
+    void WritePbiHeader()
+    {
+        BGZF* bgzf = pbiFile_.get();
+
+        // 'magic' string
+        static constexpr const std::array<char, 4> magic{{'P', 'B', 'I', '\1'}};
+        internal::bgzf_write_safe(bgzf, magic.data(), 4);
+
+        PacBio::BAM::PbiFile::Sections sections = PacBio::BAM::PbiFile::BASIC;
+        // version, pbi_flags, & n_reads
+        auto version = static_cast<uint32_t>(PacBio::BAM::PbiFile::CurrentVersion);
+        uint16_t pbi_flags = sections;
+        auto numReads = currentRow_;
+        if (bgzf->is_be) {
+            version = ed_swap_4(version);
+            pbi_flags = ed_swap_2(pbi_flags);
+            numReads = ed_swap_4(numReads);
+        }
+        internal::bgzf_write_safe(bgzf, &version, 4);
+        internal::bgzf_write_safe(bgzf, &pbi_flags, 2);
+        internal::bgzf_write_safe(bgzf, &numReads, 4);
+
+        // reserved space (zero-initialized)
+        const char reserved[18] = {};
+        internal::bgzf_write_safe(bgzf, reserved, sizeof(reserved));
+    }
+
+    // Reads one previously spilled block back into the field's buffer.
+    // Throws std::runtime_error if seeking fails or the block cannot be read
+    // in full.
+    template <typename T>
+    void LoadFieldBlockFromTempFile(internal::PbiField<T>& field,
+                                    const internal::PbiFieldBlock& block)
+    {
+        // seek to block begin
+        const auto ret = std::fseek(tempFile_.get(), block.pos_, SEEK_SET);
+        if (ret != 0)
+            throw std::runtime_error{"CCSPbiBuilder: could not seek in temp file: " +
+                                     tempFilename_ + ", offset: " + std::to_string(block.pos_)};
+
+        // read block elements
+        field.buffer_.assign(block.n_, 0);
+        const auto numElements =
+            std::fread(field.buffer_.data(), sizeof(T), block.n_, tempFile_.get());
+
+        if (numElements != block.n_)
+            throw std::runtime_error{
+                "CCSPbiBuilder: could not read element count from temp file: " + tempFilename_};
+    }
+
+    // Replays every spilled block of 'field', in order, into the BGZF output.
+    template <typename T>
+    void WriteField(internal::PbiField<T>& field)
+    {
+        for (const auto& block : field.blocks_) {
+            LoadFieldBlockFromTempFile(field, block);
+            internal::WriteBgzfVector(pbiFile_.get(), field.buffer_);
+        }
+    }
+
+    void WriteFromTempFile()
+    {
+        // load from temp file, in PBI format order, and write to index
+        WriteField(rgIdField_);
+        WriteField(qStartField_);
+        WriteField(qEndField_);
+        WriteField(holeNumField_);
+        WriteField(readQualField_);
+        WriteField(ctxtField_);
+        WriteField(fileOffsetField_);
+    }
+
+    // file info
+    std::string bamFilename_;
+    std::string pbiFilename_;
+    std::string tempFilename_;
+    std::unique_ptr<FILE, Utility::FileDeleter> tempFile_;
+    std::unique_ptr<BGZF, PacBio::BAM::HtslibBgzfDeleter> pbiFile_;
+    PacBio::BAM::PbiBuilder::CompressionLevel compressionLevel_ =
+        PacBio::BAM::PbiBuilder::DefaultCompression;
+    size_t numThreads_;
+
+    // PBI field buffers
+    internal::PbiField<int32_t> rgIdField_;
+    internal::PbiField<int32_t> qStartField_;
+    internal::PbiField<int32_t> qEndField_;
+    internal::PbiField<int32_t> holeNumField_;
+    internal::PbiField<float> readQualField_;
+    internal::PbiField<uint8_t> ctxtField_;
+    internal::PbiField<uint64_t> fileOffsetField_;
+
+    std::string movieName_;
+    int32_t rgId_;
+    uint32_t currentRow_ = 0;
+    bool isClosed_ = false;
+};
+
+// Primary constructor: everything is handled by the private implementation.
+CCSPbiBuilder::CCSPbiBuilder(const std::string& pbiFilename, const std::string& movieName,
+                             const CCSPbiBuilderConfig& config)
+    : d_{std::make_unique<CCSPbiBuilderPrivate>(pbiFilename, movieName, config)}
+{
+}
+
+// Convenience constructor: takes the movie name from a CCS header.
+CCSPbiBuilder::CCSPbiBuilder(const std::string& pbiFilename, const CCSHeader& header,
+                             const CCSPbiBuilderConfig& config)
+    : CCSPbiBuilder{pbiFilename, header.MovieName, config}
+{
+}
+
+// NOTE(review): the defaulted destructor does not call Close() - confirm that
+// callers are expected to call it explicitly.
+CCSPbiBuilder::~CCSPbiBuilder() = default;
+
+void CCSPbiBuilder::AddRecord(const CCSRecord& record)
+{
+    d_->AddRecord(record);
+}
+
+void CCSPbiBuilder::Close()
+{
+    d_->Close();
+}
+
+const std::string& CCSPbiBuilder::MovieName() const
+{
+    return d_->movieName_;
+}
+
+}  // namespace CCS
+}  // namespace PacBio
+\ No newline at end of file
diff --git a/src/ccs/CCSRecordFormat.cpp b/src/ccs/CCSRecordFormat.cpp

new file mode 100644 (file)

index 0000000..4abfb5a
--- /dev/null
+++ b/src/ccs/CCSRecordFormat.cpp
@@ -0,0 +1,144 @@
+// File Description
+/// \file CCSRecordFormat.cpp
+/// \brief Implements the CCSRecordFormat class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/ccs/CCSRecordFormat.h"
+
+#include <cstdint>
+#include <iomanip>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "pbbam/StringUtilities.h"
+
+namespace {
+
+// Field names used on the 'name=value' lines of the serialized CCS header.
+// DeserializeHeader() matches against them and SerializeHeader() emits them.
+static const std::string MovieName{"movie_name"};
+static const std::string BindingKit{"binding_kit"};
+static const std::string SequencingKit{"sequencing_kit"};
+static const std::string BasecallerVersion{"basecaller_version"};
+static const std::string FrameRate{"framerate"};
+
+}  // namespace
+
+namespace PacBio {
+namespace CCS {
+
+// Builds a CCSHeader from 'name=value' text lines.
+//
+// Throws std::runtime_error if 'lines' is empty, if a line does not split
+// into exactly two fields on '=', or if the field name is not recognized.
+CCSHeader CCSRecordFormat::DeserializeHeader(const std::vector<std::string>& lines)
+{
+    if (lines.empty())
+        throw std::runtime_error{"CCS record format: cannot create header from empty text"};
+
+    CCSHeader header;
+    for (const auto& line : lines) {
+        const auto tokens = PacBio::BAM::Split(line, '=');
+        if (tokens.size() != 2) {
+            throw std::runtime_error{"CCS record format: malformatted header line\n" + line +
+                                     '\n' + "must have syntax 'name=value'"};
+        }
+
+        const auto& name = tokens[0];
+        const auto& value = tokens[1];
+
+        // clang-format off
+        if      (name == MovieName)         { header.MovieName = value;         continue; }
+        else if (name == BindingKit)        { header.BindingKit = value;        continue; }
+        else if (name == SequencingKit)     { header.SequencingKit = value;     continue; }
+        else if (name == BasecallerVersion) { header.BasecallerVersion = value; continue; }
+        else if (name == FrameRate)         { header.FrameRate = value;         continue; }
+        // clang-format on
+
+        throw std::runtime_error{"CCS record format: unrecognized header field name: '" + name +
+                                 '\''};
+    }
+    return header;
+}
+
+// Produces the five 'name=value' header lines, always in the same order:
+// movie name, binding kit, sequencing kit, basecaller version, frame rate.
+std::vector<std::string> CCSRecordFormat::SerializeHeader(const CCSHeader& header)
+{
+    const auto makeLine = [](const std::string& name, const std::string& value) {
+        return name + '=' + value;
+    };
+
+    // clang-format off
+    return {
+        makeLine(MovieName,         header.MovieName),
+        makeLine(BindingKit,        header.BindingKit),
+        makeLine(SequencingKit,     header.SequencingKit),
+        makeLine(BasecallerVersion, header.BasecallerVersion),
+        makeLine(FrameRate,         header.FrameRate)
+    };
+    // clang-format on
+}
+
+// Parses one record line into a CCSRecord.
+//
+// The line must split into exactly 8 fields (using Split's default delimiter -
+// presumably a tab, matching SerializeRecord; confirm): hole number, query
+// start, query end, local context flags, accuracy, 4 comma-separated SNR
+// values, sequence, comma-separated pulse widths.
+//
+// Throws std::runtime_error on a wrong field count or a wrong number of SNR
+// values. The std::sto* conversions may throw std::invalid_argument or
+// std::out_of_range on malformed numbers.
+CCSRecord CCSRecordFormat::DeserializeRecord(const std::string& line)
+{
+    const auto fields = PacBio::BAM::Split(line);
+    if (fields.size() != 8) {
+        std::ostringstream msg;
+        msg << "CCS record format: malformatted record line\n" << line << '\n';
+        throw std::runtime_error{msg.str()};
+    }
+
+    // clang-format off
+    CCSRecord result;
+    result.HoleNumber = std::stoi(fields[0]);
+    result.QueryStart = std::stoi(fields[1]);
+    result.QueryEnd   = std::stoi(fields[2]);
+    result.LocalContextFlags = static_cast<PacBio::BAM::LocalContextFlags>(std::stoul(fields[3]));
+    result.Accuracy = std::stof(fields[4]);
+
+    const auto snrs = PacBio::BAM::Split(fields[5], ',');
+    if (snrs.size() != 4) {
+        std::ostringstream msg;
+        msg << "CCS record format: SNR field must have 4 values";
+        throw std::runtime_error{msg.str()};
+    }
+    result.SignalToNoise = {
+        std::stod(snrs[0]),
+        std::stod(snrs[1]),
+        std::stod(snrs[2]),
+        std::stod(snrs[3])
+    };
+
+    result.Sequence = fields[6];
+
+    const auto pwStrings = PacBio::BAM::Split(fields[7], ',');
+    std::vector<uint16_t> pws;
+    pws.reserve(pwStrings.size());
+    for (const auto& pwString : pwStrings)
+        pws.emplace_back(std::stoul(pwString));
+
+    // the local vector is not needed afterwards: hand it over instead of
+    // copying it
+    result.PulseWidths = std::move(pws);
+
+    // clang-format on
+    return result;
+}
+
+// Formats a record as one tab-separated line: hole number, query start, query
+// end, local context flags (as a number), accuracy, the A,C,G,T SNR values
+// joined by commas, the sequence, and the pulse widths joined by commas.
+std::string CCSRecordFormat::SerializeRecord(const CCSRecord& record)
+{
+    // clang-format off
+    std::ostringstream text;
+
+    text << record.HoleNumber << '\t';
+    text << record.QueryStart << '\t';
+    text << record.QueryEnd << '\t';
+    text << static_cast<uint16_t>(record.LocalContextFlags) << '\t';
+    text << record.Accuracy << '\t';
+
+    const auto& snr = record.SignalToNoise;
+    text << snr.A << ',' << snr.C << ',' << snr.G << ',' << snr.T << '\t';
+
+    text << record.Sequence << '\t';
+
+    // empty before the first value, a comma before every later one
+    const char* separator = "";
+    for (const auto pw : record.PulseWidths) {
+        text << separator << pw;
+        separator = ",";
+    }
+
+    // clang-format on
+    return text.str();
+}
+
+}  // namespace CCS
+}  // namespace PacBio
+\ No newline at end of file
diff --git a/src/ccs/CCSRecordReader.cpp b/src/ccs/CCSRecordReader.cpp

new file mode 100644 (file)

index 0000000..efeee63
--- /dev/null
+++ b/src/ccs/CCSRecordReader.cpp
@@ -0,0 +1,65 @@
+// File Description
+/// \file CCSRecordReader.cpp
+/// \brief Implements the CCSRecordReader class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/ccs/CCSRecordReader.h"
+
+#include <iostream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include "pbbam/ccs/CCSRecordFormat.h"
+
+namespace PacBio {
+namespace CCS {
+
+// Private implementation of CCSRecordReader: parses the header when it is
+// constructed, then yields one record per input line.
+class CCSRecordReader::CCSRecordReaderPrivate
+{
+public:
+    CCSRecordReaderPrivate(std::istream& in)
+        : in_{in}
+    {
+        ReadHeader();
+    }
+
+    // Collects lines up to (not including) the "#" terminator line, or up to
+    // end of input, and deserializes them into header_.
+    void ReadHeader()
+    {
+        const std::string EndHeader{"#"};
+
+        std::vector<std::string> headerLines;
+        std::string current;
+        while (std::getline(in_, current) && current != EndHeader) {
+            headerLines.push_back(current);
+        }
+        header_ = CCSRecordFormat::DeserializeHeader(headerLines);
+    }
+
+    // Reads the next line into line_ and parses it into 'record'.
+    // Returns false once no further line can be read.
+    bool GetNext(CCSRecord& record)
+    {
+        if (std::getline(in_, line_)) {
+            record = CCSRecordFormat::DeserializeRecord(line_);
+            return true;
+        }
+        return false;  // indicates EOF
+    }
+
+    std::istream& in_;
+    std::string line_;
+    CCSHeader header_;
+};
+
+// Default: read from standard input.
+CCSRecordReader::CCSRecordReader()
+    : CCSRecordReader{std::cin}
+{
+}
+
+// Reads from the given stream; the private implementation keeps a reference
+// to it and parses the header right away.
+CCSRecordReader::CCSRecordReader(std::istream& in)
+    : d_{std::make_unique<CCSRecordReaderPrivate>(in)}
+{
+}
+
+CCSRecordReader::~CCSRecordReader() = default;
+
+bool CCSRecordReader::GetNext(CCSRecord& record)
+{
+    return d_->GetNext(record);
+}
+
+const CCSHeader& CCSRecordReader::Header() const
+{
+    return d_->header_;
+}
+
+}  // namespace CCS
+}  // namespace PacBio
+\ No newline at end of file
diff --git a/src/ccs/CCSRecordWriter.cpp b/src/ccs/CCSRecordWriter.cpp

new file mode 100644 (file)

index 0000000..32b0a02
--- /dev/null
+++ b/src/ccs/CCSRecordWriter.cpp
@@ -0,0 +1,56 @@
+// File Description
+/// \file CCSRecordWriter.cpp
+/// \brief Implements the CCSRecordWriter class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/ccs/CCSRecordWriter.h"
+
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "pbbam/ccs/CCSRecordFormat.h"
+
+namespace PacBio {
+namespace CCS {
+
+// Private implementation of CCSRecordWriter: emits the header when it is
+// constructed, then one line per record.
+class CCSRecordWriter::CCSRecordWriterPrivate
+{
+public:
+    CCSRecordWriterPrivate(const CCSHeader& header, std::ostream& out)
+        : out_{out}
+    {
+        WriteHeader(header);
+    }
+
+    // Writes every serialized header line, followed by the "#" terminator line.
+    void WriteHeader(const CCSHeader& header)
+    {
+        for (const auto& headerLine : CCSRecordFormat::SerializeHeader(header)) {
+            out_ << headerLine << '\n';
+        }
+        out_ << "#\n";
+    }
+
+    // Writes the serialized record followed by a newline.
+    void Write(const CCSRecord& record)
+    {
+        const auto text = CCSRecordFormat::SerializeRecord(record);
+        out_ << text << '\n';
+    }
+
+    std::ostream& out_;
+};
+
+// Default: write to standard output.
+CCSRecordWriter::CCSRecordWriter(const CCSHeader& header)
+    : CCSRecordWriter{header, std::cout}
+{
+}
+
+// Writes to the given stream; the header is emitted during construction.
+CCSRecordWriter::CCSRecordWriter(const CCSHeader& header, std::ostream& out)
+    : d_{std::make_unique<CCSRecordWriterPrivate>(header, out)}
+{
+}
+
+CCSRecordWriter::~CCSRecordWriter() = default;
+
+void CCSRecordWriter::Write(const CCSRecord& record)
+{
+    d_->Write(record);
+}
+
+}  // namespace CCS
+}  // namespace PacBio
+\ No newline at end of file
diff --git a/src/meson.build b/src/meson.build

new file mode 100644 (file)

index 0000000..4c3b104
--- /dev/null
+++ b/src/meson.build
@@ -0,0 +1,153 @@
+###########
+# sources #
+###########
+
+pbbam_cpp_sources = files([
+  'AlignmentPrinter.cpp',
+  'BaiIndexCache.cpp',
+  'BaiIndexedBamReader.cpp',
+  'BamFile.cpp',
+  'BamFileMerger.cpp',
+  'BamHeader.cpp',
+  'BamReader.cpp',
+  'BamRecord.cpp',
+  'BamRecordBuilder.cpp',
+  'BamRecordImpl.cpp',
+  'BamRecordTags.cpp',
+  'BamRecordView.cpp',
+  'BamTagCodec.cpp',
+  'BamWriter.cpp',
+  'BarcodeQuery.cpp',
+  'BgzipFastaWriter.cpp',
+  'BgzipFastqWriter.cpp',
+  'BgzipWriter.cpp',
+  'ChemistryTable.cpp',
+  'Compare.cpp',
+  'CompositeFastaReader.cpp',
+  'Config.cpp',
+  'DataSet.cpp',
+  'DataSetBaseTypes.cpp',
+  'DataSetElement.cpp',
+  'DataSetIO.cpp',
+  'DataSetTypes.cpp',
+  'DataSetXsd.cpp',
+  'EntireFileQuery.cpp',
+  'FaiIndex.cpp',
+  'FaiZmwChunker.cpp',
+  'FastaCache.cpp',
+  'FastaReader.cpp',
+  'FastaSequence.cpp',
+  'FastaSequenceQuery.cpp',
+  'FastaWriter.cpp',
+  'FastqReader.cpp',
+  'FastqSequence.cpp',
+  'FastqWriter.cpp',
+  'FileProducer.cpp',
+  'FileUtils.cpp',
+  'FofnReader.cpp',
+  'FormatUtils.cpp',
+  'GenomicIntervalQuery.cpp',
+  'IFastaWriter.cpp',
+  'IFastqWriter.cpp',
+  'IndexedBamWriter.cpp',
+  'IndexedFastaReader.cpp',
+  'IndexedFastqBgzfReader.cpp',
+  'IndexedFastqReader.cpp',
+  'IndexedFastqTextReader.cpp',
+  'IndexedFastqReaderImpl.cpp',
+  'IRecordWriter.cpp',
+  'KSeqReader.cpp',
+  'MD5.cpp',
+  'MemoryUtils.cpp',
+  'PbiBuilder.cpp',
+  'PbiFile.cpp',
+  'PbiFilter.cpp',
+  'PbiFilterQuery.cpp',
+  'PbiFilterTypes.cpp',
+  'PbiIndexedBamReader.cpp',
+  'PbiIndexIO.cpp',
+  'PbiRawData.cpp',
+  'ProgramInfo.cpp',
+  'QNameQuery.cpp',
+  'ReadAccuracyQuery.cpp',
+  'ReadGroupInfo.cpp',
+  'RecordType.cpp',
+  'SamTagCodec.cpp',
+  'SamWriter.cpp',
+  'SequenceInfo.cpp',
+  'StringUtilities.cpp',
+  'SubreadLengthQuery.cpp',
+  'Tag.cpp',
+  'TagCollection.cpp',
+  'TextFileReader.cpp',
+  'TextFileWriter.cpp',
+  'Validator.cpp',
+  'ValidationErrors.cpp',
+  'ValidationException.cpp',
+  'Version.cpp',
+  'VirtualZmwBamRecord.cpp',
+  'VirtualZmwCompositeReader.cpp',
+  'VirtualZmwReader.cpp',
+  'VirtualRegion.cpp',
+  'VirtualRegionTypeMap.cpp',
+  'XmlReader.cpp',
+  'XmlWriter.cpp',
+  'WhitelistedZmwReadStitcher.cpp',
+  'ZmwChunkedFastaReader.cpp',
+  'ZmwChunkedFastqReader.cpp',
+  'ZmwChunkedFastxBgzfReader.cpp',
+  'ZmwChunkedFastxReaderImpl.cpp',
+  'ZmwChunkedFastxTextReader.cpp',
+  'ZmwGroupQuery.cpp',
+  'ZmwReadStitcher.cpp',
+  'ZmwQuery.cpp',
+  'ZmwTypeMap.cpp',
+
+  # bed
+  'bed/BedReader.cpp',
+  'bed/BedWriter.cpp',
+
+  # ccs
+  'ccs/CCSPbiBuilder.cpp',
+  'ccs/CCSRecordFormat.cpp',
+  'ccs/CCSRecordReader.cpp',
+  'ccs/CCSRecordWriter.cpp',
+
+  # vcf
+  'vcf/VcfFile.cpp',
+  'vcf/VcfFormat.cpp',
+  'vcf/VcfHeader.cpp',
+  'vcf/VcfHeaderTypes.cpp',
+  'vcf/VcfQuery.cpp',
+  'vcf/VcfReader.cpp',
+  'vcf/VcfSort.cpp',
+  'vcf/VcfVariant.cpp',
+  'vcf/VcfWriter.cpp',
+
+  # XML I/O
+  'pugixml/pugixml.cpp'
+])
+
+# extra compiler flags, driven by build options:
+# the 'auto-validate' option adds the PBBAM_AUTOVALIDATE=1 define
+pbbam_extra_flags = []
+if get_option('auto-validate')
+  pbbam_extra_flags += '-DPBBAM_AUTOVALIDATE=1'
+endif
+
+# install library if
+# - either running as a proper project
+# - or using shared libraries
+pbbam_lib_install = (not meson.is_subproject()) or (get_option('default_library') == 'shared')
+
+# the pbbam library target, built from every source listed above
+pbbam_lib = library(
+  'pbbam',
+  pbbam_cpp_sources,
+  # use boost SONAME practice:
+  #   cause ld startup issues before
+  #   you even have the chance of running
+  #   into ABI issues.
+  # i.e. the full project version is used as the soversion, so a library of
+  # a different version is rejected when the program is loaded
+  soversion : meson.project_version(),
+  version : meson.project_version(),
+  install : pbbam_lib_install,
+  dependencies : [pbbam_thread_dep, pbbam_boost_dep, pbbam_zlib_dep, pbbam_htslib_dep, pbbam_pbcopper_dep],
+  include_directories : pbbam_include_directories,
+  cpp_args : [pbbam_extra_flags, pbbam_warning_flags, pbbam_macros])
diff --git a/src/pugixml/pugiconfig.hpp b/src/pugixml/pugiconfig.hpp

new file mode 100644 (file)

index 0000000..6219dbe
--- /dev/null
+++ b/src/pugixml/pugiconfig.hpp
@@ -0,0 +1,71 @@
+/**
+ * pugixml parser - version 1.5
+ * --------------------------------------------------------
+ * Copyright (C) 2006-2014, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
+ * Report bugs and download new versions at http://pugixml.org/
+ *
+ * This library is distributed under the MIT License. See notice at the end
+ * of this file.
+ *
+ * This work is based on the pugxml parser, which is:
+ * Copyright (C) 2003, by Kristen Wegner (kristen@tima.net)
+ */
+
+#ifndef HEADER_PUGICONFIG_HPP
+#define HEADER_PUGICONFIG_HPP
+
+// Uncomment this to enable wchar_t mode
+// #define PUGIXML_WCHAR_MODE
+
+// Uncomment this to disable XPath
+// #define PUGIXML_NO_XPATH
+
+// Uncomment this to disable STL
+// #define PUGIXML_NO_STL
+
+// Uncomment this to disable exceptions
+// #define PUGIXML_NO_EXCEPTIONS
+
+// Set this to control attributes for public classes/functions, i.e.:
+// #define PUGIXML_API __declspec(dllexport) // to export all public symbols from DLL
+// #define PUGIXML_CLASS __declspec(dllimport) // to import all classes from DLL
+// #define PUGIXML_FUNCTION __fastcall // to set calling conventions to all public functions to fastcall
+// In absence of PUGIXML_CLASS/PUGIXML_FUNCTION definitions PUGIXML_API is used instead
+
+// Tune these constants to adjust memory-related behavior
+// #define PUGIXML_MEMORY_PAGE_SIZE 32768
+// #define PUGIXML_MEMORY_OUTPUT_STACK 10240
+// #define PUGIXML_MEMORY_XPATH_PAGE_SIZE 4096
+
+// Uncomment this to switch to header-only version
+// #define PUGIXML_HEADER_ONLY
+
+// Uncomment this to enable long long support
+// #define PUGIXML_HAS_LONG_LONG
+
+#endif
+
+/**
+ * Copyright (c) 2006-2014 Arseny Kapoulkine
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
diff --git a/src/pugixml/pugixml.cpp b/src/pugixml/pugixml.cpp

new file mode 100644 (file)

index 0000000..37bdec0
--- /dev/null
+++ b/src/pugixml/pugixml.cpp
@@ -0,0 +1,11539 @@
+/**
+ * pugixml parser - version 1.5
+ * --------------------------------------------------------
+ * Copyright (C) 2006-2014, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
+ * Report bugs and download new versions at http://pugixml.org/
+ *
+ * This library is distributed under the MIT License. See notice at the end
+ * of this file.
+ *
+ * This work is based on the pugxml parser, which is:
+ * Copyright (C) 2003, by Kristen Wegner (kristen@tima.net)
+ */
+
+#ifndef SOURCE_PUGIXML_CPP
+#define SOURCE_PUGIXML_CPP
+
+// suppress the specific compiler warnings that this file triggers
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdouble-promotion"
+
+#if __GNUC__ >= 6
+#pragma GCC diagnostic ignored "-Wnull-dereference"
+#endif
+
+#if !defined(__clang__) and !defined(__INTEL_COMPILER)
+#pragma GCC diagnostic ignored "-Wuseless-cast"
+#endif
+
+#include "../PbbamInternalConfig.h"
+
+#include "pugixml.hpp"
+
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <cstdio>
+#include <cstring>
+#include <cassert>
+
+#ifdef PUGIXML_WCHAR_MODE
+#      include <cwchar>
+#endif
+
+#ifndef PUGIXML_NO_XPATH
+#      include <cmath>
+#      include <cfloat>
+#      ifdef PUGIXML_NO_EXCEPTIONS
+#              include <csetjmp>
+#      endif
+#endif
+
+#ifndef PUGIXML_NO_STL
+#      include <istream>
+#      include <ostream>
+#      include <string>
+#endif
+
+// For placement new
+#include <new>
+
+#ifdef _MSC_VER
+#      pragma warning(push)
+#      pragma warning(disable: 4127) // conditional expression is constant
+#      pragma warning(disable: 4324) // structure was padded due to __declspec(align())
+#      pragma warning(disable: 4611) // interaction between '_setjmp' and C++ object destruction is non-portable
+#      pragma warning(disable: 4702) // unreachable code
+#      pragma warning(disable: 4996) // this function or variable may be unsafe
+#      pragma warning(disable: 4793) // function compiled as native: presence of '_setjmp' makes a function unmanaged
+#endif
+
+#ifdef __INTEL_COMPILER
+#      pragma warning(disable: 177) // function was declared but never referenced 
+#      pragma warning(disable: 279) // controlling expression is constant
+#      pragma warning(disable: 1478 1786) // function was declared "deprecated"
+#      pragma warning(disable: 1684) // conversion from pointer to same-sized integral type
+#endif
+
+#if defined(__BORLANDC__) && defined(PUGIXML_HEADER_ONLY)
+#      pragma warn -8080 // symbol is declared but never used; disabling this inside push/pop bracket does not make the warning go away
+#endif
+
+#ifdef __BORLANDC__
+#      pragma option push
+#      pragma warn -8008 // condition is always false
+#      pragma warn -8066 // unreachable code
+#endif
+
+#ifdef __SNC__
+// Using diag_push/diag_pop does not disable the warnings inside templates due to a compiler bug
+#      pragma diag_suppress=178 // function was declared but never referenced
+#      pragma diag_suppress=237 // controlling expression is constant
+#endif
+
+// Inlining controls
+#if defined(_MSC_VER) && _MSC_VER >= 1300
+#      define PUGI__NO_INLINE __declspec(noinline)
+#elif defined(__GNUC__)
+#      define PUGI__NO_INLINE __attribute__((noinline))
+#else
+#      define PUGI__NO_INLINE 
+#endif
+
+// Branch weight controls
+#if defined(__GNUC__)
+#      define PUGI__UNLIKELY(cond) __builtin_expect(cond, 0)
+#else
+#      define PUGI__UNLIKELY(cond) (cond)
+#endif
+
+// Simple static assertion
+#define PUGI__STATIC_ASSERT(cond) { static const char condition_failed[(cond) ? 1 : -1] = {0}; (void)condition_failed[0]; }
+
+// Digital Mars C++ bug workaround for passing char loaded from memory via stack
+#ifdef __DMC__
+#      define PUGI__DMC_VOLATILE volatile
+#else
+#      define PUGI__DMC_VOLATILE
+#endif
+
+// Borland C++ bug workaround for not defining ::memcpy depending on header include order (can't always use std::memcpy because some compilers don't have it at all)
+#if defined(__BORLANDC__) && !defined(__MEM_H_USING_LIST)
+using std::memcpy;
+using std::memmove;
+#endif
+
+// In some environments MSVC is a compiler but the CRT lacks certain MSVC-specific features
+#if defined(_MSC_VER) && !defined(__S3E__)
+#      define PUGI__MSVC_CRT_VERSION _MSC_VER
+#endif
+
+#ifdef PUGIXML_HEADER_ONLY
+#      define PUGI__NS_BEGIN namespace pugi { namespace impl {
+#      define PUGI__NS_END } }
+#      define PUGI__FN inline
+#      define PUGI__FN_NO_INLINE inline
+#else
+#      if defined(_MSC_VER) && _MSC_VER < 1300 // MSVC6 seems to have an amusing bug with anonymous namespaces inside namespaces
+#              define PUGI__NS_BEGIN namespace pugi { namespace impl {
+#              define PUGI__NS_END } }
+#      else
+#              define PUGI__NS_BEGIN namespace pugi { namespace impl { namespace {
+#              define PUGI__NS_END } } }
+#      endif
+#      define PUGI__FN
+#      define PUGI__FN_NO_INLINE PUGI__NO_INLINE
+#endif
+
+// uintptr_t
+#if !defined(_MSC_VER) || _MSC_VER >= 1600
+#      include <stdint.h>
+#else
+#      ifndef _UINTPTR_T_DEFINED
+// No native uintptr_t in MSVC6 and in some WinCE versions
+typedef size_t uintptr_t;
+#define _UINTPTR_T_DEFINED
+#      endif
+PUGI__NS_BEGIN
+       typedef unsigned __int8 uint8_t;
+       typedef unsigned __int16 uint16_t;
+       typedef unsigned __int32 uint32_t;
+PUGI__NS_END
+#endif
+
+// Memory allocation
+PUGI__NS_BEGIN
+       // Default allocation function: forwards the request to malloc
+       PUGI__FN void* default_allocate(size_t size)
+       {
+               void* block = malloc(size);
+
+               return block;
+       }
+
+       // Default deallocation function: hands the block back to free
+       PUGI__FN void default_deallocate(void* ptr)
+       {
+               free(ptr);
+       }
+
+       // Holder for the allocation/deallocation function pointers currently in effect.
+       // The template parameter is not used by the members; it only exists so that the
+       // statics below can be defined as template statics (see the comment on them).
+       template <typename T>
+       struct xml_memory_management_function_storage
+       {
+               static allocation_function allocate;
+               static deallocation_function deallocate;
+       };
+
+       // Global allocation functions are stored in class statics so that in header mode linker deduplicates them
+       // Without a template<> we'll get multiple definitions of the same static
+       // Both start out pointing at the malloc/free based defaults.
+       template <typename T> allocation_function xml_memory_management_function_storage<T>::allocate = default_allocate;
+       template <typename T> deallocation_function xml_memory_management_function_storage<T>::deallocate = default_deallocate;
+
+       // The single instantiation used throughout this file
+       typedef xml_memory_management_function_storage<int> xml_memory;
+PUGI__NS_END
+
+// String utilities
+PUGI__NS_BEGIN
+       // Get string length
+       PUGI__FN size_t strlength(const char_t* s)
+       {
+               assert(s);
+
+       #ifdef PUGIXML_WCHAR_MODE
+               return wcslen(s);
+       #else
+               return strlen(s);
+       #endif
+       }
+
+       // Compare two strings
+       PUGI__FN bool strequal(const char_t* src, const char_t* dst)
+       {
+               assert(src && dst);
+
+       #ifdef PUGIXML_WCHAR_MODE
+               return wcscmp(src, dst) == 0;
+       #else
+               return strcmp(src, dst) == 0;
+       #endif
+       }
+
+       // Compare lhs with [rhs_begin, rhs_end)
+       PUGI__FN bool strequalrange(const char_t* lhs, const char_t* rhs, size_t count)
+       {
+               for (size_t i = 0; i < count; ++i)
+                       if (lhs[i] != rhs[i])
+                               return false;
+       
+               return lhs[count] == 0;
+       }
+
+       // Get length of wide string, even if CRT lacks wide character support
+       PUGI__FN size_t strlength_wide(const wchar_t* s)
+       {
+               assert(s);
+
+       #ifdef PUGIXML_WCHAR_MODE
+               return wcslen(s);
+       #else
+               const wchar_t* end = s;
+               while (*end) end++;
+               return static_cast<size_t>(end - s);
+       #endif
+       }
+
+#ifdef PUGIXML_WCHAR_MODE
+       // Copy a null-terminated string into dest one character at a time, widening each
+       // char to wchar_t (all symbols are assumed to be ASCII); dest is null-terminated
+       PUGI__FN void widen_ascii(wchar_t* dest, const char* source)
+       {
+               while (*source)
+               {
+                       *dest = *source;
+
+                       ++dest;
+                       ++source;
+               }
+
+               *dest = 0;
+       }
+#endif
+PUGI__NS_END
+
+#if !defined(PUGIXML_NO_STL) || !defined(PUGIXML_NO_XPATH)
+// auto_ptr-like buffer holder for exception recovery
+PUGI__NS_BEGIN
+       struct buffer_holder
+       {
+               void* data;
+               void (*deleter)(void*);
+
+               buffer_holder(void* data_, void (*deleter_)(void*)): data(data_), deleter(deleter_)
+               {
+               }
+
+               ~buffer_holder()
+               {
+                       if (data) deleter(data);
+               }
+
+               void* release()
+               {
+                       void* result = data;
+                       data = 0;
+                       return result;
+               }
+       };
+PUGI__NS_END
+#endif
+
+PUGI__NS_BEGIN
+       // Size in bytes of the data area of a regular memory page; can be overridden
+       // at compile time by defining PUGIXML_MEMORY_PAGE_SIZE
+       static const size_t xml_memory_page_size =
+       #ifdef PUGIXML_MEMORY_PAGE_SIZE
+               PUGIXML_MEMORY_PAGE_SIZE
+       #else
+               32768
+       #endif
+               ;
+
+       // Pages are aligned to 64 bytes, which leaves the low 6 bits of a page address free.
+       // Node/attribute headers store the page address in the upper bits and use the
+       // low bits for the flags and the node type defined below.
+       static const uintptr_t xml_memory_page_alignment = 64;
+       static const uintptr_t xml_memory_page_pointer_mask = ~(xml_memory_page_alignment - 1);
+       static const uintptr_t xml_memory_page_contents_shared_mask = 32;
+       static const uintptr_t xml_memory_page_name_allocated_mask = 16;
+       static const uintptr_t xml_memory_page_value_allocated_mask = 8;
+       static const uintptr_t xml_memory_page_type_mask = 7;
+       static const uintptr_t xml_memory_page_name_allocated_or_shared_mask = xml_memory_page_name_allocated_mask | xml_memory_page_contents_shared_mask;
+       static const uintptr_t xml_memory_page_value_allocated_or_shared_mask = xml_memory_page_value_allocated_mask | xml_memory_page_contents_shared_mask;
+
+       // Extract the node type from a header; the type is stored as (type - 1) in the low 3 bits
+       #define PUGI__NODETYPE(n) static_cast<xml_node_type>(((n)->header & impl::xml_memory_page_type_mask) + 1)
+
+       struct xml_allocator;
+
+       struct xml_memory_page
+       {
+               static xml_memory_page* construct(void* memory)
+               {
+                       xml_memory_page* result = static_cast<xml_memory_page*>(memory);
+
+                       result->allocator = 0;
+                       result->prev = 0;
+                       result->next = 0;
+                       result->busy_size = 0;
+                       result->freed_size = 0;
+
+                       return result;
+               }
+
+               xml_allocator* allocator;
+
+               xml_memory_page* prev;
+               xml_memory_page* next;
+
+               size_t busy_size;
+               size_t freed_size;
+       };
+
+       // Bookkeeping record stored immediately in front of every string that was
+       // allocated through xml_allocator::allocate_string; deallocate_string uses it
+       // to find the owning page and the size of the allocation
+       struct xml_memory_string_header
+       {
+               uint16_t page_offset; // offset from page->data
+               uint16_t full_size; // 0 if string occupies whole page
+       };
+
+       // Page-based allocator for nodes, attributes and strings.
+       // Memory is handed out sequentially from the current page (_root); when a request
+       // does not fit, allocate_memory_oob obtains another page. Freed bytes are only
+       // counted per page, and a page is released once everything in it has been freed.
+       struct xml_allocator
+       {
+               // Start allocating from the given page, continuing after its already-used bytes
+               xml_allocator(xml_memory_page* root): _root(root), _busy_size(root->busy_size)
+               {
+               }
+
+               // Allocate a new page with data_size bytes of data area.
+               // Returns 0 if the underlying allocation function fails.
+               xml_memory_page* allocate_page(size_t data_size)
+               {
+                       size_t size = sizeof(xml_memory_page) + data_size;
+
+                       // allocate block with some alignment, leaving memory for worst-case padding
+                       void* memory = xml_memory::allocate(size + xml_memory_page_alignment);
+                       if (!memory) return 0;
+
+                       // align to next page boundary (note: this guarantees at least 1 usable byte before the page)
+                       char* page_memory = reinterpret_cast<char*>((reinterpret_cast<uintptr_t>(memory) + xml_memory_page_alignment) & ~(xml_memory_page_alignment - 1));
+
+                       // prepare page structure
+                       xml_memory_page* page = xml_memory_page::construct(page_memory);
+                       assert(page);
+
+                       // every page carries the same allocator pointer as the current root page
+                       page->allocator = _root->allocator;
+
+                       // record the offset for freeing the memory block
+                       assert(page_memory > memory && page_memory - static_cast<char*>(memory) <= 127);
+                       page_memory[-1] = static_cast<char>(page_memory - static_cast<char*>(memory));
+
+                       return page;
+               }
+
+               // Release a page obtained from allocate_page
+               static void deallocate_page(xml_memory_page* page)
+               {
+                       char* page_memory = reinterpret_cast<char*>(page);
+
+                       // the byte in front of the page holds the distance back to the start of the original block
+                       xml_memory::deallocate(page_memory - page_memory[-1]);
+               }
+
+               // Slow path of allocate_memory (defined out of line)
+               void* allocate_memory_oob(size_t size, xml_memory_page*& out_page);
+
+               // Allocate size bytes; out_page receives the page that contains the result.
+               // Returns whatever allocate_memory_oob returns when the current page is too full.
+               void* allocate_memory(size_t size, xml_memory_page*& out_page)
+               {
+                       if (_busy_size + size > xml_memory_page_size) return allocate_memory_oob(size, out_page);
+
+                       // the data area starts right after the page header
+                       void* buf = reinterpret_cast<char*>(_root) + sizeof(xml_memory_page) + _busy_size;
+
+                       _busy_size += size;
+
+                       out_page = _root;
+
+                       return buf;
+               }
+
+               // Give back size bytes at ptr that were allocated from page
+               void deallocate_memory(void* ptr, size_t size, xml_memory_page* page)
+               {
+                       // the busy size of the current page is tracked in _busy_size; sync it into the page first
+                       if (page == _root) page->busy_size = _busy_size;
+
+                       assert(ptr >= reinterpret_cast<char*>(page) + sizeof(xml_memory_page) && ptr < reinterpret_cast<char*>(page) + sizeof(xml_memory_page) + page->busy_size);
+                       (void)!ptr;
+
+                       page->freed_size += size;
+                       assert(page->freed_size <= page->busy_size);
+
+                       // everything allocated from this page has been freed
+                       if (page->freed_size == page->busy_size)
+                       {
+                               if (page->next == 0)
+                               {
+                                       assert(_root == page);
+
+                                       // top page freed, just reset sizes
+                                       page->busy_size = page->freed_size = 0;
+                                       _busy_size = 0;
+                               }
+                               else
+                               {
+                                       assert(_root != page);
+                                       assert(page->prev);
+
+                                       // remove from the list
+                                       page->prev->next = page->next;
+                                       page->next->prev = page->prev;
+
+                                       // deallocate
+                                       deallocate_page(page);
+                               }
+                       }
+               }
+
+               // Allocate storage for length characters, preceded by an xml_memory_string_header.
+               // Returns a pointer to the character storage, or 0 if allocation fails.
+               char_t* allocate_string(size_t length)
+               {
+                       PUGI__STATIC_ASSERT(xml_memory_page_size <= (1 << 16));
+
+                       // allocate memory for string and header block
+                       size_t size = sizeof(xml_memory_string_header) + length * sizeof(char_t);
+                       
+                       // round size up to pointer alignment boundary
+                       size_t full_size = (size + (sizeof(void*) - 1)) & ~(sizeof(void*) - 1);
+
+                       xml_memory_page* page;
+                       xml_memory_string_header* header = static_cast<xml_memory_string_header*>(allocate_memory(full_size, page));
+
+                       if (!header) return 0;
+
+                       // setup header
+                       ptrdiff_t page_offset = reinterpret_cast<char*>(header) - reinterpret_cast<char*>(page) - sizeof(xml_memory_page);
+
+                       assert(page_offset >= 0 && page_offset < (1 << 16));
+                       header->page_offset = static_cast<uint16_t>(page_offset);
+
+                       // full_size == 0 for large strings that occupy the whole page
+                       assert(full_size < (1 << 16) || (page->busy_size == full_size && page_offset == 0));
+                       header->full_size = static_cast<uint16_t>(full_size < (1 << 16) ? full_size : 0);
+
+                       // round-trip through void* to avoid 'cast increases required alignment of target type' warning
+                       // header is guaranteed a pointer-sized alignment, which should be enough for char_t
+                       return static_cast<char_t*>(static_cast<void*>(header + 1));
+               }
+
+               // Free a string that was returned by allocate_string
+               void deallocate_string(char_t* string)
+               {
+                       // this function casts pointers through void* to avoid 'cast increases required alignment of target type' warnings
+                       // we're guaranteed the proper (pointer-sized) alignment on the input string if it was allocated via allocate_string
+
+                       // get header
+                       xml_memory_string_header* header = static_cast<xml_memory_string_header*>(static_cast<void*>(string)) - 1;
+                       assert(header);
+
+                       // deallocate
+                       // step back from the header by its recorded offset plus the page header size to reach the page
+                       size_t page_offset = sizeof(xml_memory_page) + header->page_offset;
+                       xml_memory_page* page = reinterpret_cast<xml_memory_page*>(static_cast<void*>(reinterpret_cast<char*>(header) - page_offset));
+
+                       // if full_size == 0 then this string occupies the whole page
+                       size_t full_size = header->full_size == 0 ? page->busy_size : header->full_size;
+
+                       deallocate_memory(header, full_size, page);
+               }
+
+               xml_memory_page* _root; // page that allocations are currently served from
+               size_t _busy_size;      // bytes already handed out from _root
+       };
+
+       PUGI__FN_NO_INLINE void* xml_allocator::allocate_memory_oob(size_t size, xml_memory_page*& out_page)
+       {
+               const size_t large_allocation_threshold = xml_memory_page_size / 4;
+
+               xml_memory_page* page = allocate_page(size <= large_allocation_threshold ? xml_memory_page_size : size);
+               out_page = page;
+
+               if (!page) return 0;
+
+               if (size <= large_allocation_threshold)
+               {
+                       _root->busy_size = _busy_size;
+
+                       // insert page at the end of linked list
+                       page->prev = _root;
+                       _root->next = page;
+                       _root = page;
+
+                       _busy_size = size;
+               }
+               else
+               {
+                       // insert page before the end of linked list, so that it is deleted as soon as possible
+                       // the last page is not deleted even if it's empty (see deallocate_memory)
+                       assert(_root->prev);
+
+                       page->prev = _root->prev;
+                       page->next = _root;
+
+                       _root->prev->next = page;
+                       _root->prev = page;
+               }
+
+               // allocate inside page
+               page->busy_size = size;
+
+               return reinterpret_cast<char*>(page) + sizeof(xml_memory_page);
+       }
+PUGI__NS_END
+
+namespace pugi
+{
+       /// A 'name=value' XML attribute structure.
+       struct xml_attribute_struct
+       {
+               /// Constructs an empty, unlinked attribute.
+               /// \param page - memory page the attribute lives in; its address is stored in the header
+               xml_attribute_struct(impl::xml_memory_page* page): header(reinterpret_cast<uintptr_t>(page)), name(0), value(0), prev_attribute_c(0), next_attribute(0)
+               {
+               }
+
+               uintptr_t header;       ///< Page address combined with flag bits (see the impl::xml_memory_page_*_mask constants)
+
+               char_t* name;   ///< Pointer to attribute name.
+               char_t* value;  ///< Pointer to attribute value.
+
+               xml_attribute_struct* prev_attribute_c; ///< Previous attribute (cyclic list)
+               xml_attribute_struct* next_attribute;   ///< Next attribute
+       };
+
+       /// An XML document tree node.
+       struct xml_node_struct
+       {
+               /// Constructs an empty, unlinked node.
+               /// \param page - memory page the node lives in; its address is stored in the header
+               /// \param type - node type; stored as (type - 1) in the low bits of the header
+               xml_node_struct(impl::xml_memory_page* page, xml_node_type type): header(reinterpret_cast<uintptr_t>(page) | (type - 1)), parent(0), name(0), value(0), first_child(0), prev_sibling_c(0), next_sibling(0), first_attribute(0)
+               {
+               }
+
+               uintptr_t header;       ///< Page address combined with flag bits and the node type
+
+               xml_node_struct*                parent;                                 ///< Pointer to parent
+
+               char_t*                                 name;                                   ///< Pointer to element name.
+               char_t*                                 value;                                  ///< Pointer to any associated string data.
+
+               xml_node_struct*                first_child;                    ///< First child
+               
+               xml_node_struct*                prev_sibling_c;                 ///< Previous sibling (cyclic list)
+               xml_node_struct*                next_sibling;                   ///< Next sibling
+               
+               xml_attribute_struct*   first_attribute;                ///< First attribute
+       };
+}
+
+PUGI__NS_BEGIN
+       // Entry of a singly-linked list of additional character buffers
+       // (the list head is xml_document_struct::extra_buffers)
+       struct xml_extra_buffer
+       {
+               char_t* buffer;
+               xml_extra_buffer* next;
+       };
+
+       // Root of a document: it is both the document node (type node_document)
+       // and the allocator, both set up on the same initial page
+       struct xml_document_struct: public xml_node_struct, public xml_allocator
+       {
+               xml_document_struct(xml_memory_page* page): xml_node_struct(page, node_document), xml_allocator(page), buffer(0), extra_buffers(0)
+               {
+               }
+
+               const char_t* buffer;
+
+               // head of the list of extra buffers (initially empty)
+               xml_extra_buffer* extra_buffers;
+       };
+
+       // Find the allocator of a node through the page address stored in the node header
+       inline xml_allocator& get_allocator(const xml_node_struct* node)
+       {
+               assert(node);
+
+               // strip the flag/type bits to recover the page address
+               xml_memory_page* page = reinterpret_cast<xml_memory_page*>(node->header & xml_memory_page_pointer_mask);
+
+               return *page->allocator;
+       }
+
+       // Find the document that owns a node or attribute: the allocator recorded in the
+       // object's page is cast down to the document structure that derives from it
+       template <typename Object> inline xml_document_struct& get_document(const Object* object)
+       {
+               assert(object);
+
+               // strip the flag/type bits to recover the page address
+               xml_memory_page* page = reinterpret_cast<xml_memory_page*>(object->header & xml_memory_page_pointer_mask);
+
+               return *static_cast<xml_document_struct*>(page->allocator);
+       }
+PUGI__NS_END
+
+// Low-level DOM operations
+PUGI__NS_BEGIN
+       // Allocate storage for an attribute from alloc and construct it in place.
+       // Returns 0 if the allocator could not provide memory.
+       inline xml_attribute_struct* allocate_attribute(xml_allocator& alloc)
+       {
+               xml_memory_page* page;
+               void* memory = alloc.allocate_memory(sizeof(xml_attribute_struct), page);
+
+               // allocate_memory returns 0 when a new page cannot be obtained; placement new must
+               // not be handed a null pointer, and callers test the result of this function against 0
+               if (!memory) return 0;
+
+               return new (memory) xml_attribute_struct(page);
+       }
+
+       // Allocate storage for a node of the given type from alloc and construct it in place.
+       // Returns 0 if the allocator could not provide memory.
+       inline xml_node_struct* allocate_node(xml_allocator& alloc, xml_node_type type)
+       {
+               xml_memory_page* page;
+               void* memory = alloc.allocate_memory(sizeof(xml_node_struct), page);
+
+               // allocate_memory returns 0 when a new page cannot be obtained; placement new must
+               // not be handed a null pointer, and callers test the result of this function against 0
+               if (!memory) return 0;
+
+               return new (memory) xml_node_struct(page, type);
+       }
+
+       inline void destroy_attribute(xml_attribute_struct* a, xml_allocator& alloc)
+       {
+               uintptr_t header = a->header;
+
+               if (header & impl::xml_memory_page_name_allocated_mask) alloc.deallocate_string(a->name);
+               if (header & impl::xml_memory_page_value_allocated_mask) alloc.deallocate_string(a->value);
+
+               alloc.deallocate_memory(a, sizeof(xml_attribute_struct), reinterpret_cast<xml_memory_page*>(header & xml_memory_page_pointer_mask));
+       }
+
+       inline void destroy_node(xml_node_struct* n, xml_allocator& alloc)
+       {
+               uintptr_t header = n->header;
+
+               if (header & impl::xml_memory_page_name_allocated_mask) alloc.deallocate_string(n->name);
+               if (header & impl::xml_memory_page_value_allocated_mask) alloc.deallocate_string(n->value);
+
+               for (xml_attribute_struct* attr = n->first_attribute; attr; )
+               {
+                       xml_attribute_struct* next = attr->next_attribute;
+
+                       destroy_attribute(attr, alloc);
+
+                       attr = next;
+               }
+
+               for (xml_node_struct* child = n->first_child; child; )
+               {
+                       xml_node_struct* next = child->next_sibling;
+
+                       destroy_node(child, alloc);
+
+                       child = next;
+               }
+
+               alloc.deallocate_memory(n, sizeof(xml_node_struct), reinterpret_cast<xml_memory_page*>(header & xml_memory_page_pointer_mask));
+       }
+
+       inline void append_node(xml_node_struct* child, xml_node_struct* node)
+       {
+               child->parent = node;
+
+               xml_node_struct* head = node->first_child;
+
+               if (head)
+               {
+                       xml_node_struct* tail = head->prev_sibling_c;
+
+                       tail->next_sibling = child;
+                       child->prev_sibling_c = tail;
+                       head->prev_sibling_c = child;
+               }
+               else
+               {
+                       node->first_child = child;
+                       child->prev_sibling_c = child;
+               }
+       }
+
+       inline void prepend_node(xml_node_struct* child, xml_node_struct* node)
+       {
+               child->parent = node;
+
+               xml_node_struct* head = node->first_child;
+
+               if (head)
+               {
+                       child->prev_sibling_c = head->prev_sibling_c;
+                       head->prev_sibling_c = child;
+               }
+               else
+                       child->prev_sibling_c = child;
+
+               child->next_sibling = head;
+               node->first_child = child;
+       }
+
+       inline void insert_node_after(xml_node_struct* child, xml_node_struct* node)
+       {
+               xml_node_struct* parent = node->parent;
+
+               child->parent = parent;
+
+               if (node->next_sibling)
+                       node->next_sibling->prev_sibling_c = child;
+               else
+                       parent->first_child->prev_sibling_c = child;
+
+               child->next_sibling = node->next_sibling;
+               child->prev_sibling_c = node;
+
+               node->next_sibling = child;
+       }
+
+       inline void insert_node_before(xml_node_struct* child, xml_node_struct* node)
+       {
+               xml_node_struct* parent = node->parent;
+
+               child->parent = parent;
+
+               if (node->prev_sibling_c->next_sibling)
+                       node->prev_sibling_c->next_sibling = child;
+               else
+                       parent->first_child = child;
+
+               child->prev_sibling_c = node->prev_sibling_c;
+               child->next_sibling = node;
+
+               node->prev_sibling_c = child;
+       }
+
+       inline void remove_node(xml_node_struct* node)
+       {
+               xml_node_struct* parent = node->parent;
+
+               if (node->next_sibling)
+                       node->next_sibling->prev_sibling_c = node->prev_sibling_c;
+               else
+                       parent->first_child->prev_sibling_c = node->prev_sibling_c;
+
+               if (node->prev_sibling_c->next_sibling)
+                       node->prev_sibling_c->next_sibling = node->next_sibling;
+               else
+                       parent->first_child = node->next_sibling;
+
+               node->parent = 0;
+               node->prev_sibling_c = 0;
+               node->next_sibling = 0;
+       }
+
+       inline void append_attribute(xml_attribute_struct* attr, xml_node_struct* node)
+       {
+               xml_attribute_struct* head = node->first_attribute;
+
+               if (head)
+               {
+                       xml_attribute_struct* tail = head->prev_attribute_c;
+
+                       tail->next_attribute = attr;
+                       attr->prev_attribute_c = tail;
+                       head->prev_attribute_c = attr;
+               }
+               else
+               {
+                       node->first_attribute = attr;
+                       attr->prev_attribute_c = attr;
+               }
+       }
+
+       inline void prepend_attribute(xml_attribute_struct* attr, xml_node_struct* node)
+       {
+               xml_attribute_struct* head = node->first_attribute;
+
+               if (head)
+               {
+                       attr->prev_attribute_c = head->prev_attribute_c;
+                       head->prev_attribute_c = attr;
+               }
+               else
+                       attr->prev_attribute_c = attr;
+
+               attr->next_attribute = head;
+               node->first_attribute = attr;
+       }
+
+       inline void insert_attribute_after(xml_attribute_struct* attr, xml_attribute_struct* place, xml_node_struct* node) // Links attr directly after place.
+       {
+               if (place->next_attribute)
+                       place->next_attribute->prev_attribute_c = attr;
+               else
+                       node->first_attribute->prev_attribute_c = attr; // place had no successor, so attr becomes what the head's prev_attribute_c points at
+
+               attr->next_attribute = place->next_attribute;
+               attr->prev_attribute_c = place;
+               place->next_attribute = attr; // written last: the old value is read on the lines above
+       }
+
+       inline void insert_attribute_before(xml_attribute_struct* attr, xml_attribute_struct* place, xml_node_struct* node) // Links attr directly before place.
+       {
+               if (place->prev_attribute_c->next_attribute) // non-null next on the predecessor: place has a real predecessor
+                       place->prev_attribute_c->next_attribute = attr;
+               else
+                       node->first_attribute = attr; // otherwise place is treated as the head and attr replaces it
+
+               attr->prev_attribute_c = place->prev_attribute_c;
+               attr->next_attribute = place;
+               place->prev_attribute_c = attr; // written last: the old value is read on the lines above
+       }
+
+       inline void remove_attribute(xml_attribute_struct* attr, xml_node_struct* node) // Unlinks attr from node's attribute list; attr itself is not freed here.
+       {
+               if (attr->next_attribute)
+                       attr->next_attribute->prev_attribute_c = attr->prev_attribute_c;
+               else
+                       node->first_attribute->prev_attribute_c = attr->prev_attribute_c; // attr had no successor: the head's prev_attribute_c moves to attr's predecessor
+
+               if (attr->prev_attribute_c->next_attribute) // non-null next on the predecessor: attr has a real predecessor
+                       attr->prev_attribute_c->next_attribute = attr->next_attribute;
+               else
+                       node->first_attribute = attr->next_attribute; // otherwise attr is treated as the head; its successor (possibly null) becomes the head
+
+               attr->prev_attribute_c = 0; // clear both links of the detached attribute
+               attr->next_attribute = 0;
+       }
+
+       PUGI__FN_NO_INLINE xml_node_struct* append_new_node(xml_node_struct* node, xml_allocator& alloc, xml_node_type type = node_element) // Allocates a node of the given type and hands it to append_node(child, node).
+       {
+               xml_node_struct* child = allocate_node(alloc, type);
+               if (!child) return 0; // allocation failed: null is returned and append_node is not called
+
+               append_node(child, node);
+
+               return child; // the newly allocated node
+       }
+
+       PUGI__FN_NO_INLINE xml_attribute_struct* append_new_attribute(xml_node_struct* node, xml_allocator& alloc) // Allocates an attribute and links it as node's last attribute.
+       {
+               xml_attribute_struct* attr = allocate_attribute(alloc);
+               if (!attr) return 0; // allocation failed: null is returned and node is left untouched
+
+               append_attribute(attr, node);
+
+               return attr; // the newly allocated attribute
+       }
+PUGI__NS_END
+
+// Helper classes for code generation
+PUGI__NS_BEGIN
+       struct opt_false // Compile-time "off" tag: exposes value == 0 for templates that test opt::value.
+       {
+               enum { value = 0 };
+       };
+
+       struct opt_true // Compile-time "on" tag: exposes value == 1 for templates that test opt::value.
+       {
+               enum { value = 1 };
+       };
+PUGI__NS_END
+
+// Unicode utilities
+PUGI__NS_BEGIN
+       inline uint16_t endian_swap(uint16_t value) // Returns value with its two bytes exchanged.
+       {
+               return static_cast<uint16_t>(((value & 0xff) << 8) | (value >> 8)); // low byte moves up, high byte moves down
+       }
+
+       inline uint32_t endian_swap(uint32_t value) // Returns value with its four bytes in reverse order.
+       {
+               return ((value & 0xff) << 24) | ((value & 0xff00) << 8) | ((value & 0xff0000) >> 8) | (value >> 24); // byte 0<->3, byte 1<->2
+       }
+
+       struct utf8_counter // Traits that add up how many UTF-8 bytes the received code points need; nothing is written.
+       {
+               typedef size_t value_type;
+               // low(): returns result plus 1, 2 or 3 depending on the magnitude of ch
+               static value_type low(value_type result, uint32_t ch)
+               {
+                       // U+0000..U+007F
+                       if (ch < 0x80) return result + 1;
+                       // U+0080..U+07FF
+                       else if (ch < 0x800) return result + 2;
+                       // U+0800..U+FFFF
+                       else return result + 3;
+               }
+               // high(): always returns result plus 4; the code point value is ignored
+               static value_type high(value_type result, uint32_t)
+               {
+                       // U+10000..U+10FFFF
+                       return result + 4;
+               }
+       };
+
+       struct utf8_writer // Traits that store code points as UTF-8 bytes at result and return the advanced pointer.
+       {
+               typedef uint8_t* value_type;
+               // low(): writes 1 to 3 bytes depending on ch; no bounds check is performed on result
+               static value_type low(value_type result, uint32_t ch)
+               {
+                       // U+0000..U+007F
+                       if (ch < 0x80)
+                       {
+                               *result = static_cast<uint8_t>(ch);
+                               return result + 1;
+                       }
+                       // U+0080..U+07FF
+                       else if (ch < 0x800)
+                       {
+                               result[0] = static_cast<uint8_t>(0xC0 | (ch >> 6));
+                               result[1] = static_cast<uint8_t>(0x80 | (ch & 0x3F));
+                               return result + 2;
+                       }
+                       // U+0800..U+FFFF
+                       else
+                       {
+                               result[0] = static_cast<uint8_t>(0xE0 | (ch >> 12));
+                               result[1] = static_cast<uint8_t>(0x80 | ((ch >> 6) & 0x3F));
+                               result[2] = static_cast<uint8_t>(0x80 | (ch & 0x3F));
+                               return result + 3;
+                       }
+               }
+               // high(): always writes 4 bytes; ch is not range-checked here
+               static value_type high(value_type result, uint32_t ch)
+               {
+                       // U+10000..U+10FFFF
+                       result[0] = static_cast<uint8_t>(0xF0 | (ch >> 18));
+                       result[1] = static_cast<uint8_t>(0x80 | ((ch >> 12) & 0x3F));
+                       result[2] = static_cast<uint8_t>(0x80 | ((ch >> 6) & 0x3F));
+                       result[3] = static_cast<uint8_t>(0x80 | (ch & 0x3F));
+                       return result + 4;
+               }
+               // any(): dispatches to low() below 0x10000 and to high() otherwise
+               static value_type any(value_type result, uint32_t ch)
+               {
+                       return (ch < 0x10000) ? low(result, ch) : high(result, ch);
+               }
+       };
+
+       struct utf16_counter // Traits that add up how many 16-bit units the received code points need; nothing is written.
+       {
+               typedef size_t value_type;
+               // low(): one unit per code point
+               static value_type low(value_type result, uint32_t)
+               {
+                       return result + 1;
+               }
+               // high(): two units per code point (a surrogate pair, see utf16_writer::high)
+               static value_type high(value_type result, uint32_t)
+               {
+                       return result + 2;
+               }
+       };
+
+       struct utf16_writer // Traits that store code points as 16-bit units at result and return the advanced pointer.
+       {
+               typedef uint16_t* value_type;
+               // low(): stores ch truncated to 16 bits as a single unit
+               static value_type low(value_type result, uint32_t ch)
+               {
+                       *result = static_cast<uint16_t>(ch);
+
+                       return result + 1;
+               }
+               // high(): stores ch as a surrogate pair; ch is expected to be >= 0x10000 (not checked here)
+               static value_type high(value_type result, uint32_t ch)
+               {
+                       uint32_t msh = static_cast<uint32_t>(ch - 0x10000) >> 10; // bits above the low ten of the offset
+                       uint32_t lsh = static_cast<uint32_t>(ch - 0x10000) & 0x3ff; // low ten bits of the offset
+
+                       result[0] = static_cast<uint16_t>(0xD800 + msh);
+                       result[1] = static_cast<uint16_t>(0xDC00 + lsh);
+
+                       return result + 2;
+               }
+               // any(): dispatches to low() below 0x10000 and to high() otherwise
+               static value_type any(value_type result, uint32_t ch)
+               {
+                       return (ch < 0x10000) ? low(result, ch) : high(result, ch);
+               }
+       };
+
+       struct utf32_counter // Traits that count one 32-bit unit per received code point; nothing is written.
+       {
+               typedef size_t value_type;
+               // low(): one unit per code point
+               static value_type low(value_type result, uint32_t)
+               {
+                       return result + 1;
+               }
+               // high(): also one unit per code point
+               static value_type high(value_type result, uint32_t)
+               {
+                       return result + 1;
+               }
+       };
+
+       struct utf32_writer // Traits that store each code point unchanged as one 32-bit unit and return the advanced pointer.
+       {
+               typedef uint32_t* value_type;
+               // low(), high() and any() below are identical: store ch, advance by one
+               static value_type low(value_type result, uint32_t ch)
+               {
+                       *result = ch;
+
+                       return result + 1;
+               }
+
+               static value_type high(value_type result, uint32_t ch)
+               {
+                       *result = ch;
+
+                       return result + 1;
+               }
+
+               static value_type any(value_type result, uint32_t ch)
+               {
+                       *result = ch;
+
+                       return result + 1;
+               }
+       };
+
+       struct latin1_writer // Traits that store one byte per code point, substituting '?' for anything above 255.
+       {
+               typedef uint8_t* value_type;
+               // low(): stores ch itself when it fits in a byte, '?' otherwise
+               static value_type low(value_type result, uint32_t ch)
+               {
+                       *result = static_cast<uint8_t>(ch > 255 ? '?' : ch);
+
+                       return result + 1;
+               }
+               // high(): always stores '?'
+               static value_type high(value_type result, uint32_t ch)
+               {
+                       (void)ch; // parameter intentionally unused
+
+                       *result = '?';
+
+                       return result + 1;
+               }
+       };
+
+       template <size_t size> struct wchar_selector; // Declared only: sizes other than the 2 and 4 specialised below have no definition.
+       // 2-byte wchar_t: handled as UTF-16 code units
+       template <> struct wchar_selector<2>
+       {
+               typedef uint16_t type;
+               typedef utf16_counter counter;
+               typedef utf16_writer writer;
+       };
+       // 4-byte wchar_t: handled as UTF-32 code units
+       template <> struct wchar_selector<4>
+       {
+               typedef uint32_t type;
+               typedef utf32_counter counter;
+               typedef utf32_writer writer;
+       };
+       // counter/writer traits matching this platform's sizeof(wchar_t)
+       typedef wchar_selector<sizeof(wchar_t)>::counter wchar_counter;
+       typedef wchar_selector<sizeof(wchar_t)>::writer wchar_writer;
+
+       template <typename Traits, typename opt_swap = opt_false> struct utf_decoder // Decodes encoded input into code points and feeds each to Traits::low / Traits::high; opt_swap::value turns on byte swapping of 16/32-bit input units.
+       {
+               static inline typename Traits::value_type decode_utf8_block(const uint8_t* data, size_t size, typename Traits::value_type result) // size is in bytes; returns the accumulated Traits result
+               {
+                       const uint8_t utf8_byte_mask = 0x3f; // payload bits of a continuation byte
+
+                       while (size)
+                       {
+                               uint8_t lead = *data;
+
+                               // 0xxxxxxx -> U+0000..U+007F
+                               if (lead < 0x80)
+                               {
+                                       result = Traits::low(result, lead);
+                                       data += 1;
+                                       size -= 1;
+
+                                       // process aligned single-byte (ascii) blocks
+                                       if ((reinterpret_cast<uintptr_t>(data) & 3) == 0)
+                                       {
+                                               // round-trip through void* to silence 'cast increases required alignment of target type' warnings
+                                               while (size >= 4 && (*static_cast<const uint32_t*>(static_cast<const void*>(data)) & 0x80808080) == 0) // no high bit set in any of the next four bytes
+                                               {
+                                                       result = Traits::low(result, data[0]);
+                                                       result = Traits::low(result, data[1]);
+                                                       result = Traits::low(result, data[2]);
+                                                       result = Traits::low(result, data[3]);
+                                                       data += 4;
+                                                       size -= 4;
+                                               }
+                                       }
+                               }
+                               // 110xxxxx -> U+0080..U+07FF
+                               else if (static_cast<unsigned int>(lead - 0xC0) < 0x20 && size >= 2 && (data[1] & 0xc0) == 0x80) // unsigned wrap-around turns the subtraction into a range check for 0xC0..0xDF
+                               {
+                                       result = Traits::low(result, ((lead & ~0xC0) << 6) | (data[1] & utf8_byte_mask));
+                                       data += 2;
+                                       size -= 2;
+                               }
+                               // 1110xxxx -> U+0800..U+FFFF
+                               else if (static_cast<unsigned int>(lead - 0xE0) < 0x10 && size >= 3 && (data[1] & 0xc0) == 0x80 && (data[2] & 0xc0) == 0x80)
+                               {
+                                       result = Traits::low(result, ((lead & ~0xE0) << 12) | ((data[1] & utf8_byte_mask) << 6) | (data[2] & utf8_byte_mask));
+                                       data += 3;
+                                       size -= 3;
+                               }
+                               // 11110xxx -> U+10000..U+10FFFF
+                               else if (static_cast<unsigned int>(lead - 0xF0) < 0x08 && size >= 4 && (data[1] & 0xc0) == 0x80 && (data[2] & 0xc0) == 0x80 && (data[3] & 0xc0) == 0x80)
+                               {
+                                       result = Traits::high(result, ((lead & ~0xF0) << 18) | ((data[1] & utf8_byte_mask) << 12) | ((data[2] & utf8_byte_mask) << 6) | (data[3] & utf8_byte_mask));
+                                       data += 4;
+                                       size -= 4;
+                               }
+                               // 10xxxxxx or 11111xxx -> invalid
+                               else // also reached for a valid lead byte whose continuation bytes are missing or malformed; one byte is skipped without output
+                               {
+                                       data += 1;
+                                       size -= 1;
+                               }
+                       }
+
+                       return result;
+               }
+               // UTF-16 input; size is in 16-bit units. Units that do not form a valid code point are skipped without output.
+               static inline typename Traits::value_type decode_utf16_block(const uint16_t* data, size_t size, typename Traits::value_type result)
+               {
+                       const uint16_t* end = data + size;
+
+                       while (data < end)
+                       {
+                               unsigned int lead = opt_swap::value ? endian_swap(*data) : *data;
+
+                               // U+0000..U+D7FF
+                               if (lead < 0xD800)
+                               {
+                                       result = Traits::low(result, lead);
+                                       data += 1;
+                               }
+                               // U+E000..U+FFFF
+                               else if (static_cast<unsigned int>(lead - 0xE000) < 0x2000)
+                               {
+                                       result = Traits::low(result, lead);
+                                       data += 1;
+                               }
+                               // surrogate pair lead
+                               else if (static_cast<unsigned int>(lead - 0xD800) < 0x400 && data + 1 < end)
+                               {
+                                       uint16_t next = opt_swap::value ? endian_swap(data[1]) : data[1];
+
+                                       if (static_cast<unsigned int>(next - 0xDC00) < 0x400)
+                                       {
+                                               result = Traits::high(result, 0x10000 + ((lead & 0x3ff) << 10) + (next & 0x3ff));
+                                               data += 2;
+                                       }
+                                       else // lead surrogate not followed by a trail surrogate: only the lead is skipped
+                                       {
+                                               data += 1;
+                                       }
+                               }
+                               else // trail surrogate without a lead, or a lead surrogate in the last unit
+                               {
+                                       data += 1;
+                               }
+                       }
+
+                       return result;
+               }
+               // UTF-32 input; size is in 32-bit units. Every unit is forwarded; values are not range-checked here.
+               static inline typename Traits::value_type decode_utf32_block(const uint32_t* data, size_t size, typename Traits::value_type result)
+               {
+                       const uint32_t* end = data + size;
+
+                       while (data < end)
+                       {
+                               uint32_t lead = opt_swap::value ? endian_swap(*data) : *data;
+
+                               // U+0000..U+FFFF
+                               if (lead < 0x10000)
+                               {
+                                       result = Traits::low(result, lead);
+                                       data += 1;
+                               }
+                               // U+10000..U+10FFFF
+                               else
+                               {
+                                       result = Traits::high(result, lead);
+                                       data += 1;
+                               }
+                       }
+
+                       return result;
+               }
+               // Latin-1 input; size is in bytes. Each byte is forwarded to Traits::low as its own code point.
+               static inline typename Traits::value_type decode_latin1_block(const uint8_t* data, size_t size, typename Traits::value_type result)
+               {
+                       for (size_t i = 0; i < size; ++i)
+                       {
+                               result = Traits::low(result, data[i]);
+                       }
+
+                       return result;
+               }
+               // Overload chosen when wchar_t data has been cast to 16-bit units: decode as UTF-16.
+               static inline typename Traits::value_type decode_wchar_block_impl(const uint16_t* data, size_t size, typename Traits::value_type result)
+               {
+                       return decode_utf16_block(data, size, result);
+               }
+               // Overload chosen when wchar_t data has been cast to 32-bit units: decode as UTF-32.
+               static inline typename Traits::value_type decode_wchar_block_impl(const uint32_t* data, size_t size, typename Traits::value_type result)
+               {
+                       return decode_utf32_block(data, size, result);
+               }
+               // wchar_t input; size is in wchar_t units. Reinterprets the data as the same-sized unsigned type and dispatches on it.
+               static inline typename Traits::value_type decode_wchar_block(const wchar_t* data, size_t size, typename Traits::value_type result)
+               {
+                       return decode_wchar_block_impl(reinterpret_cast<const wchar_selector<sizeof(wchar_t)>::type*>(data), size, result);
+               }
+       };
+
+       template <typename T> PUGI__FN void convert_utf_endian_swap(T* result, const T* data, size_t length) // Writes the byte-swapped form of data[0..length) to result[0..length).
+       {
+               for (size_t i = 0; i < length; ++i) result[i] = endian_swap(data[i]); // each element is read before the same index is written
+       }
+
+#ifdef PUGIXML_WCHAR_MODE
+       PUGI__FN void convert_wchar_endian_swap(wchar_t* result, const wchar_t* data, size_t length) // wchar_t variant: swaps each element through the same-sized unsigned type.
+       {
+               for (size_t i = 0; i < length; ++i) result[i] = static_cast<wchar_t>(endian_swap(static_cast<wchar_selector<sizeof(wchar_t)>::type>(data[i]))); // each element is read before the same index is written
+       }
+#endif
+PUGI__NS_END
+
+PUGI__NS_BEGIN
+       enum chartype_t // Bit flags; each chartype_table entry is an OR of these, tested through PUGI__IS_CHARTYPE.
+       {
+               ct_parse_pcdata = 1,    // \0, &, \r, <
+               ct_parse_attr = 2,              // \0, &, \r, ', "
+               ct_parse_attr_ws = 4,   // \0, &, \r, ', ", \n, tab
+               ct_space = 8,                   // \r, \n, space, tab
+               ct_parse_cdata = 16,    // \0, ], >, \r
+               ct_parse_comment = 32,  // \0, -, >, \r
+               ct_symbol = 64,                 // Any symbol > 127, a-z, A-Z, 0-9, _, :, -, .
+               ct_start_symbol = 128   // Any symbol > 127, a-z, A-Z, _, :
+       };
+
+       static const unsigned char chartype_table[256] = // chartype_t flag set per character value (e.g. entry 0 is 55 = 1|2|4|16|32)
+       {
+               55,  0,   0,   0,   0,   0,   0,   0,      0,   12,  12,  0,   0,   63,  0,   0,   // 0-15
+               0,   0,   0,   0,   0,   0,   0,   0,      0,   0,   0,   0,   0,   0,   0,   0,   // 16-31
+               8,   0,   6,   0,   0,   0,   7,   6,      0,   0,   0,   0,   0,   96,  64,  0,   // 32-47
+               64,  64,  64,  64,  64,  64,  64,  64,     64,  64,  192, 0,   1,   0,   48,  0,   // 48-63
+               0,   192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 192, 192, 192, 192, 192, // 64-79
+               192, 192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 0,   0,   16,  0,   192, // 80-95
+               0,   192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 192, 192, 192, 192, 192, // 96-111
+               192, 192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 0, 0, 0, 0, 0,           // 112-127
+               // every value from 128 up carries ct_symbol | ct_start_symbol (64 + 128)
+               192, 192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 192, 192, 192, 192, 192, // 128+
+               192, 192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 192, 192, 192, 192, 192,
+               192, 192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 192, 192, 192, 192, 192,
+               192, 192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 192, 192, 192, 192, 192,
+               192, 192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 192, 192, 192, 192, 192,
+               192, 192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 192, 192, 192, 192, 192,
+               192, 192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 192, 192, 192, 192, 192,
+               192, 192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 192, 192, 192, 192, 192
+       };
+
+       enum chartypex_t // Bit flags; each chartypex_table entry is an OR of these, tested through PUGI__IS_CHARTYPEX.
+       {
+               ctx_special_pcdata = 1,   // Any symbol >= 0 and < 32 (except \t, \r, \n), &, <, >
+               ctx_special_attr = 2,     // Any symbol >= 0 and < 32 (except \t), &, <, >, "
+               ctx_start_symbol = 4,     // Any symbol > 127, a-z, A-Z, _
+               ctx_digit = 8,                    // 0-9
+               ctx_symbol = 16                   // Any symbol > 127, a-z, A-Z, 0-9, _, -, .
+       };
+       
+       static const unsigned char chartypex_table[256] = // chartypex_t flag set per character value (e.g. digits are 24 = 8|16)
+       {
+               3,  3,  3,  3,  3,  3,  3,  3,     3,  0,  2,  3,  3,  2,  3,  3,     // 0-15
+               3,  3,  3,  3,  3,  3,  3,  3,     3,  3,  3,  3,  3,  3,  3,  3,     // 16-31
+               0,  0,  2,  0,  0,  0,  3,  0,     0,  0,  0,  0,  0, 16, 16,  0,     // 32-47
+               24, 24, 24, 24, 24, 24, 24, 24,    24, 24, 0,  0,  3,  0,  3,  0,     // 48-63
+
+               0,  20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20,    // 64-79
+               20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 0,  0,  0,  0,  20,    // 80-95
+               0,  20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20,    // 96-111
+               20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 0,  0,  0,  0,  0,     // 112-127
+               // every value from 128 up carries ctx_start_symbol | ctx_symbol (4 + 16)
+               20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20,    // 128+
+               20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20,
+               20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20,
+               20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20,
+               20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20,
+               20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20,
+               20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20,
+               20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20
+       };
+       
+#ifdef PUGIXML_WCHAR_MODE // wide mode: values of 128 and above all share table entry 128
+       #define PUGI__IS_CHARTYPE_IMPL(c, ct, table) ((static_cast<unsigned int>(c) < 128 ? table[static_cast<unsigned int>(c)] : table[128]) & (ct))
+#else // narrow mode: index the table directly with the character converted to unsigned char
+       #define PUGI__IS_CHARTYPE_IMPL(c, ct, table) (table[static_cast<unsigned char>(c)] & (ct))
+#endif
+       // both wrappers yield the masked flag bits (non-zero when any bit of ct is set for c), not a normalised bool
+       #define PUGI__IS_CHARTYPE(c, ct) PUGI__IS_CHARTYPE_IMPL(c, ct, chartype_table)
+       #define PUGI__IS_CHARTYPEX(c, ct) PUGI__IS_CHARTYPE_IMPL(c, ct, chartypex_table)
+
+       PUGI__FN bool is_little_endian() // Returns true when the first byte in memory of an unsigned int holding 1 is 1.
+       {
+               unsigned int ui = 1;
+               // inspect the lowest-addressed byte of ui
+               return *reinterpret_cast<unsigned char*>(&ui) == 1;
+       }
+
+       PUGI__FN xml_encoding get_wchar_encoding() // Returns the UTF-16 or UTF-32 encoding, with the endianness reported by is_little_endian(), that matches sizeof(wchar_t).
+       {
+               PUGI__STATIC_ASSERT(sizeof(wchar_t) == 2 || sizeof(wchar_t) == 4);
+               // 2-byte wchar_t maps to UTF-16, anything else (i.e. 4) to UTF-32
+               if (sizeof(wchar_t) == 2)
+                       return is_little_endian() ? encoding_utf16_le : encoding_utf16_be;
+               else // sizeof(wchar_t) == 4, per the static assert above
+                       return is_little_endian() ? encoding_utf32_le : encoding_utf32_be;
+       }
+
+       PUGI__FN xml_encoding guess_buffer_encoding(uint8_t d0, uint8_t d1, uint8_t d2, uint8_t d3) // Guesses the encoding from the first four bytes of a buffer; the checks run in order and the first match wins.
+       {
+               // look for BOM in first few bytes
+               if (d0 == 0 && d1 == 0 && d2 == 0xfe && d3 == 0xff) return encoding_utf32_be;
+               if (d0 == 0xff && d1 == 0xfe && d2 == 0 && d3 == 0) return encoding_utf32_le; // tested before the UTF-16 LE BOM, which has the same first two bytes
+               if (d0 == 0xfe && d1 == 0xff) return encoding_utf16_be;
+               if (d0 == 0xff && d1 == 0xfe) return encoding_utf16_le;
+               if (d0 == 0xef && d1 == 0xbb && d2 == 0xbf) return encoding_utf8;
+
+               // look for <, <? or <?xm in various encodings
+               if (d0 == 0 && d1 == 0 && d2 == 0 && d3 == 0x3c) return encoding_utf32_be;
+               if (d0 == 0x3c && d1 == 0 && d2 == 0 && d3 == 0) return encoding_utf32_le;
+               if (d0 == 0 && d1 == 0x3c && d2 == 0 && d3 == 0x3f) return encoding_utf16_be;
+               if (d0 == 0x3c && d1 == 0 && d2 == 0x3f && d3 == 0) return encoding_utf16_le;
+               if (d0 == 0x3c && d1 == 0x3f && d2 == 0x78 && d3 == 0x6d) return encoding_utf8;
+
+               // look for utf16 < followed by node name (this may fail, but is better than utf8 since it's zero terminated so early)
+               if (d0 == 0 && d1 == 0x3c) return encoding_utf16_be;
+               if (d0 == 0x3c && d1 == 0) return encoding_utf16_le;
+
+               // no known BOM detected, assume utf8
+               return encoding_utf8;
+       }
+
+       PUGI__FN xml_encoding get_buffer_encoding(xml_encoding encoding, const void* contents, size_t size) // Resolves the requested encoding to a concrete one; contents is only read when encoding is encoding_auto and size >= 4.
+       {
+               // replace wchar encoding with utf implementation
+               if (encoding == encoding_wchar) return get_wchar_encoding();
+
+               // replace utf16 encoding with utf16 with specific endianness
+               if (encoding == encoding_utf16) return is_little_endian() ? encoding_utf16_le : encoding_utf16_be;
+
+               // replace utf32 encoding with utf32 with specific endianness
+               if (encoding == encoding_utf32) return is_little_endian() ? encoding_utf32_le : encoding_utf32_be;
+
+               // only do autodetection if no explicit encoding is requested
+               if (encoding != encoding_auto) return encoding;
+
+               // skip encoding autodetection if input buffer is too small
+               if (size < 4) return encoding_utf8;
+
+               // try to guess encoding (based on XML specification, Appendix F.1)
+               const uint8_t* data = static_cast<const uint8_t*>(contents);
+
+               PUGI__DMC_VOLATILE uint8_t d0 = data[0], d1 = data[1], d2 = data[2], d3 = data[3]; // NOTE(review): PUGI__DMC_VOLATILE is defined elsewhere; presumably a compiler workaround - confirm
+
+               return guess_buffer_encoding(d0, d1, d2, d3);
+       }
+
+       PUGI__FN bool get_mutable_buffer(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, bool is_mutable) // Yields a writable char_t view of contents: the input itself when is_mutable, otherwise a zero-terminated copy; returns false only if allocation fails.
+       {
+               size_t length = size / sizeof(char_t); // whole char_t units; any trailing partial unit is dropped by the division
+
+               if (is_mutable)
+               {
+                       out_buffer = static_cast<char_t*>(const_cast<void*>(contents)); // no copy and no terminator is written in this branch
+                       out_length = length;
+               }
+               else
+               {
+                       char_t* buffer = static_cast<char_t*>(xml_memory::allocate((length + 1) * sizeof(char_t)));
+                       if (!buffer) return false; // out_buffer and out_length are left unmodified on failure
+
+                       if (contents)
+                               memcpy(buffer, contents, length * sizeof(char_t));
+                       else
+                               assert(length == 0); // a null input is only accepted when there is nothing to copy
+
+                       buffer[length] = 0;
+
+                       out_buffer = buffer;
+                       out_length = length + 1; // unlike the mutable branch, the reported length includes the terminator
+               }
+
+               return true;
+       }
+
+#ifdef PUGIXML_WCHAR_MODE
+       PUGI__FN bool need_endian_swap_utf(xml_encoding le, xml_encoding re) // True when le and re are the two endianness variants of the same UTF-16 or UTF-32 encoding.
+       {
+               return (le == encoding_utf16_be && re == encoding_utf16_le) || (le == encoding_utf16_le && re == encoding_utf16_be) ||
+                          (le == encoding_utf32_be && re == encoding_utf32_le) || (le == encoding_utf32_le && re == encoding_utf32_be);
+       }
+
+       PUGI__FN bool convert_buffer_endian_swap(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, bool is_mutable) // Byte-swaps every char_t of contents, in place when is_mutable, otherwise into a new zero-terminated buffer; returns false only if allocation fails.
+       {
+               const char_t* data = static_cast<const char_t*>(contents);
+               size_t length = size / sizeof(char_t); // whole char_t units; any trailing partial unit is dropped by the division
+
+               if (is_mutable)
+               {
+                       char_t* buffer = const_cast<char_t*>(data);
+
+                       convert_wchar_endian_swap(buffer, data, length); // source and destination are the same memory here
+
+                       out_buffer = buffer;
+                       out_length = length; // no terminator is written or counted in this branch
+               }
+               else
+               {
+                       char_t* buffer = static_cast<char_t*>(xml_memory::allocate((length + 1) * sizeof(char_t)));
+                       if (!buffer) return false; // out_buffer and out_length are left unmodified on failure
+
+                       convert_wchar_endian_swap(buffer, data, length);
+                       buffer[length] = 0;
+
+                       out_buffer = buffer;
+                       out_length = length + 1; // the reported length includes the terminator
+               }
+
+               return true;
+       }
+
+       PUGI__FN bool convert_buffer_utf8(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size) // Converts size bytes of UTF-8 into a newly allocated, zero-terminated char_t buffer; returns false only if allocation fails.
+       {
+               const uint8_t* data = static_cast<const uint8_t*>(contents);
+               size_t data_length = size;
+
+               // first pass: get length in wchar_t units
+               size_t length = utf_decoder<wchar_counter>::decode_utf8_block(data, data_length, 0);
+
+               // allocate buffer of suitable length
+               char_t* buffer = static_cast<char_t*>(xml_memory::allocate((length + 1) * sizeof(char_t)));
+               if (!buffer) return false; // out_buffer and out_length are left unmodified on failure
+
+               // second pass: convert utf8 input to wchar_t
+               wchar_writer::value_type obegin = reinterpret_cast<wchar_writer::value_type>(buffer);
+               wchar_writer::value_type oend = utf_decoder<wchar_writer>::decode_utf8_block(data, data_length, obegin);
+
+               assert(oend == obegin + length); // the writing pass must produce exactly what the counting pass predicted
+               *oend = 0;
+
+               out_buffer = buffer;
+               out_length = length + 1; // the reported length includes the terminator
+
+               return true;
+       }
+
+       template <typename opt_swap> PUGI__FN bool convert_buffer_utf16(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, opt_swap) // Converts UTF-16 input (byte-swapped when opt_swap::value is set) into a newly allocated, zero-terminated char_t buffer; returns false only if allocation fails.
+       {
+               const uint16_t* data = static_cast<const uint16_t*>(contents);
+               size_t data_length = size / sizeof(uint16_t); // size is in bytes; a trailing odd byte is dropped by the division
+
+               // first pass: get length in wchar_t units
+               size_t length = utf_decoder<wchar_counter, opt_swap>::decode_utf16_block(data, data_length, 0);
+
+               // allocate buffer of suitable length
+               char_t* buffer = static_cast<char_t*>(xml_memory::allocate((length + 1) * sizeof(char_t)));
+               if (!buffer) return false; // out_buffer and out_length are left unmodified on failure
+
+               // second pass: convert utf16 input to wchar_t
+               wchar_writer::value_type obegin = reinterpret_cast<wchar_writer::value_type>(buffer);
+               wchar_writer::value_type oend = utf_decoder<wchar_writer, opt_swap>::decode_utf16_block(data, data_length, obegin);
+
+               assert(oend == obegin + length); // the writing pass must produce exactly what the counting pass predicted
+               *oend = 0;
+
+               out_buffer = buffer;
+               out_length = length + 1; // the reported length includes the terminator
+
+               return true;
+       }
+
+       template <typename opt_swap> PUGI__FN bool convert_buffer_utf32(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, opt_swap) // Converts UTF-32 input (byte-swapped when opt_swap::value is set) into a newly allocated, zero-terminated char_t buffer; returns false only if allocation fails.
+       {
+               const uint32_t* data = static_cast<const uint32_t*>(contents);
+               size_t data_length = size / sizeof(uint32_t); // size is in bytes; trailing bytes that do not fill a unit are dropped by the division
+
+               // first pass: get length in wchar_t units
+               size_t length = utf_decoder<wchar_counter, opt_swap>::decode_utf32_block(data, data_length, 0);
+
+               // allocate buffer of suitable length
+               char_t* buffer = static_cast<char_t*>(xml_memory::allocate((length + 1) * sizeof(char_t)));
+               if (!buffer) return false; // out_buffer and out_length are left unmodified on failure
+
+               // second pass: convert utf32 input to wchar_t
+               wchar_writer::value_type obegin = reinterpret_cast<wchar_writer::value_type>(buffer);
+               wchar_writer::value_type oend = utf_decoder<wchar_writer, opt_swap>::decode_utf32_block(data, data_length, obegin);
+
+               assert(oend == obegin + length); // the writing pass must produce exactly what the counting pass predicted
+               *oend = 0;
+
+               out_buffer = buffer;
+               out_length = length + 1; // the reported length includes the terminator
+
+               return true;
+       }
+
+       PUGI__FN bool convert_buffer_latin1(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size) // Converts size bytes of Latin-1 into a newly allocated, zero-terminated char_t buffer; returns false only if allocation fails.
+       {
+               const uint8_t* data = static_cast<const uint8_t*>(contents);
+               size_t data_length = size;
+
+               // get length in wchar_t units
+               size_t length = data_length; // taken directly from the byte count, so no counting pass is run
+
+               // allocate buffer of suitable length
+               char_t* buffer = static_cast<char_t*>(xml_memory::allocate((length + 1) * sizeof(char_t)));
+               if (!buffer) return false; // out_buffer and out_length are left unmodified on failure
+
+               // convert latin1 input to wchar_t
+               wchar_writer::value_type obegin = reinterpret_cast<wchar_writer::value_type>(buffer);
+               wchar_writer::value_type oend = utf_decoder<wchar_writer>::decode_latin1_block(data, data_length, obegin);
+
+               assert(oend == obegin + length); // one output unit is expected per input byte
+               *oend = 0;
+
+               out_buffer = buffer;
+               out_length = length + 1; // the reported length includes the terminator
+
+               return true;
+       }
+
+       PUGI__FN bool convert_buffer(char_t*& out_buffer, size_t& out_length, xml_encoding encoding, const void* contents, size_t size, bool is_mutable)
+       {
+               // get native encoding
+               xml_encoding wchar_encoding = get_wchar_encoding();
+
+               // fast path: no conversion required
+               if (encoding == wchar_encoding) return get_mutable_buffer(out_buffer, out_length, contents, size, is_mutable);
+
+               // only endian-swapping is required
+               if (need_endian_swap_utf(encoding, wchar_encoding)) return convert_buffer_endian_swap(out_buffer, out_length, contents, size, is_mutable);
+
+               // source encoding is utf8
+               if (encoding == encoding_utf8) return convert_buffer_utf8(out_buffer, out_length, contents, size);
+
+               // source encoding is utf16
+               if (encoding == encoding_utf16_be || encoding == encoding_utf16_le)
+               {
+                       xml_encoding native_encoding = is_little_endian() ? encoding_utf16_le : encoding_utf16_be;
+
+                       return (native_encoding == encoding) ?
+                               convert_buffer_utf16(out_buffer, out_length, contents, size, opt_false()) :
+                               convert_buffer_utf16(out_buffer, out_length, contents, size, opt_true());
+               }
+
+               // source encoding is utf32
+               if (encoding == encoding_utf32_be || encoding == encoding_utf32_le)
+               {
+                       xml_encoding native_encoding = is_little_endian() ? encoding_utf32_le : encoding_utf32_be;
+
+                       return (native_encoding == encoding) ?
+                               convert_buffer_utf32(out_buffer, out_length, contents, size, opt_false()) :
+                               convert_buffer_utf32(out_buffer, out_length, contents, size, opt_true());
+               }
+
+               // source encoding is latin1
+               if (encoding == encoding_latin1) return convert_buffer_latin1(out_buffer, out_length, contents, size);
+
+               assert(!"Invalid encoding");
+               return false;
+       }
+#else
+       template <typename opt_swap> PUGI__FN bool convert_buffer_utf16(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, opt_swap)
+       {
+               const uint16_t* data = static_cast<const uint16_t*>(contents);
+               size_t data_length = size / sizeof(uint16_t);
+
+               // first pass: get length in utf8 units
+               size_t length = utf_decoder<utf8_counter, opt_swap>::decode_utf16_block(data, data_length, 0);
+
+               // allocate buffer of suitable length
+               char_t* buffer = static_cast<char_t*>(xml_memory::allocate((length + 1) * sizeof(char_t)));
+               if (!buffer) return false;
+
+               // second pass: convert utf16 input to utf8
+               uint8_t* obegin = reinterpret_cast<uint8_t*>(buffer);
+               uint8_t* oend = utf_decoder<utf8_writer, opt_swap>::decode_utf16_block(data, data_length, obegin);
+
+               assert(oend == obegin + length);
+               *oend = 0;
+
+               out_buffer = buffer;
+               out_length = length + 1;
+
+               return true;
+       }
+
+       template <typename opt_swap> PUGI__FN bool convert_buffer_utf32(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, opt_swap)
+       {
+               const uint32_t* data = static_cast<const uint32_t*>(contents);
+               size_t data_length = size / sizeof(uint32_t);
+
+               // first pass: get length in utf8 units
+               size_t length = utf_decoder<utf8_counter, opt_swap>::decode_utf32_block(data, data_length, 0);
+
+               // allocate buffer of suitable length
+               char_t* buffer = static_cast<char_t*>(xml_memory::allocate((length + 1) * sizeof(char_t)));
+               if (!buffer) return false;
+
+               // second pass: convert utf32 input to utf8
+               uint8_t* obegin = reinterpret_cast<uint8_t*>(buffer);
+               uint8_t* oend = utf_decoder<utf8_writer, opt_swap>::decode_utf32_block(data, data_length, obegin);
+
+               assert(oend == obegin + length);
+               *oend = 0;
+
+               out_buffer = buffer;
+               out_length = length + 1;
+
+               return true;
+       }
+
+       PUGI__FN size_t get_latin1_7bit_prefix_length(const uint8_t* data, size_t size)
+       {
+               for (size_t i = 0; i < size; ++i)
+                       if (data[i] > 127)
+                               return i;
+
+               return size;
+       }
+
+       PUGI__FN bool convert_buffer_latin1(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, bool is_mutable)
+       {
+               const uint8_t* data = static_cast<const uint8_t*>(contents);
+               size_t data_length = size;
+
+               // get size of prefix that does not need utf8 conversion
+               size_t prefix_length = get_latin1_7bit_prefix_length(data, data_length);
+               assert(prefix_length <= data_length);
+
+               const uint8_t* postfix = data + prefix_length;
+               size_t postfix_length = data_length - prefix_length;
+
+               // if no conversion is needed, just return the original buffer
+               if (postfix_length == 0) return get_mutable_buffer(out_buffer, out_length, contents, size, is_mutable);
+
+               // first pass: get length in utf8 units
+               size_t length = prefix_length + utf_decoder<utf8_counter>::decode_latin1_block(postfix, postfix_length, 0);
+
+               // allocate buffer of suitable length
+               char_t* buffer = static_cast<char_t*>(xml_memory::allocate((length + 1) * sizeof(char_t)));
+               if (!buffer) return false;
+
+               // second pass: convert latin1 input to utf8
+               memcpy(buffer, data, prefix_length);
+
+               uint8_t* obegin = reinterpret_cast<uint8_t*>(buffer);
+               uint8_t* oend = utf_decoder<utf8_writer>::decode_latin1_block(postfix, postfix_length, obegin + prefix_length);
+
+               assert(oend == obegin + length);
+               *oend = 0;
+
+               out_buffer = buffer;
+               out_length = length + 1;
+
+               return true;
+       }
+
+       PUGI__FN bool convert_buffer(char_t*& out_buffer, size_t& out_length, xml_encoding encoding, const void* contents, size_t size, bool is_mutable)
+       {
+               // fast path: no conversion required
+               if (encoding == encoding_utf8) return get_mutable_buffer(out_buffer, out_length, contents, size, is_mutable);
+
+               // source encoding is utf16
+               if (encoding == encoding_utf16_be || encoding == encoding_utf16_le)
+               {
+                       xml_encoding native_encoding = is_little_endian() ? encoding_utf16_le : encoding_utf16_be;
+
+                       return (native_encoding == encoding) ?
+                               convert_buffer_utf16(out_buffer, out_length, contents, size, opt_false()) :
+                               convert_buffer_utf16(out_buffer, out_length, contents, size, opt_true());
+               }
+
+               // source encoding is utf32
+               if (encoding == encoding_utf32_be || encoding == encoding_utf32_le)
+               {
+                       xml_encoding native_encoding = is_little_endian() ? encoding_utf32_le : encoding_utf32_be;
+
+                       return (native_encoding == encoding) ?
+                               convert_buffer_utf32(out_buffer, out_length, contents, size, opt_false()) :
+                               convert_buffer_utf32(out_buffer, out_length, contents, size, opt_true());
+               }
+
+               // source encoding is latin1
+               if (encoding == encoding_latin1) return convert_buffer_latin1(out_buffer, out_length, contents, size, is_mutable);
+
+               assert(!"Invalid encoding");
+               return false;
+       }
+#endif
+
+       // First half of the wchar_t -> utf8 conversion: runs the counting decoder over the
+       // first `length` units of str and returns the resulting utf8 length.
+       PUGI__FN size_t as_utf8_begin(const wchar_t* str, size_t length)
+       {
+               const size_t utf8_units = utf_decoder<utf8_counter>::decode_wchar_block(str, length, 0);
+
+               return utf8_units;
+       }
+
+       PUGI__FN void as_utf8_end(char* buffer, size_t size, const wchar_t* str, size_t length)
+       {
+               // convert to utf8
+               uint8_t* begin = reinterpret_cast<uint8_t*>(buffer);
+               uint8_t* end = utf_decoder<utf8_writer>::decode_wchar_block(str, length, begin);
+       
+               assert(begin + size == end);
+               (void)!end;
+
+               // zero-terminate
+               buffer[size] = 0;
+       }
+       
+#ifndef PUGIXML_NO_STL
+       PUGI__FN std::string as_utf8_impl(const wchar_t* str, size_t length)
+       {
+               // first pass: get length in utf8 characters
+               size_t size = as_utf8_begin(str, length);
+
+               // allocate resulting string
+               std::string result;
+               result.resize(size);
+
+               // second pass: convert to utf8
+               if (size > 0) as_utf8_end(&result[0], size, str, length);
+
+               return result;
+       }
+
+       PUGI__FN std::basic_string<wchar_t> as_wide_impl(const char* str, size_t size)
+       {
+               const uint8_t* data = reinterpret_cast<const uint8_t*>(str);
+
+               // first pass: get length in wchar_t units
+               size_t length = utf_decoder<wchar_counter>::decode_utf8_block(data, size, 0);
+
+               // allocate resulting string
+               std::basic_string<wchar_t> result;
+               result.resize(length);
+
+               // second pass: convert to wchar_t
+               if (length > 0)
+               {
+                       wchar_writer::value_type begin = reinterpret_cast<wchar_writer::value_type>(&result[0]);
+                       wchar_writer::value_type end = utf_decoder<wchar_writer>::decode_utf8_block(data, size, begin);
+
+                       assert(begin + length == end);
+                       (void)!end;
+               }
+
+               return result;
+       }
+#endif
+
+       inline bool strcpy_insitu_allow(size_t length, uintptr_t header, uintptr_t header_mask, char_t* target)
+       {
+               // never reuse shared memory
+               if (header & xml_memory_page_contents_shared_mask) return false;
+
+               size_t target_length = strlength(target);
+
+               // always reuse document buffer memory if possible
+               if ((header & header_mask) == 0) return target_length >= length;
+
+               // reuse heap memory if waste is not too great
+               const size_t reuse_threshold = 32;
+
+               return target_length >= length && (target_length < reuse_threshold || target_length - length < target_length / 2);
+       }
+
+       PUGI__FN bool strcpy_insitu(char_t*& dest, uintptr_t& header, uintptr_t header_mask, const char_t* source)
+       {
+               assert(header);
+
+               size_t source_length = strlength(source);
+
+               if (source_length == 0)
+               {
+                       // empty string and null pointer are equivalent, so just deallocate old memory
+                       xml_allocator* alloc = reinterpret_cast<xml_memory_page*>(header & xml_memory_page_pointer_mask)->allocator;
+
+                       if (header & header_mask) alloc->deallocate_string(dest);
+                       
+                       // mark the string as not allocated
+                       dest = 0;
+                       header &= ~header_mask;
+
+                       return true;
+               }
+               else if (dest && strcpy_insitu_allow(source_length, header, header_mask, dest))
+               {
+                       // we can reuse old buffer, so just copy the new data (including zero terminator)
+                       memcpy(dest, source, (source_length + 1) * sizeof(char_t));
+                       
+                       return true;
+               }
+               else
+               {
+                       xml_allocator* alloc = reinterpret_cast<xml_memory_page*>(header & xml_memory_page_pointer_mask)->allocator;
+
+                       // allocate new buffer
+                       char_t* buf = alloc->allocate_string(source_length + 1);
+                       if (!buf) return false;
+
+                       // copy the string (including zero terminator)
+                       memcpy(buf, source, (source_length + 1) * sizeof(char_t));
+
+                       // deallocate old buffer (*after* the above to protect against overlapping memory and/or allocation failures)
+                       if (header & header_mask) alloc->deallocate_string(dest);
+                       
+                       // the string is now allocated, so set the flag
+                       dest = buf;
+                       header |= header_mask;
+
+                       return true;
+               }
+       }
+
+       struct gap
+       {
+               char_t* end;
+               size_t size;
+                       
+               gap(): end(0), size(0)
+               {
+               }
+                       
+               // Push new gap, move s count bytes further (skipping the gap).
+               // Collapse previous gap.
+               void push(char_t*& s, size_t count)
+               {
+                       if (end) // there was a gap already; collapse it
+                       {
+                               // Move [old_gap_end, new_gap_start) to [old_gap_start, ...)
+                               assert(s >= end);
+                               memmove(end - size, end, reinterpret_cast<char*>(s) - reinterpret_cast<char*>(end));
+                       }
+                               
+                       s += count; // end of current gap
+                               
+                       // "merge" two gaps
+                       end = s;
+                       size += count;
+               }
+                       
+               // Collapse all gaps, return past-the-end pointer
+               char_t* flush(char_t* s)
+               {
+                       if (end)
+                       {
+                               // Move [old_gap_end, current_pos) to [old_gap_start, ...)
+                               assert(s >= end);
+                               memmove(end - size, end, reinterpret_cast<char*>(s) - reinterpret_cast<char*>(end));
+
+                               return s - size;
+                       }
+                       else return s;
+               }
+       };
+       
+       PUGI__FN char_t* strconv_escape(char_t* s, gap& g) // Decodes the character/entity reference starting at s (callers in this file invoke it when *s == '&')
+       { // returns the position to resume scanning from; a reference that is not recognized is left untouched
+               char_t* stre = s + 1; // read cursor; s itself is where a decoded character gets written
+
+               switch (*stre)
+               {
+                       case '#':       // &#... (numeric character reference)
+                       {
+                               unsigned int ucsc = 0; // code point accumulated from the digits
+
+                               if (stre[1] == 'x') // &#x... (hex code)
+                               {
+                                       stre += 2;
+
+                                       char_t ch = *stre;
+
+                                       if (ch == ';') return stre; // "&#x;" carries no digits: nothing to decode
+
+                                       for (;;)
+                                       {
+                                               if (static_cast<unsigned int>(ch - '0') <= 9)
+                                                       ucsc = 16 * ucsc + (ch - '0');
+                                               else if (static_cast<unsigned int>((ch | ' ') - 'a') <= 5) // (ch | ' ') folds 'A'-'F' onto 'a'-'f'
+                                                       ucsc = 16 * ucsc + ((ch | ' ') - 'a' + 10);
+                                               else if (ch == ';')
+                                                       break;
+                                               else // cancel: not a hex digit, leave the text as it is
+                                                       return stre;
+
+                                               ch = *++stre;
+                                       }
+                                       
+                                       ++stre; // step past ';'
+                               }
+                               else    // &#... (dec code)
+                               {
+                                       char_t ch = *++stre;
+
+                                       if (ch == ';') return stre; // "&#;" carries no digits: nothing to decode
+
+                                       for (;;)
+                                       {
+                                               if (static_cast<unsigned int>(static_cast<unsigned int>(ch) - '0') <= 9)
+                                                       ucsc = 10 * ucsc + (ch - '0');
+                                               else if (ch == ';')
+                                                       break;
+                                               else // cancel: not a decimal digit, leave the text as it is
+                                                       return stre;
+
+                                               ch = *++stre;
+                                       }
+                                       
+                                       ++stre; // step past ';'
+                               }
+
+                       #ifdef PUGIXML_WCHAR_MODE
+                               s = reinterpret_cast<char_t*>(wchar_writer::any(reinterpret_cast<wchar_writer::value_type>(s), ucsc)); // write ucsc at s; the writer's return value becomes the new s
+                       #else
+                               s = reinterpret_cast<char_t*>(utf8_writer::any(reinterpret_cast<uint8_t*>(s), ucsc)); // write ucsc at s; the writer's return value becomes the new s
+                       #endif
+                                       
+                               g.push(s, stre - s); // what is left of the reference text, up to stre, becomes a gap
+                               return stre;
+                       }
+
+                       case 'a':       // &a (candidate for &amp; or &apos;)
+                       {
+                               ++stre;
+
+                               if (*stre == 'm') // &am
+                               {
+                                       if (*++stre == 'p' && *++stre == ';') // &amp;
+                                       {
+                                               *s++ = '&';
+                                               ++stre; // step past ';'
+                                                       
+                                               g.push(s, stre - s); // "amp;" becomes a gap
+                                               return stre;
+                                       }
+                               }
+                               else if (*stre == 'p') // &ap
+                               {
+                                       if (*++stre == 'o' && *++stre == 's' && *++stre == ';') // &apos;
+                                       {
+                                               *s++ = '\'';
+                                               ++stre; // step past ';'
+
+                                               g.push(s, stre - s); // the rest of the entity text becomes a gap
+                                               return stre;
+                                       }
+                               }
+                               break;
+                       }
+
+                       case 'g': // &g (candidate for &gt;)
+                       {
+                               if (*++stre == 't' && *++stre == ';') // &gt;
+                               {
+                                       *s++ = '>';
+                                       ++stre; // step past ';'
+                                       
+                                       g.push(s, stre - s); // the rest of the entity text becomes a gap
+                                       return stre;
+                               }
+                               break;
+                       }
+
+                       case 'l': // &l (candidate for &lt;)
+                       {
+                               if (*++stre == 't' && *++stre == ';') // &lt;
+                               {
+                                       *s++ = '<';
+                                       ++stre; // step past ';'
+                                               
+                                       g.push(s, stre - s); // the rest of the entity text becomes a gap
+                                       return stre;
+                               }
+                               break;
+                       }
+
+                       case 'q': // &q (candidate for &quot;)
+                       {
+                               if (*++stre == 'u' && *++stre == 'o' && *++stre == 't' && *++stre == ';') // &quot;
+                               {
+                                       *s++ = '"';
+                                       ++stre; // step past ';'
+                                       
+                                       g.push(s, stre - s); // the rest of the entity text becomes a gap
+                                       return stre;
+                               }
+                               break;
+                       }
+
+                       default: // not a reference this function knows
+                               break;
+               }
+               
+               return stre; // no match: resume where the comparison stopped, text unchanged
+       }
+
+       // Parser utilities: helper macros that rely on locals/members named s, endch, optmsk, cursor, alloc, ch, error_offset and error_status being in scope where they are used
+       #define PUGI__ENDSWITH(c, e)        ((c) == (e) || ((c) == 0 && endch == (e))) // true if c is e, or c is the zero terminator and endch is e
+       #define PUGI__SKIPWS()              { while (PUGI__IS_CHARTYPE(*s, ct_space)) ++s; } // advance s past characters classified as ct_space
+       #define PUGI__OPTSET(OPT)           ( optmsk & (OPT) ) // non-zero when OPT is set in optmsk
+       #define PUGI__PUSHNODE(TYPE)        { cursor = append_new_node(cursor, alloc, TYPE); if (!cursor) PUGI__THROW_ERROR(status_out_of_memory, s); } // cursor becomes the node returned by append_new_node; a null result is reported as status_out_of_memory
+       #define PUGI__POPNODE()             { cursor = cursor->parent; } // move cursor up to its parent
+       #define PUGI__SCANFOR(X)            { while (*s != 0 && !(X)) ++s; } // advance s until X holds or the zero terminator is reached
+       #define PUGI__SCANWHILE(X)          { while (X) ++s; } // advance s while X holds
+       #define PUGI__SCANWHILE_UNROLL(X)   { for (;;) { char_t ss = s[0]; if (PUGI__UNLIKELY(!(X))) { break; } ss = s[1]; if (PUGI__UNLIKELY(!(X))) { s += 1; break; } ss = s[2]; if (PUGI__UNLIKELY(!(X))) { s += 2; break; } ss = s[3]; if (PUGI__UNLIKELY(!(X))) { s += 3; break; } s += 4; } } // like PUGI__SCANWHILE but tests four characters per iteration; X must test the local ss rather than *s
+       #define PUGI__ENDSEG()              { ch = *s; *s = 0; ++s; } // save *s in ch, zero-terminate the segment at s, then step past it
+       #define PUGI__THROW_ERROR(err, m)   return error_offset = m, error_status = err, static_cast<char_t*>(0) // record offset m and status err, then return a null char_t* from the enclosing function
+       #define PUGI__CHECK_ERROR(err, m)   { if (*s == 0) PUGI__THROW_ERROR(err, m); } // report err if s is at the zero terminator
+
+       PUGI__FN char_t* strconv_comment(char_t* s, char_t endch)
+       {
+               gap g;
+               
+               while (true)
+               {
+                       PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPE(ss, ct_parse_comment));
+               
+                       if (*s == '\r') // Either a single 0x0d or 0x0d 0x0a pair
+                       {
+                               *s++ = '\n'; // replace first one with 0x0a
+                               
+                               if (*s == '\n') g.push(s, 1);
+                       }
+                       else if (s[0] == '-' && s[1] == '-' && PUGI__ENDSWITH(s[2], '>')) // comment ends here
+                       {
+                               *g.flush(s) = 0;
+                               
+                               return s + (s[2] == '>' ? 3 : 2);
+                       }
+                       else if (*s == 0)
+                       {
+                               return 0;
+                       }
+                       else ++s;
+               }
+       }
+
+       PUGI__FN char_t* strconv_cdata(char_t* s, char_t endch)
+       {
+               gap g;
+                       
+               while (true)
+               {
+                       PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPE(ss, ct_parse_cdata));
+                       
+                       if (*s == '\r') // Either a single 0x0d or 0x0d 0x0a pair
+                       {
+                               *s++ = '\n'; // replace first one with 0x0a
+                               
+                               if (*s == '\n') g.push(s, 1);
+                       }
+                       else if (s[0] == ']' && s[1] == ']' && PUGI__ENDSWITH(s[2], '>')) // CDATA ends here
+                       {
+                               *g.flush(s) = 0;
+                               
+                               return s + 1;
+                       }
+                       else if (*s == 0)
+                       {
+                               return 0;
+                       }
+                       else ++s;
+               }
+       }
+       
+       typedef char_t* (*strconv_pcdata_t)(char_t*);
+               
+       template <typename opt_trim, typename opt_eol, typename opt_escape> struct strconv_pcdata_impl
+       {
+               static char_t* parse(char_t* s)
+               {
+                       gap g;
+
+                       char_t* begin = s;
+
+                       while (true)
+                       {
+                               PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPE(ss, ct_parse_pcdata));
+
+                               if (*s == '<') // PCDATA ends here
+                               {
+                                       char_t* end = g.flush(s);
+
+                                       if (opt_trim::value)
+                                               while (end > begin && PUGI__IS_CHARTYPE(end[-1], ct_space))
+                                                       --end;
+
+                                       *end = 0;
+                                       
+                                       return s + 1;
+                               }
+                               else if (opt_eol::value && *s == '\r') // Either a single 0x0d or 0x0d 0x0a pair
+                               {
+                                       *s++ = '\n'; // replace first one with 0x0a
+                                       
+                                       if (*s == '\n') g.push(s, 1);
+                               }
+                               else if (opt_escape::value && *s == '&')
+                               {
+                                       s = strconv_escape(s, g);
+                               }
+                               else if (*s == 0)
+                               {
+                                       char_t* end = g.flush(s);
+
+                                       if (opt_trim::value)
+                                               while (end > begin && PUGI__IS_CHARTYPE(end[-1], ct_space))
+                                                       --end;
+
+                                       *end = 0;
+
+                                       return s;
+                               }
+                               else ++s;
+                       }
+               }
+       };
+       
+       PUGI__FN strconv_pcdata_t get_strconv_pcdata(unsigned int optmask)
+       {
+               PUGI__STATIC_ASSERT(parse_escapes == 0x10 && parse_eol == 0x20 && parse_trim_pcdata == 0x0800);
+
+               switch (((optmask >> 4) & 3) | ((optmask >> 9) & 4)) // get bitmask for flags (eol escapes trim)
+               {
+               case 0: return strconv_pcdata_impl<opt_false, opt_false, opt_false>::parse;
+               case 1: return strconv_pcdata_impl<opt_false, opt_false, opt_true>::parse;
+               case 2: return strconv_pcdata_impl<opt_false, opt_true, opt_false>::parse;
+               case 3: return strconv_pcdata_impl<opt_false, opt_true, opt_true>::parse;
+               case 4: return strconv_pcdata_impl<opt_true, opt_false, opt_false>::parse;
+               case 5: return strconv_pcdata_impl<opt_true, opt_false, opt_true>::parse;
+               case 6: return strconv_pcdata_impl<opt_true, opt_true, opt_false>::parse;
+               case 7: return strconv_pcdata_impl<opt_true, opt_true, opt_true>::parse;
+               default: assert(false); return 0; // should not get here
+               }
+       }
+
+       typedef char_t* (*strconv_attribute_t)(char_t*, char_t);
+       
+       template <typename opt_escape> struct strconv_attribute_impl
+       {
+               static char_t* parse_wnorm(char_t* s, char_t end_quote)
+               {
+                       gap g;
+
+                       // trim leading whitespaces
+                       if (PUGI__IS_CHARTYPE(*s, ct_space))
+                       {
+                               char_t* str = s;
+                               
+                               do ++str;
+                               while (PUGI__IS_CHARTYPE(*str, ct_space));
+                               
+                               g.push(s, str - s);
+                       }
+
+                       while (true)
+                       {
+                               PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPE(ss, ct_parse_attr_ws | ct_space));
+                               
+                               if (*s == end_quote)
+                               {
+                                       char_t* str = g.flush(s);
+                                       
+                                       do *str-- = 0;
+                                       while (PUGI__IS_CHARTYPE(*str, ct_space));
+                               
+                                       return s + 1;
+                               }
+                               else if (PUGI__IS_CHARTYPE(*s, ct_space))
+                               {
+                                       *s++ = ' ';
+               
+                                       if (PUGI__IS_CHARTYPE(*s, ct_space))
+                                       {
+                                               char_t* str = s + 1;
+                                               while (PUGI__IS_CHARTYPE(*str, ct_space)) ++str;
+                                               
+                                               g.push(s, str - s);
+                                       }
+                               }
+                               else if (opt_escape::value && *s == '&')
+                               {
+                                       s = strconv_escape(s, g);
+                               }
+                               else if (!*s)
+                               {
+                                       return 0;
+                               }
+                               else ++s;
+                       }
+               }
+
+               static char_t* parse_wconv(char_t* s, char_t end_quote)
+               {
+                       gap g;
+
+                       while (true)
+                       {
+                               PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPE(ss, ct_parse_attr_ws));
+                               
+                               if (*s == end_quote)
+                               {
+                                       *g.flush(s) = 0;
+                               
+                                       return s + 1;
+                               }
+                               else if (PUGI__IS_CHARTYPE(*s, ct_space))
+                               {
+                                       if (*s == '\r')
+                                       {
+                                               *s++ = ' ';
+                               
+                                               if (*s == '\n') g.push(s, 1);
+                                       }
+                                       else *s++ = ' ';
+                               }
+                               else if (opt_escape::value && *s == '&')
+                               {
+                                       s = strconv_escape(s, g);
+                               }
+                               else if (!*s)
+                               {
+                                       return 0;
+                               }
+                               else ++s;
+                       }
+               }
+
+               static char_t* parse_eol(char_t* s, char_t end_quote)
+               {
+                       gap g;
+
+                       while (true)
+                       {
+                               PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPE(ss, ct_parse_attr));
+                               
+                               if (*s == end_quote)
+                               {
+                                       *g.flush(s) = 0;
+                               
+                                       return s + 1;
+                               }
+                               else if (*s == '\r')
+                               {
+                                       *s++ = '\n';
+                                       
+                                       if (*s == '\n') g.push(s, 1);
+                               }
+                               else if (opt_escape::value && *s == '&')
+                               {
+                                       s = strconv_escape(s, g);
+                               }
+                               else if (!*s)
+                               {
+                                       return 0;
+                               }
+                               else ++s;
+                       }
+               }
+
+               static char_t* parse_simple(char_t* s, char_t end_quote)
+               {
+                       gap g;
+
+                       while (true)
+                       {
+                               PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPE(ss, ct_parse_attr));
+                               
+                               if (*s == end_quote)
+                               {
+                                       *g.flush(s) = 0;
+                               
+                                       return s + 1;
+                               }
+                               else if (opt_escape::value && *s == '&')
+                               {
+                                       s = strconv_escape(s, g);
+                               }
+                               else if (!*s)
+                               {
+                                       return 0;
+                               }
+                               else ++s;
+                       }
+               }
+       };
+
+       PUGI__FN strconv_attribute_t get_strconv_attribute(unsigned int optmask)
+       {
+               PUGI__STATIC_ASSERT(parse_escapes == 0x10 && parse_eol == 0x20 && parse_wconv_attribute == 0x40 && parse_wnorm_attribute == 0x80);
+               
+               switch ((optmask >> 4) & 15) // get bitmask for flags (wconv wnorm eol escapes)
+               {
+               case 0:  return strconv_attribute_impl<opt_false>::parse_simple;
+               case 1:  return strconv_attribute_impl<opt_true>::parse_simple;
+               case 2:  return strconv_attribute_impl<opt_false>::parse_eol;
+               case 3:  return strconv_attribute_impl<opt_true>::parse_eol;
+               case 4:  return strconv_attribute_impl<opt_false>::parse_wconv;
+               case 5:  return strconv_attribute_impl<opt_true>::parse_wconv;
+               case 6:  return strconv_attribute_impl<opt_false>::parse_wconv;
+               case 7:  return strconv_attribute_impl<opt_true>::parse_wconv;
+               case 8:  return strconv_attribute_impl<opt_false>::parse_wnorm;
+               case 9:  return strconv_attribute_impl<opt_true>::parse_wnorm;
+               case 10: return strconv_attribute_impl<opt_false>::parse_wnorm;
+               case 11: return strconv_attribute_impl<opt_true>::parse_wnorm;
+               case 12: return strconv_attribute_impl<opt_false>::parse_wnorm;
+               case 13: return strconv_attribute_impl<opt_true>::parse_wnorm;
+               case 14: return strconv_attribute_impl<opt_false>::parse_wnorm;
+               case 15: return strconv_attribute_impl<opt_true>::parse_wnorm;
+               default: assert(false); return 0; // should not get here
+               }
+       }
+
+       inline xml_parse_result make_parse_result(xml_parse_status status, ptrdiff_t offset = 0)
+       {
+               xml_parse_result result;
+               result.status = status;
+               result.offset = offset;
+
+               return result;
+       }
+
+       struct xml_parser
+       {
+               xml_allocator alloc;
+               char_t* error_offset;
+               xml_parse_status error_status;
+               
+               xml_parser(const xml_allocator& alloc_): alloc(alloc_), error_offset(0), error_status(status_ok)
+               {
+               }
+
+               // DOCTYPE consists of nested sections of the following possible types:
+               // <!-- ... -->, <? ... ?>, "...", '...'
+               // <![...]]>
+               // <!...>
+               // First group can not contain nested groups
+               // Second group can contain nested groups of the same type
+               // Third group can contain all other groups
+               char_t* parse_doctype_primitive(char_t* s)
+               {
+                       if (*s == '"' || *s == '\'')
+                       {
+                               // quoted string
+                               char_t ch = *s++;
+                               PUGI__SCANFOR(*s == ch);
+                               if (!*s) PUGI__THROW_ERROR(status_bad_doctype, s);
+
+                               s++;
+                       }
+                       else if (s[0] == '<' && s[1] == '?')
+                       {
+                               // <? ... ?>
+                               s += 2;
+                               PUGI__SCANFOR(s[0] == '?' && s[1] == '>'); // no need for ENDSWITH because ?> can't terminate proper doctype
+                               if (!*s) PUGI__THROW_ERROR(status_bad_doctype, s);
+
+                               s += 2;
+                       }
+                       else if (s[0] == '<' && s[1] == '!' && s[2] == '-' && s[3] == '-')
+                       {
+                               s += 4;
+                               PUGI__SCANFOR(s[0] == '-' && s[1] == '-' && s[2] == '>'); // no need for ENDSWITH because --> can't terminate proper doctype
+                               if (!*s) PUGI__THROW_ERROR(status_bad_doctype, s);
+
+                               s += 4;
+                       }
+                       else PUGI__THROW_ERROR(status_bad_doctype, s);
+
+                       return s;
+               }
+
+               char_t* parse_doctype_ignore(char_t* s)
+               {
+                       size_t depth = 0;
+
+                       assert(s[0] == '<' && s[1] == '!' && s[2] == '[');
+                       s += 3;
+
+                       while (*s)
+                       {
+                               if (s[0] == '<' && s[1] == '!' && s[2] == '[')
+                               {
+                                       // nested ignore section
+                                       s += 3;
+                                       depth++;
+                               }
+                               else if (s[0] == ']' && s[1] == ']' && s[2] == '>')
+                               {
+                                       // ignore section end
+                                       s += 3;
+
+                                       if (depth == 0)
+                                               return s;
+
+                                       depth--;
+                               }
+                               else s++;
+                       }
+
+                       PUGI__THROW_ERROR(status_bad_doctype, s);
+               }
+
+               char_t* parse_doctype_group(char_t* s, char_t endch)
+               {
+                       size_t depth = 0;
+
+                       assert((s[0] == '<' || s[0] == 0) && s[1] == '!');
+                       s += 2;
+
+                       while (*s)
+                       {
+                               if (s[0] == '<' && s[1] == '!' && s[2] != '-')
+                               {
+                                       if (s[2] == '[')
+                                       {
+                                               // ignore
+                                               s = parse_doctype_ignore(s);
+                                               if (!s) return s;
+                                       }
+                                       else
+                                       {
+                                               // some control group
+                                               s += 2;
+                                               depth++;
+                                       }
+                               }
+                               else if (s[0] == '<' || s[0] == '"' || s[0] == '\'')
+                               {
+                                       // unknown tag (forbidden), or some primitive group
+                                       s = parse_doctype_primitive(s);
+                                       if (!s) return s;
+                               }
+                               else if (*s == '>')
+                               {
+                                       if (depth == 0)
+                                               return s;
+
+                                       depth--;
+                                       s++;
+                               }
+                               else s++;
+                       }
+
+                       if (depth != 0 || endch != '>') PUGI__THROW_ERROR(status_bad_doctype, s);
+
+                       return s;
+               }
+
+               // Parses a '<!...' construct: a comment ('<!--'), a CDATA section ('<![CDATA[')
+               // or a DOCTYPE declaration.
+               // s points at the '!' on entry; cursor is the node that new nodes are appended
+               // under; optmsk holds the parse option flags tested with PUGI__OPTSET; endch is
+               // consulted by PUGI__ENDSWITH and by the end-of-buffer checks at the bottom.
+               // Returns the position just past the construct; errors are reported through
+               // PUGI__THROW_ERROR / PUGI__CHECK_ERROR (callers treat a null result as failure).
+               // NOTE(review): cursor is taken by value, so any repositioning done by
+               // PUGI__PUSHNODE stays local to this function - confirm against the macro.
+               char_t* parse_exclamation(char_t* s, xml_node_struct* cursor, unsigned int optmsk, char_t endch)
+               {
+                       // parse node contents, starting with exclamation mark
+                       ++s;
+
+                       if (*s == '-') // '<!-...'
+                       {
+                               ++s;
+
+                               if (*s == '-') // '<!--...'
+                               {
+                                       ++s;
+
+                                       if (PUGI__OPTSET(parse_comments))
+                                       {
+                                               PUGI__PUSHNODE(node_comment); // Append a new node on the tree.
+                                               cursor->value = s; // Save the offset.
+                                       }
+
+                                       // EOL conversion is only needed when the comment text is kept
+                                       if (PUGI__OPTSET(parse_eol) && PUGI__OPTSET(parse_comments))
+                                       {
+                                               s = strconv_comment(s, endch);
+
+                                               if (!s) PUGI__THROW_ERROR(status_bad_comment, cursor->value);
+                                       }
+                                       else
+                                       {
+                                               // Scan for terminating '-->'.
+                                               PUGI__SCANFOR(s[0] == '-' && s[1] == '-' && PUGI__ENDSWITH(s[2], '>'));
+                                               PUGI__CHECK_ERROR(status_bad_comment, s);
+
+                                               if (PUGI__OPTSET(parse_comments))
+                                                       *s = 0; // Zero-terminate this segment at the first terminating '-'.
+
+                                               // only two characters are skipped when s[2] is not '>'
+                                               // (presumably the buffer ended and endch stood in for it)
+                                               s += (s[2] == '>' ? 3 : 2); // Step over the '\0->'.
+                                       }
+                               }
+                               else PUGI__THROW_ERROR(status_bad_comment, s);
+                       }
+                       else if (*s == '[')
+                       {
+                               // '<![CDATA[...'
+                               if (*++s=='C' && *++s=='D' && *++s=='A' && *++s=='T' && *++s=='A' && *++s == '[')
+                               {
+                                       ++s;
+
+                                       if (PUGI__OPTSET(parse_cdata))
+                                       {
+                                               PUGI__PUSHNODE(node_cdata); // Append a new node on the tree.
+                                               cursor->value = s; // Save the offset.
+
+                                               if (PUGI__OPTSET(parse_eol))
+                                               {
+                                                       s = strconv_cdata(s, endch);
+
+                                                       if (!s) PUGI__THROW_ERROR(status_bad_cdata, cursor->value);
+                                               }
+                                               else
+                                               {
+                                                       // Scan for terminating ']]>'.
+                                                       PUGI__SCANFOR(s[0] == ']' && s[1] == ']' && PUGI__ENDSWITH(s[2], '>'));
+                                                       PUGI__CHECK_ERROR(status_bad_cdata, s);
+
+                                                       *s++ = 0; // Zero-terminate this segment.
+                                               }
+                                       }
+                                       else // Flagged for discard, but we still have to scan for the terminator.
+                                       {
+                                               // Scan for terminating ']]>'.
+                                               PUGI__SCANFOR(s[0] == ']' && s[1] == ']' && PUGI__ENDSWITH(s[2], '>'));
+                                               PUGI__CHECK_ERROR(status_bad_cdata, s);
+
+                                               ++s;
+                                       }
+
+                                       // in the two scanning branches above s has already moved past the first ']'
+                                       // NOTE(review): this presumes strconv_cdata leaves s at the same position - confirm
+                                       s += (s[1] == '>' ? 2 : 1); // Step over the last ']>'.
+                               }
+                               else PUGI__THROW_ERROR(status_bad_cdata, s);
+                       }
+                       else if (s[0] == 'D' && s[1] == 'O' && s[2] == 'C' && s[3] == 'T' && s[4] == 'Y' && s[5] == 'P' && PUGI__ENDSWITH(s[6], 'E'))
+                       {
+                               // step back from 'D' to the position parse_doctype_group expects ('<' or 0, followed by '!')
+                               s -= 2;
+
+                               // DOCTYPE is rejected when cursor has a parent
+                               if (cursor->parent) PUGI__THROW_ERROR(status_bad_doctype, s);
+
+                               // "<!DOCTYPE" is nine characters, so mark points just past the keyword
+                               char_t* mark = s + 9;
+
+                               s = parse_doctype_group(s, endch);
+                               if (!s) return s;
+
+                               assert((*s == 0 && endch == '>') || *s == '>');
+                               // replace the closing '>' with a terminator so the doctype text ends there
+                               if (*s) *s++ = 0;
+
+                               if (PUGI__OPTSET(parse_doctype))
+                               {
+                                       // skip whitespace between the keyword and the doctype contents
+                                       while (PUGI__IS_CHARTYPE(*mark, ct_space)) ++mark;
+
+                                       PUGI__PUSHNODE(node_doctype);
+
+                                       cursor->value = mark;
+                               }
+                       }
+                       // buffer ended right after '<!': pick the error from endch
+                       else if (*s == 0 && endch == '-') PUGI__THROW_ERROR(status_bad_comment, s);
+                       else if (*s == 0 && endch == '[') PUGI__THROW_ERROR(status_bad_cdata, s);
+                       else PUGI__THROW_ERROR(status_unrecognized_tag, s);
+
+                       return s;
+               }
+
+               // Parses a '<?...' construct: either the XML declaration (a target equal to
+               // "xml" in any letter case) or an ordinary processing instruction.
+               // s points at the '?' on entry; ref_cursor is the current node and receives the
+               // function's working cursor on successful exit; optmsk holds the parse option
+               // flags tested with PUGI__OPTSET; endch is consulted by PUGI__ENDSWITH.
+               // Returns the position to continue parsing from. For a declaration that has
+               // content, that is the start of the content, with the cursor left on the
+               // declaration node so the caller can parse it as attributes.
+               // Errors are reported through PUGI__THROW_ERROR / PUGI__CHECK_ERROR.
+               char_t* parse_question(char_t* s, xml_node_struct*& ref_cursor, unsigned int optmsk, char_t endch)
+               {
+                       // load into registers
+                       xml_node_struct* cursor = ref_cursor;
+                       char_t ch = 0; // presumably written by PUGI__ENDSEG with the character it overwrites - confirm against the macro
+
+                       // parse node contents, starting with question mark
+                       ++s;
+
+                       // read PI target
+                       char_t* target = s;
+
+                       if (!PUGI__IS_CHARTYPE(*s, ct_start_symbol)) PUGI__THROW_ERROR(status_bad_pi, s);
+
+                       PUGI__SCANWHILE(PUGI__IS_CHARTYPE(*s, ct_symbol));
+                       PUGI__CHECK_ERROR(status_bad_pi, s);
+
+                       // determine node type; stricmp / strcasecmp is not portable
+                       // (OR-ing with ' ' folds ASCII upper-case letters to lower case; target + 3 == s requires exactly three characters)
+                       bool declaration = (target[0] | ' ') == 'x' && (target[1] | ' ') == 'm' && (target[2] | ' ') == 'l' && target + 3 == s;
+
+                       // a node is created only if the option matching its kind is enabled
+                       if (declaration ? PUGI__OPTSET(parse_declaration) : PUGI__OPTSET(parse_pi))
+                       {
+                               if (declaration)
+                               {
+                                       // disallow non top-level declarations
+                                       if (cursor->parent) PUGI__THROW_ERROR(status_bad_pi, s);
+
+                                       PUGI__PUSHNODE(node_declaration);
+                               }
+                               else
+                               {
+                                       PUGI__PUSHNODE(node_pi);
+                               }
+
+                               cursor->name = target;
+
+                               PUGI__ENDSEG();
+
+                               // parse value/attributes
+                               if (ch == '?')
+                               {
+                                       // empty node
+                                       if (!PUGI__ENDSWITH(*s, '>')) PUGI__THROW_ERROR(status_bad_pi, s);
+                                       s += (*s == '>'); // advance only if '>' is actually present
+
+                                       PUGI__POPNODE();
+                               }
+                               else if (PUGI__IS_CHARTYPE(ch, ct_space))
+                               {
+                                       PUGI__SKIPWS();
+
+                                       // scan for tag end
+                                       char_t* value = s;
+
+                                       PUGI__SCANFOR(s[0] == '?' && PUGI__ENDSWITH(s[1], '>'));
+                                       PUGI__CHECK_ERROR(status_bad_pi, s);
+
+                                       if (declaration)
+                                       {
+                                               // replace ending ? with / so that 'element' terminates properly
+                                               *s = '/';
+
+                                               // we exit from this function with cursor at node_declaration, which is a signal to parse() to go to LOC_ATTRIBUTES
+                                               s = value;
+                                       }
+                                       else
+                                       {
+                                               // store value and step over >
+                                               cursor->value = value;
+                                               PUGI__POPNODE();
+
+                                               PUGI__ENDSEG();
+
+                                               s += (*s == '>'); // advance only if '>' is actually present
+                                       }
+                               }
+                               else PUGI__THROW_ERROR(status_bad_pi, s);
+                       }
+                       else
+                       {
+                               // node is discarded: no node is created, just find the end of the construct
+                               // scan for tag end
+                               PUGI__SCANFOR(s[0] == '?' && PUGI__ENDSWITH(s[1], '>'));
+                               PUGI__CHECK_ERROR(status_bad_pi, s);
+
+                               // step over '?>' or, when s[1] is not '>', over '?' alone
+                               s += (s[1] == '>' ? 2 : 1);
+                       }
+
+                       // store from registers
+                       ref_cursor = cursor;
+
+                       return s;
+               }
+
+               char_t* parse_tree(char_t* s, xml_node_struct* root, unsigned int optmsk, char_t endch)
+               {
+                       strconv_attribute_t strconv_attribute = get_strconv_attribute(optmsk);
+                       strconv_pcdata_t strconv_pcdata = get_strconv_pcdata(optmsk);
+                       
+                       char_t ch = 0;
+                       xml_node_struct* cursor = root;
+                       char_t* mark = s;
+
+                       while (*s != 0)
+                       {
+                               if (*s == '<')
+                               {
+                                       ++s;
+
+                               LOC_TAG:
+                                       if (PUGI__IS_CHARTYPE(*s, ct_start_symbol)) // '<#...'
+                                       {
+                                               PUGI__PUSHNODE(node_element); // Append a new node to the tree.
+
+                                               cursor->name = s;
+
+                                               PUGI__SCANWHILE_UNROLL(PUGI__IS_CHARTYPE(ss, ct_symbol)); // Scan for a terminator.
+                                               PUGI__ENDSEG(); // Save char in 'ch', terminate & step over.
+
+                                               if (ch == '>')
+                                               {
+                                                       // end of tag
+                                               }
+                                               else if (PUGI__IS_CHARTYPE(ch, ct_space))
+                                               {
+                                               LOC_ATTRIBUTES:
+                                                       while (true)
+                                                       {
+                                                               PUGI__SKIPWS(); // Eat any whitespace.
+                                               
+                                                               if (PUGI__IS_CHARTYPE(*s, ct_start_symbol)) // <... #...
+                                                               {
+                                                                       xml_attribute_struct* a = append_new_attribute(cursor, alloc); // Make space for this attribute.
+                                                                       if (!a) PUGI__THROW_ERROR(status_out_of_memory, s);
+
+                                                                       a->name = s; // Save the offset.
+
+                                                                       PUGI__SCANWHILE_UNROLL(PUGI__IS_CHARTYPE(ss, ct_symbol)); // Scan for a terminator.
+                                                                       PUGI__ENDSEG(); // Save char in 'ch', terminate & step over.
+
+                                                                       if (PUGI__IS_CHARTYPE(ch, ct_space))
+                                                                       {
+                                                                               PUGI__SKIPWS(); // Eat any whitespace.
+
+                                                                               ch = *s;
+                                                                               ++s;
+                                                                       }
+                                                                       
+                                                                       if (ch == '=') // '<... #=...'
+                                                                       {
+                                                                               PUGI__SKIPWS(); // Eat any whitespace.
+
+                                                                               if (*s == '"' || *s == '\'') // '<... #="...'
+                                                                               {
+                                                                                       ch = *s; // Save quote char to avoid breaking on "''" -or- '""'.
+                                                                                       ++s; // Step over the quote.
+                                                                                       a->value = s; // Save the offset.
+
+                                                                                       s = strconv_attribute(s, ch);
+                                                                               
+                                                                                       if (!s) PUGI__THROW_ERROR(status_bad_attribute, a->value);
+
+                                                                                       // After this line the loop continues from the start;
+                                                                                       // Whitespaces, / and > are ok, symbols and EOF are wrong,
+                                                                                       // everything else will be detected
+                                                                                       if (PUGI__IS_CHARTYPE(*s, ct_start_symbol)) PUGI__THROW_ERROR(status_bad_attribute, s);
+                                                                               }
+                                                                               else PUGI__THROW_ERROR(status_bad_attribute, s);
+                                                                       }
+                                                                       else PUGI__THROW_ERROR(status_bad_attribute, s);
+                                                               }
+                                                               else if (*s == '/')
+                                                               {
+                                                                       ++s;
+                                                                       
+                                                                       if (*s == '>')
+                                                                       {
+                                                                               PUGI__POPNODE();
+                                                                               s++;
+                                                                               break;
+                                                                       }
+                                                                       else if (*s == 0 && endch == '>')
+                                                                       {
+                                                                               PUGI__POPNODE();
+                                                                               break;
+                                                                       }
+                                                                       else PUGI__THROW_ERROR(status_bad_start_element, s);
+                                                               }
+                                                               else if (*s == '>')
+                                                               {
+                                                                       ++s;
+
+                                                                       break;
+                                                               }
+                                                               else if (*s == 0 && endch == '>')
+                                                               {
+                                                                       break;
+                                                               }
+                                                               else PUGI__THROW_ERROR(status_bad_start_element, s);
+                                                       }
+
+                                                       // !!!
+                                               }
+                                               else if (ch == '/') // '<#.../'
+                                               {
+                                                       if (!PUGI__ENDSWITH(*s, '>')) PUGI__THROW_ERROR(status_bad_start_element, s);
+
+                                                       PUGI__POPNODE(); // Pop.
+
+                                                       s += (*s == '>');
+                                               }
+                                               else if (ch == 0)
+                                               {
+                                                       // we stepped over null terminator, backtrack & handle closing tag
+                                                       --s;
+                                                       
+                                                       if (endch != '>') PUGI__THROW_ERROR(status_bad_start_element, s);
+                                               }
+                                               else PUGI__THROW_ERROR(status_bad_start_element, s);
+                                       }
+                                       else if (*s == '/')
+                                       {
+                                               ++s;
+
+                                               char_t* name = cursor->name;
+                                               if (!name) PUGI__THROW_ERROR(status_end_element_mismatch, s);
+                                               
+                                               while (PUGI__IS_CHARTYPE(*s, ct_symbol))
+                                               {
+                                                       if (*s++ != *name++) PUGI__THROW_ERROR(status_end_element_mismatch, s);
+                                               }
+
+                                               if (*name)
+                                               {
+                                                       if (*s == 0 && name[0] == endch && name[1] == 0) PUGI__THROW_ERROR(status_bad_end_element, s);
+                                                       else PUGI__THROW_ERROR(status_end_element_mismatch, s);
+                                               }
+                                                       
+                                               PUGI__POPNODE(); // Pop.
+
+                                               PUGI__SKIPWS();
+
+                                               if (*s == 0)
+                                               {
+                                                       if (endch != '>') PUGI__THROW_ERROR(status_bad_end_element, s);
+                                               }
+                                               else
+                                               {
+                                                       if (*s != '>') PUGI__THROW_ERROR(status_bad_end_element, s);
+                                                       ++s;
+                                               }
+                                       }
+                                       else if (*s == '?') // '<?...'
+                                       {
+                                               s = parse_question(s, cursor, optmsk, endch);
+                                               if (!s) return s;
+
+                                               assert(cursor);
+                                               if (PUGI__NODETYPE(cursor) == node_declaration) goto LOC_ATTRIBUTES;
+                                       }
+                                       else if (*s == '!') // '<!...'
+                                       {
+                                               s = parse_exclamation(s, cursor, optmsk, endch);
+                                               if (!s) return s;
+                                       }
+                                       else if (*s == 0 && endch == '?') PUGI__THROW_ERROR(status_bad_pi, s);
+                                       else PUGI__THROW_ERROR(status_unrecognized_tag, s);
+                               }
+                               else
+                               {
+                                       mark = s; // Save this offset while searching for a terminator.
+
+                                       PUGI__SKIPWS(); // Eat whitespace if no genuine PCDATA here.
+
+                                       if (*s == '<' || !*s)
+                                       {
+                                               // We skipped some whitespace characters because otherwise we would take the tag branch instead of PCDATA one
+                                               assert(mark != s);
+
+                                               if (!PUGI__OPTSET(parse_ws_pcdata | parse_ws_pcdata_single) || PUGI__OPTSET(parse_trim_pcdata))
+                                               {
+                                                       continue;
+                                               }
+                                               else if (PUGI__OPTSET(parse_ws_pcdata_single))
+                                               {
+                                                       if (s[0] != '<' || s[1] != '/' || cursor->first_child) continue;
+                                               }
+                                       }
+
+                                       if (!PUGI__OPTSET(parse_trim_pcdata))
+                                               s = mark;
+                                                       
+                                       if (cursor->parent || PUGI__OPTSET(parse_fragment))
+                                       {
+                                               PUGI__PUSHNODE(node_pcdata); // Append a new node on the tree.
+                                               cursor->value = s; // Save the offset.
+
+                                               s = strconv_pcdata(s);
+                                                               
+                                               PUGI__POPNODE(); // Pop since this is a standalone.
+                                               
+                                               if (!*s) break;
+                                       }
+                                       else
+                                       {
+                                               PUGI__SCANFOR(*s == '<'); // '...<'
+                                               if (!*s) break;
+                                               
+                                               ++s;
+                                       }
+
+                                       // We're after '<'
+                                       goto LOC_TAG;
+                               }
+                       }
+
+                       // check that last tag is closed
+                       if (cursor != root) PUGI__THROW_ERROR(status_end_element_mismatch, s);
+
+                       return s;
+               }
+
+       #ifdef PUGIXML_WCHAR_MODE
+               static char_t* parse_skip_bom(char_t* s)
+               {
+                       unsigned int bom = 0xfeff;
+                       return (s[0] == static_cast<wchar_t>(bom)) ? s + 1 : s;
+               }
+       #else
+               static char_t* parse_skip_bom(char_t* s)
+               {
+                       return (s[0] == '\xef' && s[1] == '\xbb' && s[2] == '\xbf') ? s + 3 : s;
+               }
+       #endif
+
+               static bool has_element_node_siblings(xml_node_struct* node)
+               {
+                       while (node)
+                       {
+                               if (PUGI__NODETYPE(node) == node_element) return true;
+
+                               node = node->next_sibling;
+                       }
+
+                       return false;
+               }
+
+               static xml_parse_result parse(char_t* buffer, size_t length, xml_document_struct* xmldoc, xml_node_struct* root, unsigned int optmsk)
+               {
+                       // Parses length characters of buffer (modified in place) into root; the allocator object is a part of document object
+                       xml_allocator& alloc_ = *static_cast<xml_allocator*>(xmldoc);
+
+                       // early-out for empty documents: an empty fragment is accepted, an empty document has no document element
+                       if (length == 0)
+                               return make_parse_result(PUGI__OPTSET(parse_fragment) ? status_ok : status_no_document_element);
+
+                       // get last child of the root before parsing (used below to find the first child added by this parse)
+                       xml_node_struct* last_root_child = root->first_child ? root->first_child->prev_sibling_c : 0;
+       
+                       // create parser on stack
+                       xml_parser parser(alloc_);
+
+                       // save last character and make buffer zero-terminated (speeds up parsing); endch is handed to the parser below
+                       char_t endch = buffer[length - 1];
+                       buffer[length - 1] = 0;
+                       
+                       // skip BOM to make sure it does not end up as part of parse output
+                       char_t* buffer_data = parse_skip_bom(buffer);
+
+                       // perform actual parsing
+                       parser.parse_tree(buffer_data, root, optmsk, endch);
+
+                       // update allocator state
+                       alloc_ = parser.alloc;
+                       // error_offset is a pointer into the buffer; turn it into an index relative to buffer (0 when it is not set)
+                       xml_parse_result result = make_parse_result(parser.error_status, parser.error_offset ? parser.error_offset - buffer : 0);
+                       assert(result.offset >= 0 && static_cast<size_t>(result.offset) <= length);
+
+                       if (result)
+                       {
+                               // since we removed last character, we have to handle the only possible false positive (stray <)
+                               if (endch == '<')
+                                       return make_parse_result(status_unrecognized_tag, length - 1);
+
+                               // check if there are any element nodes parsed
+                               xml_node_struct* first_root_child_parsed = last_root_child ? last_root_child->next_sibling : root->first_child;
+                               // unless parsing a fragment, at least one of the newly added root children must be an element
+                               if (!PUGI__OPTSET(parse_fragment) && !has_element_node_siblings(first_root_child_parsed))
+                                       return make_parse_result(status_no_document_element, length - 1);
+                       }
+                       else
+                       {
+                               // roll back offset if it occurs on a null terminator in the source buffer
+                               if (result.offset > 0 && static_cast<size_t>(result.offset) == length - 1 && endch == 0)
+                                       result.offset--;
+                       }
+
+                       return result;
+               }
+       };
+
+       // Output facilities
+       // Returns the encoding for which buffered char_t data is written out without conversion:
+       // the wchar encoding in PUGIXML_WCHAR_MODE builds, utf8 otherwise.
+       PUGI__FN xml_encoding get_write_native_encoding()
+       {
+       #ifdef PUGIXML_WCHAR_MODE
+               xml_encoding native = get_wchar_encoding();
+       #else
+               xml_encoding native = encoding_utf8;
+       #endif
+
+               return native;
+       }
+
+       PUGI__FN xml_encoding get_write_encoding(xml_encoding encoding)
+       {
+               // replace wchar encoding with utf implementation
+               if (encoding == encoding_wchar) return get_wchar_encoding();
+
+               // replace utf16 encoding with utf16 with specific endianness
+               if (encoding == encoding_utf16) return is_little_endian() ? encoding_utf16_le : encoding_utf16_be;
+
+               // replace utf32 encoding with utf32 with specific endianness
+               if (encoding == encoding_utf32) return is_little_endian() ? encoding_utf32_le : encoding_utf32_be;
+
+               // only do autodetection if no explicit encoding is requested
+               if (encoding != encoding_auto) return encoding;
+
+               // assume utf8 encoding
+               return encoding_utf8;
+       }
+
+#ifdef PUGIXML_WCHAR_MODE
+       PUGI__FN size_t get_valid_length(const char_t* data, size_t length)
+       {
+               if (length < 1) return 0;
+
+               // discard last character if it's the lead of a surrogate pair 
+               return (sizeof(wchar_t) == 2 && static_cast<unsigned int>(static_cast<uint16_t>(data[length - 1]) - 0xD800) < 0x400) ? length - 1 : length;
+       }
+
+       PUGI__FN size_t convert_buffer_output(char_t* r_char, uint8_t* r_u8, uint16_t* r_u16, uint32_t* r_u32, const char_t* data, size_t length, xml_encoding encoding)
+       {
+               // only endian-swapping is required
+               if (need_endian_swap_utf(encoding, get_wchar_encoding()))
+               {
+                       convert_wchar_endian_swap(r_char, data, length);
+
+                       return length * sizeof(char_t);
+               }
+       
+               // convert to utf8
+               if (encoding == encoding_utf8)
+               {
+                       uint8_t* dest = r_u8;
+                       uint8_t* end = utf_decoder<utf8_writer>::decode_wchar_block(data, length, dest);
+
+                       return static_cast<size_t>(end - dest);
+               }
+
+               // convert to utf16
+               if (encoding == encoding_utf16_be || encoding == encoding_utf16_le)
+               {
+                       uint16_t* dest = r_u16;
+
+                       // convert to native utf16
+                       uint16_t* end = utf_decoder<utf16_writer>::decode_wchar_block(data, length, dest);
+
+                       // swap if necessary
+                       xml_encoding native_encoding = is_little_endian() ? encoding_utf16_le : encoding_utf16_be;
+
+                       if (native_encoding != encoding) convert_utf_endian_swap(dest, dest, static_cast<size_t>(end - dest));
+
+                       return static_cast<size_t>(end - dest) * sizeof(uint16_t);
+               }
+
+               // convert to utf32
+               if (encoding == encoding_utf32_be || encoding == encoding_utf32_le)
+               {
+                       uint32_t* dest = r_u32;
+
+                       // convert to native utf32
+                       uint32_t* end = utf_decoder<utf32_writer>::decode_wchar_block(data, length, dest);
+
+                       // swap if necessary
+                       xml_encoding native_encoding = is_little_endian() ? encoding_utf32_le : encoding_utf32_be;
+
+                       if (native_encoding != encoding) convert_utf_endian_swap(dest, dest, static_cast<size_t>(end - dest));
+
+                       return static_cast<size_t>(end - dest) * sizeof(uint32_t);
+               }
+
+               // convert to latin1
+               if (encoding == encoding_latin1)
+               {
+                       uint8_t* dest = r_u8;
+                       uint8_t* end = utf_decoder<latin1_writer>::decode_wchar_block(data, length, dest);
+
+                       return static_cast<size_t>(end - dest);
+               }
+
+               assert(!"Invalid encoding");
+               return 0;
+       }
+#else
+       PUGI__FN size_t get_valid_length(const char_t* data, size_t length)
+       {
+               if (length < 5) return 0;
+
+               for (size_t i = 1; i <= 4; ++i)
+               {
+                       uint8_t ch = static_cast<uint8_t>(data[length - i]);
+
+                       // either a standalone character or a leading one
+                       if ((ch & 0xc0) != 0x80) return length - i;
+               }
+
+               // there are four non-leading characters at the end, sequence tail is broken so might as well process the whole chunk
+               return length;
+       }
+
+       PUGI__FN size_t convert_buffer_output(char_t* /* r_char */, uint8_t* r_u8, uint16_t* r_u16, uint32_t* r_u32, const char_t* data, size_t length, xml_encoding encoding)
+       {
+               if (encoding == encoding_utf16_be || encoding == encoding_utf16_le)
+               {
+                       uint16_t* dest = r_u16;
+
+                       // convert to native utf16
+                       uint16_t* end = utf_decoder<utf16_writer>::decode_utf8_block(reinterpret_cast<const uint8_t*>(data), length, dest);
+
+                       // swap if necessary
+                       xml_encoding native_encoding = is_little_endian() ? encoding_utf16_le : encoding_utf16_be;
+
+                       if (native_encoding != encoding) convert_utf_endian_swap(dest, dest, static_cast<size_t>(end - dest));
+
+                       return static_cast<size_t>(end - dest) * sizeof(uint16_t);
+               }
+
+               if (encoding == encoding_utf32_be || encoding == encoding_utf32_le)
+               {
+                       uint32_t* dest = r_u32;
+
+                       // convert to native utf32
+                       uint32_t* end = utf_decoder<utf32_writer>::decode_utf8_block(reinterpret_cast<const uint8_t*>(data), length, dest);
+
+                       // swap if necessary
+                       xml_encoding native_encoding = is_little_endian() ? encoding_utf32_le : encoding_utf32_be;
+
+                       if (native_encoding != encoding) convert_utf_endian_swap(dest, dest, static_cast<size_t>(end - dest));
+
+                       return static_cast<size_t>(end - dest) * sizeof(uint32_t);
+               }
+
+               if (encoding == encoding_latin1)
+               {
+                       uint8_t* dest = r_u8;
+                       uint8_t* end = utf_decoder<latin1_writer>::decode_utf8_block(reinterpret_cast<const uint8_t*>(data), length, dest);
+
+                       return static_cast<size_t>(end - dest);
+               }
+
+               assert(!"Invalid encoding");
+               return 0;
+       }
+#endif
+
+       class xml_buffered_writer // collects char_t output in a fixed-size buffer and passes it to an xml_writer, converting to the target encoding when it is not the native one
+       {
+               xml_buffered_writer(const xml_buffered_writer&); // copy operations are declared private with no definition in view - presumably to forbid copying
+               xml_buffered_writer& operator=(const xml_buffered_writer&);
+
+       public:
+               xml_buffered_writer(xml_writer& writer_, xml_encoding user_encoding): writer(writer_), bufsize(0), encoding(get_write_encoding(user_encoding))
+               {
+                       PUGI__STATIC_ASSERT(bufcapacity >= 8);
+               }
+               // destructor flushes whatever is still buffered
+               ~xml_buffered_writer()
+               {
+                       flush();
+               }
+               // writes out the buffered characters and empties the buffer; always returns 0, which the write() overloads use as their new offset
+               size_t flush()
+               {
+                       flush(buffer, bufsize);
+                       bufsize = 0;
+                       return 0;
+               }
+               // passes size characters of data to the underlying writer, converting them through scratch unless the encoding is the native one
+               void flush(const char_t* data, size_t size)
+               {
+                       if (size == 0) return;
+
+                       // fast path, just write data
+                       if (encoding == get_write_native_encoding())
+                               writer.write(data, size * sizeof(char_t));
+                       else
+                       {
+                               // convert chunk
+                               size_t result = convert_buffer_output(scratch.data_char, scratch.data_u8, scratch.data_u16, scratch.data_u32, data, size, encoding);
+                               assert(result <= sizeof(scratch));
+
+                               // write data (all scratch members share storage, so data_u8 addresses the converted bytes whichever member was filled)
+                               writer.write(scratch.data_u8, result);
+                       }
+               }
+               // flushes the buffer, then outputs data: large inputs are written directly (native encoding) or converted chunk by chunk, the remainder is copied into the buffer
+               void write_direct(const char_t* data, size_t length)
+               {
+                       // flush the remaining buffer contents
+                       flush();
+
+                       // handle large chunks
+                       if (length > bufcapacity)
+                       {
+                               if (encoding == get_write_native_encoding())
+                               {
+                                       // fast path, can just write data chunk
+                                       writer.write(data, length * sizeof(char_t));
+                                       return;
+                               }
+
+                               // need to convert in suitable chunks
+                               while (length > bufcapacity)
+                               {
+                                       // get chunk size by selecting such number of characters that are guaranteed to fit into scratch buffer
+                                       // and form a complete codepoint sequence (i.e. discard start of last codepoint if necessary)
+                                       size_t chunk_size = get_valid_length(data, bufcapacity);
+                                       assert(chunk_size);
+
+                                       // convert chunk and write
+                                       flush(data, chunk_size);
+
+                                       // iterate
+                                       data += chunk_size;
+                                       length -= chunk_size;
+                               }
+
+                               // small tail is copied below
+                               bufsize = 0;
+                       }
+                       // at this point length <= bufcapacity and the buffer is empty, so the copy fits
+                       memcpy(buffer + bufsize, data, length * sizeof(char_t));
+                       bufsize += length;
+               }
+               // appends length characters to the buffer when they fit, otherwise hands them to write_direct
+               void write_buffer(const char_t* data, size_t length)
+               {
+                       size_t offset = bufsize;
+
+                       if (offset + length <= bufcapacity)
+                       {
+                               memcpy(buffer + offset, data, length * sizeof(char_t));
+                               bufsize = offset + length;
+                       }
+                       else
+                       {
+                               write_direct(data, length);
+                       }
+               }
+               // writes a zero-terminated string: copies characters into the buffer until it is full, then sends the rest through write_direct
+               void write_string(const char_t* data)
+               {
+                       // write the part of the string that fits in the buffer
+                       size_t offset = bufsize;
+
+                       while (*data && offset < bufcapacity)
+                               buffer[offset++] = *data++;
+
+                       // write the rest
+                       if (offset < bufcapacity)
+                       {
+                               bufsize = offset;
+                       }
+                       else
+                       {
+                               // backtrack a bit if we have split the codepoint
+                               size_t length = offset - bufsize;
+                               size_t extra = length - get_valid_length(data - length, length);
+                               // keep only the characters before the cut in the buffer; the last extra ones are re-sent together with the remainder
+                               bufsize = offset - extra;
+
+                               write_direct(data - extra, strlength(data) + extra);
+                       }
+               }
+               // write(d0, ...) overloads below append 1 to 6 characters, flushing first when they would not fit
+               void write(char_t d0)
+               {
+                       size_t offset = bufsize;
+                       if (offset > bufcapacity - 1) offset = flush();
+
+                       buffer[offset + 0] = d0;
+                       bufsize = offset + 1;
+               }
+
+               void write(char_t d0, char_t d1)
+               {
+                       size_t offset = bufsize;
+                       if (offset > bufcapacity - 2) offset = flush();
+
+                       buffer[offset + 0] = d0;
+                       buffer[offset + 1] = d1;
+                       bufsize = offset + 2;
+               }
+
+               void write(char_t d0, char_t d1, char_t d2)
+               {
+                       size_t offset = bufsize;
+                       if (offset > bufcapacity - 3) offset = flush();
+
+                       buffer[offset + 0] = d0;
+                       buffer[offset + 1] = d1;
+                       buffer[offset + 2] = d2;
+                       bufsize = offset + 3;
+               }
+
+               void write(char_t d0, char_t d1, char_t d2, char_t d3)
+               {
+                       size_t offset = bufsize;
+                       if (offset > bufcapacity - 4) offset = flush();
+
+                       buffer[offset + 0] = d0;
+                       buffer[offset + 1] = d1;
+                       buffer[offset + 2] = d2;
+                       buffer[offset + 3] = d3;
+                       bufsize = offset + 4;
+               }
+
+               void write(char_t d0, char_t d1, char_t d2, char_t d3, char_t d4)
+               {
+                       size_t offset = bufsize;
+                       if (offset > bufcapacity - 5) offset = flush();
+
+                       buffer[offset + 0] = d0;
+                       buffer[offset + 1] = d1;
+                       buffer[offset + 2] = d2;
+                       buffer[offset + 3] = d3;
+                       buffer[offset + 4] = d4;
+                       bufsize = offset + 5;
+               }
+
+               void write(char_t d0, char_t d1, char_t d2, char_t d3, char_t d4, char_t d5)
+               {
+                       size_t offset = bufsize;
+                       if (offset > bufcapacity - 6) offset = flush();
+
+                       buffer[offset + 0] = d0;
+                       buffer[offset + 1] = d1;
+                       buffer[offset + 2] = d2;
+                       buffer[offset + 3] = d3;
+                       buffer[offset + 4] = d4;
+                       buffer[offset + 5] = d5;
+                       bufsize = offset + 6;
+               }
+
+               // utf8 maximum expansion: x4 (-> utf32)
+               // utf16 maximum expansion: x2 (-> utf32)
+               // utf32 maximum expansion: x1
+               enum
+               {
+                       bufcapacitybytes =
+                       #ifdef PUGIXML_MEMORY_OUTPUT_STACK
+                               PUGIXML_MEMORY_OUTPUT_STACK
+                       #else
+                               10240
+                       #endif
+                       ,
+                       bufcapacity = bufcapacitybytes / (sizeof(char_t) + 4) // per character: sizeof(char_t) bytes in buffer plus 4 bytes in scratch
+               };
+
+               char_t buffer[bufcapacity];
+               // conversion target used by flush(); the u8/u16/u32 members each span 4 * bufcapacity bytes
+               union
+               {
+                       uint8_t data_u8[4 * bufcapacity];
+                       uint16_t data_u16[2 * bufcapacity];
+                       uint32_t data_u32[bufcapacity];
+                       char_t data_char[bufcapacity];
+               } scratch;
+
+               xml_writer& writer;
+               size_t bufsize; // number of characters currently held in buffer
+               xml_encoding encoding; // output encoding as resolved by get_write_encoding in the constructor
+       };
+
+       PUGI__FN void text_output_escaped(xml_buffered_writer& writer, const char_t* s, chartypex_t type)
+       {
+               while (*s)
+               {
+                       const char_t* prev = s;
+                       
+                       // While *s is a usual symbol
+                       PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPEX(ss, type));
+               
+                       writer.write_buffer(prev, static_cast<size_t>(s - prev));
+
+                       switch (*s)
+                       {
+                               case 0: break;
+                               case '&':
+                                       writer.write('&', 'a', 'm', 'p', ';');
+                                       ++s;
+                                       break;
+                               case '<':
+                                       writer.write('&', 'l', 't', ';');
+                                       ++s;
+                                       break;
+                               case '>':
+                                       writer.write('&', 'g', 't', ';');
+                                       ++s;
+                                       break;
+                               case '"':
+                                       writer.write('&', 'q', 'u', 'o', 't', ';');
+                                       ++s;
+                                       break;
+                               default: // s is not a usual symbol
+                               {
+                                       unsigned int ch = static_cast<unsigned int>(*s++);
+                                       assert(ch < 32);
+
+                                       writer.write('&', '#', static_cast<char_t>((ch / 10) + '0'), static_cast<char_t>((ch % 10) + '0'), ';');
+                               }
+                       }
+               }
+       }
+
+       PUGI__FN void text_output(xml_buffered_writer& writer, const char_t* s, chartypex_t type, unsigned int flags)
+       {
+               if (flags & format_no_escapes)
+                       writer.write_string(s);
+               else
+                       text_output_escaped(writer, s, type);
+       }
+
+       PUGI__FN void text_output_cdata(xml_buffered_writer& writer, const char_t* s)
+       {
+               do
+               {
+                       writer.write('<', '!', '[', 'C', 'D');
+                       writer.write('A', 'T', 'A', '[');
+
+                       const char_t* prev = s;
+
+                       // look for ]]> sequence - we can't output it as is since it terminates CDATA
+                       while (*s && !(s[0] == ']' && s[1] == ']' && s[2] == '>')) ++s;
+
+                       // skip ]] if we stopped at ]]>, > will go to the next CDATA section
+                       if (*s) s += 2;
+
+                       writer.write_buffer(prev, static_cast<size_t>(s - prev));
+
+                       writer.write(']', ']', '>');
+               }
+               while (*s);
+       }
+
+       PUGI__FN void text_output_indent(xml_buffered_writer& writer, const char_t* indent, size_t indent_length, unsigned int depth)
+       {
+               switch (indent_length)
+               {
+               case 1:
+               {
+                       for (unsigned int i = 0; i < depth; ++i)
+                               writer.write(indent[0]);
+                       break;
+               }
+
+               case 2:
+               {
+                       for (unsigned int i = 0; i < depth; ++i)
+                               writer.write(indent[0], indent[1]);
+                       break;
+               }
+
+               case 3:
+               {
+                       for (unsigned int i = 0; i < depth; ++i)
+                               writer.write(indent[0], indent[1], indent[2]);
+                       break;
+               }
+
+               case 4:
+               {
+                       for (unsigned int i = 0; i < depth; ++i)
+                               writer.write(indent[0], indent[1], indent[2], indent[3]);
+                       break;
+               }
+
+               default:
+               {
+                       for (unsigned int i = 0; i < depth; ++i)
+                               writer.write_buffer(indent, indent_length);
+               }
+               }
+       }
+
+       PUGI__FN void node_output_comment(xml_buffered_writer& writer, const char_t* s)
+       {
+               writer.write('<', '!', '-', '-');
+
+               while (*s)
+               {
+                       const char_t* prev = s;
+
+                       // look for -\0 or -- sequence - we can't output it since -- is illegal in comment body
+                       while (*s && !(s[0] == '-' && (s[1] == '-' || s[1] == 0))) ++s;
+
+                       writer.write_buffer(prev, static_cast<size_t>(s - prev));
+
+                       if (*s)
+                       {
+                               assert(*s == '-');
+
+                               writer.write('-', ' ');
+                               ++s;
+                       }
+               }
+
+               writer.write('-', '-', '>');
+       }
+
+       PUGI__FN void node_output_attributes(xml_buffered_writer& writer, xml_node_struct* node, unsigned int flags)
+       {
+               const char_t* default_name = PUGIXML_TEXT(":anonymous");
+
+               for (xml_attribute_struct* a = node->first_attribute; a; a = a->next_attribute)
+               {
+                       writer.write(' ');
+                       writer.write_string(a->name ? a->name : default_name);
+                       writer.write('=', '"');
+
+                       if (a->value)
+                               text_output(writer, a->value, ctx_special_attr, flags);
+
+                       writer.write('"');
+               }
+       }
+
+       PUGI__FN bool node_output_start(xml_buffered_writer& writer, xml_node_struct* node, unsigned int flags)
+       {
+               const char_t* default_name = PUGIXML_TEXT(":anonymous");
+               const char_t* name = node->name ? node->name : default_name;
+
+               writer.write('<');
+               writer.write_string(name);
+
+               if (node->first_attribute)
+                       node_output_attributes(writer, node, flags);
+
+               if (flags & format_raw)
+               {
+                       if (!node->first_child)
+                               writer.write(' ', '/', '>');
+                       else
+                       {
+                               writer.write('>');
+
+                               return true;
+                       }
+               }
+               else
+               {
+                       xml_node_struct* first = node->first_child;
+
+                       if (!first)
+                               writer.write(' ', '/', '>', '\n');
+                       else if (!first->next_sibling && (PUGI__NODETYPE(first) == node_pcdata || PUGI__NODETYPE(first) == node_cdata))
+                       {
+                               writer.write('>');
+
+                               const char_t* value = first->value ? first->value : PUGIXML_TEXT("");
+
+                               if (PUGI__NODETYPE(first) == node_pcdata)
+                                       text_output(writer, value, ctx_special_pcdata, flags);
+                               else
+                                       text_output_cdata(writer, value);
+
+                               writer.write('<', '/');
+                               writer.write_string(name);
+                               writer.write('>', '\n');
+                       }
+                       else
+                       {
+                               writer.write('>', '\n');
+
+                               return true;
+                       }
+               }
+
+               return false;
+       }
+
+       PUGI__FN void node_output_end(xml_buffered_writer& writer, xml_node_struct* node, unsigned int flags)
+       {
+               const char_t* default_name = PUGIXML_TEXT(":anonymous");
+               const char_t* name = node->name ? node->name : default_name;
+
+               writer.write('<', '/');
+               writer.write_string(name);
+
+               if (flags & format_raw)
+                       writer.write('>');
+               else
+                       writer.write('>', '\n');
+       }
+
+       PUGI__FN void node_output_simple(xml_buffered_writer& writer, xml_node_struct* node, unsigned int flags)
+       {
+               const char_t* default_name = PUGIXML_TEXT(":anonymous");
+
+               switch (PUGI__NODETYPE(node))
+               {
+                       case node_pcdata:
+                               text_output(writer, node->value ? node->value : PUGIXML_TEXT(""), ctx_special_pcdata, flags);
+                               if ((flags & format_raw) == 0) writer.write('\n');
+                               break;
+
+                       case node_cdata:
+                               text_output_cdata(writer, node->value ? node->value : PUGIXML_TEXT(""));
+                               if ((flags & format_raw) == 0) writer.write('\n');
+                               break;
+
+                       case node_comment:
+                               node_output_comment(writer, node->value ? node->value : PUGIXML_TEXT(""));
+                               if ((flags & format_raw) == 0) writer.write('\n');
+                               break;
+
+                       case node_pi:
+                               writer.write('<', '?');
+                               writer.write_string(node->name ? node->name : default_name);
+
+                               if (node->value)
+                               {
+                                       writer.write(' ');
+                                       writer.write_string(node->value);
+                               }
+
+                               writer.write('?', '>');
+                               if ((flags & format_raw) == 0) writer.write('\n');
+                               break;
+
+                       case node_declaration:
+                               writer.write('<', '?');
+                               writer.write_string(node->name ? node->name : default_name);
+                               node_output_attributes(writer, node, flags);
+                               writer.write('?', '>');
+                               if ((flags & format_raw) == 0) writer.write('\n');
+                               break;
+
+                       case node_doctype:
+                               writer.write('<', '!', 'D', 'O', 'C');
+                               writer.write('T', 'Y', 'P', 'E');
+
+                               if (node->value)
+                               {
+                                       writer.write(' ');
+                                       writer.write_string(node->value);
+                               }
+
+                               writer.write('>');
+                               if ((flags & format_raw) == 0) writer.write('\n');
+                               break;
+
+                       default:
+                               assert(!"Invalid node type");
+               }
+       }
+
+       // Serializes the subtree rooted at root using an iterative (non-recursive) walk.
+       // indent is the string repeated per nesting level; it is only used when format_indent is set
+       // and format_raw is not. depth is the nesting level the root is written at.
+       PUGI__FN void node_output(xml_buffered_writer& writer, xml_node_struct* root, const char_t* indent, unsigned int flags, unsigned int depth)
+       {
+               // a zero length disables indentation entirely
+               size_t indent_length = ((flags & (format_indent | format_raw)) == format_indent) ? strlength(indent) : 0;
+
+               xml_node_struct* node = root;
+
+               do
+               {
+                       assert(node);
+
+                       // begin writing current node
+                       if (indent_length)
+                               text_output_indent(writer, indent, indent_length, depth);
+
+                       if (PUGI__NODETYPE(node) == node_element)
+                       {
+                               // a true result means the start tag was left open for children, so descend;
+                               // otherwise the element has been written completely
+                               if (node_output_start(writer, node, flags))
+                               {
+                                       node = node->first_child;
+                                       depth++;
+                                       continue; // jumps to the loop condition; node is now a child, so the walk goes on
+                               }
+                       }
+                       else if (PUGI__NODETYPE(node) == node_document)
+                       {
+                               // a document node produces no output of its own; its children keep the same depth
+                               if (node->first_child)
+                               {
+                                       node = node->first_child;
+                                       continue;
+                               }
+                       }
+                       else
+                       {
+                               node_output_simple(writer, node, flags);
+                       }
+
+                       // continue to the next node
+                       while (node != root)
+                       {
+                               if (node->next_sibling)
+                               {
+                                       node = node->next_sibling;
+                                       break;
+                               }
+
+                               // no more siblings: climb to the parent, whose children are now all written
+                               node = node->parent;
+
+                               // write closing node
+                               if (PUGI__NODETYPE(node) == node_element)
+                               {
+                                       depth--;
+
+                                       if (indent_length)
+                                               text_output_indent(writer, indent, indent_length, depth);
+
+                                       node_output_end(writer, node, flags);
+                               }
+                       }
+               }
+               while (node != root); // the walk ends once it has climbed back to the root
+       }
+
+       PUGI__FN bool has_declaration(xml_node_struct* node)
+       {
+               for (xml_node_struct* child = node->first_child; child; child = child->next_sibling)
+               {
+                       xml_node_type type = PUGI__NODETYPE(child);
+
+                       if (type == node_declaration) return true;
+                       if (type == node_element) return false;
+               }
+
+               return false;
+       }
+
+       PUGI__FN bool is_attribute_of(xml_attribute_struct* attr, xml_node_struct* node)
+       {
+               for (xml_attribute_struct* a = node->first_attribute; a; a = a->next_attribute)
+                       if (a == attr)
+                               return true;
+
+               return false;
+       }
+
+       // Only element and declaration nodes may carry attributes.
+       PUGI__FN bool allow_insert_attribute(xml_node_type parent)
+       {
+               switch (parent)
+               {
+                       case node_element:
+                       case node_declaration:
+                               return true;
+
+                       default:
+                               return false;
+               }
+       }
+
+       PUGI__FN bool allow_insert_child(xml_node_type parent, xml_node_type child)
+       {
+               if (parent != node_document && parent != node_element) return false;
+               if (child == node_document || child == node_null) return false;
+               if (parent != node_document && (child == node_declaration || child == node_doctype)) return false;
+
+               return true;
+       }
+
+       PUGI__FN bool allow_move(xml_node parent, xml_node child)
+       {
+               // check that child can be a child of parent
+               if (!allow_insert_child(parent.type(), child.type()))
+                       return false;
+
+               // check that node is not moved between documents
+               if (parent.root() != child.root())
+                       return false;
+
+               // check that new parent is not in the child subtree
+               xml_node cur = parent;
+
+               while (cur)
+               {
+                       if (cur == child)
+                               return false;
+
+                       cur = cur.parent();
+               }
+
+               return true;
+       }
+
+       PUGI__FN void node_copy_string(char_t*& dest, uintptr_t& header, uintptr_t header_mask, char_t* source, uintptr_t& source_header, xml_allocator* alloc)
+       {
+               assert(!dest && (header & header_mask) == 0);
+
+               if (source)
+               {
+                       if (alloc && (source_header & header_mask) == 0)
+                       {
+                               dest = source;
+
+                               // since strcpy_insitu can reuse document buffer memory we need to mark both source and dest as shared
+                               header |= xml_memory_page_contents_shared_mask;
+                               source_header |= xml_memory_page_contents_shared_mask;
+                       }
+                       else
+                               strcpy_insitu(dest, header, header_mask, source);
+               }
+       }
+
+       PUGI__FN void node_copy_contents(xml_node_struct* dn, xml_node_struct* sn, xml_allocator* shared_alloc)
+       {
+               node_copy_string(dn->name, dn->header, xml_memory_page_name_allocated_mask, sn->name, sn->header, shared_alloc);
+               node_copy_string(dn->value, dn->header, xml_memory_page_value_allocated_mask, sn->value, sn->header, shared_alloc);
+
+               for (xml_attribute_struct* sa = sn->first_attribute; sa; sa = sa->next_attribute)
+               {
+                       xml_attribute_struct* da = append_new_attribute(dn, get_allocator(dn));
+
+                       if (da)
+                       {
+                               node_copy_string(da->name, da->header, xml_memory_page_name_allocated_mask, sa->name, sa->header, shared_alloc);
+                               node_copy_string(da->value, da->header, xml_memory_page_value_allocated_mask, sa->value, sa->header, shared_alloc);
+                       }
+               }
+       }
+
+       // Copies the contents of sn and all of its descendants into dn using an iterative walk.
+       // sit is the cursor in the source tree; dit is the destination node that new copies are appended to.
+       PUGI__FN void node_copy_tree(xml_node_struct* dn, xml_node_struct* sn)
+       {
+               xml_allocator& alloc = get_allocator(dn);
+               // non-null only when source and destination use the same allocator; forwarded to node_copy_contents
+               xml_allocator* shared_alloc = (&alloc == &get_allocator(sn)) ? &alloc : 0;
+
+               node_copy_contents(dn, sn, shared_alloc);
+
+               xml_node_struct* dit = dn;
+               xml_node_struct* sit = sn->first_child;
+
+               while (sit && sit != sn)
+               {
+                       // dn itself is skipped if the walk meets it inside the source subtree
+                       // (presumably the case of copying a node into its own subtree - confirm against callers)
+                       if (sit != dn)
+                       {
+                               xml_node_struct* copy = append_new_node(dit, alloc, PUGI__NODETYPE(sit));
+
+                               // if the node could not be created, its whole subtree is skipped
+                               if (copy)
+                               {
+                                       node_copy_contents(copy, sit, shared_alloc);
+
+                                       if (sit->first_child)
+                                       {
+                                               // descend: children of sit are appended to its copy
+                                               dit = copy;
+                                               sit = sit->first_child;
+                                               continue;
+                                       }
+                               }
+                       }
+
+                       // continue to the next node
+                       do
+                       {
+                               if (sit->next_sibling)
+                               {
+                                       sit = sit->next_sibling;
+                                       break;
+                               }
+
+                               // climb one level in both trees so the cursors stay in step
+                               sit = sit->parent;
+                               dit = dit->parent;
+                       }
+                       while (sit != sn);
+               }
+       }
+
+       // True for the two node types that hold character data: plain text and CDATA.
+       inline bool is_text_node(xml_node_struct* node)
+       {
+               switch (PUGI__NODETYPE(node))
+               {
+                       case node_pcdata:
+                       case node_cdata:
+                               return true;
+
+                       default:
+                               return false;
+               }
+       }
+
+       // get value with conversion functions
+       PUGI__FN int get_integer_base(const char_t* value)
+       {
+               const char_t* s = value;
+
+               while (PUGI__IS_CHARTYPE(*s, ct_space))
+                       s++;
+
+               if (*s == '-')
+                       s++;
+
+               return (s[0] == '0' && (s[1] == 'x' || s[1] == 'X')) ? 16 : 10;
+       }
+
+       PUGI__FN int get_value_int(const char_t* value, int def)
+       {
+               if (!value) return def;
+
+               int base = get_integer_base(value);
+
+       #ifdef PUGIXML_WCHAR_MODE
+               return static_cast<int>(wcstol(value, 0, base));
+       #else
+               return static_cast<int>(strtol(value, 0, base));
+       #endif
+       }
+
+       PUGI__FN unsigned int get_value_uint(const char_t* value, unsigned int def)
+       {
+               if (!value) return def;
+
+               int base = get_integer_base(value);
+
+       #ifdef PUGIXML_WCHAR_MODE
+               return static_cast<unsigned int>(wcstoul(value, 0, base));
+       #else
+               return static_cast<unsigned int>(strtoul(value, 0, base));
+       #endif
+       }
+
+       PUGI__FN double get_value_double(const char_t* value, double def)
+       {
+               if (!value) return def;
+
+       #ifdef PUGIXML_WCHAR_MODE
+               return wcstod(value, 0);
+       #else
+               return strtod(value, 0);
+       #endif
+       }
+
+       PUGI__FN float get_value_float(const char_t* value, float def)
+       {
+               if (!value) return def;
+
+       #ifdef PUGIXML_WCHAR_MODE
+               return static_cast<float>(wcstod(value, 0));
+       #else
+               return static_cast<float>(strtod(value, 0));
+       #endif
+       }
+
+       PUGI__FN bool get_value_bool(const char_t* value, bool def)
+       {
+               if (!value) return def;
+
+               // only look at first char
+               char_t first = *value;
+
+               // 1*, t* (true), T* (True), y* (yes), Y* (YES)
+               return (first == '1' || first == 't' || first == 'T' || first == 'y' || first == 'Y');
+       }
+
+#ifdef PUGIXML_HAS_LONG_LONG
+       PUGI__FN long long get_value_llong(const char_t* value, long long def)
+       {
+               if (!value) return def;
+
+               int base = get_integer_base(value);
+
+       #ifdef PUGIXML_WCHAR_MODE
+               #ifdef PUGI__MSVC_CRT_VERSION
+                       return _wcstoi64(value, 0, base);
+               #else
+                       return wcstoll(value, 0, base);
+               #endif
+       #else
+               #ifdef PUGI__MSVC_CRT_VERSION
+                       return _strtoi64(value, 0, base);
+               #else
+                       return strtoll(value, 0, base);
+               #endif
+       #endif
+       }
+
+       PUGI__FN unsigned long long get_value_ullong(const char_t* value, unsigned long long def)
+       {
+               if (!value) return def;
+
+               int base = get_integer_base(value);
+
+       #ifdef PUGIXML_WCHAR_MODE
+               #ifdef PUGI__MSVC_CRT_VERSION
+                       return _wcstoui64(value, 0, base);
+               #else
+                       return wcstoull(value, 0, base);
+               #endif
+       #else
+               #ifdef PUGI__MSVC_CRT_VERSION
+                       return _strtoui64(value, 0, base);
+               #else
+                       return strtoull(value, 0, base);
+               #endif
+       #endif
+       }
+#endif
+
+       // set value with conversion functions
+       PUGI__FN bool set_value_buffer(char_t*& dest, uintptr_t& header, uintptr_t header_mask, char (&buf)[128])
+       {
+       #ifdef PUGIXML_WCHAR_MODE
+               char_t wbuf[128];
+               impl::widen_ascii(wbuf, buf);
+
+               return strcpy_insitu(dest, header, header_mask, wbuf);
+       #else
+               return strcpy_insitu(dest, header, header_mask, buf);
+       #endif
+       }
+
+       PUGI__FN bool set_value_convert(char_t*& dest, uintptr_t& header, uintptr_t header_mask, int value)
+       {
+               char buf[128];
+               sprintf(buf, "%d", value);
+       
+               return set_value_buffer(dest, header, header_mask, buf);
+       }
+
+       PUGI__FN bool set_value_convert(char_t*& dest, uintptr_t& header, uintptr_t header_mask, unsigned int value)
+       {
+               char buf[128];
+               sprintf(buf, "%u", value);
+
+               return set_value_buffer(dest, header, header_mask, buf);
+       }
+
+       PUGI__FN bool set_value_convert(char_t*& dest, uintptr_t& header, uintptr_t header_mask, float value)
+       {
+               char buf[128];
+               sprintf(buf, "%.9g", value);
+
+               return set_value_buffer(dest, header, header_mask, buf);
+       }
+       
+       PUGI__FN bool set_value_convert(char_t*& dest, uintptr_t& header, uintptr_t header_mask, double value)
+       {
+               char buf[128];
+               sprintf(buf, "%.17g", value);
+
+               return set_value_buffer(dest, header, header_mask, buf);
+       }
+       
+       // Stores the literal "true" or "false" into dest; returns the result of strcpy_insitu.
+       PUGI__FN bool set_value_convert(char_t*& dest, uintptr_t& header, uintptr_t header_mask, bool value)
+       {
+               const char_t* text = value ? PUGIXML_TEXT("true") : PUGIXML_TEXT("false");
+
+               return strcpy_insitu(dest, header, header_mask, text);
+       }
+
+#ifdef PUGIXML_HAS_LONG_LONG
+       PUGI__FN bool set_value_convert(char_t*& dest, uintptr_t& header, uintptr_t header_mask, long long value)
+       {
+               char buf[128];
+               sprintf(buf, "%lld", value);
+       
+               return set_value_buffer(dest, header, header_mask, buf);
+       }
+
+       PUGI__FN bool set_value_convert(char_t*& dest, uintptr_t& header, uintptr_t header_mask, unsigned long long value)
+       {
+               char buf[128];
+               sprintf(buf, "%llu", value);
+       
+               return set_value_buffer(dest, header, header_mask, buf);
+       }
+#endif
+
+       // we need to get length of entire file to load it in memory; the only (relatively) sane way to do it is via seek/tell trick
+       PUGI__FN xml_parse_status get_file_size(FILE* file, size_t& out_result)
+       {
+       #if defined(PUGI__MSVC_CRT_VERSION) && PUGI__MSVC_CRT_VERSION >= 1400 && !defined(_WIN32_WCE)
+               // there are 64-bit versions of fseek/ftell, let's use them
+               typedef __int64 length_type;
+
+               _fseeki64(file, 0, SEEK_END);
+               length_type length = _ftelli64(file);
+               _fseeki64(file, 0, SEEK_SET);
+       #elif defined(__MINGW32__) && !defined(__NO_MINGW_LFS) && (!defined(__STRICT_ANSI__) || defined(__MINGW64_VERSION_MAJOR))
+               // there are 64-bit versions of fseek/ftell, let's use them
+               typedef off64_t length_type;
+
+               fseeko64(file, 0, SEEK_END);
+               length_type length = ftello64(file);
+               fseeko64(file, 0, SEEK_SET);
+       #else
+               // if this is a 32-bit OS, long is enough; if this is a unix system, long is 64-bit, which is enough; otherwise we can't do anything anyway.
+               typedef long length_type;
+
+               fseek(file, 0, SEEK_END);
+               length_type length = ftell(file);
+               fseek(file, 0, SEEK_SET);
+       #endif
+
+               // check for I/O errors
+               if (length < 0) return status_io_error;
+               
+               // check for overflow
+               size_t result = static_cast<size_t>(length);
+
+               if (static_cast<length_type>(result) != length) return status_out_of_memory;
+
+               // finalize
+               out_result = result;
+
+               return status_ok;
+       }
+
+       PUGI__FN size_t zero_terminate_buffer(void* buffer, size_t size, xml_encoding encoding) 
+       {
+               // We only need to zero-terminate if encoding conversion does not do it for us
+       #ifdef PUGIXML_WCHAR_MODE
+               xml_encoding wchar_encoding = get_wchar_encoding();
+
+               if (encoding == wchar_encoding || need_endian_swap_utf(encoding, wchar_encoding))
+               {
+                       size_t length = size / sizeof(char_t);
+
+                       static_cast<char_t*>(buffer)[length] = 0;
+                       return (length + 1) * sizeof(char_t);
+               }
+       #else
+               if (encoding == encoding_utf8)
+               {
+                       static_cast<char*>(buffer)[size] = 0;
+                       return size + 1;
+               }
+       #endif
+
+               return size;
+       }
+
+       PUGI__FN xml_parse_result load_file_impl(xml_document& doc, FILE* file, unsigned int options, xml_encoding encoding)
+       {
+               if (!file) return make_parse_result(status_file_not_found);
+
+               // get file size (can result in I/O errors)
+               size_t size = 0;
+               xml_parse_status size_status = get_file_size(file, size);
+
+               if (size_status != status_ok)
+               {
+                       fclose(file);
+                       return make_parse_result(size_status);
+               }
+               
+               size_t max_suffix_size = sizeof(char_t);
+
+               // allocate buffer for the whole file
+               char* contents = static_cast<char*>(xml_memory::allocate(size + max_suffix_size));
+
+               if (!contents)
+               {
+                       fclose(file);
+                       return make_parse_result(status_out_of_memory);
+               }
+
+               // read file in memory
+               size_t read_size = fread(contents, 1, size, file);
+               fclose(file);
+
+               if (read_size != size)
+               {
+                       xml_memory::deallocate(contents);
+                       return make_parse_result(status_io_error);
+               }
+
+               xml_encoding real_encoding = get_buffer_encoding(encoding, contents, size);
+               
+               return doc.load_buffer_inplace_own(contents, zero_terminate_buffer(contents, size, real_encoding), options, real_encoding);
+       }
+
+#ifndef PUGIXML_NO_STL
+       template <typename T> struct xml_stream_chunk
+       {
+               static xml_stream_chunk* create()
+               {
+                       void* memory = xml_memory::allocate(sizeof(xml_stream_chunk));
+                       
+                       return new (memory) xml_stream_chunk();
+               }
+
+               static void destroy(void* ptr)
+               {
+                       xml_stream_chunk* chunk = static_cast<xml_stream_chunk*>(ptr);
+
+                       // free chunk chain
+                       while (chunk)
+                       {
+                               xml_stream_chunk* next_ = chunk->next;
+
+                               xml_memory::deallocate(chunk);
+
+                               chunk = next_;
+                       }
+               }
+
+               xml_stream_chunk(): next(0), size(0)
+               {
+               }
+
+               xml_stream_chunk* next;
+               size_t size;
+
+               T data[xml_memory_page_size / sizeof(T)];
+       };
+
+       // Reads a stream to its end without seeking: data is collected in a list of fixed-size chunks
+       // and then copied into one contiguous buffer allocated with xml_memory::allocate.
+       // On success *out_buffer receives the buffer and *out_size its length in bytes; the buffer has
+       // sizeof(char_t) spare bytes past that length. Returns status_io_error or status_out_of_memory on failure.
+       template <typename T> PUGI__FN xml_parse_status load_stream_data_noseek(std::basic_istream<T>& stream, void** out_buffer, size_t* out_size)
+       {
+               // holder is constructed with xml_stream_chunk<T>::destroy, which frees the whole chunk chain
+               buffer_holder chunks(0, xml_stream_chunk<T>::destroy);
+
+               // read file to a chunk list
+               size_t total = 0;
+               xml_stream_chunk<T>* last = 0;
+
+               while (!stream.eof())
+               {
+                       // allocate new chunk
+                       xml_stream_chunk<T>* chunk = xml_stream_chunk<T>::create();
+                       if (!chunk) return status_out_of_memory;
+
+                       // append chunk to list
+                       if (last) last = last->next = chunk;
+                       else chunks.data = last = chunk;
+
+                       // read data to chunk
+                       stream.read(chunk->data, static_cast<std::streamsize>(sizeof(chunk->data) / sizeof(T)));
+                       chunk->size = static_cast<size_t>(stream.gcount()) * sizeof(T);
+
+                       // read may set failbit | eofbit in case gcount() is less than read length, so check for other I/O errors
+                       if (stream.bad() || (!stream.eof() && stream.fail())) return status_io_error;
+
+                       // guard against huge files (chunk size is small enough to make this overflow check work)
+                       if (total + chunk->size < total) return status_out_of_memory;
+                       total += chunk->size;
+               }
+
+               // spare room for a terminator of one char_t
+               size_t max_suffix_size = sizeof(char_t);
+
+               // copy chunk list to a contiguous buffer
+               char* buffer = static_cast<char*>(xml_memory::allocate(total + max_suffix_size));
+               if (!buffer) return status_out_of_memory;
+
+               char* write = buffer;
+
+               for (xml_stream_chunk<T>* chunk = static_cast<xml_stream_chunk<T>*>(chunks.data); chunk; chunk = chunk->next)
+               {
+                       assert(write + chunk->size <= buffer + total);
+                       memcpy(write, chunk->data, chunk->size);
+                       write += chunk->size;
+               }
+
+               assert(write == buffer + total);
+
+               // return buffer
+               *out_buffer = buffer;
+               *out_size = total;
+
+               return status_ok;
+       }
+
+       // Reads the rest of a seekable stream (from its current position to the end) into one buffer
+       // allocated with xml_memory::allocate. On success *out_buffer receives the buffer and *out_size
+       // the number of bytes actually read; the buffer has sizeof(char_t) spare bytes past the requested length.
+       // Returns status_io_error or status_out_of_memory on failure.
+       template <typename T> PUGI__FN xml_parse_status load_stream_data_seek(std::basic_istream<T>& stream, void** out_buffer, size_t* out_size)
+       {
+               // get length of remaining data in stream
+               typename std::basic_istream<T>::pos_type pos = stream.tellg();
+               stream.seekg(0, std::ios::end);
+               std::streamoff length = stream.tellg() - pos;
+               stream.seekg(pos); // restore the original read position
+
+               if (stream.fail() || pos < 0) return status_io_error;
+
+               // guard against huge files
+               size_t read_length = static_cast<size_t>(length);
+
+               // the round trip through size_t detects lengths it cannot represent; negative lengths are rejected too
+               if (static_cast<std::streamsize>(read_length) != length || length < 0) return status_out_of_memory;
+
+               // spare room for a terminator of one char_t
+               size_t max_suffix_size = sizeof(char_t);
+
+               // read stream data into memory (guard against stream exceptions with buffer holder)
+               buffer_holder buffer(xml_memory::allocate(read_length * sizeof(T) + max_suffix_size), xml_memory::deallocate);
+               if (!buffer.data) return status_out_of_memory;
+
+               stream.read(static_cast<T*>(buffer.data), static_cast<std::streamsize>(read_length));
+
+               // read may set failbit | eofbit in case gcount() is less than read_length (i.e. line ending conversion), so check for other I/O errors
+               if (stream.bad() || (!stream.eof() && stream.fail())) return status_io_error;
+
+               // return buffer
+               size_t actual_length = static_cast<size_t>(stream.gcount());
+               assert(actual_length <= read_length);
+               
+               // release() takes the buffer out of the holder so that it is passed on to the caller
+               *out_buffer = buffer.release();
+               *out_size = actual_length * sizeof(T);
+
+               return status_ok;
+       }
+
+       template <typename T> PUGI__FN xml_parse_result load_stream_impl(xml_document& doc, std::basic_istream<T>& stream, unsigned int options, xml_encoding encoding)
+       {
+               void* buffer = 0;
+               size_t size = 0;
+               xml_parse_status status = status_ok;
+
+               // if stream has an error bit set, bail out (otherwise tellg() can fail and we'll clear error bits)
+               if (stream.fail()) return make_parse_result(status_io_error);
+
+               // load stream to memory (using seek-based implementation if possible, since it's faster and takes less memory)
+               if (stream.tellg() < 0)
+               {
+                       stream.clear(); // clear error flags that could be set by a failing tellg
+                       status = load_stream_data_noseek(stream, &buffer, &size);
+               }
+               else
+                       status = load_stream_data_seek(stream, &buffer, &size);
+
+               if (status != status_ok) return make_parse_result(status);
+
+               xml_encoding real_encoding = get_buffer_encoding(encoding, buffer, size);
+               
+               return doc.load_buffer_inplace_own(buffer, zero_terminate_buffer(buffer, size, real_encoding), options, real_encoding);
+       }
+#endif
+
+#if defined(PUGI__MSVC_CRT_VERSION) || defined(__BORLANDC__) || (defined(__MINGW32__) && (!defined(__STRICT_ANSI__) || defined(__MINGW64_VERSION_MAJOR)))
+       // Opens a file by wide-character path using the CRT's _wfopen; returns what _wfopen returns.
+       PUGI__FN FILE* open_file_wide(const wchar_t* path, const wchar_t* mode)
+       {
+               FILE* handle = _wfopen(path, mode);
+
+               return handle;
+       }
+#else
+       PUGI__FN char* convert_path_heap(const wchar_t* str)
+       {
+               assert(str);
+
+               // first pass: get length in utf8 characters
+               size_t length = strlength_wide(str);
+               size_t size = as_utf8_begin(str, length);
+
+               // allocate resulting string
+               char* result = static_cast<char*>(xml_memory::allocate(size + 1));
+               if (!result) return 0;
+
+               // second pass: convert to utf8
+               as_utf8_end(result, size, str, length);
+
+               return result;
+       }
+
+       PUGI__FN FILE* open_file_wide(const wchar_t* path, const wchar_t* mode)
+       {
+               // there is no standard function to open wide paths, so our best bet is to try utf8 path
+               char* path_utf8 = convert_path_heap(path);
+               if (!path_utf8) return 0;
+
+               // convert mode to ASCII (we mirror _wfopen interface)
+               char mode_ascii[4] = {0};
+               for (size_t i = 0; mode[i]; ++i) mode_ascii[i] = static_cast<char>(mode[i]);
+
+               // try to open the utf8 path
+               FILE* result = fopen(path_utf8, mode_ascii);
+
+               // free dummy buffer
+               xml_memory::deallocate(path_utf8);
+
+               return result;
+       }
+#endif
+
+       PUGI__FN bool save_file_impl(const xml_document& doc, FILE* file, const char_t* indent, unsigned int flags, xml_encoding encoding)
+       {
+               if (!file) return false;
+
+               xml_writer_file writer(file);
+               doc.save(writer, indent, flags, encoding);
+
+               int result = ferror(file);
+
+               fclose(file);
+
+               return result == 0;
+       }
+
+       PUGI__FN xml_parse_result load_buffer_impl(xml_document_struct* doc, xml_node_struct* root, void* contents, size_t size, unsigned int options, xml_encoding encoding, bool is_mutable, bool own, char_t** out_buffer)
+       {
+               // check input buffer
+               assert(contents || size == 0);
+
+               // get actual encoding
+               xml_encoding buffer_encoding = impl::get_buffer_encoding(encoding, contents, size);
+
+               // get private buffer
+               char_t* buffer = 0;
+               size_t length = 0;
+
+               if (!impl::convert_buffer(buffer, length, buffer_encoding, contents, size, is_mutable)) return impl::make_parse_result(status_out_of_memory);
+               
+               // delete original buffer if we performed a conversion
+               if (own && buffer != contents && contents) impl::xml_memory::deallocate(contents);
+
+               // store buffer for offset_debug
+               doc->buffer = buffer;
+
+               // parse
+               xml_parse_result res = impl::xml_parser::parse(buffer, length, doc, root, options);
+
+               // remember encoding
+               res.encoding = buffer_encoding;
+
+               // grab onto buffer if it's our buffer, user is responsible for deallocating contents himself
+               if (own || buffer != contents) *out_buffer = buffer;
+
+               return res;
+       }
+PUGI__NS_END
+
+namespace pugi
+{
+       // Stores the stream handle as given; write() later uses it as a FILE*.
+       PUGI__FN xml_writer_file::xml_writer_file(void* file_)
+               : file(file_)
+       {
+       }
+
+       // Writes `size` bytes from `data` to the wrapped stream.
+       // The number of bytes actually written is intentionally discarded.
+       PUGI__FN void xml_writer_file::write(const void* data, size_t size)
+       {
+               FILE* stream = static_cast<FILE*>(file);
+
+               size_t written = fwrite(data, 1, size, stream);
+
+               // unfortunately proper error handling is not possible here
+               (void)!written;
+       }
+
+#ifndef PUGIXML_NO_STL
+       // Binds the writer to a narrow character stream; the wide stream pointer stays null.
+       PUGI__FN xml_writer_stream::xml_writer_stream(std::basic_ostream<char, std::char_traits<char> >& stream)
+               : narrow_stream(&stream)
+               , wide_stream(0)
+       {
+       }
+
+       // Binds the writer to a wide character stream; the narrow stream pointer stays null.
+       PUGI__FN xml_writer_stream::xml_writer_stream(std::basic_ostream<wchar_t, std::char_traits<wchar_t> >& stream)
+               : narrow_stream(0)
+               , wide_stream(&stream)
+       {
+       }
+
+       PUGI__FN void xml_writer_stream::write(const void* data, size_t size)
+       {
+               if (narrow_stream)
+               {
+                       assert(!wide_stream);
+                       narrow_stream->write(reinterpret_cast<const char*>(data), static_cast<std::streamsize>(size));
+               }
+               else
+               {
+                       assert(wide_stream);
+                       assert(size % sizeof(wchar_t) == 0);
+
+                       wide_stream->write(reinterpret_cast<const wchar_t*>(data), static_cast<std::streamsize>(size / sizeof(wchar_t)));
+               }
+       }
+#endif
+
+       // A fresh walker starts with a depth of zero.
+       PUGI__FN xml_tree_walker::xml_tree_walker()
+               : _depth(0)
+       {
+       }
+
+       // Nothing to release.
+       PUGI__FN xml_tree_walker::~xml_tree_walker()
+       {
+       }
+
+       // Returns the depth value currently stored in the walker.
+       PUGI__FN int xml_tree_walker::depth() const
+       {
+               const int current_depth = _depth;
+
+               return current_depth;
+       }
+
+       // Default begin callback: ignores the node and returns true.
+       PUGI__FN bool xml_tree_walker::begin(xml_node&)
+       {
+               const bool keep_going = true;
+
+               return keep_going;
+       }
+
+       // Default end callback: ignores the node and returns true.
+       PUGI__FN bool xml_tree_walker::end(xml_node&)
+       {
+               const bool keep_going = true;
+
+               return keep_going;
+       }
+
+       // Creates an empty handle (null internal pointer).
+       PUGI__FN xml_attribute::xml_attribute()
+               : _attr(0)
+       {
+       }
+
+       // Creates a handle around the given internal attribute object.
+       PUGI__FN xml_attribute::xml_attribute(xml_attribute_struct* attr)
+               : _attr(attr)
+       {
+       }
+
+       // Serves as the non-null value returned by the conversion operator below.
+       PUGI__FN static void unspecified_bool_xml_attribute(xml_attribute***)
+       {
+       }
+
+       // Yields a non-null function pointer for a bound handle and a null one for an empty handle.
+       PUGI__FN xml_attribute::operator xml_attribute::unspecified_bool_type() const
+       {
+               if (_attr) return unspecified_bool_xml_attribute;
+
+               return 0;
+       }
+
+       // True when the handle is empty.
+       PUGI__FN bool xml_attribute::operator!() const
+       {
+               return _attr == 0;
+       }
+
+       PUGI__FN bool xml_attribute::operator==(const xml_attribute& r) const
+       {
+               return (_attr == r._attr);
+       }
+       
+       PUGI__FN bool xml_attribute::operator!=(const xml_attribute& r) const
+       {
+               return (_attr != r._attr);
+       }
+
+       PUGI__FN bool xml_attribute::operator<(const xml_attribute& r) const
+       {
+               return (_attr < r._attr);
+       }
+       
+       PUGI__FN bool xml_attribute::operator>(const xml_attribute& r) const
+       {
+               return (_attr > r._attr);
+       }
+       
+       PUGI__FN bool xml_attribute::operator<=(const xml_attribute& r) const
+       {
+               return (_attr <= r._attr);
+       }
+       
+       PUGI__FN bool xml_attribute::operator>=(const xml_attribute& r) const
+       {
+               return (_attr >= r._attr);
+       }
+
+       // Returns a handle to the following attribute, or an empty handle if this one is empty.
+       PUGI__FN xml_attribute xml_attribute::next_attribute() const
+       {
+               if (!_attr) return xml_attribute();
+
+               return xml_attribute(_attr->next_attribute);
+       }
+
+       // Returns a handle to the preceding attribute, or an empty handle if there is none.
+       PUGI__FN xml_attribute xml_attribute::previous_attribute() const
+       {
+               if (!_attr) return xml_attribute();
+
+               xml_attribute_struct* candidate = _attr->prev_attribute_c;
+
+               // a candidate without a successor is not a real predecessor of this attribute
+               if (!candidate->next_attribute) return xml_attribute();
+
+               return xml_attribute(candidate);
+       }
+
+       // Returns the stored value, or `def` when the handle is empty or no value is stored.
+       PUGI__FN const char_t* xml_attribute::as_string(const char_t* def) const
+       {
+               if (!_attr || !_attr->value) return def;
+
+               return _attr->value;
+       }
+
+       PUGI__FN int xml_attribute::as_int(int def) const
+       {
+               return impl::get_value_int(_attr ? _attr->value : 0, def);
+       }
+
+       PUGI__FN unsigned int xml_attribute::as_uint(unsigned int def) const
+       {
+               return impl::get_value_uint(_attr ? _attr->value : 0, def);
+       }
+
+       PUGI__FN double xml_attribute::as_double(double def) const
+       {
+               return impl::get_value_double(_attr ? _attr->value : 0, def);
+       }
+
+       PUGI__FN float xml_attribute::as_float(float def) const
+       {
+               return impl::get_value_float(_attr ? _attr->value : 0, def);
+       }
+
+       PUGI__FN bool xml_attribute::as_bool(bool def) const
+       {
+               return impl::get_value_bool(_attr ? _attr->value : 0, def);
+       }
+
+#ifdef PUGIXML_HAS_LONG_LONG
+       PUGI__FN long long xml_attribute::as_llong(long long def) const
+       {
+               return impl::get_value_llong(_attr ? _attr->value : 0, def);
+       }
+
+       PUGI__FN unsigned long long xml_attribute::as_ullong(unsigned long long def) const
+       {
+               return impl::get_value_ullong(_attr ? _attr->value : 0, def);
+       }
+#endif
+
+       // True when the handle does not refer to any attribute.
+       PUGI__FN bool xml_attribute::empty() const
+       {
+               return _attr == 0;
+       }
+
+       // Returns the attribute name; an empty string stands in for an empty handle or a missing name.
+       PUGI__FN const char_t* xml_attribute::name() const
+       {
+               if (_attr && _attr->name) return _attr->name;
+
+               return PUGIXML_TEXT("");
+       }
+
+       // Returns the attribute value; an empty string stands in for an empty handle or a missing value.
+       PUGI__FN const char_t* xml_attribute::value() const
+       {
+               if (_attr && _attr->value) return _attr->value;
+
+               return PUGIXML_TEXT("");
+       }
+
+       // Derives a hash from the address of the internal object, divided by the size of that object.
+       PUGI__FN size_t xml_attribute::hash_value() const
+       {
+               const uintptr_t address = reinterpret_cast<uintptr_t>(_attr);
+
+               return static_cast<size_t>(address / sizeof(xml_attribute_struct));
+       }
+
+       // Exposes the raw internal pointer (null for an empty handle).
+       PUGI__FN xml_attribute_struct* xml_attribute::internal_object() const
+       {
+               xml_attribute_struct* raw = _attr;
+
+               return raw;
+       }
+
+       PUGI__FN xml_attribute& xml_attribute::operator=(const char_t* rhs)
+       {
+               set_value(rhs);
+               return *this;
+       }
+       
+       PUGI__FN xml_attribute& xml_attribute::operator=(int rhs)
+       {
+               set_value(rhs);
+               return *this;
+       }
+
+       PUGI__FN xml_attribute& xml_attribute::operator=(unsigned int rhs)
+       {
+               set_value(rhs);
+               return *this;
+       }
+
+       PUGI__FN xml_attribute& xml_attribute::operator=(double rhs)
+       {
+               set_value(rhs);
+               return *this;
+       }
+       
+       PUGI__FN xml_attribute& xml_attribute::operator=(float rhs)
+       {
+               set_value(rhs);
+               return *this;
+       }
+       
+       PUGI__FN xml_attribute& xml_attribute::operator=(bool rhs)
+       {
+               set_value(rhs);
+               return *this;
+       }
+
+#ifdef PUGIXML_HAS_LONG_LONG
+       PUGI__FN xml_attribute& xml_attribute::operator=(long long rhs)
+       {
+               set_value(rhs);
+               return *this;
+       }
+
+       PUGI__FN xml_attribute& xml_attribute::operator=(unsigned long long rhs)
+       {
+               set_value(rhs);
+               return *this;
+       }
+#endif
+
+       // Replaces the attribute name with a copy of `rhs`.
+       // Returns false for an empty handle, otherwise the result of impl::strcpy_insitu.
+       PUGI__FN bool xml_attribute::set_name(const char_t* rhs)
+       {
+               if (_attr)
+                       return impl::strcpy_insitu(_attr->name, _attr->header, impl::xml_memory_page_name_allocated_mask, rhs);
+
+               return false;
+       }
+               
+       PUGI__FN bool xml_attribute::set_value(const char_t* rhs)
+       {
+               if (!_attr) return false;
+
+               return impl::strcpy_insitu(_attr->value, _attr->header, impl::xml_memory_page_value_allocated_mask, rhs);
+       }
+
+       PUGI__FN bool xml_attribute::set_value(int rhs)
+       {
+               if (!_attr) return false;
+
+               return impl::set_value_convert(_attr->value, _attr->header, impl::xml_memory_page_value_allocated_mask, rhs);
+       }
+
+       PUGI__FN bool xml_attribute::set_value(unsigned int rhs)
+       {
+               if (!_attr) return false;
+
+               return impl::set_value_convert(_attr->value, _attr->header, impl::xml_memory_page_value_allocated_mask, rhs);
+       }
+
+       PUGI__FN bool xml_attribute::set_value(double rhs)
+       {
+               if (!_attr) return false;
+
+               return impl::set_value_convert(_attr->value, _attr->header, impl::xml_memory_page_value_allocated_mask, rhs);
+       }
+       
+       PUGI__FN bool xml_attribute::set_value(float rhs)
+       {
+               if (!_attr) return false;
+
+               return impl::set_value_convert(_attr->value, _attr->header, impl::xml_memory_page_value_allocated_mask, rhs);
+       }
+       
+       PUGI__FN bool xml_attribute::set_value(bool rhs)
+       {
+               if (!_attr) return false;
+
+               return impl::set_value_convert(_attr->value, _attr->header, impl::xml_memory_page_value_allocated_mask, rhs);
+       }
+
+#ifdef PUGIXML_HAS_LONG_LONG
+       PUGI__FN bool xml_attribute::set_value(long long rhs)
+       {
+               if (!_attr) return false;
+
+               return impl::set_value_convert(_attr->value, _attr->header, impl::xml_memory_page_value_allocated_mask, rhs);
+       }
+
+       PUGI__FN bool xml_attribute::set_value(unsigned long long rhs)
+       {
+               if (!_attr) return false;
+
+               return impl::set_value_convert(_attr->value, _attr->header, impl::xml_memory_page_value_allocated_mask, rhs);
+       }
+#endif
+
+#ifdef __BORLANDC__
+       PUGI__FN bool operator&&(const xml_attribute& lhs, bool rhs)
+       {
+               return (bool)lhs && rhs;
+       }
+
+       PUGI__FN bool operator||(const xml_attribute& lhs, bool rhs)
+       {
+               return (bool)lhs || rhs;
+       }
+#endif
+
+       // Creates an empty handle (null internal pointer).
+       PUGI__FN xml_node::xml_node()
+               : _root(0)
+       {
+       }
+
+       // Creates a handle around the given internal node object.
+       PUGI__FN xml_node::xml_node(xml_node_struct* p)
+               : _root(p)
+       {
+       }
+       
+       // Serves as the non-null value returned by the conversion operator below.
+       PUGI__FN static void unspecified_bool_xml_node(xml_node***)
+       {
+       }
+
+       // Yields a non-null function pointer for a bound handle and a null one for an empty handle.
+       PUGI__FN xml_node::operator xml_node::unspecified_bool_type() const
+       {
+               if (_root) return unspecified_bool_xml_node;
+
+               return 0;
+       }
+
+       // True when the handle is empty.
+       PUGI__FN bool xml_node::operator!() const
+       {
+               return _root == 0;
+       }
+
+       PUGI__FN xml_node::iterator xml_node::begin() const
+       {
+               return iterator(_root ? _root->first_child : 0, _root);
+       }
+
+       PUGI__FN xml_node::iterator xml_node::end() const
+       {
+               return iterator(0, _root);
+       }
+       
+       PUGI__FN xml_node::attribute_iterator xml_node::attributes_begin() const
+       {
+               return attribute_iterator(_root ? _root->first_attribute : 0, _root);
+       }
+
+       PUGI__FN xml_node::attribute_iterator xml_node::attributes_end() const
+       {
+               return attribute_iterator(0, _root);
+       }
+       
+       PUGI__FN xml_object_range<xml_node_iterator> xml_node::children() const
+       {
+               return xml_object_range<xml_node_iterator>(begin(), end());
+       }
+
+       PUGI__FN xml_object_range<xml_named_node_iterator> xml_node::children(const char_t* name_) const
+       {
+               return xml_object_range<xml_named_node_iterator>(xml_named_node_iterator(child(name_)._root, _root, name_), xml_named_node_iterator(0, _root, name_));
+       }
+
+       // Range spanning attributes_begin() to attributes_end().
+       PUGI__FN xml_object_range<xml_attribute_iterator> xml_node::attributes() const
+       {
+               typedef xml_object_range<xml_attribute_iterator> attribute_range;
+
+               return attribute_range(attributes_begin(), attributes_end());
+       }
+
+       PUGI__FN bool xml_node::operator==(const xml_node& r) const
+       {
+               return (_root == r._root);
+       }
+
+       PUGI__FN bool xml_node::operator!=(const xml_node& r) const
+       {
+               return (_root != r._root);
+       }
+
+       PUGI__FN bool xml_node::operator<(const xml_node& r) const
+       {
+               return (_root < r._root);
+       }
+       
+       PUGI__FN bool xml_node::operator>(const xml_node& r) const
+       {
+               return (_root > r._root);
+       }
+       
+       PUGI__FN bool xml_node::operator<=(const xml_node& r) const
+       {
+               return (_root <= r._root);
+       }
+       
+       PUGI__FN bool xml_node::operator>=(const xml_node& r) const
+       {
+               return (_root >= r._root);
+       }
+
+       // True when the handle does not refer to any node.
+       PUGI__FN bool xml_node::empty() const
+       {
+               return _root == 0;
+       }
+       
+       // Returns the node name; an empty string stands in for an empty handle or a missing name.
+       PUGI__FN const char_t* xml_node::name() const
+       {
+               if (_root && _root->name) return _root->name;
+
+               return PUGIXML_TEXT("");
+       }
+
+       // Returns the node type, or node_null for an empty handle.
+       PUGI__FN xml_node_type xml_node::type() const
+       {
+               if (!_root) return node_null;
+
+               return PUGI__NODETYPE(_root);
+       }
+       
+       // Returns the node value; an empty string stands in for an empty handle or a missing value.
+       PUGI__FN const char_t* xml_node::value() const
+       {
+               if (_root && _root->value) return _root->value;
+
+               return PUGIXML_TEXT("");
+       }
+       
+       PUGI__FN xml_node xml_node::child(const char_t* name_) const
+       {
+               if (!_root) return xml_node();
+
+               for (xml_node_struct* i = _root->first_child; i; i = i->next_sibling)
+                       if (i->name && impl::strequal(name_, i->name)) return xml_node(i);
+
+               return xml_node();
+       }
+
+       PUGI__FN xml_attribute xml_node::attribute(const char_t* name_) const
+       {
+               if (!_root) return xml_attribute();
+
+               for (xml_attribute_struct* i = _root->first_attribute; i; i = i->next_attribute)
+                       if (i->name && impl::strequal(name_, i->name))
+                               return xml_attribute(i);
+               
+               return xml_attribute();
+       }
+       
+       PUGI__FN xml_node xml_node::next_sibling(const char_t* name_) const
+       {
+               if (!_root) return xml_node();
+               
+               for (xml_node_struct* i = _root->next_sibling; i; i = i->next_sibling)
+                       if (i->name && impl::strequal(name_, i->name)) return xml_node(i);
+
+               return xml_node();
+       }
+
+       // Returns the following sibling, or an empty handle if this handle is empty.
+       PUGI__FN xml_node xml_node::next_sibling() const
+       {
+               if (!_root) return xml_node();
+
+               return xml_node(_root->next_sibling);
+       }
+
+       PUGI__FN xml_node xml_node::previous_sibling(const char_t* name_) const
+       {
+               if (!_root) return xml_node();
+               
+               for (xml_node_struct* i = _root->prev_sibling_c; i->next_sibling; i = i->prev_sibling_c)
+                       if (i->name && impl::strequal(name_, i->name)) return xml_node(i);
+
+               return xml_node();
+       }
+
+       // Returns the preceding sibling, or an empty handle if there is none.
+       PUGI__FN xml_node xml_node::previous_sibling() const
+       {
+               if (!_root) return xml_node();
+
+               xml_node_struct* candidate = _root->prev_sibling_c;
+
+               // a candidate without a next sibling is not a real predecessor of this node
+               return candidate->next_sibling ? xml_node(candidate) : xml_node();
+       }
+
+       // Returns a handle to the parent, or an empty handle if this handle is empty.
+       PUGI__FN xml_node xml_node::parent() const
+       {
+               if (!_root) return xml_node();
+
+               return xml_node(_root->parent);
+       }
+
+       // Returns a handle to the document obtained via impl::get_document, or an empty handle if this handle is empty.
+       PUGI__FN xml_node xml_node::root() const
+       {
+               if (!_root) return xml_node();
+
+               return xml_node(&impl::get_document(_root));
+       }
+
+       // Wraps the internal node pointer in an xml_text object.
+       PUGI__FN xml_text xml_node::text() const
+       {
+               xml_node_struct* target = _root;
+
+               return xml_text(target);
+       }
+
+       PUGI__FN const char_t* xml_node::child_value() const
+       {
+               if (!_root) return PUGIXML_TEXT("");
+               
+               for (xml_node_struct* i = _root->first_child; i; i = i->next_sibling)
+                       if (i->value && impl::is_text_node(i))
+                               return i->value;
+
+               return PUGIXML_TEXT("");
+       }
+
+       // Looks up the child named `name_` and returns that child's child_value().
+       PUGI__FN const char_t* xml_node::child_value(const char_t* name_) const
+       {
+               xml_node named_child = child(name_);
+
+               return named_child.child_value();
+       }
+
+       // Returns a handle to the first attribute, or an empty handle if this handle is empty.
+       PUGI__FN xml_attribute xml_node::first_attribute() const
+       {
+               if (!_root) return xml_attribute();
+
+               return xml_attribute(_root->first_attribute);
+       }
+
+       // Returns a handle to the last attribute, reached through the first attribute's prev_attribute_c link.
+       // An empty handle is returned when the node is empty or has no attributes.
+       PUGI__FN xml_attribute xml_node::last_attribute() const
+       {
+               if (!_root || !_root->first_attribute) return xml_attribute();
+
+               return xml_attribute(_root->first_attribute->prev_attribute_c);
+       }
+
+       // Returns a handle to the first child, or an empty handle if this handle is empty.
+       PUGI__FN xml_node xml_node::first_child() const
+       {
+               if (!_root) return xml_node();
+
+               return xml_node(_root->first_child);
+       }
+
+       // Returns a handle to the last child, reached through the first child's prev_sibling_c link.
+       // An empty handle is returned when the node is empty or has no children.
+       PUGI__FN xml_node xml_node::last_child() const
+       {
+               if (!_root || !_root->first_child) return xml_node();
+
+               return xml_node(_root->first_child->prev_sibling_c);
+       }
+
+       PUGI__FN bool xml_node::set_name(const char_t* rhs)
+       {
+               switch (type())
+               {
+               case node_pi:
+               case node_declaration:
+               case node_element:
+                       return impl::strcpy_insitu(_root->name, _root->header, impl::xml_memory_page_name_allocated_mask, rhs);
+
+               default:
+                       return false;
+               }
+       }
+               
+       PUGI__FN bool xml_node::set_value(const char_t* rhs)
+       {
+               switch (type())
+               {
+               case node_pi:
+               case node_cdata:
+               case node_pcdata:
+               case node_comment:
+               case node_doctype:
+                       return impl::strcpy_insitu(_root->value, _root->header, impl::xml_memory_page_value_allocated_mask, rhs);
+
+               default:
+                       return false;
+               }
+       }
+
+       PUGI__FN xml_attribute xml_node::append_attribute(const char_t* name_)
+       {
+               if (!impl::allow_insert_attribute(type())) return xml_attribute();
+               
+               xml_attribute a(impl::allocate_attribute(impl::get_allocator(_root)));
+               if (!a) return xml_attribute();
+
+               impl::append_attribute(a._attr, _root);
+
+               a.set_name(name_);
+               
+               return a;
+       }
+
+       PUGI__FN xml_attribute xml_node::prepend_attribute(const char_t* name_)
+       {
+               if (!impl::allow_insert_attribute(type())) return xml_attribute();
+               
+               xml_attribute a(impl::allocate_attribute(impl::get_allocator(_root)));
+               if (!a) return xml_attribute();
+
+               impl::prepend_attribute(a._attr, _root);
+
+               a.set_name(name_);
+
+               return a;
+       }
+
+       PUGI__FN xml_attribute xml_node::insert_attribute_after(const char_t* name_, const xml_attribute& attr)
+       {
+               if (!impl::allow_insert_attribute(type())) return xml_attribute();
+               if (!attr || !impl::is_attribute_of(attr._attr, _root)) return xml_attribute();
+               
+               xml_attribute a(impl::allocate_attribute(impl::get_allocator(_root)));
+               if (!a) return xml_attribute();
+
+               impl::insert_attribute_after(a._attr, attr._attr, _root);
+
+               a.set_name(name_);
+
+               return a;
+       }
+
+       PUGI__FN xml_attribute xml_node::insert_attribute_before(const char_t* name_, const xml_attribute& attr)
+       {
+               if (!impl::allow_insert_attribute(type())) return xml_attribute();
+               if (!attr || !impl::is_attribute_of(attr._attr, _root)) return xml_attribute();
+               
+               xml_attribute a(impl::allocate_attribute(impl::get_allocator(_root)));
+               if (!a) return xml_attribute();
+
+               impl::insert_attribute_before(a._attr, attr._attr, _root);
+
+               a.set_name(name_);
+
+               return a;
+       }
+
+       // Appends a new attribute carrying the name and value of `proto`.
+       // An empty `proto` yields an empty handle; otherwise the result of append_attribute is returned.
+       PUGI__FN xml_attribute xml_node::append_copy(const xml_attribute& proto)
+       {
+               if (proto.empty()) return xml_attribute();
+
+               xml_attribute duplicate = append_attribute(proto.name());
+
+               duplicate.set_value(proto.value());
+
+               return duplicate;
+       }
+
+       // Prepends a new attribute carrying the name and value of `proto`.
+       // An empty `proto` yields an empty handle; otherwise the result of prepend_attribute is returned.
+       PUGI__FN xml_attribute xml_node::prepend_copy(const xml_attribute& proto)
+       {
+               if (proto.empty()) return xml_attribute();
+
+               xml_attribute duplicate = prepend_attribute(proto.name());
+
+               duplicate.set_value(proto.value());
+
+               return duplicate;
+       }
+
+       PUGI__FN xml_attribute xml_node::insert_copy_after(const xml_attribute& proto, const xml_attribute& attr)
+       {
+               if (!proto) return xml_attribute();
+
+               xml_attribute result = insert_attribute_after(proto.name(), attr);
+               result.set_value(proto.value());
+
+               return result;
+       }
+
+       PUGI__FN xml_attribute xml_node::insert_copy_before(const xml_attribute& proto, const xml_attribute& attr)
+       {
+               if (!proto) return xml_attribute();
+
+               xml_attribute result = insert_attribute_before(proto.name(), attr);
+               result.set_value(proto.value());
+
+               return result;
+       }
+
+       PUGI__FN xml_node xml_node::append_child(xml_node_type type_)
+       {
+               if (!impl::allow_insert_child(type(), type_)) return xml_node();
+               
+               xml_node n(impl::allocate_node(impl::get_allocator(_root), type_));
+               if (!n) return xml_node();
+
+               impl::append_node(n._root, _root);
+
+               if (type_ == node_declaration) n.set_name(PUGIXML_TEXT("xml"));
+
+               return n;
+       }
+
+       PUGI__FN xml_node xml_node::prepend_child(xml_node_type type_)
+       {
+               if (!impl::allow_insert_child(type(), type_)) return xml_node();
+               
+               xml_node n(impl::allocate_node(impl::get_allocator(_root), type_));
+               if (!n) return xml_node();
+
+               impl::prepend_node(n._root, _root);
+                               
+               if (type_ == node_declaration) n.set_name(PUGIXML_TEXT("xml"));
+
+               return n;
+       }
+
+       PUGI__FN xml_node xml_node::insert_child_before(xml_node_type type_, const xml_node& node)
+       {
+               if (!impl::allow_insert_child(type(), type_)) return xml_node();
+               if (!node._root || node._root->parent != _root) return xml_node();
+       
+               xml_node n(impl::allocate_node(impl::get_allocator(_root), type_));
+               if (!n) return xml_node();
+
+               impl::insert_node_before(n._root, node._root);
+
+               if (type_ == node_declaration) n.set_name(PUGIXML_TEXT("xml"));
+
+               return n;
+       }
+
+       PUGI__FN xml_node xml_node::insert_child_after(xml_node_type type_, const xml_node& node)
+       {
+               if (!impl::allow_insert_child(type(), type_)) return xml_node();
+               if (!node._root || node._root->parent != _root) return xml_node();
+       
+               xml_node n(impl::allocate_node(impl::get_allocator(_root), type_));
+               if (!n) return xml_node();
+
+               impl::insert_node_after(n._root, node._root);
+
+               if (type_ == node_declaration) n.set_name(PUGIXML_TEXT("xml"));
+
+               return n;
+       }
+
+       // Appends an element child and names it `name_`; the result of the typed overload is returned as is.
+       PUGI__FN xml_node xml_node::append_child(const char_t* name_)
+       {
+               xml_node element = append_child(node_element);
+               element.set_name(name_);
+
+               return element;
+       }
+
+       // Prepends an element child and names it `name_`; the result of the typed overload is returned as is.
+       PUGI__FN xml_node xml_node::prepend_child(const char_t* name_)
+       {
+               xml_node element = prepend_child(node_element);
+               element.set_name(name_);
+
+               return element;
+       }
+
+       // Inserts an element child after `node` and names it `name_`; the result of the typed overload is returned as is.
+       PUGI__FN xml_node xml_node::insert_child_after(const char_t* name_, const xml_node& node)
+       {
+               xml_node element = insert_child_after(node_element, node);
+               element.set_name(name_);
+
+               return element;
+       }
+
+       // Inserts an element child before `node` and names it `name_`; the result of the typed overload is returned as is.
+       PUGI__FN xml_node xml_node::insert_child_before(const char_t* name_, const xml_node& node)
+       {
+               xml_node element = insert_child_before(node_element, node);
+               element.set_name(name_);
+
+               return element;
+       }
+
+       PUGI__FN xml_node xml_node::append_copy(const xml_node& proto)
+       {
+               xml_node_type type_ = proto.type();
+               if (!impl::allow_insert_child(type(), type_)) return xml_node();
+
+               xml_node n(impl::allocate_node(impl::get_allocator(_root), type_));
+               if (!n) return xml_node();
+
+               impl::append_node(n._root, _root);
+               impl::node_copy_tree(n._root, proto._root);
+
+               return n;
+       }
+
+       PUGI__FN xml_node xml_node::prepend_copy(const xml_node& proto)
+       {
+               xml_node_type type_ = proto.type();
+               if (!impl::allow_insert_child(type(), type_)) return xml_node();
+
+               xml_node n(impl::allocate_node(impl::get_allocator(_root), type_));
+               if (!n) return xml_node();
+
+               impl::prepend_node(n._root, _root);
+               impl::node_copy_tree(n._root, proto._root);
+
+               return n;
+       }
+
+       PUGI__FN xml_node xml_node::insert_copy_after(const xml_node& proto, const xml_node& node)
+       {
+               xml_node_type type_ = proto.type();
+               if (!impl::allow_insert_child(type(), type_)) return xml_node();
+               if (!node._root || node._root->parent != _root) return xml_node();
+
+               xml_node n(impl::allocate_node(impl::get_allocator(_root), type_));
+               if (!n) return xml_node();
+
+               impl::insert_node_after(n._root, node._root);
+               impl::node_copy_tree(n._root, proto._root);
+
+               return n;
+       }
+
+       PUGI__FN xml_node xml_node::insert_copy_before(const xml_node& proto, const xml_node& node)
+       {
+               xml_node_type type_ = proto.type();
+               if (!impl::allow_insert_child(type(), type_)) return xml_node();
+               if (!node._root || node._root->parent != _root) return xml_node();
+
+               xml_node n(impl::allocate_node(impl::get_allocator(_root), type_));
+               if (!n) return xml_node();
+
+               impl::insert_node_before(n._root, node._root);
+               impl::node_copy_tree(n._root, proto._root);
+
+               return n;
+       }
+
+       PUGI__FN xml_node xml_node::append_move(const xml_node& moved)
+       {
+               if (!impl::allow_move(*this, moved)) return xml_node();
+
+               // disable document_buffer_order optimization since moving nodes around changes document order without changing buffer pointers
+               impl::get_document(_root).header |= impl::xml_memory_page_contents_shared_mask;
+
+               impl::remove_node(moved._root);
+               impl::append_node(moved._root, _root);
+
+               return moved;
+       }
+
+       PUGI__FN xml_node xml_node::prepend_move(const xml_node& moved)
+       {
+               if (!impl::allow_move(*this, moved)) return xml_node();
+
+               // disable document_buffer_order optimization since moving nodes around changes document order without changing buffer pointers
+               impl::get_document(_root).header |= impl::xml_memory_page_contents_shared_mask;
+
+               impl::remove_node(moved._root);
+               impl::prepend_node(moved._root, _root);
+
+               return moved;
+       }
+
+       PUGI__FN xml_node xml_node::insert_move_after(const xml_node& moved, const xml_node& node)
+       {
+               if (!impl::allow_move(*this, moved)) return xml_node();
+               if (!node._root || node._root->parent != _root) return xml_node();
+               if (moved._root == node._root) return xml_node();
+
+               // disable document_buffer_order optimization since moving nodes around changes document order without changing buffer pointers
+               impl::get_document(_root).header |= impl::xml_memory_page_contents_shared_mask;
+
+               impl::remove_node(moved._root);
+               impl::insert_node_after(moved._root, node._root);
+
+               return moved;
+       }
+
+       PUGI__FN xml_node xml_node::insert_move_before(const xml_node& moved, const xml_node& node)
+       {
+               if (!impl::allow_move(*this, moved)) return xml_node();
+               if (!node._root || node._root->parent != _root) return xml_node();
+               if (moved._root == node._root) return xml_node();
+
+               // disable document_buffer_order optimization since moving nodes around changes document order without changing buffer pointers
+               impl::get_document(_root).header |= impl::xml_memory_page_contents_shared_mask;
+
+               impl::remove_node(moved._root);
+               impl::insert_node_before(moved._root, node._root);
+
+               return moved;
+       }
+
+       // Looks up the attribute named `name_` and removes it through the handle-based overload.
+       PUGI__FN bool xml_node::remove_attribute(const char_t* name_)
+       {
+               xml_attribute target = attribute(name_);
+
+               return remove_attribute(target);
+       }
+
+       PUGI__FN bool xml_node::remove_attribute(const xml_attribute& a)
+       {
+               if (!_root || !a._attr) return false;
+               if (!impl::is_attribute_of(a._attr, _root)) return false;
+
+               impl::remove_attribute(a._attr, _root);
+               impl::destroy_attribute(a._attr, impl::get_allocator(_root));
+
+               return true;
+       }
+
+       // Looks up the child named `name_` and removes it through the handle-based overload.
+       PUGI__FN bool xml_node::remove_child(const char_t* name_)
+       {
+               xml_node target = child(name_);
+
+               return remove_child(target);
+       }
+
+       PUGI__FN bool xml_node::remove_child(const xml_node& n)
+       {
+               if (!_root || !n._root || n._root->parent != _root) return false;
+
+               impl::remove_node(n._root);
+               impl::destroy_node(n._root, impl::get_allocator(_root));
+
+               return true;
+       }
+
+       PUGI__FN xml_parse_result xml_node::append_buffer(const void* contents, size_t size, unsigned int options, xml_encoding encoding)
+       {
+               // append_buffer is only valid for elements/documents
+               if (!impl::allow_insert_child(type(), node_element)) return impl::make_parse_result(status_append_invalid_root);
+
+               // get document node
+               impl::xml_document_struct* doc = &impl::get_document(_root);
+
+               // disable document_buffer_order optimization since in a document with multiple buffers comparing buffer pointers does not make sense
+               doc->header |= impl::xml_memory_page_contents_shared_mask;
+               
+               // get extra buffer element (we'll store the document fragment buffer there so that we can deallocate it later)
+               impl::xml_memory_page* page = 0;
+               impl::xml_extra_buffer* extra = static_cast<impl::xml_extra_buffer*>(doc->allocate_memory(sizeof(impl::xml_extra_buffer), page));
+               (void)page;
+
+               if (!extra) return impl::make_parse_result(status_out_of_memory);
+
+               // save name; name of the root has to be NULL before parsing - otherwise closing node mismatches will not be detected at the top level
+               char_t* rootname = _root->name;
+               _root->name = 0;
+
+               // parse
+               char_t* buffer = 0;
+               xml_parse_result res = impl::load_buffer_impl(doc, _root, const_cast<void*>(contents), size, options, encoding, false, false, &buffer);
+
+               // restore name
+               _root->name = rootname;
+
+               // add extra buffer to the list
+               extra->buffer = buffer;
+               extra->next = doc->extra_buffers;
+               doc->extra_buffers = extra;
+
+               return res;
+       }
+
+       PUGI__FN xml_node xml_node::find_child_by_attribute(const char_t* name_, const char_t* attr_name, const char_t* attr_value) const
+       {
+               if (!_root) return xml_node();
+               
+               for (xml_node_struct* i = _root->first_child; i; i = i->next_sibling)
+                       if (i->name && impl::strequal(name_, i->name))
+                       {
+                               for (xml_attribute_struct* a = i->first_attribute; a; a = a->next_attribute)
+                                       if (a->name && impl::strequal(attr_name, a->name) && impl::strequal(attr_value, a->value ? a->value : PUGIXML_TEXT("")))
+                                               return xml_node(i);
+                       }
+
+               return xml_node();
+       }
+
+       PUGI__FN xml_node xml_node::find_child_by_attribute(const char_t* attr_name, const char_t* attr_value) const
+       {
+               if (!_root) return xml_node();
+               
+               for (xml_node_struct* i = _root->first_child; i; i = i->next_sibling)
+                       for (xml_attribute_struct* a = i->first_attribute; a; a = a->next_attribute)
+                               if (a->name && impl::strequal(attr_name, a->name) && impl::strequal(attr_value, a->value ? a->value : PUGIXML_TEXT("")))
+                                       return xml_node(i);
+
+               return xml_node();
+       }
+
+#ifndef PUGIXML_NO_STL
+       PUGI__FN string_t xml_node::path(char_t delimiter) const
+       {
+               xml_node cursor = *this; // Make a copy.
+               
+               string_t result = cursor.name();
+
+               while (cursor.parent())
+               {
+                       cursor = cursor.parent();
+                       
+                       string_t temp = cursor.name();
+                       temp += delimiter;
+                       temp += result;
+                       result.swap(temp);
+               }
+
+               return result;
+       }
+#endif
+
+       PUGI__FN xml_node xml_node::first_element_by_path(const char_t* path_, char_t delimiter) const
+       {
+               xml_node found = *this; // Current search context.
+
+               if (!_root || !path_ || !path_[0]) return found;
+
+               if (path_[0] == delimiter)
+               {
+                       // Absolute path; e.g. '/foo/bar'
+                       found = found.root();
+                       ++path_;
+               }
+
+               const char_t* path_segment = path_;
+
+               while (*path_segment == delimiter) ++path_segment;
+
+               const char_t* path_segment_end = path_segment;
+
+               while (*path_segment_end && *path_segment_end != delimiter) ++path_segment_end;
+
+               if (path_segment == path_segment_end) return found;
+
+               const char_t* next_segment = path_segment_end;
+
+               while (*next_segment == delimiter) ++next_segment;
+
+               if (*path_segment == '.' && path_segment + 1 == path_segment_end)
+                       return found.first_element_by_path(next_segment, delimiter);
+               else if (*path_segment == '.' && *(path_segment+1) == '.' && path_segment + 2 == path_segment_end)
+                       return found.parent().first_element_by_path(next_segment, delimiter);
+               else
+               {
+                       for (xml_node_struct* j = found._root->first_child; j; j = j->next_sibling)
+                       {
+                               if (j->name && impl::strequalrange(j->name, path_segment, static_cast<size_t>(path_segment_end - path_segment)))
+                               {
+                                       xml_node subsearch = xml_node(j).first_element_by_path(next_segment, delimiter);
+
+                                       if (subsearch) return subsearch;
+                               }
+                       }
+
+                       return xml_node();
+               }
+       }
+
+       // Walks the subtree below this node depth-first (a node is visited before its children, children
+       // before following siblings). Calls walker.begin() once with this node, walker.for_each() for
+       // every descendant and walker.end() once at the finish. walker._depth follows the nesting level
+       // of the node being visited (0 for direct children, -1 outside the walk).
+       // Returns false as soon as begin() or for_each() returns false (end() is then not called);
+       // otherwise returns the result of walker.end().
+       PUGI__FN bool xml_node::traverse(xml_tree_walker& walker)
+       {
+               walker._depth = -1;
+               
+               // callbacks receive a copy of the node handle
+               xml_node arg_begin = *this;
+               if (!walker.begin(arg_begin)) return false;
+
+               xml_node cur = first_child();
+                               
+               if (cur)
+               {
+                       ++walker._depth;
+
+                       do 
+                       {
+                               xml_node arg_for_each = cur;
+                               if (!walker.for_each(arg_for_each))
+                                       return false;
+                                               
+                               // descend first, then move sideways, then climb back up
+                               if (cur.first_child())
+                               {
+                                       ++walker._depth;
+                                       cur = cur.first_child();
+                               }
+                               else if (cur.next_sibling())
+                                       cur = cur.next_sibling();
+                               else
+                               {
+                                       // climb until a node with a following sibling is found or the traversal root is reached
+                                       // Borland C++ workaround
+                                       while (!cur.next_sibling() && cur != *this && !cur.parent().empty())
+                                       {
+                                               --walker._depth;
+                                               cur = cur.parent();
+                                       }
+                                               
+                                       if (cur != *this)
+                                               cur = cur.next_sibling();
+                               }
+                       }
+                       while (cur && cur != *this);
+               }
+
+               // every descent must have been matched by a climb
+               assert(walker._depth == -1);
+
+               xml_node arg_end = *this;
+               return walker.end(arg_end);
+       }
+
+       // Returns a hash derived from the address of the underlying node structure,
+       // divided by the structure size.
+       PUGI__FN size_t xml_node::hash_value() const
+       {
+               const uintptr_t address = reinterpret_cast<uintptr_t>(_root);
+
+               return static_cast<size_t>(address / sizeof(xml_node_struct));
+       }
+
+       // Returns the raw node structure this handle wraps (null for an empty handle).
+       PUGI__FN xml_node_struct* xml_node::internal_object() const
+       {
+               return _root;
+       }
+
+       PUGI__FN void xml_node::print(xml_writer& writer, const char_t* indent, unsigned int flags, xml_encoding encoding, unsigned int depth) const
+       {
+               if (!_root) return;
+
+               impl::xml_buffered_writer buffered_writer(writer, encoding);
+
+               impl::node_output(buffered_writer, _root, indent, flags, depth);
+       }
+
+#ifndef PUGIXML_NO_STL
+       PUGI__FN void xml_node::print(std::basic_ostream<char, std::char_traits<char> >& stream, const char_t* indent, unsigned int flags, xml_encoding encoding, unsigned int depth) const
+       {
+               xml_writer_stream writer(stream);
+
+               print(writer, indent, flags, encoding, depth);
+       }
+
+       PUGI__FN void xml_node::print(std::basic_ostream<wchar_t, std::char_traits<wchar_t> >& stream, const char_t* indent, unsigned int flags, unsigned int depth) const
+       {
+               xml_writer_stream writer(stream);
+
+               print(writer, indent, flags, encoding_wchar, depth);
+       }
+#endif
+
+       PUGI__FN ptrdiff_t xml_node::offset_debug() const
+       {
+               if (!_root) return -1;
+
+               impl::xml_document_struct& doc = impl::get_document(_root);
+
+               // we can determine the offset reliably only if there is exactly once parse buffer
+               if (!doc.buffer || doc.extra_buffers) return -1;
+
+               switch (type())
+               {
+               case node_document:
+                       return 0;
+
+               case node_element:
+               case node_declaration:
+               case node_pi:
+                       return _root->name && (_root->header & impl::xml_memory_page_name_allocated_or_shared_mask) == 0 ? _root->name - doc.buffer : -1;
+
+               case node_pcdata:
+               case node_cdata:
+               case node_comment:
+               case node_doctype:
+                       return _root->value && (_root->header & impl::xml_memory_page_value_allocated_or_shared_mask) == 0 ? _root->value - doc.buffer : -1;
+
+               default:
+                       return -1;
+               }
+       }
+
+#ifdef __BORLANDC__
+       // Borland C++ only: logical AND of a node handle and a bool, using the handle's conversion to bool.
+       PUGI__FN bool operator&&(const xml_node& lhs, bool rhs)
+       {
+               return (bool)lhs && rhs;
+       }
+
+       // Borland C++ only: logical OR of a node handle and a bool, using the handle's conversion to bool.
+       PUGI__FN bool operator||(const xml_node& lhs, bool rhs)
+       {
+               return (bool)lhs || rhs;
+       }
+#endif
+
+       // Wraps the given node structure; the text node is looked up lazily by _data().
+       PUGI__FN xml_text::xml_text(xml_node_struct* root): _root(root)
+       {
+       }
+
+       PUGI__FN xml_node_struct* xml_text::_data() const
+       {
+               if (!_root || impl::is_text_node(_root)) return _root;
+
+               for (xml_node_struct* node = _root->first_child; node; node = node->next_sibling)
+                       if (impl::is_text_node(node))
+                               return node;
+
+               return 0;
+       }
+
+       // Returns the existing text node, or appends a new node_pcdata child to the wrapped node
+       // and returns that (the result of append_child is returned as is).
+       PUGI__FN xml_node_struct* xml_text::_data_new()
+       {
+               xml_node_struct* existing = _data();
+
+               return existing ? existing : xml_node(_root).append_child(node_pcdata).internal_object();
+       }
+
+       // Creates an empty text handle that wraps no node.
+       PUGI__FN xml_text::xml_text(): _root(0)
+       {
+       }
+
+       // Intentionally empty: only its address is used, as the non-null result of xml_text's bool conversion.
+       PUGI__FN static void unspecified_bool_xml_text(xml_text***)
+       {
+       }
+
+       // Safe-bool conversion: non-null exactly when a text node is available.
+       PUGI__FN xml_text::operator xml_text::unspecified_bool_type() const
+       {
+               if (!_data()) return 0;
+
+               return unspecified_bool_xml_text;
+       }
+
+       // True when no text node is available.
+       PUGI__FN bool xml_text::operator!() const
+       {
+               return _data() == 0;
+       }
+
+       // True when no text node is available.
+       PUGI__FN bool xml_text::empty() const
+       {
+               return !_data();
+       }
+
+       // Returns the text value, or an empty string when there is no text node or it has no value.
+       PUGI__FN const char_t* xml_text::get() const
+       {
+               xml_node_struct* node = _data();
+
+               if (node && node->value) return node->value;
+
+               return PUGIXML_TEXT("");
+       }
+
+       // Returns the text value, or `def` when there is no text node or it has no value.
+       PUGI__FN const char_t* xml_text::as_string(const char_t* def) const
+       {
+               xml_node_struct* node = _data();
+
+               if (node && node->value) return node->value;
+
+               return def;
+       }
+
+       // Converts the text value with impl::get_value_int, passing `def` along as the fallback.
+       // A null value pointer is passed when there is no text node.
+       PUGI__FN int xml_text::as_int(int def) const
+       {
+               xml_node_struct* node = _data();
+               char_t* text = node ? node->value : 0;
+
+               return impl::get_value_int(text, def);
+       }
+
+       // Converts the text value with impl::get_value_uint, passing `def` along as the fallback.
+       // A null value pointer is passed when there is no text node.
+       PUGI__FN unsigned int xml_text::as_uint(unsigned int def) const
+       {
+               xml_node_struct* node = _data();
+               char_t* text = node ? node->value : 0;
+
+               return impl::get_value_uint(text, def);
+       }
+
+       // Converts the text value with impl::get_value_double, passing `def` along as the fallback.
+       // A null value pointer is passed when there is no text node.
+       PUGI__FN double xml_text::as_double(double def) const
+       {
+               xml_node_struct* node = _data();
+               char_t* text = node ? node->value : 0;
+
+               return impl::get_value_double(text, def);
+       }
+
+       // Converts the text value with impl::get_value_float, passing `def` along as the fallback.
+       // A null value pointer is passed when there is no text node.
+       PUGI__FN float xml_text::as_float(float def) const
+       {
+               xml_node_struct* node = _data();
+               char_t* text = node ? node->value : 0;
+
+               return impl::get_value_float(text, def);
+       }
+
+       // Converts the text value with impl::get_value_bool, passing `def` along as the fallback.
+       // A null value pointer is passed when there is no text node.
+       PUGI__FN bool xml_text::as_bool(bool def) const
+       {
+               xml_node_struct* node = _data();
+               char_t* text = node ? node->value : 0;
+
+               return impl::get_value_bool(text, def);
+       }
+
+#ifdef PUGIXML_HAS_LONG_LONG
+       // Converts the text value with impl::get_value_llong, passing `def` along as the fallback.
+       // A null value pointer is passed when there is no text node.
+       PUGI__FN long long xml_text::as_llong(long long def) const
+       {
+               xml_node_struct* node = _data();
+               char_t* text = node ? node->value : 0;
+
+               return impl::get_value_llong(text, def);
+       }
+
+       // Converts the text value with impl::get_value_ullong, passing `def` along as the fallback.
+       // A null value pointer is passed when there is no text node.
+       PUGI__FN unsigned long long xml_text::as_ullong(unsigned long long def) const
+       {
+               xml_node_struct* node = _data();
+               char_t* text = node ? node->value : 0;
+
+               return impl::get_value_ullong(text, def);
+       }
+#endif
+
+       // Stores `rhs` as the text value, obtaining the text node through _data_new().
+       // Returns false when no text node could be obtained; otherwise the result of impl::strcpy_insitu.
+       PUGI__FN bool xml_text::set(const char_t* rhs)
+       {
+               xml_node_struct* target = _data_new();
+
+               if (!target) return false;
+
+               return impl::strcpy_insitu(target->value, target->header, impl::xml_memory_page_value_allocated_mask, rhs);
+       }
+
+       // Stores the int `rhs` as the text value, obtaining the text node through _data_new().
+       // Returns false when no text node could be obtained; otherwise the result of impl::set_value_convert.
+       PUGI__FN bool xml_text::set(int rhs)
+       {
+               xml_node_struct* target = _data_new();
+
+               if (!target) return false;
+
+               return impl::set_value_convert(target->value, target->header, impl::xml_memory_page_value_allocated_mask, rhs);
+       }
+
+       // Stores the unsigned int `rhs` as the text value, obtaining the text node through _data_new().
+       // Returns false when no text node could be obtained; otherwise the result of impl::set_value_convert.
+       PUGI__FN bool xml_text::set(unsigned int rhs)
+       {
+               xml_node_struct* target = _data_new();
+
+               if (!target) return false;
+
+               return impl::set_value_convert(target->value, target->header, impl::xml_memory_page_value_allocated_mask, rhs);
+       }
+
+       // Stores the float `rhs` as the text value, obtaining the text node through _data_new().
+       // Returns false when no text node could be obtained; otherwise the result of impl::set_value_convert.
+       PUGI__FN bool xml_text::set(float rhs)
+       {
+               xml_node_struct* target = _data_new();
+
+               if (!target) return false;
+
+               return impl::set_value_convert(target->value, target->header, impl::xml_memory_page_value_allocated_mask, rhs);
+       }
+
+       // Stores the double `rhs` as the text value, obtaining the text node through _data_new().
+       // Returns false when no text node could be obtained; otherwise the result of impl::set_value_convert.
+       PUGI__FN bool xml_text::set(double rhs)
+       {
+               xml_node_struct* target = _data_new();
+
+               if (!target) return false;
+
+               return impl::set_value_convert(target->value, target->header, impl::xml_memory_page_value_allocated_mask, rhs);
+       }
+
+       // Stores the bool `rhs` as the text value, obtaining the text node through _data_new().
+       // Returns false when no text node could be obtained; otherwise the result of impl::set_value_convert.
+       PUGI__FN bool xml_text::set(bool rhs)
+       {
+               xml_node_struct* target = _data_new();
+
+               if (!target) return false;
+
+               return impl::set_value_convert(target->value, target->header, impl::xml_memory_page_value_allocated_mask, rhs);
+       }
+
+#ifdef PUGIXML_HAS_LONG_LONG
+       // Stores the long long `rhs` as the text value, obtaining the text node through _data_new().
+       // Returns false when no text node could be obtained; otherwise the result of impl::set_value_convert.
+       PUGI__FN bool xml_text::set(long long rhs)
+       {
+               xml_node_struct* target = _data_new();
+
+               if (!target) return false;
+
+               return impl::set_value_convert(target->value, target->header, impl::xml_memory_page_value_allocated_mask, rhs);
+       }
+
+       // Stores the unsigned long long `rhs` as the text value, obtaining the text node through _data_new().
+       // Returns false when no text node could be obtained; otherwise the result of impl::set_value_convert.
+       PUGI__FN bool xml_text::set(unsigned long long rhs)
+       {
+               xml_node_struct* target = _data_new();
+
+               if (!target) return false;
+
+               return impl::set_value_convert(target->value, target->header, impl::xml_memory_page_value_allocated_mask, rhs);
+       }
+#endif
+
+       // Assigns a string as the text value via set(); the success flag returned by set() is discarded.
+       PUGI__FN xml_text& xml_text::operator=(const char_t* rhs)
+       {
+               set(rhs);
+               return *this;
+       }
+
+       // Assigns an int as the text value via set(); the success flag returned by set() is discarded.
+       PUGI__FN xml_text& xml_text::operator=(int rhs)
+       {
+               set(rhs);
+               return *this;
+       }
+
+       // Assigns an unsigned int as the text value via set(); the success flag returned by set() is discarded.
+       PUGI__FN xml_text& xml_text::operator=(unsigned int rhs)
+       {
+               set(rhs);
+               return *this;
+       }
+
+       // Assigns a double as the text value via set(); the success flag returned by set() is discarded.
+       PUGI__FN xml_text& xml_text::operator=(double rhs)
+       {
+               set(rhs);
+               return *this;
+       }
+
+       // Assigns a float as the text value via set(); the success flag returned by set() is discarded.
+       PUGI__FN xml_text& xml_text::operator=(float rhs)
+       {
+               set(rhs);
+               return *this;
+       }
+
+       // Assigns a bool as the text value via set(); the success flag returned by set() is discarded.
+       PUGI__FN xml_text& xml_text::operator=(bool rhs)
+       {
+               set(rhs);
+               return *this;
+       }
+
+#ifdef PUGIXML_HAS_LONG_LONG
+       // Assigns a long long as the text value via set(); the success flag returned by set() is discarded.
+       PUGI__FN xml_text& xml_text::operator=(long long rhs)
+       {
+               set(rhs);
+               return *this;
+       }
+
+       // Assigns an unsigned long long as the text value via set(); the success flag returned by set() is discarded.
+       PUGI__FN xml_text& xml_text::operator=(unsigned long long rhs)
+       {
+               set(rhs);
+               return *this;
+       }
+#endif
+
+       // Returns a node handle for the text node found by _data() (an empty handle when there is none).
+       PUGI__FN xml_node xml_text::data() const
+       {
+               return xml_node(_data());
+       }
+
+#ifdef __BORLANDC__
+       // Borland C++ only: logical AND of a text handle and a bool, using the handle's conversion to bool.
+       PUGI__FN bool operator&&(const xml_text& lhs, bool rhs)
+       {
+               return (bool)lhs && rhs;
+       }
+
+       // Borland C++ only: logical OR of a text handle and a bool, using the handle's conversion to bool.
+       PUGI__FN bool operator||(const xml_text& lhs, bool rhs)
+       {
+               return (bool)lhs || rhs;
+       }
+#endif
+
+       // Default iterator: both the current node and the parent are default-constructed.
+       PUGI__FN xml_node_iterator::xml_node_iterator()
+       {
+       }
+
+       // Iterator positioned at `node`; the parent is taken from node.parent().
+       PUGI__FN xml_node_iterator::xml_node_iterator(const xml_node& node): _wrap(node), _parent(node.parent())
+       {
+       }
+
+       // Iterator built from raw structures: `ref` is the current node (may be null), `parent` its parent.
+       PUGI__FN xml_node_iterator::xml_node_iterator(xml_node_struct* ref, xml_node_struct* parent): _wrap(ref), _parent(parent)
+       {
+       }
+
+       // Two iterators are equal when they refer to the same node and the same parent.
+       PUGI__FN bool xml_node_iterator::operator==(const xml_node_iterator& rhs) const
+       {
+               const bool same_node = _wrap._root == rhs._wrap._root;
+               const bool same_parent = _parent._root == rhs._parent._root;
+
+               return same_node && same_parent;
+       }
+       
+       // Two iterators differ when either the node or the parent differs.
+       PUGI__FN bool xml_node_iterator::operator!=(const xml_node_iterator& rhs) const
+       {
+               const bool node_differs = _wrap._root != rhs._wrap._root;
+               const bool parent_differs = _parent._root != rhs._parent._root;
+
+               return node_differs || parent_differs;
+       }
+
+       // Returns the current node; asserts that the iterator refers to a node.
+       PUGI__FN xml_node& xml_node_iterator::operator*() const
+       {
+               assert(_wrap._root);
+               return _wrap;
+       }
+
+       // Returns a pointer to the current node; asserts that the iterator refers to a node.
+       PUGI__FN xml_node* xml_node_iterator::operator->() const
+       {
+               assert(_wrap._root);
+               return const_cast<xml_node*>(&_wrap); // BCC32 workaround
+       }
+
+       // Pre-increment: moves to the next sibling (null after the last one); asserts that the iterator refers to a node.
+       PUGI__FN const xml_node_iterator& xml_node_iterator::operator++()
+       {
+               assert(_wrap._root);
+               _wrap._root = _wrap._root->next_sibling;
+               return *this;
+       }
+
+       // Post-increment: advances the iterator and returns its previous position.
+       PUGI__FN xml_node_iterator xml_node_iterator::operator++(int)
+       {
+               xml_node_iterator previous(*this);
+               operator++();
+               return previous;
+       }
+
+       // Pre-decrement: moves to the previous sibling, or to the parent's last child when the
+       // iterator currently refers to no node.
+       PUGI__FN const xml_node_iterator& xml_node_iterator::operator--()
+       {
+               if (_wrap._root)
+                       _wrap = _wrap.previous_sibling();
+               else
+                       _wrap = _parent.last_child();
+
+               return *this;
+       }
+
+       // Post-decrement: steps the iterator back and returns its previous position.
+       PUGI__FN xml_node_iterator xml_node_iterator::operator--(int)
+       {
+               xml_node_iterator previous(*this);
+               operator--();
+               return previous;
+       }
+
+       // Default iterator: both the current attribute and the parent are default-constructed.
+       PUGI__FN xml_attribute_iterator::xml_attribute_iterator()
+       {
+       }
+
+       // Iterator positioned at `attr`, which belongs to the node `parent`.
+       PUGI__FN xml_attribute_iterator::xml_attribute_iterator(const xml_attribute& attr, const xml_node& parent): _wrap(attr), _parent(parent)
+       {
+       }
+
+       // Iterator built from raw structures: `ref` is the current attribute (may be null), `parent` the owning node.
+       PUGI__FN xml_attribute_iterator::xml_attribute_iterator(xml_attribute_struct* ref, xml_node_struct* parent): _wrap(ref), _parent(parent)
+       {
+       }
+
+       // Two iterators are equal when they refer to the same attribute and the same parent node.
+       PUGI__FN bool xml_attribute_iterator::operator==(const xml_attribute_iterator& rhs) const
+       {
+               const bool same_attribute = _wrap._attr == rhs._wrap._attr;
+               const bool same_parent = _parent._root == rhs._parent._root;
+
+               return same_attribute && same_parent;
+       }
+       
+       // Two iterators differ when either the attribute or the parent node differs.
+       PUGI__FN bool xml_attribute_iterator::operator!=(const xml_attribute_iterator& rhs) const
+       {
+               const bool attribute_differs = _wrap._attr != rhs._wrap._attr;
+               const bool parent_differs = _parent._root != rhs._parent._root;
+
+               return attribute_differs || parent_differs;
+       }
+
+       // Returns the current attribute; asserts that the iterator refers to an attribute.
+       PUGI__FN xml_attribute& xml_attribute_iterator::operator*() const
+       {
+               assert(_wrap._attr);
+               return _wrap;
+       }
+
+       // Returns a pointer to the current attribute; asserts that the iterator refers to an attribute.
+       PUGI__FN xml_attribute* xml_attribute_iterator::operator->() const
+       {
+               assert(_wrap._attr);
+               return const_cast<xml_attribute*>(&_wrap); // BCC32 workaround
+       }
+
+       // Pre-increment: moves to the next attribute (null after the last one); asserts that the iterator refers to an attribute.
+       PUGI__FN const xml_attribute_iterator& xml_attribute_iterator::operator++()
+       {
+               assert(_wrap._attr);
+               _wrap._attr = _wrap._attr->next_attribute;
+               return *this;
+       }
+
+       // Post-increment: advances the iterator and returns its previous position.
+       PUGI__FN xml_attribute_iterator xml_attribute_iterator::operator++(int)
+       {
+               xml_attribute_iterator previous(*this);
+               operator++();
+               return previous;
+       }
+
+       // Pre-decrement: moves to the previous attribute, or to the parent's last attribute when the
+       // iterator currently refers to no attribute.
+       PUGI__FN const xml_attribute_iterator& xml_attribute_iterator::operator--()
+       {
+               if (_wrap._attr)
+                       _wrap = _wrap.previous_attribute();
+               else
+                       _wrap = _parent.last_attribute();
+
+               return *this;
+       }
+
+       // Post-decrement: steps the iterator back and returns its previous position.
+       PUGI__FN xml_attribute_iterator xml_attribute_iterator::operator--(int)
+       {
+               xml_attribute_iterator previous(*this);
+               operator--();
+               return previous;
+       }
+
+       // Default iterator: no current node, no parent and a null name filter.
+       PUGI__FN xml_named_node_iterator::xml_named_node_iterator(): _name(0)
+       {
+       }
+
+       // Iterator positioned at `node` that steps over siblings named `name`; the name pointer is stored, not copied.
+       PUGI__FN xml_named_node_iterator::xml_named_node_iterator(const xml_node& node, const char_t* name): _wrap(node), _parent(node.parent()), _name(name)
+       {
+       }
+
+       // Iterator built from raw structures: `ref` is the current node (may be null), `parent` its parent,
+       // `name` the sibling name filter (the pointer is stored, not copied).
+       PUGI__FN xml_named_node_iterator::xml_named_node_iterator(xml_node_struct* ref, xml_node_struct* parent, const char_t* name): _wrap(ref), _parent(parent), _name(name)
+       {
+       }
+
+       // Two iterators are equal when they refer to the same node and the same parent
+       // (the name filter is not compared).
+       PUGI__FN bool xml_named_node_iterator::operator==(const xml_named_node_iterator& rhs) const
+       {
+               const bool same_node = _wrap._root == rhs._wrap._root;
+               const bool same_parent = _parent._root == rhs._parent._root;
+
+               return same_node && same_parent;
+       }
+
+       // Two iterators differ when either the node or the parent differs
+       // (the name filter is not compared).
+       PUGI__FN bool xml_named_node_iterator::operator!=(const xml_named_node_iterator& rhs) const
+       {
+               const bool node_differs = _wrap._root != rhs._wrap._root;
+               const bool parent_differs = _parent._root != rhs._parent._root;
+
+               return node_differs || parent_differs;
+       }
+
+       // Returns the current node; asserts that the iterator refers to a node.
+       PUGI__FN xml_node& xml_named_node_iterator::operator*() const
+       {
+               assert(_wrap._root);
+               return _wrap;
+       }
+
+       // Returns a pointer to the current node; asserts that the iterator refers to a node.
+       PUGI__FN xml_node* xml_named_node_iterator::operator->() const
+       {
+               assert(_wrap._root);
+               return const_cast<xml_node*>(&_wrap); // BCC32 workaround
+       }
+
+       // Pre-increment: moves to the next sibling with the stored name; asserts that the iterator refers to a node.
+       PUGI__FN const xml_named_node_iterator& xml_named_node_iterator::operator++()
+       {
+               assert(_wrap._root);
+               _wrap = _wrap.next_sibling(_name);
+               return *this;
+       }
+
+       // Post-increment: advances the iterator and returns its previous position.
+       PUGI__FN xml_named_node_iterator xml_named_node_iterator::operator++(int)
+       {
+               xml_named_node_iterator previous(*this);
+               operator++();
+               return previous;
+       }
+
+       PUGI__FN const xml_named_node_iterator& xml_named_node_iterator::operator--()
+       {
+               if (_wrap._root)
+                       _wrap = _wrap.previous_sibling(_name);
+               else
+               {
+                       _wrap = _parent.last_child();
+
+                       if (!impl::strequal(_wrap.name(), _name))
+                               _wrap = _wrap.previous_sibling(_name);
+               }
+
+               return *this;
+       }
+
+       // Post-decrement: steps the iterator back and returns its previous position.
+       PUGI__FN xml_named_node_iterator xml_named_node_iterator::operator--(int)
+       {
+               xml_named_node_iterator previous(*this);
+               operator--();
+               return previous;
+       }
+
+       // Default result: status_internal_error, offset 0 and encoding_auto.
+       PUGI__FN xml_parse_result::xml_parse_result(): status(status_internal_error), offset(0), encoding(encoding_auto)
+       {
+       }
+
+       // True exactly when status is status_ok.
+       PUGI__FN xml_parse_result::operator bool() const
+       {
+               return status == status_ok;
+       }
+
+       PUGI__FN const char* xml_parse_result::description() const
+       {
+               switch (status)
+               {
+               case status_ok: return "No error";
+
+               case status_file_not_found: return "File was not found";
+               case status_io_error: return "Error reading from file/stream";
+               case status_out_of_memory: return "Could not allocate memory";
+               case status_internal_error: return "Internal error occurred";
+
+               case status_unrecognized_tag: return "Could not determine tag type";
+
+               case status_bad_pi: return "Error parsing document declaration/processing instruction";
+               case status_bad_comment: return "Error parsing comment";
+               case status_bad_cdata: return "Error parsing CDATA section";
+               case status_bad_doctype: return "Error parsing document type declaration";
+               case status_bad_pcdata: return "Error parsing PCDATA section";
+               case status_bad_start_element: return "Error parsing start element tag";
+               case status_bad_attribute: return "Error parsing element attribute";
+               case status_bad_end_element: return "Error parsing end element tag";
+               case status_end_element_mismatch: return "Start-end tags mismatch";
+
+               case status_append_invalid_root: return "Unable to append nodes: root is not an element or document";
+
+               case status_no_document_element: return "No document element found";
+
+               default: return "Unknown error";
+               }
+       }
+
+       // Constructs an empty document: no owned parse buffer, then create() sets up the document root.
+       PUGI__FN xml_document::xml_document(): _buffer(0)
+       {
+               create();
+       }
+
+       // Releases everything the document owns via destroy().
+       PUGI__FN xml_document::~xml_document()
+       {
+               destroy();
+       }
+
+       // Returns the document to its freshly constructed state by tearing it down and setting it up again.
+       PUGI__FN void xml_document::reset()
+       {
+               destroy();
+               create();
+       }
+
+       // Resets this document and then appends a copy of every top-level node of `proto`.
+       // NOTE(review): reset() runs before proto is read, so passing this document itself
+       // presumably leaves it empty - confirm whether callers rely on that.
+       PUGI__FN void xml_document::reset(const xml_document& proto)
+       {
+               reset();
+
+               xml_node source = proto.first_child();
+
+               while (source)
+               {
+                       append_copy(source);
+                       source = source.next_sibling();
+               }
+       }
+
+       // Sets up the document root inside the embedded _memory storage: aligns a sentinel memory page
+       // within _memory, constructs the page, and placement-constructs the document structure right
+       // after the page header. Must only be called while _root is null.
+       PUGI__FN void xml_document::create()
+       {
+               assert(!_root);
+
+               // initialize sentinel page
+               // compile-time check that page header + document structure + worst-case alignment padding fit into _memory
+               PUGI__STATIC_ASSERT(sizeof(impl::xml_memory_page) + sizeof(impl::xml_document_struct) + impl::xml_memory_page_alignment - sizeof(void*) <= sizeof(_memory));
+
+               // align upwards to page boundary
+               void* page_memory = reinterpret_cast<void*>((reinterpret_cast<uintptr_t>(_memory) + (impl::xml_memory_page_alignment - 1)) & ~(impl::xml_memory_page_alignment - 1));
+
+               // prepare page structure
+               impl::xml_memory_page* page = impl::xml_memory_page::construct(page_memory);
+               assert(page);
+
+               // NOTE(review): presumably marks the sentinel page as full so that nothing else is allocated from it - confirm against the allocator
+               page->busy_size = impl::xml_memory_page_size;
+
+               // allocate new root
+               _root = new (reinterpret_cast<char*>(page) + sizeof(impl::xml_memory_page)) impl::xml_document_struct(page);
+               _root->prev_sibling_c = _root;
+
+               // setup sentinel page
+               page->allocator = static_cast<impl::xml_document_struct*>(_root);
+
+               // verify the document allocation
+               assert(reinterpret_cast<char*>(_root) + sizeof(impl::xml_document_struct) <= _memory + sizeof(_memory));
+       }
+
+       // Releases everything the document owns: the main parse buffer, the buffers recorded in the
+       // extra buffer list, and every memory page after the sentinel page. Leaves _root null, so
+       // create() has to be called before the document is used again.
+       PUGI__FN void xml_document::destroy()
+       {
+               assert(_root);
+
+               // destroy static storage
+               if (_buffer)
+               {
+                       impl::xml_memory::deallocate(_buffer);
+                       _buffer = 0;
+               }
+
+               // destroy extra buffers (note: no need to destroy linked list nodes, they're allocated using document allocator)
+               for (impl::xml_extra_buffer* extra = static_cast<impl::xml_document_struct*>(_root)->extra_buffers; extra; extra = extra->next)
+               {
+                       if (extra->buffer) impl::xml_memory::deallocate(extra->buffer);
+               }
+
+               // destroy dynamic storage, leave sentinel page (it's in static memory)
+               // the page pointer is recovered from the root's header bits
+               impl::xml_memory_page* root_page = reinterpret_cast<impl::xml_memory_page*>(_root->header & impl::xml_memory_page_pointer_mask);
+               assert(root_page && !root_page->prev);
+               assert(reinterpret_cast<char*>(root_page) >= _memory && reinterpret_cast<char*>(root_page) < _memory + sizeof(_memory));
+
+               for (impl::xml_memory_page* page = root_page->next; page; )
+               {
+                       // read the link before the page is released
+                       impl::xml_memory_page* next = page->next;
+
+                       impl::xml_allocator::deallocate_page(page);
+
+                       page = next;
+               }
+
+               _root = 0;
+       }
+
+#ifndef PUGIXML_NO_STL
+       // Discards the current contents and loads the document from a narrow character stream
+       // through impl::load_stream_impl.
+       PUGI__FN xml_parse_result xml_document::load(std::basic_istream<char, std::char_traits<char> >& stream, unsigned int options, xml_encoding encoding)
+       {
+               reset();
+
+               return impl::load_stream_impl(*this, stream, options, encoding);
+       }
+
+       // Discards the current contents and loads the document from a wide character stream;
+       // the encoding passed on is always encoding_wchar.
+       PUGI__FN xml_parse_result xml_document::load(std::basic_istream<wchar_t, std::char_traits<wchar_t> >& stream, unsigned int options)
+       {
+               reset();
+
+               return impl::load_stream_impl(*this, stream, options, encoding_wchar);
+       }
+#endif
+
+       PUGI__FN xml_parse_result xml_document::load_string(const char_t* contents, unsigned int options)
+       {
+               // Force native encoding (skip autodetection)
+       #ifdef PUGIXML_WCHAR_MODE
+               xml_encoding encoding = encoding_wchar;
+       #else
+               xml_encoding encoding = encoding_utf8;
+       #endif
+
+               return load_buffer(contents, impl::strlength(contents) * sizeof(char_t), options, encoding);
+       }
+
+       // Forwards to load_string() with the same arguments.
+       PUGI__FN xml_parse_result xml_document::load(const char_t* contents, unsigned int options)
+       {
+               return load_string(contents, options);
+       }
+
+       PUGI__FN xml_parse_result xml_document::load_file(const char* path_, unsigned int options, xml_encoding encoding)
+       {
+               reset();
+
+               FILE* file = fopen(path_, "rb");
+
+               return impl::load_file_impl(*this, file, options, encoding);
+       }
+
+       PUGI__FN xml_parse_result xml_document::load_file(const wchar_t* path_, unsigned int options, xml_encoding encoding)
+       {
+               reset();
+
+               FILE* file = impl::open_file_wide(path_, L"rb");
+
+               return impl::load_file_impl(*this, file, options, encoding);
+       }
+
+       PUGI__FN xml_parse_result xml_document::load_buffer(const void* contents, size_t size, unsigned int options, xml_encoding encoding)
+       {
+               reset();
+
+               return impl::load_buffer_impl(static_cast<impl::xml_document_struct*>(_root), _root, const_cast<void*>(contents), size, options, encoding, false, false, &_buffer);
+       }
+
+       PUGI__FN xml_parse_result xml_document::load_buffer_inplace(void* contents, size_t size, unsigned int options, xml_encoding encoding)
+       {
+               reset();
+
+               return impl::load_buffer_impl(static_cast<impl::xml_document_struct*>(_root), _root, contents, size, options, encoding, true, false, &_buffer);
+       }
+
+       PUGI__FN xml_parse_result xml_document::load_buffer_inplace_own(void* contents, size_t size, unsigned int options, xml_encoding encoding)
+       {
+               reset();
+
+               return impl::load_buffer_impl(static_cast<impl::xml_document_struct*>(_root), _root, contents, size, options, encoding, true, true, &_buffer);
+       }
+
+       PUGI__FN void xml_document::save(xml_writer& writer, const char_t* indent, unsigned int flags, xml_encoding encoding) const
+       {
+               impl::xml_buffered_writer buffered_writer(writer, encoding);
+
+               if ((flags & format_write_bom) && encoding != encoding_latin1)
+               {
+                       // BOM always represents the codepoint U+FEFF, so just write it in native encoding
+               #ifdef PUGIXML_WCHAR_MODE
+                       unsigned int bom = 0xfeff;
+                       buffered_writer.write(static_cast<wchar_t>(bom));
+               #else
+                       buffered_writer.write('\xef', '\xbb', '\xbf');
+               #endif
+               }
+
+               if (!(flags & format_no_declaration) && !impl::has_declaration(_root))
+               {
+                       buffered_writer.write_string(PUGIXML_TEXT("<?xml version=\"1.0\""));
+                       if (encoding == encoding_latin1) buffered_writer.write_string(PUGIXML_TEXT(" encoding=\"ISO-8859-1\""));
+                       buffered_writer.write('?', '>');
+                       if (!(flags & format_raw)) buffered_writer.write('\n');
+               }
+
+               impl::node_output(buffered_writer, _root, indent, flags, 0);
+       }
+
+#ifndef PUGIXML_NO_STL
+       PUGI__FN void xml_document::save(std::basic_ostream<char, std::char_traits<char> >& stream, const char_t* indent, unsigned int flags, xml_encoding encoding) const
+       {
+               xml_writer_stream writer(stream);
+
+               save(writer, indent, flags, encoding);
+       }
+
+       // Saves the document to a wide character stream; the output encoding is always encoding_wchar.
+       PUGI__FN void xml_document::save(std::basic_ostream<wchar_t, std::char_traits<wchar_t> >& stream, const char_t* indent, unsigned int flags) const
+       {
+               xml_writer_stream stream_writer(stream);
+
+               save(stream_writer, indent, flags, encoding_wchar);
+       }
+#endif
+
+       PUGI__FN bool xml_document::save_file(const char* path_, const char_t* indent, unsigned int flags, xml_encoding encoding) const
+       {
+               FILE* file = fopen(path_, (flags & format_save_file_text) ? "w" : "wb");
+               return impl::save_file_impl(*this, file, indent, flags, encoding);
+       }
+
+       PUGI__FN bool xml_document::save_file(const wchar_t* path_, const char_t* indent, unsigned int flags, xml_encoding encoding) const
+       {
+               FILE* file = impl::open_file_wide(path_, (flags & format_save_file_text) ? L"w" : L"wb");
+               return impl::save_file_impl(*this, file, indent, flags, encoding);
+       }
+
+       PUGI__FN xml_node xml_document::document_element() const
+       {
+               assert(_root);
+
+               for (xml_node_struct* i = _root->first_child; i; i = i->next_sibling)
+                       if (PUGI__NODETYPE(i) == node_element)
+                               return xml_node(i);
+
+               return xml_node();
+       }
+
+#ifndef PUGIXML_NO_STL
+       // Converts a null-terminated wide string via impl::as_utf8_impl; `str` must not be null (asserted).
+       PUGI__FN std::string PUGIXML_FUNCTION as_utf8(const wchar_t* str)
+       {
+               assert(str);
+
+               size_t length = impl::strlength_wide(str);
+
+               return impl::as_utf8_impl(str, length);
+       }
+
+       // Converts a wide std::basic_string via impl::as_utf8_impl, using the string's own size.
+       PUGI__FN std::string PUGIXML_FUNCTION as_utf8(const std::basic_string<wchar_t>& str)
+       {
+               const wchar_t* chars = str.c_str();
+               size_t length = str.size();
+
+               return impl::as_utf8_impl(chars, length);
+       }
+       
+       // Converts a null-terminated narrow string via impl::as_wide_impl; `str` must not be null (asserted).
+       PUGI__FN std::basic_string<wchar_t> PUGIXML_FUNCTION as_wide(const char* str)
+       {
+               assert(str);
+
+               size_t length = strlen(str);
+
+               return impl::as_wide_impl(str, length);
+       }
+       
+       // Converts a std::string via impl::as_wide_impl, using the string's own size.
+       PUGI__FN std::basic_string<wchar_t> PUGIXML_FUNCTION as_wide(const std::string& str)
+       {
+               const char* chars = str.c_str();
+               size_t length = str.size();
+
+               return impl::as_wide_impl(chars, length);
+       }
+#endif
+
+       // Stores the given allocation/deallocation pair in impl::xml_memory.
+       // Both pointers are stored exactly as passed; nothing is validated here.
+       PUGI__FN void PUGIXML_FUNCTION set_memory_management_functions(allocation_function allocate, deallocation_function deallocate)
+       {
+               typedef impl::xml_memory memory;
+
+               memory::allocate = allocate;
+               memory::deallocate = deallocate;
+       }
+
+       // Returns the allocation function currently stored in impl::xml_memory.
+       PUGI__FN allocation_function PUGIXML_FUNCTION get_memory_allocation_function()
+       {
+               allocation_function current = impl::xml_memory::allocate;
+
+               return current;
+       }
+
+       // Returns the deallocation function currently stored in impl::xml_memory.
+       PUGI__FN deallocation_function PUGIXML_FUNCTION get_memory_deallocation_function()
+       {
+               deallocation_function current = impl::xml_memory::deallocate;
+
+               return current;
+       }
+}
+
+#if !defined(PUGIXML_NO_STL) && (defined(_MSC_VER) || defined(__ICC))
+namespace std
+{
+       // Workarounds for (non-standard) iterator category detection for older versions (MSVC7/IC8 and earlier)
+       // Each overload reports std::bidirectional_iterator_tag for one pugi iterator type.
+
+       // Category for pugi::xml_node_iterator
+       PUGI__FN std::bidirectional_iterator_tag _Iter_cat(const pugi::xml_node_iterator&)
+       {
+               return std::bidirectional_iterator_tag();
+       }
+
+       // Category for pugi::xml_attribute_iterator
+       PUGI__FN std::bidirectional_iterator_tag _Iter_cat(const pugi::xml_attribute_iterator&)
+       {
+               return std::bidirectional_iterator_tag();
+       }
+
+       // Category for pugi::xml_named_node_iterator
+       PUGI__FN std::bidirectional_iterator_tag _Iter_cat(const pugi::xml_named_node_iterator&)
+       {
+               return std::bidirectional_iterator_tag();
+       }
+}
+#endif
+
+#if !defined(PUGIXML_NO_STL) && defined(__SUNPRO_CC)
+namespace std
+{
+       // Workarounds for (non-standard) iterator category detection
+       // (this section is compiled only when __SUNPRO_CC is defined and PUGIXML_NO_STL is not).
+       // Each overload reports std::bidirectional_iterator_tag for one pugi iterator type.
+
+       // Category for pugi::xml_node_iterator
+       PUGI__FN std::bidirectional_iterator_tag __iterator_category(const pugi::xml_node_iterator&)
+       {
+               return std::bidirectional_iterator_tag();
+       }
+
+       // Category for pugi::xml_attribute_iterator
+       PUGI__FN std::bidirectional_iterator_tag __iterator_category(const pugi::xml_attribute_iterator&)
+       {
+               return std::bidirectional_iterator_tag();
+       }
+
+       // Category for pugi::xml_named_node_iterator
+       PUGI__FN std::bidirectional_iterator_tag __iterator_category(const pugi::xml_named_node_iterator&)
+       {
+               return std::bidirectional_iterator_tag();
+       }
+}
+#endif
+
+#ifndef PUGIXML_NO_XPATH
+// STL replacements
+PUGI__NS_BEGIN
+       struct equal_to
+       {
+               template <typename T> bool operator()(const T& lhs, const T& rhs) const
+               {
+                       return lhs == rhs;
+               }
+       };
+
+       struct not_equal_to
+       {
+               template <typename T> bool operator()(const T& lhs, const T& rhs) const
+               {
+                       return lhs != rhs;
+               }
+       };
+
+       struct less
+       {
+               template <typename T> bool operator()(const T& lhs, const T& rhs) const
+               {
+                       return lhs < rhs;
+               }
+       };
+
+       struct less_equal
+       {
+               template <typename T> bool operator()(const T& lhs, const T& rhs) const
+               {
+                       return lhs <= rhs;
+               }
+       };
+
+       // Exchanges two values through one copy construction and two copy assignments.
+       template <typename T> void swap(T& lhs, T& rhs)
+       {
+               T saved_lhs = lhs;
+
+               lhs = rhs;
+               rhs = saved_lhs;
+       }
+
+       template <typename I, typename Pred> I min_element(I begin, I end, const Pred& pred)
+       {
+               I result = begin;
+
+               for (I it = begin + 1; it != end; ++it)
+                       if (pred(*it, *result))
+                               result = it;
+
+               return result;
+       }
+
+       // Reverses [begin, end) in place by swapping elements pairwise from both ends inwards.
+       template <typename I> void reverse(I begin, I end)
+       {
+               while (end - begin > 1)
+               {
+                       --end;
+                       swap(*begin, *end);
+                       ++begin;
+               }
+       }
+
+       template <typename I> I unique(I begin, I end)
+       {
+               // fast skip head
+               while (end - begin > 1 && *begin != *(begin + 1)) begin++;
+
+               if (begin == end) return begin;
+
+               // last written element
+               I write = begin++; 
+
+               // merge unique elements
+               while (begin != end)
+               {
+                       if (*begin != *write)
+                               *++write = *begin++;
+                       else
+                               begin++;
+               }
+
+               // past-the-end (write points to live element)
+               return write + 1;
+       }
+
+       // Copies [begin, end) so that the copy ends just before `target`, assigning from the last
+       // element to the first.
+       template <typename I> void copy_backwards(I begin, I end, I target)
+       {
+               while (begin != end)
+               {
+                       --end;
+                       --target;
+                       *target = *end;
+               }
+       }
+
+       template <typename I, typename Pred, typename T> void insertion_sort(I begin, I end, const Pred& pred, T*)
+       {
+               assert(begin != end);
+
+               for (I it = begin + 1; it != end; ++it)
+               {
+                       T val = *it;
+
+                       if (pred(val, *begin))
+                       {
+                               // move to front
+                               copy_backwards(begin, it, it + 1);
+                               *begin = val;
+                       }
+                       else
+                       {
+                               I hole = it;
+
+                               // move hole backwards
+                               while (pred(val, *(hole - 1)))
+                               {
+                                       *hole = *(hole - 1);
+                                       hole--;
+                               }
+
+                               // fill hole with element
+                               *hole = val;
+                       }
+               }
+       }
+
+       // std variant for elements with ==
+       // Rearranges [begin, end) around the value found at `middle` and reports, through out_eqbeg and
+       // out_eqend, the half-open run of elements that compare equal (operator==) to that value.
+       // Elements outside the run are classified with pred; the caller in this file (sort) treats the
+       // result as three chunks (< = >).
+       template <typename I, typename Pred> void partition(I begin, I middle, I end, const Pred& pred, I* out_eqbeg, I* out_eqend)
+       {
+               I eqbeg = middle, eqend = middle + 1;
+
+               // expand equal range
+               while (eqbeg != begin && *(eqbeg - 1) == *eqbeg) --eqbeg;
+               while (eqend != end && *eqend == *eqbeg) ++eqend;
+
+               // process outer elements
+               // ltend: end of the already-scanned part on the left; gtbeg: start of the unscanned part on the right
+               I ltend = eqbeg, gtbeg = eqend;
+
+               for (;;)
+               {
+                       // find the element from the right side that belongs to the left one
+                       // (elements equal to the pivot are absorbed into the equal run on the way)
+                       for (; gtbeg != end; ++gtbeg)
+                               if (!pred(*eqbeg, *gtbeg))
+                               {
+                                       if (*gtbeg == *eqbeg) swap(*gtbeg, *eqend++);
+                                       else break;
+                               }
+
+                       // find the element from the left side that belongs to the right one
+                       // (elements equal to the pivot are absorbed into the equal run on the way)
+                       for (; ltend != begin; --ltend)
+                               if (!pred(*(ltend - 1), *eqbeg))
+                               {
+                                       if (*eqbeg == *(ltend - 1)) swap(*(ltend - 1), *--eqbeg);
+                                       else break;
+                               }
+
+                       // scanned all elements
+                       if (gtbeg == end && ltend == begin)
+                       {
+                               *out_eqbeg = eqbeg;
+                               *out_eqend = eqend;
+                               return;
+                       }
+
+                       // make room for elements by moving equal area
+                       if (gtbeg == end)
+                       {
+                               // only a misplaced left element remains: slide the equal run one step left over it
+                               if (--ltend != --eqbeg) swap(*ltend, *eqbeg);
+                               swap(*eqbeg, *--eqend);
+                       }
+                       else if (ltend == begin)
+                       {
+                               // only a misplaced right element remains: slide the equal run one step right over it
+                               if (eqend != gtbeg) swap(*eqbeg, *eqend);
+                               ++eqend;
+                               swap(*gtbeg++, *eqbeg++);
+                       }
+                       // one misplaced element on each side: exchange them directly
+                       else swap(*gtbeg++, *--ltend);
+               }
+       }
+
+       template <typename I, typename Pred> void median3(I first, I middle, I last, const Pred& pred)
+       {
+               if (pred(*middle, *first)) swap(*middle, *first);
+               if (pred(*last, *middle)) swap(*last, *middle);
+               if (pred(*middle, *first)) swap(*middle, *first);
+       }
+
+       template <typename I, typename Pred> void median(I first, I middle, I last, const Pred& pred)
+       {
+               if (last - first <= 40)
+               {
+                       // median of three for small chunks
+                       median3(first, middle, last, pred);
+               }
+               else
+               {
+                       // median of nine
+                       size_t step = (last - first + 1) / 8;
+
+                       median3(first, first + step, first + 2 * step, pred);
+                       median3(middle - step, middle, middle + step, pred);
+                       median3(last - 2 * step, last - step, last, pred);
+                       median3(first + step, middle, last - step, pred);
+               }
+       }
+
+       template <typename I, typename Pred> void sort(I begin, I end, const Pred& pred)
+       {
+               // sort large chunks
+               while (end - begin > 32)
+               {
+                       // find median element
+                       I middle = begin + (end - begin) / 2;
+                       median(begin, middle, end - 1, pred);
+
+                       // partition in three chunks (< = >)
+                       I eqbeg, eqend;
+                       partition(begin, middle, end, pred, &eqbeg, &eqend);
+
+                       // loop on larger half
+                       if (eqbeg - begin > end - eqend)
+                       {
+                               sort(eqend, end, pred);
+                               end = eqbeg;
+                       }
+                       else
+                       {
+                               sort(begin, eqbeg, pred);
+                               begin = eqend;
+                       }
+               }
+
+               // insertion sort small chunk
+               if (begin != end) insertion_sort(begin, end, pred, &*begin);
+       }
+PUGI__NS_END
+
+// Allocator used for AST and evaluation stacks
+PUGI__NS_BEGIN
+       // One page of storage used by xpath_allocator; pages form a singly linked chain through `next`.
+       struct xpath_memory_block
+       {       
+               // next page in the chain (0 terminates the chain)
+               xpath_memory_block* next;
+               // number of bytes usable through `data`; the allocator may create pages larger than sizeof(data)
+               size_t capacity;
+
+               // payload bytes: PUGIXML_MEMORY_XPATH_PAGE_SIZE when that macro is defined, 4096 otherwise
+               char data[
+       #ifdef PUGIXML_MEMORY_XPATH_PAGE_SIZE
+                       PUGIXML_MEMORY_XPATH_PAGE_SIZE
+       #else
+                       4096
+       #endif
+               ];
+       };
+               
+       // Bump allocator over a chain of xpath_memory_block pages.
+       // _root is the page currently being filled (the head of the chain) and _root_size is the
+       // number of bytes already handed out from it. Individual objects are never freed; memory is
+       // reclaimed page-wise through revert() and release().
+       class xpath_allocator
+       {
+               xpath_memory_block* _root;
+               size_t _root_size;
+
+       public:
+       #ifdef PUGIXML_NO_EXCEPTIONS
+               // jump target used by allocate() to report out-of-memory when exceptions are disabled
+               jmp_buf* error_handler;
+       #endif
+
+               // Starts allocating from `root`, with `root_size` bytes of it considered already in use.
+               xpath_allocator(xpath_memory_block* root, size_t root_size = 0): _root(root), _root_size(root_size)
+               {
+               #ifdef PUGIXML_NO_EXCEPTIONS
+                       error_handler = 0;
+               #endif
+               }
+               
+               // Returns `size` bytes (rounded up to a multiple of sizeof(void*)), or 0 when a new page
+               // is needed and xml_memory::allocate fails.
+               void* allocate_nothrow(size_t size)
+               {
+                       // align size so that we're able to store pointers in subsequent blocks
+                       size = (size + sizeof(void*) - 1) & ~(sizeof(void*) - 1);
+
+                       if (_root_size + size <= _root->capacity)
+                       {
+                               // fits into the current page: hand out the next free bytes
+                               void* buf = _root->data + _root_size;
+                               _root_size += size;
+                               return buf;
+                       }
+                       else
+                       {
+                               // make sure we have at least 1/4th of the page free after allocation to satisfy subsequent allocation requests
+                               size_t block_capacity_base = sizeof(_root->data);
+                               size_t block_capacity_req = size + block_capacity_base / 4;
+                               size_t block_capacity = (block_capacity_base > block_capacity_req) ? block_capacity_base : block_capacity_req;
+
+                               // total allocation = page header (everything before `data`) + payload capacity
+                               size_t block_size = block_capacity + offsetof(xpath_memory_block, data);
+
+                               xpath_memory_block* block = static_cast<xpath_memory_block*>(xml_memory::allocate(block_size));
+                               if (!block) return 0;
+                               
+                               // link the new page in front of the chain
+                               block->next = _root;
+                               block->capacity = block_capacity;
+                               
+                               _root = block;
+                               _root_size = size;
+                               
+                               return block->data;
+                       }
+               }
+
+               // Like allocate_nothrow, but never returns 0: on failure it longjmps through
+               // error_handler (PUGIXML_NO_EXCEPTIONS) or throws std::bad_alloc.
+               void* allocate(size_t size)
+               {
+                       void* result = allocate_nothrow(size);
+
+                       if (!result)
+                       {
+                       #ifdef PUGIXML_NO_EXCEPTIONS
+                               assert(error_handler);
+                               longjmp(*error_handler, 1);
+                       #else
+                               throw std::bad_alloc();
+                       #endif
+                       }
+
+                       return result;
+               }
+
+               // Grows the most recently allocated object from old_size to new_size bytes.
+               // `ptr` may be 0 (plain allocation); otherwise it must be the last object handed out
+               // (asserted). Returns the possibly relocated object; the first old_size bytes are
+               // preserved when it moves.
+               void* reallocate(void* ptr, size_t old_size, size_t new_size)
+               {
+                       // align size so that we're able to store pointers in subsequent blocks
+                       old_size = (old_size + sizeof(void*) - 1) & ~(sizeof(void*) - 1);
+                       new_size = (new_size + sizeof(void*) - 1) & ~(sizeof(void*) - 1);
+
+                       // we can only reallocate the last object
+                       assert(ptr == 0 || static_cast<char*>(ptr) + old_size == _root->data + _root_size);
+
+                       // adjust root size so that we have not allocated the object at all
+                       // (only_object must be computed before _root_size is rolled back)
+                       bool only_object = (_root_size == old_size);
+
+                       if (ptr) _root_size -= old_size;
+
+                       // allocate a new version (this will obviously reuse the memory if possible)
+                       void* result = allocate(new_size);
+                       assert(result);
+
+                       // we have a new block
+                       if (result != ptr && ptr)
+                       {
+                               // copy old data
+                               assert(new_size >= old_size);
+                               memcpy(result, ptr, old_size);
+
+                               // free the previous page if it had no other objects
+                               if (only_object)
+                               {
+                                       assert(_root->data == result);
+                                       assert(_root->next);
+
+                                       xpath_memory_block* next = _root->next->next;
+
+                                       if (next)
+                                       {
+                                               // deallocate the whole page, unless it was the first one
+                                               xml_memory::deallocate(_root->next);
+                                               _root->next = next;
+                                       }
+                               }
+                       }
+
+                       return result;
+               }
+
+               // Rolls the allocator back to a previously copied state: deallocates every page added
+               // since `state` was taken and restores its page pointer and fill level.
+               void revert(const xpath_allocator& state)
+               {
+                       // free all new pages
+                       xpath_memory_block* cur = _root;
+
+                       while (cur != state._root)
+                       {
+                               xpath_memory_block* next = cur->next;
+
+                               xml_memory::deallocate(cur);
+
+                               cur = next;
+                       }
+
+                       // restore state
+                       _root = state._root;
+                       _root_size = state._root_size;
+               }
+
+               // Deallocates every page in the chain except the final one (the page whose next is 0);
+               // _root and _root_size are left untouched.
+               void release()
+               {
+                       xpath_memory_block* cur = _root;
+                       assert(cur);
+
+                       while (cur->next)
+                       {
+                               xpath_memory_block* next = cur->next;
+
+                               xml_memory::deallocate(cur);
+
+                               cur = next;
+                       }
+               }
+       };
+
+       struct xpath_allocator_capture
+       {
+               xpath_allocator_capture(xpath_allocator* alloc): _target(alloc), _state(*alloc)
+               {
+               }
+
+               ~xpath_allocator_capture()
+               {
+                       _target->revert(_state);
+               }
+
+               xpath_allocator* _target;
+               xpath_allocator _state;
+       };
+
+       // Pair of allocator pointers named `result` and `temp`; within this section both are set by
+       // xpath_stack_data to point at its own allocators.
+       struct xpath_stack
+       {
+               xpath_allocator* result;
+               xpath_allocator* temp;
+       };
+
+       struct xpath_stack_data
+       {
+               xpath_memory_block blocks[2];
+               xpath_allocator result;
+               xpath_allocator temp;
+               xpath_stack stack;
+
+       #ifdef PUGIXML_NO_EXCEPTIONS
+               jmp_buf error_handler;
+       #endif
+
+               xpath_stack_data(): result(blocks + 0), temp(blocks + 1)
+               {
+                       blocks[0].next = blocks[1].next = 0;
+                       blocks[0].capacity = blocks[1].capacity = sizeof(blocks[0].data);
+
+                       stack.result = &result;
+                       stack.temp = &temp;
+
+               #ifdef PUGIXML_NO_EXCEPTIONS
+                       result.error_handler = temp.error_handler = &error_handler;
+               #endif
+               }
+
+               ~xpath_stack_data()
+               {
+                       result.release();
+                       temp.release();
+               }
+       };
+PUGI__NS_END
+
+// String class
+PUGI__NS_BEGIN
+       // String value used by the XPath code. It either refers to characters it does not own
+       // (_uses_heap == false) or to a buffer obtained from an xpath_allocator (_uses_heap == true),
+       // in which case the length is cached in _length_heap.
+       class xpath_string
+       {
+               // null-terminated characters; never null (the default is an empty literal)
+               const char_t* _buffer;
+               // true when _buffer is a heap string (allocator-owned or preallocated by the caller)
+               bool _uses_heap;
+               // cached length in characters; meaningful only when _uses_heap is true
+               size_t _length_heap;
+
+               // Copies `length` characters of `string` into memory from `alloc` and null-terminates the copy.
+               static char_t* duplicate_string(const char_t* string, size_t length, xpath_allocator* alloc)
+               {
+                       char_t* result = static_cast<char_t*>(alloc->allocate((length + 1) * sizeof(char_t)));
+                       assert(result);
+
+                       memcpy(result, string, length * sizeof(char_t));
+                       result[length] = 0;
+
+                       return result;
+               }
+
+               // Raw constructor used by the named factory functions below.
+               xpath_string(const char_t* buffer, bool uses_heap_, size_t length_heap): _buffer(buffer), _uses_heap(uses_heap_), _length_heap(length_heap)
+               {
+               }
+
+       public:
+               // Wraps `str` without copying it; the characters must outlive the returned object.
+               static xpath_string from_const(const char_t* str)
+               {
+                       return xpath_string(str, false, 0);
+               }
+
+               // Wraps an existing null-terminated buffer [begin, end] as a heap string without copying;
+               // `end` must point at the terminating zero (asserted).
+               static xpath_string from_heap_preallocated(const char_t* begin, const char_t* end)
+               {
+                       assert(begin <= end && *end == 0);
+
+                       return xpath_string(begin, true, static_cast<size_t>(end - begin));
+               }
+
+               // Copies [begin, end) into memory from `alloc`; an empty range yields the default empty
+               // string without allocating.
+               static xpath_string from_heap(const char_t* begin, const char_t* end, xpath_allocator* alloc)
+               {
+                       assert(begin <= end);
+
+                       size_t length = static_cast<size_t>(end - begin);
+
+                       return length == 0 ? xpath_string() : xpath_string(duplicate_string(begin, length, alloc), true, length);
+               }
+
+               // Constructs an empty, non-heap string.
+               xpath_string(): _buffer(PUGIXML_TEXT("")), _uses_heap(false), _length_heap(0)
+               {
+               }
+
+               // Appends `o` to this string, allocating from `alloc` when a new buffer is needed.
+               void append(const xpath_string& o, xpath_allocator* alloc)
+               {
+                       // skip empty sources
+                       if (!*o._buffer) return;
+
+                       // fast append for constant empty target and constant source
+                       if (!*_buffer && !_uses_heap && !o._uses_heap)
+                       {
+                               _buffer = o._buffer;
+                       }
+                       else
+                       {
+                               // need to make heap copy
+                               size_t target_length = length();
+                               size_t source_length = o.length();
+                               size_t result_length = target_length + source_length;
+
+                               // allocate new buffer
+                               // (a heap target is grown through reallocate; a non-heap target passes 0 and gets fresh memory)
+                               char_t* result = static_cast<char_t*>(alloc->reallocate(_uses_heap ? const_cast<char_t*>(_buffer) : 0, (target_length + 1) * sizeof(char_t), (result_length + 1) * sizeof(char_t)));
+                               assert(result);
+
+                               // append first string to the new buffer in case there was no reallocation
+                               if (!_uses_heap) memcpy(result, _buffer, target_length * sizeof(char_t));
+
+                               // append second string to the new buffer
+                               memcpy(result + target_length, o._buffer, source_length * sizeof(char_t));
+                               result[result_length] = 0;
+
+                               // finalize
+                               _buffer = result;
+                               _uses_heap = true;
+                               _length_heap = result_length;
+                       }
+               }
+
+               // Returns the null-terminated characters.
+               const char_t* c_str() const
+               {
+                       return _buffer;
+               }
+
+               // Returns the length in characters: the cached value for heap strings, strlength() otherwise.
+               size_t length() const
+               {
+                       return _uses_heap ? _length_heap : strlength(_buffer);
+               }
+               
+               // Returns a mutable pointer to the characters, first copying a non-heap string into
+               // memory from `alloc`.
+               char_t* data(xpath_allocator* alloc)
+               {
+                       // make private heap copy
+                       if (!_uses_heap)
+                       {
+                               size_t length_ = strlength(_buffer);
+
+                               _buffer = duplicate_string(_buffer, length_, alloc);
+                               _uses_heap = true;
+                               _length_heap = length_;
+                       }
+
+                       return const_cast<char_t*>(_buffer);
+               }
+
+               // True when the first character is the terminator.
+               bool empty() const
+               {
+                       return *_buffer == 0;
+               }
+
+               // Equality as decided by strequal() on the two buffers.
+               bool operator==(const xpath_string& o) const
+               {
+                       return strequal(_buffer, o._buffer);
+               }
+
+               // Negation of operator==.
+               bool operator!=(const xpath_string& o) const
+               {
+                       return !strequal(_buffer, o._buffer);
+               }
+
+               // True when the buffer is a heap string (see _uses_heap).
+               bool uses_heap() const
+               {
+                       return _uses_heap;
+               }
+       };
+PUGI__NS_END
+
+PUGI__NS_BEGIN
+       PUGI__FN bool starts_with(const char_t* string, const char_t* pattern)
+       {
+               while (*pattern && *string == *pattern)
+               {
+                       string++;
+                       pattern++;
+               }
+
+               return *pattern == 0;
+       }
+
+       // Finds the first occurrence of `c` in `s` using wcschr (PUGIXML_WCHAR_MODE) or strchr.
+       PUGI__FN const char_t* find_char(const char_t* s, char_t c)
+       {
+       #ifdef PUGIXML_WCHAR_MODE
+               const char_t* hit = wcschr(s, c);
+       #else
+               const char_t* hit = strchr(s, c);
+       #endif
+
+               return hit;
+       }
+
+       PUGI__FN const char_t* find_substring(const char_t* s, const char_t* p)
+       {
+       #ifdef PUGIXML_WCHAR_MODE
+               // MSVC6 wcsstr bug workaround (if s is empty it always returns 0)
+               return (*p == 0) ? s : wcsstr(s, p);
+       #else
+               return strstr(s, p);
+       #endif
+       }
+
+       // Converts symbol to lower case, if it is an ASCII one
+       // Maps 'A'..'Z' to 'a'..'z' by setting the 0x20 bit; every other value is returned unchanged.
+       PUGI__FN char_t tolower_ascii(char_t ch)
+       {
+               // the unsigned cast folds "ch < 'A'" into the single range test
+               const bool is_upper = static_cast<unsigned int>(ch - 'A') < 26;
+
+               if (is_upper)
+                       return static_cast<char_t>(ch | ' ');
+
+               return ch;
+       }
+
+       // Computes the string value of an XPath node.
+       // - attribute: its value (referenced, not copied)
+       // - pcdata/cdata/comment/pi node: its value (referenced, not copied)
+       // - document/element node: the values of all pcdata/cdata descendants concatenated in
+       //   traversal order, built with memory from `alloc`
+       // - anything else: an empty string
+       PUGI__FN xpath_string string_value(const xpath_node& na, xpath_allocator* alloc)
+       {
+               if (na.attribute())
+                       return xpath_string::from_const(na.attribute().value());
+               else
+               {
+                       xml_node n = na.node();
+
+                       switch (n.type())
+                       {
+                       case node_pcdata:
+                       case node_cdata:
+                       case node_comment:
+                       case node_pi:
+                               return xpath_string::from_const(n.value());
+                       
+                       case node_document:
+                       case node_element:
+                       {
+                               xpath_string result;
+
+                               xml_node cur = n.first_child();
+                               
+                               // iterative depth-first walk over the subtree of n; ends when cur is null or back at n
+                               while (cur && cur != n)
+                               {
+                                       if (cur.type() == node_pcdata || cur.type() == node_cdata)
+                                               result.append(xpath_string::from_const(cur.value()), alloc);
+
+                                       // descend first, then move sideways
+                                       if (cur.first_child())
+                                               cur = cur.first_child();
+                                       else if (cur.next_sibling())
+                                               cur = cur.next_sibling();
+                                       else
+                                       {
+                                               // climb until an ancestor has a next sibling, or until we are back at n
+                                               while (!cur.next_sibling() && cur != n)
+                                                       cur = cur.parent();
+
+                                               if (cur != n) cur = cur.next_sibling();
+                                       }
+                               }
+                               
+                               return result;
+                       }
+                       
+                       default:
+                               return xpath_string();
+                       }
+               }
+       }
+       
+       PUGI__FN bool node_is_before_sibling(xml_node_struct* ln, xml_node_struct* rn)
+       {
+               assert(ln->parent == rn->parent);
+
+               // there is no common ancestor (the shared parent is null), nodes are from different documents
+               if (!ln->parent) return ln < rn;
+
+               // determine sibling order
+               xml_node_struct* ls = ln;
+               xml_node_struct* rs = rn;
+
+               while (ls && rs)
+               {
+                       if (ls == rn) return true;
+                       if (rs == ln) return false;
+
+                       ls = ls->next_sibling;
+                       rs = rs->next_sibling;
+               }
+
+               // if rn sibling chain ended ln must be before rn
+               return !rs;
+       }
+       
+       // Decides whether ln comes before rn, for nodes at arbitrary depths.
+       // Both nodes are walked up towards the root until they are siblings, then
+       // node_is_before_sibling() decides. If one node is an ancestor of the other,
+       // the ancestor is reported as coming first.
+       PUGI__FN bool node_is_before(xml_node_struct* ln, xml_node_struct* rn)
+       {
+               // find common ancestor at the same depth, if any
+               xml_node_struct* lp = ln;
+               xml_node_struct* rp = rn;
+
+               // climb both chains in lockstep until the parents match or one chain runs out
+               while (lp && rp && lp->parent != rp->parent)
+               {
+                       lp = lp->parent;
+                       rp = rp->parent;
+               }
+
+               // parents are the same!
+               if (lp && rp) return node_is_before_sibling(lp, rp);
+
+               // nodes are at different depths, need to normalize heights
+               // (the chain that ran out first belongs to the node closer to the root)
+               bool left_higher = !lp;
+
+               // advance the deeper node by the remaining depth difference; only one of these loops runs
+               while (lp)
+               {
+                       lp = lp->parent;
+                       ln = ln->parent;
+               }
+
+               while (rp)
+               {
+                       rp = rp->parent;
+                       rn = rn->parent;
+               }
+
+               // one node is the ancestor of the other
+               if (ln == rn) return left_higher;
+
+               // find common ancestor... again
+               // (ln and rn are now at equal depth, so this lockstep climb ends at siblings)
+               while (ln->parent != rn->parent)
+               {
+                       ln = ln->parent;
+                       rn = rn->parent;
+               }
+
+               return node_is_before_sibling(ln, rn);
+       }
+
+       // Returns true when `parent` is non-null and is either `node` itself or reachable from
+       // `node` by following parent links.
+       PUGI__FN bool node_is_ancestor(xml_node_struct* parent, xml_node_struct* node)
+       {
+               // a null parent never matches
+               if (!parent) return false;
+
+               for (; node; node = node->parent)
+                       if (node == parent)
+                               return true;
+
+               return false;
+       }
+
+       PUGI__FN const void* document_buffer_order(const xpath_node& xnode)
+       {
+               xml_node_struct* node = xnode.node().internal_object();
+
+               if (node)
+               {
+                       if ((get_document(node).header & xml_memory_page_contents_shared_mask) == 0)
+                       {
+                               if (node->name && (node->header & impl::xml_memory_page_name_allocated_or_shared_mask) == 0) return node->name;
+                               if (node->value && (node->header & impl::xml_memory_page_value_allocated_or_shared_mask) == 0) return node->value;
+                       }
+
+                       return 0;
+               }
+
+               xml_attribute_struct* attr = xnode.attribute().internal_object();
+
+               if (attr)
+               {
+                       if ((get_document(attr).header & xml_memory_page_contents_shared_mask) == 0)
+                       {
+                               if ((attr->header & impl::xml_memory_page_name_allocated_or_shared_mask) == 0) return attr->name;
+                               if ((attr->header & impl::xml_memory_page_value_allocated_or_shared_mask) == 0) return attr->value;
+                       }
+
+                       return 0;
+               }
+
+               return 0;
+       }
+       
+       struct document_order_comparator
+       {
+               bool operator()(const xpath_node& lhs, const xpath_node& rhs) const
+               {
+                       // optimized document order based check
+                       const void* lo = document_buffer_order(lhs);
+                       const void* ro = document_buffer_order(rhs);
+
+                       if (lo && ro) return lo < ro;
+
+                       // slow comparison
+                       xml_node ln = lhs.node(), rn = rhs.node();
+
+                       // compare attributes
+                       if (lhs.attribute() && rhs.attribute())
+                       {
+                               // shared parent
+                               if (lhs.parent() == rhs.parent())
+                               {
+                                       // determine sibling order
+                                       for (xml_attribute a = lhs.attribute(); a; a = a.next_attribute())
+                                               if (a == rhs.attribute())
+                                                       return true;
+                                       
+                                       return false;
+                               }
+                               
+                               // compare attribute parents
+                               ln = lhs.parent();
+                               rn = rhs.parent();
+                       }
+                       else if (lhs.attribute())
+                       {
+                               // attributes go after the parent element
+                               if (lhs.parent() == rhs.node()) return false;
+                               
+                               ln = lhs.parent();
+                       }
+                       else if (rhs.attribute())
+                       {
+                               // attributes go after the parent element
+                               if (rhs.parent() == lhs.node()) return true;
+                               
+                               rn = rhs.parent();
+                       }
+
+                       if (ln == rn) return false;
+
+                       if (!ln || !rn) return ln < rn;
+                       
+                       return node_is_before(ln.internal_object(), rn.internal_object());
+               }
+       };
+
+       struct duplicate_comparator
+       {
+               bool operator()(const xpath_node& lhs, const xpath_node& rhs) const
+               {
+                       if (lhs.attribute()) return rhs.attribute() ? lhs.attribute() < rhs.attribute() : true;
+                       else return rhs.attribute() ? false : lhs.node() < rhs.node();
+               }
+       };
+       
+       // Produces a NaN value.
+       // When the float format is detected as IEEE single precision at compile time, the bit pattern
+       // 0x7fc00000 is stored through a union and read back as a float; otherwise 0.0 / 0.0 is evaluated at runtime.
+       PUGI__FN double gen_nan()
+       {
+       #if defined(__STDC_IEC_559__) || ((FLT_RADIX - 0 == 2) && (FLT_MAX_EXP - 0 == 128) && (FLT_MANT_DIG - 0 == 24))
+               // the array bound becomes -1 (ill-formed) unless float and uint32_t have the same size,
+               // so the declaration doubles as a compile-time size check
+               union { float f; uint32_t i; } u[sizeof(float) == sizeof(uint32_t) ? 1 : -1];
+               u[0].i = 0x7fc00000;
+               return u[0].f;
+       #else
+               // fallback
+               // volatile presumably keeps the compiler from folding or diagnosing the division by zero
+               const volatile double zero = 0.0;
+               return zero / zero;
+       #endif
+       }
+       
+       // Returns true when value is NaN.
+       // Uses _isnan on MSVC/Borland, fpclassify when that macro is available, and a self-comparison otherwise.
+       PUGI__FN bool is_nan(double value)
+       {
+       #if defined(PUGI__MSVC_CRT_VERSION) || defined(__BORLANDC__)
+               // !! folds the result of _isnan to bool
+               return !!_isnan(value);
+       #elif defined(fpclassify) && defined(FP_NAN)
+               return fpclassify(value) == FP_NAN;
+       #else
+               // fallback
+               // NaN compares unequal to itself; the volatile copy presumably prevents the comparison from being optimized away
+               const volatile double v = value;
+               return v != v;
+       #endif
+       }
+       
+       // Returns a constant string for values that have a fixed textual form:
+       // "NaN", "Infinity", "-Infinity" or "0" (zero of either sign maps to "0").
+       // Returns 0 for any other value, which tells the caller that regular formatting is required.
+       PUGI__FN const char_t* convert_number_to_string_special(double value)
+       {
+       #if defined(PUGI__MSVC_CRT_VERSION) || defined(__BORLANDC__)
+               // finite values: only zero is special
+               if (_finite(value)) return (value == 0) ? PUGIXML_TEXT("0") : 0;
+               if (_isnan(value)) return PUGIXML_TEXT("NaN");
+               // not finite and not NaN: an infinity, distinguished by sign
+               return value > 0 ? PUGIXML_TEXT("Infinity") : PUGIXML_TEXT("-Infinity");
+       #elif defined(fpclassify) && defined(FP_NAN) && defined(FP_INFINITE) && defined(FP_ZERO)
+               switch (fpclassify(value))
+               {
+               case FP_NAN:
+                       return PUGIXML_TEXT("NaN");
+
+               case FP_INFINITE:
+                       return value > 0 ? PUGIXML_TEXT("Infinity") : PUGIXML_TEXT("-Infinity");
+
+               case FP_ZERO:
+                       return PUGIXML_TEXT("0");
+
+               default:
+                       return 0;
+               }
+       #else
+               // fallback
+               const volatile double v = value;
+
+               if (v == 0) return PUGIXML_TEXT("0");
+               if (v != v) return PUGIXML_TEXT("NaN");
+               // zero and NaN are already handled above, so doubling leaves the value unchanged only for infinities
+               if (v * 2 == v) return value > 0 ? PUGIXML_TEXT("Infinity") : PUGIXML_TEXT("-Infinity");
+               return 0;
+       #endif
+       }
+       
+       // Number to boolean conversion: zero and NaN map to false, every other value maps to true.
+       PUGI__FN bool convert_number_to_boolean(double value)
+       {
+               if (value == 0) return false;
+
+               return !is_nan(value);
+       }
+       
+       // Drops trailing '0' characters from the range [begin, end) and writes a terminating zero
+       // right after the last character that is kept (at end itself when nothing is dropped).
+       PUGI__FN void truncate_zeros(char* begin, char* end)
+       {
+               char* tail = end;
+
+               while (tail != begin && *(tail - 1) == '0') --tail;
+
+               *tail = 0;
+       }
+
+       // gets mantissa digits in the form of 0.xxxxx with 0. implied and the exponent
+       //
+       // value:        number to convert
+       // buffer:       scratch storage of buffer_size chars that receives the digits
+       // out_mantissa: receives a pointer into buffer to the zero-terminated digit string (no sign, trailing zeros removed)
+       // out_exponent: receives the decimal exponent that applies to 0.<mantissa>
+       //
+       // Two implementations are selected at compile time: _ecvt_s for MSVC CRT >= 1400 (except Windows CE),
+       // and a sprintf("%e") based one for everything else.
+#if defined(PUGI__MSVC_CRT_VERSION) && PUGI__MSVC_CRT_VERSION >= 1400 && !defined(_WIN32_WCE)
+       PUGI__FN void convert_number_to_mantissa_exponent(double value, char* buffer, size_t buffer_size, char** out_mantissa, int* out_exponent)
+       {
+               // get base values
+               // (sign is received from _ecvt_s but not used afterwards in this function)
+               int sign, exponent;
+               _ecvt_s(buffer, buffer_size, value, DBL_DIG + 1, &exponent, &sign);
+
+               // truncate redundant zeros
+               truncate_zeros(buffer, buffer + strlen(buffer));
+
+               // fill results
+               *out_mantissa = buffer;
+               *out_exponent = exponent;
+       }
+#else
+       PUGI__FN void convert_number_to_mantissa_exponent(double value, char* buffer, size_t buffer_size, char** out_mantissa, int* out_exponent)
+       {
+               // get a scientific notation value with IEEE DBL_DIG decimals
+               sprintf(buffer, "%.*e", DBL_DIG, value);
+               assert(strlen(buffer) < buffer_size);
+               // keeps buffer_size referenced when the assert above is compiled out
+               (void)!buffer_size;
+
+               // get the exponent (possibly negative)
+               char* exponent_string = strchr(buffer, 'e');
+               assert(exponent_string);
+
+               int exponent = atoi(exponent_string + 1);
+
+               // extract mantissa string: skip sign
+               char* mantissa = buffer[0] == '-' ? buffer + 1 : buffer;
+               assert(mantissa[0] != '0' && mantissa[1] == '.');
+
+               // divide mantissa by 10 to eliminate integer part
+               // (the leading digit overwrites the decimal point, so the digits become contiguous starting at mantissa + 1)
+               mantissa[1] = mantissa[0];
+               mantissa++;
+               exponent++;
+
+               // remove extra mantissa digits and zero-terminate mantissa
+               // (the terminator lands at or before the 'e', cutting the exponent text off)
+               truncate_zeros(mantissa, exponent_string);
+
+               // fill results
+               *out_mantissa = mantissa;
+               *out_exponent = exponent;
+       }
+#endif
+
+       // Formats value as a plain decimal string (no exponent notation) and returns it as an xpath_string.
+       // Special values (NaN, infinities, zero) come from convert_number_to_string_special as constant strings;
+       // every other value is built digit by digit into memory obtained from alloc.
+       PUGI__FN xpath_string convert_number_to_string(double value, xpath_allocator* alloc)
+       {
+               // try special number conversion
+               const char_t* special = convert_number_to_string_special(value);
+               if (special) return xpath_string::from_const(special);
+
+               // get mantissa + exponent form
+               char mantissa_buffer[32];
+
+               char* mantissa;
+               int exponent;
+               convert_number_to_mantissa_exponent(value, mantissa_buffer, sizeof(mantissa_buffer), &mantissa, &exponent);
+
+               // allocate a buffer of suitable length for the number
+               // (digits + zero padding implied by the exponent + 4 extra characters; the sign, a leading '0',
+               // the decimal point and the terminator are the characters emitted below besides digits and padding)
+               size_t result_size = strlen(mantissa_buffer) + (exponent > 0 ? exponent : -exponent) + 4;
+               char_t* result = static_cast<char_t*>(alloc->allocate(sizeof(char_t) * result_size));
+               assert(result);
+
+               // make the number!
+               char_t* s = result;
+
+               // sign
+               if (value < 0) *s++ = '-';
+
+               // integer part
+               if (exponent <= 0)
+               {
+                       *s++ = '0';
+               }
+               else
+               {
+                       // emit `exponent` integer digits; once the mantissa runs out the remaining positions are zeros
+                       while (exponent > 0)
+                       {
+                               assert(*mantissa == 0 || static_cast<unsigned int>(static_cast<unsigned int>(*mantissa) - '0') <= 9);
+                               *s++ = *mantissa ? *mantissa++ : '0';
+                               exponent--;
+                       }
+               }
+
+               // fractional part
+               if (*mantissa)
+               {
+                       // decimal point
+                       *s++ = '.';
+
+                       // extra zeroes from negative exponent
+                       while (exponent < 0)
+                       {
+                               *s++ = '0';
+                               exponent++;
+                       }
+
+                       // extra mantissa digits
+                       while (*mantissa)
+                       {
+                               assert(static_cast<unsigned int>(*mantissa - '0') <= 9);
+                               *s++ = *mantissa++;
+                       }
+               }
+
+               // zero-terminate
+               assert(s < result + result_size);
+               *s = 0;
+
+               // hand the buffer over together with the position of its terminator
+               return xpath_string::from_heap_preallocated(result, s);
+       }
+       
+       PUGI__FN bool check_string_to_number_format(const char_t* string)
+       {
+               // parse leading whitespace
+               while (PUGI__IS_CHARTYPE(*string, ct_space)) ++string;
+
+               // parse sign
+               if (*string == '-') ++string;
+
+               if (!*string) return false;
+
+               // if there is no integer part, there should be a decimal part with at least one digit
+               if (!PUGI__IS_CHARTYPEX(string[0], ctx_digit) && (string[0] != '.' || !PUGI__IS_CHARTYPEX(string[1], ctx_digit))) return false;
+
+               // parse integer part
+               while (PUGI__IS_CHARTYPEX(*string, ctx_digit)) ++string;
+
+               // parse decimal part
+               if (*string == '.')
+               {
+                       ++string;
+
+                       while (PUGI__IS_CHARTYPEX(*string, ctx_digit)) ++string;
+               }
+
+               // parse trailing whitespace
+               while (PUGI__IS_CHARTYPE(*string, ct_space)) ++string;
+
+               return *string == 0;
+       }
+
+       // Converts a zero-terminated string to a number.
+       // Strings rejected by check_string_to_number_format yield NaN (gen_nan); accepted strings are
+       // parsed by wcstod in wide-character mode and by atof otherwise.
+       PUGI__FN double convert_string_to_number(const char_t* string)
+       {
+               // check string format
+               if (!check_string_to_number_format(string)) return gen_nan();
+
+               // parse string
+       #ifdef PUGIXML_WCHAR_MODE
+               return wcstod(string, 0);
+       #else
+               return atof(string);
+       #endif
+       }
+
+       PUGI__FN bool convert_string_to_number_scratch(char_t (&buffer)[32], const char_t* begin, const char_t* end, double* out_result)
+       {
+               size_t length = static_cast<size_t>(end - begin);
+               char_t* scratch = buffer;
+
+               if (length >= sizeof(buffer) / sizeof(buffer[0]))
+               {
+                       // need to make dummy on-heap copy
+                       scratch = static_cast<char_t*>(xml_memory::allocate((length + 1) * sizeof(char_t)));
+                       if (!scratch) return false;
+               }
+
+               // copy string to zero-terminated buffer and perform conversion
+               memcpy(scratch, begin, length * sizeof(char_t));
+               scratch[length] = 0;
+
+               *out_result = convert_string_to_number(scratch);
+
+               // free dummy buffer
+               if (scratch != buffer) xml_memory::deallocate(scratch);
+
+               return true;
+       }
+       
+       // Rounds to the nearest integer by taking the floor of value + 0.5.
+       PUGI__FN double round_nearest(double value)
+       {
+               const double shifted = value + 0.5;
+
+               return floor(shifted);
+       }
+
+       PUGI__FN double round_nearest_nzero(double value)
+       {
+               // same as round_nearest, but returns -0 for [-0.5, -0]
+               // ceil is used to differentiate between +0 and -0 (we return -0 for [-0.5, -0] and +0 for +0)
+               return (value >= -0.5 && value <= 0) ? ceil(value) : floor(value + 0.5);
+       }
+       
+       // Returns the name of the attribute held by node if there is one, otherwise the name of its xml node.
+       PUGI__FN const char_t* qualified_name(const xpath_node& node)
+       {
+               xml_attribute attr = node.attribute();
+
+               if (attr) return attr.name();
+
+               return node.node().name();
+       }
+       
+       PUGI__FN const char_t* local_name(const xpath_node& node)
+       {
+               const char_t* name = qualified_name(node);
+               const char_t* p = find_char(name, ':');
+               
+               return p ? p + 1 : name;
+       }
+
+       struct namespace_uri_predicate
+       {
+               const char_t* prefix;
+               size_t prefix_length;
+
+               namespace_uri_predicate(const char_t* name)
+               {
+                       const char_t* pos = find_char(name, ':');
+
+                       prefix = pos ? name : 0;
+                       prefix_length = pos ? static_cast<size_t>(pos - name) : 0;
+               }
+
+               bool operator()(xml_attribute a) const
+               {
+                       const char_t* name = a.name();
+
+                       if (!starts_with(name, PUGIXML_TEXT("xmlns"))) return false;
+
+                       return prefix ? name[5] == ':' && strequalrange(name + 6, prefix, prefix_length) : name[5] == 0;
+               }
+       };
+
+       PUGI__FN const char_t* namespace_uri(xml_node node)
+       {
+               namespace_uri_predicate pred = node.name();
+               
+               xml_node p = node;
+               
+               while (p)
+               {
+                       xml_attribute a = p.find_attribute(pred);
+                       
+                       if (a) return a.value();
+                       
+                       p = p.parent();
+               }
+               
+               return PUGIXML_TEXT("");
+       }
+
+       PUGI__FN const char_t* namespace_uri(xml_attribute attr, xml_node parent)
+       {
+               namespace_uri_predicate pred = attr.name();
+               
+               // Default namespace does not apply to attributes
+               if (!pred.prefix) return PUGIXML_TEXT("");
+               
+               xml_node p = parent;
+               
+               while (p)
+               {
+                       xml_attribute a = p.find_attribute(pred);
+                       
+                       if (a) return a.value();
+                       
+                       p = p.parent();
+               }
+               
+               return PUGIXML_TEXT("");
+       }
+
+       // Dispatches to the attribute or element overload depending on what the xpath node holds.
+       PUGI__FN const char_t* namespace_uri(const xpath_node& node)
+       {
+               if (node.attribute()) return namespace_uri(node.attribute(), node.parent());
+
+               return namespace_uri(node.node());
+       }
+
+       PUGI__FN void normalize_space(char_t* buffer)
+       {
+               char_t* write = buffer;
+
+               for (char_t* it = buffer; *it; )
+               {
+                       char_t ch = *it++;
+
+                       if (PUGI__IS_CHARTYPE(ch, ct_space))
+                       {
+                               // replace whitespace sequence with single space
+                               while (PUGI__IS_CHARTYPE(*it, ct_space)) it++;
+
+                               // avoid leading spaces
+                               if (write != buffer) *write++ = ' ';
+                       }
+                       else *write++ = ch;
+               }
+
+               // remove trailing space
+               if (write != buffer && PUGI__IS_CHARTYPE(write[-1], ct_space)) write--;
+
+               // zero-terminate
+               *write = 0;
+       }
+
+       PUGI__FN void translate(char_t* buffer, const char_t* from, const char_t* to, size_t to_length)
+       {
+               char_t* write = buffer;
+
+               while (*buffer)
+               {
+                       PUGI__DMC_VOLATILE char_t ch = *buffer++;
+
+                       const char_t* pos = find_char(from, ch);
+
+                       if (!pos)
+                               *write++ = ch; // do not process
+                       else if (static_cast<size_t>(pos - from) < to_length)
+                               *write++ = to[pos - from]; // replace
+               }
+
+               // zero-terminate
+               *write = 0;
+       }
+
+       PUGI__FN unsigned char* translate_table_generate(xpath_allocator* alloc, const char_t* from, const char_t* to)
+       {
+               unsigned char table[128] = {0};
+
+               while (*from)
+               {
+                       unsigned int fc = static_cast<unsigned int>(*from);
+                       unsigned int tc = static_cast<unsigned int>(*to);
+
+                       if (fc >= 128 || tc >= 128)
+                               return 0;
+
+                       // code=128 means "skip character"
+                       if (!table[fc])
+                               table[fc] = static_cast<unsigned char>(tc ? tc : 128);
+
+                       from++;
+                       if (tc) to++;
+               }
+
+               for (int i = 0; i < 128; ++i)
+                       if (!table[i])
+                               table[i] = static_cast<unsigned char>(i);
+
+               void* result = alloc->allocate_nothrow(sizeof(table));
+
+               if (result)
+               {
+                       memcpy(result, table, sizeof(table));
+               }
+
+               return static_cast<unsigned char*>(result);
+       }
+
+       // Applies a 128-entry translation table (as built by translate_table_generate) to a zero-terminated
+       // buffer in place. Characters with code >= 128 are copied unchanged; characters below 128 are replaced
+       // by table[code], or removed when the table entry is 128.
+       PUGI__FN void translate_table(char_t* buffer, const unsigned char* table)
+       {
+               char_t* write = buffer;
+
+               while (*buffer)
+               {
+                       char_t ch = *buffer++;
+                       unsigned int index = static_cast<unsigned int>(ch);
+
+                       if (index < 128)
+                       {
+                               unsigned char code = table[index];
+
+                               // code=128 means "skip character" (table size is 128 so 128 can be a special value)
+                               // this code skips these characters without extra branches
+                               // (code >> 7 is 1 only for 128, so write does not advance and the stored value
+                               // is overwritten by the next store)
+                               *write = static_cast<char_t>(code);
+                               write += 1 - (code >> 7);
+                       }
+                       else
+                       {
+                               *write++ = ch;
+                       }
+               }
+
+               // zero-terminate
+               *write = 0;
+       }
+
+       // Returns false for namespace declarations ("xmlns" or "xmlns:..."), true for any other attribute name.
+       inline bool is_xpath_attribute(const char_t* name)
+       {
+               if (!starts_with(name, PUGIXML_TEXT("xmlns"))) return true;
+
+               // "xmlns" followed by anything other than end-of-string or ':' is an ordinary name
+               return name[5] != 0 && name[5] != ':';
+       }
+
+       // Storage for a boolean XPath variable; the value starts out as false.
+       struct xpath_variable_boolean: xpath_variable
+       {
+               xpath_variable_boolean(): value(false)
+               {
+               }
+
+               bool value;
+               // first character of the variable name; new_xpath_variable allocates extra space
+               // behind the object and copies the whole zero-terminated name here
+               char_t name[1];
+       };
+
+       // Storage for a numeric XPath variable; the value starts out as 0.
+       struct xpath_variable_number: xpath_variable
+       {
+               xpath_variable_number(): value(0)
+               {
+               }
+
+               double value;
+               // first character of the variable name; new_xpath_variable allocates extra space
+               // behind the object and copies the whole zero-terminated name here
+               char_t name[1];
+       };
+
+       // Storage for a string XPath variable; the value starts out as a null pointer.
+       // A non-null value is released with xml_memory::deallocate when the variable is destroyed.
+       struct xpath_variable_string: xpath_variable
+       {
+               xpath_variable_string(): value(0)
+               {
+               }
+
+               ~xpath_variable_string()
+               {
+                       if (value) xml_memory::deallocate(value);
+               }
+
+               char_t* value;
+               // first character of the variable name; new_xpath_variable allocates extra space
+               // behind the object and copies the whole zero-terminated name here
+               char_t name[1];
+       };
+
+       // Storage for a node set XPath variable; the value is a default-constructed xpath_node_set.
+       struct xpath_variable_node_set: xpath_variable
+       {
+               xpath_node_set value;
+               // first character of the variable name; new_xpath_variable allocates extra space
+               // behind the object and copies the whole zero-terminated name here
+               char_t name[1];
+       };
+
+       // Shared default-constructed node set.
+       // NOTE(review): presumably handed out where a node set is requested but none is available - confirm against its users.
+       static const xpath_node_set dummy_node_set;
+
+       PUGI__FN unsigned int hash_string(const char_t* str)
+       {
+               // Jenkins one-at-a-time hash (http://en.wikipedia.org/wiki/Jenkins_hash_function#one-at-a-time)
+               unsigned int result = 0;
+
+               while (*str)
+               {
+                       result += static_cast<unsigned int>(*str++);
+                       result += result << 10;
+                       result ^= result >> 6;
+               }
+       
+               result += result << 3;
+               result ^= result >> 11;
+               result += result << 15;
+       
+               return result;
+       }
+
+       // Allocates and constructs a variable object of type T with the given name stored inline after it.
+       // Returns 0 when name is empty or when the allocation fails.
+       // The object must be released with delete_xpath_variable, which runs the destructor and frees the block.
+       template <typename T> PUGI__FN T* new_xpath_variable(const char_t* name)
+       {
+               size_t length = strlength(name);
+               if (length == 0) return 0; // empty variable names are invalid
+
+               // $$ we can't use offsetof(T, name) because T is non-POD, so we just allocate additional length characters
+               // (T::name already provides one character, which accounts for the terminator)
+               void* memory = xml_memory::allocate(sizeof(T) + length * sizeof(char_t));
+               if (!memory) return 0;
+
+               // construct T in the raw block
+               T* result = new (memory) T();
+
+               // copy the name including its terminating zero into the trailing storage
+               memcpy(result->name, name, (length + 1) * sizeof(char_t));
+
+               return result;
+       }
+
+       PUGI__FN xpath_variable* new_xpath_variable(xpath_value_type type, const char_t* name)
+       {
+               switch (type)
+               {
+               case xpath_type_node_set:
+                       return new_xpath_variable<xpath_variable_node_set>(name);
+
+               case xpath_type_number:
+                       return new_xpath_variable<xpath_variable_number>(name);
+
+               case xpath_type_string:
+                       return new_xpath_variable<xpath_variable_string>(name);
+
+               case xpath_type_boolean:
+                       return new_xpath_variable<xpath_variable_boolean>(name);
+
+               default:
+                       return 0;
+               }
+       }
+
+       // Destroys a variable object of concrete type T: runs its destructor explicitly, then releases
+       // the memory block with xml_memory::deallocate.
+       template <typename T> PUGI__FN void delete_xpath_variable(T* var)
+       {
+               var->~T();
+               xml_memory::deallocate(var);
+       }
+
+       PUGI__FN void delete_xpath_variable(xpath_value_type type, xpath_variable* var)
+       {
+               switch (type)
+               {
+               case xpath_type_node_set:
+                       delete_xpath_variable(static_cast<xpath_variable_node_set*>(var));
+                       break;
+
+               case xpath_type_number:
+                       delete_xpath_variable(static_cast<xpath_variable_number*>(var));
+                       break;
+
+               case xpath_type_string:
+                       delete_xpath_variable(static_cast<xpath_variable_string*>(var));
+                       break;
+
+               case xpath_type_boolean:
+                       delete_xpath_variable(static_cast<xpath_variable_boolean*>(var));
+                       break;
+
+               default:
+                       assert(!"Invalid variable type");
+               }
+       }
+
+       PUGI__FN xpath_variable* get_variable_scratch(char_t (&buffer)[32], xpath_variable_set* set, const char_t* begin, const char_t* end)
+       {
+               size_t length = static_cast<size_t>(end - begin);
+               char_t* scratch = buffer;
+
+               if (length >= sizeof(buffer) / sizeof(buffer[0]))
+               {
+                       // need to make dummy on-heap copy
+                       scratch = static_cast<char_t*>(xml_memory::allocate((length + 1) * sizeof(char_t)));
+                       if (!scratch) return 0;
+               }
+
+               // copy string to zero-terminated buffer and perform lookup
+               memcpy(scratch, begin, length * sizeof(char_t));
+               scratch[length] = 0;
+
+               xpath_variable* result = set->get(scratch);
+
+               // free dummy buffer
+               if (scratch != buffer) xml_memory::deallocate(scratch);
+
+               return result;
+       }
+PUGI__NS_END
+
+// Internal node set class
+PUGI__NS_BEGIN
+       PUGI__FN xpath_node_set::type_t xpath_get_order(const xpath_node* begin, const xpath_node* end)
+       {
+               if (end - begin < 2)
+                       return xpath_node_set::type_sorted;
+
+               document_order_comparator cmp;
+
+               bool first = cmp(begin[0], begin[1]);
+
+               for (const xpath_node* it = begin + 1; it + 1 < end; ++it)
+                       if (cmp(it[0], it[1]) != first)
+                               return xpath_node_set::type_unsorted;
+
+               return first ? xpath_node_set::type_sorted : xpath_node_set::type_sorted_reverse;
+       }
+
+       PUGI__FN xpath_node_set::type_t xpath_sort(xpath_node* begin, xpath_node* end, xpath_node_set::type_t type, bool rev)
+       {
+               xpath_node_set::type_t order = rev ? xpath_node_set::type_sorted_reverse : xpath_node_set::type_sorted;
+
+               if (type == xpath_node_set::type_unsorted)
+               {
+                       xpath_node_set::type_t sorted = xpath_get_order(begin, end);
+
+                       if (sorted == xpath_node_set::type_unsorted)
+                       {
+                               sort(begin, end, document_order_comparator());
+
+                               type = xpath_node_set::type_sorted;
+                       }
+                       else
+                               type = sorted;
+               }
+               
+               if (type != order) reverse(begin, end);
+                       
+               return order;
+       }
+
+       PUGI__FN xpath_node xpath_first(const xpath_node* begin, const xpath_node* end, xpath_node_set::type_t type)
+       {
+               if (begin == end) return xpath_node();
+
+               switch (type)
+               {
+               case xpath_node_set::type_sorted:
+                       return *begin;
+
+               case xpath_node_set::type_sorted_reverse:
+                       return *(end - 1);
+
+               case xpath_node_set::type_unsorted:
+                       return *min_element(begin, end, document_order_comparator());
+
+               default:
+                       assert(!"Invalid node set type");
+                       return xpath_node();
+               }
+       }
+
+       // Growable array of xpath_node values whose storage comes from an xpath_allocator.
+       // The class tracks the ordering type of its contents; it starts out empty and marked unsorted.
+       // No destructor is declared here, so storage is not released by this class.
+       class xpath_node_set_raw
+       {
+               xpath_node_set::type_t _type;
+
+               // [_begin, _end) holds the elements, [_begin, _eos) is the allocated capacity
+               xpath_node* _begin;
+               xpath_node* _end;
+               xpath_node* _eos;
+
+       public:
+               xpath_node_set_raw(): _type(xpath_node_set::type_unsorted), _begin(0), _end(0), _eos(0)
+               {
+               }
+
+               // Pointer to the first element.
+               xpath_node* begin() const
+               {
+                       return _begin;
+               }
+
+               // Pointer one past the last element.
+               xpath_node* end() const
+               {
+                       return _end;
+               }
+
+               bool empty() const
+               {
+                       return _begin == _end;
+               }
+
+               // Number of stored elements.
+               size_t size() const
+               {
+                       return static_cast<size_t>(_end - _begin);
+               }
+
+               // First node in document order according to the current ordering type (see xpath_first).
+               xpath_node first() const
+               {
+                       return xpath_first(_begin, _end, _type);
+               }
+
+               // Out-of-line slow path of push_back, taken when the capacity is exhausted.
+               void push_back_grow(const xpath_node& node, xpath_allocator* alloc);
+
+               // Appends one node; grows the storage through alloc when it is full.
+               // The ordering type is not updated.
+               void push_back(const xpath_node& node, xpath_allocator* alloc)
+               {
+                       if (_end != _eos)
+                               *_end++ = node;
+                       else
+                               push_back_grow(node, alloc);
+               }
+
+               // Appends the range [begin_, end_); when the capacity is insufficient the storage is
+               // reallocated through alloc to exactly the required size. The ordering type is not updated.
+               void append(const xpath_node* begin_, const xpath_node* end_, xpath_allocator* alloc)
+               {
+                       if (begin_ == end_) return;
+
+                       size_t size_ = static_cast<size_t>(_end - _begin);
+                       size_t capacity = static_cast<size_t>(_eos - _begin);
+                       size_t count = static_cast<size_t>(end_ - begin_);
+
+                       if (size_ + count > capacity)
+                       {
+                               // reallocate the old array or allocate a new one
+                               xpath_node* data = static_cast<xpath_node*>(alloc->reallocate(_begin, capacity * sizeof(xpath_node), (size_ + count) * sizeof(xpath_node)));
+                               assert(data);
+
+                               // finalize
+                               _begin = data;
+                               _end = data + size_;
+                               _eos = data + size_ + count;
+                       }
+
+                       // elements are copied as raw bytes
+                       memcpy(_end, begin_, count * sizeof(xpath_node));
+                       _end += count;
+               }
+
+               // Sorts the elements into document order and records the resulting ordering type.
+               void sort_do()
+               {
+                       _type = xpath_sort(_begin, _end, _type, false);
+               }
+
+               // Drops all elements from pos onwards; pos must lie within [_begin, _end].
+               void truncate(xpath_node* pos)
+               {
+                       assert(_begin <= pos && pos <= _end);
+
+                       _end = pos;
+               }
+
+               // Removes duplicate nodes. unique only collapses adjacent equal elements, so an unsorted set
+               // is first sorted with duplicate_comparator to bring equal nodes together; the ordering type
+               // is left unchanged.
+               void remove_duplicates()
+               {
+                       if (_type == xpath_node_set::type_unsorted)
+                               sort(_begin, _end, duplicate_comparator());
+               
+                       _end = unique(_begin, _end);
+               }
+
+               // Current ordering type of the stored elements.
+               xpath_node_set::type_t type() const
+               {
+                       return _type;
+               }
+
+               // Overrides the recorded ordering type without touching the elements.
+               void set_type(xpath_node_set::type_t value)
+               {
+                       _type = value;
+               }
+       };
+
+       PUGI__FN_NO_INLINE void xpath_node_set_raw::push_back_grow(const xpath_node& node, xpath_allocator* alloc)
+       {
+               size_t capacity = static_cast<size_t>(_eos - _begin);
+
+               // get new capacity (1.5x rule)
+               size_t new_capacity = capacity + capacity / 2 + 1;
+
+               // reallocate the old array or allocate a new one
+               xpath_node* data = static_cast<xpath_node*>(alloc->reallocate(_begin, capacity * sizeof(xpath_node), new_capacity * sizeof(xpath_node)));
+               assert(data);
+
+               // finalize
+               _begin = data;
+               _end = data + capacity;
+               _eos = data + new_capacity;
+
+               // push
+               *_end++ = node;
+       }
+PUGI__NS_END
+
+PUGI__NS_BEGIN
+       // Bundle of an xpath node with a position and a size value; the constructor stores its
+       // three arguments unchanged.
+       // NOTE(review): presumably the XPath evaluation context (context node, context position, context size) - confirm against users.
+       struct xpath_context
+       {
+               xpath_node n;
+               size_t position, size;
+
+               xpath_context(const xpath_node& n_, size_t position_, size_t size_): n(n_), position(position_), size(size_)
+               {
+               }
+       };
+
+       // Token kinds used by xpath_lexer for the current lexeme.
+       // lex_none is explicitly 0; the remaining enumerators follow in declaration order.
+       enum lexeme_t
+       {
+               lex_none = 0,
+               lex_equal,
+               lex_not_equal,
+               lex_less,
+               lex_greater,
+               lex_less_or_equal,
+               lex_greater_or_equal,
+               lex_plus,
+               lex_minus,
+               lex_multiply,
+               lex_union,
+               lex_var_ref,
+               lex_open_brace,
+               lex_close_brace,
+               lex_quoted_string,
+               lex_number,
+               lex_slash,
+               lex_double_slash,
+               lex_open_square_brace,
+               lex_close_square_brace,
+               lex_string,
+               lex_comma,
+               lex_axis_attribute,
+               lex_dot,
+               lex_double_dot,
+               lex_double_colon,
+               // set by xpath_lexer::next when the terminating zero of the query is reached
+               lex_eof
+       };
+
+       struct xpath_lexer_string
+       {
+               const char_t* begin;
+               const char_t* end;
+
+               xpath_lexer_string(): begin(0), end(0)
+               {
+               }
+
+               bool operator==(const char_t* other) const
+               {
+                       size_t length = static_cast<size_t>(end - begin);
+
+                       return strequalrange(other, begin, length);
+               }
+       };
+
+       // Tokenizer for XPath query text. It walks a null-terminated string one lexeme at a time;
+       // the kind of the current lexeme is available through current(), its start through
+       // current_pos() and, for lexemes that carry text, the text range through contents().
+       class xpath_lexer
+       {
+               // read position: where the next call to next() starts scanning
+               const char_t* _cur;
+               // start of the current lexeme (after skipped whitespace)
+               const char_t* _cur_lexeme_pos;
+               // text range of the current lexeme; only assigned for lexemes that carry text
+               xpath_lexer_string _cur_lexeme_contents;
+
+               // kind of the current lexeme
+               lexeme_t _cur_lexeme;
+
+       public:
+               // Starts at the beginning of query and immediately reads the first lexeme.
+               explicit xpath_lexer(const char_t* query): _cur(query)
+               {
+                       next();
+               }
+               
+               // Returns the read position from which the next call to next() will continue.
+               const char_t* state() const
+               {
+                       return _cur;
+               }
+               
+               // Scans one lexeme starting at the read position and stores its kind, start position
+               // and (where applicable) contents. Branches that report lex_none for a lone '!' or ':'
+               // or for an unrecognized character leave the read position on the offending character.
+               void next()
+               {
+                       const char_t* cur = _cur;
+
+                       // skip leading whitespace
+                       while (PUGI__IS_CHARTYPE(*cur, ct_space)) ++cur;
+
+                       // save lexeme position for error reporting
+                       _cur_lexeme_pos = cur;
+
+                       switch (*cur)
+                       {
+                       case 0:
+                               // end of query; cur stays on the terminator so repeated calls keep yielding lex_eof
+                               _cur_lexeme = lex_eof;
+                               break;
+                       
+                       case '>':
+                               if (*(cur+1) == '=')
+                               {
+                                       cur += 2;
+                                       _cur_lexeme = lex_greater_or_equal;
+                               }
+                               else
+                               {
+                                       cur += 1;
+                                       _cur_lexeme = lex_greater;
+                               }
+                               break;
+
+                       case '<':
+                               if (*(cur+1) == '=')
+                               {
+                                       cur += 2;
+                                       _cur_lexeme = lex_less_or_equal;
+                               }
+                               else
+                               {
+                                       cur += 1;
+                                       _cur_lexeme = lex_less;
+                               }
+                               break;
+
+                       case '!':
+                               // '!' is only accepted as the first half of '!='
+                               if (*(cur+1) == '=')
+                               {
+                                       cur += 2;
+                                       _cur_lexeme = lex_not_equal;
+                               }
+                               else
+                               {
+                                       _cur_lexeme = lex_none;
+                               }
+                               break;
+
+                       case '=':
+                               cur += 1;
+                               _cur_lexeme = lex_equal;
+
+                               break;
+                       
+                       case '+':
+                               cur += 1;
+                               _cur_lexeme = lex_plus;
+
+                               break;
+
+                       case '-':
+                               cur += 1;
+                               _cur_lexeme = lex_minus;
+
+                               break;
+
+                       case '*':
+                               cur += 1;
+                               _cur_lexeme = lex_multiply;
+
+                               break;
+
+                       case '|':
+                               cur += 1;
+                               _cur_lexeme = lex_union;
+
+                               break;
+                       
+                       case '$':
+                               // variable reference: '$' followed by a name; the contents exclude the '$'
+                               cur += 1;
+
+                               if (PUGI__IS_CHARTYPEX(*cur, ctx_start_symbol))
+                               {
+                                       _cur_lexeme_contents.begin = cur;
+
+                                       while (PUGI__IS_CHARTYPEX(*cur, ctx_symbol)) cur++;
+
+                                       if (cur[0] == ':' && PUGI__IS_CHARTYPEX(cur[1], ctx_symbol)) // qname
+                                       {
+                                               cur++; // :
+
+                                               while (PUGI__IS_CHARTYPEX(*cur, ctx_symbol)) cur++;
+                                       }
+
+                                       _cur_lexeme_contents.end = cur;
+                               
+                                       _cur_lexeme = lex_var_ref;
+                               }
+                               else
+                               {
+                                       // '$' without a name; note that the '$' itself has already been consumed
+                                       _cur_lexeme = lex_none;
+                               }
+
+                               break;
+
+                       case '(':
+                               cur += 1;
+                               _cur_lexeme = lex_open_brace;
+
+                               break;
+
+                       case ')':
+                               cur += 1;
+                               _cur_lexeme = lex_close_brace;
+
+                               break;
+                       
+                       case '[':
+                               cur += 1;
+                               _cur_lexeme = lex_open_square_brace;
+
+                               break;
+
+                       case ']':
+                               cur += 1;
+                               _cur_lexeme = lex_close_square_brace;
+
+                               break;
+
+                       case ',':
+                               cur += 1;
+                               _cur_lexeme = lex_comma;
+
+                               break;
+
+                       case '/':
+                               if (*(cur+1) == '/')
+                               {
+                                       cur += 2;
+                                       _cur_lexeme = lex_double_slash;
+                               }
+                               else
+                               {
+                                       cur += 1;
+                                       _cur_lexeme = lex_slash;
+                               }
+                               break;
+               
+                       case '.':
+                               // '..', a number that starts with the decimal point, or a single '.'
+                               if (*(cur+1) == '.')
+                               {
+                                       cur += 2;
+                                       _cur_lexeme = lex_double_dot;
+                               }
+                               else if (PUGI__IS_CHARTYPEX(*(cur+1), ctx_digit))
+                               {
+                                       _cur_lexeme_contents.begin = cur; // .
+
+                                       ++cur;
+
+                                       while (PUGI__IS_CHARTYPEX(*cur, ctx_digit)) cur++;
+
+                                       _cur_lexeme_contents.end = cur;
+                                       
+                                       _cur_lexeme = lex_number;
+                               }
+                               else
+                               {
+                                       cur += 1;
+                                       _cur_lexeme = lex_dot;
+                               }
+                               break;
+
+                       case '@':
+                               cur += 1;
+                               _cur_lexeme = lex_axis_attribute;
+
+                               break;
+
+                       case '"':
+                       case '\'':
+                       {
+                               // quoted literal: runs up to the matching quote character; no escape sequences are handled
+                               char_t terminator = *cur;
+
+                               ++cur;
+
+                               _cur_lexeme_contents.begin = cur;
+                               while (*cur && *cur != terminator) cur++;
+                               _cur_lexeme_contents.end = cur;
+                               
+                               // reaching the end of the query before the closing quote makes the lexeme invalid
+                               if (!*cur)
+                                       _cur_lexeme = lex_none;
+                               else
+                               {
+                                       cur += 1;
+                                       _cur_lexeme = lex_quoted_string;
+                               }
+
+                               break;
+                       }
+
+                       case ':':
+                               // ':' is only accepted here as the first half of '::'
+                               if (*(cur+1) == ':')
+                               {
+                                       cur += 2;
+                                       _cur_lexeme = lex_double_colon;
+                               }
+                               else
+                               {
+                                       _cur_lexeme = lex_none;
+                               }
+                               break;
+
+                       default:
+                               if (PUGI__IS_CHARTYPEX(*cur, ctx_digit))
+                               {
+                                       // number: digits, optionally followed by '.' and further (possibly zero) digits
+                                       _cur_lexeme_contents.begin = cur;
+
+                                       while (PUGI__IS_CHARTYPEX(*cur, ctx_digit)) cur++;
+                               
+                                       if (*cur == '.')
+                                       {
+                                               cur++;
+
+                                               while (PUGI__IS_CHARTYPEX(*cur, ctx_digit)) cur++;
+                                       }
+
+                                       _cur_lexeme_contents.end = cur;
+
+                                       _cur_lexeme = lex_number;
+                               }
+                               else if (PUGI__IS_CHARTYPEX(*cur, ctx_start_symbol))
+                               {
+                                       // name, optionally extended by ':*' or ':name'; a ':' followed by anything else is left unread
+                                       _cur_lexeme_contents.begin = cur;
+
+                                       while (PUGI__IS_CHARTYPEX(*cur, ctx_symbol)) cur++;
+
+                                       if (cur[0] == ':')
+                                       {
+                                               if (cur[1] == '*') // namespace test ncname:*
+                                               {
+                                                       cur += 2; // :*
+                                               }
+                                               else if (PUGI__IS_CHARTYPEX(cur[1], ctx_symbol)) // namespace test qname
+                                               {
+                                                       cur++; // :
+
+                                                       while (PUGI__IS_CHARTYPEX(*cur, ctx_symbol)) cur++;
+                                               }
+                                       }
+
+                                       _cur_lexeme_contents.end = cur;
+                               
+                                       _cur_lexeme = lex_string;
+                               }
+                               else
+                               {
+                                       // unrecognized character
+                                       _cur_lexeme = lex_none;
+                               }
+                       }
+
+                       // commit the new read position
+                       _cur = cur;
+               }
+
+               // Returns the kind of the lexeme read by the most recent next().
+               lexeme_t current() const
+               {
+                       return _cur_lexeme;
+               }
+
+               // Returns the position in the query where the current lexeme starts.
+               const char_t* current_pos() const
+               {
+                       return _cur_lexeme_pos;
+               }
+
+               // Returns the text range of the current lexeme. Only valid for the lexeme kinds listed in
+               // the assertion; for other kinds the stored range belongs to an earlier lexeme.
+               const xpath_lexer_string& contents() const
+               {
+                       assert(_cur_lexeme == lex_var_ref || _cur_lexeme == lex_number || _cur_lexeme == lex_string || _cur_lexeme == lex_quoted_string);
+
+                       return _cur_lexeme_contents;
+               }
+       };
+
+       // Kinds of nodes in the XPath expression tree; the trailing comments show the XPath construct
+       // each kind stands for, with left/right/third naming the operand sub-expressions.
+       enum ast_type_t
+       {
+               ast_unknown,
+               ast_op_or,                                              // left or right
+               ast_op_and,                                             // left and right
+               ast_op_equal,                                   // left = right
+               ast_op_not_equal,                               // left != right
+               ast_op_less,                                    // left < right
+               ast_op_greater,                                 // left > right
+               ast_op_less_or_equal,                   // left <= right
+               ast_op_greater_or_equal,                // left >= right
+               ast_op_add,                                             // left + right
+               ast_op_subtract,                                // left - right
+               ast_op_multiply,                                // left * right
+               ast_op_divide,                                  // left div right
+               ast_op_mod,                                             // left mod right
+               ast_op_negate,                                  // - left
+               ast_op_union,                                   // left | right
+               ast_predicate,                                  // apply predicate to set; next points to next predicate
+               ast_filter,                                             // select * from left where right
+               ast_string_constant,                    // string constant
+               ast_number_constant,                    // number constant
+               ast_variable,                                   // variable
+               ast_func_last,                                  // last()
+               ast_func_position,                              // position()
+               ast_func_count,                                 // count(left)
+               ast_func_id,                                    // id(left)
+               ast_func_local_name_0,                  // local-name()
+               ast_func_local_name_1,                  // local-name(left)
+               ast_func_namespace_uri_0,               // namespace-uri()
+               ast_func_namespace_uri_1,               // namespace-uri(left)
+               ast_func_name_0,                                // name()
+               ast_func_name_1,                                // name(left)
+               ast_func_string_0,                              // string()
+               ast_func_string_1,                              // string(left)
+               ast_func_concat,                                // concat(left, right, siblings)
+               ast_func_starts_with,                   // starts-with(left, right)
+               ast_func_contains,                              // contains(left, right)
+               ast_func_substring_before,              // substring-before(left, right)
+               ast_func_substring_after,               // substring-after(left, right)
+               ast_func_substring_2,                   // substring(left, right)
+               ast_func_substring_3,                   // substring(left, right, third)
+               ast_func_string_length_0,               // string-length()
+               ast_func_string_length_1,               // string-length(left)
+               ast_func_normalize_space_0,             // normalize-space()
+               ast_func_normalize_space_1,             // normalize-space(left)
+               ast_func_translate,                             // translate(left, right, third)
+               ast_func_boolean,                               // boolean(left)
+               ast_func_not,                                   // not(left)
+               ast_func_true,                                  // true()
+               ast_func_false,                                 // false()
+               ast_func_lang,                                  // lang(left)
+               ast_func_number_0,                              // number()
+               ast_func_number_1,                              // number(left)
+               ast_func_sum,                                   // sum(left)
+               ast_func_floor,                                 // floor(left)
+               ast_func_ceiling,                               // ceiling(left)
+               ast_func_round,                                 // round(left)
+               ast_step,                                               // process set left with step
+               ast_step_root,                                  // select root node
+
+               ast_opt_translate_table,                // translate(left, right, third) where right/third are constants
+               ast_opt_compare_attribute               // @name = 'string'
+       };
+
+       // Axis identifiers for location steps; the enumerator names mirror the XPath 1.0 axis names
+       // (ancestor, ancestor-or-self, attribute, ...).
+       enum axis_t
+       {
+               axis_ancestor,
+               axis_ancestor_or_self,
+               axis_attribute,
+               axis_child,
+               axis_descendant,
+               axis_descendant_or_self,
+               axis_following,
+               axis_following_sibling,
+               axis_namespace,
+               axis_parent,
+               axis_preceding,
+               axis_preceding_sibling,
+               axis_self
+       };
+       
+       // Kinds of node test applied by a step; xpath_ast_node::step_push switches on these.
+       // NOTE(review): the per-value notes other than nodetest_name are inferred from the names - confirm.
+       enum nodetest_t
+       {
+               nodetest_none,
+               nodetest_name, // compare the node/attribute name with the stored node test string
+               nodetest_type_node, // presumably node()
+               nodetest_type_comment, // presumably comment()
+               nodetest_type_pi, // presumably processing-instruction()
+               nodetest_type_text, // presumably text()
+               nodetest_pi, // presumably processing-instruction('target')
+               nodetest_all, // presumably *
+               nodetest_all_in_namespace // presumably prefix:*
+       };
+
+       // Classification of a predicate/filter expression, stored in the node's _test field and used
+       // by xpath_ast_node::apply_predicate to pick an evaluation strategy.
+       enum predicate_t
+       {
+               predicate_default, // no special handling: dispatch on the expression's return type
+               predicate_posinv, // NOTE(review): presumably "position invariant"; treated like predicate_default by apply_predicate - confirm
+               predicate_constant, // handled by apply_predicate_number_const (expression evaluated once)
+               predicate_constant_one // handled by apply_predicate_number_const as well
+       };
+
+       // Tells node set evaluation how much of the result the caller needs; eval_once() uses it
+       // together with the set ordering to decide whether evaluation may stop after one match.
+       enum nodeset_eval_t
+       {
+               nodeset_eval_all, // never stop early
+               nodeset_eval_any, // may stop after the first match regardless of set ordering
+               nodeset_eval_first // may stop after the first match only when the set is type_sorted
+       };
+
+       // Wraps a compile-time axis_t value in a distinct type; the static member axis holds N.
+       template <axis_t N> struct axis_to_type
+       {
+               static const axis_t axis;
+       };
+
+       // out-of-class definition of the static member declared above
+       template <axis_t N> const axis_t axis_to_type<N>::axis = N;
+               
+       class xpath_ast_node
+       {
+       private:
+               // node type
+               char _type;
+               char _rettype;
+
+               // for ast_step
+               char _axis;
+
+               // for ast_step/ast_predicate/ast_filter
+               char _test;
+
+               // tree node structure
+               xpath_ast_node* _left;
+               xpath_ast_node* _right;
+               xpath_ast_node* _next;
+
+               union
+               {
+                       // value for ast_string_constant
+                       const char_t* string;
+                       // value for ast_number_constant
+                       double number;
+                       // variable for ast_variable
+                       xpath_variable* variable;
+                       // node test for ast_step (node name/namespace/node type/pi target)
+                       const char_t* nodetest;
+                       // table for ast_opt_translate_table
+                       const unsigned char* table;
+               } _data;
+
+               xpath_ast_node(const xpath_ast_node&);
+               xpath_ast_node& operator=(const xpath_ast_node&);
+
+               // Evaluates an equality-style comparison between two expressions, picking the conversion
+               // from their static return types:
+               //  - neither side is a node set: compare as booleans if either side is boolean, otherwise as
+               //    numbers if either side is a number, otherwise as strings if either side is a string;
+               //  - both sides are node sets: true if comp holds for the string values of any pair of nodes;
+               //  - exactly one side is a node set: a boolean operand is compared with the boolean value of
+               //    the set; a number or string operand is compared with each node's string value
+               //    (converted to a number in the numeric case), true on the first match.
+               // Any other combination trips the "Wrong types" assertion and yields false.
+               // comp must be callable with two bools, two doubles and two xpath_string values.
+               template <class Comp> static bool compare_eq(xpath_ast_node* lhs, xpath_ast_node* rhs, const xpath_context& c, const xpath_stack& stack, const Comp& comp)
+               {
+                       xpath_value_type lt = lhs->rettype(), rt = rhs->rettype();
+
+                       if (lt != xpath_type_node_set && rt != xpath_type_node_set)
+                       {
+                               // scalar vs scalar: boolean wins over number, number wins over string
+                               if (lt == xpath_type_boolean || rt == xpath_type_boolean)
+                                       return comp(lhs->eval_boolean(c, stack), rhs->eval_boolean(c, stack));
+                               else if (lt == xpath_type_number || rt == xpath_type_number)
+                                       return comp(lhs->eval_number(c, stack), rhs->eval_number(c, stack));
+                               else if (lt == xpath_type_string || rt == xpath_type_string)
+                               {
+                                       // NOTE(review): xpath_allocator_capture presumably releases temporaries taken from
+                                       // stack.result when it leaves scope - confirm against its definition
+                                       xpath_allocator_capture cr(stack.result);
+
+                                       xpath_string ls = lhs->eval_string(c, stack);
+                                       xpath_string rs = rhs->eval_string(c, stack);
+
+                                       return comp(ls, rs);
+                               }
+                               // no branch matched: falls through to the assertion at the end of the function
+                       }
+                       else if (lt == xpath_type_node_set && rt == xpath_type_node_set)
+                       {
+                               xpath_allocator_capture cr(stack.result);
+
+                               xpath_node_set_raw ls = lhs->eval_node_set(c, stack, nodeset_eval_all);
+                               xpath_node_set_raw rs = rhs->eval_node_set(c, stack, nodeset_eval_all);
+
+                               // test every (left node, right node) pair by string value
+                               for (const xpath_node* li = ls.begin(); li != ls.end(); ++li)
+                                       for (const xpath_node* ri = rs.begin(); ri != rs.end(); ++ri)
+                                       {
+                                               xpath_allocator_capture cri(stack.result);
+
+                                               if (comp(string_value(*li, stack.result), string_value(*ri, stack.result)))
+                                                       return true;
+                                       }
+
+                               return false;
+                       }
+                       else
+                       {
+                               // exactly one side is a node set: reorder so that the node set is always rhs.
+                               // comp therefore receives (scalar, node value) even if the query had them the other way round.
+                               if (lt == xpath_type_node_set)
+                               {
+                                       swap(lhs, rhs);
+                                       swap(lt, rt);
+                               }
+
+                               if (lt == xpath_type_boolean)
+                                       return comp(lhs->eval_boolean(c, stack), rhs->eval_boolean(c, stack));
+                               else if (lt == xpath_type_number)
+                               {
+                                       xpath_allocator_capture cr(stack.result);
+
+                                       double l = lhs->eval_number(c, stack);
+                                       xpath_node_set_raw rs = rhs->eval_node_set(c, stack, nodeset_eval_all);
+
+                                       // compare the number with the numeric value of each node's string value
+                                       for (const xpath_node* ri = rs.begin(); ri != rs.end(); ++ri)
+                                       {
+                                               xpath_allocator_capture cri(stack.result);
+
+                                               if (comp(l, convert_string_to_number(string_value(*ri, stack.result).c_str())))
+                                                       return true;
+                                       }
+
+                                       return false;
+                               }
+                               else if (lt == xpath_type_string)
+                               {
+                                       xpath_allocator_capture cr(stack.result);
+
+                                       xpath_string l = lhs->eval_string(c, stack);
+                                       xpath_node_set_raw rs = rhs->eval_node_set(c, stack, nodeset_eval_all);
+
+                                       // compare the string with each node's string value
+                                       for (const xpath_node* ri = rs.begin(); ri != rs.end(); ++ri)
+                                       {
+                                               xpath_allocator_capture cri(stack.result);
+
+                                               if (comp(l, string_value(*ri, stack.result)))
+                                                       return true;
+                                       }
+
+                                       return false;
+                               }
+                       }
+
+                       // reached only when the operand types match none of the cases above
+                       assert(!"Wrong types");
+                       return false;
+               }
+
+               static bool eval_once(xpath_node_set::type_t type, nodeset_eval_t eval)
+               {
+                       return type == xpath_node_set::type_sorted ? eval != nodeset_eval_all : eval == nodeset_eval_any;
+               }
+
+               template <class Comp> static bool compare_rel(xpath_ast_node* lhs, xpath_ast_node* rhs, const xpath_context& c, const xpath_stack& stack, const Comp& comp)
+               {
+                       xpath_value_type lt = lhs->rettype(), rt = rhs->rettype();
+
+                       if (lt != xpath_type_node_set && rt != xpath_type_node_set)
+                               return comp(lhs->eval_number(c, stack), rhs->eval_number(c, stack));
+                       else if (lt == xpath_type_node_set && rt == xpath_type_node_set)
+                       {
+                               xpath_allocator_capture cr(stack.result);
+
+                               xpath_node_set_raw ls = lhs->eval_node_set(c, stack, nodeset_eval_all);
+                               xpath_node_set_raw rs = rhs->eval_node_set(c, stack, nodeset_eval_all);
+
+                               for (const xpath_node* li = ls.begin(); li != ls.end(); ++li)
+                               {
+                                       xpath_allocator_capture cri(stack.result);
+
+                                       double l = convert_string_to_number(string_value(*li, stack.result).c_str());
+
+                                       for (const xpath_node* ri = rs.begin(); ri != rs.end(); ++ri)
+                                       {
+                                               xpath_allocator_capture crii(stack.result);
+
+                                               if (comp(l, convert_string_to_number(string_value(*ri, stack.result).c_str())))
+                                                       return true;
+                                       }
+                               }
+
+                               return false;
+                       }
+                       else if (lt != xpath_type_node_set && rt == xpath_type_node_set)
+                       {
+                               xpath_allocator_capture cr(stack.result);
+
+                               double l = lhs->eval_number(c, stack);
+                               xpath_node_set_raw rs = rhs->eval_node_set(c, stack, nodeset_eval_all);
+
+                               for (const xpath_node* ri = rs.begin(); ri != rs.end(); ++ri)
+                               {
+                                       xpath_allocator_capture cri(stack.result);
+
+                                       if (comp(l, convert_string_to_number(string_value(*ri, stack.result).c_str())))
+                                               return true;
+                               }
+
+                               return false;
+                       }
+                       else if (lt == xpath_type_node_set && rt != xpath_type_node_set)
+                       {
+                               xpath_allocator_capture cr(stack.result);
+
+                               xpath_node_set_raw ls = lhs->eval_node_set(c, stack, nodeset_eval_all);
+                               double r = rhs->eval_number(c, stack);
+
+                               for (const xpath_node* li = ls.begin(); li != ls.end(); ++li)
+                               {
+                                       xpath_allocator_capture cri(stack.result);
+
+                                       if (comp(convert_string_to_number(string_value(*li, stack.result).c_str()), r))
+                                               return true;
+                               }
+
+                               return false;
+                       }
+                       else
+                       {
+                               assert(!"Wrong types");
+                               return false;
+                       }
+               }
+
+               static void apply_predicate_boolean(xpath_node_set_raw& ns, size_t first, xpath_ast_node* expr, const xpath_stack& stack, bool once)
+               {
+                       assert(ns.size() >= first);
+                       assert(expr->rettype() != xpath_type_number);
+
+                       size_t i = 1;
+                       size_t size = ns.size() - first;
+
+                       xpath_node* last = ns.begin() + first;
+
+                       // remove_if... or well, sort of
+                       for (xpath_node* it = last; it != ns.end(); ++it, ++i)
+                       {
+                               xpath_context c(*it, i, size);
+
+                               if (expr->eval_boolean(c, stack))
+                               {
+                                       *last++ = *it;
+
+                                       if (once) break;
+                               }
+                       }
+
+                       ns.truncate(last);
+               }
+
+               static void apply_predicate_number(xpath_node_set_raw& ns, size_t first, xpath_ast_node* expr, const xpath_stack& stack, bool once)
+               {
+                       assert(ns.size() >= first);
+                       assert(expr->rettype() == xpath_type_number);
+
+                       size_t i = 1;
+                       size_t size = ns.size() - first;
+
+                       xpath_node* last = ns.begin() + first;
+
+                       // remove_if... or well, sort of
+                       for (xpath_node* it = last; it != ns.end(); ++it, ++i)
+                       {
+                               xpath_context c(*it, i, size);
+
+                               if (expr->eval_number(c, stack) == i)
+                               {
+                                       *last++ = *it;
+
+                                       if (once) break;
+                               }
+                       }
+
+                       ns.truncate(last);
+               }
+
+               static void apply_predicate_number_const(xpath_node_set_raw& ns, size_t first, xpath_ast_node* expr, const xpath_stack& stack)
+               {
+                       assert(ns.size() >= first);
+                       assert(expr->rettype() == xpath_type_number);
+
+                       size_t size = ns.size() - first;
+
+                       xpath_node* last = ns.begin() + first;
+
+                       xpath_context c(xpath_node(), 1, size);
+
+                       double er = expr->eval_number(c, stack);
+
+                       if (er >= 1.0 && er <= size)
+                       {
+                               size_t eri = static_cast<size_t>(er);
+
+                               if (er == eri)
+                               {
+                                       xpath_node r = last[eri - 1];
+
+                                       *last++ = r;
+                               }
+                       }
+
+                       ns.truncate(last);
+               }
+
+               void apply_predicate(xpath_node_set_raw& ns, size_t first, const xpath_stack& stack, bool once) // applies this single filter/predicate node to ns[first..end) by dispatching to a specialized routine
+               {
+                       if (ns.size() == first) return; // no nodes after 'first', nothing to filter
+
+                       assert(_type == ast_filter || _type == ast_predicate);
+
+                       if (_test == predicate_constant || _test == predicate_constant_one) // predicate tagged as constant: _right is evaluated once for the whole range ('once' is not forwarded)
+                               apply_predicate_number_const(ns, first, _right, stack);
+                       else if (_right->rettype() == xpath_type_number) // numeric predicate: evaluated per node and compared with the node position
+                               apply_predicate_number(ns, first, _right, stack, once);
+                       else // every other return type goes through the boolean variant
+                               apply_predicate_boolean(ns, first, _right, stack, once);
+               }
+
+               void apply_predicates(xpath_node_set_raw& ns, size_t first, const xpath_stack& stack, nodeset_eval_t eval) // runs every predicate chained from _right (linked through _next) over ns[first..end), in list order
+               {
+                       if (ns.size() == first) return; // nothing to filter
+
+                       bool last_once = eval_once(ns.type(), eval); // result of eval_once for the current set type and evaluation mode
+
+                       for (xpath_ast_node* pred = _right; pred; pred = pred->_next)
+                               pred->apply_predicate(ns, first, stack, !pred->_next && last_once); // only the final predicate of the chain can receive once == true
+               }
+
+               bool step_push(xpath_node_set_raw& ns, xml_attribute_struct* a, xml_node_struct* parent, xpath_allocator* alloc) // appends attribute a (owned by parent) to ns if it passes this step's node test; returns true when it was appended
+               {
+                       assert(a);
+
+                       const char_t* name = a->name ? a->name : PUGIXML_TEXT(""); // a missing name is treated as the empty string
+
+                       switch (_test)
+                       {
+                       case nodetest_name: // exact name match against _data.nodetest
+                               if (strequal(name, _data.nodetest) && is_xpath_attribute(name))
+                               {
+                                       ns.push_back(xpath_node(xml_attribute(a), xml_node(parent)), alloc);
+                                       return true;
+                               }
+                               break;
+                               
+                       case nodetest_type_node:
+                       case nodetest_all: // both tests accept any attribute that is_xpath_attribute() accepts
+                               if (is_xpath_attribute(name))
+                               {
+                                       ns.push_back(xpath_node(xml_attribute(a), xml_node(parent)), alloc);
+                                       return true;
+                               }
+                               break;
+                               
+                       case nodetest_all_in_namespace: // name must begin with the prefix stored in _data.nodetest
+                               if (starts_with(name, _data.nodetest) && is_xpath_attribute(name))
+                               {
+                                       ns.push_back(xpath_node(xml_attribute(a), xml_node(parent)), alloc);
+                                       return true;
+                               }
+                               break;
+                       
+                       default: // all remaining node tests never match an attribute
+                               ;
+                       }
+
+                       return false;
+               }
+               
+               bool step_push(xpath_node_set_raw& ns, xml_node_struct* n, xpath_allocator* alloc) // appends node n to ns if it passes this step's node test; returns true when it was appended
+               {
+                       assert(n);
+
+                       xml_node_type type = PUGI__NODETYPE(n);
+
+                       switch (_test)
+                       {
+                       case nodetest_name: // element whose name equals _data.nodetest
+                               if (type == node_element && n->name && strequal(n->name, _data.nodetest))
+                               {
+                                       ns.push_back(xml_node(n), alloc);
+                                       return true;
+                               }
+                               break;
+                               
+                       case nodetest_type_node: // accepts every node unconditionally
+                               ns.push_back(xml_node(n), alloc);
+                               return true;
+                               
+                       case nodetest_type_comment:
+                               if (type == node_comment)
+                               {
+                                       ns.push_back(xml_node(n), alloc);
+                                       return true;
+                               }
+                               break;
+                               
+                       case nodetest_type_text: // both plain character data and CDATA count as text
+                               if (type == node_pcdata || type == node_cdata)
+                               {
+                                       ns.push_back(xml_node(n), alloc);
+                                       return true;
+                               }
+                               break;
+                               
+                       case nodetest_type_pi: // any processing instruction
+                               if (type == node_pi)
+                               {
+                                       ns.push_back(xml_node(n), alloc);
+                                       return true;
+                               }
+                               break;
+                                                                       
+                       case nodetest_pi: // processing instruction whose name equals _data.nodetest
+                               if (type == node_pi && n->name && strequal(n->name, _data.nodetest))
+                               {
+                                       ns.push_back(xml_node(n), alloc);
+                                       return true;
+                               }
+                               break;
+                               
+                       case nodetest_all: // any element
+                               if (type == node_element)
+                               {
+                                       ns.push_back(xml_node(n), alloc);
+                                       return true;
+                               }
+                               break;
+                               
+                       case nodetest_all_in_namespace: // element whose name begins with the prefix stored in _data.nodetest
+                               if (type == node_element && n->name && starts_with(n->name, _data.nodetest))
+                               {
+                                       ns.push_back(xml_node(n), alloc);
+                                       return true;
+                               }
+                               break;
+
+                       default:
+                               assert(!"Unknown axis"); // NOTE(review): the switch is over the node test (_test), not the axis, so the message is slightly misleading
+                       }
+
+                       return false;
+               }
+
+               template <class T> void step_fill(xpath_node_set_raw& ns, xml_node_struct* n, xpath_allocator* alloc, bool once, T) // walks axis T::axis starting from node n and offers every visited node/attribute to step_push; with once set, most axes return after the first accepted node
+               {
+                       const axis_t axis = T::axis; // axis comes from the tag type, so it is a compile-time constant per instantiation
+
+                       switch (axis)
+                       {
+                       case axis_attribute:
+                       {
+                               for (xml_attribute_struct* a = n->first_attribute; a; a = a->next_attribute)
+                                       if (step_push(ns, a, n, alloc) & once) // non-short-circuit &: step_push always runs; stop after the first accepted attribute when once is set
+                                               return;
+                               
+                               break;
+                       }
+                       
+                       case axis_child:
+                       {
+                               for (xml_node_struct* c = n->first_child; c; c = c->next_sibling)
+                                       if (step_push(ns, c, alloc) & once)
+                                               return;
+                                       
+                               break;
+                       }
+                       
+                       case axis_descendant:
+                       case axis_descendant_or_self:
+                       {
+                               if (axis == axis_descendant_or_self) // the -or-self variant offers n itself first
+                                       if (step_push(ns, n, alloc) & once)
+                                               return;
+                                       
+                               xml_node_struct* cur = n->first_child;
+                               
+                               while (cur) // pre-order walk of the subtree below n, without recursion
+                               {
+                                       if (step_push(ns, cur, alloc) & once)
+                                               return;
+                                       
+                                       if (cur->first_child)
+                                               cur = cur->first_child;
+                                       else
+                                       {
+                                               while (!cur->next_sibling) // no sibling left: climb until one exists
+                                               {
+                                                       cur = cur->parent;
+
+                                                       if (cur == n) return; // climbed back to the start node: subtree exhausted
+                                               }
+                                       
+                                               cur = cur->next_sibling;
+                                       }
+                               }
+                               
+                               break;
+                       }
+                       
+                       case axis_following_sibling:
+                       {
+                               for (xml_node_struct* c = n->next_sibling; c; c = c->next_sibling)
+                                       if (step_push(ns, c, alloc) & once)
+                                               return;
+                               
+                               break;
+                       }
+                       
+                       case axis_preceding_sibling:
+                       {
+                               for (xml_node_struct* c = n->prev_sibling_c; c->next_sibling; c = c->prev_sibling_c) // the loop ends at a node without next_sibling; presumably prev_sibling_c is cyclic (first child points to the last) - confirm against the node structure
+                                       if (step_push(ns, c, alloc) & once)
+                                               return;
+                               
+                               break;
+                       }
+                       
+                       case axis_following:
+                       {
+                               xml_node_struct* cur = n;
+
+                               // exit from this node so that we don't include descendants
+                               while (!cur->next_sibling)
+                               {
+                                       cur = cur->parent;
+
+                                       if (!cur) return; // ran out of ancestors: nothing follows n
+                               }
+
+                               cur = cur->next_sibling;
+
+                               while (cur) // pre-order walk over everything after that point
+                               {
+                                       if (step_push(ns, cur, alloc) & once)
+                                               return;
+
+                                       if (cur->first_child)
+                                               cur = cur->first_child;
+                                       else
+                                       {
+                                               while (!cur->next_sibling)
+                                               {
+                                                       cur = cur->parent;
+
+                                                       if (!cur) return;
+                                               }
+
+                                               cur = cur->next_sibling;
+                                       }
+                               }
+
+                               break;
+                       }
+
+                       case axis_preceding:
+                       {
+                               xml_node_struct* cur = n;
+
+                               // exit from this node so that we don't include descendants
+                               while (!cur->prev_sibling_c->next_sibling) // same end-of-list test as in the preceding-sibling case
+                               {
+                                       cur = cur->parent;
+
+                                       if (!cur) return;
+                               }
+
+                               cur = cur->prev_sibling_c;
+
+                               while (cur) // walk backwards; a node with children is only offered after its children
+                               {
+                                       if (cur->first_child)
+                                               cur = cur->first_child->prev_sibling_c; // descend via first_child->prev_sibling_c
+                                       else
+                                       {
+                                               // leaf node, can't be ancestor
+                                               if (step_push(ns, cur, alloc) & once)
+                                                       return;
+
+                                               while (!cur->prev_sibling_c->next_sibling)
+                                               {
+                                                       cur = cur->parent;
+
+                                                       if (!cur) return;
+
+                                                       if (!node_is_ancestor(cur, n)) // ancestors of n are skipped, they are not part of this axis
+                                                               if (step_push(ns, cur, alloc) & once)
+                                                                       return;
+                                               }
+
+                                               cur = cur->prev_sibling_c;
+                                       }
+                               }
+
+                               break;
+                       }
+                       
+                       case axis_ancestor:
+                       case axis_ancestor_or_self:
+                       {
+                               if (axis == axis_ancestor_or_self) // the -or-self variant offers n itself first
+                                       if (step_push(ns, n, alloc) & once)
+                                               return;
+
+                               xml_node_struct* cur = n->parent;
+                               
+                               while (cur) // follow the parent chain until it ends
+                               {
+                                       if (step_push(ns, cur, alloc) & once)
+                                               return;
+                                       
+                                       cur = cur->parent;
+                               }
+                               
+                               break;
+                       }
+
+                       case axis_self:
+                       {
+                               step_push(ns, n, alloc); // single candidate, so once is irrelevant
+
+                               break;
+                       }
+
+                       case axis_parent:
+                       {
+                               if (n->parent)
+                                       step_push(ns, n->parent, alloc); // single candidate, so once is irrelevant
+
+                               break;
+                       }
+                               
+                       default:
+                               assert(!"Unimplemented axis");
+                       }
+               }
+               
+               template <class T> void step_fill(xpath_node_set_raw& ns, xml_attribute_struct* a, xml_node_struct* p, xpath_allocator* alloc, bool once, T v) // walks axis T::axis starting from attribute a, whose owning node is p; only the axes listed in the switch are handled here
+               {
+                       const axis_t axis = T::axis;
+
+                       switch (axis)
+                       {
+                       case axis_ancestor:
+                       case axis_ancestor_or_self:
+                       {
+                               if (axis == axis_ancestor_or_self && _test == nodetest_type_node) // reject attributes based on principal node type test
+                                       if (step_push(ns, a, p, alloc) & once) // non-short-circuit &: step_push always runs
+                                               return;
+
+                               xml_node_struct* cur = p; // the ancestor chain of an attribute starts at its owning node
+                               
+                               while (cur)
+                               {
+                                       if (step_push(ns, cur, alloc) & once)
+                                               return;
+                                       
+                                       cur = cur->parent;
+                               }
+                               
+                               break;
+                       }
+
+                       case axis_descendant_or_self:
+                       case axis_self: // for both axes the only candidate offered is the attribute itself
+                       {
+                               if (_test == nodetest_type_node) // reject attributes based on principal node type test
+                                       step_push(ns, a, p, alloc);
+
+                               break;
+                       }
+
+                       case axis_following:
+                       {
+                               xml_node_struct* cur = p;
+                               
+                               while (cur) // unlike the node overload, cur is advanced before it is offered, so p itself is never pushed but its descendants are
+                               {
+                                       if (cur->first_child)
+                                               cur = cur->first_child;
+                                       else
+                                       {
+                                               while (!cur->next_sibling)
+                                               {
+                                                       cur = cur->parent;
+
+                                                       if (!cur) return; // ran out of ancestors: traversal finished
+                                               }
+
+                                               cur = cur->next_sibling;
+                                       }
+
+                                       if (step_push(ns, cur, alloc) & once)
+                                               return;
+                               }
+
+                               break;
+                       }
+
+                       case axis_parent:
+                       {
+                               step_push(ns, p, alloc); // the parent of an attribute is its owning node
+
+                               break;
+                       }
+
+                       case axis_preceding:
+                       {
+                               // preceding:: axis does not include attribute nodes and attribute ancestors (they are the same as parent's ancestors), so we can reuse node preceding
+                               step_fill(ns, p, alloc, once, v);
+                               break;
+                       }
+                       
+                       default:
+                               assert(!"Unimplemented axis");
+                       }
+               }
+
+               template <class T> void step_fill(xpath_node_set_raw& ns, const xpath_node& xn, xpath_allocator* alloc, bool once, T v) // front end for an xpath_node context: forwards to the node overload or the attribute overload of step_fill
+               {
+                       const axis_t axis = T::axis;
+                       const bool axis_has_attributes = (axis == axis_ancestor || axis == axis_ancestor_or_self || axis == axis_descendant_or_self || axis == axis_following || axis == axis_parent || axis == axis_preceding || axis == axis_self); // same set of axes as the cases handled by the attribute overload
+
+                       if (xn.node())
+                               step_fill(ns, xn.node().internal_object(), alloc, once, v);
+                       else if (axis_has_attributes && xn.attribute() && xn.parent()) // attribute context on any other axis adds nothing to ns
+                               step_fill(ns, xn.attribute().internal_object(), xn.parent().internal_object(), alloc, once, v);
+               }
+
+               template <class T> xpath_node_set_raw step_do(const xpath_context& c, const xpath_stack& stack, nodeset_eval_t eval, T v) // evaluates one location step: collects nodes along T::axis from every context node, applies the predicates in _right, and returns the resulting raw node set
+               {
+                       const axis_t axis = T::axis;
+                       const bool axis_reverse = (axis == axis_ancestor || axis == axis_ancestor_or_self || axis == axis_preceding || axis == axis_preceding_sibling); // axes whose results are tagged as reverse-sorted
+                       const xpath_node_set::type_t axis_type = axis_reverse ? xpath_node_set::type_sorted_reverse : xpath_node_set::type_sorted;
+
+                       bool once = // true when step_fill may stop after the first accepted node; any one of the three conditions below is enough
+                               (axis == axis_attribute && _test == nodetest_name) ||
+                               (!_right && eval_once(axis_type, eval)) ||
+                               (_right && !_right->_next && _right->_test == predicate_constant_one);
+
+                       xpath_node_set_raw ns;
+                       ns.set_type(axis_type);
+
+                       if (_left) // the step has a left-hand expression: its result supplies the context nodes
+                       {
+                               xpath_node_set_raw s = _left->eval_node_set(c, stack, nodeset_eval_all);
+
+                               // self axis preserves the original order
+                               if (axis == axis_self) ns.set_type(s.type());
+
+                               for (const xpath_node* it = s.begin(); it != s.end(); ++it)
+                               {
+                                       size_t size = ns.size(); // nodes appended for this context node start at this index
+
+                                       // in general, all axes generate elements in a particular order, but there is no order guarantee if axis is applied to two nodes
+                                       if (axis != axis_self && size != 0) ns.set_type(xpath_node_set::type_unsorted);
+                                       
+                                       step_fill(ns, *it, stack.result, once, v);
+                                       if (_right) apply_predicates(ns, size, stack, eval); // predicates only see the nodes added for this context node
+                               }
+                       }
+                       else // no left-hand expression: the step starts from the context node c.n
+                       {
+                               step_fill(ns, c.n, stack.result, once, v);
+                               if (_right) apply_predicates(ns, 0, stack, eval);
+                       }
+
+                       // child, attribute and self axes always generate unique set of nodes
+                       // for other axis, if the set stayed sorted, it stayed unique because the traversal algorithms do not visit the same node twice
+                       if (axis != axis_child && axis != axis_attribute && axis != axis_self && ns.type() == xpath_node_set::type_unsorted)
+                               ns.remove_duplicates();
+
+                       return ns;
+               }
+               
+       public:
+               xpath_ast_node(ast_type_t type, xpath_value_type rettype_, const char_t* value): // string-constant node; type must be ast_string_constant
+                       _type(static_cast<char>(type)), _rettype(static_cast<char>(rettype_)), _axis(0), _test(0), _left(0), _right(0), _next(0)
+               {
+                       assert(type == ast_string_constant);
+                       _data.string = value; // the pointer is stored as is, the text is not copied here
+               }
+
+               xpath_ast_node(ast_type_t type, xpath_value_type rettype_, double value): // number-constant node; type must be ast_number_constant
+                       _type(static_cast<char>(type)), _rettype(static_cast<char>(rettype_)), _axis(0), _test(0), _left(0), _right(0), _next(0)
+               {
+                       assert(type == ast_number_constant);
+                       _data.number = value;
+               }
+               
+               xpath_ast_node(ast_type_t type, xpath_value_type rettype_, xpath_variable* value): // variable-reference node; type must be ast_variable
+                       _type(static_cast<char>(type)), _rettype(static_cast<char>(rettype_)), _axis(0), _test(0), _left(0), _right(0), _next(0)
+               {
+                       assert(type == ast_variable);
+                       _data.variable = value; // the pointer is stored as is
+               }
+               
+               xpath_ast_node(ast_type_t type, xpath_value_type rettype_, xpath_ast_node* left = 0, xpath_ast_node* right = 0): // generic node with up to two operands; _data is not assigned by this constructor
+                       _type(static_cast<char>(type)), _rettype(static_cast<char>(rettype_)), _axis(0), _test(0), _left(left), _right(right), _next(0)
+               {
+               }
+
+               xpath_ast_node(ast_type_t type, xpath_ast_node* left, axis_t axis, nodetest_t test, const char_t* contents): // location-step node; type must be ast_step and the return type is always node set
+                       _type(static_cast<char>(type)), _rettype(xpath_type_node_set), _axis(static_cast<char>(axis)), _test(static_cast<char>(test)), _left(left), _right(0), _next(0)
+               {
+                       assert(type == ast_step);
+                       _data.nodetest = contents; // name or prefix used by the node test; the pointer is stored as is
+               }
+
+               xpath_ast_node(ast_type_t type, xpath_ast_node* left, xpath_ast_node* right, predicate_t test): // filter/predicate node; _test holds the predicate kind and the return type is always node set
+                       _type(static_cast<char>(type)), _rettype(xpath_type_node_set), _axis(0), _test(static_cast<char>(test)), _left(left), _right(right), _next(0)
+               {
+                       assert(type == ast_filter || type == ast_predicate);
+               }
+
+               void set_next(xpath_ast_node* value) // sets the _next link (the chain that apply_predicates iterates over)
+               {
+                       _next = value;
+               }
+
+               void set_right(xpath_ast_node* value) // replaces the right operand; for a step node this is the head of its predicate chain
+               {
+                       _right = value;
+               }
+
+               bool eval_boolean(const xpath_context& c, const xpath_stack& stack) // evaluates this AST node in context c and returns the result as a boolean; non-boolean node kinds are converted in the default branch
+               {
+                       switch (_type)
+                       {
+                       case ast_op_or:
+                               return _left->eval_boolean(c, stack) || _right->eval_boolean(c, stack); // short-circuit: the right operand is skipped when the left is true
+                               
+                       case ast_op_and:
+                               return _left->eval_boolean(c, stack) && _right->eval_boolean(c, stack); // short-circuit: the right operand is skipped when the left is false
+                               
+                       case ast_op_equal:
+                               return compare_eq(_left, _right, c, stack, equal_to());
+
+                       case ast_op_not_equal:
+                               return compare_eq(_left, _right, c, stack, not_equal_to());
+       
+                       case ast_op_less:
+                               return compare_rel(_left, _right, c, stack, less());
+                       
+                       case ast_op_greater:
+                               return compare_rel(_right, _left, c, stack, less()); // a > b is evaluated as b < a by swapping the operands
+
+                       case ast_op_less_or_equal:
+                               return compare_rel(_left, _right, c, stack, less_equal());
+                       
+                       case ast_op_greater_or_equal:
+                               return compare_rel(_right, _left, c, stack, less_equal()); // a >= b is evaluated as b <= a by swapping the operands
+
+                       case ast_func_starts_with:
+                       {
+                               xpath_allocator_capture cr(stack.result); // scoped capture of stack.result; presumably releases the temporaries allocated below on scope exit - confirm in xpath_allocator_capture
+
+                               xpath_string lr = _left->eval_string(c, stack);
+                               xpath_string rr = _right->eval_string(c, stack);
+
+                               return starts_with(lr.c_str(), rr.c_str());
+                       }
+
+                       case ast_func_contains:
+                       {
+                               xpath_allocator_capture cr(stack.result);
+
+                               xpath_string lr = _left->eval_string(c, stack);
+                               xpath_string rr = _right->eval_string(c, stack);
+
+                               return find_substring(lr.c_str(), rr.c_str()) != 0; // a non-null result means the substring was found
+                       }
+
+                       case ast_func_boolean:
+                               return _left->eval_boolean(c, stack);
+                               
+                       case ast_func_not:
+                               return !_left->eval_boolean(c, stack);
+                               
+                       case ast_func_true:
+                               return true;
+                               
+                       case ast_func_false:
+                               return false;
+
+                       case ast_func_lang:
+                       {
+                               if (c.n.attribute()) return false; // an attribute context node never matches
+                               
+                               xpath_allocator_capture cr(stack.result);
+
+                               xpath_string lang = _left->eval_string(c, stack);
+                               
+                               for (xml_node n = c.n.node(); n; n = n.parent()) // search the context node and then its ancestors for an xml:lang attribute
+                               {
+                                       xml_attribute a = n.attribute(PUGIXML_TEXT("xml:lang"));
+                                       
+                                       if (a) // the first xml:lang found decides the result
+                                       {
+                                               const char_t* value = a.value();
+                                               
+                                               // strnicmp / strncasecmp is not portable
+                                               for (const char_t* lit = lang.c_str(); *lit; ++lit)
+                                               {
+                                                       if (tolower_ascii(*lit) != tolower_ascii(*value)) return false; // also triggers when value is shorter than lang (its terminator is compared)
+                                                       ++value;
+                                               }
+                                               
+                                               return *value == 0 || *value == '-'; // lang must be the whole value or be followed by a '-' suffix
+                                       }
+                               }
+                               
+                               return false; // no xml:lang found on the context node or its ancestors
+                       }
+
+                       case ast_opt_compare_attribute: // compares the context node's attribute named by _left against a string constant or variable value
+                       {
+                               const char_t* value = (_right->_type == ast_string_constant) ? _right->_data.string : _right->_data.variable->get_string(); // anything that is not a string constant is read as a variable
+
+                               xml_attribute attr = c.n.node().attribute(_left->_data.nodetest);
+
+                               return attr && strequal(attr.value(), value) && is_xpath_attribute(attr.name());
+                       }
+
+                       case ast_variable:
+                       {
+                               assert(_rettype == _data.variable->type());
+
+                               if (_rettype == xpath_type_boolean)
+                                       return _data.variable->get_boolean();
+                       } // non-boolean variables continue into the generic conversion below
+                       // fallthrough
+
+                       default:
+                       {
+                               switch (_rettype) // generic path: evaluate with the node's own return type, then convert to boolean
+                               {
+                               case xpath_type_number:
+                                       return convert_number_to_boolean(eval_number(c, stack));
+                                       
+                               case xpath_type_string:
+                               {
+                                       xpath_allocator_capture cr(stack.result);
+
+                                       return !eval_string(c, stack).empty(); // a string is true when it is non-empty
+                               }
+                                       
+                               case xpath_type_node_set:                               
+                               {
+                                       xpath_allocator_capture cr(stack.result);
+
+                                       return !eval_node_set(c, stack, nodeset_eval_any).empty(); // a node set is true when it is non-empty; only emptiness matters, hence nodeset_eval_any
+                               }
+
+                               default:
+                                       assert(!"Wrong expression for return type boolean");
+                                       return false;
+                               }
+                       }
+                       }
+               }
+
+               double eval_number(const xpath_context& c, const xpath_stack& stack) // Evaluates this AST node against context c and returns the result as a double.
+               { // Dispatches on _type first; node kinds without a dedicated case are handled by _rettype in the default branch.
+                       switch (_type)
+                       {
+                       case ast_op_add:
+                               return _left->eval_number(c, stack) + _right->eval_number(c, stack);
+                               
+                       case ast_op_subtract:
+                               return _left->eval_number(c, stack) - _right->eval_number(c, stack);
+
+                       case ast_op_multiply:
+                               return _left->eval_number(c, stack) * _right->eval_number(c, stack);
+
+                       case ast_op_divide: // plain double division; no special handling of a zero divisor here
+                               return _left->eval_number(c, stack) / _right->eval_number(c, stack);
+
+                       case ast_op_mod: // mod is implemented with C fmod on the two numeric operands
+                               return fmod(_left->eval_number(c, stack), _right->eval_number(c, stack));
+
+                       case ast_op_negate: // unary minus; only _left is used
+                               return -_left->eval_number(c, stack);
+
+                       case ast_number_constant:
+                               return _data.number;
+
+                       case ast_func_last: // last() is the context size
+                               return static_cast<double>(c.size);
+                       
+                       case ast_func_position: // position() is the context position
+                               return static_cast<double>(c.position);
+
+                       case ast_func_count:
+                       {
+                               xpath_allocator_capture cr(stack.result);
+                               // NOTE(review): cr presumably rolls stack.result back when it leaves scope, so the node set is only a temporary - confirm against xpath_allocator_capture
+                               return static_cast<double>(_left->eval_node_set(c, stack, nodeset_eval_all).size());
+                       }
+                       
+                       case ast_func_string_length_0: // zero-argument form: length of the string value of the context node c.n
+                       {
+                               xpath_allocator_capture cr(stack.result);
+
+                               return static_cast<double>(string_value(c.n, stack.result).length());
+                       }
+                       
+                       case ast_func_string_length_1: // one-argument form: length of the argument evaluated as a string
+                       {
+                               xpath_allocator_capture cr(stack.result);
+
+                               return static_cast<double>(_left->eval_string(c, stack).length());
+                       }
+                       
+                       case ast_func_number_0: // zero-argument form: string value of the context node converted to a number
+                       {
+                               xpath_allocator_capture cr(stack.result);
+
+                               return convert_string_to_number(string_value(c.n, stack.result).c_str());
+                       }
+                       
+                       case ast_func_number_1:
+                               return _left->eval_number(c, stack);
+
+                       case ast_func_sum: // adds up the numeric conversion of every node's string value
+                       {
+                               xpath_allocator_capture cr(stack.result);
+
+                               double r = 0;
+                               
+                               xpath_node_set_raw ns = _left->eval_node_set(c, stack, nodeset_eval_all);
+                               
+                               for (const xpath_node* it = ns.begin(); it != ns.end(); ++it)
+                               {
+                                       xpath_allocator_capture cri(stack.result);
+                                       // a separate capture per iteration scopes each node's string value to that iteration
+                                       r += convert_string_to_number(string_value(*it, stack.result).c_str());
+                               }
+                       
+                               return r;
+                       }
+
+                       case ast_func_floor:
+                       {
+                               double r = _left->eval_number(c, stack);
+                               // r == r is false only for NaN, which is returned unchanged without calling floor
+                               return r == r ? floor(r) : r;
+                       }
+
+                       case ast_func_ceiling:
+                       {
+                               double r = _left->eval_number(c, stack);
+                               // same NaN guard as floor above
+                               return r == r ? ceil(r) : r;
+                       }
+
+                       case ast_func_round:
+                               return round_nearest_nzero(_left->eval_number(c, stack));
+                       
+                       case ast_variable:
+                       {
+                               assert(_rettype == _data.variable->type());
+                               // a number variable is returned directly; any other variable type falls through to the conversions below
+                               if (_rettype == xpath_type_number)
+                                       return _data.variable->get_number();
+                       }
+                       // fallthrough
+
+                       default: // not a native number expression: evaluate as its own return type, then convert
+                       {
+                               switch (_rettype)
+                               {
+                               case xpath_type_boolean: // true -> 1, false -> 0
+                                       return eval_boolean(c, stack) ? 1 : 0;
+                                       
+                               case xpath_type_string:
+                               {
+                                       xpath_allocator_capture cr(stack.result);
+
+                                       return convert_string_to_number(eval_string(c, stack).c_str());
+                               }
+                                       
+                               case xpath_type_node_set: // node sets are converted through eval_string and then parsed as a number
+                               {
+                                       xpath_allocator_capture cr(stack.result);
+
+                                       return convert_string_to_number(eval_string(c, stack).c_str());
+                               }
+                                       
+                               default:
+                                       assert(!"Wrong expression for return type number");
+                                       return 0;
+                               }
+                               
+                       }
+                       }
+               }
+               
+               xpath_string eval_string_concat(const xpath_context& c, const xpath_stack& stack) // Evaluates a concat() node: _left is the first argument, the rest are chained from _right via _next.
+               { // The returned string is built in memory obtained from stack.result; operands and the pointer buffer use stack.temp.
+                       assert(_type == ast_func_concat);
+
+                       xpath_allocator_capture ct(stack.temp);
+                       // NOTE(review): ct presumably releases everything allocated from stack.temp in this function on exit - confirm against xpath_allocator_capture
+                       // count the strings to concatenate (starts at 1 to account for _left)
+                       size_t count = 1;
+                       for (xpath_ast_node* nc = _right; nc; nc = nc->_next) count++;
+
+                       // gather all strings; up to 4 operands fit in the local array
+                       xpath_string static_buffer[4];
+                       xpath_string* buffer = static_buffer;
+
+                       // for larger concats take the buffer from stack.temp instead
+                       if (count > sizeof(static_buffer) / sizeof(static_buffer[0]))
+                       {
+                               buffer = static_cast<xpath_string*>(stack.temp->allocate(count * sizeof(xpath_string)));
+                               assert(buffer);
+                       }
+
+                       // evaluate all strings to temporary stack (presumably xpath_stack is {result, temp}, so operand results land in stack.temp - confirm)
+                       xpath_stack swapped_stack = {stack.temp, stack.result};
+
+                       buffer[0] = _left->eval_string(c, swapped_stack);
+
+                       size_t pos = 1;
+                       for (xpath_ast_node* n = _right; n; n = n->_next, ++pos) buffer[pos] = n->eval_string(c, swapped_stack);
+                       assert(pos == count);
+
+                       // get total length
+                       size_t length = 0;
+                       for (size_t i = 0; i < count; ++i) length += buffer[i].length();
+
+                       // create final string (+1 for the terminating zero)
+                       char_t* result = static_cast<char_t*>(stack.result->allocate((length + 1) * sizeof(char_t)));
+                       assert(result);
+
+                       char_t* ri = result;
+                       // copy each operand's characters, excluding its terminator, back to back
+                       for (size_t j = 0; j < count; ++j)
+                               for (const char_t* bi = buffer[j].c_str(); *bi; ++bi)
+                                       *ri++ = *bi;
+
+                       *ri = 0;
+                       // ri now points at the terminator, i.e. the end of the string
+                       return xpath_string::from_heap_preallocated(result, ri);
+               }
+
+               xpath_string eval_string(const xpath_context& c, const xpath_stack& stack) // Evaluates this AST node against context c and returns the result as an xpath_string.
+               { // Dispatches on _type first; node kinds without a dedicated case are handled by _rettype in the default branch.
+                       switch (_type)
+                       {
+                       case ast_string_constant:
+                               return xpath_string::from_const(_data.string);
+                       
+                       case ast_func_local_name_0: // zero-argument forms operate on the context node c.n
+                       {
+                               xpath_node na = c.n;
+                               
+                               return xpath_string::from_const(local_name(na));
+                       }
+
+                       case ast_func_local_name_1: // one-argument forms operate on ns.first() of the argument node set
+                       {
+                               xpath_allocator_capture cr(stack.result);
+                               // NOTE(review): cr presumably releases the temporary node set on exit; the returned string is built with from_const and so does not live in it - confirm
+                               xpath_node_set_raw ns = _left->eval_node_set(c, stack, nodeset_eval_first);
+                               xpath_node na = ns.first();
+                               
+                               return xpath_string::from_const(local_name(na));
+                       }
+
+                       case ast_func_name_0:
+                       {
+                               xpath_node na = c.n;
+                               
+                               return xpath_string::from_const(qualified_name(na));
+                       }
+
+                       case ast_func_name_1:
+                       {
+                               xpath_allocator_capture cr(stack.result);
+
+                               xpath_node_set_raw ns = _left->eval_node_set(c, stack, nodeset_eval_first);
+                               xpath_node na = ns.first();
+                               
+                               return xpath_string::from_const(qualified_name(na));
+                       }
+
+                       case ast_func_namespace_uri_0:
+                       {
+                               xpath_node na = c.n;
+                               
+                               return xpath_string::from_const(namespace_uri(na));
+                       }
+
+                       case ast_func_namespace_uri_1:
+                       {
+                               xpath_allocator_capture cr(stack.result);
+
+                               xpath_node_set_raw ns = _left->eval_node_set(c, stack, nodeset_eval_first);
+                               xpath_node na = ns.first();
+                               
+                               return xpath_string::from_const(namespace_uri(na));
+                       }
+
+                       case ast_func_string_0: // string(): string value of the context node
+                               return string_value(c.n, stack.result);
+
+                       case ast_func_string_1:
+                               return _left->eval_string(c, stack);
+
+                       case ast_func_concat:
+                               return eval_string_concat(c, stack);
+
+                       case ast_func_substring_before:
+                       {
+                               xpath_allocator_capture cr(stack.temp);
+                               // both operands are evaluated on the swapped stack (presumably so they land in stack.temp - confirm xpath_stack member order)
+                               xpath_stack swapped_stack = {stack.temp, stack.result};
+
+                               xpath_string s = _left->eval_string(c, swapped_stack);
+                               xpath_string p = _right->eval_string(c, swapped_stack);
+
+                               const char_t* pos = find_substring(s.c_str(), p.c_str());
+                               // found: copy [start of s, pos) using stack.result; not found: empty string
+                               return pos ? xpath_string::from_heap(s.c_str(), pos, stack.result) : xpath_string();
+                       }
+                       
+                       case ast_func_substring_after:
+                       {
+                               xpath_allocator_capture cr(stack.temp);
+
+                               xpath_stack swapped_stack = {stack.temp, stack.result};
+
+                               xpath_string s = _left->eval_string(c, swapped_stack);
+                               xpath_string p = _right->eval_string(c, swapped_stack);
+                               
+                               const char_t* pos = find_substring(s.c_str(), p.c_str());
+                               if (!pos) return xpath_string();
+                               // the result starts right after the matched pattern and runs to the end of s
+                               const char_t* rbegin = pos + p.length();
+                               const char_t* rend = s.c_str() + s.length();
+                               // heap-backed s is copied using stack.result; otherwise the tail pointer is wrapped with from_const
+                               return s.uses_heap() ? xpath_string::from_heap(rbegin, rend, stack.result) : xpath_string::from_const(rbegin);
+                       }
+
+                       case ast_func_substring_2: // substring(s, first): positions are 1-based
+                       {
+                               xpath_allocator_capture cr(stack.temp);
+
+                               xpath_stack swapped_stack = {stack.temp, stack.result};
+
+                               xpath_string s = _left->eval_string(c, swapped_stack);
+                               size_t s_length = s.length();
+
+                               double first = round_nearest(_right->eval_number(c, stack));
+                               
+                               if (is_nan(first)) return xpath_string(); // NaN
+                               else if (first >= s_length + 1) return xpath_string();
+                               // a start before the string is clamped to position 1
+                               size_t pos = first < 1 ? 1 : static_cast<size_t>(first);
+                               assert(1 <= pos && pos <= s_length + 1);
+
+                               const char_t* rbegin = s.c_str() + (pos - 1);
+                               const char_t* rend = s.c_str() + s.length();
+                               
+                               return s.uses_heap() ? xpath_string::from_heap(rbegin, rend, stack.result) : xpath_string::from_const(rbegin);
+                       }
+                       
+                       case ast_func_substring_3: // substring(s, first, length): 1-based, 'last' is one past the final character
+                       {
+                               xpath_allocator_capture cr(stack.temp);
+
+                               xpath_stack swapped_stack = {stack.temp, stack.result};
+
+                               xpath_string s = _left->eval_string(c, swapped_stack);
+                               size_t s_length = s.length();
+
+                               double first = round_nearest(_right->eval_number(c, stack));
+                               double last = first + round_nearest(_right->_next->eval_number(c, stack));
+                               // empty result for NaN bounds, a start past the end, an empty/inverted range, or a range ending before position 1
+                               if (is_nan(first) || is_nan(last)) return xpath_string();
+                               else if (first >= s_length + 1) return xpath_string();
+                               else if (first >= last) return xpath_string();
+                               else if (last < 1) return xpath_string();
+                               // clamp both bounds into [1, s_length + 1]
+                               size_t pos = first < 1 ? 1 : static_cast<size_t>(first);
+                               size_t end = last >= s_length + 1 ? s_length + 1 : static_cast<size_t>(last);
+
+                               assert(1 <= pos && pos <= end && end <= s_length + 1);
+                               const char_t* rbegin = s.c_str() + (pos - 1);
+                               const char_t* rend = s.c_str() + (end - 1);
+                               // from_const takes only a begin pointer, so it is used only when the range runs to the terminator of a non-heap string
+                               return (end == s_length + 1 && !s.uses_heap()) ? xpath_string::from_const(rbegin) : xpath_string::from_heap(rbegin, rend, stack.result);
+                       }
+
+                       case ast_func_normalize_space_0:
+                       {
+                               xpath_string s = string_value(c.n, stack.result);
+                               // NOTE(review): s.data(stack.result) presumably yields a writable buffer (copying if s is const) that is normalized in place - confirm
+                               normalize_space(s.data(stack.result));
+
+                               return s;
+                       }
+
+                       case ast_func_normalize_space_1:
+                       {
+                               xpath_string s = _left->eval_string(c, stack);
+
+                               normalize_space(s.data(stack.result));
+                       
+                               return s;
+                       }
+
+                       case ast_func_translate:
+                       {
+                               xpath_allocator_capture cr(stack.temp);
+
+                               xpath_stack swapped_stack = {stack.temp, stack.result};
+                               // s is evaluated on the caller's stack because it is the value returned; from/to use the swapped stack
+                               xpath_string s = _left->eval_string(c, stack);
+                               xpath_string from = _right->eval_string(c, swapped_stack);
+                               xpath_string to = _right->_next->eval_string(c, swapped_stack);
+
+                               translate(s.data(stack.result), from.c_str(), to.c_str(), to.length());
+
+                               return s;
+                       }
+
+                       case ast_opt_translate_table: // translate() using the lookup table stored in _data.table
+                       {
+                               xpath_string s = _left->eval_string(c, stack);
+
+                               translate_table(s.data(stack.result), _data.table);
+
+                               return s;
+                       }
+
+                       case ast_variable:
+                       {
+                               assert(_rettype == _data.variable->type());
+                               // a string variable is returned directly; any other variable type falls through to the conversions below
+                               if (_rettype == xpath_type_string)
+                                       return xpath_string::from_const(_data.variable->get_string());
+                       }
+                       // fallthrough
+
+                       default: // not a native string expression: evaluate as its own return type, then convert
+                       {
+                               switch (_rettype)
+                               {
+                               case xpath_type_boolean:
+                                       return xpath_string::from_const(eval_boolean(c, stack) ? PUGIXML_TEXT("true") : PUGIXML_TEXT("false"));
+                                       
+                               case xpath_type_number:
+                                       return convert_number_to_string(eval_number(c, stack), stack.result);
+                                       
+                               case xpath_type_node_set: // string value of ns.first(), or an empty string for an empty set
+                               {
+                                       xpath_allocator_capture cr(stack.temp);
+
+                                       xpath_stack swapped_stack = {stack.temp, stack.result};
+
+                                       xpath_node_set_raw ns = eval_node_set(c, swapped_stack, nodeset_eval_first);
+                                       return ns.empty() ? xpath_string() : string_value(ns.first(), stack.result);
+                               }
+                               
+                               default:
+                                       assert(!"Wrong expression for return type string");
+                                       return xpath_string();
+                               }
+                       }
+                       }
+               }
+
+               xpath_node_set_raw eval_node_set(const xpath_context& c, const xpath_stack& stack, nodeset_eval_t eval) // Evaluates this AST node against context c and returns the resulting node set.
+               { // 'eval' is forwarded to sub-evaluations (nodeset_eval_all / nodeset_eval_first / nodeset_eval_any are the values used by callers in this file).
+                       switch (_type)
+                       {
+                       case ast_op_union:
+                       {
+                               xpath_allocator_capture cr(stack.temp);
+                               // the left operand is evaluated on the swapped stack, the right one on the caller's stack; rs is the set that is returned
+                               xpath_stack swapped_stack = {stack.temp, stack.result};
+
+                               xpath_node_set_raw ls = _left->eval_node_set(c, swapped_stack, eval);
+                               xpath_node_set_raw rs = _right->eval_node_set(c, stack, eval);
+
+                               // we can optimize merging two sorted sets, but this is a very rare operation, so don't bother
+                               rs.set_type(xpath_node_set::type_unsorted);
+                               // append the left nodes using stack.result, then drop nodes present in both operands
+                               rs.append(ls.begin(), ls.end(), stack.result);
+                               rs.remove_duplicates();
+
+                               return rs;
+                       }
+
+                       case ast_filter:
+                       {
+                               xpath_node_set_raw set = _left->eval_node_set(c, stack, _test == predicate_constant_one ? nodeset_eval_first : nodeset_eval_all);
+
+                               // either expression is a number or it contains position() call; sort by document order
+                               if (_test != predicate_posinv) set.sort_do();
+
+                               bool once = eval_once(set.type(), eval);
+
+                               apply_predicate(set, 0, stack, once);
+                       
+                               return set;
+                       }
+                       
+                       case ast_func_id: // id() always yields an empty node set here
+                               return xpath_node_set_raw();
+                       
+                       case ast_step:
+                       { // _axis is a runtime value; each case passes the matching axis_to_type<> tag to step_do
+                               switch (_axis)
+                               {
+                               case axis_ancestor:
+                                       return step_do(c, stack, eval, axis_to_type<axis_ancestor>());
+                                       
+                               case axis_ancestor_or_self:
+                                       return step_do(c, stack, eval, axis_to_type<axis_ancestor_or_self>());
+
+                               case axis_attribute:
+                                       return step_do(c, stack, eval, axis_to_type<axis_attribute>());
+
+                               case axis_child:
+                                       return step_do(c, stack, eval, axis_to_type<axis_child>());
+                               
+                               case axis_descendant:
+                                       return step_do(c, stack, eval, axis_to_type<axis_descendant>());
+
+                               case axis_descendant_or_self:
+                                       return step_do(c, stack, eval, axis_to_type<axis_descendant_or_self>());
+
+                               case axis_following:
+                                       return step_do(c, stack, eval, axis_to_type<axis_following>());
+                               
+                               case axis_following_sibling:
+                                       return step_do(c, stack, eval, axis_to_type<axis_following_sibling>());
+                               
+                               case axis_namespace:
+                                       // namespace axis is not supported; an empty set is returned
+                                       return xpath_node_set_raw();
+                               
+                               case axis_parent:
+                                       return step_do(c, stack, eval, axis_to_type<axis_parent>());
+                               
+                               case axis_preceding:
+                                       return step_do(c, stack, eval, axis_to_type<axis_preceding>());
+
+                               case axis_preceding_sibling:
+                                       return step_do(c, stack, eval, axis_to_type<axis_preceding_sibling>());
+                               
+                               case axis_self:
+                                       return step_do(c, stack, eval, axis_to_type<axis_self>());
+
+                               default:
+                                       assert(!"Unknown axis");
+                                       return xpath_node_set_raw();
+                               }
+                       }
+
+                       case ast_step_root:
+                       {
+                               assert(!_right); // root step can't have any predicates
+
+                               xpath_node_set_raw ns;
+
+                               ns.set_type(xpath_node_set::type_sorted);
+                               // for an attribute context the root is reached through the attribute's parent; otherwise ns stays empty
+                               if (c.n.node()) ns.push_back(c.n.node().root(), stack.result);
+                               else if (c.n.attribute()) ns.push_back(c.n.parent().root(), stack.result);
+
+                               return ns;
+                       }
+
+                       case ast_variable:
+                       {
+                               assert(_rettype == _data.variable->type());
+                               // a node-set variable is copied into a raw set using stack.result, keeping the variable's set type
+                               if (_rettype == xpath_type_node_set)
+                               {
+                                       const xpath_node_set& s = _data.variable->get_node_set();
+
+                                       xpath_node_set_raw ns;
+
+                                       ns.set_type(s.type());
+                                       ns.append(s.begin(), s.end(), stack.result);
+
+                                       return ns;
+                               }
+                       }
+                       // fallthrough
+
+                       default: // no conversion to a node set exists: any other node kind is a logic error
+                               assert(!"Wrong expression for return type node set");
+                               return xpath_node_set_raw();
+                       }
+               }
+
+               void optimize(xpath_allocator* alloc) // Rewrites this subtree in place into cheaper equivalent forms; alloc is only used to build the translate() lookup table.
+               { // Children (_left, _right, _next) are optimized first, so the checks below see already-rewritten subtrees.
+                       if (_left) _left->optimize(alloc);
+                       if (_right) _right->optimize(alloc);
+                       if (_next) _next->optimize(alloc);
+
+                       // Rewrite [position()=expr] with [expr]
+                       // Note that this step has to go before classification to recognize [position()=1]
+                       // NOTE(review): _right is dereferenced without a null check; assumes filter/predicate nodes always carry an expression - confirm
+                       if ((_type == ast_filter || _type == ast_predicate) &&
+                               _right->_type == ast_op_equal && _right->_left->_type == ast_func_position && _right->_right->_rettype == xpath_type_number)
+                       {
+                               _right = _right->_right;
+                       }
+
+                       // Classify filter/predicate ops to perform various optimizations during evaluation
+                       if (_type == ast_filter || _type == ast_predicate)
+                       {
+                               assert(_test == predicate_default);
+                               // if none of the branches match, _test stays predicate_default
+                               if (_right->_type == ast_number_constant && _right->_data.number == 1.0)
+                                       _test = predicate_constant_one;
+                               else if (_right->_rettype == xpath_type_number && (_right->_type == ast_number_constant || _right->_type == ast_variable || _right->_type == ast_func_last))
+                                       _test = predicate_constant;
+                               else if (_right->_rettype != xpath_type_number && _right->is_posinv_expr())
+                                       _test = predicate_posinv;
+                       }
+
+                       // Rewrite descendant-or-self::node()/child::foo with descendant::foo
+                       // The former is a full form of //foo, the latter is much faster since it executes the node test immediately
+                       // Do a similar kind of rewrite for self/descendant/descendant-or-self axes
+                       // Note that we only rewrite positionally invariant steps (//foo[1] != /descendant::foo[1])
+                       if (_type == ast_step && (_axis == axis_child || _axis == axis_self || _axis == axis_descendant || _axis == axis_descendant_or_self) && _left &&
+                               _left->_type == ast_step && _left->_axis == axis_descendant_or_self && _left->_test == nodetest_type_node && !_left->_right &&
+                               is_posinv_step())
+                       {
+                               if (_axis == axis_child || _axis == axis_descendant)
+                                       _axis = axis_descendant;
+                               else
+                                       _axis = axis_descendant_or_self;
+                               // drop the intermediate descendant-or-self::node() step from the chain
+                               _left = _left->_left;
+                       }
+
+                       // Use optimized lookup table implementation for translate() with constant arguments
+                       if (_type == ast_func_translate && _right->_type == ast_string_constant && _right->_next->_type == ast_string_constant)
+                       {
+                               unsigned char* table = translate_table_generate(alloc, _right->_data.string, _right->_next->_data.string);
+                               // a null table leaves the node as a regular ast_func_translate
+                               if (table)
+                               {
+                                       _type = ast_opt_translate_table;
+                                       _data.table = table;
+                               }
+                       }
+
+                       // Use optimized path for @attr = 'value' or @attr = $value
+                       if (_type == ast_op_equal &&
+                               _left->_type == ast_step && _left->_axis == axis_attribute && _left->_test == nodetest_name && !_left->_left && !_left->_right &&
+                               (_right->_type == ast_string_constant || (_right->_type == ast_variable && _right->_rettype == xpath_type_string)))
+                       {
+                               _type = ast_opt_compare_attribute;
+                       }
+               }
+               
+               bool is_posinv_expr() const
+               {
+                       switch (_type)
+                       {
+                       case ast_func_position:
+                       case ast_func_last:
+                               return false;
+
+                       case ast_string_constant:
+                       case ast_number_constant:
+                       case ast_variable:
+                               return true;
+
+                       case ast_step:
+                       case ast_step_root:
+                               return true;
+
+                       case ast_predicate:
+                       case ast_filter:
+                               return true;
+
+                       default:
+                               if (_left && !_left->is_posinv_expr()) return false;
+                               
+                               for (xpath_ast_node* n = _right; n; n = n->_next)
+                                       if (!n->is_posinv_expr()) return false;
+                                       
+                               return true;
+                       }
+               }
+
+               bool is_posinv_step() const
+               {
+                       assert(_type == ast_step);
+
+                       for (xpath_ast_node* n = _right; n; n = n->_next)
+                       {
+                               assert(n->_type == ast_predicate);
+
+                               if (n->_test != predicate_posinv)
+                                       return false;
+                       }
+
+                       return true;
+               }
+
+               xpath_value_type rettype() const
+               {
+                       // Exposes the stored _rettype field as the xpath_value_type enum.
+                       const xpath_value_type value_type = static_cast<xpath_value_type>(_rettype);
+
+                       return value_type;
+               }
+       };
+
+       struct xpath_parser
+       {
+               xpath_allocator* _alloc;
+               xpath_lexer _lexer;
+
+               const char_t* _query;
+               xpath_variable_set* _variables;
+
+               xpath_parse_result* _result;
+
+               char_t _scratch[32];
+
+       #ifdef PUGIXML_NO_EXCEPTIONS
+               jmp_buf _error_handler;
+       #endif
+
+               void throw_error(const char* message)
+               {
+                       _result->error = message;
+                       _result->offset = _lexer.current_pos() - _query;
+
+               #ifdef PUGIXML_NO_EXCEPTIONS
+                       longjmp(_error_handler, 1);
+               #else
+                       throw xpath_exception(*_result);
+               #endif
+               }
+
+               void throw_error_oom()
+               {
+               #ifdef PUGIXML_NO_EXCEPTIONS
+                       throw_error("Out of memory");
+               #else
+                       throw std::bad_alloc();
+               #endif
+               }
+
+               void* alloc_node()
+               {
+                       void* result = _alloc->allocate_nothrow(sizeof(xpath_ast_node));
+
+                       if (!result) throw_error_oom();
+
+                       return result;
+               }
+
+               // Copies the characters [value.begin, value.end) into storage obtained from _alloc and
+               // appends a terminating zero. Returns 0 when value.begin is null; an allocation failure
+               // is routed to throw_error_oom().
+               const char_t* alloc_string(const xpath_lexer_string& value)
+               {
+                       if (!value.begin)
+                               return 0;
+
+                       const size_t char_count = static_cast<size_t>(value.end - value.begin);
+                       const size_t byte_count = (char_count + 1) * sizeof(char_t);
+
+                       char_t* copy = static_cast<char_t*>(_alloc->allocate_nothrow(byte_count));
+                       if (!copy) throw_error_oom();
+                       assert(copy); // workaround for clang static analysis
+
+                       memcpy(copy, value.begin, char_count * sizeof(char_t));
+                       copy[char_count] = 0;
+
+                       return copy;
+               }
+
+               // Builds the node for a string-returning function that exists in a zero-argument form
+               // (type0) and a one-argument form (type1). When an argument is present it must be a
+               // node set, otherwise throw_error() is called.
+               xpath_ast_node* parse_function_helper(ast_type_t type0, ast_type_t type1, size_t argc, xpath_ast_node* args[2])
+               {
+                       assert(argc <= 1);
+
+                       if (argc == 1)
+                       {
+                               if (args[0]->rettype() != xpath_type_node_set)
+                                       throw_error("Function has to be applied to node set");
+                       }
+
+                       const ast_type_t node_type = (argc == 0) ? type0 : type1;
+
+                       return new (alloc_node()) xpath_ast_node(node_type, xpath_type_string, args[0]);
+               }
+
+               // Maps a function name plus argument count onto the matching AST node.
+               // args holds the first two parsed arguments (unused slots are 0). A name that is not
+               // recognized, or a recognized name with an unsupported argument count, ends in
+               // throw_error(). Every matching branch below either returns or calls throw_error(),
+               // so the name tests inside a case are written as independent ifs.
+               xpath_ast_node* parse_function(const xpath_lexer_string& name, size_t argc, xpath_ast_node* args[2])
+               {
+                       xpath_ast_node* const arg0 = args[0];
+                       xpath_ast_node* const arg1 = args[1];
+
+                       // dispatch on the first character so only a few names are compared per call
+                       switch (name.begin[0])
+                       {
+                       case 'b':
+                               if (name == PUGIXML_TEXT("boolean") && argc == 1)
+                                       return new (alloc_node()) xpath_ast_node(ast_func_boolean, xpath_type_boolean, arg0);
+
+                               break;
+
+                       case 'c':
+                               if (name == PUGIXML_TEXT("count") && argc == 1)
+                               {
+                                       if (arg0->rettype() != xpath_type_node_set)
+                                               throw_error("Function has to be applied to node set");
+
+                                       return new (alloc_node()) xpath_ast_node(ast_func_count, xpath_type_number, arg0);
+                               }
+
+                               if (name == PUGIXML_TEXT("contains") && argc == 2)
+                                       return new (alloc_node()) xpath_ast_node(ast_func_contains, xpath_type_boolean, arg0, arg1);
+
+                               if (name == PUGIXML_TEXT("concat") && argc >= 2)
+                                       return new (alloc_node()) xpath_ast_node(ast_func_concat, xpath_type_string, arg0, arg1);
+
+                               if (name == PUGIXML_TEXT("ceiling") && argc == 1)
+                                       return new (alloc_node()) xpath_ast_node(ast_func_ceiling, xpath_type_number, arg0);
+
+                               break;
+
+                       case 'f':
+                               if (name == PUGIXML_TEXT("false") && argc == 0)
+                                       return new (alloc_node()) xpath_ast_node(ast_func_false, xpath_type_boolean);
+
+                               if (name == PUGIXML_TEXT("floor") && argc == 1)
+                                       return new (alloc_node()) xpath_ast_node(ast_func_floor, xpath_type_number, arg0);
+
+                               break;
+
+                       case 'i':
+                               if (name == PUGIXML_TEXT("id") && argc == 1)
+                                       return new (alloc_node()) xpath_ast_node(ast_func_id, xpath_type_node_set, arg0);
+
+                               break;
+
+                       case 'l':
+                               if (name == PUGIXML_TEXT("last") && argc == 0)
+                                       return new (alloc_node()) xpath_ast_node(ast_func_last, xpath_type_number);
+
+                               if (name == PUGIXML_TEXT("lang") && argc == 1)
+                                       return new (alloc_node()) xpath_ast_node(ast_func_lang, xpath_type_boolean, arg0);
+
+                               if (name == PUGIXML_TEXT("local-name") && argc <= 1)
+                                       return parse_function_helper(ast_func_local_name_0, ast_func_local_name_1, argc, args);
+
+                               break;
+
+                       case 'n':
+                               if (name == PUGIXML_TEXT("name") && argc <= 1)
+                                       return parse_function_helper(ast_func_name_0, ast_func_name_1, argc, args);
+
+                               if (name == PUGIXML_TEXT("namespace-uri") && argc <= 1)
+                                       return parse_function_helper(ast_func_namespace_uri_0, ast_func_namespace_uri_1, argc, args);
+
+                               if (name == PUGIXML_TEXT("normalize-space") && argc <= 1)
+                                       return new (alloc_node()) xpath_ast_node(argc == 0 ? ast_func_normalize_space_0 : ast_func_normalize_space_1, xpath_type_string, arg0, arg1);
+
+                               if (name == PUGIXML_TEXT("not") && argc == 1)
+                                       return new (alloc_node()) xpath_ast_node(ast_func_not, xpath_type_boolean, arg0);
+
+                               if (name == PUGIXML_TEXT("number") && argc <= 1)
+                                       return new (alloc_node()) xpath_ast_node(argc == 0 ? ast_func_number_0 : ast_func_number_1, xpath_type_number, arg0);
+
+                               break;
+
+                       case 'p':
+                               if (name == PUGIXML_TEXT("position") && argc == 0)
+                                       return new (alloc_node()) xpath_ast_node(ast_func_position, xpath_type_number);
+
+                               break;
+
+                       case 'r':
+                               if (name == PUGIXML_TEXT("round") && argc == 1)
+                                       return new (alloc_node()) xpath_ast_node(ast_func_round, xpath_type_number, arg0);
+
+                               break;
+
+                       case 's':
+                               if (name == PUGIXML_TEXT("string") && argc <= 1)
+                                       return new (alloc_node()) xpath_ast_node(argc == 0 ? ast_func_string_0 : ast_func_string_1, xpath_type_string, arg0);
+
+                               if (name == PUGIXML_TEXT("string-length") && argc <= 1)
+                                       return new (alloc_node()) xpath_ast_node(argc == 0 ? ast_func_string_length_0 : ast_func_string_length_1, xpath_type_number, arg0);
+
+                               if (name == PUGIXML_TEXT("starts-with") && argc == 2)
+                                       return new (alloc_node()) xpath_ast_node(ast_func_starts_with, xpath_type_boolean, arg0, arg1);
+
+                               if (name == PUGIXML_TEXT("substring-before") && argc == 2)
+                                       return new (alloc_node()) xpath_ast_node(ast_func_substring_before, xpath_type_string, arg0, arg1);
+
+                               if (name == PUGIXML_TEXT("substring-after") && argc == 2)
+                                       return new (alloc_node()) xpath_ast_node(ast_func_substring_after, xpath_type_string, arg0, arg1);
+
+                               if (name == PUGIXML_TEXT("substring") && (argc == 2 || argc == 3))
+                                       return new (alloc_node()) xpath_ast_node(argc == 2 ? ast_func_substring_2 : ast_func_substring_3, xpath_type_string, arg0, arg1);
+
+                               if (name == PUGIXML_TEXT("sum") && argc == 1)
+                               {
+                                       if (arg0->rettype() != xpath_type_node_set)
+                                               throw_error("Function has to be applied to node set");
+
+                                       return new (alloc_node()) xpath_ast_node(ast_func_sum, xpath_type_number, arg0);
+                               }
+
+                               break;
+
+                       case 't':
+                               if (name == PUGIXML_TEXT("translate") && argc == 3)
+                                       return new (alloc_node()) xpath_ast_node(ast_func_translate, xpath_type_string, arg0, arg1);
+
+                               if (name == PUGIXML_TEXT("true") && argc == 0)
+                                       return new (alloc_node()) xpath_ast_node(ast_func_true, xpath_type_boolean);
+
+                               break;
+
+                       default:
+                               break;
+                       }
+
+                       // nothing matched
+                       throw_error("Unrecognized function or wrong parameter count");
+
+                       return 0;
+               }
+
+               // Translates an axis name into its axis_t value. On return `specified` tells whether
+               // the name was recognized; for an unknown name it is false and axis_child is returned.
+               axis_t parse_axis_name(const xpath_lexer_string& name, bool& specified)
+               {
+                       // assume a match; reset at the bottom when no name fits
+                       specified = true;
+
+                       switch (name.begin[0])
+                       {
+                       case 'a':
+                               if (name == PUGIXML_TEXT("ancestor")) return axis_ancestor;
+                               if (name == PUGIXML_TEXT("ancestor-or-self")) return axis_ancestor_or_self;
+                               if (name == PUGIXML_TEXT("attribute")) return axis_attribute;
+                               break;
+
+                       case 'c':
+                               if (name == PUGIXML_TEXT("child")) return axis_child;
+                               break;
+
+                       case 'd':
+                               if (name == PUGIXML_TEXT("descendant")) return axis_descendant;
+                               if (name == PUGIXML_TEXT("descendant-or-self")) return axis_descendant_or_self;
+                               break;
+
+                       case 'f':
+                               if (name == PUGIXML_TEXT("following")) return axis_following;
+                               if (name == PUGIXML_TEXT("following-sibling")) return axis_following_sibling;
+                               break;
+
+                       case 'n':
+                               if (name == PUGIXML_TEXT("namespace")) return axis_namespace;
+                               break;
+
+                       case 'p':
+                               if (name == PUGIXML_TEXT("parent")) return axis_parent;
+                               if (name == PUGIXML_TEXT("preceding")) return axis_preceding;
+                               if (name == PUGIXML_TEXT("preceding-sibling")) return axis_preceding_sibling;
+                               break;
+
+                       case 's':
+                               if (name == PUGIXML_TEXT("self")) return axis_self;
+                               break;
+
+                       default:
+                               break;
+                       }
+
+                       // unknown name: report it and hand back the child axis as a placeholder
+                       specified = false;
+                       return axis_child;
+               }
+
+               // Maps a node type name ("comment", "node", "processing-instruction", "text") onto the
+               // corresponding nodetest_type_* value; any other name gives nodetest_none.
+               nodetest_t parse_node_test_type(const xpath_lexer_string& name)
+               {
+                       nodetest_t kind = nodetest_none;
+
+                       switch (name.begin[0])
+                       {
+                       case 'c':
+                               if (name == PUGIXML_TEXT("comment")) kind = nodetest_type_comment;
+                               break;
+
+                       case 'n':
+                               if (name == PUGIXML_TEXT("node")) kind = nodetest_type_node;
+                               break;
+
+                       case 'p':
+                               if (name == PUGIXML_TEXT("processing-instruction")) kind = nodetest_type_pi;
+                               break;
+
+                       case 't':
+                               if (name == PUGIXML_TEXT("text")) kind = nodetest_type_text;
+                               break;
+
+                       default:
+                               break;
+                       }
+
+                       return kind;
+               }
+
+               // PrimaryExpr ::= VariableReference | '(' Expr ')' | Literal | Number | FunctionCall
+               // Parses one primary expression, selected by the current lexeme: a variable reference,
+               // a parenthesized expression, a string literal, a number or a function call. Any other
+               // lexeme is reported through throw_error().
+               xpath_ast_node* parse_primary_expression()
+               {
+                       const lexeme_t token = _lexer.current();
+
+                       // VariableReference
+                       if (token == lex_var_ref)
+                       {
+                               xpath_lexer_string var_name = _lexer.contents();
+
+                               if (!_variables)
+                                       throw_error("Unknown variable: variable set is not provided");
+
+                               xpath_variable* variable = get_variable_scratch(_scratch, _variables, var_name.begin, var_name.end);
+
+                               if (!variable)
+                                       throw_error("Unknown variable: variable set does not contain the given name");
+
+                               _lexer.next();
+
+                               return new (alloc_node()) xpath_ast_node(ast_variable, variable->type(), variable);
+                       }
+
+                       // '(' Expr ')'
+                       if (token == lex_open_brace)
+                       {
+                               _lexer.next();
+
+                               xpath_ast_node* inner = parse_expression();
+
+                               if (_lexer.current() != lex_close_brace)
+                                       throw_error("Unmatched braces");
+
+                               _lexer.next();
+
+                               return inner;
+                       }
+
+                       // Literal
+                       if (token == lex_quoted_string)
+                       {
+                               const char_t* text = alloc_string(_lexer.contents());
+
+                               xpath_ast_node* literal = new (alloc_node()) xpath_ast_node(ast_string_constant, xpath_type_string, text);
+                               _lexer.next();
+
+                               return literal;
+                       }
+
+                       // Number
+                       if (token == lex_number)
+                       {
+                               double number = 0;
+
+                               if (!convert_string_to_number_scratch(_scratch, _lexer.contents().begin, _lexer.contents().end, &number))
+                                       throw_error_oom();
+
+                               xpath_ast_node* constant = new (alloc_node()) xpath_ast_node(ast_number_constant, xpath_type_number, number);
+                               _lexer.next();
+
+                               return constant;
+                       }
+
+                       // FunctionCall
+                       if (token == lex_string)
+                       {
+                               // the first two arguments are kept here; later ones are chained with set_next()
+                               xpath_ast_node* args[2] = {0};
+                               size_t argc = 0;
+
+                               xpath_lexer_string function = _lexer.contents();
+                               _lexer.next();
+
+                               xpath_ast_node* previous = 0;
+
+                               if (_lexer.current() != lex_open_brace)
+                                       throw_error("Unrecognized function call");
+                               _lexer.next();
+
+                               // first argument, if the list is not empty
+                               if (_lexer.current() != lex_close_brace)
+                               {
+                                       args[0] = parse_expression();
+                                       argc = 1;
+                               }
+
+                               // remaining arguments, each introduced by a comma
+                               while (_lexer.current() != lex_close_brace)
+                               {
+                                       if (_lexer.current() != lex_comma)
+                                               throw_error("No comma between function arguments");
+                                       _lexer.next();
+
+                                       xpath_ast_node* argument = parse_expression();
+
+                                       if (argc < 2)
+                                               args[argc] = argument;
+                                       else
+                                               previous->set_next(argument);
+
+                                       previous = argument;
+                                       argc++;
+                               }
+
+                               _lexer.next();
+
+                               return parse_function(function, argc, args);
+                       }
+
+                       throw_error("Unrecognizable primary expression");
+
+                       return 0;
+               }
+               
+               // FilterExpr ::= PrimaryExpr | FilterExpr Predicate
+               // Predicate ::= '[' PredicateExpr ']'
+               // PredicateExpr ::= Expr
+               // Parses a primary expression followed by any number of '[' Expr ']' predicates. Each
+               // predicate wraps the expression parsed so far in an ast_filter node; the wrapped
+               // expression has to be a node set.
+               xpath_ast_node* parse_filter_expression()
+               {
+                       xpath_ast_node* filtered = parse_primary_expression();
+
+                       while (_lexer.current() == lex_open_square_brace)
+                       {
+                               _lexer.next();
+
+                               xpath_ast_node* predicate = parse_expression();
+
+                               // the type check happens only after the predicate body has been parsed
+                               if (filtered->rettype() != xpath_type_node_set)
+                                       throw_error("Predicate has to be applied to node set");
+
+                               filtered = new (alloc_node()) xpath_ast_node(ast_filter, filtered, predicate, predicate_default);
+
+                               if (_lexer.current() != lex_close_square_brace)
+                                       throw_error("Unmatched square brace");
+
+                               _lexer.next();
+                       }
+
+                       return filtered;
+               }
+               
+               // Step ::= AxisSpecifier NodeTest Predicate* | AbbreviatedStep
+               // AxisSpecifier ::= AxisName '::' | '@'?
+               // NodeTest ::= NameTest | NodeType '(' ')' | 'processing-instruction' '(' Literal ')'
+               // NameTest ::= '*' | NCName ':' '*' | QName
+               // AbbreviatedStep ::= '.' | '..'
+               // Parses a single location step and returns its ast_step node.
+               // set is the node the step is applied to (0 when there is none); when non-null it must
+               // be a node set. Predicates following the node test are parsed here as well and are
+               // attached to the step as a chain of ast_predicate nodes. Syntax errors are reported
+               // through throw_error().
+               xpath_ast_node* parse_step(xpath_ast_node* set)
+               {
+                       if (set && set->rettype() != xpath_type_node_set)
+                               throw_error("Step has to be applied to node set");
+
+                       bool axis_specified = false;
+                       axis_t axis = axis_child; // implied child axis
+
+                       // '@' selects the attribute axis; '.' and '..' are complete steps on their own
+                       if (_lexer.current() == lex_axis_attribute)
+                       {
+                               axis = axis_attribute;
+                               axis_specified = true;
+                               
+                               _lexer.next();
+                       }
+                       else if (_lexer.current() == lex_dot)
+                       {
+                               _lexer.next();
+                               
+                               // '.' becomes a self-axis step with a node() test
+                               return new (alloc_node()) xpath_ast_node(ast_step, set, axis_self, nodetest_type_node, 0);
+                       }
+                       else if (_lexer.current() == lex_double_dot)
+                       {
+                               _lexer.next();
+                               
+                               // '..' becomes a parent-axis step with a node() test
+                               return new (alloc_node()) xpath_ast_node(ast_step, set, axis_parent, nodetest_type_node, 0);
+                       }
+               
+                       // node test: nt_type stays nodetest_none until the kind of test has been decided
+                       nodetest_t nt_type = nodetest_none;
+                       xpath_lexer_string nt_name;
+                       
+                       if (_lexer.current() == lex_string)
+                       {
+                               // node name test
+                               nt_name = _lexer.contents();
+                               _lexer.next();
+
+                               // was it an axis name?
+                               if (_lexer.current() == lex_double_colon)
+                               {
+                                       // parse axis name
+                                       if (axis_specified) throw_error("Two axis specifiers in one step");
+
+                                       axis = parse_axis_name(nt_name, axis_specified);
+
+                                       if (!axis_specified) throw_error("Unknown axis");
+
+                                       // read actual node test
+                                       _lexer.next();
+
+                                       if (_lexer.current() == lex_multiply)
+                                       {
+                                               nt_type = nodetest_all;
+                                               nt_name = xpath_lexer_string();
+                                               _lexer.next();
+                                       }
+                                       else if (_lexer.current() == lex_string)
+                                       {
+                                               nt_name = _lexer.contents();
+                                               _lexer.next();
+                                       }
+                                       else throw_error("Unrecognized node test");
+                               }
+                               
+                               // still undecided: the name is a node type test, processing-instruction(literal), or a name test
+                               if (nt_type == nodetest_none)
+                               {
+                                       // node type test or processing-instruction
+                                       if (_lexer.current() == lex_open_brace)
+                                       {
+                                               _lexer.next();
+                                               
+                                               if (_lexer.current() == lex_close_brace)
+                                               {
+                                                       _lexer.next();
+
+                                                       nt_type = parse_node_test_type(nt_name);
+
+                                                       if (nt_type == nodetest_none) throw_error("Unrecognized node type");
+                                                       
+                                                       // type tests carry no name
+                                                       nt_name = xpath_lexer_string();
+                                               }
+                                               else if (nt_name == PUGIXML_TEXT("processing-instruction"))
+                                               {
+                                                       if (_lexer.current() != lex_quoted_string)
+                                                               throw_error("Only literals are allowed as arguments to processing-instruction()");
+                                               
+                                                       // the literal becomes the name stored with the nodetest_pi test
+                                                       nt_type = nodetest_pi;
+                                                       nt_name = _lexer.contents();
+                                                       _lexer.next();
+                                                       
+                                                       if (_lexer.current() != lex_close_brace)
+                                                               throw_error("Unmatched brace near processing-instruction()");
+                                                       _lexer.next();
+                                               }
+                                               else
+                                                       throw_error("Unmatched brace near node type test");
+
+                                       }
+                                       // QName or NCName:*
+                                       else
+                                       {
+                                               if (nt_name.end - nt_name.begin > 2 && nt_name.end[-2] == ':' && nt_name.end[-1] == '*') // NCName:*
+                                               {
+                                                       nt_name.end--; // erase *
+                                                       
+                                                       nt_type = nodetest_all_in_namespace;
+                                               }
+                                               else nt_type = nodetest_name;
+                                       }
+                               }
+                       }
+                       else if (_lexer.current() == lex_multiply)
+                       {
+                               nt_type = nodetest_all;
+                               _lexer.next();
+                       }
+                       else throw_error("Unrecognized node test");
+                       
+                       // the name (if any) is copied with alloc_string() before being stored in the node
+                       xpath_ast_node* n = new (alloc_node()) xpath_ast_node(ast_step, set, axis, nt_type, alloc_string(nt_name));
+                       
+                       // last predicate appended so far; 0 until the first one is parsed
+                       xpath_ast_node* last = 0;
+                       
+                       while (_lexer.current() == lex_open_square_brace)
+                       {
+                               _lexer.next();
+                               
+                               xpath_ast_node* expr = parse_expression();
+
+                               xpath_ast_node* pred = new (alloc_node()) xpath_ast_node(ast_predicate, 0, expr, predicate_default);
+                               
+                               if (_lexer.current() != lex_close_square_brace)
+                                       throw_error("Unmatched square brace");
+                               _lexer.next();
+                               
+                               // first predicate goes to the step's right child, later ones are chained with set_next()
+                               if (last) last->set_next(pred);
+                               else n->set_right(pred);
+                               
+                               last = pred;
+                       }
+
+                       return n;
+               }
+               
+               // RelativeLocationPath ::= Step | RelativeLocationPath '/' Step | RelativeLocationPath '//' Step
+               // Parses a sequence of steps separated by '/' or '//', starting from set. For '//' an
+               // extra descendant-or-self step with a node() test is inserted before the next step.
+               xpath_ast_node* parse_relative_location_path(xpath_ast_node* set)
+               {
+                       xpath_ast_node* path = parse_step(set);
+
+                       for (;;)
+                       {
+                               const lexeme_t separator = _lexer.current();
+
+                               if (separator != lex_slash && separator != lex_double_slash)
+                                       break;
+
+                               _lexer.next();
+
+                               if (separator == lex_double_slash)
+                                       path = new (alloc_node()) xpath_ast_node(ast_step, path, axis_descendant_or_self, nodetest_type_node, 0);
+
+                               path = parse_step(path);
+                       }
+
+                       return path;
+               }
+               
+               // LocationPath ::= RelativeLocationPath | AbsoluteLocationPath
+               // AbsoluteLocationPath ::= '/' RelativeLocationPath? | '//' RelativeLocationPath
+               // Parses a location path. A leading '/' or '//' makes the path absolute: it starts at an
+               // ast_step_root node, and '//' additionally inserts a descendant-or-self step. Without
+               // a leading slash the path is parsed as a relative path with no starting set.
+               xpath_ast_node* parse_location_path()
+               {
+                       const lexeme_t first = _lexer.current();
+
+                       if (first != lex_slash && first != lex_double_slash)
+                               return parse_relative_location_path(0);
+
+                       _lexer.next();
+
+                       xpath_ast_node* root = new (alloc_node()) xpath_ast_node(ast_step_root, xpath_type_node_set);
+
+                       if (first == lex_double_slash)
+                       {
+                               xpath_ast_node* descend = new (alloc_node()) xpath_ast_node(ast_step, root, axis_descendant_or_self, nodetest_type_node, 0);
+
+                               return parse_relative_location_path(descend);
+                       }
+
+                       // relative location path can start from axis_attribute, dot, double_dot, multiply and string lexemes; any other lexeme means standalone root path
+                       const lexeme_t following = _lexer.current();
+
+                       const bool starts_step =
+                               following == lex_string || following == lex_axis_attribute || following == lex_dot ||
+                               following == lex_double_dot || following == lex_multiply;
+
+                       if (!starts_step)
+                               return root;
+
+                       return parse_relative_location_path(root);
+               }
+               
+               // PathExpr ::= LocationPath
+               //                              | FilterExpr
+               //                              | FilterExpr '/' RelativeLocationPath
+               //                              | FilterExpr '//' RelativeLocationPath
+               // UnionExpr ::= PathExpr | UnionExpr '|' PathExpr
+               // UnaryExpr ::= UnionExpr | '-' UnaryExpr
+               // Parses a path expression or a unary minus.
+               // A lexeme that can start a primary expression ('$' variable, '(', string literal,
+               // number, or a name followed by '(' that is not a node type test) is parsed as a filter
+               // expression, optionally continued by '/' or '//' and a relative location path.
+               // '-' negates the operand parsed after it. Everything else is a location path.
+               xpath_ast_node* parse_path_or_unary_expression()
+               {
+                       const lexeme_t first = _lexer.current();
+
+                       const bool may_start_primary =
+                               first == lex_var_ref || first == lex_open_brace ||
+                               first == lex_quoted_string || first == lex_number ||
+                               first == lex_string;
+
+                       if (may_start_primary)
+                       {
+                               if (first == lex_string)
+                               {
+                                       // A name is a function call only when the next non-space character is '(';
+                                       // otherwise it belongs to a location path.
+                                       const char_t* lookahead = _lexer.state();
+
+                                       while (PUGI__IS_CHARTYPE(*lookahead, ct_space))
+                                               ++lookahead;
+
+                                       if (*lookahead != '(')
+                                               return parse_location_path();
+
+                                       // name( ... ) can still be a node type test such as text(); those are location paths too
+                                       if (parse_node_test_type(_lexer.contents()) != nodetest_none)
+                                               return parse_location_path();
+                               }
+
+                               xpath_ast_node* filter = parse_filter_expression();
+
+                               const lexeme_t separator = _lexer.current();
+
+                               if (separator != lex_slash && separator != lex_double_slash)
+                                       return filter;
+
+                               _lexer.next();
+
+                               if (separator == lex_double_slash)
+                               {
+                                       if (filter->rettype() != xpath_type_node_set)
+                                               throw_error("Step has to be applied to node set");
+
+                                       filter = new (alloc_node()) xpath_ast_node(ast_step, filter, axis_descendant_or_self, nodetest_type_node, 0);
+                               }
+
+                               // select from location path
+                               return parse_relative_location_path(filter);
+                       }
+
+                       if (first == lex_minus)
+                       {
+                               _lexer.next();
+
+                               // precedence 7+ - only parses union expressions
+                               xpath_ast_node* operand = parse_path_or_unary_expression();
+                               xpath_ast_node* negated = parse_expression_rec(operand, 7);
+
+                               return new (alloc_node()) xpath_ast_node(ast_op_negate, xpath_type_number, negated);
+                       }
+
+                       return parse_location_path();
+               }
+
+               struct binary_op_t
+               {
+                       ast_type_t asttype;
+                       xpath_value_type rettype;
+                       int precedence;
+
+                       binary_op_t(): asttype(ast_unknown), rettype(xpath_type_none), precedence(0)
+                       {
+                       }
+
+                       binary_op_t(ast_type_t asttype_, xpath_value_type rettype_, int precedence_): asttype(asttype_), rettype(rettype_), precedence(precedence_)
+                       {
+                       }
+
+                       static binary_op_t parse(xpath_lexer& lexer)
+                       {
+                               switch (lexer.current())
+                               {
+                               case lex_string:
+                                       if (lexer.contents() == PUGIXML_TEXT("or"))
+                                               return binary_op_t(ast_op_or, xpath_type_boolean, 1);
+                                       else if (lexer.contents() == PUGIXML_TEXT("and"))
+                                               return binary_op_t(ast_op_and, xpath_type_boolean, 2);
+                                       else if (lexer.contents() == PUGIXML_TEXT("div"))
+                                               return binary_op_t(ast_op_divide, xpath_type_number, 6);
+                                       else if (lexer.contents() == PUGIXML_TEXT("mod"))
+                                               return binary_op_t(ast_op_mod, xpath_type_number, 6);
+                                       else
+                                               return binary_op_t();
+
+                               case lex_equal:
+                                       return binary_op_t(ast_op_equal, xpath_type_boolean, 3);
+
+                               case lex_not_equal:
+                                       return binary_op_t(ast_op_not_equal, xpath_type_boolean, 3);
+
+                               case lex_less:
+                                       return binary_op_t(ast_op_less, xpath_type_boolean, 4);
+
+                               case lex_greater:
+                                       return binary_op_t(ast_op_greater, xpath_type_boolean, 4);
+
+                               case lex_less_or_equal:
+                                       return binary_op_t(ast_op_less_or_equal, xpath_type_boolean, 4);
+
+                               case lex_greater_or_equal:
+                                       return binary_op_t(ast_op_greater_or_equal, xpath_type_boolean, 4);
+
+                               case lex_plus:
+                                       return binary_op_t(ast_op_add, xpath_type_number, 5);
+
+                               case lex_minus:
+                                       return binary_op_t(ast_op_subtract, xpath_type_number, 5);
+
+                               case lex_multiply:
+                                       return binary_op_t(ast_op_multiply, xpath_type_number, 6);
+
+                               case lex_union:
+                                       return binary_op_t(ast_op_union, xpath_type_node_set, 7);
+
+                               default:
+                                       return binary_op_t();
+                               }
+                       }
+               };
+
+               // Folds binary operators onto lhs for as long as the operator at the current
+               // lexeme has precedence of at least limit, and returns the combined AST node.
+               // lhs is the already-parsed left operand.
+               xpath_ast_node* parse_expression_rec(xpath_ast_node* lhs, int limit)
+               {
+                       binary_op_t op = binary_op_t::parse(_lexer);
+
+                       while (op.asttype != ast_unknown && op.precedence >= limit)
+                       {
+                               // consume the operator lexeme
+                               _lexer.next();
+
+                               xpath_ast_node* rhs = parse_path_or_unary_expression();
+
+                               binary_op_t nextop = binary_op_t::parse(_lexer);
+
+                               // operators that bind tighter than op are folded into rhs first;
+                               // the comparison is strict, so equal precedence is combined left-to-right
+                               while (nextop.asttype != ast_unknown && nextop.precedence > op.precedence)
+                               {
+                                       rhs = parse_expression_rec(rhs, nextop.precedence);
+
+                                       nextop = binary_op_t::parse(_lexer);
+                               }
+
+                               // union is the only operator whose operand types are checked here
+                               if (op.asttype == ast_op_union && (lhs->rettype() != xpath_type_node_set || rhs->rettype() != xpath_type_node_set))
+                                       throw_error("Union operator has to be applied to node sets");
+
+                               lhs = new (alloc_node()) xpath_ast_node(op.asttype, op.rettype, lhs, rhs);
+
+                               // re-read the operator at the current position for the next iteration
+                               op = binary_op_t::parse(_lexer);
+                       }
+
+                       return lhs;
+               }
+
+               // Expr ::= OrExpr
+               // OrExpr ::= AndExpr | OrExpr 'or' AndExpr
+               // AndExpr ::= EqualityExpr | AndExpr 'and' EqualityExpr
+               // EqualityExpr ::= RelationalExpr
+               //                                      | EqualityExpr '=' RelationalExpr
+               //                                      | EqualityExpr '!=' RelationalExpr
+               // RelationalExpr ::= AdditiveExpr
+               //                                        | RelationalExpr '<' AdditiveExpr
+               //                                        | RelationalExpr '>' AdditiveExpr
+               //                                        | RelationalExpr '<=' AdditiveExpr
+               //                                        | RelationalExpr '>=' AdditiveExpr
+               // AdditiveExpr ::= MultiplicativeExpr
+               //                                      | AdditiveExpr '+' MultiplicativeExpr
+               //                                      | AdditiveExpr '-' MultiplicativeExpr
+               // MultiplicativeExpr ::= UnaryExpr
+               //                                                | MultiplicativeExpr '*' UnaryExpr
+               //                                                | MultiplicativeExpr 'div' UnaryExpr
+               //                                                | MultiplicativeExpr 'mod' UnaryExpr
+               xpath_ast_node* parse_expression()
+               {
+                       // parse the first operand, then fold every following binary operator
+                       // (limit 0 accepts all precedences)
+                       xpath_ast_node* first_operand = parse_path_or_unary_expression();
+
+                       return parse_expression_rec(first_operand, 0);
+               }
+
+               xpath_parser(const char_t* query, xpath_variable_set* variables, xpath_allocator* alloc, xpath_parse_result* result): _alloc(alloc), _lexer(query), _query(query), _variables(variables), _result(result)
+               {
+               }
+
+               xpath_ast_node* parse()
+               {
+                       xpath_ast_node* result = parse_expression();
+                       
+                       if (_lexer.current() != lex_eof)
+                       {
+                               // there are still unparsed tokens left, error
+                               throw_error("Incorrect query");
+                       }
+                       
+                       return result;
+               }
+
+               // Convenience entry point: constructs a parser over query and runs it.
+               // Returns the AST root produced by the member parse().
+               // In PUGIXML_NO_EXCEPTIONS builds, returns 0 when setjmp reports a non-zero value
+               // (presumably after a longjmp issued by the error path - confirm against throw_error).
+               static xpath_ast_node* parse(const char_t* query, xpath_variable_set* variables, xpath_allocator* alloc, xpath_parse_result* result)
+               {
+                       xpath_parser parser(query, variables, alloc, result);
+
+               #ifdef PUGIXML_NO_EXCEPTIONS
+                       // setjmp returns 0 on the direct call; any other value means control came back via the handler
+                       int error = setjmp(parser._error_handler);
+
+                       return (error == 0) ? parser.parse() : 0;
+               #else
+                       return parser.parse();
+               #endif
+               }
+       };
+
+       struct xpath_query_impl
+       {
+               static xpath_query_impl* create()
+               {
+                       void* memory = xml_memory::allocate(sizeof(xpath_query_impl));
+
+                       return new (memory) xpath_query_impl();
+               }
+
+               static void destroy(void* ptr)
+               {
+                       if (!ptr) return;
+                       
+                       // free all allocated pages
+                       static_cast<xpath_query_impl*>(ptr)->alloc.release();
+
+                       // free allocator memory (with the first page)
+                       xml_memory::deallocate(ptr);
+               }
+
+               xpath_query_impl(): root(0), alloc(&block)
+               {
+                       block.next = 0;
+                       block.capacity = sizeof(block.data);
+               }
+
+               xpath_ast_node* root;
+               xpath_allocator alloc;
+               xpath_memory_block block;
+       };
+
+       // Evaluates the query stored in impl as a string, using n as the context node.
+       // Returns an empty xpath_string when impl is null, or (in PUGIXML_NO_EXCEPTIONS builds)
+       // when setjmp on sd.error_handler reports a non-zero value.
+       PUGI__FN xpath_string evaluate_string_impl(xpath_query_impl* impl, const xpath_node& n, xpath_stack_data& sd)
+       {
+               if (!impl) return xpath_string();
+
+       #ifdef PUGIXML_NO_EXCEPTIONS
+               // non-zero means control returned through the handler (presumably an evaluation error - confirm)
+               if (setjmp(sd.error_handler)) return xpath_string();
+       #endif
+
+               // context: node n, position 1, size 1
+               xpath_context c(n, 1, 1);
+
+               return impl->root->eval_string(c, sd.stack);
+       }
+
+       PUGI__FN impl::xpath_ast_node* evaluate_node_set_prepare(xpath_query_impl* impl)
+       {
+               if (!impl) return 0;
+
+               if (impl->root->rettype() != xpath_type_node_set)
+               {
+               #ifdef PUGIXML_NO_EXCEPTIONS
+                       return 0;
+               #else
+                       xpath_parse_result res;
+                       res.error = "Expression does not evaluate to node set";
+
+                       throw xpath_exception(res);
+               #endif
+               }
+
+               return impl->root;
+       }
+PUGI__NS_END
+
+namespace pugi
+{
+#ifndef PUGIXML_NO_EXCEPTIONS
+       // Wraps a parse result in an exception; the result must carry an error message.
+       PUGI__FN xpath_exception::xpath_exception(const xpath_parse_result& result_): _result(result_)
+       {
+               assert(_result.error);
+       }
+       
+       // Returns the error message stored in the wrapped parse result.
+       PUGI__FN const char* xpath_exception::what() const throw()
+       {
+               return _result.error;
+       }
+
+       // Returns the wrapped parse result.
+       PUGI__FN const xpath_parse_result& xpath_exception::result() const
+       {
+               return _result;
+       }
+#endif
+       
+       PUGI__FN xpath_node::xpath_node()
+       {
+       }
+               
+       PUGI__FN xpath_node::xpath_node(const xml_node& node_): _node(node_)
+       {
+       }
+               
+       PUGI__FN xpath_node::xpath_node(const xml_attribute& attribute_, const xml_node& parent_): _node(attribute_ ? parent_ : xml_node()), _attribute(attribute_)
+       {
+       }
+
+       PUGI__FN xml_node xpath_node::node() const
+       {
+               return _attribute ? xml_node() : _node;
+       }
+               
+       PUGI__FN xml_attribute xpath_node::attribute() const
+       {
+               return _attribute;
+       }
+       
+       PUGI__FN xml_node xpath_node::parent() const
+       {
+               return _attribute ? _node : _node.parent();
+       }
+
+       PUGI__FN static void unspecified_bool_xpath_node(xpath_node***)
+       {
+       }
+
+       PUGI__FN xpath_node::operator xpath_node::unspecified_bool_type() const
+       {
+               return (_node || _attribute) ? unspecified_bool_xpath_node : 0;
+       }
+       
+       PUGI__FN bool xpath_node::operator!() const
+       {
+               return !(_node || _attribute);
+       }
+
+       PUGI__FN bool xpath_node::operator==(const xpath_node& n) const
+       {
+               return _node == n._node && _attribute == n._attribute;
+       }
+       
+       PUGI__FN bool xpath_node::operator!=(const xpath_node& n) const
+       {
+               return _node != n._node || _attribute != n._attribute;
+       }
+
+#ifdef __BORLANDC__
+       // Borland-only overload: combines the bool value of lhs with rhs using logical AND.
+       PUGI__FN bool operator&&(const xpath_node& lhs, bool rhs)
+       {
+               return (bool)lhs && rhs;
+       }
+
+       // Borland-only overload: combines the bool value of lhs with rhs using logical OR.
+       PUGI__FN bool operator||(const xpath_node& lhs, bool rhs)
+       {
+               return (bool)lhs || rhs;
+       }
+#endif
+
+       PUGI__FN void xpath_node_set::_assign(const_iterator begin_, const_iterator end_)
+       {
+               assert(begin_ <= end_);
+
+               size_t size_ = static_cast<size_t>(end_ - begin_);
+
+               if (size_ <= 1)
+               {
+                       // deallocate old buffer
+                       if (_begin != &_storage) impl::xml_memory::deallocate(_begin);
+
+                       // use internal buffer
+                       if (begin_ != end_) _storage = *begin_;
+
+                       _begin = &_storage;
+                       _end = &_storage + size_;
+               }
+               else
+               {
+                       // make heap copy
+                       xpath_node* storage = static_cast<xpath_node*>(impl::xml_memory::allocate(size_ * sizeof(xpath_node)));
+
+                       if (!storage)
+                       {
+                       #ifdef PUGIXML_NO_EXCEPTIONS
+                               return;
+                       #else
+                               throw std::bad_alloc();
+                       #endif
+                       }
+
+                       memcpy(storage, begin_, size_ * sizeof(xpath_node));
+                       
+                       // deallocate old buffer
+                       if (_begin != &_storage) impl::xml_memory::deallocate(_begin);
+
+                       // finalize
+                       _begin = storage;
+                       _end = storage + size_;
+               }
+       }
+
+       PUGI__FN xpath_node_set::xpath_node_set(): _type(type_unsorted), _begin(&_storage), _end(&_storage)
+       {
+       }
+
+       PUGI__FN xpath_node_set::xpath_node_set(const_iterator begin_, const_iterator end_, type_t type_): _type(type_), _begin(&_storage), _end(&_storage)
+       {
+               _assign(begin_, end_);
+       }
+
+       PUGI__FN xpath_node_set::~xpath_node_set()
+       {
+               if (_begin != &_storage) impl::xml_memory::deallocate(_begin);
+       }
+               
+       PUGI__FN xpath_node_set::xpath_node_set(const xpath_node_set& ns): _type(ns._type), _begin(&_storage), _end(&_storage)
+       {
+               _assign(ns._begin, ns._end);
+       }
+       
+       PUGI__FN xpath_node_set& xpath_node_set::operator=(const xpath_node_set& ns)
+       {
+               if (this == &ns) return *this;
+               
+               _type = ns._type;
+               _assign(ns._begin, ns._end);
+
+               return *this;
+       }
+
+       PUGI__FN xpath_node_set::type_t xpath_node_set::type() const
+       {
+               return _type;
+       }
+               
+       PUGI__FN size_t xpath_node_set::size() const
+       {
+               return _end - _begin;
+       }
+               
+       PUGI__FN bool xpath_node_set::empty() const
+       {
+               return _begin == _end;
+       }
+               
+       PUGI__FN const xpath_node& xpath_node_set::operator[](size_t index) const
+       {
+               assert(index < size());
+               return _begin[index];
+       }
+
+       PUGI__FN xpath_node_set::const_iterator xpath_node_set::begin() const
+       {
+               return _begin;
+       }
+               
+       PUGI__FN xpath_node_set::const_iterator xpath_node_set::end() const
+       {
+               return _end;
+       }
+       
+       PUGI__FN void xpath_node_set::sort(bool reverse)
+       {
+               _type = impl::xpath_sort(_begin, _end, _type, reverse);
+       }
+
+       PUGI__FN xpath_node xpath_node_set::first() const
+       {
+               return impl::xpath_first(_begin, _end, _type);
+       }
+
+       PUGI__FN xpath_parse_result::xpath_parse_result(): error("Internal error"), offset(0)
+       {
+       }
+
+       PUGI__FN xpath_parse_result::operator bool() const
+       {
+               return error == 0;
+       }
+
+       PUGI__FN const char* xpath_parse_result::description() const
+       {
+               return error ? error : "No error";
+       }
+
+       PUGI__FN xpath_variable::xpath_variable(): _type(xpath_type_none), _next(0)
+       {
+       }
+
+       PUGI__FN const char_t* xpath_variable::name() const
+       {
+               switch (_type)
+               {
+               case xpath_type_node_set:
+                       return static_cast<const impl::xpath_variable_node_set*>(this)->name;
+
+               case xpath_type_number:
+                       return static_cast<const impl::xpath_variable_number*>(this)->name;
+
+               case xpath_type_string:
+                       return static_cast<const impl::xpath_variable_string*>(this)->name;
+
+               case xpath_type_boolean:
+                       return static_cast<const impl::xpath_variable_boolean*>(this)->name;
+
+               default:
+                       assert(!"Invalid variable type");
+                       return 0;
+               }
+       }
+
+       PUGI__FN xpath_value_type xpath_variable::type() const
+       {
+               return _type;
+       }
+
+       PUGI__FN bool xpath_variable::get_boolean() const
+       {
+               return (_type == xpath_type_boolean) ? static_cast<const impl::xpath_variable_boolean*>(this)->value : false;
+       }
+
+       PUGI__FN double xpath_variable::get_number() const
+       {
+               return (_type == xpath_type_number) ? static_cast<const impl::xpath_variable_number*>(this)->value : impl::gen_nan();
+       }
+
+       PUGI__FN const char_t* xpath_variable::get_string() const
+       {
+               const char_t* value = (_type == xpath_type_string) ? static_cast<const impl::xpath_variable_string*>(this)->value : 0;
+               return value ? value : PUGIXML_TEXT("");
+       }
+
+       PUGI__FN const xpath_node_set& xpath_variable::get_node_set() const
+       {
+               return (_type == xpath_type_node_set) ? static_cast<const impl::xpath_variable_node_set*>(this)->value : impl::dummy_node_set;
+       }
+
+       PUGI__FN bool xpath_variable::set(bool value)
+       {
+               if (_type != xpath_type_boolean) return false;
+
+               static_cast<impl::xpath_variable_boolean*>(this)->value = value;
+               return true;
+       }
+
+       PUGI__FN bool xpath_variable::set(double value)
+       {
+               if (_type != xpath_type_number) return false;
+
+               static_cast<impl::xpath_variable_number*>(this)->value = value;
+               return true;
+       }
+
+       PUGI__FN bool xpath_variable::set(const char_t* value)
+       {
+               if (_type != xpath_type_string) return false;
+
+               impl::xpath_variable_string* var = static_cast<impl::xpath_variable_string*>(this);
+
+               // duplicate string
+               size_t size = (impl::strlength(value) + 1) * sizeof(char_t);
+
+               char_t* copy = static_cast<char_t*>(impl::xml_memory::allocate(size));
+               if (!copy) return false;
+
+               memcpy(copy, value, size);
+
+               // replace old string
+               if (var->value) impl::xml_memory::deallocate(var->value);
+               var->value = copy;
+
+               return true;
+       }
+
+       PUGI__FN bool xpath_variable::set(const xpath_node_set& value)
+       {
+               if (_type != xpath_type_node_set) return false;
+
+               static_cast<impl::xpath_variable_node_set*>(this)->value = value;
+               return true;
+       }
+
+       PUGI__FN xpath_variable_set::xpath_variable_set()
+       {
+               for (size_t i = 0; i < sizeof(_data) / sizeof(_data[0]); ++i) _data[i] = 0;
+       }
+
+       PUGI__FN xpath_variable_set::~xpath_variable_set()
+       {
+               for (size_t i = 0; i < sizeof(_data) / sizeof(_data[0]); ++i)
+               {
+                       xpath_variable* var = _data[i];
+
+                       while (var)
+                       {
+                               xpath_variable* next = var->_next;
+
+                               impl::delete_xpath_variable(var->_type, var);
+
+                               var = next;
+                       }
+               }
+       }
+
+       PUGI__FN xpath_variable* xpath_variable_set::find(const char_t* name) const
+       {
+               const size_t hash_size = sizeof(_data) / sizeof(_data[0]);
+               size_t hash = impl::hash_string(name) % hash_size;
+
+               // look for existing variable
+               for (xpath_variable* var = _data[hash]; var; var = var->_next)
+                       if (impl::strequal(var->name(), name))
+                               return var;
+
+               return 0;
+       }
+
+       PUGI__FN xpath_variable* xpath_variable_set::add(const char_t* name, xpath_value_type type)
+       {
+               const size_t hash_size = sizeof(_data) / sizeof(_data[0]);
+               size_t hash = impl::hash_string(name) % hash_size;
+
+               // look for existing variable
+               for (xpath_variable* var = _data[hash]; var; var = var->_next)
+                       if (impl::strequal(var->name(), name))
+                               return var->type() == type ? var : 0;
+
+               // add new variable
+               xpath_variable* result = impl::new_xpath_variable(type, name);
+
+               if (result)
+               {
+                       result->_type = type;
+                       result->_next = _data[hash];
+
+                       _data[hash] = result;
+               }
+
+               return result;
+       }
+
+       PUGI__FN bool xpath_variable_set::set(const char_t* name, bool value)
+       {
+               xpath_variable* var = add(name, xpath_type_boolean);
+               return var ? var->set(value) : false;
+       }
+
+       PUGI__FN bool xpath_variable_set::set(const char_t* name, double value)
+       {
+               xpath_variable* var = add(name, xpath_type_number);
+               return var ? var->set(value) : false;
+       }
+
+       PUGI__FN bool xpath_variable_set::set(const char_t* name, const char_t* value)
+       {
+               xpath_variable* var = add(name, xpath_type_string);
+               return var ? var->set(value) : false;
+       }
+
+       PUGI__FN bool xpath_variable_set::set(const char_t* name, const xpath_node_set& value)
+       {
+               xpath_variable* var = add(name, xpath_type_node_set);
+               return var ? var->set(value) : false;
+       }
+
+       PUGI__FN xpath_variable* xpath_variable_set::get(const char_t* name)
+       {
+               return find(name);
+       }
+
+       PUGI__FN const xpath_variable* xpath_variable_set::get(const char_t* name) const
+       {
+               return find(name);
+       }
+
+       PUGI__FN xpath_query::xpath_query(const char_t* query, xpath_variable_set* variables): _impl(0)
+       {
+               impl::xpath_query_impl* qimpl = impl::xpath_query_impl::create();
+
+               if (!qimpl)
+               {
+               #ifdef PUGIXML_NO_EXCEPTIONS
+                       _result.error = "Out of memory";
+               #else
+                       throw std::bad_alloc();
+               #endif
+               }
+               else
+               {
+                       impl::buffer_holder impl_holder(qimpl, impl::xpath_query_impl::destroy);
+
+                       qimpl->root = impl::xpath_parser::parse(query, variables, &qimpl->alloc, &_result);
+
+                       if (qimpl->root)
+                       {
+                               qimpl->root->optimize(&qimpl->alloc);
+
+                               _impl = static_cast<impl::xpath_query_impl*>(impl_holder.release());
+                               _result.error = 0;
+                       }
+               }
+       }
+
+       // Releases the compiled query (destroy() ignores a null pointer)
+       PUGI__FN xpath_query::~xpath_query()
+       {
+               impl::xpath_query_impl::destroy(_impl);
+       }
+
+       // Value type the query evaluates to, or xpath_type_none when there is no compiled query
+       PUGI__FN xpath_value_type xpath_query::return_type() const
+       {
+               if (_impl)
+               {
+                       impl::xpath_query_impl* compiled = static_cast<impl::xpath_query_impl*>(_impl);
+
+                       return compiled->root->rettype();
+               }
+
+               return xpath_type_none;
+       }
+
+       // Evaluates the query as a boolean with n as the context node.
+       // Returns false when there is no compiled query, or (in PUGIXML_NO_EXCEPTIONS builds)
+       // when setjmp on sd.error_handler reports a non-zero value.
+       PUGI__FN bool xpath_query::evaluate_boolean(const xpath_node& n) const
+       {
+               if (!_impl) return false;
+               
+               // context: node n, position 1, size 1
+               impl::xpath_context c(n, 1, 1);
+               impl::xpath_stack_data sd;
+
+       #ifdef PUGIXML_NO_EXCEPTIONS
+               // non-zero means control returned through the handler (presumably an evaluation error - confirm)
+               if (setjmp(sd.error_handler)) return false;
+       #endif
+               
+               return static_cast<impl::xpath_query_impl*>(_impl)->root->eval_boolean(c, sd.stack);
+       }
+       
+       // Evaluates the query as a number with n as the context node.
+       // Returns the result of impl::gen_nan() when there is no compiled query, or (in
+       // PUGIXML_NO_EXCEPTIONS builds) when setjmp on sd.error_handler reports a non-zero value.
+       PUGI__FN double xpath_query::evaluate_number(const xpath_node& n) const
+       {
+               if (!_impl) return impl::gen_nan();
+               
+               // context: node n, position 1, size 1
+               impl::xpath_context c(n, 1, 1);
+               impl::xpath_stack_data sd;
+
+       #ifdef PUGIXML_NO_EXCEPTIONS
+               // non-zero means control returned through the handler (presumably an evaluation error - confirm)
+               if (setjmp(sd.error_handler)) return impl::gen_nan();
+       #endif
+
+               return static_cast<impl::xpath_query_impl*>(_impl)->root->eval_number(c, sd.stack);
+       }
+
+#ifndef PUGIXML_NO_STL
+       // Evaluates the query as a string with n as the context node and returns it as string_t.
+       // A null _impl is handled inside impl::evaluate_string_impl.
+       PUGI__FN string_t xpath_query::evaluate_string(const xpath_node& n) const
+       {
+               impl::xpath_stack_data stack_data;
+
+               impl::xpath_string text = impl::evaluate_string_impl(static_cast<impl::xpath_query_impl*>(_impl), n, stack_data);
+
+               return string_t(text.c_str(), text.length());
+       }
+#endif
+
+       PUGI__FN size_t xpath_query::evaluate_string(char_t* buffer, size_t capacity, const xpath_node& n) const
+       {
+               impl::xpath_stack_data sd;
+
+               impl::xpath_string r = impl::evaluate_string_impl(static_cast<impl::xpath_query_impl*>(_impl), n, sd);
+
+               size_t full_size = r.length() + 1;
+               
+               if (capacity > 0)
+               {
+                       size_t size = (full_size < capacity) ? full_size : capacity;
+                       assert(size > 0);
+
+                       memcpy(buffer, r.c_str(), (size - 1) * sizeof(char_t));
+                       buffer[size - 1] = 0;
+               }
+               
+               return full_size;
+       }
+
+       // Evaluates the query with n as the context node and returns all resulting nodes.
+       // Returns an empty set when impl::evaluate_node_set_prepare yields no root, or (in
+       // PUGIXML_NO_EXCEPTIONS builds) when setjmp on sd.error_handler reports a non-zero value.
+       PUGI__FN xpath_node_set xpath_query::evaluate_node_set(const xpath_node& n) const
+       {
+               impl::xpath_ast_node* root = impl::evaluate_node_set_prepare(static_cast<impl::xpath_query_impl*>(_impl));
+               if (!root) return xpath_node_set();
+
+               // context: node n, position 1, size 1
+               impl::xpath_context c(n, 1, 1);
+               impl::xpath_stack_data sd;
+
+       #ifdef PUGIXML_NO_EXCEPTIONS
+               // non-zero means control returned through the handler (presumably an evaluation error - confirm)
+               if (setjmp(sd.error_handler)) return xpath_node_set();
+       #endif
+
+               impl::xpath_node_set_raw r = root->eval_node_set(c, sd.stack, impl::nodeset_eval_all);
+
+               // copy the raw result into a set that owns its own storage
+               return xpath_node_set(r.begin(), r.end(), r.type());
+       }
+
+       // Evaluates the query with n as the context node in nodeset_eval_first mode and returns r.first().
+       // Returns an empty xpath_node when impl::evaluate_node_set_prepare yields no root, or (in
+       // PUGIXML_NO_EXCEPTIONS builds) when setjmp on sd.error_handler reports a non-zero value.
+       PUGI__FN xpath_node xpath_query::evaluate_node(const xpath_node& n) const
+       {
+               impl::xpath_ast_node* root = impl::evaluate_node_set_prepare(static_cast<impl::xpath_query_impl*>(_impl));
+               if (!root) return xpath_node();
+
+               // context: node n, position 1, size 1
+               impl::xpath_context c(n, 1, 1);
+               impl::xpath_stack_data sd;
+
+       #ifdef PUGIXML_NO_EXCEPTIONS
+               // non-zero means control returned through the handler (presumably an evaluation error - confirm)
+               if (setjmp(sd.error_handler)) return xpath_node();
+       #endif
+
+               impl::xpath_node_set_raw r = root->eval_node_set(c, sd.stack, impl::nodeset_eval_first);
+
+               return r.first();
+       }
+
+       PUGI__FN const xpath_parse_result& xpath_query::result() const
+       {
+               return _result;
+       }
+
+       PUGI__FN static void unspecified_bool_xpath_query(xpath_query***)
+       {
+       }
+
+       PUGI__FN xpath_query::operator xpath_query::unspecified_bool_type() const
+       {
+               return _impl ? unspecified_bool_xpath_query : 0;
+       }
+
+       PUGI__FN bool xpath_query::operator!() const
+       {
+               return !_impl;
+       }
+
+       PUGI__FN xpath_node xml_node::select_node(const char_t* query, xpath_variable_set* variables) const
+       {
+               xpath_query q(query, variables);
+               return select_node(q);
+       }
+
+       PUGI__FN xpath_node xml_node::select_node(const xpath_query& query) const
+       {
+               return query.evaluate_node(*this);
+       }
+
+       PUGI__FN xpath_node_set xml_node::select_nodes(const char_t* query, xpath_variable_set* variables) const
+       {
+               xpath_query q(query, variables);
+               return select_nodes(q);
+       }
+
+       PUGI__FN xpath_node_set xml_node::select_nodes(const xpath_query& query) const
+       {
+               return query.evaluate_node_set(*this);
+       }
+
+       PUGI__FN xpath_node xml_node::select_single_node(const char_t* query, xpath_variable_set* variables) const
+       {
+               xpath_query q(query, variables);
+               return select_single_node(q);
+       }
+
+       PUGI__FN xpath_node xml_node::select_single_node(const xpath_query& query) const
+       {
+               return query.evaluate_node(*this);
+       }
+}
+
+#endif
+
+#ifdef __BORLANDC__
+#      pragma option pop
+#endif
+
+// Intel C++ does not properly keep warning state for function templates,
+// so popping warning state at the end of translation unit leads to warnings in the middle.
+#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
+#      pragma warning(pop)
+#endif
+
+// Undefine all local macros (makes sure we're not leaking macros in header-only mode)
+#undef PUGI__NO_INLINE
+#undef PUGI__UNLIKELY
+#undef PUGI__STATIC_ASSERT
+#undef PUGI__DMC_VOLATILE
+#undef PUGI__MSVC_CRT_VERSION
+#undef PUGI__NS_BEGIN
+#undef PUGI__NS_END
+#undef PUGI__FN
+#undef PUGI__FN_NO_INLINE
+#undef PUGI__NODETYPE
+#undef PUGI__IS_CHARTYPE_IMPL
+#undef PUGI__IS_CHARTYPE
+#undef PUGI__IS_CHARTYPEX
+#undef PUGI__ENDSWITH
+#undef PUGI__SKIPWS
+#undef PUGI__OPTSET
+#undef PUGI__PUSHNODE
+#undef PUGI__POPNODE
+#undef PUGI__SCANFOR
+#undef PUGI__SCANWHILE
+#undef PUGI__SCANWHILE_UNROLL
+#undef PUGI__ENDSEG
+#undef PUGI__THROW_ERROR
+#undef PUGI__CHECK_ERROR
+
+#endif
+
+/**
+ * Copyright (c) 2006-2014 Arseny Kapoulkine
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#pragma GCC diagnostic pop
diff --git a/src/pugixml/pugixml.hpp b/src/pugixml/pugixml.hpp

new file mode 100644 (file)

index 0000000..d4d5a62
--- /dev/null
+++ b/src/pugixml/pugixml.hpp
@@ -0,0 +1,1366 @@
+/**
+ * pugixml parser - version 1.5
+ * --------------------------------------------------------
+ * Copyright (C) 2006-2014, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
+ * Report bugs and download new versions at http://pugixml.org/
+ *
+ * This library is distributed under the MIT License. See notice at the end
+ * of this file.
+ *
+ * This work is based on the pugxml parser, which is:
+ * Copyright (C) 2003, by Kristen Wegner (kristen@tima.net)
+ */
+
+#ifndef PUGIXML_VERSION
+// Define version macro; evaluates to major * 100 + minor * 10 (150 for version 1.5) so that it's safe to use in less-than comparisons
+#      define PUGIXML_VERSION 150
+#endif
+
+// Include user configuration file (this can define various configuration macros)
+#include "pugiconfig.hpp"
+
+#ifndef HEADER_PUGIXML_HPP
+#define HEADER_PUGIXML_HPP
+
+// Include <cstddef> for size_t and ptrdiff_t
+#include <cstddef>
+
+// Include exception header for XPath
+#if !defined(PUGIXML_NO_XPATH) && !defined(PUGIXML_NO_EXCEPTIONS)
+#      include <exception>
+#endif
+
+// Include STL headers
+#ifndef PUGIXML_NO_STL
+#      include <iterator>
+#      include <iosfwd>
+#      include <string>
+#endif
+
+// Macro for marking deprecated features; a definition made before this point (e.g. by the user) takes precedence
+#ifndef PUGIXML_DEPRECATED
+#      if defined(__GNUC__) // GCC-compatible compilers: attribute syntax
+#              define PUGIXML_DEPRECATED __attribute__((deprecated))
+#      elif defined(_MSC_VER) && _MSC_VER >= 1300 // MSVC: declspec syntax
+#              define PUGIXML_DEPRECATED __declspec(deprecated)
+#      else // No known deprecation syntax: the macro expands to nothing
+#              define PUGIXML_DEPRECATED
+#      endif
+#endif
+
+// If no API macro is defined, default to an empty one (declarations get no extra decoration)
+#ifndef PUGIXML_API
+#      define PUGIXML_API
+#endif
+
+// If no API macro for classes is defined, fall back to PUGIXML_API (used between 'class' and the class name)
+#ifndef PUGIXML_CLASS
+#      define PUGIXML_CLASS PUGIXML_API
+#endif
+
+// If no API macro for functions is defined, fall back to PUGIXML_API (used in free function declarations)
+#ifndef PUGIXML_FUNCTION
+#      define PUGIXML_FUNCTION PUGIXML_API
+#endif
+
+// If the platform is known to have long long support, enable long long functions (as_llong/as_ullong and the matching setters)
+#ifndef PUGIXML_HAS_LONG_LONG
+#      if defined(__cplusplus) && __cplusplus >= 201103 // C++11 or later
+#              define PUGIXML_HAS_LONG_LONG
+#      elif defined(_MSC_VER) && _MSC_VER >= 1400 // MSVC 8.0 (Visual Studio 2005) or later
+#              define PUGIXML_HAS_LONG_LONG
+#      endif
+#endif
+
+// Character interface macros: PUGIXML_TEXT(t) adapts a literal to the active character type, PUGIXML_CHAR names that type
+#ifdef PUGIXML_WCHAR_MODE
+#      define PUGIXML_TEXT(t) L ## t
+#      define PUGIXML_CHAR wchar_t
+#else // Narrow mode: literals are used unchanged
+#      define PUGIXML_TEXT(t) t
+#      define PUGIXML_CHAR char
+#endif
+
+namespace pugi
+{
+       // Character type used for all internal storage and operations: wchar_t if PUGIXML_WCHAR_MODE is defined, char otherwise
+       typedef PUGIXML_CHAR char_t;
+
+#ifndef PUGIXML_NO_STL
+       // String type used for operations that work with STL string; a std::basic_string over the same character type as char_t
+       typedef std::basic_string<PUGIXML_CHAR, std::char_traits<PUGIXML_CHAR>, std::allocator<PUGIXML_CHAR> > string_t;
+#endif
+} // namespace pugi
+
+// The PugiXML namespace
+namespace pugi
+{
+       // Tree node types (unscoped enum; enumerators take consecutive values starting at 0 for node_null)
+       enum xml_node_type
+       {
+               node_null,                      // Empty (null) node handle
+               node_document,          // A document tree's absolute root
+               node_element,           // Element tag, i.e. '<node/>'
+               node_pcdata,            // Plain character data, i.e. 'text'
+               node_cdata,                     // Character data, i.e. '<![CDATA[text]]>'
+               node_comment,           // Comment tag, i.e. '<!-- text -->'
+               node_pi,                        // Processing instruction, i.e. '<?name?>'
+               node_declaration,       // Document declaration, i.e. '<?xml version="1.0"?>'
+               node_doctype            // Document type declaration, i.e. '<!DOCTYPE doc>'
+       };
+
+       // Parsing options (bit flags; each flag below is a distinct bit, so flags can be combined with bitwise OR)
+
+       // Minimal parsing mode (equivalent to turning all other flags off).
+       // Only elements and PCDATA sections are added to the DOM tree, no text conversions are performed.
+       const unsigned int parse_minimal = 0x0000;
+
+       // This flag determines if processing instructions (node_pi) are added to the DOM tree. This flag is off by default.
+       const unsigned int parse_pi = 0x0001;
+
+       // This flag determines if comments (node_comment) are added to the DOM tree. This flag is off by default.
+       const unsigned int parse_comments = 0x0002;
+
+       // This flag determines if CDATA sections (node_cdata) are added to the DOM tree. This flag is on by default.
+       const unsigned int parse_cdata = 0x0004;
+
+       // This flag determines if plain character data (node_pcdata) that consist only of whitespace are added to the DOM tree.
+       // This flag is off by default; turning it on usually results in slower parsing and more memory consumption.
+       const unsigned int parse_ws_pcdata = 0x0008;
+
+       // This flag determines if character and entity references are expanded during parsing. This flag is on by default.
+       const unsigned int parse_escapes = 0x0010;
+
+       // This flag determines if EOL characters are normalized (converted to #xA) during parsing. This flag is on by default.
+       const unsigned int parse_eol = 0x0020;
+       
+       // This flag determines if attribute values are normalized using CDATA normalization rules during parsing. This flag is on by default.
+       const unsigned int parse_wconv_attribute = 0x0040;
+
+       // This flag determines if attribute values are normalized using NMTOKENS normalization rules during parsing. This flag is off by default.
+       const unsigned int parse_wnorm_attribute = 0x0080;
+       
+       // This flag determines if document declaration (node_declaration) is added to the DOM tree. This flag is off by default.
+       const unsigned int parse_declaration = 0x0100;
+
+       // This flag determines if document type declaration (node_doctype) is added to the DOM tree. This flag is off by default.
+       const unsigned int parse_doctype = 0x0200;
+
+       // This flag determines if plain character data (node_pcdata) that is the only child of the parent node and that consists only
+       // of whitespace is added to the DOM tree.
+       // This flag is off by default; turning it on may result in slower parsing and more memory consumption.
+       const unsigned int parse_ws_pcdata_single = 0x0400;
+
+       // This flag determines if leading and trailing whitespace is to be removed from plain character data. This flag is off by default.
+       const unsigned int parse_trim_pcdata = 0x0800;
+
+       // This flag determines if plain character data that does not have a parent node is added to the DOM tree, and if an empty document
+       // is a valid document. This flag is off by default.
+       const unsigned int parse_fragment = 0x1000;
+
+       // The default parsing mode.
+       // Elements, PCDATA and CDATA sections are added to the DOM tree, character/reference entities are expanded,
+       // End-of-Line characters are normalized, attribute values are normalized using CDATA normalization rules.
+       const unsigned int parse_default = parse_cdata | parse_escapes | parse_wconv_attribute | parse_eol; // == 0x0074
+
+       // The full parsing mode.
+       // Nodes of all types are added to the DOM tree, character/reference entities are expanded,
+       // End-of-Line characters are normalized, attribute values are normalized using CDATA normalization rules.
+       const unsigned int parse_full = parse_default | parse_pi | parse_comments | parse_declaration | parse_doctype; // == 0x0377
+
+       // These values select the encoding of input data for XML document (mutually exclusive enumerators, not combinable bit flags)
+       enum xml_encoding
+       {
+               encoding_auto,          // Auto-detect input encoding using BOM or < / <? detection; use UTF8 if BOM is not found
+               encoding_utf8,          // UTF8 encoding
+               encoding_utf16_le,      // Little-endian UTF16
+               encoding_utf16_be,      // Big-endian UTF16
+               encoding_utf16,         // UTF16 with native endianness
+               encoding_utf32_le,      // Little-endian UTF32
+               encoding_utf32_be,      // Big-endian UTF32
+               encoding_utf32,         // UTF32 with native endianness
+               encoding_wchar,         // The same encoding wchar_t has (either UTF16 or UTF32)
+               encoding_latin1         // Latin-1 (ISO-8859-1) encoding
+       };
+
+       // Formatting flags (bit flags; each flag below is a distinct bit, so flags can be combined with bitwise OR)
+       
+       // Indent the nodes that are written to output stream with as many indentation strings as deep the node is in DOM tree. This flag is on by default.
+       const unsigned int format_indent = 0x01;
+       
+       // Write encoding-specific BOM to the output stream. This flag is off by default.
+       const unsigned int format_write_bom = 0x02;
+
+       // Use raw output mode (no indentation and no line breaks are written). This flag is off by default.
+       const unsigned int format_raw = 0x04;
+       
+       // Omit default XML declaration even if there is no declaration in the document. This flag is off by default.
+       const unsigned int format_no_declaration = 0x08;
+
+       // Don't escape attribute values and PCDATA contents. This flag is off by default.
+       const unsigned int format_no_escapes = 0x10;
+
+       // Open file using text mode in xml_document::save_file. This enables special character (i.e. new-line) conversions on some systems. This flag is off by default.
+       const unsigned int format_save_file_text = 0x20;
+
+       // The default set of formatting flags.
+       // Nodes are indented depending on their depth in DOM tree, a default declaration is output if document has none.
+       const unsigned int format_default = format_indent; // == 0x01 (only format_indent is set)
+               
+       // Forward declarations (let the classes in this header refer to each other, and to the *_struct types through pointers)
+       struct xml_attribute_struct;
+       struct xml_node_struct;
+
+       class xml_node_iterator;
+       class xml_attribute_iterator;
+       class xml_named_node_iterator;
+
+       class xml_tree_walker;
+
+       struct xml_parse_result;
+
+       class xml_node;
+
+       class xml_text;
+       
+       #ifndef PUGIXML_NO_XPATH // XPath classes are only declared when XPath support is not disabled
+       class xpath_node;
+       class xpath_node_set;
+       class xpath_query;
+       class xpath_variable_set;
+       #endif
+
+       // Range-based for loop support: stores a begin/end iterator pair and exposes it through begin()/end()
+       template <typename It> class xml_object_range
+       {
+       public:
+               typedef It const_iterator; // Both typedefs name the same iterator type It
+               typedef It iterator;
+
+               xml_object_range(It b, It e): _begin(b), _end(e) // Copies both iterators into the range object
+               {
+               }
+
+               It begin() const { return _begin; } // Returns a copy of the stored start iterator
+               It end() const { return _end; } // Returns a copy of the stored end iterator
+
+       private:
+               It _begin, _end;
+       };
+
+       // Writer interface for node printing (see xml_node::print); abstract, implementations must override write()
+       class PUGIXML_CLASS xml_writer
+       {
+       public:
+               virtual ~xml_writer() {} // Virtual so that writers can be destroyed through an xml_writer pointer
+
+               // Write memory chunk of 'size' bytes starting at 'data' into stream/file/whatever
+               virtual void write(const void* data, size_t size) = 0;
+       };
+
+       // xml_writer implementation for FILE*
+       class PUGIXML_CLASS xml_writer_file: public xml_writer
+       {
+       public:
+               // Construct writer from a FILE* object; void* is used to avoid header dependencies on stdio
+               xml_writer_file(void* file); // Not explicit: a FILE* converts implicitly to this writer
+
+               virtual void write(const void* data, size_t size); // Overrides xml_writer::write
+
+       private:
+               void* file; // The FILE* passed at construction, kept as void* (see constructor comment)
+       };
+
+       #ifndef PUGIXML_NO_STL
+       // xml_writer implementation for streams (only available when STL support is not disabled)
+       class PUGIXML_CLASS xml_writer_stream: public xml_writer
+       {
+       public:
+               // Construct writer from an output stream object (narrow or wide character stream)
+               xml_writer_stream(std::basic_ostream<char, std::char_traits<char> >& stream);
+               xml_writer_stream(std::basic_ostream<wchar_t, std::char_traits<wchar_t> >& stream);
+
+               virtual void write(const void* data, size_t size); // Overrides xml_writer::write
+
+       private:
+               std::basic_ostream<char, std::char_traits<char> >* narrow_stream; // NOTE(review): presumably only the pointer matching the constructor used is non-null - confirm in pugixml.cpp
+               std::basic_ostream<wchar_t, std::char_traits<wchar_t> >* wide_stream;
+       };
+       #endif
+
+       // A light-weight handle for manipulating attributes in DOM tree; its only data member is a pointer to the internal attribute structure
+       class PUGIXML_CLASS xml_attribute
+       {
+               friend class xml_attribute_iterator;
+               friend class xml_node;
+
+       private:
+               xml_attribute_struct* _attr; // Wrapped internal attribute pointer
+       
+               typedef void (*unspecified_bool_type)(xml_attribute***); // Result type of the safe bool conversion operator below
+
+       public:
+               // Default constructor. Constructs an empty attribute.
+               xml_attribute();
+               
+               // Constructs attribute from internal pointer
+               explicit xml_attribute(xml_attribute_struct* attr);
+
+               // Safe bool conversion operator
+               operator unspecified_bool_type() const;
+
+               // Borland C++ workaround
+               bool operator!() const;
+
+               // Comparison operators (compares wrapped attribute pointers)
+               bool operator==(const xml_attribute& r) const;
+               bool operator!=(const xml_attribute& r) const;
+               bool operator<(const xml_attribute& r) const;
+               bool operator>(const xml_attribute& r) const;
+               bool operator<=(const xml_attribute& r) const;
+               bool operator>=(const xml_attribute& r) const;
+
+               // Check if attribute is empty
+               bool empty() const;
+
+               // Get attribute name/value, or "" if attribute is empty
+               const char_t* name() const;
+               const char_t* value() const;
+
+               // Get attribute value, or the default value if attribute is empty
+               const char_t* as_string(const char_t* def = PUGIXML_TEXT("")) const;
+
+               // Get attribute value as a number, or the default value if conversion did not succeed or attribute is empty
+               int as_int(int def = 0) const;
+               unsigned int as_uint(unsigned int def = 0) const;
+               double as_double(double def = 0) const;
+               float as_float(float def = 0) const;
+
+       #ifdef PUGIXML_HAS_LONG_LONG
+               long long as_llong(long long def = 0) const;
+               unsigned long long as_ullong(unsigned long long def = 0) const;
+       #endif
+
+               // Get attribute value as bool (returns true if first character is in '1tTyY' set), or the default value if attribute is empty
+               bool as_bool(bool def = false) const;
+
+               // Set attribute name/value (returns false if attribute is empty or there is not enough memory)
+               bool set_name(const char_t* rhs);
+               bool set_value(const char_t* rhs);
+
+               // Set attribute value with type conversion (numbers are converted to strings, boolean is converted to "true"/"false")
+               bool set_value(int rhs);
+               bool set_value(unsigned int rhs);
+               bool set_value(double rhs);
+               bool set_value(float rhs);
+               bool set_value(bool rhs);
+
+       #ifdef PUGIXML_HAS_LONG_LONG
+               bool set_value(long long rhs);
+               bool set_value(unsigned long long rhs);
+       #endif
+
+               // Set attribute value (equivalent to set_value without error checking); each overload returns a reference to this handle
+               xml_attribute& operator=(const char_t* rhs);
+               xml_attribute& operator=(int rhs);
+               xml_attribute& operator=(unsigned int rhs);
+               xml_attribute& operator=(double rhs);
+               xml_attribute& operator=(float rhs);
+               xml_attribute& operator=(bool rhs);
+
+       #ifdef PUGIXML_HAS_LONG_LONG
+               xml_attribute& operator=(long long rhs);
+               xml_attribute& operator=(unsigned long long rhs);
+       #endif
+
+               // Get next/previous attribute in the attribute list of the parent node
+               xml_attribute next_attribute() const;
+               xml_attribute previous_attribute() const;
+
+               // Get hash value (unique for handles to the same object)
+               size_t hash_value() const;
+
+               // Get internal pointer
+               xml_attribute_struct* internal_object() const;
+       };
+
+#ifdef __BORLANDC__
+       // Borland C++ workaround: explicit logical operators with an xml_attribute on the left-hand side and a bool on the right
+       bool PUGIXML_FUNCTION operator&&(const xml_attribute& lhs, bool rhs);
+       bool PUGIXML_FUNCTION operator||(const xml_attribute& lhs, bool rhs);
+#endif
+
+       // A light-weight handle for manipulating nodes in DOM tree; its only data member is a pointer to the internal node structure
+       class PUGIXML_CLASS xml_node
+       {
+               friend class xml_attribute_iterator;
+               friend class xml_node_iterator;
+               friend class xml_named_node_iterator;
+
+       protected:
+               xml_node_struct* _root; // Wrapped internal node pointer (protected, so derived classes can access it)
+
+               typedef void (*unspecified_bool_type)(xml_node***); // Result type of the safe bool conversion operator below
+
+       public:
+               // Default constructor. Constructs an empty node.
+               xml_node();
+
+               // Constructs node from internal pointer
+               explicit xml_node(xml_node_struct* p);
+
+               // Safe bool conversion operator
+               operator unspecified_bool_type() const;
+
+               // Borland C++ workaround
+               bool operator!() const;
+       
+               // Comparison operators (compares wrapped node pointers)
+               bool operator==(const xml_node& r) const;
+               bool operator!=(const xml_node& r) const;
+               bool operator<(const xml_node& r) const;
+               bool operator>(const xml_node& r) const;
+               bool operator<=(const xml_node& r) const;
+               bool operator>=(const xml_node& r) const;
+
+               // Check if node is empty.
+               bool empty() const;
+
+               // Get node type
+               xml_node_type type() const;
+
+               // Get node name, or "" if node is empty or it has no name
+               const char_t* name() const;
+
+               // Get node value, or "" if node is empty or it has no value
+               // Note: For <node>text</node> node.value() does not return "text"! Use child_value() or text() methods to access text inside nodes.
+               const char_t* value() const;
+       
+               // Get attribute list
+               xml_attribute first_attribute() const;
+               xml_attribute last_attribute() const;
+
+               // Get children list
+               xml_node first_child() const;
+               xml_node last_child() const;
+
+               // Get next/previous sibling in the children list of the parent node
+               xml_node next_sibling() const;
+               xml_node previous_sibling() const;
+               
+               // Get parent node
+               xml_node parent() const;
+
+               // Get root of DOM tree this node belongs to
+               xml_node root() const;
+
+               // Get text object for the current node
+               xml_text text() const;
+
+               // Get child, attribute or next/previous sibling with the specified name
+               xml_node child(const char_t* name) const;
+               xml_attribute attribute(const char_t* name) const;
+               xml_node next_sibling(const char_t* name) const;
+               xml_node previous_sibling(const char_t* name) const;
+
+               // Get child value of current node; that is, value of the first child node of type PCDATA/CDATA
+               const char_t* child_value() const;
+
+               // Get child value of child with specified name. Equivalent to child(name).child_value().
+               const char_t* child_value(const char_t* name) const;
+
+               // Set node name/value (returns false if node is empty, there is not enough memory, or node can not have name/value)
+               bool set_name(const char_t* rhs);
+               bool set_value(const char_t* rhs);
+               
+               // Add attribute with specified name. Returns added attribute, or empty attribute on errors.
+               xml_attribute append_attribute(const char_t* name);
+               xml_attribute prepend_attribute(const char_t* name);
+               xml_attribute insert_attribute_after(const char_t* name, const xml_attribute& attr);
+               xml_attribute insert_attribute_before(const char_t* name, const xml_attribute& attr);
+
+               // Add a copy of the specified attribute. Returns added attribute, or empty attribute on errors.
+               xml_attribute append_copy(const xml_attribute& proto);
+               xml_attribute prepend_copy(const xml_attribute& proto);
+               xml_attribute insert_copy_after(const xml_attribute& proto, const xml_attribute& attr);
+               xml_attribute insert_copy_before(const xml_attribute& proto, const xml_attribute& attr);
+
+               // Add child node with specified type. Returns added node, or empty node on errors.
+               xml_node append_child(xml_node_type type = node_element);
+               xml_node prepend_child(xml_node_type type = node_element);
+               xml_node insert_child_after(xml_node_type type, const xml_node& node);
+               xml_node insert_child_before(xml_node_type type, const xml_node& node);
+
+               // Add child element with specified name. Returns added node, or empty node on errors.
+               xml_node append_child(const char_t* name);
+               xml_node prepend_child(const char_t* name);
+               xml_node insert_child_after(const char_t* name, const xml_node& node);
+               xml_node insert_child_before(const char_t* name, const xml_node& node);
+
+               // Add a copy of the specified node as a child. Returns added node, or empty node on errors.
+               xml_node append_copy(const xml_node& proto);
+               xml_node prepend_copy(const xml_node& proto);
+               xml_node insert_copy_after(const xml_node& proto, const xml_node& node);
+               xml_node insert_copy_before(const xml_node& proto, const xml_node& node);
+
+               // Move the specified node to become a child of this node. Returns moved node, or empty node on errors.
+               xml_node append_move(const xml_node& moved);
+               xml_node prepend_move(const xml_node& moved);
+               xml_node insert_move_after(const xml_node& moved, const xml_node& node);
+               xml_node insert_move_before(const xml_node& moved, const xml_node& node);
+
+               // Remove specified attribute
+               bool remove_attribute(const xml_attribute& a);
+               bool remove_attribute(const char_t* name);
+
+               // Remove specified child
+               bool remove_child(const xml_node& n);
+               bool remove_child(const char_t* name);
+
+               // Parses buffer as an XML document fragment and appends all nodes as children of the current node.
+               // Copies/converts the buffer, so it may be deleted or changed after the function returns.
+               // Note: append_buffer allocates memory that has the lifetime of the owning document; removing the appended nodes does not immediately reclaim that memory.
+               xml_parse_result append_buffer(const void* contents, size_t size, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
+
+               // Find attribute using predicate. Returns first attribute for which predicate returned true, or an empty attribute if there is none.
+               template <typename Predicate> xml_attribute find_attribute(Predicate pred) const
+               {
+                       if (!_root) return xml_attribute(); // empty node handle: nothing to search
+                       
+                       for (xml_attribute attrib = first_attribute(); attrib; attrib = attrib.next_attribute())
+                               if (pred(attrib))
+                                       return attrib;
+               
+                       return xml_attribute(); // no attribute satisfied the predicate
+               }
+
+               // Find child node using predicate. Returns first child for which predicate returned true, or an empty node if there is none.
+               template <typename Predicate> xml_node find_child(Predicate pred) const
+               {
+                       if (!_root) return xml_node(); // empty node handle: nothing to search
+       
+                       for (xml_node node = first_child(); node; node = node.next_sibling())
+                               if (pred(node))
+                                       return node;
+               
+                       return xml_node(); // no direct child satisfied the predicate
+               }
+
+               // Find node from subtree using predicate. Returns first node from subtree (depth-first), for which predicate returned true.
+               template <typename Predicate> xml_node find_node(Predicate pred) const
+               {
+                       if (!_root) return xml_node(); // empty node handle: nothing to search
+
+                       xml_node cur = first_child(); // the node itself is not tested, only its descendants
+                       
+                       while (cur._root && cur._root != _root) // stop when there is no node left or the walk has climbed back to this node
+                       {
+                               if (pred(cur)) return cur;
+
+                               if (cur.first_child()) cur = cur.first_child(); // descend first
+                               else if (cur.next_sibling()) cur = cur.next_sibling(); // otherwise advance to the next sibling
+                               else
+                               {
+                                       while (!cur.next_sibling() && cur._root != _root) cur = cur.parent(); // climb until a node with a next sibling, or this node, is reached
+
+                                       if (cur._root != _root) cur = cur.next_sibling(); // resume with that ancestor's next sibling
+                               }
+                       }
+
+                       return xml_node(); // no node in the subtree satisfied the predicate
+               }
+
+               // Find child node by attribute name/value
+               xml_node find_child_by_attribute(const char_t* name, const char_t* attr_name, const char_t* attr_value) const;
+               xml_node find_child_by_attribute(const char_t* attr_name, const char_t* attr_value) const;
+
+       #ifndef PUGIXML_NO_STL
+               // Get the absolute node path from root as a text string.
+               string_t path(char_t delimiter = '/') const;
+       #endif
+
+               // Search for a node by path consisting of node names and . or .. elements.
+               xml_node first_element_by_path(const char_t* path, char_t delimiter = '/') const;
+
+               // Recursively traverse subtree with xml_tree_walker
+               bool traverse(xml_tree_walker& walker);
+       
+       #ifndef PUGIXML_NO_XPATH
+               // Select single node by evaluating XPath query. Returns first node from the resulting node set.
+               xpath_node select_node(const char_t* query, xpath_variable_set* variables = 0) const;
+               xpath_node select_node(const xpath_query& query) const;
+
+               // Select node set by evaluating XPath query
+               xpath_node_set select_nodes(const char_t* query, xpath_variable_set* variables = 0) const;
+               xpath_node_set select_nodes(const xpath_query& query) const;
+
+               // (deprecated: use select_node instead) Select single node by evaluating XPath query.
+               xpath_node select_single_node(const char_t* query, xpath_variable_set* variables = 0) const;
+               xpath_node select_single_node(const xpath_query& query) const;
+
+       #endif
+               
+               // Print subtree using a writer object
+               void print(xml_writer& writer, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto, unsigned int depth = 0) const;
+
+       #ifndef PUGIXML_NO_STL
+               // Print subtree to stream (note: the wide-stream overload takes no encoding argument)
+               void print(std::basic_ostream<char, std::char_traits<char> >& os, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto, unsigned int depth = 0) const;
+               void print(std::basic_ostream<wchar_t, std::char_traits<wchar_t> >& os, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, unsigned int depth = 0) const;
+       #endif
+
+               // Child nodes iterators
+               typedef xml_node_iterator iterator;
+
+               iterator begin() const;
+               iterator end() const;
+
+               // Attribute iterators
+               typedef xml_attribute_iterator attribute_iterator;
+
+               attribute_iterator attributes_begin() const;
+               attribute_iterator attributes_end() const;
+
+               // Range-based for support (each call returns an xml_object_range wrapping a begin/end iterator pair)
+               xml_object_range<xml_node_iterator> children() const;
+               xml_object_range<xml_named_node_iterator> children(const char_t* name) const;
+               xml_object_range<xml_attribute_iterator> attributes() const;
+
+               // Get node offset in parsed file/string (in char_t units) for debugging purposes
+               ptrdiff_t offset_debug() const;
+
+               // Get hash value (unique for handles to the same object)
+               size_t hash_value() const;
+
+               // Get internal pointer
+               xml_node_struct* internal_object() const;
+       };
+
+#ifdef __BORLANDC__
+       // Borland C++ workaround: explicit logical operators with an xml_node on the left-hand side and a bool on the right
+       bool PUGIXML_FUNCTION operator&&(const xml_node& lhs, bool rhs);
+       bool PUGIXML_FUNCTION operator||(const xml_node& lhs, bool rhs);
+#endif
+
+       // A helper for working with text inside PCDATA nodes; everything declared before 'public:' is private (class default access)
+       class PUGIXML_CLASS xml_text
+       {
+               friend class xml_node;
+
+               xml_node_struct* _root; // Internal node pointer this text object operates on
+
+               typedef void (*unspecified_bool_type)(xml_text***); // Result type of the safe bool conversion operator below
+
+               explicit xml_text(xml_node_struct* root); // Private: only usable from this class and its friend xml_node
+
+               xml_node_struct* _data_new(); // NOTE(review): implementation not visible here; presumably creates the data node when it is missing - confirm in pugixml.cpp
+               xml_node_struct* _data() const;
+
+       public:
+               // Default constructor. Constructs an empty object.
+               xml_text();
+
+               // Safe bool conversion operator
+               operator unspecified_bool_type() const;
+
+               // Borland C++ workaround
+               bool operator!() const;
+
+               // Check if text object is empty
+               bool empty() const;
+
+               // Get text, or "" if object is empty
+               const char_t* get() const;
+
+               // Get text, or the default value if object is empty
+               const char_t* as_string(const char_t* def = PUGIXML_TEXT("")) const;
+
+               // Get text as a number, or the default value if conversion did not succeed or object is empty
+               int as_int(int def = 0) const;
+               unsigned int as_uint(unsigned int def = 0) const;
+               double as_double(double def = 0) const;
+               float as_float(float def = 0) const;
+
+       #ifdef PUGIXML_HAS_LONG_LONG
+               long long as_llong(long long def = 0) const;
+               unsigned long long as_ullong(unsigned long long def = 0) const;
+       #endif
+
+               // Get text as bool (returns true if first character is in '1tTyY' set), or the default value if object is empty
+               bool as_bool(bool def = false) const;
+
+               // Set text (returns false if object is empty or there is not enough memory)
+               bool set(const char_t* rhs);
+
+               // Set text with type conversion (numbers are converted to strings, boolean is converted to "true"/"false")
+               bool set(int rhs);
+               bool set(unsigned int rhs);
+               bool set(double rhs);
+               bool set(float rhs);
+               bool set(bool rhs);
+
+       #ifdef PUGIXML_HAS_LONG_LONG
+               bool set(long long rhs);
+               bool set(unsigned long long rhs);
+       #endif
+
+               // Set text (equivalent to set without error checking); each overload returns a reference to this object
+               xml_text& operator=(const char_t* rhs);
+               xml_text& operator=(int rhs);
+               xml_text& operator=(unsigned int rhs);
+               xml_text& operator=(double rhs);
+               xml_text& operator=(float rhs);
+               xml_text& operator=(bool rhs);
+
+       #ifdef PUGIXML_HAS_LONG_LONG
+               xml_text& operator=(long long rhs);
+               xml_text& operator=(unsigned long long rhs);
+       #endif
+
+               // Get the data node (node_pcdata or node_cdata) for this object
+               xml_node data() const;
+       };
+
+#ifdef __BORLANDC__
+       // Borland C++ workaround: free operator&& / operator|| overloads taking an xml_text on the left and a bool on the right (declared only when __BORLANDC__ is defined)
+       bool PUGIXML_FUNCTION operator&&(const xml_text& lhs, bool rhs);
+       bool PUGIXML_FUNCTION operator||(const xml_text& lhs, bool rhs);
+#endif
+
+       // Child node iterator (a bidirectional iterator over a collection of xml_node)
+       class PUGIXML_CLASS xml_node_iterator
+       {
+               friend class xml_node; // lets xml_node use the private members below, including the raw-pointer constructor
+
+       private:
+               mutable xml_node _wrap; // mutable: operator* and operator-> are const yet return non-const xml_node access (presumably to this member)
+               xml_node _parent; // presumably the node whose children are iterated (cf. the private constructor's parent argument)
+
+               xml_node_iterator(xml_node_struct* ref, xml_node_struct* parent); // private: built from raw node structures
+
+       public:
+               // Iterator traits (the member types std::iterator_traits looks for; iterator_category exists only when the STL is enabled)
+               typedef ptrdiff_t difference_type;
+               typedef xml_node value_type;
+               typedef xml_node* pointer;
+               typedef xml_node& reference;
+
+       #ifndef PUGIXML_NO_STL
+               typedef std::bidirectional_iterator_tag iterator_category;
+       #endif
+
+               // Default constructor
+               xml_node_iterator();
+
+               // Construct an iterator which points to the specified node (not explicit, so an xml_node converts implicitly)
+               xml_node_iterator(const xml_node& node);
+
+               // Iterator operators: comparison, dereference, then increment and decrement
+               bool operator==(const xml_node_iterator& rhs) const;
+               bool operator!=(const xml_node_iterator& rhs) const;
+
+               xml_node& operator*() const;
+               xml_node* operator->() const;
+
+               const xml_node_iterator& operator++(); // pre-increment: returns a const reference
+               xml_node_iterator operator++(int); // post-increment: returns an iterator by value
+
+               const xml_node_iterator& operator--(); // pre-decrement: returns a const reference
+               xml_node_iterator operator--(int); // post-decrement: returns an iterator by value
+       };
+
+       // Attribute iterator (a bidirectional iterator over a collection of xml_attribute)
+       class PUGIXML_CLASS xml_attribute_iterator
+       {
+               friend class xml_node; // lets xml_node use the private members below, including the raw-pointer constructor
+
+       private:
+               mutable xml_attribute _wrap; // mutable: operator* and operator-> are const yet return non-const xml_attribute access (presumably to this member)
+               xml_node _parent; // presumably the node owning the iterated attributes (cf. the constructors' parent arguments)
+
+               xml_attribute_iterator(xml_attribute_struct* ref, xml_node_struct* parent); // private: built from raw structures
+
+       public:
+               // Iterator traits (the member types std::iterator_traits looks for; iterator_category exists only when the STL is enabled)
+               typedef ptrdiff_t difference_type;
+               typedef xml_attribute value_type;
+               typedef xml_attribute* pointer;
+               typedef xml_attribute& reference;
+
+       #ifndef PUGIXML_NO_STL
+               typedef std::bidirectional_iterator_tag iterator_category;
+       #endif
+
+               // Default constructor
+               xml_attribute_iterator();
+
+               // Construct an iterator which points to the specified attribute; the parent node must be supplied as well
+               xml_attribute_iterator(const xml_attribute& attr, const xml_node& parent);
+
+               // Iterator operators: comparison, dereference, then increment and decrement
+               bool operator==(const xml_attribute_iterator& rhs) const;
+               bool operator!=(const xml_attribute_iterator& rhs) const;
+
+               xml_attribute& operator*() const;
+               xml_attribute* operator->() const;
+
+               const xml_attribute_iterator& operator++(); // pre-increment: returns a const reference
+               xml_attribute_iterator operator++(int); // post-increment: returns an iterator by value
+
+               const xml_attribute_iterator& operator--(); // pre-decrement: returns a const reference
+               xml_attribute_iterator operator--(int); // post-decrement: returns an iterator by value
+       };
+
+       // Named node range helper (an iterator that carries a name in addition to the node; presumably it visits only nodes with that name)
+       class PUGIXML_CLASS xml_named_node_iterator
+       {
+               friend class xml_node; // lets xml_node use the private raw-pointer constructor at the end of the class
+
+       public:
+               // Iterator traits (the member types std::iterator_traits looks for; iterator_category exists only when the STL is enabled)
+               typedef ptrdiff_t difference_type;
+               typedef xml_node value_type;
+               typedef xml_node* pointer;
+               typedef xml_node& reference;
+
+       #ifndef PUGIXML_NO_STL
+               typedef std::bidirectional_iterator_tag iterator_category;
+       #endif
+
+               // Default constructor
+               xml_named_node_iterator();
+
+               // Construct an iterator which points to the specified node; name is taken as a raw char_t pointer
+               xml_named_node_iterator(const xml_node& node, const char_t* name);
+
+               // Iterator operators: comparison, dereference, then increment and decrement
+               bool operator==(const xml_named_node_iterator& rhs) const;
+               bool operator!=(const xml_named_node_iterator& rhs) const;
+
+               xml_node& operator*() const;
+               xml_node* operator->() const;
+
+               const xml_named_node_iterator& operator++(); // pre-increment: returns a const reference
+               xml_named_node_iterator operator++(int); // post-increment: returns an iterator by value
+
+               const xml_named_node_iterator& operator--(); // pre-decrement: returns a const reference
+               xml_named_node_iterator operator--(int); // post-decrement: returns an iterator by value
+
+       private:
+               mutable xml_node _wrap; // mutable: operator* and operator-> are const yet return non-const xml_node access (presumably to this member)
+               xml_node _parent; // presumably the node whose children are iterated (cf. the private constructor's parent argument)
+               const char_t* _name; // raw, non-owning-looking pointer; NOTE(review): the caller presumably has to keep the string alive - confirm
+
+               xml_named_node_iterator(xml_node_struct* ref, xml_node_struct* parent, const char_t* name); // private: built from raw node structures
+       };
+
+       // Abstract tree walker class (see xml_node::traverse); abstract because for_each() is pure virtual
+       class PUGIXML_CLASS xml_tree_walker
+       {
+               friend class xml_node; // gives xml_node access to the private _depth
+
+       private:
+               int _depth; // presumably the value reported by depth(); it is maintained outside this declaration
+       
+       protected:
+               // Get current traversal depth (protected, so it is intended for use inside derived walkers)
+               int depth() const;
+       
+       public:
+               xml_tree_walker();
+               virtual ~xml_tree_walker(); // virtual destructor: the class is designed to be derived from
+
+               // Callback that is called when traversal begins (virtual but not pure, so overriding is optional)
+               virtual bool begin(xml_node& node);
+
+               // Callback that is called for each node traversed (pure virtual); NOTE(review): the meaning of the bool result is not visible here, presumably it controls whether traversal continues
+               virtual bool for_each(xml_node& node) = 0;
+
+               // Callback that is called when traversal ends (virtual but not pure, so overriding is optional)
+               virtual bool end(xml_node& node);
+       };
+
+       // Parsing status, returned as part of xml_parse_result object (status_ok is explicitly 0; the later enumerators take consecutive values)
+       enum xml_parse_status
+       {
+               status_ok = 0,                          // No error
+
+               status_file_not_found,          // File was not found during load_file()
+               status_io_error,                        // Error reading from file/stream
+               status_out_of_memory,           // Could not allocate memory
+               status_internal_error,          // Internal error occurred
+
+               status_unrecognized_tag,        // Parser could not determine tag type
+
+               status_bad_pi,                          // Parsing error occurred while parsing document declaration/processing instruction
+               status_bad_comment,                     // Parsing error occurred while parsing comment
+               status_bad_cdata,                       // Parsing error occurred while parsing CDATA section
+               status_bad_doctype,                     // Parsing error occurred while parsing document type declaration
+               status_bad_pcdata,                      // Parsing error occurred while parsing PCDATA section
+               status_bad_start_element,       // Parsing error occurred while parsing start element tag
+               status_bad_attribute,           // Parsing error occurred while parsing element attribute
+               status_bad_end_element,         // Parsing error occurred while parsing end element tag
+               status_end_element_mismatch, // There was a mismatch of start-end tags (closing tag had incorrect name, some tag was not closed or there was an excessive closing tag)
+
+               status_append_invalid_root,     // Unable to append nodes since root type is not node_element or node_document (exclusive to xml_node::append_buffer)
+
+               status_no_document_element      // Parsing resulted in a document without element nodes
+       };
+
+       // Parsing result (plain struct with public fields; returned by value from the xml_document load functions)
+       struct PUGIXML_CLASS xml_parse_result
+       {
+               // Parsing status (see xml_parse_status)
+               xml_parse_status status;
+
+               // Last parsed offset (in char_t units from start of input data)
+               ptrdiff_t offset;
+
+               // Source document encoding
+               xml_encoding encoding;
+
+               // Default constructor, initializes object to failed state
+               xml_parse_result();
+
+               // Cast to bool operator (implicit; presumably true only when status is status_ok - confirm in the implementation)
+               operator bool() const;
+
+               // Get error description (returns a narrow const char* regardless of the char_t configuration)
+               const char* description() const;
+       };
+
+       // Document class (DOM tree root); publicly derives from xml_node, so the node interface is available on a document
+       class PUGIXML_CLASS xml_document: public xml_node
+       {
+       private:
+               char_t* _buffer; // NOTE(review): presumably the parse buffer held by the document (cf. load_buffer_inplace_own) - confirm
+
+               char _memory[192]; // fixed 192-byte in-object storage; how it is used is defined in the implementation, not here
+               
+               // Non-copyable semantics: copy constructor and copy assignment are declared private
+               xml_document(const xml_document&);
+               const xml_document& operator=(const xml_document&);
+
+               void create(); // private helpers; presumably paired set-up / tear-down used by the constructor, destructor and reset()
+               void destroy();
+
+       public:
+               // Default constructor, makes empty document
+               xml_document();
+
+               // Destructor, invalidates all node/attribute handles to this document
+               ~xml_document();
+
+               // Removes all nodes, leaving the empty document
+               void reset();
+
+               // Removes all nodes, then copies the entire contents of the specified document
+               void reset(const xml_document& proto);
+
+       #ifndef PUGIXML_NO_STL
+               // Load document from stream (narrow-char and wide-char overloads; the wide overload takes no encoding argument).
+               xml_parse_result load(std::basic_istream<char, std::char_traits<char> >& stream, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
+               xml_parse_result load(std::basic_istream<wchar_t, std::char_traits<wchar_t> >& stream, unsigned int options = parse_default);
+       #endif
+
+               // (deprecated: use load_string instead) Load document from zero-terminated string. No encoding conversions are applied.
+               xml_parse_result load(const char_t* contents, unsigned int options = parse_default);
+
+               // Load document from zero-terminated string. No encoding conversions are applied.
+               xml_parse_result load_string(const char_t* contents, unsigned int options = parse_default);
+
+               // Load document from file (path given either as a char or as a wchar_t string)
+               xml_parse_result load_file(const char* path, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
+               xml_parse_result load_file(const wchar_t* path, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
+
+               // Load document from buffer. Copies/converts the buffer, so it may be deleted or changed after the function returns.
+               xml_parse_result load_buffer(const void* contents, size_t size, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
+
+               // Load document from buffer, using the buffer for in-place parsing (the buffer is modified and used for storage of document data).
+               // You should ensure that buffer data will persist throughout the document's lifetime, and free the buffer memory manually once document is destroyed.
+               xml_parse_result load_buffer_inplace(void* contents, size_t size, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
+
+               // Load document from buffer, using the buffer for in-place parsing (the buffer is modified and used for storage of document data).
+               // You should allocate the buffer with pugixml allocation function; document will free the buffer when it is no longer needed (you can't use it anymore).
+               xml_parse_result load_buffer_inplace_own(void* contents, size_t size, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
+
+               // Save XML document to writer (semantics is slightly different from xml_node::print, see documentation for details).
+               void save(xml_writer& writer, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const;
+
+       #ifndef PUGIXML_NO_STL
+               // Save XML document to stream (semantics is slightly different from xml_node::print, see documentation for details).
+               void save(std::basic_ostream<char, std::char_traits<char> >& stream, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const;
+               void save(std::basic_ostream<wchar_t, std::char_traits<wchar_t> >& stream, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default) const;
+       #endif
+
+               // Save XML to file; unlike save(), these return bool (presumably false when the file could not be written - confirm)
+               bool save_file(const char* path, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const;
+               bool save_file(const wchar_t* path, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const;
+
+               // Get document element
+               xml_node document_element() const;
+       };
+
+#ifndef PUGIXML_NO_XPATH
+       // XPath query return type (also used as the type tag of xpath_variable; xpath_type_none is the first enumerator, i.e. value 0)
+       enum xpath_value_type
+       {
+               xpath_type_none,          // Unknown type (query failed to compile)
+               xpath_type_node_set,  // Node set (xpath_node_set)
+               xpath_type_number,        // Number
+               xpath_type_string,        // String
+               xpath_type_boolean        // Boolean
+       };
+
+       // XPath parsing result (plain struct with public fields; stored inside xpath_query and xpath_exception)
+       struct PUGIXML_CLASS xpath_parse_result
+       {
+               // Error message (0 if no error)
+               const char* error;
+
+               // Last parsed offset (in char_t units from string start)
+               ptrdiff_t offset;
+
+               // Default constructor, initializes object to failed state
+               xpath_parse_result();
+
+               // Cast to bool operator (implicit; presumably true when error is 0 - confirm in the implementation)
+               operator bool() const;
+
+               // Get error description (returns a narrow const char* regardless of the char_t configuration)
+               const char* description() const;
+       };
+
+       // A single XPath variable (constructor is protected, so only derived classes and the friend xpath_variable_set can create one)
+       class PUGIXML_CLASS xpath_variable
+       {
+               friend class xpath_variable_set;
+
+       protected:
+               xpath_value_type _type; // presumably the value returned by type()
+               xpath_variable* _next; // link to another variable; NOTE(review): presumably the next entry of a chain inside xpath_variable_set - confirm
+
+               xpath_variable();
+
+               // Non-copyable semantics (copy operations are declared in the protected section, not the public one)
+               xpath_variable(const xpath_variable&);
+               xpath_variable& operator=(const xpath_variable&);
+               
+       public:
+               // Get variable name
+               const char_t* name() const;
+
+               // Get variable type
+               xpath_value_type type() const;
+
+               // Get variable value; no type conversion is performed, default value (false, NaN, empty string, empty node set) is returned on type mismatch error
+               bool get_boolean() const;
+               double get_number() const;
+               const char_t* get_string() const;
+               const xpath_node_set& get_node_set() const;
+
+               // Set variable value; no type conversion is performed, false is returned on type mismatch error
+               bool set(bool value);
+               bool set(double value);
+               bool set(const char_t* value);
+               bool set(const xpath_node_set& value);
+       };
+
+       // A set of XPath variables (looked up by name; hands out xpath_variable pointers)
+       class PUGIXML_CLASS xpath_variable_set
+       {
+       private:
+               xpath_variable* _data[64]; // fixed array of 64 variable pointers; NOTE(review): presumably hash buckets chained through xpath_variable::_next - confirm
+
+               // Non-copyable semantics: copy constructor and copy assignment are declared private
+               xpath_variable_set(const xpath_variable_set&);
+               xpath_variable_set& operator=(const xpath_variable_set&);
+
+               xpath_variable* find(const char_t* name) const; // private lookup helper; presumably shared by both get() overloads
+
+       public:
+               // Default constructor/destructor
+               xpath_variable_set();
+               ~xpath_variable_set();
+
+               // Add a new variable or get the existing one, if the types match (NOTE(review): the result on a type mismatch is not visible here, presumably a null pointer)
+               xpath_variable* add(const char_t* name, xpath_value_type type);
+
+               // Set value of an existing variable; no type conversion is performed, false is returned if there is no such variable or if types mismatch
+               bool set(const char_t* name, bool value);
+               bool set(const char_t* name, double value);
+               bool set(const char_t* name, const char_t* value);
+               bool set(const char_t* name, const xpath_node_set& value);
+
+               // Get existing variable by name (const and non-const overloads)
+               xpath_variable* get(const char_t* name);
+               const xpath_variable* get(const char_t* name) const;
+       };
+
+       // A compiled XPath query object (non-copyable; evaluation functions are const)
+       class PUGIXML_CLASS xpath_query
+       {
+       private:
+               void* _impl; // opaque pointer; the pointed-to type is hidden from this header (presumably the compiled query)
+               xpath_parse_result _result; // presumably what result() returns
+
+               typedef void (*unspecified_bool_type)(xpath_query***); // function-pointer type targeted by the safe bool conversion below
+
+               // Non-copyable semantics: copy constructor and copy assignment are declared private
+               xpath_query(const xpath_query&);
+               xpath_query& operator=(const xpath_query&);
+
+       public:
+               // Construct a compiled object from XPath expression.
+               // If PUGIXML_NO_EXCEPTIONS is not defined, throws xpath_exception on compilation errors.
+               explicit xpath_query(const char_t* query, xpath_variable_set* variables = 0);
+
+               // Destructor
+               ~xpath_query();
+
+               // Get query expression return type
+               xpath_value_type return_type() const;
+               
+               // Evaluate expression as boolean value in the specified context; performs type conversion if necessary.
+               // If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors.
+               bool evaluate_boolean(const xpath_node& n) const;
+               
+               // Evaluate expression as double value in the specified context; performs type conversion if necessary.
+               // If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors.
+               double evaluate_number(const xpath_node& n) const;
+               
+       #ifndef PUGIXML_NO_STL
+               // Evaluate expression as string value in the specified context; performs type conversion if necessary.
+               // If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors.
+               string_t evaluate_string(const xpath_node& n) const;
+       #endif
+               
+               // Evaluate expression as string value in the specified context; performs type conversion if necessary.
+               // At most capacity characters are written to the destination buffer, full result size is returned (includes terminating zero).
+               // If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors.
+               // If PUGIXML_NO_EXCEPTIONS is defined, returns empty string instead.
+               size_t evaluate_string(char_t* buffer, size_t capacity, const xpath_node& n) const;
+
+               // Evaluate expression as node set in the specified context.
+               // If PUGIXML_NO_EXCEPTIONS is not defined, throws xpath_exception on type mismatch and std::bad_alloc on out of memory errors.
+               // If PUGIXML_NO_EXCEPTIONS is defined, returns empty node set instead.
+               xpath_node_set evaluate_node_set(const xpath_node& n) const;
+
+               // Evaluate expression as node set in the specified context.
+               // Return first node in document order, or empty node if node set is empty.
+               // If PUGIXML_NO_EXCEPTIONS is not defined, throws xpath_exception on type mismatch and std::bad_alloc on out of memory errors.
+               // If PUGIXML_NO_EXCEPTIONS is defined, returns empty node instead.
+               xpath_node evaluate_node(const xpath_node& n) const;
+
+               // Get parsing result (used to get compilation errors in PUGIXML_NO_EXCEPTIONS mode)
+               const xpath_parse_result& result() const;
+
+               // Safe bool conversion operator (converts to the private function-pointer typedef rather than to bool)
+               operator unspecified_bool_type() const;
+
+               // Borland C++ workaround
+               bool operator!() const;
+       };
+       
+       #ifndef PUGIXML_NO_EXCEPTIONS
+       // XPath exception class (only declared when PUGIXML_NO_EXCEPTIONS is not defined; derives from std::exception)
+       class PUGIXML_CLASS xpath_exception: public std::exception
+       {
+       private:
+               xpath_parse_result _result; // copy of a parse result; presumably the one passed to the constructor and returned by result()
+
+       public:
+               // Construct exception from parse result
+               explicit xpath_exception(const xpath_parse_result& result);
+
+               // Get error message (overrides std::exception::what; throw() is the pre-C++11 spelling of a non-throwing function)
+               virtual const char* what() const throw();
+
+               // Get parse result
+               const xpath_parse_result& result() const;
+       };
+       #endif
+       
+       // XPath node class (either xml_node or xml_attribute); holds one handle of each kind
+       class PUGIXML_CLASS xpath_node
+       {
+       private:
+               xml_node _node; // NOTE(review): for an attribute-type XPath node this presumably stores the parent passed to the two-argument constructor - confirm
+               xml_attribute _attribute;
+       
+               typedef void (*unspecified_bool_type)(xpath_node***); // function-pointer type targeted by the safe bool conversion below
+
+       public:
+               // Default constructor; constructs empty XPath node
+               xpath_node();
+               
+               // Construct XPath node from XML node/attribute (the node form is not explicit, so xml_node converts implicitly; an attribute needs its parent node)
+               xpath_node(const xml_node& node);
+               xpath_node(const xml_attribute& attribute, const xml_node& parent);
+
+               // Get node/attribute, if any
+               xml_node node() const;
+               xml_attribute attribute() const;
+               
+               // Get parent of contained node/attribute
+               xml_node parent() const;
+
+               // Safe bool conversion operator (converts to the private function-pointer typedef rather than to bool)
+               operator unspecified_bool_type() const;
+               
+               // Borland C++ workaround
+               bool operator!() const;
+
+               // Comparison operators
+               bool operator==(const xpath_node& n) const;
+               bool operator!=(const xpath_node& n) const;
+       };
+
+#ifdef __BORLANDC__
+       // Borland C++ workaround: free operator&& / operator|| overloads taking an xpath_node on the left and a bool on the right (declared only when __BORLANDC__ is defined)
+       bool PUGIXML_FUNCTION operator&&(const xpath_node& lhs, bool rhs);
+       bool PUGIXML_FUNCTION operator||(const xpath_node& lhs, bool rhs);
+#endif
+
+       // A fixed-size collection of XPath nodes (copyable; element access is read-only through const pointers)
+       class PUGIXML_CLASS xpath_node_set
+       {
+       public:
+               // Collection type
+               enum type_t
+               {
+                       type_unsorted,                  // Not ordered
+                       type_sorted,                    // Sorted by document order (ascending)
+                       type_sorted_reverse             // Sorted by document order (descending)
+               };
+               
+               // Constant iterator type (a plain pointer to const xpath_node)
+               typedef const xpath_node* const_iterator;
+
+               // We define non-constant iterator to be the same as constant iterator so that various generic algorithms (i.e. boost foreach) work
+               typedef const xpath_node* iterator;
+       
+               // Default constructor. Constructs empty set.
+               xpath_node_set();
+
+               // Constructs a set from iterator range; data is not checked for duplicates and is not sorted according to provided type, so be careful
+               xpath_node_set(const_iterator begin, const_iterator end, type_t type = type_unsorted);
+
+               // Destructor
+               ~xpath_node_set();
+               
+               // Copy constructor/assignment operator
+               xpath_node_set(const xpath_node_set& ns);
+               xpath_node_set& operator=(const xpath_node_set& ns);
+
+               // Get collection type
+               type_t type() const;
+               
+               // Get collection size
+               size_t size() const;
+
+               // Indexing operator (returns a const reference; NOTE(review): whether index is range-checked is not visible here)
+               const xpath_node& operator[](size_t index) const;
+               
+               // Collection iterators
+               const_iterator begin() const;
+               const_iterator end() const;
+
+               // Sort the collection in ascending/descending order by document order (ascending unless reverse is true)
+               void sort(bool reverse = false);
+               
+               // Get first node in the collection by document order
+               xpath_node first() const;
+               
+               // Check if collection is empty
+               bool empty() const;
+       
+       private:
+               type_t _type; // presumably the value returned by type()
+               
+               xpath_node _storage; // a single in-object xpath_node; NOTE(review): presumably avoids a separate allocation for one-element sets - confirm
+               
+               xpath_node* _begin; // presumably delimit the stored range exposed through begin()/end()
+               xpath_node* _end;
+
+               void _assign(const_iterator begin, const_iterator end); // private helper taking an iterator range; presumably replaces the stored contents
+       };
+#endif
+
+#ifndef PUGIXML_NO_STL
+       // Convert wide string to UTF8 (overloads for a raw wchar_t pointer and for a std::basic_string of wchar_t; the result is a std::basic_string of char)
+       std::basic_string<char, std::char_traits<char>, std::allocator<char> > PUGIXML_FUNCTION as_utf8(const wchar_t* str);
+       std::basic_string<char, std::char_traits<char>, std::allocator<char> > PUGIXML_FUNCTION as_utf8(const std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >& str);
+       
+       // Convert UTF8 to wide string (overloads for a raw char pointer and for a std::basic_string of char; the result is a std::basic_string of wchar_t)
+       std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> > PUGIXML_FUNCTION as_wide(const char* str);
+       std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> > PUGIXML_FUNCTION as_wide(const std::basic_string<char, std::char_traits<char>, std::allocator<char> >& str);
+#endif
+
+       // Memory allocation function interface; returns pointer to allocated memory or NULL on failure
+       typedef void* (*allocation_function)(size_t size);
+       
+       // Memory deallocation function interface (receives a pointer and returns nothing)
+       typedef void (*deallocation_function)(void* ptr);
+
+       // Override default memory management functions. All subsequent allocations/deallocations will be performed via supplied functions.
+       void PUGIXML_FUNCTION set_memory_management_functions(allocation_function allocate, deallocation_function deallocate);
+       
+       // Get current memory management functions (one getter per function pointer)
+       allocation_function PUGIXML_FUNCTION get_memory_allocation_function();
+       deallocation_function PUGIXML_FUNCTION get_memory_deallocation_function();
+}
+
+#if !defined(PUGIXML_NO_STL) && (defined(_MSC_VER) || defined(__ICC))
+namespace std
+{
+       // Workarounds for (non-standard) iterator category detection for older versions (MSVC7/IC8 and earlier); one _Iter_cat overload per pugi iterator type, each returning the bidirectional tag
+       std::bidirectional_iterator_tag PUGIXML_FUNCTION _Iter_cat(const pugi::xml_node_iterator&);
+       std::bidirectional_iterator_tag PUGIXML_FUNCTION _Iter_cat(const pugi::xml_attribute_iterator&);
+       std::bidirectional_iterator_tag PUGIXML_FUNCTION _Iter_cat(const pugi::xml_named_node_iterator&);
+}
+#endif
+
+#if !defined(PUGIXML_NO_STL) && defined(__SUNPRO_CC)
+namespace std
+{
+       // Workarounds for (non-standard) iterator category detection; the same three overloads under the __iterator_category name, compiled only when __SUNPRO_CC is defined
+       std::bidirectional_iterator_tag PUGIXML_FUNCTION __iterator_category(const pugi::xml_node_iterator&);
+       std::bidirectional_iterator_tag PUGIXML_FUNCTION __iterator_category(const pugi::xml_attribute_iterator&);
+       std::bidirectional_iterator_tag PUGIXML_FUNCTION __iterator_category(const pugi::xml_named_node_iterator&);
+}
+#endif
+
+#endif
+
+// Make sure implementation is included in header-only mode
+// Use macro expansion in #include to work around QMake (QTBUG-11923)
+#if defined(PUGIXML_HEADER_ONLY) && !defined(PUGIXML_SOURCE)
+#      define PUGIXML_SOURCE "pugixml.cpp"
+#      include PUGIXML_SOURCE
+#endif
+
+/**
+ * Copyright (c) 2006-2014 Arseny Kapoulkine
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
diff --git a/src/vcf/VcfFile.cpp b/src/vcf/VcfFile.cpp

new file mode 100644 (file)

index 0000000..b2fd2d4
--- /dev/null
+++ b/src/vcf/VcfFile.cpp
@@ -0,0 +1,34 @@
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/vcf/VcfFile.h"
+
+#include <cassert>
+#include <type_traits>
+
+#include "pbbam/vcf/VcfFormat.h"
+
+namespace PacBio {
+namespace VCF {
+
+static_assert(std::is_copy_constructible<VcfFile>::value,  // compile-time check: VcfFile can be copy-constructed
+              "VcfFile(const VcfFile&) is not = default");  // note: the trait proves availability only, not that the member is defaulted
+static_assert(std::is_copy_assignable<VcfFile>::value,  // compile-time check: VcfFile can be copy-assigned
+              "VcfFile& operator=(const VcfFile&) is not = default");
+
+static_assert(std::is_nothrow_move_constructible<VcfFile>::value,  // move construction must not throw
+              "VcfFile(VcfFile&&) is not = noexcept");
+static_assert(std::is_nothrow_move_assignable<VcfFile>::value ==  // move assignment only has to match std::string's guarantee;
+                  std::is_nothrow_move_assignable<std::string>::value,  // presumably because that guarantee differs between standard libraries
+              "");  // no diagnostic text is supplied for this check
+
+VcfFile::VcfFile(std::string fn)  // fn is taken by value so it can be moved into filename_
+    : filename_{std::move(fn)}, header_{VcfFormat::HeaderFromFile(filename_)}  // the header is read via filename_, not the moved-from fn
+{  // NOTE(review): this relies on filename_ being declared before header_ in the class - confirm in VcfFile.h
+}
+
+// Gives read-only access to the stored filename_ member (returned by reference, no copy).
+const std::string& VcfFile::Filename() const
+{
+    return filename_;
+}
+
+// Gives read-only access to the stored header_ member (returned by reference, no copy).
+const VcfHeader& VcfFile::Header() const
+{
+    return header_;
+}
+
+}  // namespace VCF
+}  // namespace PacBio
diff --git a/src/vcf/VcfFormat.cpp b/src/vcf/VcfFormat.cpp

new file mode 100644 (file)

index 0000000..005e5a7
--- /dev/null
+++ b/src/vcf/VcfFormat.cpp
@@ -0,0 +1,551 @@
+// Author: Derek Barnett
+
+#include "../PbbamInternalConfig.h"
+
+#include <pbbam/vcf/VcfFormat.h>
+
+#include <cassert>
+#include <cmath>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+
+#include <htslib/vcf.h>
+
+#include <pbbam/StringUtilities.h>
+#include <pbbam/vcf/VcfHeader.h>
+
+namespace PacBio {
+namespace VCF {
+
+namespace {  // anonymous
+
+// VCF version string reported by VcfFormat::CurrentVersion(); kept in step
+// with htslib's current version for better compatibility.
+// (Already file-local: it lives in the anonymous namespace.)
+constexpr char current_version[] = "VCFv4.2";
+
+// String constants used when formatting and parsing VCF header lines.
+namespace Tokens {
+
+// ID of the mandatory general definition
+constexpr char file_format[] = "fileformat";
+
+// line prefixes
+constexpr char double_hash[] = "##";
+constexpr char contig_lead[] = "##contig=<";
+constexpr char filter_lead[] = "##FILTER=<";
+constexpr char format_lead[] = "##FORMAT=<";
+constexpr char info_lead[] = "##INFO=<";
+constexpr char chrom_lead[] = "#CHROM";
+
+// keys inside structured ("<...>") definitions
+constexpr char id[] = "ID";
+constexpr char number[] = "Number";
+constexpr char type[] = "Type";
+constexpr char description[] = "Description";
+constexpr char source[] = "Source";
+constexpr char version[] = "Version";
+
+}  // namespace Tokens
+
+// Returns 'd' wrapped in a pair of double quotes.
+std::string QuotedText(const std::string& d)
+{
+    std::string quoted;
+    quoted.reserve(d.size() + 2);
+    quoted += '"';
+    quoted += d;
+    quoted += '"';
+    return quoted;
+}
+
+// Strips the enclosing double quotes from 'd'.
+// Throws std::runtime_error if 'd' does not both start and end with '"'.
+std::string UnquotedText(const std::string& d)
+{
+    const bool isQuoted = (d.size() >= 2) && (d.front() == '"') && (d.back() == '"');
+    if (!isQuoted)
+        throw std::runtime_error{"VcfFormat: description text is not quoted: " + d};
+
+    // everything between the first and last character
+    return std::string(d.cbegin() + 1, d.cend() - 1);
+}
+
+}  // namespace anonymous
+
+// Returns the VCF version string this library uses by default.
+const char* VcfFormat::CurrentVersion()
+{
+    return current_version;
+}
+
+// Formats a contig definition as "##contig=<ID=id[,key=value]...>".
+std::string VcfFormat::FormattedContigDefinition(const ContigDefinition& def)
+{
+    std::ostringstream out;
+
+    // ID
+    out << Tokens::contig_lead << Tokens::id << '=' << def.Id();
+
+    // attributes - each one is preceded by a comma separator
+    for (const auto& attribute : def.Attributes())
+        out << ',' << attribute.first << '=' << attribute.second;
+
+    out << '>';
+    return out.str();
+}
+
+// Formats a filter definition as ##FILTER=<ID=id,Description="text">.
+std::string VcfFormat::FormattedFilterDefinition(const FilterDefinition& def)
+{
+    std::ostringstream out;
+    out << Tokens::filter_lead;
+    out << Tokens::id << '=' << def.Id();
+    out << ',' << Tokens::description << '=' << QuotedText(def.Description());
+    out << '>';
+    return out.str();
+}
+
+// Formats a FORMAT definition as
+// ##FORMAT=<ID=id,Number=n,Type=t,Description="text">.
+std::string VcfFormat::FormattedFormatDefinition(const FormatDefinition& def)
+{
+    std::ostringstream out;
+    out << Tokens::format_lead;
+    out << Tokens::id << '=' << def.Id();
+    out << ',' << Tokens::number << '=' << def.Number();
+    out << ',' << Tokens::type << '=' << def.Type();
+    out << ',' << Tokens::description << '=' << QuotedText(def.Description());
+    out << '>';
+    return out.str();
+}
+
+// Formats a general (unstructured) definition as "##id=text".
+std::string VcfFormat::FormattedGeneralDefinition(const GeneralDefinition& def)
+{
+    std::string line{Tokens::double_hash};
+    line += def.Id();
+    line += '=';
+    line += def.Text();
+    return line;
+}
+
+// Formats an INFO definition as
+// ##INFO=<ID=id,Number=n,Type=t,Description="text"[,Source="s"][,Version="v"]>.
+// Source and Version are written only when they are set and non-empty.
+std::string VcfFormat::FormattedInfoDefinition(const InfoDefinition& def)
+{
+    std::ostringstream out;
+
+    // required fields
+    out << Tokens::info_lead;
+    out << Tokens::id << '=' << def.Id();
+    out << ',' << Tokens::number << '=' << def.Number();
+    out << ',' << Tokens::type << '=' << def.Type();
+    out << ',' << Tokens::description << '=' << QuotedText(def.Description());
+
+    // optional fields
+    const auto& source = def.Source();
+    if (source.is_initialized() && !source.get().empty())
+        out << ',' << Tokens::source << '=' << QuotedText(source.get());
+
+    const auto& version = def.Version();
+    if (version.is_initialized() && !version.get().empty())
+        out << ',' << Tokens::version << '=' << QuotedText(version.get());
+
+    out << '>';
+    return out.str();
+}
+
+// Formats a complete header as text (no trailing newline after the #CHROM line).
+//
+// Line order: fileformat, remaining general definitions, ##contig, ##FILTER,
+// ##INFO, ##FORMAT, then the #CHROM column line. The FORMAT column and sample
+// names are appended to the column line only if the header has samples.
+std::string VcfFormat::FormattedHeader(const VcfHeader& header)
+{
+    std::ostringstream text;
+
+    // fileformat always comes first
+    text << FormattedGeneralDefinition(header.GeneralDefinition(Tokens::file_format)) << '\n';
+
+    // remaining general definitions
+    for (const auto& general : header.GeneralDefinitions()) {
+        if (general.Id() == Tokens::file_format) continue;  // already written
+        text << FormattedGeneralDefinition(general) << '\n';
+    }
+
+    // ##contig
+    for (const auto& def : header.ContigDefinitions()) {
+        text << FormattedContigDefinition(def) << '\n';
+    }
+
+    // ##FILTER
+    for (const auto& def : header.FilterDefinitions()) {
+        text << FormattedFilterDefinition(def) << '\n';
+    }
+
+    // ##INFO
+    for (const auto& def : header.InfoDefinitions()) {
+        text << FormattedInfoDefinition(def) << '\n';
+    }
+
+    // ##FORMAT
+    for (const auto& def : header.FormatDefinitions()) {
+        text << FormattedFormatDefinition(def) << '\n';
+    }
+
+    // fixed column labels
+    text << "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO";
+
+    // optional FORMAT column + one column per sample
+    const auto& sampleNames = header.Samples();
+    if (!sampleNames.empty()) {
+        text << "\tFORMAT";
+        for (const auto& name : sampleNames) {
+            text << '\t' << name;
+        }
+    }
+
+    return text.str();
+}
+
+namespace {  // helpers for structured ("##KEY=<...>") header lines
+
+// Returns the text between the line's lead (its first 'leadLength' characters,
+// e.g. "##INFO=<") and the last '>' in 'line'.
+//
+// Throws std::runtime_error if there is no '>' after the lead. 'lineType' is
+// only used to build the error message.
+std::string StructuredLineContent(const std::string& line, const size_t leadLength,
+                                  const std::string& lineType)
+{
+    const auto lastBracketPos = line.find_last_of('>');
+    if (lastBracketPos == std::string::npos || lastBracketPos < leadLength)
+        throw std::runtime_error{"VcfFormat: malformed " + lineType + " line: " + line};
+    return line.substr(leadLength, lastBracketPos - leadLength);
+}
+
+// Splits the content of a structured header line into (key, value) pairs.
+//
+// Fields are separated by ',' and each field is split at its first '='.
+// Inside a double-quoted value, ',' and '=' are literal characters and a
+// backslash escapes the following character, so that e.g.
+//   Description="dbSNP membership, build 129"
+// is kept as one field. Values are returned verbatim, including their quotes.
+//
+// Throws std::runtime_error for a field with no '=', an empty key or an empty
+// value. A single trailing ',' is tolerated.
+std::vector<std::pair<std::string, std::string>> StructuredLineFields(const std::string& content,
+                                                                      const std::string& lineType)
+{
+    std::vector<std::pair<std::string, std::string>> fields;
+    std::string key;
+    std::string value;
+    bool seenEquals = false;
+    bool inQuotes = false;
+
+    // stores the field collected so far & resets state for the next one
+    const auto flush = [&]() {
+        if (!seenEquals || key.empty() || value.empty())
+            throw std::runtime_error{"VcfFormat: malformed " + lineType + " line: " + content};
+        fields.emplace_back(std::move(key), std::move(value));
+        key.clear();
+        value.clear();
+        seenEquals = false;
+    };
+
+    for (size_t i = 0; i < content.size(); ++i) {
+        const char c = content[i];
+        if (inQuotes) {
+            value += c;
+            if (c == '\\' && i + 1 < content.size())
+                value += content[++i];  // escaped character, never closes the quote
+            else if (c == '"')
+                inQuotes = false;
+        } else if (c == ',') {
+            flush();
+        } else if (!seenEquals) {
+            if (c == '=')
+                seenEquals = true;
+            else
+                key += c;
+        } else {
+            if (c == '"') inQuotes = true;
+            value += c;
+        }
+    }
+
+    // last field (nothing is pending after a trailing ',')
+    if (seenEquals || !key.empty()) flush();
+    return fields;
+}
+
+}  // namespace anonymous
+
+// Parses a "##contig=<ID=...,key=value,...>" header line.
+//
+// The ID field becomes the contig ID; all other fields are stored, in order,
+// as attributes. Throws std::runtime_error on a malformed line or (from the
+// ContigDefinition constructor) an empty ID.
+ContigDefinition VcfFormat::ParsedContigDefinition(std::string line)
+{
+    // should already be checked by "normal" code path
+    assert(line.find(Tokens::contig_lead) == 0);
+
+    // substring between brackets
+    line = StructuredLineContent(line, sizeof(Tokens::contig_lead) - 1, "##contig");
+
+    std::string id;
+    std::vector<std::pair<std::string, std::string>> attributes;
+
+    for (auto& field : StructuredLineFields(line, "##contig")) {
+        if (field.first == Tokens::id)
+            id = std::move(field.second);
+        else
+            attributes.push_back(std::move(field));
+    }
+
+    return ContigDefinition{std::move(id), std::move(attributes)};
+}
+
+// Parses a ##FILTER=<ID=...,Description="..."> header line.
+//
+// Throws std::runtime_error on a malformed line, an unrecognized field, an
+// unquoted Description, or (from the FilterDefinition constructor) an empty
+// required field.
+FilterDefinition VcfFormat::ParsedFilterDefinition(std::string line)
+{
+    // should already be checked by "normal" code path
+    assert(line.find(Tokens::filter_lead) == 0);
+
+    // substring between brackets
+    line = StructuredLineContent(line, sizeof(Tokens::filter_lead) - 1, "FILTER");
+
+    std::string id;
+    std::string description;
+
+    for (auto& field : StructuredLineFields(line, "FILTER")) {
+        if (field.first == Tokens::id)
+            id = std::move(field.second);
+        else if (field.first == Tokens::description)
+            description = UnquotedText(field.second);
+        else
+            throw std::runtime_error{"VcfFormat: unrecognized FILTER field: " + field.first};
+    }
+
+    return FilterDefinition{std::move(id), std::move(description)};
+}
+
+// Parses a ##FORMAT=<ID=...,Number=...,Type=...,Description="..."> header line.
+//
+// Throws std::runtime_error on a malformed line, an unrecognized field, an
+// unquoted Description, or (from the FormatDefinition constructor) an empty
+// required field.
+FormatDefinition VcfFormat::ParsedFormatDefinition(std::string line)
+{
+    // should already be checked by "normal" code path
+    assert(line.find(Tokens::format_lead) == 0);
+
+    // substring between brackets
+    line = StructuredLineContent(line, sizeof(Tokens::format_lead) - 1, "FORMAT");
+
+    std::string id;
+    std::string number;
+    std::string type;
+    std::string description;
+
+    for (auto& field : StructuredLineFields(line, "FORMAT")) {
+        if (field.first == Tokens::id)
+            id = std::move(field.second);
+        else if (field.first == Tokens::number)
+            number = std::move(field.second);
+        else if (field.first == Tokens::type)
+            type = std::move(field.second);
+        else if (field.first == Tokens::description)
+            description = UnquotedText(field.second);
+        else
+            throw std::runtime_error{"VcfFormat: unrecognized FORMAT field: " + field.first};
+    }
+
+    return FormatDefinition{std::move(id), std::move(number), std::move(type),
+                            std::move(description)};
+}
+
+// Parses a general "##key=value" header line.
+//
+// The line is split at its FIRST '=', so the value itself may contain '='
+// (e.g. a command line, or a structured "<...>" definition that has no
+// dedicated parser). Throws std::runtime_error if the line does not start
+// with "##" or has no '=', or (from the GeneralDefinition constructor) if the
+// key or the value is empty.
+GeneralDefinition VcfFormat::ParsedGeneralDefinition(const std::string& line)
+{
+    const auto equalsPos = line.find('=');
+    if (line.find(Tokens::double_hash) != 0 || equalsPos == std::string::npos) {
+        throw std::runtime_error{"VcfFormat: malformed header line: " + line};
+    }
+
+    // skip the leading "##"; '=' cannot precede it since the line starts with "##"
+    return GeneralDefinition{line.substr(2, equalsPos - 2), line.substr(equalsPos + 1)};
+}
+
+// Parses a ##INFO=<ID=...,Number=...,Type=...,Description="..."[,Source="..."]
+// [,Version="..."]> header line.
+//
+// Throws std::runtime_error on a malformed line, an unrecognized field, an
+// unquoted Description/Source/Version, or (from the InfoDefinition
+// constructor) an empty required field.
+InfoDefinition VcfFormat::ParsedInfoDefinition(std::string line)
+{
+    // should already be checked by "normal" code path
+    assert(line.find(Tokens::info_lead) == 0);
+
+    // substring between brackets
+    line = StructuredLineContent(line, sizeof(Tokens::info_lead) - 1, "INFO");
+
+    std::string id;
+    std::string number;
+    std::string type;
+    std::string description;
+    std::string source;
+    std::string version;
+
+    for (auto& field : StructuredLineFields(line, "INFO")) {
+        if (field.first == Tokens::id)
+            id = std::move(field.second);
+        else if (field.first == Tokens::number)
+            number = std::move(field.second);
+        else if (field.first == Tokens::type)
+            type = std::move(field.second);
+        else if (field.first == Tokens::description)
+            description = UnquotedText(field.second);
+        else if (field.first == Tokens::source)
+            source = UnquotedText(field.second);
+        else if (field.first == Tokens::version)
+            version = UnquotedText(field.second);
+        else
+            throw std::runtime_error{"VcfFormat: unrecognized INFO field: " + field.first};
+    }
+
+    return InfoDefinition{std::move(id),          std::move(number), std::move(type),
+                          std::move(description), std::move(source), std::move(version)};
+}
+
+// Parses VCF header text into a VcfHeader.
+//
+// The first line must be the fileformat definition. Every following non-empty
+// line must be an INFO, FILTER, FORMAT, contig or general ("##...") definition,
+// or the #CHROM column line, which ends parsing. Throws std::runtime_error on
+// any other line, or if no #CHROM line is found.
+VcfHeader VcfFormat::ParsedHeader(const std::string& hdrText)
+{
+    VcfHeader hdr;
+
+    std::istringstream text{hdrText};
+    std::string line;
+
+    // quick check for fileformat - should be the first line
+    std::getline(text, line);
+    auto fileformat = ParsedGeneralDefinition(line);
+    if (fileformat.Id() != Tokens::file_format)
+        throw std::runtime_error{"VcfFormat: file must begin with #fileformat line"};
+    hdr.AddGeneralDefinition(std::move(fileformat));
+
+    // true if the current line begins with 'lead'
+    const auto startsWith = [&line](const char* lead) { return line.find(lead) == 0; };
+
+    // read through rest of header
+    bool chromLineFound = false;
+    while (std::getline(text, line)) {
+        if (line.empty()) continue;
+
+        if (startsWith(Tokens::info_lead)) {
+            hdr.AddInfoDefinition(ParsedInfoDefinition(line));
+            continue;
+        }
+        if (startsWith(Tokens::filter_lead)) {
+            hdr.AddFilterDefinition(ParsedFilterDefinition(line));
+            continue;
+        }
+        if (startsWith(Tokens::format_lead)) {
+            hdr.AddFormatDefinition(ParsedFormatDefinition(line));
+            continue;
+        }
+        if (startsWith(Tokens::contig_lead)) {
+            hdr.AddContigDefinition(ParsedContigDefinition(line));
+            continue;
+        }
+
+        // NOTE: This must come after all the specific line types above, as it
+        //       catches every remaining line starting with "##".
+        if (startsWith(Tokens::double_hash)) {
+            hdr.AddGeneralDefinition(ParsedGeneralDefinition(line));
+            continue;
+        }
+
+        if (!startsWith(Tokens::chrom_lead))
+            throw std::runtime_error{"VcfFormat: unexpected line found in header:\n" + line};
+
+        // CHROM line (maybe w/ samples): columns 0-7 are fixed & column 8 is
+        // FORMAT, so any sample names start at column 9.
+        auto columns = PacBio::BAM::Split(line, '\t');
+        std::vector<Sample> samples;
+        for (size_t i = 9; i < columns.size(); ++i)
+            samples.push_back(std::move(columns[i]));
+        hdr.Samples(std::move(samples));
+
+        // quit header parsing after CHROM line
+        chromLineFound = true;
+        break;
+    }
+
+    if (!chromLineFound) throw std::runtime_error{"VcfFormat: CHROM column line is missing"};
+
+    return hdr;
+}
+
+// Reads and parses the VCF header from the file 'fn'.
+//
+// Throws std::runtime_error if the file cannot be opened (previously this
+// surfaced as a misleading "malformed header line" error), or if the header
+// itself is invalid.
+VcfHeader VcfFormat::HeaderFromFile(const std::string& fn)
+{
+    std::ifstream in(fn);
+    if (!in) throw std::runtime_error{"VcfFormat: could not open file for reading: " + fn};
+    return HeaderFromStream(in);
+}
+
+// Reads header lines from 'in' and parses them into a VcfHeader.
+//
+// Empty lines are skipped. Reading stops at the first non-empty line that does
+// not start with '#'; that line has been consumed from the stream by then.
+VcfHeader VcfFormat::HeaderFromStream(std::istream& in)
+{
+    std::string headerText;
+
+    std::string line;
+    while (std::getline(in, line)) {
+        if (line.empty()) continue;
+        if (line.front() != '#') break;  // first non-header line
+        headerText += line;
+        headerText += '\n';
+    }
+
+    return ParsedHeader(headerText);
+}
+
+// Parses one INFO entry: "ID", "ID=value" or "ID=v1,v2,...".
+//
+// A value text containing a comma is stored as 'values', otherwise as 'value'.
+// Throws std::runtime_error if splitting 'text' on '=' yields no tokens.
+InfoField VcfFormat::ParsedInfoField(const std::string& text)
+{
+    const auto tokens = PacBio::BAM::Split(text, '=');
+    if (tokens.empty()) throw std::runtime_error{"VcfFormat: malformed INFO field: " + text};
+
+    // required ID
+    InfoField field;
+    field.id = tokens.front();
+
+    // optional value or values
+    if (tokens.size() > 1) {
+        const auto& valueText = tokens[1];
+        if (valueText.find(',') == std::string::npos)
+            field.value = valueText;
+        else
+            field.values = PacBio::BAM::Split(valueText, ',');
+    }
+
+    return field;
+}
+
+// Parses a ';'-separated INFO column into its individual fields.
+std::vector<InfoField> VcfFormat::ParsedInfoFields(const std::string& text)
+{
+    std::vector<InfoField> infoFields;
+    for (const auto& fieldText : PacBio::BAM::Split(text, ';')) {
+        infoFields.push_back(ParsedInfoField(fieldText));
+    }
+    return infoFields;
+}
+
+// Parses one sample column: ':'-separated entries, each holding a single
+// value or a ','-separated list of values.
+//
+// An entry that splits into exactly one token is stored as 'value'; any other
+// token count is stored as 'values'.
+GenotypeField VcfFormat::ParsedGenotypeField(const std::string& field)
+{
+    GenotypeField genotype;
+    for (const auto& entry : PacBio::BAM::Split(field, ':')) {
+        auto entryValues = PacBio::BAM::Split(entry, ',');
+
+        GenotypeData data;
+        if (entryValues.size() == 1)
+            data.value = entryValues.front();
+        else
+            data.values = std::move(entryValues);
+        genotype.data.push_back(std::move(data));
+    }
+    return genotype;
+}
+
+// Parses one tab-separated VCF record line into a VcfVariant.
+//
+// At least the first 7 columns (CHROM..FILTER) are required; INFO is optional.
+// Genotype data is read only when the line has 10 or more columns, i.e. a
+// FORMAT column plus at least one sample column.
+// Throws std::runtime_error if required columns are missing; std::stoi and
+// std::stof may throw on non-numeric POS/QUAL text.
+VcfVariant VcfFormat::ParsedVariant(const std::string& line)
+{
+    const auto fields = PacBio::BAM::Split(line, '\t');
+    if (fields.size() < 7)
+        throw std::runtime_error{"VcfFormat: record is missing required fields: " + line};
+
+    // CHROM POS ID REF ALT
+    std::string chrom = fields[0];
+    auto pos = std::stoi(fields[1]);
+    std::string id = fields[2];
+    std::string ref = fields[3];
+    std::string alt = fields[4];
+    VcfVariant var{std::move(id), std::move(chrom), std::move(pos), std::move(ref), std::move(alt)};
+
+    // QUAL ("." means missing, stored as NaN)
+    const auto& qualText = fields[5];
+    float qual = NAN;
+    if (qualText != ".") qual = std::stof(qualText);
+    var.Quality(qual);
+
+    // FILTER
+    var.Filter(fields[6]);
+
+    // INFO (allow empty)
+    if (fields.size() >= 8) var.InfoFields(ParsedInfoFields(fields[7]));
+
+    // GENOTYPE (samples): column 8 holds the IDs, columns 9+ the per-sample data
+    if (fields.size() > 9) {
+        var.GenotypeIds(PacBio::BAM::Split(fields[8], ':'));
+
+        std::vector<GenotypeField> genotypes;
+        for (size_t i = 9; i < fields.size(); ++i)
+            genotypes.emplace_back(ParsedGenotypeField(fields[i]));
+        var.Genotypes(std::move(genotypes));
+    }
+
+    return var;
+}
+
+// Formats one INFO entry as "ID", "ID=value" or "ID=v1,v2,...".
+// A single 'value' takes precedence over 'values' if both are set.
+std::string VcfFormat::FormattedInfoField(const InfoField& field)
+{
+    std::ostringstream text;
+    text << field.id;
+
+    if (field.value.is_initialized()) {
+        text << '=' << field.value.get();
+        return text.str();
+    }
+    if (field.values.is_initialized()) {
+        text << '=' << PacBio::BAM::Join(field.values.get(), ',');
+    }
+    return text.str();
+}
+
+// Formats INFO fields as a single ';'-joined column.
+std::string VcfFormat::FormattedInfoFields(const std::vector<InfoField>& fields)
+{
+    std::vector<std::string> formatted;
+    for (const auto& infoField : fields) {
+        formatted.push_back(FormattedInfoField(infoField));
+    }
+    return PacBio::BAM::Join(formatted, ';');
+}
+
+// Formats one sample column: entries joined by ':', with multi-valued entries
+// joined by ','. Each entry is expected to have either 'value' or 'values' set.
+std::string VcfFormat::FormattedGenotypeField(const GenotypeField& field)
+{
+    std::string text;
+    bool needsSeparator = false;
+    for (const auto& entry : field.data) {
+        if (needsSeparator) text += ':';
+        needsSeparator = true;
+
+        if (entry.value.is_initialized()) {
+            text += entry.value.get();
+            continue;
+        }
+        assert(entry.values.is_initialized());
+        text += PacBio::BAM::Join(entry.values.get(), ',');
+    }
+    return text;
+}
+
+// Formats a variant as one tab-separated VCF record line (no trailing newline).
+//
+// A missing quality is written as "."; otherwise std::to_string is used.
+// The FORMAT and sample columns are written only if the variant has genotype IDs.
+std::string VcfFormat::FormattedVariant(const VcfVariant& var)
+{
+    std::ostringstream text;
+
+    // fixed columns
+    text << var.Chrom() << '\t';
+    text << var.Position() << '\t';
+    text << var.Id() << '\t';
+    text << var.RefAllele() << '\t';
+    text << var.AltAllele() << '\t';
+    if (var.IsQualityMissing())
+        text << ".";
+    else
+        text << std::to_string(var.Quality());
+    text << '\t' << var.Filter();
+    text << '\t' << FormattedInfoFields(var.InfoFields());
+
+    // FORMAT + samples
+    const auto& ids = var.GenotypeIds();
+    if (ids.empty()) return text.str();
+
+    text << '\t' << PacBio::BAM::Join(ids, ':');
+    for (const auto& sampleData : var.Genotypes()) {
+        text << '\t' << FormattedGenotypeField(sampleData);
+    }
+    return text.str();
+}
+
+}  // namespace VCF
+}  // namespace PacBio
diff --git a/src/vcf/VcfHeader.cpp b/src/vcf/VcfHeader.cpp

new file mode 100644 (file)

index 0000000..ff2f07d
--- /dev/null
+++ b/src/vcf/VcfHeader.cpp
@@ -0,0 +1,234 @@
+// Author: Derek Barnett
+
+#include "../PbbamInternalConfig.h"
+
+#include <pbbam/vcf/VcfHeader.h>
+
+#include <cassert>
+#include <type_traits>
+
+#include <pbbam/vcf/VcfFormat.h>
+
+namespace PacBio {
+namespace VCF {
+
+// Compile-time checks on VcfHeader's special member functions.
+// Copy construction and copy assignment must be available.
+static_assert(std::is_copy_constructible<VcfHeader>::value,
+              "VcfHeader(const VcfHeader&) is not = default");
+static_assert(std::is_copy_assignable<VcfHeader>::value,
+              "VcfHeader& operator=(const VcfHeader&) is not = default");
+
+// Move construction and move assignment must both be noexcept.
+static_assert(std::is_nothrow_move_constructible<VcfHeader>::value,
+              "VcfHeader(VcfHeader&&) is not = noexcept");
+static_assert(std::is_nothrow_move_assignable<VcfHeader>::value,
+              "VcfHeader& operator=(VcfHeader&&) is not = noexcept");
+
+// Creates a header whose only content is the fileformat definition, set to
+// VcfFormat::CurrentVersion().
+VcfHeader::VcfHeader()
+{
+    Version(VcfFormat::CurrentVersion());
+}
+
+// Creates a header by parsing 'hdrText'; parse errors propagate to the caller.
+VcfHeader::VcfHeader(const std::string& hdrText)
+{
+    auto parsed = VcfFormat::ParsedHeader(hdrText);
+    *this = std::move(parsed);
+}
+
+// Adds a contig definition, or replaces the existing one with the same ID
+// (keeping its position in the list).
+VcfHeader& VcfHeader::AddContigDefinition(PacBio::VCF::ContigDefinition contig)
+{
+    const auto existing = contigLookup_.find(contig.Id());
+    if (existing != contigLookup_.cend()) {
+        contigDefinitions_.at(existing->second) = std::move(contig);
+        return *this;
+    }
+
+    // new ID: remember the index it is about to be stored at
+    contigLookup_.insert({contig.Id(), contigDefinitions_.size()});
+    contigDefinitions_.push_back(std::move(contig));
+    return *this;
+}
+
+// Adds a FILTER definition, or replaces the existing one with the same ID
+// (keeping its position in the list).
+VcfHeader& VcfHeader::AddFilterDefinition(PacBio::VCF::FilterDefinition filter)
+{
+    const auto existing = filterLookup_.find(filter.Id());
+    if (existing != filterLookup_.cend()) {
+        filterDefinitions_.at(existing->second) = std::move(filter);
+        return *this;
+    }
+
+    // new ID: remember the index it is about to be stored at
+    filterLookup_.insert({filter.Id(), filterDefinitions_.size()});
+    filterDefinitions_.push_back(std::move(filter));
+    return *this;
+}
+
+// Adds a FORMAT definition, or replaces the existing one with the same ID
+// (keeping its position in the list).
+VcfHeader& VcfHeader::AddFormatDefinition(PacBio::VCF::FormatDefinition format)
+{
+    const auto existing = formatLookup_.find(format.Id());
+    if (existing != formatLookup_.cend()) {
+        formatDefinitions_.at(existing->second) = std::move(format);
+        return *this;
+    }
+
+    // new ID: remember the index it is about to be stored at
+    formatLookup_.insert({format.Id(), formatDefinitions_.size()});
+    formatDefinitions_.push_back(std::move(format));
+    return *this;
+}
+
+// Adds a general definition, or replaces the existing one with the same ID
+// (keeping its position in the list).
+VcfHeader& VcfHeader::AddGeneralDefinition(PacBio::VCF::GeneralDefinition def)
+{
+    const auto existing = generalLookup_.find(def.Id());
+    if (existing != generalLookup_.cend()) {
+        generalDefinitions_.at(existing->second) = std::move(def);
+        return *this;
+    }
+
+    // new ID: remember the index it is about to be stored at
+    generalLookup_.insert({def.Id(), generalDefinitions_.size()});
+    generalDefinitions_.push_back(std::move(def));
+    return *this;
+}
+
+// Adds an INFO definition, or replaces the existing one with the same ID
+// (keeping its position in the list).
+VcfHeader& VcfHeader::AddInfoDefinition(PacBio::VCF::InfoDefinition info)
+{
+    const auto existing = infoLookup_.find(info.Id());
+    if (existing != infoLookup_.cend()) {
+        infoDefinitions_.at(existing->second) = std::move(info);
+        return *this;
+    }
+
+    // new ID: remember the index it is about to be stored at
+    infoLookup_.insert({info.Id(), infoDefinitions_.size()});
+    infoDefinitions_.push_back(std::move(info));
+    return *this;
+}
+
+// Adds a sample name. A name that is already present keeps its existing
+// position (its slot is simply re-assigned).
+VcfHeader& VcfHeader::AddSample(std::string sample)
+{
+    const auto existing = sampleLookup_.find(sample);
+    if (existing != sampleLookup_.cend()) {
+        samples_.at(existing->second) = std::move(sample);
+        return *this;
+    }
+
+    // new name: remember the index it is about to be stored at
+    sampleLookup_.insert({sample, samples_.size()});
+    samples_.push_back(std::move(sample));
+    return *this;
+}
+
+// Returns all contig definitions, in stored order.
+const std::vector<PacBio::VCF::ContigDefinition>& VcfHeader::ContigDefinitions() const
+{
+    return contigDefinitions_;
+}
+
+// Returns the contig definition with the given ID.
+// The lookup uses at(), so an unknown ID raises an exception.
+const PacBio::VCF::ContigDefinition& VcfHeader::ContigDefinition(const std::string& id) const
+{
+    const auto index = contigLookup_.at(id);
+    return contigDefinitions_.at(index);
+}
+
+// Replaces all contig definitions with 'defs'.
+// Entries are added one at a time, so a later duplicate ID replaces an earlier one.
+VcfHeader& VcfHeader::ContigDefinitions(std::vector<PacBio::VCF::ContigDefinition> defs)
+{
+    contigLookup_.clear();
+    contigDefinitions_.clear();
+    for (auto& contig : defs) {
+        AddContigDefinition(std::move(contig));
+    }
+    return *this;
+}
+
+// Returns the text of the "fileDate" general definition.
+// The lookup uses at(), so a header without one raises an exception.
+const std::string& VcfHeader::FileDate() const
+{
+    const auto index = generalLookup_.at("fileDate");
+    return generalDefinitions_.at(index).Text();
+}
+
+// Sets (adds or replaces) the "fileDate" general definition.
+VcfHeader& VcfHeader::FileDate(std::string fileDate)
+{
+    return AddGeneralDefinition({"fileDate", std::move(fileDate)});
+}
+
+// Returns all FILTER definitions, in stored order.
+const std::vector<PacBio::VCF::FilterDefinition>& VcfHeader::FilterDefinitions() const
+{
+    return filterDefinitions_;
+}
+
+// Returns the FILTER definition with the given ID.
+// The lookup uses at(), so an unknown ID raises an exception.
+const PacBio::VCF::FilterDefinition& VcfHeader::FilterDefinition(const std::string& id) const
+{
+    const auto index = filterLookup_.at(id);
+    return filterDefinitions_.at(index);
+}
+
+// Replaces all FILTER definitions with 'defs'.
+// Entries are added one at a time, so a later duplicate ID replaces an earlier one.
+VcfHeader& VcfHeader::FilterDefinitions(std::vector<PacBio::VCF::FilterDefinition> defs)
+{
+    filterLookup_.clear();
+    filterDefinitions_.clear();
+    for (auto& filter : defs) {
+        AddFilterDefinition(std::move(filter));
+    }
+    return *this;
+}
+
+// Returns all FORMAT definitions, in stored order.
+const std::vector<PacBio::VCF::FormatDefinition>& VcfHeader::FormatDefinitions() const
+{
+    return formatDefinitions_;
+}
+
+// Returns the FORMAT definition with the given ID.
+// The lookup uses at(), so an unknown ID raises an exception.
+const PacBio::VCF::FormatDefinition& VcfHeader::FormatDefinition(const std::string& id) const
+{
+    const auto index = formatLookup_.at(id);
+    return formatDefinitions_.at(index);
+}
+
+// Replaces all FORMAT definitions with 'defs'.
+// Entries are added one at a time, so a later duplicate ID replaces an earlier one.
+VcfHeader& VcfHeader::FormatDefinitions(std::vector<PacBio::VCF::FormatDefinition> defs)
+{
+    formatLookup_.clear();
+    formatDefinitions_.clear();
+    for (auto& format : defs) {
+        AddFormatDefinition(std::move(format));
+    }
+    return *this;
+}
+
+// Returns all general definitions, in stored order.
+const std::vector<PacBio::VCF::GeneralDefinition>& VcfHeader::GeneralDefinitions() const
+{
+    return generalDefinitions_;
+}
+
+// Returns the general definition with the given ID.
+// The lookup uses at(), so an unknown ID raises an exception.
+const PacBio::VCF::GeneralDefinition& VcfHeader::GeneralDefinition(const std::string& id) const
+{
+    const auto index = generalLookup_.at(id);
+    return generalDefinitions_.at(index);
+}
+
+// Replaces all general definitions with 'defs'.
+// Entries are added one at a time, so a later duplicate ID replaces an earlier one.
+VcfHeader& VcfHeader::GeneralDefinitions(std::vector<PacBio::VCF::GeneralDefinition> defs)
+{
+    generalLookup_.clear();
+    generalDefinitions_.clear();
+    for (auto& general : defs) {
+        AddGeneralDefinition(std::move(general));
+    }
+    return *this;
+}
+
+// Returns all INFO definitions, in stored order.
+const std::vector<PacBio::VCF::InfoDefinition>& VcfHeader::InfoDefinitions() const
+{
+    return infoDefinitions_;
+}
+
+// Returns the INFO definition with the given ID.
+// The lookup uses at(), so an unknown ID raises an exception.
+const PacBio::VCF::InfoDefinition& VcfHeader::InfoDefinition(const std::string& id) const
+{
+    const auto index = infoLookup_.at(id);
+    return infoDefinitions_.at(index);
+}
+
+// Replaces all INFO definitions with 'defs'.
+// Entries are added one at a time, so a later duplicate ID replaces an earlier one.
+VcfHeader& VcfHeader::InfoDefinitions(std::vector<PacBio::VCF::InfoDefinition> defs)
+{
+    infoLookup_.clear();
+    infoDefinitions_.clear();
+    for (auto& info : defs) {
+        AddInfoDefinition(std::move(info));
+    }
+    return *this;
+}
+
+// Returns the number of header lines: one per stored definition of each kind,
+// plus one for the #CHROM column line.
+size_t VcfHeader::NumLines() const
+{
+    size_t count = 1;  // #CHROM line
+    count += generalDefinitions_.size();
+    count += contigDefinitions_.size();
+    count += infoDefinitions_.size();
+    count += filterDefinitions_.size();
+    count += formatDefinitions_.size();
+    return count;
+}
+
+// Returns the sample at 'index'; at() raises an exception if it is out of range.
+const Sample& VcfHeader::SampleAt(size_t index) const
+{
+    return samples_.at(index);
+}
+
+// Returns the index of 'sample'; at() raises an exception for an unknown sample.
+size_t VcfHeader::IndexOfSample(const Sample& sample) const
+{
+    return sampleLookup_.at(sample);
+}
+
+// Returns all samples, in stored order.
+const std::vector<Sample>& VcfHeader::Samples() const
+{
+    return samples_;
+}
+
+// Replaces all samples with 'names'.
+// Names are added one at a time, so a repeated name is stored only once.
+VcfHeader& VcfHeader::Samples(std::vector<Sample> names)
+{
+    sampleLookup_.clear();
+    samples_.clear();
+    for (auto& sampleName : names) {
+        AddSample(std::move(sampleName));
+    }
+    return *this;
+}
+
+// Returns the text of the "fileformat" general definition.
+// The lookup uses at(), so a header without one raises an exception.
+const std::string& VcfHeader::Version() const
+{
+    const auto index = generalLookup_.at("fileformat");
+    return generalDefinitions_.at(index).Text();
+}
+
+// Sets (adds or replaces) the "fileformat" general definition.
+VcfHeader& VcfHeader::Version(std::string version)
+{
+    return AddGeneralDefinition({"fileformat", std::move(version)});
+}
+
+}  // namespace VCF
+}  // namespace PacBio
diff --git a/src/vcf/VcfHeaderTypes.cpp b/src/vcf/VcfHeaderTypes.cpp

new file mode 100644 (file)

index 0000000..73ab8c2
--- /dev/null
+++ b/src/vcf/VcfHeaderTypes.cpp
@@ -0,0 +1,233 @@
+#include "../PbbamInternalConfig.h"
+
+#include "pbbam/vcf/VcfHeaderTypes.h"
+
+#include <cassert>
+#include <type_traits>
+
+#include <pbbam/vcf/VcfHeader.h>
+
+namespace PacBio {
+namespace VCF {
+
+// -------------------
+// ContigDefinition
+// -------------------
+
+// Compile-time checks on ContigDefinition's special member functions.
+// Copy construction and copy assignment must be available.
+static_assert(std::is_copy_constructible<ContigDefinition>::value,
+              "ContigDefinition(const ContigDefinition&) is not = default");
+static_assert(std::is_copy_assignable<ContigDefinition>::value,
+              "ContigDefinition& operator=(const ContigDefinition&) is not = default");
+
+// Move construction must be noexcept.
+static_assert(std::is_nothrow_move_constructible<ContigDefinition>::value,
+              "ContigDefinition(ContigDefinition&&) is not = noexcept");
+// Move assignment must be noexcept exactly when std::string's move assignment is.
+static_assert(std::is_nothrow_move_assignable<ContigDefinition>::value ==
+                  std::is_nothrow_move_assignable<std::string>::value,
+              "");
+
+// Creates a contig definition with no attributes (delegates to the
+// two-argument constructor).
+ContigDefinition::ContigDefinition(std::string id)
+    : ContigDefinition{std::move(id), {}}
+{}
+
+// Creates a contig definition with the given ID and attributes.
+// Throws std::runtime_error if 'id' is empty.
+ContigDefinition::ContigDefinition(std::string id,
+                                   std::vector<std::pair<std::string, std::string>> attributes)
+    : id_{std::move(id)}
+    , attributes_{std::move(attributes)}
+{
+    const bool hasId = !id_.empty();
+    if (!hasId) {
+        throw std::runtime_error{"VcfFormat: ##contig definition in header has empty ID field"};
+    }
+}
+
+// Appends the attribute (id, value); forwards to the pair overload.
+ContigDefinition& ContigDefinition::AddAttribute(std::string id, std::string value)
+{
+    std::pair<std::string, std::string> attribute{std::move(id), std::move(value)};
+    return AddAttribute(std::move(attribute));
+}
+
+// Appends 'attribute' to the end of the attribute list. No check for an
+// existing attribute with the same key is made.
+ContigDefinition& ContigDefinition::AddAttribute(std::pair<std::string, std::string> attribute)
+{
+    attributes_.emplace_back(std::move(attribute));
+    return *this;
+}
+
+// Returns the attributes as (key, value) pairs, in stored order.
+const std::vector<std::pair<std::string, std::string>>& ContigDefinition::Attributes() const
+{
+    return attributes_;
+}
+
+// Replaces all attributes with 'attributes'.
+ContigDefinition& ContigDefinition::Attributes(
+    std::vector<std::pair<std::string, std::string>> attributes)
+{
+    attributes_ = std::move(attributes);
+    return *this;
+}
+
+// Returns the contig ID.
+const std::string& ContigDefinition::Id() const
+{
+    return id_;
+}
+
+// -------------------
+// FilterDefinition
+// -------------------
+
+// Compile-time checks on FilterDefinition's special member functions.
+// Copy construction and copy assignment must be available.
+static_assert(std::is_copy_constructible<FilterDefinition>::value,
+              "FilterDefinition(const FilterDefinition&) is not = default");
+static_assert(std::is_copy_assignable<FilterDefinition>::value,
+              "FilterDefinition& operator=(const FilterDefinition&) is not = default");
+
+// Move construction must be noexcept.
+static_assert(std::is_nothrow_move_constructible<FilterDefinition>::value,
+              "FilterDefinition(FilterDefinition&&) is not = noexcept");
+// Move assignment must be noexcept exactly when std::string's move assignment is.
+static_assert(std::is_nothrow_move_assignable<FilterDefinition>::value ==
+                  std::is_nothrow_move_assignable<std::string>::value,
+              "");
+
+// Creates a FILTER definition.
+// Throws std::runtime_error if 'id' or 'description' is empty (ID is checked first).
+FilterDefinition::FilterDefinition(std::string id, std::string description)
+    : id_{std::move(id)}
+    , description_{std::move(description)}
+{
+    const auto requireNonEmpty = [](const std::string& field, const char* message) {
+        if (field.empty()) throw std::runtime_error{message};
+    };
+
+    requireNonEmpty(id_, "VcfFormat: FILTER definition in header has empty ID field");
+    requireNonEmpty(description_,
+                    "VcfFormat: FILTER definition in header has empty Description field");
+}
+
+// Returns the filter's description text.
+const std::string& FilterDefinition::Description() const
+{
+    return description_;
+}
+
+// Returns the filter's ID.
+const std::string& FilterDefinition::Id() const
+{
+    return id_;
+}
+
+// -------------------
+// FormatDefinition
+// -------------------
+
+// Compile-time checks on FormatDefinition's special member functions.
+// Copy construction and copy assignment must be available.
+static_assert(std::is_copy_constructible<FormatDefinition>::value,
+              "FormatDefinition(const FormatDefinition&) is not = default");
+static_assert(std::is_copy_assignable<FormatDefinition>::value,
+              "FormatDefinition& operator=(const FormatDefinition&) is not = default");
+
+// Move construction must be noexcept.
+static_assert(std::is_nothrow_move_constructible<FormatDefinition>::value,
+              "FormatDefinition(FormatDefinition&&) is not = noexcept");
+// Move assignment must be noexcept exactly when std::string's move assignment is.
+static_assert(std::is_nothrow_move_assignable<FormatDefinition>::value ==
+                  std::is_nothrow_move_assignable<std::string>::value,
+              "");
+
+// Creates a FORMAT definition.
+// Throws std::runtime_error if any field is empty; fields are checked in the
+// order ID, Number, Type, Description.
+FormatDefinition::FormatDefinition(std::string id, std::string number, std::string type,
+                                   std::string description)
+    : id_{std::move(id)}
+    , number_{std::move(number)}
+    , type_{std::move(type)}
+    , description_{std::move(description)}
+{
+    const auto requireNonEmpty = [](const std::string& field, const char* message) {
+        if (field.empty()) throw std::runtime_error{message};
+    };
+
+    requireNonEmpty(id_, "VcfFormat: FORMAT definition in header has empty ID field");
+    requireNonEmpty(number_, "VcfFormat: FORMAT definition in header has empty Number field");
+    requireNonEmpty(type_, "VcfFormat: FORMAT definition in header has empty Type field");
+    requireNonEmpty(description_,
+                    "VcfFormat: FORMAT definition in header has empty Description field");
+}
+
+// Returns the Description field.
+const std::string& FormatDefinition::Description() const
+{
+    return description_;
+}
+
+// Returns the ID field.
+const std::string& FormatDefinition::Id() const
+{
+    return id_;
+}
+
+// Returns the Number field, as text.
+const std::string& FormatDefinition::Number() const
+{
+    return number_;
+}
+
+// Returns the Type field, as text.
+const std::string& FormatDefinition::Type() const
+{
+    return type_;
+}
+
+// -------------------
+// GeneralDefinition
+// -------------------
+
+// Compile-time checks on GeneralDefinition's special member functions.
+// Copy construction and copy assignment must be available.
+static_assert(std::is_copy_constructible<GeneralDefinition>::value,
+              "GeneralDefinition(const GeneralDefinition&) is not = default");
+static_assert(std::is_copy_assignable<GeneralDefinition>::value,
+              "GeneralDefinition& operator=(const GeneralDefinition&) is not = default");
+
+// Move construction must be noexcept.
+static_assert(std::is_nothrow_move_constructible<GeneralDefinition>::value,
+              "GeneralDefinition(GeneralDefinition&&) is not = noexcept");
+// Move assignment must be noexcept exactly when std::string's move assignment is.
+static_assert(std::is_nothrow_move_assignable<GeneralDefinition>::value ==
+                  std::is_nothrow_move_assignable<std::string>::value,
+              "");
+
+// Creates a general (key=value) header definition.
+// Throws std::runtime_error if 'id' or 'text' is empty (ID is checked first).
+GeneralDefinition::GeneralDefinition(std::string id, std::string text)
+    : id_{std::move(id)}
+    , text_{std::move(text)}
+{
+    const auto requireNonEmpty = [](const std::string& field, const char* message) {
+        if (field.empty()) throw std::runtime_error{message};
+    };
+
+    requireNonEmpty(id_, "VcfFormat: general metadata definition in header has empty label");
+    requireNonEmpty(text_, "VcfFormat: general metadata definition in header has empty value");
+}
+
+// Returns the definition's label (the key of "##key=value").
+const std::string& GeneralDefinition::Id() const
+{
+    return id_;
+}
+
+// Returns the definition's value text.
+const std::string& GeneralDefinition::Text() const
+{
+    return text_;
+}
+
+// -------------------
+// InfoDefinition
+// -------------------
+
+// Compile-time checks on InfoDefinition's special member functions.
+// Copy construction and copy assignment must be available.
+static_assert(std::is_copy_constructible<InfoDefinition>::value,
+              "InfoDefinition(const InfoDefinition&) is not = default");
+static_assert(std::is_copy_assignable<InfoDefinition>::value,
+              "InfoDefinition& operator=(const InfoDefinition&) is not = default");
+
+// Move construction must be noexcept.
+static_assert(std::is_nothrow_move_constructible<InfoDefinition>::value,
+              "InfoDefinition(InfoDefinition&&) is not = noexcept");
+// Move assignment must be noexcept exactly when std::string's move assignment is.
+static_assert(std::is_nothrow_move_assignable<InfoDefinition>::value ==
+                  std::is_nothrow_move_assignable<std::string>::value,
+              "");
+
+// Creates an INFO definition.
+//
+// ID, Number, Type and Description are required: std::runtime_error is thrown
+// if any is empty (checked in that order). Source and Version are optional and
+// are left unset when given as empty strings.
+InfoDefinition::InfoDefinition(std::string id, std::string number, std::string type,
+                               std::string description, std::string source, std::string version)
+    : id_{std::move(id)}
+    , number_{std::move(number)}
+    , type_{std::move(type)}
+    , description_{std::move(description)}
+{
+    const auto requireNonEmpty = [](const std::string& field, const char* message) {
+        if (field.empty()) throw std::runtime_error{message};
+    };
+
+    // verify required fields
+    requireNonEmpty(id_, "VcfFormat: INFO definition in header has empty ID field");
+    requireNonEmpty(number_, "VcfFormat: INFO definition in header has empty Number field");
+    requireNonEmpty(type_, "VcfFormat: INFO definition in header has empty Type field");
+    requireNonEmpty(description_,
+                    "VcfFormat: INFO definition in header has empty Description field");
+
+    // optional fields
+    if (!source.empty()) {
+        source_ = std::move(source);
+    }
+    if (!version.empty()) {
+        version_ = std::move(version);
+    }
+}
+
+// Returns the Description field.
+const std::string& InfoDefinition::Description() const
+{
+    return description_;
+}
+
+// Returns the ID field.
+const std::string& InfoDefinition::Id() const
+{
+    return id_;
+}
+
+// Returns the Number field, as text.
+const std::string& InfoDefinition::Number() const
+{
+    return number_;
+}
+
+// Returns the optional Source field (uninitialized if never set).
+const boost::optional<std::string>& InfoDefinition::Source() const
+{
+    return source_;
+}
+
+// Sets the Source field. Unlike the constructor, an empty string is stored as-is.
+InfoDefinition& InfoDefinition::Source(std::string s)
+{
+    source_ = std::move(s);
+    return *this;
+}
+
+// Returns the Type field, as text.
+const std::string& InfoDefinition::Type() const
+{
+    return type_;
+}
+
+// Returns the optional Version field (uninitialized if never set).
+const boost::optional<std::string>& InfoDefinition::Version() const
+{
+    return version_;
+}
+
+// Sets the Version field. Unlike the constructor, an empty string is stored as-is.
+InfoDefinition& InfoDefinition::Version(std::string v)
+{
+    version_ = std::move(v);
+    return *this;
+}
+
+}  // namespace VCF
+}  // namespace PacBio
diff --git a/src/vcf/VcfQuery.cpp b/src/vcf/VcfQuery.cpp

new file mode 100644 (file)

index 0000000..11b7dd5
--- /dev/null
+++ b/src/vcf/VcfQuery.cpp
@@ -0,0 +1,33 @@
+#include "../PbbamInternalConfig.h"
+
+#include <pbbam/vcf/VcfQuery.h>
+
+#include <cassert>
+#include <type_traits>
+
+namespace PacBio {
+namespace VCF {
+
+static_assert(!std::is_copy_constructible<VcfQuery>::value,
+              "VcfQuery(const VcfQuery&) is not = delete");
+static_assert(!std::is_copy_assignable<VcfQuery>::value,
+              "VcfQuery& operator=(const VcfQuery&) is not = delete");
+
+static_assert(std::is_nothrow_move_constructible<VcfQuery>::value ==
+                  std::is_nothrow_move_constructible<VcfReader>::value,
+              "");
+static_assert(std::is_nothrow_move_assignable<VcfQuery>::value ==
+                  std::is_nothrow_move_assignable<VcfReader>::value,
+              "");
+
+// Convenience overload: wraps `fn` in a VcfFile and delegates to VcfQuery(const VcfFile&).
+VcfQuery::VcfQuery(std::string fn) : VcfQuery{VcfFile{std::move(fn)}} {}
+
+// Constructs the underlying reader from `file`.
+VcfQuery::VcfQuery(const VcfFile& file)
+    : PacBio::BAM::internal::QueryBase<VcfVariant>(), reader_{file}
+{
+}
+
+// Fetches the next variant into `var` by forwarding to the reader; returns the reader's result.
+bool VcfQuery::GetNext(VcfVariant& var) { return reader_.GetNext(var); }
+
+}  // namespace VCF
+}  // namespace PacBio
diff --git a/src/vcf/VcfReader.cpp b/src/vcf/VcfReader.cpp

new file mode 100644 (file)

index 0000000..e53edbc
--- /dev/null
+++ b/src/vcf/VcfReader.cpp
@@ -0,0 +1,55 @@
+// Author: Derek Barnett
+
+#include "../PbbamInternalConfig.h"
+
+#include <pbbam/vcf/VcfReader.h>
+
+#include <cassert>
+#include <type_traits>
+
+namespace PacBio {
+namespace VCF {
+
+static_assert(!std::is_copy_constructible<VcfReader>::value,
+              "VcfReader(const VcfReader&) is not = delete");
+static_assert(!std::is_copy_assignable<VcfReader>::value,
+              "VcfReader& operator=(const VcfReader&) is not = delete");
+
+static_assert(std::is_nothrow_move_constructible<VcfReader>::value ==
+                  std::is_nothrow_move_constructible<std::ifstream>::value,
+              "");
+static_assert(std::is_nothrow_move_assignable<VcfReader>::value ==
+                  std::is_nothrow_move_assignable<std::ifstream>::value,
+              "");
+
+// Convenience overload: wraps `fn` in a VcfFile and delegates to VcfReader(const VcfFile&).
+VcfReader::VcfReader(std::string fn) : VcfReader{VcfFile{std::move(fn)}} {}
+
+// Constructs the input stream from the file's name, copies its header, skips
+// past the header lines and buffers the first line that follows them.
+VcfReader::VcfReader(const VcfFile& file) : in_{file.Filename()}, header_{file.Header()}
+{
+    // discard one input line per header line
+    const auto& header = file.Header();
+    std::string skipped;
+    size_t remaining = header.NumLines();
+    while (remaining > 0) {
+        std::getline(in_, skipped);
+        --remaining;
+    }
+
+    FetchNext();
+}
+
+// Replaces the buffered line with the next line of input. The buffer is cleared
+// first so that a read which extracts nothing leaves it empty.
+void VcfReader::FetchNext()
+{
+    line_.clear();
+    std::getline(in_, line_);
+}
+
+// Parses the buffered line into `var` and buffers the following one.
+// Returns false, leaving `var` untouched, when the buffered line is empty.
+bool VcfReader::GetNext(VcfVariant& var)
+{
+    const bool hasLine = !line_.empty();
+    if (hasLine) {
+        var = VcfVariant{line_};
+        FetchNext();
+    }
+    return hasLine;
+}
+
+// Returns the header copied from the VcfFile at construction.
+const VcfHeader& VcfReader::Header() const { return header_; }
+
+}  // namespace VCF
+}  // namespace PacBio
diff --git a/src/vcf/VcfSort.cpp b/src/vcf/VcfSort.cpp

new file mode 100644 (file)

index 0000000..19fa323
--- /dev/null
+++ b/src/vcf/VcfSort.cpp
@@ -0,0 +1,58 @@
+// Author: Derek Barnett
+
+#include "../PbbamInternalConfig.h"
+
+#include <pbbam/vcf/VcfSort.h>
+
+#include <algorithm>
+#include <tuple>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include <pbbam/vcf/VcfQuery.h>
+#include <pbbam/vcf/VcfWriter.h>
+
+namespace PacBio {
+namespace VCF {
+
+// Writes the variants of `file` to `outputFilename`, ordered first by contig
+// (using the order of the header's contig definitions) and then by position.
+// All variants are held in memory while sorting.
+//
+// A variant whose CHROM has no contig definition in the header makes the
+// lookup's at() throw from inside the sort.
+void SortFile(const VcfFile& file, const std::string& outputFilename)
+{
+    const auto& header = file.Header();
+
+    // rank each contig ID by its index among the header's contig definitions
+    const auto& contigDefs = header.ContigDefinitions();
+    std::unordered_map<std::string, size_t> contigRank;
+    for (size_t rank = 0; rank < contigDefs.size(); ++rank)
+        contigRank.emplace(contigDefs.at(rank).Id(), rank);
+
+    // load every variant
+    VcfQuery query{file};
+    std::vector<VcfVariant> variants;
+    for (const auto& variant : query)
+        variants.push_back(variant);
+
+    // order by (contig rank, position)
+    const auto byContigThenPosition = [&contigRank](const VcfVariant& lhs,
+                                                    const VcfVariant& rhs) {
+        const auto lhsRank = contigRank.at(lhs.Chrom());
+        const auto rhsRank = contigRank.at(rhs.Chrom());
+        if (lhsRank != rhsRank) return lhsRank < rhsRank;
+        return lhs.Position() < rhs.Position();
+    };
+    std::sort(variants.begin(), variants.end(), byContigThenPosition);
+
+    // emit header and sorted variants
+    VcfWriter writer{outputFilename, header};
+    for (const auto& variant : variants)
+        writer.Write(variant);
+}
+
+// Filename overload: opens `inputFilename` as a VcfFile and sorts it into `outputFilename`.
+void SortFile(const std::string& inputFilename, const std::string& outputFilename)
+{
+    const VcfFile input{inputFilename};
+    SortFile(input, outputFilename);
+}
+
+}  // namespace VCF
+}  // namespace PacBio
diff --git a/src/vcf/VcfVariant.cpp b/src/vcf/VcfVariant.cpp

new file mode 100644 (file)

index 0000000..a0a8aa6
--- /dev/null
+++ b/src/vcf/VcfVariant.cpp
@@ -0,0 +1,258 @@
+// Author: Derek Barnett
+
+#include "../PbbamInternalConfig.h"
+
+#include <pbbam/vcf/VcfVariant.h>
+
+#include <cassert>
+#include <cmath>
+#include <type_traits>
+
+#include <pbbam/StringUtilities.h>
+#include <pbbam/vcf/VcfFormat.h>
+
+namespace PacBio {
+namespace VCF {
+
+static_assert(std::is_copy_constructible<VcfVariant>::value,
+              "VcfVariant(const VcfVariant&) is not = default");
+static_assert(std::is_copy_assignable<VcfVariant>::value,
+              "VcfVariant& operator=(const VcfVariant&) is not = default");
+
+static_assert(std::is_nothrow_move_constructible<VcfVariant>::value,
+              "VcfVariant(VcfVariant&&) is not = noexcept");
+static_assert(std::is_nothrow_move_assignable<VcfVariant>::value ==
+                  std::is_nothrow_move_assignable<std::string>::value,
+              "");
+
+// Builds a variant from `text` by assigning the result of VcfFormat::ParsedVariant(text).
+VcfVariant::VcfVariant(const std::string& text) { *this = VcfFormat::ParsedVariant(text); }
+
+// Default state: unmapped position, NaN quality (reported as missing by
+// IsQualityMissing()) and a "PASS" filter.
+VcfVariant::VcfVariant() : pos_{PacBio::BAM::UnmappedPosition}, qual_{NAN}, filter_{"PASS"} {}
+
+// Builds a variant from its core fields; quality starts as NaN and filter as "PASS".
+// Note the parameter order (id before chrom) differs from the member initialization order.
+VcfVariant::VcfVariant(std::string id, std::string chrom, PacBio::BAM::Position pos,
+                       std::string refAllele, std::string altAllele)
+    : chrom_{std::move(chrom)}
+    , pos_{pos}
+    , id_{std::move(id)}
+    , refAllele_{std::move(refAllele)}
+    , altAllele_{std::move(altAllele)}
+    , qual_{NAN}
+    , filter_{"PASS"}
+{
+}
+
+// Adds `field` to the INFO fields, or overwrites the existing field that has
+// the same ID (keeping that field's slot). Returns *this for chaining.
+VcfVariant& VcfVariant::AddInfoField(InfoField field)
+{
+    const auto existing = infoLookup_.find(field.id);
+    if (existing != infoLookup_.cend()) {
+        // known ID: replace in place
+        infoFields_.at(existing->second) = std::move(field);
+        return *this;
+    }
+
+    // new ID: record the index it is about to occupy, then append
+    infoLookup_.insert({field.id, infoFields_.size()});
+    infoFields_.push_back(std::move(field));
+    return *this;
+}
+
+// Returns the ALT allele.
+const std::string& VcfVariant::AltAllele() const { return altAllele_; }
+
+// Sets the ALT allele; returns *this for chaining.
+VcfVariant& VcfVariant::AltAllele(std::string altAllele)
+{
+    altAllele_ = std::move(altAllele);
+    return *this;
+}
+
+// Returns the CHROM value.
+const std::string& VcfVariant::Chrom() const { return chrom_; }
+
+// Sets the CHROM value; returns *this for chaining.
+VcfVariant& VcfVariant::Chrom(std::string chrom)
+{
+    chrom_ = std::move(chrom);
+    return *this;
+}
+
+// Returns the FILTER value (constructors in this file initialize it to "PASS").
+const std::string& VcfVariant::Filter() const { return filter_; }
+
+// Sets the FILTER value; returns *this for chaining.
+VcfVariant& VcfVariant::Filter(std::string filter)
+{
+    filter_ = std::move(filter);
+    return *this;
+}
+
+// Returns a copy of the genotype (FORMAT) IDs.
+std::vector<std::string> VcfVariant::GenotypeIds() const { return format_; }
+
+// Replaces the genotype IDs and rebuilds the ID -> index lookup used by
+// GenotypeValue()/GenotypeValues(). Returns *this for chaining.
+VcfVariant& VcfVariant::GenotypeIds(std::vector<std::string> ids)
+{
+    genotypeDataLookup_.clear();
+    format_ = std::move(ids);
+
+    // each ID maps to its position within format_
+    size_t index = 0;
+    for (const auto& genotypeId : format_) {
+        genotypeDataLookup_.insert({genotypeId, index});
+        ++index;
+    }
+    return *this;
+}
+
+// Returns a copy of the per-sample genotype fields.
+std::vector<GenotypeField> VcfVariant::Genotypes() const { return sampleGenotypes_; }
+
+// Replaces the per-sample genotype fields; returns *this for chaining.
+VcfVariant& VcfVariant::Genotypes(std::vector<GenotypeField> genotypes)
+{
+    sampleGenotypes_ = std::move(genotypes);
+    return *this;
+}
+
+// Returns the single-valued genotype datum `id` of the sample at `sampleIndex`.
+// The sample, the ID and the datum are all looked up with at(); the sample
+// index is checked before the ID.
+const boost::optional<std::string>& VcfVariant::GenotypeValue(const size_t sampleIndex,
+                                                              const std::string& id) const
+{
+    const auto& sample = sampleGenotypes_.at(sampleIndex);
+    return sample.data.at(genotypeDataLookup_.at(id)).value;
+}
+
+// Sets the single-valued genotype datum `id` of the sample at `sampleIndex`.
+// Lookups behave as in the getter. Returns *this for chaining.
+VcfVariant& VcfVariant::GenotypeValue(const size_t sampleIndex, const std::string& id,
+                                      boost::optional<std::string> value)
+{
+    auto& sample = sampleGenotypes_.at(sampleIndex);
+    sample.data.at(genotypeDataLookup_.at(id)).value = std::move(value);
+    return *this;
+}
+
+// Returns the multi-valued genotype datum `id` of the sample at `sampleIndex`.
+// Lookups behave as in GenotypeValue().
+const boost::optional<std::vector<std::string>>& VcfVariant::GenotypeValues(
+    const size_t sampleIndex, const std::string& id) const
+{
+    const auto& sample = sampleGenotypes_.at(sampleIndex);
+    return sample.data.at(genotypeDataLookup_.at(id)).values;
+}
+
+// Sets the multi-valued genotype datum `id` of the sample at `sampleIndex`.
+// Lookups behave as in GenotypeValue(). Returns *this for chaining.
+VcfVariant& VcfVariant::GenotypeValues(const size_t sampleIndex, const std::string& id,
+                                       boost::optional<std::vector<std::string>> values)
+{
+    auto& sample = sampleGenotypes_.at(sampleIndex);
+    sample.data.at(genotypeDataLookup_.at(id)).values = std::move(values);
+    return *this;
+}
+
+// Returns true when an INFO field with the given ID is registered in the lookup.
+bool VcfVariant::HasInfoField(const std::string& id) const
+{
+    const auto found = infoLookup_.find(id);
+    return found != infoLookup_.cend();
+}
+
+// Returns the ID value.
+const std::string& VcfVariant::Id() const { return id_; }
+
+// Sets the ID value; returns *this for chaining.
+VcfVariant& VcfVariant::Id(std::string id)
+{
+    id_ = std::move(id);
+    return *this;
+}
+
+// Returns the INFO fields in stored order.
+const std::vector<InfoField>& VcfVariant::InfoFields() const { return infoFields_; }
+
+// Replaces all INFO fields. Fields are re-added one by one through
+// AddInfoField(), so a later field with a repeated ID overwrites the earlier one.
+VcfVariant& VcfVariant::InfoFields(std::vector<InfoField> fields)
+{
+    infoFields_.clear();
+    infoLookup_.clear();
+    for (auto&& field : fields)
+        AddInfoField(std::move(field));
+    return *this;
+}
+
+// Returns a copy of the single value of INFO field `id`.
+// The ID is resolved with at(), so an unknown ID is reported by that call.
+const boost::optional<std::string> VcfVariant::InfoValue(const std::string& id) const
+{
+    const auto fieldIndex = infoLookup_.at(id);
+    return infoFields_.at(fieldIndex).value;
+}
+
+// Sets the single value of the existing INFO field `id`; returns *this for chaining.
+VcfVariant& VcfVariant::InfoValue(const std::string& id, boost::optional<std::string> value)
+{
+    const auto fieldIndex = infoLookup_.at(id);
+    infoFields_.at(fieldIndex).value = std::move(value);
+    return *this;
+}
+
+// Returns a copy of the value list of INFO field `id`.
+// The ID is resolved with at(), so an unknown ID is reported by that call.
+const boost::optional<std::vector<std::string>> VcfVariant::InfoValues(const std::string& id) const
+{
+    const auto fieldIndex = infoLookup_.at(id);
+    return infoFields_.at(fieldIndex).values;
+}
+
+// Sets the value list of the existing INFO field `id`; returns *this for chaining.
+VcfVariant& VcfVariant::InfoValues(const std::string& id,
+                                   boost::optional<std::vector<std::string>> values)
+{
+    const auto fieldIndex = infoLookup_.at(id);
+    infoFields_.at(fieldIndex).values = std::move(values);
+    return *this;
+}
+
+// True when the REF allele is longer than the ALT allele.
+bool VcfVariant::IsDeletion() const { return refAllele_.size() > altAllele_.size(); }
+
+// True when the REF allele is shorter than the ALT allele.
+bool VcfVariant::IsInsertion() const { return refAllele_.size() < altAllele_.size(); }
+
+// True when the stored quality is NaN, the value the constructors in this file start with.
+bool VcfVariant::IsQualityMissing() const { return std::isnan(qual_); }
+
+// Returns true when the two alleles in the sample's "GT" value differ.
+//
+// The GT string is split on '/'; if that yields a single field it is split on
+// '|' instead. Throws std::runtime_error when the GT value is absent or does
+// not split into exactly two alleles. Errors from GenotypeValue() (bad sample
+// index, "GT" not among the genotype IDs) propagate unchanged.
+bool VcfVariant::IsSampleHeterozygous(const size_t sampleIndex) const
+{
+    // GenotypeValue() returns a reference into this variant; bind it instead of copying
+    const auto& data = GenotypeValue(sampleIndex, "GT");
+
+    // get() on an empty optional violates its precondition, so reject that case explicitly
+    if (!data) throw std::runtime_error{"VcfFormat: missing GT field"};
+
+    const std::string& genotype = data.get();
+    auto fields = PacBio::BAM::Split(genotype, '/');
+    if (fields.size() == 1) fields = PacBio::BAM::Split(genotype, '|');
+
+    if (fields.size() != 2)
+        throw std::runtime_error{"VcfFormat: malformatted GT field: " + genotype};
+
+    return fields.at(0) != fields.at(1);
+}
+
+// Returns true when the sample's "GT" value contains the '|' separator.
+//
+// Throws std::runtime_error when the GT value is absent. Errors from
+// GenotypeValue() (bad sample index, "GT" not among the genotype IDs)
+// propagate unchanged.
+bool VcfVariant::IsSamplePhased(const size_t sampleIndex) const
+{
+    // GenotypeValue() returns a reference into this variant; bind it instead of copying
+    const auto& data = GenotypeValue(sampleIndex, "GT");
+
+    // get() on an empty optional violates its precondition, so reject that case explicitly
+    if (!data) throw std::runtime_error{"VcfFormat: missing GT field"};
+
+    const std::string& genotype = data.get();
+    const bool phaseFound = genotype.find('|') != std::string::npos;
+
+    // debug-only sanity check: a value containing '|' must not also contain '/'.
+    // Written as one expression so no empty `if` body is left behind under NDEBUG.
+    assert(!phaseFound || genotype.find('/') == std::string::npos);
+    return phaseFound;
+}
+
+// True when REF and ALT are both exactly one character long and those characters differ.
+bool VcfVariant::IsSnp() const
+{
+    return refAllele_.size() == 1 && altAllele_.size() == 1 && refAllele_[0] != altAllele_[0];
+}
+
+// Returns the stored position.
+PacBio::BAM::Position VcfVariant::Position() const { return pos_; }
+
+// Sets the position; returns *this for chaining.
+VcfVariant& VcfVariant::Position(PacBio::BAM::Position pos)
+{
+    pos_ = pos;
+    return *this;
+}
+
+// Returns the stored quality; NaN means missing (see IsQualityMissing()).
+float VcfVariant::Quality() const { return qual_; }
+
+// Sets the quality; returns *this for chaining.
+VcfVariant& VcfVariant::Quality(float qual)
+{
+    qual_ = qual;
+    return *this;
+}
+
+// Returns the REF allele.
+const std::string& VcfVariant::RefAllele() const { return refAllele_; }
+
+// Sets the REF allele; returns *this for chaining.
+VcfVariant& VcfVariant::RefAllele(std::string refAllele)
+{
+    refAllele_ = std::move(refAllele);
+    return *this;
+}
+
+// Removes the INFO field with the given ID, if present, and returns *this.
+// The remaining fields keep their relative order and the ID lookup is rebuilt.
+VcfVariant& VcfVariant::RemoveInfoField(const std::string& id)
+{
+    const auto found = infoLookup_.find(id);
+    if (found == infoLookup_.cend()) return *this;
+
+    // Take over the current fields rather than copying them. Besides saving
+    // the copies, this keeps `id` valid when the caller passed a reference to
+    // a stored field's own id (e.g. InfoFields()[i].id): that string now lives
+    // in `currentFields` for the rest of this function, whereas clearing
+    // infoFields_ while it still owned the elements would leave `id` dangling.
+    auto currentFields = std::move(infoFields_);
+
+    infoFields_.clear();  // bring the moved-from vector to a known empty state
+    infoLookup_.clear();
+
+    // the field being removed is never moved from, so `id` stays intact throughout
+    for (auto& field : currentFields) {
+        if (field.id != id) AddInfoField(std::move(field));
+    }
+
+    return *this;
+}
+
+}  // namespace VCF
+}  // namespace PacBio
diff --git a/src/vcf/VcfWriter.cpp b/src/vcf/VcfWriter.cpp

new file mode 100644 (file)

index 0000000..f7fbc95
--- /dev/null
+++ b/src/vcf/VcfWriter.cpp
@@ -0,0 +1,56 @@
+// Author: Derek Barnett
+
+#include "../PbbamInternalConfig.h"
+
+#include <pbbam/vcf/VcfWriter.h>
+
+#include <cassert>
+#include <fstream>
+#include <iostream>
+#include <type_traits>
+
+#include <pbbam/vcf/VcfFormat.h>
+#include <pbbam/vcf/VcfHeader.h>
+#include <pbbam/vcf/VcfVariant.h>
+#include "../FileProducer.h"
+
+namespace PacBio {
+namespace VCF {
+
+static_assert(!std::is_copy_constructible<VcfWriter>::value,
+              "VcfWriter(const VcfWriter&) is not = delete");
+static_assert(!std::is_copy_assignable<VcfWriter>::value,
+              "VcfWriter& operator=(const VcfWriter&) is not = delete");
+
+// Private implementation of VcfWriter: streams formatted VCF text into the
+// file named by the FileProducer base's TempFilename().
+struct VcfWriter::VcfWriterPrivate : public PacBio::BAM::FileProducer
+{
+    // Opens the output stream on TempFilename() and writes the formatted header.
+    // A stream that failed to open is not reported here; it shows up as a
+    // false return from Write().
+    VcfWriterPrivate(std::string fn, const VcfHeader& header)
+        : PacBio::BAM::FileProducer{std::move(fn)}, out_{TempFilename()}
+    {
+        out_ << VcfFormat::FormattedHeader(header) << '\n';
+    }
+
+    // Writes one formatted variant followed by a newline.
+    // Returns true when the stream is still usable afterwards, false when the
+    // stream is in a failed state (failed open or failed write).
+    bool Write(const VcfVariant& var)
+    {
+        out_ << VcfFormat::FormattedVariant(var) << '\n';
+        return static_cast<bool>(out_);
+    }
+
+    std::ofstream out_;
+};
+
+// Creates the private implementation, forwarding the filename and header to it.
+VcfWriter::VcfWriter(std::string fn, const VcfHeader& header)
+    : d_{std::make_unique<VcfWriterPrivate>(std::move(fn), header)}
+{
+}
+
+// Move operations and destructor are defaulted here rather than in the header —
+// presumably because VcfWriterPrivate is only a complete type in this file (TODO confirm).
+VcfWriter::VcfWriter(VcfWriter&&) noexcept = default;
+
+VcfWriter& VcfWriter::operator=(VcfWriter&&) noexcept = default;
+
+VcfWriter::~VcfWriter() = default;
+
+// Writes one variant by delegating to the private implementation; returns its result.
+bool VcfWriter::Write(const VcfVariant& var) { return d_->Write(var); }
+
+}  // namespace VCF
+}  // namespace PacBio
diff --git a/subprojects/gtest.wrap b/subprojects/gtest.wrap

new file mode 100644 (file)

index 0000000..773a713
--- /dev/null
+++ b/subprojects/gtest.wrap
@@ -0,0 +1,10 @@
+[wrap-file]
+directory = googletest-release-1.8.0
+
+source_url = https://github.com/google/googletest/archive/release-1.8.0.zip
+source_filename = gtest-1.8.0.zip
+source_hash = f3ed3b58511efd272eb074a3a6d6fb79d7c2e6a0e374323d1e6bcbcc1ef141bf
+
+patch_url = https://wrapdb.mesonbuild.com/v1/projects/gtest/1.8.0/5/get_zip
+patch_filename = gtest-1.8.0-5-wrap.zip
+patch_hash = 7eeaede4aa2610a403313b74e04baf91ccfbaef03203d8f56312e22df1834ec5
diff --git a/subprojects/htslib.wrap b/subprojects/htslib.wrap

new file mode 100644 (file)

index 0000000..2a93dd3
--- /dev/null
+++ b/subprojects/htslib.wrap
@@ -0,0 +1,10 @@
+[wrap-file]
+directory = htslib-1.9
+
+source_url = https://github.com/samtools/htslib/archive/1.9.zip
+source_filename = htslib-1.9.zip
+source_hash = c4d3ae84014f8a80f5011521f391e917bc3b4f6ebd78e97f238472e95849ec14
+
+patch_url = https://wrapdb.mesonbuild.com/v1/projects/htslib/1.9/1/get_zip
+patch_filename = htslib-1.9-1-wrap.zip
+patch_hash = 02f4a3c64d668d4d09f8bb0f57eb33398e90e6901989f257be6a6716a15bdcdd
diff --git a/subprojects/pbcopper.wrap b/subprojects/pbcopper.wrap

new file mode 100644 (file)

index 0000000..9567a7f
--- /dev/null
+++ b/subprojects/pbcopper.wrap
@@ -0,0 +1,4 @@
+[wrap-git]
+directory=pbcopper
+url=https://github.com/PacificBiosciences/pbcopper.git
+revision=develop
diff --git a/subprojects/zlib.wrap b/subprojects/zlib.wrap

new file mode 100644 (file)

index 0000000..91c1d4d
--- /dev/null
+++ b/subprojects/zlib.wrap
@@ -0,0 +1,10 @@
+[wrap-file]
+directory = zlib-1.2.11
+
+source_url = http://zlib.net/fossils/zlib-1.2.11.tar.gz
+source_filename = zlib-1.2.11.tar.gz
+source_hash = c3e5e9fdd5004dcb542feda5ee4f0ff0744628baf8ed2dd5d66f8ca1197cb1a1
+
+patch_url = https://wrapdb.mesonbuild.com/v1/projects/zlib/1.2.11/4/get_zip
+patch_filename = zlib-1.2.11-4-wrap.zip
+patch_hash = f733976fbfc59e0bcde01aa9469a24eeb16faf0a4280b17e9eaa60a301d75657
diff --git a/tests/data/aligned.bam b/tests/data/aligned.bam

new file mode 100644 (file)

index 0000000..34d81e5

Binary files /dev/null and b/tests/data/aligned.bam differ
diff --git a/tests/data/aligned.bam.bai b/tests/data/aligned.bam.bai

new file mode 100644 (file)

index 0000000..66ba855

Binary files /dev/null and b/tests/data/aligned.bam.bai differ
diff --git a/tests/data/aligned.bam.pbi b/tests/data/aligned.bam.pbi

new file mode 100644 (file)

index 0000000..f2cf207

Binary files /dev/null and b/tests/data/aligned.bam.pbi differ
diff --git a/tests/data/aligned.sam b/tests/data/aligned.sam

new file mode 100644 (file)

index 0000000..ad45e63
--- /dev/null
+++ b/tests/data/aligned.sam
@@ -0,0 +1,8 @@
+@HD    VN:1.3.1        SO:coordinate   pb:3.0.3
+@SQ    SN:lambda_NEB3011       LN:48502        M5:a1319ff90e994c8190a4fe6569d0822a
+@RG    ID:0d7b28fa     PL:PACBIO       DS:READTYPE=SUBREAD;BINDINGKIT=100-619-300;SEQUENCINGKIT=100-619-400;BASECALLERVERSION=3.0;FRAMERATEHZ=100      PU:singleInsertion      PM:SEQUEL
+@PG    ID:bwa  PN:bwa  VN:0.7.10-r1017-dirty   CL:bwa mem lambdaNEB.fa singleInsertion.fasta
+singleInsertion/100/0_49       2048    lambda_NEB3011  5211    60      3H8=1D19=1I21=59H       *       0       0       GGCTGCAGGTACAGCGGTCAGGAGGCCAATTGATGCCGGACTGGCTGAT       *       NM:i:2  MD:Z:8^T40      AS:i:34 XS:i:0  RG:Z:0d7b28fa   SA:Z:lambda_NEB3011,9378,+,52S37=2D10=1I11=,60,3;       qe:i:49 qs:i:0  np:i:1  zm:i:100        rq:f:0.6        sn:B:f,0.2,0.2,0.2,0.2
+singleInsertion/200/0_49       2048    lambda_NEB3011  5211    60      3H8=1D19=1I21=59H       *       0       0       GGCTGCAGGTACAGCGGTCAGGAGGCCAATTGATGCCGGACTGGCTGAT       *       NM:i:2  MD:Z:8^T40      AS:i:34 XS:i:0  RG:Z:0d7b28fa   SA:Z:lambda_NEB3011,9378,-,37=2D10=1I11=52S,60,3;       qe:i:49 qs:i:0  np:i:1  zm:i:200        rq:f:0.6        sn:B:f,0.2,0.2,0.2,0.2
+singleInsertion/100/0_111      0       lambda_NEB3011  9378    60      52S37=2D10=1I11=        *       0       0       TTTGGCTGCAGGTACAGCGGTCAGGAGGCCAATTGATGCCGGACTGGCTGATAAGTCACCAATGTGGGACGTCCGTCGATGGCAGAAGAGCAGCACGGTAAACAGCGGCAA *       NM:i:3  MD:Z:37^TC21    AS:i:43 XS:i:0  RG:Z:0d7b28fa   SA:Z:lambda_NEB3011,5211,+,3S8=1D19=1I21=59S,60,2;      qe:i:111        qs:i:0  np:i:1  zm:i:100        rq:f:0.6        sn:B:f,0.2,0.2,0.2,0.2
+singleInsertion/100/0_111      16      lambda_NEB3011  9378    60      37=2D10=1I11=52S        *       0       0       AAGTCACCAATGTGGGACGTCCGTCGATGGCAGAAGAGCAGCACGGTAAACAGCGGCAAATCAGCCAGTCCGGCATCAATTGGCCTCCTGACCGCTGTACCTGCAGCCAAA *       NM:i:3  MD:Z:37^TC21    AS:i:43 XS:i:0  RG:Z:0d7b28fa   SA:Z:lambda_NEB3011,5211,+,3S8=1D19=1I21=59S,60,2;      qe:i:111        qs:i:0  np:i:1  zm:i:100        rq:f:0.6        sn:B:f,0.2,0.2,0.2,0.2
diff --git a/tests/data/aligned2.bam b/tests/data/aligned2.bam

new file mode 100644 (file)

index 0000000..672e5e5

Binary files /dev/null and b/tests/data/aligned2.bam differ
diff --git a/tests/data/aligned2.bam.bai b/tests/data/aligned2.bam.bai

new file mode 100644 (file)

index 0000000..f954ab0

Binary files /dev/null and b/tests/data/aligned2.bam.bai differ
diff --git a/tests/data/aligned2.bam.pbi b/tests/data/aligned2.bam.pbi

new file mode 100644 (file)

index 0000000..c1e82de

Binary files /dev/null and b/tests/data/aligned2.bam.pbi differ
diff --git a/tests/data/barcoded_read_groups.bam b/tests/data/barcoded_read_groups.bam

new file mode 100644 (file)

index 0000000..4873f7e

Binary files /dev/null and b/tests/data/barcoded_read_groups.bam differ
diff --git a/tests/data/barcoded_read_groups.bam.pbi b/tests/data/barcoded_read_groups.bam.pbi

new file mode 100644 (file)

index 0000000..368d1d5

Binary files /dev/null and b/tests/data/barcoded_read_groups.bam.pbi differ
diff --git a/tests/data/bed/test.bed b/tests/data/bed/test.bed

new file mode 100644 (file)

index 0000000..c5f14de
--- /dev/null
+++ b/tests/data/bed/test.bed
@@ -0,0 +1,9 @@
+chr1   213941196       213942363
+chr1   213942363       213943530
+chr1   213943530       213944697
+chr2   158364697       158365864
+chr2   158365864       158367031
+chr3   127477031       127478198
+chr3   127478198       127479365
+chr3   127479365       127480532
+chr3   127480532       127481699
diff --git a/tests/data/bed/test.bed.gz b/tests/data/bed/test.bed.gz

new file mode 100644 (file)

index 0000000..d0cf3e9

Binary files /dev/null and b/tests/data/bed/test.bed.gz differ
diff --git a/tests/data/chemistry.xml b/tests/data/chemistry.xml

new file mode 100644 (file)

index 0000000..c6a6521
--- /dev/null
+++ b/tests/data/chemistry.xml
@@ -0,0 +1,9 @@
+<?xml version="1.0" encoding="utf-8"?>
+<MappingTable>
+  <Mapping>
+    <SequencingChemistry>FOUND</SequencingChemistry>
+    <BindingKit>1</BindingKit>
+    <SequencingKit>2</SequencingKit>
+    <SoftwareVersion>3.4</SoftwareVersion>
+  </Mapping>
+</MappingTable>
diff --git a/tests/data/chimera_minimal.fasta b/tests/data/chimera_minimal.fasta

new file mode 100644 (file)

index 0000000..f2eb86c
--- /dev/null
+++ b/tests/data/chimera_minimal.fasta
@@ -0,0 +1,220 @@
+>Barcode0--0_Cluster1_Phase1_NumReads297
+GCAGGTGCCTTTGCAGAAACAAAGTCAGGGTTCTTCAAGTCACAAAGGGAAGGGCAGGAA
+CAACTCTTGCCTCTCAGTCCCACACAAGGCAGCTGTCTCACACTATAGAAAAAAATATTC
+ATGAACAAATTCGTATCTGTCACAGTGAGGGGTCACACTTTAAACAGCCCATCGCATGCT
+CAATACATCCAATGGAAAGAAACCCCATAGCACAGCTGTGTCCACTGTTCCGCCCAACAC
+CCAACACACATCAGGCCCTCCAGGCTCTCACCTTTACAAGCTGTGAGAGACACATCAGAG
+CCCTGGGCACTGTCACTGCCTGGGGTAGAACAAAAACAGAACCTGGTCAGATCCCACAGA
+AGATGTGGCTAGAGGAGGAATTGTGGGGTGGGTGAGCTCCCCCATGGGCTCCCAAACACA
+ATATCCCAAGGACCTCAGGCATCAGCCTCCTTCATACTTACTTGCAGCCTGAGAGTAGCT
+CCCTCCTTTTCTATCTGTGGGAAGAAAATGTCCTGTGAGATACCAGAAAGGAGTCAGGGC
+CTTAAGGTCCTAGAGGAACCTCCAAGTCTTGGACCTCAGAGAAGTTTCCAGAAATGTGTG
+ACTGCAGACCCAGGGCGGGATCAGGAAACATGAAGAAAGCAGGTGTGGGTCCTGGACCAA
+CCGCCCTCCTGAAGGTCCTCAGGGACCTTCCCCTGTGACTTGTGACTGCTGGGATCAGGT
+CCCATCACCGCTGTAATCAAGGTGATAAATCTGTCCTTCATTTTAACAGGTGCTTTACAA
+AAGAGTAAGTGCTGGCACACAGGGCCCAGGCTGGGTAGGCCCATAATTGTGGGTGGTGCT
+TCCCAGTAACGAGGCAGGGCACACTTCTACCTGGGTCTTGGAACCCTCAGTGAGACAAGA
+AATCTCAGACCCACCCTTCACCCCTTCCCCACCTGAGCTCTTCCTCCTCCACATCACAGC
+AGCGACCACAGCTCCAGTGATCACAGCTCCAAAGAGAACCAGGCCAGCAATGATGCCCAC
+GATGGGGATGGTGGGCTGGGAAGACGGCTCTGGGAAAAGAGGGGAAGGTGAGGGGCCCTG
+ACCCTGCTAAAGGTCAGAGAGGCTCCTGCTTTCCCTAAAAGACATGACACCCCCGTCTCC
+CTCCTTACCCCATCTCAGGGTGAGGGGCTTGGGCAAACCCTCATGCTGCACATGGCAGGT
+GTATCTCTGCTCCTGTCCAGAAGGCACCACCACAGCCGCCCACTTCTGGAAGGTTCCATC
+CCCTGCAGGCCTGGTCTCCACGAGCTCCGTGTCCTGGGTCTGGTCCTCCCCATCCCGCTG
+CCAGGTCAGTGTGATCTCCGCAGGGTAGAAGCTCAGGGCCCAGCACCTCAGGGTGGCTTC
+ATGGTCAGAGACAGCGTGGTGAGTCATATGCGTTTTGGGGGCGTCTGTCAGGAAGAGTCA
+GATCATTCAGGCATTTTGCATCTGTCATGGGACACTCCTCCAGCACACATGTGGCTATCT
+TGAGAATGGACAGGACACCTGGGATGGGGAAGGGAGCACAGAACCCAGACACCAGCCTGG
+ACACAGGCACCTGGGATAATCTCCTATTCCGTGGAAAATTCTAGTCCCTGAAGAGGGAAC
+AGCGACTTCTGGTCCTGACCTGAGTGGAGGCTGAAGGACTCAGAAGTGCTGGACTCAGAC
+CCCCACACACATTGAGTGTGAAGCAGAGAACAAGGCCTGAGAGGAAAAGTCACGGGCCCA
+AGGCTGCTGCCTGTGTGTGTCAAAGGGAACCACTCATCAGTATTCGAGGGATCGTCTTCC
+CGTCATTCCTTCAGAGATTTTATCCCTTAATTGTGTCAGAGAGCAGGGCGGAACCTCAGA
+GTCACTCTCTGGTACAGGATCTGGAAACCCAGGAGGATTCCTCTCCCTCAGGACCAGAGG
+GAGGGCGATATTCTAGTGTTGGTCCCAATTGTCTCCCCTCCTTGTGGGAGGCCAGCCCGG
+GAGATCTACAGGCGATCAGGGAGGCGCCCCGTGGCCCCTGGTACCCGTGCGCTGCAGCGT
+CTCCTTCCCGTTCTCCAGGTATCTGCGGAGCCACTCCACGCACGTGCCCTCCAGGTAGGC
+TCTCAACTGCTCCGCCACATGGGCCGCCTCCCACTTGTGCTTGGTGGTCTGAGCTGCCAT
+GTCCGCCGCGGTCCAAGAGCGCAGGTCCTCTTTCAGGGCGATGTAATCCTTGCCGTCGTA
+GGCGTACTGGTGGTACCCGCGGAGGAAGCGCCAGTCCGACCCCACGTCGCAGCCATACAT
+CCTCTGGACGGTGTGAGAACCTGGCCCGGACCCCGCGGTCAGCCCGGTCCCCCGAGCCCC
+GCCCCGCCCCGACCAACCTGGGGGGATTTTTGGCCTAAACTGAAAATGAAACCGGGTAAA
+GGCGCCTGGGCCTCTCCCGGGGCAAGGGTCTCGGGGTCCCGCGGCTTCGGGGCGGATCTC
+GGACCCGGAGACTGTGGGCGACCTGGCCCGTCCGTGGGGGATGAGAGGTCGTGACCTGCG
+CCCCGGGCCGGGGTCACTCACCGGCCTCGCTCTGGTTGTAGTAGCCGCGCAGGGTCCCCA
+GGTCCACTCGGTGAGTCTGTGAGTGGGCCTTCACTTTCCGTGTCTCCCCGTCCCAATACT
+CCGGACCCTCCTGCTCTATCCACGGCGCCCGCGGCTCCATCCTCTGGCTCGCGGCGTCGC
+TGTCGAACCGCACGAACTGCGTGTCGTCCACGTAGCCCACTGCGATGAAGCGGGGCTCCC
+CGCGGCCGGGCCGGGACACGGATGTGAAGAAATACCTCATGGAGTGAGAGCCTGGGGACG
+AGGAGTGGCTGAGACCCGCCCGACCCTCCTCCCGGCGCGGCTTCCCGGGTCCTGCGCCCC
+CGCCAGGCGGGCCCGTTGCTTCTCCCCACAGAGGCCGTTTCCCTCCCGACCCCGCACTCA
+CCCGCCCAGGTCTGGGTCAGGGCCAGAGCCCCCGAGAGTAGCAGGACGAGGGTTCGGGGC
+GCCATGACGGCCATCCTCGGCGTCTGGGGAGAATCTGAGTCCCGGTGGGTGCGTGCGGAC
+TTTAGAACCGCGACCGCGACGACACTGATTGGCTTCTCTGGAAACCCGACACCCAATGGG
+AGTGAGAACTGGGTCCGCGTCGTGAGTATCCA
+>Barcode0--0_Cluster3_Phase1_NumReads294
+GCAGGTGCCTTTGCAGAAACAAAGTCAGGGTTCTTCAAGTCACAAAGGGAAGGGCAGGAA
+CAACTCTTGCCTCTCAGTCCCACACAAGGCAGCTGTCTCACACTATAGAAAAAAATATTC
+ATGAACAAATTCATATCCATCACAGTGAGGGGTCACACCTTAAACAGCCCATCGCATGCT
+CAATACATCCAATGCAAAGAAACCCCATAGCACAGCTGTGTCCACTGTTCCGCCCAACAC
+CCAACACACATTAGGTCCTCCAAGCTCTCACCTTTACAAGCTGTGAGGGACACATCAGAG
+CCCTGGGCACTGTCACTGCCTGGGGTAGAACAAAAACAGAACCTGGTCAGATCCCACAGA
+AGATGTGGCTAGAGGAGGAATTGTGAGGTGGGTGGGCTCCCCCATGGGCTCCCAAACACA
+ATATCCCAAGGACCTCAGGCATCAGCCTCCTTCATACTTACTTGCAGCCTGAGTGTAACT
+CCCTCCTTTTCTATCTGTGAGAAGAAAATGTCCTGTGAGATACCAGAAAGGAGCCAGGGC
+CTTAAGGTCCTAGAGGAACCTCCTAGTCTTGGACCCCAGAGAAGTTTCCAGAAATGTGTG
+ACTGCAGACCCAGGGCGGGATCAGGAAACATGAAGAAAGCAGGTGTGGGTCCTGGACCAA
+TAGCCCTCCTGAGGTCTGTCCTCAGGGACCTTCCCCTGTGACTTGTGACTGCTGGGATCA
+GGTCCCATCACCGCCGTAATCAAGGTGATAAATCTGTCCTTCATTTTAACAGGTGCTTTA
+CAAAAGAGTAAGTGCTGGCACACAGGGCCCAGACTGGGTAGGCCCATGATTGTGGACGGT
+GCTTCCCAGTAATGAGACAGGGCACATTTCTAGCTGGGGCTTGGAACCCTCAGTGAGACA
+AGAAATCTCAGACCCCACCCTTCACCCCTTCTCCACCTGAGCTCTTCCTCCTCCACATCA
+CGGCAGCGACCACAGCTCCAGTGATCACAGCTCCAAGGAGAACCAGGCCAGCAATGATGC
+CCACGATGGGGATGGTGGGCTGGGAAGACAGCTCTGGGAAAAGAGGGGAAGGTGAGGGGC
+CCTGACCCTGCTAAAGGTCTCCAGAGAGGCTCCTGCTTTCCCTAAGAGACATGACACCCC
+CATCTCCCTCCTTACCCCATCTCAGGGTGAGGGGCTTGGGCAGACCCTCATGCTGCACAT
+GGCAGGTGTATCTCTGCTCCTCTCCAGAAGGCACCACCACAGCCGCCCACTTCTGGAAGG
+TTCCATCCCCTGCAGGCCTGGTCTCCACGAGCTCCGTGTCCTGGGTCTGGTCCTCCCCAT
+CCCGCTGCCAGGTCAGTGTGATCTCCGCAGGGTAGAAGCCCAGGGCCCAGCACCTCAGGG
+TGGCCTCATGGTCAGAGATGGGGTGGTGGGTCATATGTGTCTTGGGGGGGTCTGACGGGA
+AGAGTCAGAAAATTCAGGCATTTTGCATCTGTCATGGGACACTCCACCAGCACGCATGTG
+GCCATCTTGAGAATGGACAGGACACCCGGGATGGGGAAGAGAGCACAGAACCCAGACACC
+AGCCTGGACACAGGCACCTGGGATAATCTTCTATTCCCTGAGAAGGGAACAGCGACTTCT
+GGTCCTGACCTGAGTGGAGGCTGAGGGACTCAGAAGTGCTGGACTCAGACCCCCACACAC
+ATTGAGTGTGAAGCAGAGAACAAGGCCTGAGAGGAAAAGTCACGGGCCCAAGGCTGCTGC
+CGGTGTCAAAGGGAACCACTCATCAGTATTCGAGGGATCGTCTTCCCGTCACTCCTTCAG
+AGATTTTATCCCTTAATTGTGTCAGAGAGCAGGGCGGAACCTCAGAGTCACTCTCTGGTA
+CAGGATCTGGAACCCAGGAGGATTCCTCTCCCTCAGGACCAGAGGGAGGGTGATATTCTA
+GTGTTGGTCCCAATTGTCTCCCCTCCTTGTGGGAGGCCAGCCCGGGAGATCTACAGGCGA
+TCAGGGAGGCGCCCCGTGGCCCCTGGTACCCGTGCGCTGCAGCGTCTCCTTCCCGTTCTC
+CAGGTATCTGCGGAGCCACTCCACGCACGTGCCATCCAGGTAGGCTCTCAACTGCTCCGC
+CTCATGGGCCGCCTCCCACTTGCGCTTGGTGATCTGAGCCGCCATGTCCGCCGCGGTCCA
+AGAGCGCAGGTCCTCGTTCAGGGCGATGTAATCCTTGCCGTCGTAGGCGTCCTGCCGGTA
+CCCGCGGAGGAAGCGCCCGTCCGACCCCACGTCGCAGCCATACATTATCTGGATGGTGTG
+AGAACCTGGCCCCGACCCCGCGGTCAGCCCAGTCCCCCGAGCCCCGCCCAGCCCCGACCA
+ACCCGGGGGGATTTTTGGCCTAAACTGAAAATGAAACCGGGTAAAGGCGCCTGGGCCTCT
+CCCGGGGCAAGGGTCTCGGGGTCCCGCGGCTTCGGGGTGGATCTCGGACCCGGAGACTGT
+GGGCGACCTGGCCCGTCCGTGGGGGATGAGGGGTCCTGACCTGCGCCCCCGGCCGGGGTC
+ACTCACCGGCCTCGCTCTGGTTGTAGTAGCCGCGCAGGGTCCCCAGGTCCACTCGGTCAG
+TCTGTGACTGGGCCTTCACATTCCGTGTCTCCTGGTCCCAATACTCCGGCCCCTCCTGCT
+CTATCCACGGCGCCCGCGGCTCCATCCTCTGGCTCGCGGCGTCGCTGTCGAACCGCACGA
+ACTGCGTGTCGTCCACGTAGCCCACGGCGATGAAGCGGGGCTCCCCGCGGCCGGGCCGGG
+ACACGGATGTGAAGAAATACCTCATGGAGTGGGAGCCTGGGGGCGAGCAGTGGCTGAGAC
+CTGCCCGACCCTCGTCCCGGCGCGGCTCCCCCGGTCCTGCGCCCCCGCCAGGAGGGCCCC
+TTGCTTCTCCCCGCAGAGGCGGTTTCCCTCCCGACCCCGCACTCACCCGCCCAGGTCTGG
+GTCAGGGCCAGGGCCCCCGAGAGTAGCAGGAGGAGGGTTCGGGGCGCCATGACGGCCATC
+CTCGGCGTCTGGGGAGAATCTGAGTCCCGGTGGGTGCGTGCGGGCTTTAGAACAGCGACC
+GCGACGACACTGATTGGCTTCTCTGGAAACCCGACACCCAATGGGAGTGAGAACTGGGTC
+CGCGTCGTGAGTATCCA
+>Barcode0--0_Cluster0_Phase2_NumReads92
+CTGGGGAGGAAACACAGGTCAGCATGGGAACAGGGGTCACAGTGGACACGGGGGTGGGCT
+GTCTCTCCACCTCCTCACATTATGCTAACAGGGACGCAGACACATTCAGGTGCCTTTGCA
+GAAAGAGATGCCAGAGGCTCTTGAAGTCACAAAGGGGAGGAGTGAAGAAATCCTGCATCT
+CAGTCCCTCACAAGACAGCTGTCTCAGGCTACAGAAAACAACAGTCATGAACAAATTCTG
+GTTAGTCATGGTAAGTGATGACACTCTAAACAGCCCACCACACACGCGAAACATCCCAAT
+CAAAGAATCTCCATTACCCAGGCCTTTCCCCTCTGCCCCCTCCCCACCCCACCCCCCCCG
+CCCACTCTAGACCCCAAGAATCTCACCTTTTCAAGCTGTGAGAGACACATCAGAGCCCTG
+GGCACTGTCGCTGGCTGGAGTAGAACAAAAACAGGACCTGGTCAGAGCCCGCAGGAGACG
+TGGGACAGGAGGAATTATGGGGTGGGTGAGCTCCTCCACACTCCCACCCCCACCACTTAC
+ACGCAGCCTGAGAGTAGCTCCCTCCTTTTCCACCTGTGGGAAGAAAATGTCCTGTGAGGG
+GACTGGGAGGAAGCAGGGCCATGAGATCTTAGAGGAACCTCCTCGTCTTGGACCCAAAAG
+GAATTTCCAGAAGTATGACTACAGACCCAAGGCAGGATCAGGAAACACGAGGAAAGCAAG
+TGTGGGTCCTGGACCAACTGCCCTCCTAAGGTCTGTCCTTAGCAGGGACCTTCCCCTGAC
+TCATGAATGCTGAAATCAGGACCCCAACACCACAACCATCAAGGTGATACATCCGTCCTT
+CATTGTCACATGTGCTGCACAAAAGAGTAAGTGCTGGCACACAGGGTCCCAGGCTGCATT
+AGCCCCTGTGTGGATGCTGCTTCCCAGTAATGAGGCAGGGAACACTTCTACCTGGGGCTT
+GAAACCCCCAGTGGGACAAGAAAACCCAGACCCCACCCCTCACCCCTTCCCTACCTGAGC
+TCTTCCTCCTACACATCACAGTAGCGACCACAGCTCCGATGACCACAACTGCTAGGACAG
+CCAGGCCAGCAACAATGCCCACGATGGGGATGGTGGACTGGGAAGATGGCTCTGGGAAAG
+GAGGGGAAGACGAGGGGCCCTGACCCTGCTGAAGGGCTCCAGAAGGGCTCCTGCTTTCCC
+TGAGAAGAGATATGACCCCTCATCCCCCTCCTTACCCCATCTCAGGGTGAGGGGCTTCGG
+CAGCCCCTCATGCTGTACATGGCATGTGTATCTCTGCTCTTCTCCAGAAGGCACCACCAC
+AGCTGCCCACTTCTGGAAGGTTCTATCTCCTGCTGGTCTGGTCTCCACAAGCTCAGTGTC
+CTGAGTTTGGTCCTCGCCATCCCGCTGCCAGGTCAGTGTGATCTCCGCAGGGTAGAAGCC
+CAGGGCCCAGCACCTCAGGGTGGCCTCATGGTCAGAGATGGGGTGGTGGGTCACGTGTGT
+CTTTGGGGGGTCTGATGGGAAGAGTCAGAAAATTCAGGCGCTTTGCATCTCTCATAGGAC
+ACCCTAGGACCACCCATGTGACCAGCCTGAGAATGGACAGGACACCTGGGGTGGGGAAGG
+GGCACAGAACCCAGACACCAGCCTGGACGCAGGCACCTGGGATAATCTCCTATTCATTGG
+AAAGTTCGAGTCTCTGAGCGGGGAACAGGGACTTCTGCTCCTGATCTGAGTGGAGGTAAA
+GTGACTCAGAAGTGCTGGAATCAGAGCCCCAAACACACTGAGTGTGAGGCAGAGAACAAG
+GCCTGAGAGGAAAAGTCACGGTTCCCAAGGCTGCTGCAGGGGTCAAAGAGGACCCCTGAT
+CAGTATTCTAGGGACTGTCTTCCCCTCCATTTCCTCAGAGACGTCATCCCTTAATTGTCC
+TAGAGAGAAGAGGGGGCCCTCAGAGGAAACTCAGGAAAACTCATGCCATTCTCCATTCAA
+GGGAGGGCGACATTCTAGCGCTGATCCCATTTTCCTCCTCTTCTCGTGGGAGGCCATCCC
+CGGCGACCTATAGGAGATGGGGAAGGCTCCCCACTGCCCCTGGTACCCGCGCGCTGCAGC
+GTCTCCTTCCCGTTCTCCAGGTATCTGCGGAGCCACTCCACGCACAGGCCCTCCAGGTAG
+GCTCTCAGCTGCTCCGCCACACGGGCCGCCTCCCACTTGCGCTGGGTGATCTGAGCCGCG
+GTGTCCGCCGCGGTCCAGGAGCTCAGGTCCTCGTTCAGGGCGATGTAATCCTTGCCGTCG
+TAGGCTAACTGGTTATGCCCGCGGAGGAGGCGCCCGTCCGGCCCCAGGTCGCAGCCATAC
+ATCGTCTGCCAAGTGTGAGACCCTGGCCCCGGCCCCGCGGTCAGCCCCGTCCCCCGAGCC
+CCGCCCCGCCCCGACCAACCCGCGGGGATTTTGGCCTCAACTGAAAATGAAACCGGGTAA
+ACGCGCCTGGGGCTCTCGCCGGTCGAGGGTCTGGGCGGGTCCCGCGGCCTCAGGGAGGCG
+GATCTCGGACCCGGAGACTCGGGGCGACCCGGGCCGTACGTGGGGGATGGGGAGTCGTGA
+CCTGCGCCCCGGGCCGGGGTCACTCACCGGCCTCGCTCTGGTTGTAGTAGCCGCGCAGGT
+TCCGCAGGCTCTCTCGGTCAGTCTGTGCCTGGGCCTTGTAGATCTGTGTGTTCCGGTCCC
+AATACTCCGGCCCCTCCTGCTCTATCCACGGCGCCCGCGGCTCCTCTCTCGGACTCGCGG
+CGTCGCTGTCGAACCTCACGAACTGCGTGTCGTCCACGTAGCCCACTGCGATGAAGCGGG
+GCTCCCCGCGGCCGGGCCGGGACATGGCGGTGTAGAAATACCTCATGGAGTGGGAGCCTG
+GGGGCGAGGAGGGGCTGAGACCCGCCAGACCCTCCTCCCGGCGCGGCTCCCCGGGTCCTG
+CGCCCCCGCCTGCGGTCCCCTCGCTCCTCCCCACAGAGGCCATTTCCCTCCCGACCCGCA
+CTCACCGGCCCAGGTCTCGGTCAGGGCCAGGGCCCCCCAGAGCAGCAGGAGGAGGGTTCG
+GGGTGCCGTGACCCGCATCTCGGTGTCTGAGGAGACTCTGAGTCCGGGTGGGTGCGTGGG
+GACTTTAGAACTGGGACCCCGGCGACACTGATTGGCTTCTCTAGACACCCGACACCCAAT
+GGGAGTGGGAAATGGGGACGCGTCACGAGTATCCTGGAAGAAGGACCCGACATAGGTTGG
+GAGAAGAAGTGAAACTCGTGGGAGTGGGGAATCCCCAACGCTGCGCCTCCCCAATGCAGA
+CGCGGCCCTCGGAGCCTGAGACCCTGAGAGCCCCGTCCGGGACATGGGACTTCGTCCTGA
+TCCCTCTTCTCCTACACCAGCCTCTTTGTCACACTGTCTGCC
+>Barcode0--0_Cluster1_Phase3_NumReads56
+GCAGGTGCCTTTGCAGAAACAAAGTCAGGGTTCTTCAAGTCACAAAGGGAAGGGCAGGAA
+CAACTCTTGCCTCTCAGTCCCACACAAGGCAGCTGTCTCACACTATAGAAAAAAATATTC
+ATGAACAAATTCGTATCTGTCACAGTGAGGGGTCACACTTTAAACAGCCCATCGCATGCT
+CAATACATCCAATGGAAAGAAACCCCATAGCACAGCTGTGTCCACTGTTCCGCCCAACAC
+CCAACACACATCAGGCCCTCCAGGCTCTCACCTTTACAAGCTGTGAGAGACACATCAGAG
+CCCTGGGCACTGTCACTGCCTGGGGTAGAACAAAAACAGAACCTGGTCAGATCCCACAGA
+AGATGTGGCTAGAGGAGGAATTGTGGGGTGGGTGAGCTCCCCCATGGGCTCCCAAACACA
+ATATCCCAAGGACCTCAGGCATCAGCCTCCTTCATACTTACTTGCAGCCTGAGAGTAGCT
+CCCTCCTTTTCTATCTGTGGGAAGAAAATGTCCTGTGAGATACCAGAAAGGAGTCAGGGC
+CTTAAGGTCCTAGAGGAACCTCCAAGTCTTGGACCTCAGAGAAGTTTCCAGAAATGTGTG
+ACTGCAGACCCAGGGCGGGATCAGGAAACATGAAGAAAGCAGGTGTGGGTCCTGGACCAA
+CCGCCCTCCTGAAGGTCCTCAGGGACCTTCCCCTGTGACTTGTGACTGCTGGGATCAGGT
+CCCATCACCGCTGTAATCAAGGTGATAAATCTGTCCTTCATTTTAACAGGTGCTTTACAA
+AAGAGTAAGTGCTGGCACACAGGGCCCAGGCTGGGTAGGCCCATAATTGTGGGTGGTGCT
+TCCCAGTAACGAGGCAGGGCACACTTCTACCTGGGTCTTGGAACCCTCAGTGAGACAAGA
+AATCTCAGACCCACCCTTCACCCCTTCCCCACCTGAGCTCTTCCTCCTCCACATCACAGC
+AGCGACCACAGCTCCAGTGATCACAGCTCCAAAGAGAACCAGGCCAGCAATGATGCCCAC
+GATGGGGATGGTGGGCTGGGAAGACGGCTCTGGGAAAAGAGGGGAAGGTGAGGGGCCCTG
+ACCCTGCTAAAGGTCAGAGAGGCTCCTGCTTTCCCTAAAAGACATGACACCCCCGTCTCC
+CTCCTTACCCCATCTCAGGGTGAGGGGCTTGGGCAAACCCTCATGCTGCACATGGCAGGT
+GTATCTCTGCTCCTGTCCAGAAGGCACCACCACAGCCGCCCACTTCTGGAAGGTTCCATC
+CCCTGCAGGCCTGGTCTCCACGAGCTCCGTGTCCTGGGTCTGGTCCTCCCCATCCCGCTG
+CCAGGTCAGTGTGATCTCCGCAGGGTAGAAGCTCAGGGCCCAGCACCTCAGGGTGGCTTC
+ATGGTCAGAGACAGCGTGGTGAGTCATATGCGTTTTGGGGGCGTCTGTCAGGAAGAGTCA
+GATCATTCAGGCATTTTGCATCTGTCATGGGACACTCCTCCAGCACACATGTGGCTATCT
+TGAGAATGGACAGGACACCTGGGATGGGGAAGGGAGCACAGAACCCAGACACCAGCCTGG
+ACACAGGCACCTGGGATAATCTCCTATTCCGTGGAAAATTCTAGTCCCTGAAGAGGGAAC
+AGCGACTTCTGGTCCTGACCTGAGTGGAGGCTGAAGGACTCAGAAGTGCTGGACTCAGAC
+CCCCACACACATTGAGTGTGAAGCAGAGAACAAGGCCTGAGAGGAAAAGTCACGGGCCCA
+AGGCTGCTGCCTGTGTGTGTCAAAGGGAACCACTCATCAGTATTCGAGGGATCGTCTTCC
+CGTCATTCCTTCAGAGATTTTATCCCTTAATTGTGTCAGAGAGCAGGGCGGAACCTCAGA
+GTCACTCTCTGGTACAGGATCTGGAACCCAGGAGGATTCCTCTCCCTCAGGACCAGAGGG
+AGGGCGATATTCTAGTGTTGGTCCCAATTGTCTCCCCTCCTTGTGGGAGGCCAGCCCGGG
+AGATCTACAGGCGATCAGGGAGGCGCCCCGTGGCCCCTGGTACCCGTGCGCTGCAGCGTC
+TCCTTCCCGTTCTCCAGGTATCTGCGGAGCCACTCCACGCACGTGCCATCCAGGTAGGCT
+CTCAACTGCTCCGCCTCATGGGCCGCCTCCCACTTGCGCTTGGTGATCTGAGCCGCCATG
+TCCGCCGCGGTCCAAGAGCGCAGGTCCTCGTTCAGGGCGATGTAATCCTTGCCGTCGTAG
+GCGTCCTGCCGGTACCCGCGGAGGAAGCGCCCGTCCGACCCCACGTCGCAGCCATACATT
+ATCTGGATGGTGTGAGAACCTGGCCCCGACCCCGCGGTCAGCCCAGTCCCCCGAGCCCCG
+CCCAGCCCCGACCAACCCGGGGGGATTTTTGGCCTAAACTGAAAATGAAACCGGGTAAAG
+GCGCCTGGGCCTCTCCCGGGGCAAGGGTCTCGGGGTCCCGCGGCTTCGGGGTGGATCTCG
+GACCCGGAGACTGTGGGCGACCTGGCCCGTCCGTGGGGGATGAGGGGTCCTGACCTGCGC
+CCCCGGCCGGGGTCACTCACCGGCCTCGCTCTGGTTGTAGTAGCCGCGCAGGGTCCCCAG
+GTCCACTCGGTCAGTCTGTGACTGGGCCTTCACATTCCGTGTCTCCTGGTCCCAATACTC
+CGGCCCCTCCTGCTCTATCCACGGCGCCCGCGGCTCCATCCTCTGGCTCGCGGCGTCGCT
+GTCGAACCGCACGAACTGCGTGTCGTCCACGTAGCCCACGGCGATGAAGCGGGGCTCCCC
+GCGGCCGGGCCGGGACACGGATGTGAAGAAATACCTCATGGAGTGGGAGCCTGGGGGCGA
+GCAGTGGCTGAGACCTGCCCGACCCTCGTCCCGGCGCGGCTCCCCCGGTCCTGCGCCCCC
+GCCAGGAGGGCCCCTTGCTTCTCCCCGCAGAGGCGGTTTCCCTCCCGACCCCGCACTCAC
+CCGCCCAGGTCTGGGTCAGGGCCAGGGCCCCCGAGAGTAGCAGGAGGAGGGTTCGGGGCG
+CCATGACGGCCATCCTCGGCGTCTGGGGAGAATCTGAGTCCCGGTGGGTGCGTGCGGGCT
+TTAGAACAGCGACCGCGACGACACTGATTGGCTTCTCTGGAAACCCGACACCCAATGGGA
+GTGAGAACTGGGTCCGCGTCGTGAGTATCCA
+\ No newline at end of file
diff --git a/tests/data/chunking/chunking.subreadset.xml b/tests/data/chunking/chunking.subreadset.xml

new file mode 100644 (file)

index 0000000..74d38c1
--- /dev/null
+++ b/tests/data/chunking/chunking.subreadset.xml
@@ -0,0 +1,32 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<pbds:SubreadSet xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd" xmlns:pbds="http://pacificbiosciences.com/PacBioDatasets.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" CreatedAt="2019-07-02T18:22:25" MetaType="PacBio.DataSet.SubreadSet" Name="DataSet_SubreadSet" Tags="" TimeStampedName="pacbio_dataset_subreadset-190702_162225445" UniqueId="649d564c-bef3-f7e0-80fd-c1b623e51121" Version="3.0.1" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDataModel.xsd">
+       <pbbase:ExternalResources>
+               <pbbase:ExternalResource MetaType="PacBio.SubreadFile.SubreadBamFile" ResourceId="m150404_101626_42267_c100807920800000001823174110291514_s1_p0.1.subreads.bam" TimeStampedName="pacbio_subreadfile_subreadbamfile-190702_162225442" UniqueId="55a3a60c-acdb-4c6e-b404-22a25853f205">
+                       <pbbase:FileIndices>
+                               <pbbase:FileIndex MetaType="PacBio.Index.PacBioIndex" ResourceId="m150404_101626_42267_c100807920800000001823174110291514_s1_p0.1.subreads.bam.pbi" TimeStampedName="pacbio_index_pacbioindex-190702_162225441" UniqueId="6229e63f-901f-4078-8b3d-f5ca7714dda5"/>
+                       </pbbase:FileIndices>
+               </pbbase:ExternalResource>
+               <pbbase:ExternalResource MetaType="PacBio.SubreadFile.SubreadBamFile" ResourceId="m150404_101626_42267_c100807920800000001823174110291514_s1_p0.2.subreads.bam" TimeStampedName="pacbio_subreadfile_subreadbamfile-190702_162225442" UniqueId="9ce13c61-5b02-4b1d-8c0e-d8b469f565e9">
+                       <pbbase:FileIndices>
+                               <pbbase:FileIndex MetaType="PacBio.Index.PacBioIndex" ResourceId="m150404_101626_42267_c100807920800000001823174110291514_s1_p0.2.subreads.bam.pbi" TimeStampedName="pacbio_index_pacbioindex-190702_162225441" UniqueId="aedb108f-35ef-4cce-a8f4-a1c420bfbd70"/>
+                       </pbbase:FileIndices>
+               </pbbase:ExternalResource>
+               <pbbase:ExternalResource MetaType="PacBio.SubreadFile.SubreadBamFile" ResourceId="m150404_101626_42267_c100807920800000001823174110291514_s1_p0.3.subreads.bam" TimeStampedName="pacbio_subreadfile_subreadbamfile-190702_162225442" UniqueId="0c0537e7-f25d-419c-8c4b-526eaf61fdbb">
+                       <pbbase:FileIndices>
+                               <pbbase:FileIndex MetaType="PacBio.Index.PacBioIndex" ResourceId="m150404_101626_42267_c100807920800000001823174110291514_s1_p0.3.subreads.bam.pbi" TimeStampedName="pacbio_index_pacbioindex-190702_162225442" UniqueId="10fb8805-20a8-42d7-aaaf-28b5b8ae704f"/>
+                       </pbbase:FileIndices>
+               </pbbase:ExternalResource>
+       </pbbase:ExternalResources>
+       <pbds:Filters>
+               <pbds:Filter>
+                       <pbbase:Properties>
+                               <pbbase:Property Name="movie" Operator="=" Value="m64004_190414_193017"/>
+                               <pbbase:Property Name="zm" Operator="lt" Value="1816"/>
+                       </pbbase:Properties>
+               </pbds:Filter>
+       </pbds:Filters>
+       <pbds:DataSetMetadata>
+               <pbds:TotalLength>933955</pbds:TotalLength>
+               <pbds:NumRecords>1220</pbds:NumRecords>
+       </pbds:DataSetMetadata>
+</pbds:SubreadSet>
diff --git a/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.1.subreads.bam b/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.1.subreads.bam

new file mode 100644 (file)

index 0000000..ca926df

Binary files /dev/null and b/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.1.subreads.bam differ
diff --git a/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.1.subreads.bam.pbi b/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.1.subreads.bam.pbi

new file mode 100644 (file)

index 0000000..cc9f24f

Binary files /dev/null and b/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.1.subreads.bam.pbi differ
diff --git a/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.2.subreads.bam b/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.2.subreads.bam

new file mode 100644 (file)

index 0000000..2a8e7e0

Binary files /dev/null and b/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.2.subreads.bam differ
diff --git a/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.2.subreads.bam.pbi b/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.2.subreads.bam.pbi

new file mode 100644 (file)

index 0000000..9a52ae2

Binary files /dev/null and b/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.2.subreads.bam.pbi differ
diff --git a/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.3.subreads.bam b/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.3.subreads.bam

new file mode 100644 (file)

index 0000000..2af53d1

Binary files /dev/null and b/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.3.subreads.bam differ
diff --git a/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.3.subreads.bam.pbi b/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.3.subreads.bam.pbi

new file mode 100644 (file)

index 0000000..a887e19

Binary files /dev/null and b/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.3.subreads.bam.pbi differ
diff --git a/tests/data/clip_to_query.bam b/tests/data/clip_to_query.bam

new file mode 100644 (file)

index 0000000..7e88330

Binary files /dev/null and b/tests/data/clip_to_query.bam differ
diff --git a/tests/data/dataset/ali1.xml b/tests/data/dataset/ali1.xml

new file mode 100644 (file)

index 0000000..ab0a82a
--- /dev/null
+++ b/tests/data/dataset/ali1.xml
@@ -0,0 +1,35 @@
+<?xml version="1.0" encoding="utf-8"?>
+<pbds:AlignmentSet CreatedAt="2015-01-27T09:00:01" MetaType="PacBio.DataSet.AlignmentSet" Name="DataSet_AlignmentSet" Tags="barcode moreTags mapping mytags" UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe519c" Version="2.3.0"  xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd" xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDatasets.xsd">
+       <pbbase:ExternalResources>
+               <pbbase:ExternalResource Name="First Alignments BAM" Description="Points to an example Alignments BAM file." MetaType="AlignmentFile.AlignmentBamFile" ResourceId="file:///mnt/path/to/alignments0.bam" Tags="Example">
+                       <pbbase:FileIndices>
+                               <pbbase:FileIndex ResourceId="file:///mnt/path/to/alignments0.pbi"/>
+                       </pbbase:FileIndices>
+               </pbbase:ExternalResource>
+               <pbbase:ExternalResource Name="Second Alignments BAM" Description="Points to another example Alignments BAM file, by relative path." MetaType="AlignmentFile.AlignmentBamFile" ResourceId="file:./alignments1.bam" Tags="Example">
+                       <pbbase:FileIndices>
+                               <pbbase:FileIndex ResourceId="file:///mnt/path/to/alignments1.pbi"/>
+                       </pbbase:FileIndices>
+               </pbbase:ExternalResource>
+       </pbbase:ExternalResources>
+       <pbds:DataSets>
+               <pbds:DataSet UniqueId="ab95d0a3-94b8-4918-b3af-a3f81bbe519c" Version="2.3.0" Name="HighQuality Read Alignments">
+                       <pbds:Filters> <!-- These Filters are in addition to those above. This provides a means to subset and label the parent DataSet further. -->
+                               <pbds:Filter>
+                    <pbbase:Properties>
+                        <pbbase:Property Name="rq" Value="0.85" Operator=">"/>
+                    </pbbase:Properties>
+                </pbds:Filter>
+                       </pbds:Filters>
+               </pbds:DataSet>
+               <pbds:DataSet UniqueId="ac95d0a3-94b8-4918-b3af-a3f81bbe519c" Version="2.3.0" Name="Alignments to chromosome 1">
+                       <pbds:Filters>
+                               <pbds:Filter>
+                    <pbbase:Properties>
+                        <pbbase:Property Name="RNAME" Value="chr1" Operator="=="/>
+                    </pbbase:Properties>
+                </pbds:Filter>
+                       </pbds:Filters>
+               </pbds:DataSet>
+       </pbds:DataSets>
+</pbds:AlignmentSet>
diff --git a/tests/data/dataset/ali2.xml b/tests/data/dataset/ali2.xml

new file mode 100644 (file)

index 0000000..c35f9ec
--- /dev/null
+++ b/tests/data/dataset/ali2.xml
@@ -0,0 +1,35 @@
+<?xml version="1.0" encoding="utf-8"?>
+<pbds:AlignmentSet CreatedAt="2015-01-27T09:00:01" MetaType="PacBio.DataSet.AlignmentSet" Name="DataSet_AlignmentSet" Tags="barcode moreTags mapping mytags" UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe519c" Version="2.3.0" xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd" xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDatasets.xsd">
+       <pbbase:ExternalResources>
+               <pbbase:ExternalResource Name="First Alignments BAM" Description="Points to an example Alignments BAM file." MetaType="AlignmentFile.AlignmentBamFile" ResourceId="file:///mnt/path/to/alignments2.bam" Tags="Example">
+                       <pbbase:FileIndices>
+                               <pbbase:FileIndex ResourceId="file:///mnt/path/to/alignments2.pbi"/>
+                       </pbbase:FileIndices>
+               </pbbase:ExternalResource>
+               <pbbase:ExternalResource Name="Second Alignments BAM" Description="Points to another example Alignments BAM file, by relative path." MetaType="AlignmentFile.AlignmentBamFile" ResourceId="file:./alignments3.bam" Tags="Example">
+                       <pbbase:FileIndices>
+                               <pbbase:FileIndex ResourceId="file:///mnt/path/to/alignments3.pbi"/>
+                       </pbbase:FileIndices>
+               </pbbase:ExternalResource>
+       </pbbase:ExternalResources>
+       <pbds:DataSets>
+               <pbds:DataSet UniqueId="ab95d0a3-94b8-4918-b3af-a3f81bbe519c" Version="2.3.0" Name="HighQuality Read Alignments">
+                       <pbds:Filters> <!-- These Filters are in addition to those above. This provides a means to subset and label the parent DataSet further. -->
+                               <pbds:Filter>
+                    <pbbase:Properties>
+                        <pbbase:Property Name="rq" Value="0.85" Operator=">"/>
+                    </pbbase:Properties>
+                </pbds:Filter>
+                       </pbds:Filters>
+               </pbds:DataSet>
+               <pbds:DataSet UniqueId="ac95d0a3-94b8-4918-b3af-a3f81bbe519c" Version="2.3.0" Name="Alignments to chromosome 1">
+                       <pbds:Filters>
+                               <pbds:Filter>
+                    <pbbase:Properties>
+                        <pbbase:Property Name="RNAME" Value="chr1" Operator="=="/>
+                    </pbbase:Properties>
+                </pbds:Filter>
+                       </pbds:Filters>
+               </pbds:DataSet>
+       </pbds:DataSets>
+</pbds:AlignmentSet>
diff --git a/tests/data/dataset/ali3.xml b/tests/data/dataset/ali3.xml

new file mode 100644 (file)

index 0000000..f58d25f
--- /dev/null
+++ b/tests/data/dataset/ali3.xml
@@ -0,0 +1,35 @@
+<?xml version="1.0" encoding="utf-8"?>
+<pbds:AlignmentSet CreatedAt="2015-01-27T09:00:01"  MetaType="PacBio.DataSet.AlignmentSet" Name="DataSet_AlignmentSet" Tags="barcode moreTags mapping mytags" UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe519c" Version="2.3.0" xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd" xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDatasets.xsd">
+       <pbbase:ExternalResources>
+               <pbbase:ExternalResource Name="First Alignments BAM" Description="Points to an example Alignments BAM file." MetaType="AlignmentFile.AlignmentBamFile" ResourceId="file:///mnt/path/to/alignments2.bam" Tags="Example">
+                       <pbbase:FileIndices>
+                               <pbbase:FileIndex ResourceId="file:///mnt/path/to/alignments2.pbi"/>
+                       </pbbase:FileIndices>
+               </pbbase:ExternalResource>
+               <pbbase:ExternalResource Name="Second Alignments BAM" Description="Points to another example Alignments BAM file, by relative path." MetaType="AlignmentFile.AlignmentBamFile" ResourceId="file:./alignments3.bam" Tags="Example">
+                       <pbbase:FileIndices>
+                               <pbbase:FileIndex ResourceId="file:///mnt/path/to/alignments3.pbi"/>
+                       </pbbase:FileIndices>
+               </pbbase:ExternalResource>
+       </pbbase:ExternalResources>
+       <pbds:DataSets>
+               <pbds:DataSet UniqueId="ab95d0a3-94b8-4918-b3af-a3f81bbe519c" Version="2.3.0" Name="HighQuality Read Alignments">
+                       <pbds:Filters> <!-- These Filters are in addition to those above. This provides a means to subset and label the parent DataSet further. -->
+                               <pbds:Filter>
+                    <pbbase:Properties>
+                        <pbbase:Property Name="rq" Value="0.75" Operator=">"/>
+                    </pbbase:Properties>
+                </pbds:Filter>
+                       </pbds:Filters>
+               </pbds:DataSet>
+               <pbds:DataSet UniqueId="ac95d0a3-94b8-4918-b3af-a3f81bbe519c" Version="2.3.0" Name="Alignments to chromosome 1">
+                       <pbds:Filters>
+                               <pbds:Filter>
+                    <pbbase:Properties>
+                        <pbbase:Property Name="RNAME" Value="chr1" Operator="=="/>
+                    </pbbase:Properties>
+                </pbds:Filter>
+                       </pbds:Filters>
+               </pbds:DataSet>
+       </pbds:DataSets>
+</pbds:AlignmentSet>
diff --git a/tests/data/dataset/ali4.xml b/tests/data/dataset/ali4.xml

new file mode 100644 (file)

index 0000000..ab0a82a
--- /dev/null
+++ b/tests/data/dataset/ali4.xml
@@ -0,0 +1,35 @@
+<?xml version="1.0" encoding="utf-8"?>
+<pbds:AlignmentSet CreatedAt="2015-01-27T09:00:01" MetaType="PacBio.DataSet.AlignmentSet" Name="DataSet_AlignmentSet" Tags="barcode moreTags mapping mytags" UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe519c" Version="2.3.0"  xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd" xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDatasets.xsd">
+       <pbbase:ExternalResources>
+               <pbbase:ExternalResource Name="First Alignments BAM" Description="Points to an example Alignments BAM file." MetaType="AlignmentFile.AlignmentBamFile" ResourceId="file:///mnt/path/to/alignments0.bam" Tags="Example">
+                       <pbbase:FileIndices>
+                               <pbbase:FileIndex ResourceId="file:///mnt/path/to/alignments0.pbi"/>
+                       </pbbase:FileIndices>
+               </pbbase:ExternalResource>
+               <pbbase:ExternalResource Name="Second Alignments BAM" Description="Points to another example Alignments BAM file, by relative path." MetaType="AlignmentFile.AlignmentBamFile" ResourceId="file:./alignments1.bam" Tags="Example">
+                       <pbbase:FileIndices>
+                               <pbbase:FileIndex ResourceId="file:///mnt/path/to/alignments1.pbi"/>
+                       </pbbase:FileIndices>
+               </pbbase:ExternalResource>
+       </pbbase:ExternalResources>
+       <pbds:DataSets>
+               <pbds:DataSet UniqueId="ab95d0a3-94b8-4918-b3af-a3f81bbe519c" Version="2.3.0" Name="HighQuality Read Alignments">
+                       <pbds:Filters> <!-- These Filters are in addition to those above. This provides a means to subset and label the parent DataSet further. -->
+                               <pbds:Filter>
+                    <pbbase:Properties>
+                        <pbbase:Property Name="rq" Value="0.85" Operator=">"/>
+                    </pbbase:Properties>
+                </pbds:Filter>
+                       </pbds:Filters>
+               </pbds:DataSet>
+               <pbds:DataSet UniqueId="ac95d0a3-94b8-4918-b3af-a3f81bbe519c" Version="2.3.0" Name="Alignments to chromosome 1">
+                       <pbds:Filters>
+                               <pbds:Filter>
+                    <pbbase:Properties>
+                        <pbbase:Property Name="RNAME" Value="chr1" Operator="=="/>
+                    </pbbase:Properties>
+                </pbds:Filter>
+                       </pbds:Filters>
+               </pbds:DataSet>
+       </pbds:DataSets>
+</pbds:AlignmentSet>
diff --git a/tests/data/dataset/bam_mapping.bam b/tests/data/dataset/bam_mapping.bam

new file mode 100644 (file)

index 0000000..2d4ae7b

Binary files /dev/null and b/tests/data/dataset/bam_mapping.bam differ
diff --git a/tests/data/dataset/bam_mapping.bam.pbi b/tests/data/dataset/bam_mapping.bam.pbi

new file mode 100644 (file)

index 0000000..fe7c3be

Binary files /dev/null and b/tests/data/dataset/bam_mapping.bam.pbi differ
diff --git a/tests/data/dataset/bam_mapping_1.bam b/tests/data/dataset/bam_mapping_1.bam

new file mode 100644 (file)

index 0000000..1e9670e

Binary files /dev/null and b/tests/data/dataset/bam_mapping_1.bam differ
diff --git a/tests/data/dataset/bam_mapping_1.bam.pbi b/tests/data/dataset/bam_mapping_1.bam.pbi

new file mode 100644 (file)

index 0000000..d99a174

Binary files /dev/null and b/tests/data/dataset/bam_mapping_1.bam.pbi differ
diff --git a/tests/data/dataset/bam_mapping_2.bam b/tests/data/dataset/bam_mapping_2.bam

new file mode 100644 (file)

index 0000000..09678ea

Binary files /dev/null and b/tests/data/dataset/bam_mapping_2.bam differ
diff --git a/tests/data/dataset/bam_mapping_2.bam.pbi b/tests/data/dataset/bam_mapping_2.bam.pbi

new file mode 100644 (file)

index 0000000..d1765ef

Binary files /dev/null and b/tests/data/dataset/bam_mapping_2.bam.pbi differ
diff --git a/tests/data/dataset/bam_mapping_new.bam b/tests/data/dataset/bam_mapping_new.bam

new file mode 100644 (file)

index 0000000..3039331

Binary files /dev/null and b/tests/data/dataset/bam_mapping_new.bam differ
diff --git a/tests/data/dataset/bam_mapping_new.bam.pbi b/tests/data/dataset/bam_mapping_new.bam.pbi

new file mode 100644 (file)

index 0000000..82d497c

Binary files /dev/null and b/tests/data/dataset/bam_mapping_new.bam.pbi differ
diff --git a/tests/data/dataset/bam_mapping_staggered.xml b/tests/data/dataset/bam_mapping_staggered.xml

new file mode 100644 (file)

index 0000000..879c193
--- /dev/null
+++ b/tests/data/dataset/bam_mapping_staggered.xml
@@ -0,0 +1,35 @@
+<?xml version='1.0' encoding='UTF-8'?>
+<pbds:DataSet CreatedAt="2015-05-13T10:58:26" MetaType="PacBio.DataSet.DataSet" Name="" Tags="" UniqueId="30f72098-bc5b-e06b-566c-8b28dda909a8" Version="2.3.0" xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd" xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"  xsi:schemaLocation="http://pacificbiosciences.com/PacBioDatasets.xsd">
+    <pbbase:ExternalResources>
+        <pbbase:ExternalResource ResourceId="file:tests/data/bam_mapping_1.bam">
+                       <pbbase:FileIndices>
+                               <pbbase:FileIndex ResourceId="file:tests/data/bam_mapping_1.bam.bai"/>
+                       </pbbase:FileIndices>
+        </pbbase:ExternalResource>
+        <pbbase:ExternalResource ResourceId="file:tests/data/bam_mapping_2.bam">
+                       <pbbase:FileIndices>
+                               <pbbase:FileIndex ResourceId="file:tests/data/bam_mapping_2.bam.bai"/>
+                       </pbbase:FileIndices>
+        </pbbase:ExternalResource>
+    </pbbase:ExternalResources>
+    <pbds:DataSets>
+        <pbds:DataSet CreatedAt="2015-05-13T10:58:26" UniqueId="c5402d06-4643-057c-e300-fe229b4e8909" Version="2.3.0" xmlns="http://pacificbiosciences.com/PacBioDataModel.xsd" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDataModel.xsd">
+            <pbbase:ExternalResources>
+                <pbbase:ExternalResource ResourceId="file:tests/data/bam_mapping_2.bam">
+                               <pbbase:FileIndices>
+                                       <pbbase:FileIndex ResourceId="file:tests/data/bam_mapping_2.bam.bai"/>
+                               </pbbase:FileIndices>
+                </pbbase:ExternalResource>
+            </pbbase:ExternalResources>
+        </pbds:DataSet>
+        <pbds:DataSet CreatedAt="2015-05-13T10:58:26" UniqueId="f8b54a55-5fb7-706f-ab35-39afc9c86924" Version="2.3.0" xmlns="http://pacificbiosciences.com/PacBioDataModel.xsd" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDataModel.xsd">
+            <pbbase:ExternalResources>
+                <pbbase:ExternalResource ResourceId="file:tests/data/bam_mapping_1.bam">
+                               <pbbase:FileIndices>
+                                       <pbbase:FileIndex ResourceId="file:tests/data/bam_mapping_1.bam.bai"/>
+                               </pbbase:FileIndices>
+                </pbbase:ExternalResource>
+            </pbbase:ExternalResources>
+        </pbds:DataSet>
+    </pbds:DataSets>
+</pbds:DataSet>
diff --git a/tests/data/dataset/barcode.dataset.xml b/tests/data/dataset/barcode.dataset.xml

new file mode 100644 (file)

index 0000000..1fbbb18
--- /dev/null
+++ b/tests/data/dataset/barcode.dataset.xml
@@ -0,0 +1,11 @@
+<?xml version="1.0" encoding="utf-8"?>
+<pbds:BarcodeSet CreatedAt="2015-01-27T09:00:01" MetaType="PacBio.DataSet.BarcodeSet" Name="DataSet_BarcodeSet" Tags="barcode moreTags mapping mytags" UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe519c" Version="2.3.0" xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd" xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDatasets.xsd">
+       <pbbase:ExternalResources>
+               <pbbase:ExternalResource Name="First Barcodes FASTA" Description="Points to an example Barcodes FASTA file." MetaType="BarcodeFile.BarcodeFastaFile" ResourceId="file:///mnt/path/to/barcode.fasta" Tags="Example"/>
+       </pbbase:ExternalResources>
+       <pbds:DataSetMetadata>
+               <pbds:TotalLength>400</pbds:TotalLength>
+               <pbds:NumRecords>30</pbds:NumRecords>
+               <pbds:BarcodeConstruction>paired</pbds:BarcodeConstruction>
+       </pbds:DataSetMetadata>
+</pbds:BarcodeSet>
diff --git a/tests/data/dataset/biosample.subreadset.xml b/tests/data/dataset/biosample.subreadset.xml

new file mode 100644 (file)

index 0000000..d9f44ea
--- /dev/null
+++ b/tests/data/dataset/biosample.subreadset.xml
@@ -0,0 +1,748 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<pbds:ConsensusReadSet CreatedAt="2018-09-28T08:38:45" MetaType="PacBio.DataSet.ConsensusReadSet" Name="3260208_188nM-GTAC_2xGCratio_LP7_100fps_15min_5kEColi_SP2p1_3uMSSB_BA243494-Cell1 (CCS) AND 3260208_188nM-GTAC_3xGCratio_LP7_100fps_15min_5kEColi_SP2p1_3uMSSB_BA243494-Cell1 (CCS) AND 3260208_188nM-GTAC_4xGCratio_LP7_100fps_15min_5kEColi_SP2p1_3uMSSB_BA243494-Cell1 (CCS)" Tags="subreadset,CCS" TimeStampedName="pacbio_dataset_consensusreadset-180928_083845009" UniqueId="f5b66713-1f30-d08c-9f92-c2ae40cc281f" Version="3.0.1" 
+    xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd" 
+    xmlns:pbds="http://pacificbiosciences.com/PacBioDatasets.xsd" 
+    xmlns:pbmeta="http://pacificbiosciences.com/PacBioCollectionMetadata.xsd" 
+    xmlns:pbsample="http://pacificbiosciences.com/PacBioSampleInfo.xsd" 
+    xmlns:pbrk="http://pacificbiosciences.com/PacBioReagentKit.xsd" 
+    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDatasets.xsd">
+    <pbbase:ExternalResources>
+        <pbbase:ExternalResource MetaType="PacBio.ConsensusReadFile.ConsensusReadBamFile" ResourceId="m54075_180905_221350.ccs.bam" TimeStampedName="pacbio_consensusreadfile_consensusreadbamfile-180928_082814184" UniqueId="c1281369-6a6b-427f-8c25-0f0a4dd49127" Version="3.0.1">
+            <pbbase:FileIndices>
+                <pbbase:FileIndex MetaType="PacBio.Index.PacBioIndex" ResourceId="m54075_180905_221350.ccs.bam.pbi" TimeStampedName="pacbio_index_pacbioindex-180928_082814184" UniqueId="da34e30a-1717-45a1-bece-5624c4787677" Version="3.0.1"/>
+            </pbbase:FileIndices>
+        </pbbase:ExternalResource>
+        <pbbase:ExternalResource MetaType="PacBio.ConsensusReadFile.ConsensusReadBamFile" ResourceId="m54075_180905_225130.ccs.bam" TimeStampedName="pacbio_consensusreadfile_consensusreadbamfile-180928_082936624" UniqueId="0e8b9a59-e58b-40e2-82f5-eadc84e9f98d" Version="3.0.1">
+            <pbbase:FileIndices>
+                <pbbase:FileIndex MetaType="PacBio.Index.PacBioIndex" ResourceId="m54075_180905_225130.ccs.bam.pbi" TimeStampedName="pacbio_index_pacbioindex-180928_082936624" UniqueId="04c14584-ed44-49bb-862a-1c9d8e1a958a" Version="3.0.1"/>
+            </pbbase:FileIndices>
+        </pbbase:ExternalResource>
+        <pbbase:ExternalResource MetaType="PacBio.ConsensusReadFile.ConsensusReadBamFile" ResourceId="m54075_180905_233034.ccs.bam" TimeStampedName="pacbio_consensusreadfile_consensusreadbamfile-180928_083049559" UniqueId="6d3b4190-ad18-4b7a-a0f0-567f0d964d2b" Version="3.0.1">
+            <pbbase:FileIndices>
+                <pbbase:FileIndex MetaType="PacBio.Index.PacBioIndex" ResourceId="m54075_180905_233034.ccs.bam.pbi" TimeStampedName="pacbio_index_pacbioindex-180928_083049559" UniqueId="5797a188-12b9-4b72-b779-d904289ce945" Version="3.0.1"/>
+            </pbbase:FileIndices>
+        </pbbase:ExternalResource>
+    </pbbase:ExternalResources>
+    <pbds:DataSets>
+        <pbds:ConsensusReadSet CreatedAt="2018-09-28T08:28:14.184Z" MetaType="PacBio.DataSet.ConsensusReadSet" Name="3260208_188nM-GTAC_2xGCratio_LP7_100fps_15min_5kEColi_SP2p1_3uMSSB_BA243494-Cell1 (CCS)" Tags="subreadset,CCS" TimeStampedName="3260208_188nM-GTAC_2xGCratio_LP7_100fps_15min_5kEColi_SP2p1_3uMSSB_BA243494-Cell1 (CCS)-180928_082814184" UniqueId="599aa3a9-42f5-b85b-6933-d0bcb6e5311a" Version="3.0.1" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDatasets.xsd">
+            <pbbase:ExternalResources>
+                <pbbase:ExternalResource MetaType="PacBio.ConsensusReadFile.ConsensusReadBamFile" ResourceId="m54075_180905_221350.ccs.bam" TimeStampedName="pacbio_consensusreadfile_consensusreadbamfile-180928_082814184" UniqueId="c1281369-6a6b-427f-8c25-0f0a4dd49127" Version="3.0.1">
+                    <pbbase:FileIndices>
+                        <pbbase:FileIndex MetaType="PacBio.Index.PacBioIndex" ResourceId="m54075_180905_221350.ccs.bam.pbi" TimeStampedName="pacbio_index_pacbioindex-180928_082814184" UniqueId="da34e30a-1717-45a1-bece-5624c4787677" Version="3.0.1"/>
+                    </pbbase:FileIndices>
+                </pbbase:ExternalResource>
+            </pbbase:ExternalResources>
+            <pbds:DataSetMetadata>
+                <pbds:TotalLength>12813</pbds:TotalLength>
+                <pbds:NumRecords>12</pbds:NumRecords>
+                <pbmeta:Collections>
+                    <pbmeta:CollectionMetadata Context="m54075_180905_221350" CreatedAt="0001-01-01T00:00:00" InstrumentId="60001" InstrumentName="Inst60001" MetaType="CollectionMetadata" ModifiedAt="0001-01-01T00:00:00" Status="Ready" TimeStampedName="60001-CollectionMetadata-2015-07-09T09:30:47.833Z" UniqueId="ab0a16df-1bd5-4f09-aa98-35cbedde995d">
+                        <pbmeta:InstCtrlVer>6.0.0.SNAPSHOT39495</pbmeta:InstCtrlVer>
+                        <pbmeta:SigProcVer>6.0.0.SNAPSHOT39495</pbmeta:SigProcVer>
+                        <pbmeta:RunDetails>
+                            <pbmeta:TimeStampedName>r54075_20180905_221306</pbmeta:TimeStampedName>
+                            <pbmeta:Name>3260208_188nM-GTAC_2xGCratio_LP7_100fps_15min_5kEColi_SP2p1_3uMSSB_BA243494</pbmeta:Name>
+                            <pbmeta:CreatedBy>unknown</pbmeta:CreatedBy>
+                            <pbmeta:WhenCreated>2015-07-09T09:30:47.833Z</pbmeta:WhenCreated>
+                            <pbmeta:StartedBy>unknown</pbmeta:StartedBy>
+                        </pbmeta:RunDetails>
+                        <pbmeta:WellSample CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="3260208_188nM-GTAC_2xGCratio_LP7_100fps_15min_5kEColi_SP2p1_3uMSSB_BA243494">
+                            <pbmeta:WellName>SamplePlate-1-A-1</pbmeta:WellName>
+                            <pbmeta:Concentration>0</pbmeta:Concentration>
+                            <pbmeta:SampleReuseEnabled>false</pbmeta:SampleReuseEnabled>
+                            <pbmeta:StageHotstartEnabled>false</pbmeta:StageHotstartEnabled>
+                            <pbmeta:SizeSelectionEnabled>false</pbmeta:SizeSelectionEnabled>
+                            <pbmeta:UseCount>0</pbmeta:UseCount>
+                        </pbmeta:WellSample>
+                        <pbmeta:Automation Name="manualcellprep.py">
+                            <pbbase:AutomationParameters>
+                                <pbbase:AutomationParameter CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="CouplerLaserPower" SimpleValue="7" ValueDataType="Double"/>
+                                <pbbase:AutomationParameter CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="MovieLength" SimpleValue="15" ValueDataType="Double"/>
+                                <pbbase:AutomationParameter CreatedAt="2018-03-19T20:02:47.574Z" ModifiedAt="0001-01-01T00:00:00" Name="Exposure" SimpleValue="0.01" ValueDataType="String"/>
+                                <pbbase:AutomationParameter CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="SNRCut" SimpleValue="2.5" ValueDataType="Double"/>
+                                <pbbase:AutomationParameter CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="ActiveLaserPower" SimpleValue="False" ValueDataType="String"/>
+                                <pbbase:AutomationParameter CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="sequencingPixelROI" SimpleValue="[[0,0,1080,1920]]" ValueDataType="JSON"/>
+                                <pbbase:AutomationParameter CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="traceFilePixelROI" SimpleValue="[[0,0,8,32],[71,0,8,32],[142,0,8,32],[213,0,8,32],[284,0,8,32],[355,0,8,32],[426,0,8,32],[497,0,8,32],[568,0,8,32],[639,0,8,32],[710,0,8,32],[781,0,8,32],[852,0,8,32],[923,0,8,32],[994,0,8,32],[1072,0,8,32],[0,128,8,32],[71,128,8,32],[142,128,8,32],[213,128,8,32],[284,128,8,32],[355,128,8,32],[426,128,8,32],[497,128,8,32],[568,128,8,32],[639,128,8,32],[710,128,8,32],[781,128,8,32],[852,128,8,32],[923,128,8,32],[994,128,8,32],[1072,128,8,32],[0,256,8,32],[71,256,8,32],[142,256,8,32],[213,256,8,32],[284,256,8,32],[355,256,8,32],[426,256,8,32],[497,256,8,32],[568,256,8,32],[639,256,8,32],[710,256,8,32],[781,256,8,32],[852,256,8,32],[923,256,8,32],[994,256,8,32],[1072,256,8,32],[0,384,8,32],[71,384,8,32],[142,384,8,32],[213,384,8,32],[284,384,8,32],[355,384,8,32],[426,384,8,32],[497,384,8,32],[568,384,8,32],[639,384,8,32],[710,384,8,32],[781,384,8,32],[852,384,8,32],[923,384,8,32],[994,384,8,32],[1072,384,8,32],[0,512,8,32],[71,512,8,32],[142,512,8,32],[213,512,8,32],[284,512,8,32],[355,512,8,32],[426,512,8,32],[497,512,8,32],[568,512,8,32],[639,512,8,32],[710,512,8,32],[781,512,8,32],[852,512,8,32],[923,512,8,32],[994,512,8,32],[1072,512,8,32],[0,640,8,32],[71,640,8,32],[142,640,8,32],[213,640,8,32],[284,640,8,32],[355,640,8,32],[426,640,8,32],[497,640,8,32],[568,640,8,32],[639,640,8,32],[710,640,8,32],[781,640,8,32],[852,640,8,32],[923,640,8,32],[994,640,8,32],[1072,640,8,32],[0,768,8,32],[71,768,8,32],[142,768,8,32],[213,768,8,32],[284,768,8,32],[355,768,8,32],[426,768,8,32],[497,768,8,32],[568,768,8,32],[639,768,8,32],[710,768,8,32],[781,768,8,32],[852,768,8,32],[923,768,8,32],[994,768,8,32],[1072,768,8,32],[0,896,8,32],[71,896,8,32],[142,896,8,32],[213,896,8,32],[284,896,8,32],[355,896,8,32],[426,896,8,32],[497,896,8,32],[568,896,8,32],[639,896,8,32],[710,896,8,32],[781,896,8,32],[852,896,8,32],[923,896,
8,32],[994,896,8,32],[1072,896,8,32],[0,1024,8,32],[71,1024,8,32],[142,1024,8,32],[213,1024,8,32],[284,1024,8,32],[355,1024,8,32],[426,1024,8,32],[497,1024,8,32],[568,1024,8,32],[639,1024,8,32],[710,1024,8,32],[781,1024,8,32],[852,1024,8,32],[923,1024,8,32],[994,1024,8,32],[1072,1024,8,32],[0,1152,8,32],[71,1152,8,32],[142,1152,8,32],[213,1152,8,32],[284,1152,8,32],[355,1152,8,32],[426,1152,8,32],[497,1152,8,32],[568,1152,8,32],[639,1152,8,32],[710,1152,8,32],[781,1152,8,32],[852,1152,8,32],[923,1152,8,32],[994,1152,8,32],[1072,1152,8,32],[0,1280,8,32],[71,1280,8,32],[142,1280,8,32],[213,1280,8,32],[284,1280,8,32],[355,1280,8,32],[426,1280,8,32],[497,1280,8,32],[568,1280,8,32],[639,1280,8,32],[710,1280,8,32],[781,1280,8,32],[852,1280,8,32],[923,1280,8,32],[994,1280,8,32],[1072,1280,8,32],[0,1408,8,32],[71,1408,8,32],[142,1408,8,32],[213,1408,8,32],[284,1408,8,32],[355,1408,8,32],[426,1408,8,32],[497,1408,8,32],[568,1408,8,32],[639,1408,8,32],[710,1408,8,32],[781,1408,8,32],[852,1408,8,32],[923,1408,8,32],[994,1408,8,32],[1072,1408,8,32],[0,1536,8,32],[71,1536,8,32],[142,1536,8,32],[213,1536,8,32],[284,1536,8,32],[355,1536,8,32],[426,1536,8,32],[497,1536,8,32],[568,1536,8,32],[639,1536,8,32],[710,1536,8,32],[781,1536,8,32],[852,1536,8,32],[923,1536,8,32],[994,1536,8,32],[1072,1536,8,32],[0,1664,8,32],[71,1664,8,32],[142,1664,8,32],[213,1664,8,32],[284,1664,8,32],[355,1664,8,32],[426,1664,8,32],[497,1664,8,32],[568,1664,8,32],[639,1664,8,32],[710,1664,8,32],[781,1664,8,32],[852,1664,8,32],[923,1664,8,32],[994,1664,8,32],[1072,1664,8,32],[0,1792,8,32],[71,1792,8,32],[142,1792,8,32],[213,1792,8,32],[284,1792,8,32],[355,1792,8,32],[426,1792,8,32],[497,1792,8,32],[568,1792,8,32],[639,1792,8,32],[710,1792,8,32],[781,1792,8,32],[852,1792,8,32],[923,1792,8,32],[994,1792,8,32],[1072,1792,8,32],[0,1888,8,32],[71,1888,8,32],[142,1888,8,32],[213,1888,8,32],[284,1888,8,32],[355,1888,8,32],[426,1888,8,32],[497,1888,8,32],[568,1888,8,32],[639,1888,8,32],[710,1888,8,32],[781,1888,8,
32],[852,1888,8,32],[923,1888,8,32],[994,1888,8,32],[1072,1888,8,32]]" ValueDataType="JSON"/>
+                                <pbbase:AutomationParameter CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="CellAtStage" SimpleValue="True" ValueDataType="String"/>
+                                <pbbase:AutomationParameter CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="SkipAlignment" SimpleValue="False" ValueDataType="String"/>
+                                <pbbase:AutomationParameter CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="UseStageHotStart" SimpleValue="False" ValueDataType="String"/>
+                                <pbbase:AutomationParameter Name="CellNFCIndex" SimpleValue="1" ValueDataType="Int32"/>
+                                <pbbase:AutomationParameter Name="CollectionNumber" SimpleValue="0" ValueDataType="Int32"/>
+                                <pbbase:AutomationParameter Name="HasN2Switch" SimpleValue="False" ValueDataType="Boolean"/>
+                                <pbbase:AutomationParameter Name="TipSearchMaxDuration" SimpleValue="576" ValueDataType="Int32"/>
+                            </pbbase:AutomationParameters>
+                        </pbmeta:Automation>
+                        <pbmeta:CollectionNumber>0</pbmeta:CollectionNumber>
+                        <pbmeta:CellIndex>0</pbmeta:CellIndex>
+                        <pbmeta:SetNumber>0</pbmeta:SetNumber>
+                        <pbmeta:CellPac Barcode="BA233119" Description="The Sequel SMRT Cell 1M (4/Pack) are nanofabricated consumables each patterned with 1,000,000 wells called zero-mode waveguides (ZMWs). They are packaged together in a streamlined 4 SMRT Cell format (4/tray). One SMRT Cell is utilized in each sequencing reaction, and experiments can be run in single or in batch mode, giving your projects a greater level of flexibility." ExpirationDate="2018-06-08" LotNumber="321693" MovieTimeGrade="LR" Name="SMRT® Cell 1M v2 LR (4/Pack)" PartNumber="101-008-001" Version="2.0">
+                            <pbbase:ChipLayout>SequEL_4.0_RTO3</pbbase:ChipLayout>
+                        </pbmeta:CellPac>
+                        <pbmeta:ControlKit Barcode="444444101084300062030" ChipType="1mChip" Description="The Sequel DNA Internal Control 2.0 contains a fixed template of 2 kb length bound to Sequel Binding Kit 2.0 for use as an internal sequencing control. Reagent quantities provide spike-in controls for a minimum of 24 samples." ExpirationDate="2030-06-20" LotNumber="444444" Name="Sequel® DNA Internal Control 2.0" PartNumber="101-084-300" Tags="Control Kit, CCK" Version="2.0">
+                            <pbbase:CustomSequence>&gt;left_adapter\nTAGAGAGAGAAAAGGAGGAGGAGGCAACAACAACAACTCTCTCTA\n&gt;right_adapter\nTAGAGAGAGAAAAGGAGGAGGAGGCAACAACAACAACTCTCTCTA\n&gt;custom_sequence\nTGTCTAGGTCATCTCAACGTAGCTTTGACATATAACTTATCTAAAGTAATCCCTGCACACCTGTATGCATTATGCTTGCTATACACGCGGACACAGGCATATCATTTATTTTTTGCCATGTCCATTAATTGTTCAATAATTTTACTCACGGTATTTAATTTGATGTTGTGTTATATAGAATTGGAATTAAACTTATAAGGATGCTTAGACGTTGCATTATAAAAGTTTATGTACTAAGTATTTAAGACATTGGCATATGATTATAGCTTGACATTATTAAAAATTAATTAATTAAATCTCACACAATACTTATTCAAGACATTTTTACTAAGATAACCAAAGGAATGCGAACAAAATAATACTTAAAATATAAGACTTAGAAGTAATATGATCCAATAGTTACATATAGTACACTAAGTTCCTAAATTATATAACTTTAAAACAAAGTTACGAAATTTGGAAATAATTTTATTTAATCATATTTTCATAATAATGAAATACTGTTTATTTCAGTGGCGAAAAGAGATAATACGATTTTATAGTGATAGAATATCCTTGAAATATCTAAAGATAAAATTAGAAACTTTCTCTTTTCGCTGTAAAGCTATATGACTTAAAAATAACTTATACGCAAAGTATATTGCAGTGGAAACCCAAGAGTATAGTAGCCATGTAATCTCGGGTTCGAAACTACACGCCGCGCACGTAGTCAGATGGTCTGAAACTTGTCTGGGGCTGTTTGTTGACGGATGGAGACTTCACTAAGTGGCGTCAGGCGATGCGCACACACGGGACTCAATCCCGTAGCATGTTATGTGTCGTTCGAAACTCGTGCGTTCGAGATTTACGCCACATTGCCGGCTGGTCCAAGGACGTTATCTACCAGATGATACGGTCCAATTCGTAAGTTTGACTCACATAGTCGCGAACCGCGGAGCTGGAGAACAATAATTACCGGATGATTAGTTGACCATACGCACTATCATGCTCCGTGACTCAGTTTCCGCCATGGAGTTCTCACAGCCCCGTGTGTACCATAACTGCAGTAAGTAAGGACCTTGTTCGGAGGCCGACTCGTATTTCATATGATCTTAGTCTCGCCACCTTATCGCACGAATTGGGGGTGTCTTTTAGCCGACTCCGGCACGATCCGCCGGGAAGTTACTCGACCAGTTGCGGGACGCCCTAGTATGTTCGTATTACGTTCGATGCGTAAGCACCCCAGAGATTTTTGGCGGACGTTTCGGTAAATCATAGTAGAACCGGAGCGGTAAAGCTATTGATAACACGCAGGGACGAGCCAGTCGTCTAAGCTCCTCAGGGGTACCGTTCGCCGGACTACAGCCTGTCCCCGGCGGCCGCAACTGGGCTGCGATCCAGCCCCCGCTCCAAAAGGATGACTCGACCTTGCGCCTCGCGTACTCTGCTCTCGAGCTGTCTCCGTGGGCAATGCCGGCTCACGCTGTGGGGAACCCTGGACGCCCGGGCCGAGCCGACGTGGCCCCGCCCAGGCCTTTTCGTCGATCGCAGCTATGTACCCTGTGCTGGCCAGCGCTACTGCGCCGGCCATTAGCGGTGCGCTCTCGACTCGGCCCCAACGTAGACGGCGTCGCTGGCCGGATTCAAAGAAGTGAGCTACTACCATCGCGTGACGCCCTGCGGGCCTGAGTAACCGTGCACGAAGGACACCCCGTTCGTGGCGGGGGTTGCCTCCGCGACGGTCGCCAACGTTGGGGGTCGGTGCATTCAGGCGACGAGGGACCGCTGGTTTCCGGAGAGCGGCCTGTGCTCACACA
GGTGCGGTCCATGGGGCCTGTGGATCCGGTTCTCCCACGCGTAGCGCCGGCGTTAGCATGGACGCTAAATAAGTATACGCCGGCAAAGGGAGTGTAGGCCGGCCCGAGGGCAATCGCGGTTACCGGGGTGGGGGAGCTCCCCGCACCAGCCTTGATGTGGTGTGCGAGCG</pbbase:CustomSequence>
+                        </pbmeta:ControlKit>
+                        <pbmeta:TemplatePrepKit Barcode="333333100902400092215" Description="The SMRTbell® Template Prep Kit contains reagent supplies to perform SMRTbell library preparations of primer-annealed SMRTbell libraries for insert sizes ranging from 500 bp to over 20 kb." ExpirationDate="2015-09-22" IsRestricted="true" LotNumber="333333" MaxInsertSize="20000" MinInsertSize="500" Name="CRISPR/Cas9 Placeholder w/ asymmetric adapters" PartNumber="100-902-400" Tags="Template Prep Kit, TPK" Version="1.0">
+                            <pbbase:LeftAdaptorSequence>ATCTCTCTCAATTTTTTTTTTTTTTTTTTTTTTTAAGAGAGAGAT</pbbase:LeftAdaptorSequence>
+                            <pbbase:LeftPrimerSequence>aacggaggaggagga</pbbase:LeftPrimerSequence>
+                            <pbbase:RightAdaptorSequence>ATCTCTCTCAACAACAACAACGGAGGAGGAGGAAAAGAGAGAGAT</pbbase:RightAdaptorSequence>
+                            <pbbase:RightPrimerSequence>aacggaggaggagga</pbbase:RightPrimerSequence>
+                        </pbmeta:TemplatePrepKit>
+                        <pbmeta:BindingKit Barcode="777777100862200062030" ChipType="1mChip" Description="The Sequel Binding Kit 2.0 contains reagent supplies to bind prepared DNA template libraries to the Sequel Polymerase 2.0 in preparation for sequencing on the Sequel System. The result is a DNA polymerase/template complex. Sequel Binding Kit 1.0 should be used only with Sequel Sequencing Kit 2.0. Reagent quantities support 24 binding reactions." ExpirationDate="2030-06-20" LotNumber="777777" Name="Sequel® Binding Kit 2.0" PartNumber="100-862-200" Tags="Binding Kit, BDK" Version="2.0"/>
+                        <pbmeta:SequencingKitPlate Barcode="010765999861800030818" Description="The DNA Sequencing Kit contains a sequencing reagent plate with chemistry for single molecule real-time sequencing on the PacBio Sequel®. Reagent quantities support 8 sequencing reactions to be used in conjunction with SMRT® Cell 4Pac(s).  (8 Cells max/Each Row supplies reagents for 1 Sequel SMRT Cell)" ExpirationDate="2018-03-08" IsRestricted="true" LotNumber="010765" Name="Sequel® Dev Sequencing Plate" PartNumber="999-861-800" Tags="Sequencing Kit, SQK" Version="3.0">
+                            <pbrk:ReagentTubes Barcode="012197100619600033122" ExpirationDate="2022-03-31" LotNumber="012197" Name="Sequel® SMRT®Cell Oil" PartNumber="100-619-600"/>
+                        </pbmeta:SequencingKitPlate>
+                        <pbmeta:Primary>
+                            <pbmeta:AutomationName>SequelAlpha</pbmeta:AutomationName>
+                            <pbmeta:ConfigFileName>SqlPoC_SubCrf_2C2A-t2.xml</pbmeta:ConfigFileName>
+                            <pbmeta:OutputOptions>
+                                <pbmeta:ResultsFolder>326/3260208/r54075_20180905_221306/1_A01/</pbmeta:ResultsFolder>
+                                <pbmeta:CollectionPathUri>/pbi/collections/326/3260208/r54075_20180905_221306/1_A01/</pbmeta:CollectionPathUri>
+                                <pbmeta:CopyFiles>
+                                    <pbmeta:CollectionFileCopy>Trace</pbmeta:CollectionFileCopy>
+                                    <pbmeta:CollectionFileCopy>Fasta</pbmeta:CollectionFileCopy>
+                                    <pbmeta:CollectionFileCopy>Bam</pbmeta:CollectionFileCopy>
+                                    <pbmeta:CollectionFileCopy>StatsH5</pbmeta:CollectionFileCopy>
+                                </pbmeta:CopyFiles>
+                                <pbmeta:Readout>Pulses</pbmeta:Readout>
+                                <pbmeta:MetricsVerbosity>High</pbmeta:MetricsVerbosity>
+                                <pbmeta:TransferResource>
+                                    <pbmeta:Id>srs-pbi-collections</pbmeta:Id>
+                                    <pbmeta:TransferScheme>SRS</pbmeta:TransferScheme>
+                                    <pbmeta:Name>PBI Collections Xfer test using Rsync+SSH</pbmeta:Name>
+                                    <pbmeta:Description>Test Location for writing Transfer services to write to. Should be used by internal tools (PA SIM) and ICS tests</pbmeta:Description>
+                                    <pbmeta:DestPath>/pbi/collections</pbmeta:DestPath>
+                                </pbmeta:TransferResource>
+                            </pbmeta:OutputOptions>
+                        </pbmeta:Primary>
+                        <pbmeta:Secondary>
+                            <pbmeta:AutomationName>NoRS_Standard_Edna.1</pbmeta:AutomationName>
+                            <pbmeta:AutomationParameters>
+                                <pbmeta:AutomationParameter CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="Reference" SimpleValue="EColi5k_001_BsaI_AS_tc6_scr_tc6_unrolled_circular_12x_l123900" ValueDataType="String"/>
+                            </pbmeta:AutomationParameters>
+                            <pbmeta:CellCountInJob>0</pbmeta:CellCountInJob>
+                        </pbmeta:Secondary>
+                        <pbmeta:UserDefinedFields>
+                            <pbbase:DataEntities Name="LIMS_IMPORT" SimpleValue="1234TestFolder" ValueDataType="String"/>
+                            <pbbase:DataEntities Name="USER" SimpleValue="TestUser" ValueDataType="String"/>
+                            <pbbase:DataEntities ValueDataType="String"/>
+                            <pbbase:DataEntities ValueDataType="String"/>
+                            <pbbase:DataEntities ValueDataType="String"/>
+                            <pbbase:DataEntities ValueDataType="String"/>
+                        </pbmeta:UserDefinedFields>
+                        <pbmeta:ComponentVersions>
+                            <pbmeta:VersionInfo Name="ics" Version="6.0.0.SNAPSHOT39495"/>
+                            <pbmeta:VersionInfo Name="iui" Version="6.0.0.SNAPSHOT39495"/>
+                            <pbmeta:VersionInfo Name="chemistry" Version="6.0.0.SNAPSHOT39452"/>
+                            <pbmeta:VersionInfo Name="pa" Version="6.0.0.SNAPSHOT39495"/>
+                            <pbmeta:VersionInfo Name="paws" Version="6.0.0.SNAPSHOT39495"/>
+                            <pbmeta:VersionInfo Name="ppa" Version="6.0.0.SNAPSHOT39495"/>
+                            <pbmeta:VersionInfo Name="realtime" Version="6.0.0.SNAPSHOT39495"/>
+                            <pbmeta:VersionInfo Name="transfer" Version="6.0.0.SNAPSHOT39495"/>
+                            <pbmeta:VersionInfo Name="smrtlink-analysisservices-gui" Version="6.0.0.45618"/>
+                            <pbmeta:VersionInfo Name="smrtimisc" Version="6.0.0.45621"/>
+                            <pbmeta:VersionInfo Name="smrtlink" Version="6.0.0.45621"/>
+                            <pbmeta:VersionInfo Name="smrttools" Version="6.0.0.45580"/>
+                            <pbmeta:VersionInfo Name="smrtinub" Version="6.0.0.45580"/>
+                            <pbmeta:VersionInfo Name="smrtview" Version="6.0.0.45580"/>
+                        </pbmeta:ComponentVersions>
+                    </pbmeta:CollectionMetadata>
+                </pbmeta:Collections>
+            </pbds:DataSetMetadata>
+        </pbds:ConsensusReadSet>
+        <pbds:ConsensusReadSet CreatedAt="2018-09-28T08:29:36.624Z" MetaType="PacBio.DataSet.ConsensusReadSet" Name="3260208_188nM-GTAC_3xGCratio_LP7_100fps_15min_5kEColi_SP2p1_3uMSSB_BA243494-Cell1 (CCS)" Tags="subreadset,CCS" TimeStampedName="3260208_188nM-GTAC_3xGCratio_LP7_100fps_15min_5kEColi_SP2p1_3uMSSB_BA243494-Cell1 (CCS)-180928_082936624" UniqueId="a0204c3f-155b-cf03-fc07-968f07a686d4" Version="3.0.1" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDatasets.xsd">
+            <pbbase:ExternalResources>
+                <pbbase:ExternalResource MetaType="PacBio.ConsensusReadFile.ConsensusReadBamFile" ResourceId="m54075_180905_225130.ccs.bam" TimeStampedName="pacbio_consensusreadfile_consensusreadbamfile-180928_082936624" UniqueId="0e8b9a59-e58b-40e2-82f5-eadc84e9f98d" Version="3.0.1">
+                    <pbbase:FileIndices>
+                        <pbbase:FileIndex MetaType="PacBio.Index.PacBioIndex" ResourceId="m54075_180905_225130.ccs.bam.pbi" TimeStampedName="pacbio_index_pacbioindex-180928_082936624" UniqueId="04c14584-ed44-49bb-862a-1c9d8e1a958a" Version="3.0.1"/>
+                    </pbbase:FileIndices>
+                </pbbase:ExternalResource>
+            </pbbase:ExternalResources>
+            <pbds:DataSetMetadata>
+                <pbds:TotalLength>15995</pbds:TotalLength>
+                <pbds:NumRecords>12</pbds:NumRecords>
+                <pbmeta:Collections>
+                    <pbmeta:CollectionMetadata Context="m54075_180905_225130" CreatedAt="0001-01-01T00:00:00" InstrumentId="60001" InstrumentName="Inst60001" MetaType="CollectionMetadata" ModifiedAt="0001-01-01T00:00:00" Status="Ready" TimeStampedName="60001-CollectionMetadata-2015-07-09T09:30:47.833Z" UniqueId="cd8e9b35-e755-4b6b-9878-ba45e6700781">
+                        <pbmeta:InstCtrlVer>6.0.0.SNAPSHOT39495</pbmeta:InstCtrlVer>
+                        <pbmeta:SigProcVer>6.0.0.SNAPSHOT39495</pbmeta:SigProcVer>
+                        <pbmeta:RunDetails>
+                            <pbmeta:TimeStampedName>r54075_20180905_225050</pbmeta:TimeStampedName>
+                            <pbmeta:Name>3260208_188nM-GTAC_3xGCratio_LP7_100fps_15min_5kEColi_SP2p1_3uMSSB_BA243494</pbmeta:Name>
+                            <pbmeta:CreatedBy>unknown</pbmeta:CreatedBy>
+                            <pbmeta:WhenCreated>2015-07-09T09:30:47.833Z</pbmeta:WhenCreated>
+                            <pbmeta:StartedBy>unknown</pbmeta:StartedBy>
+                        </pbmeta:RunDetails>
+                        <pbmeta:WellSample CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="3260208_188nM-GTAC_3xGCratio_LP7_100fps_15min_5kEColi_SP2p1_3uMSSB_BA243494">
+                            <pbmeta:WellName>SamplePlate-1-A-1</pbmeta:WellName>
+                            <pbmeta:Concentration>0</pbmeta:Concentration>
+                            <pbmeta:SampleReuseEnabled>false</pbmeta:SampleReuseEnabled>
+                            <pbmeta:StageHotstartEnabled>false</pbmeta:StageHotstartEnabled>
+                            <pbmeta:SizeSelectionEnabled>false</pbmeta:SizeSelectionEnabled>
+                            <pbmeta:UseCount>0</pbmeta:UseCount>
+                        </pbmeta:WellSample>
+                        <pbmeta:Automation Name="manualcellprep.py">
+                            <pbbase:AutomationParameters>
+                                <pbbase:AutomationParameter CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="CouplerLaserPower" SimpleValue="7" ValueDataType="Double"/>
+                                <pbbase:AutomationParameter CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="MovieLength" SimpleValue="15" ValueDataType="Double"/>
+                                <pbbase:AutomationParameter CreatedAt="2018-03-19T20:02:47.574Z" ModifiedAt="0001-01-01T00:00:00" Name="Exposure" SimpleValue="0.01" ValueDataType="String"/>
+                                <pbbase:AutomationParameter CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="SNRCut" SimpleValue="2.5" ValueDataType="Double"/>
+                                <pbbase:AutomationParameter CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="ActiveLaserPower" SimpleValue="False" ValueDataType="String"/>
+                                <pbbase:AutomationParameter CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="sequencingPixelROI" SimpleValue="[[0,0,1080,1920]]" ValueDataType="JSON"/>
+                                <pbbase:AutomationParameter CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="traceFilePixelROI" SimpleValue="[[0,0,8,32],[71,0,8,32],[142,0,8,32],[213,0,8,32],[284,0,8,32],[355,0,8,32],[426,0,8,32],[497,0,8,32],[568,0,8,32],[639,0,8,32],[710,0,8,32],[781,0,8,32],[852,0,8,32],[923,0,8,32],[994,0,8,32],[1072,0,8,32],[0,128,8,32],[71,128,8,32],[142,128,8,32],[213,128,8,32],[284,128,8,32],[355,128,8,32],[426,128,8,32],[497,128,8,32],[568,128,8,32],[639,128,8,32],[710,128,8,32],[781,128,8,32],[852,128,8,32],[923,128,8,32],[994,128,8,32],[1072,128,8,32],[0,256,8,32],[71,256,8,32],[142,256,8,32],[213,256,8,32],[284,256,8,32],[355,256,8,32],[426,256,8,32],[497,256,8,32],[568,256,8,32],[639,256,8,32],[710,256,8,32],[781,256,8,32],[852,256,8,32],[923,256,8,32],[994,256,8,32],[1072,256,8,32],[0,384,8,32],[71,384,8,32],[142,384,8,32],[213,384,8,32],[284,384,8,32],[355,384,8,32],[426,384,8,32],[497,384,8,32],[568,384,8,32],[639,384,8,32],[710,384,8,32],[781,384,8,32],[852,384,8,32],[923,384,8,32],[994,384,8,32],[1072,384,8,32],[0,512,8,32],[71,512,8,32],[142,512,8,32],[213,512,8,32],[284,512,8,32],[355,512,8,32],[426,512,8,32],[497,512,8,32],[568,512,8,32],[639,512,8,32],[710,512,8,32],[781,512,8,32],[852,512,8,32],[923,512,8,32],[994,512,8,32],[1072,512,8,32],[0,640,8,32],[71,640,8,32],[142,640,8,32],[213,640,8,32],[284,640,8,32],[355,640,8,32],[426,640,8,32],[497,640,8,32],[568,640,8,32],[639,640,8,32],[710,640,8,32],[781,640,8,32],[852,640,8,32],[923,640,8,32],[994,640,8,32],[1072,640,8,32],[0,768,8,32],[71,768,8,32],[142,768,8,32],[213,768,8,32],[284,768,8,32],[355,768,8,32],[426,768,8,32],[497,768,8,32],[568,768,8,32],[639,768,8,32],[710,768,8,32],[781,768,8,32],[852,768,8,32],[923,768,8,32],[994,768,8,32],[1072,768,8,32],[0,896,8,32],[71,896,8,32],[142,896,8,32],[213,896,8,32],[284,896,8,32],[355,896,8,32],[426,896,8,32],[497,896,8,32],[568,896,8,32],[639,896,8,32],[710,896,8,32],[781,896,8,32],[852,896,8,32],[923,896,
8,32],[994,896,8,32],[1072,896,8,32],[0,1024,8,32],[71,1024,8,32],[142,1024,8,32],[213,1024,8,32],[284,1024,8,32],[355,1024,8,32],[426,1024,8,32],[497,1024,8,32],[568,1024,8,32],[639,1024,8,32],[710,1024,8,32],[781,1024,8,32],[852,1024,8,32],[923,1024,8,32],[994,1024,8,32],[1072,1024,8,32],[0,1152,8,32],[71,1152,8,32],[142,1152,8,32],[213,1152,8,32],[284,1152,8,32],[355,1152,8,32],[426,1152,8,32],[497,1152,8,32],[568,1152,8,32],[639,1152,8,32],[710,1152,8,32],[781,1152,8,32],[852,1152,8,32],[923,1152,8,32],[994,1152,8,32],[1072,1152,8,32],[0,1280,8,32],[71,1280,8,32],[142,1280,8,32],[213,1280,8,32],[284,1280,8,32],[355,1280,8,32],[426,1280,8,32],[497,1280,8,32],[568,1280,8,32],[639,1280,8,32],[710,1280,8,32],[781,1280,8,32],[852,1280,8,32],[923,1280,8,32],[994,1280,8,32],[1072,1280,8,32],[0,1408,8,32],[71,1408,8,32],[142,1408,8,32],[213,1408,8,32],[284,1408,8,32],[355,1408,8,32],[426,1408,8,32],[497,1408,8,32],[568,1408,8,32],[639,1408,8,32],[710,1408,8,32],[781,1408,8,32],[852,1408,8,32],[923,1408,8,32],[994,1408,8,32],[1072,1408,8,32],[0,1536,8,32],[71,1536,8,32],[142,1536,8,32],[213,1536,8,32],[284,1536,8,32],[355,1536,8,32],[426,1536,8,32],[497,1536,8,32],[568,1536,8,32],[639,1536,8,32],[710,1536,8,32],[781,1536,8,32],[852,1536,8,32],[923,1536,8,32],[994,1536,8,32],[1072,1536,8,32],[0,1664,8,32],[71,1664,8,32],[142,1664,8,32],[213,1664,8,32],[284,1664,8,32],[355,1664,8,32],[426,1664,8,32],[497,1664,8,32],[568,1664,8,32],[639,1664,8,32],[710,1664,8,32],[781,1664,8,32],[852,1664,8,32],[923,1664,8,32],[994,1664,8,32],[1072,1664,8,32],[0,1792,8,32],[71,1792,8,32],[142,1792,8,32],[213,1792,8,32],[284,1792,8,32],[355,1792,8,32],[426,1792,8,32],[497,1792,8,32],[568,1792,8,32],[639,1792,8,32],[710,1792,8,32],[781,1792,8,32],[852,1792,8,32],[923,1792,8,32],[994,1792,8,32],[1072,1792,8,32],[0,1888,8,32],[71,1888,8,32],[142,1888,8,32],[213,1888,8,32],[284,1888,8,32],[355,1888,8,32],[426,1888,8,32],[497,1888,8,32],[568,1888,8,32],[639,1888,8,32],[710,1888,8,32],[781,1888,8,
32],[852,1888,8,32],[923,1888,8,32],[994,1888,8,32],[1072,1888,8,32]]" ValueDataType="JSON"/>
+                                <pbbase:AutomationParameter CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="CellAtStage" SimpleValue="True" ValueDataType="String"/>
+                                <pbbase:AutomationParameter CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="SkipAlignment" SimpleValue="False" ValueDataType="String"/>
+                                <pbbase:AutomationParameter CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="UseStageHotStart" SimpleValue="False" ValueDataType="String"/>
+                                <pbbase:AutomationParameter Name="CellNFCIndex" SimpleValue="1" ValueDataType="Int32"/>
+                                <pbbase:AutomationParameter Name="CollectionNumber" SimpleValue="0" ValueDataType="Int32"/>
+                                <pbbase:AutomationParameter Name="HasN2Switch" SimpleValue="False" ValueDataType="Boolean"/>
+                                <pbbase:AutomationParameter Name="TipSearchMaxDuration" SimpleValue="576" ValueDataType="Int32"/>
+                            </pbbase:AutomationParameters>
+                        </pbmeta:Automation>
+                        <pbmeta:CollectionNumber>0</pbmeta:CollectionNumber>
+                        <pbmeta:CellIndex>0</pbmeta:CellIndex>
+                        <pbmeta:SetNumber>0</pbmeta:SetNumber>
+                        <pbmeta:CellPac Barcode="BA233119" Description="The Sequel SMRT Cell 1M (4/Pack) are nanofabricated consumables each patterned with 1,000,000 wells called zero-mode waveguides (ZMWs). They are packaged together in a streamlined 4 SMRT Cell format (4/tray). One SMRT Cell is utilized in each sequencing reaction, and experiments can be run in single or in batch mode, giving your projects a greater level of flexibility." ExpirationDate="2018-06-08" LotNumber="321693" MovieTimeGrade="LR" Name="SMRT® Cell 1M v2 LR (4/Pack)" PartNumber="101-008-001" Version="2.0">
+                            <pbbase:ChipLayout>SequEL_4.0_RTO3</pbbase:ChipLayout>
+                        </pbmeta:CellPac>
+                        <pbmeta:ControlKit Barcode="444444101084300062030" ChipType="1mChip" Description="The Sequel DNA Internal Control 2.0 contains a fixed template of 2 kb length bound to Sequel Binding Kit 2.0 for use as an internal sequencing control. Reagent quantities provide spike-in controls for a minimum of 24 samples." ExpirationDate="2030-06-20" LotNumber="444444" Name="Sequel® DNA Internal Control 2.0" PartNumber="101-084-300" Tags="Control Kit, CCK" Version="2.0">
+                            <pbbase:CustomSequence>&gt;left_adapter\nTAGAGAGAGAAAAGGAGGAGGAGGCAACAACAACAACTCTCTCTA\n&gt;right_adapter\nTAGAGAGAGAAAAGGAGGAGGAGGCAACAACAACAACTCTCTCTA\n&gt;custom_sequence\nTGTCTAGGTCATCTCAACGTAGCTTTGACATATAACTTATCTAAAGTAATCCCTGCACACCTGTATGCATTATGCTTGCTATACACGCGGACACAGGCATATCATTTATTTTTTGCCATGTCCATTAATTGTTCAATAATTTTACTCACGGTATTTAATTTGATGTTGTGTTATATAGAATTGGAATTAAACTTATAAGGATGCTTAGACGTTGCATTATAAAAGTTTATGTACTAAGTATTTAAGACATTGGCATATGATTATAGCTTGACATTATTAAAAATTAATTAATTAAATCTCACACAATACTTATTCAAGACATTTTTACTAAGATAACCAAAGGAATGCGAACAAAATAATACTTAAAATATAAGACTTAGAAGTAATATGATCCAATAGTTACATATAGTACACTAAGTTCCTAAATTATATAACTTTAAAACAAAGTTACGAAATTTGGAAATAATTTTATTTAATCATATTTTCATAATAATGAAATACTGTTTATTTCAGTGGCGAAAAGAGATAATACGATTTTATAGTGATAGAATATCCTTGAAATATCTAAAGATAAAATTAGAAACTTTCTCTTTTCGCTGTAAAGCTATATGACTTAAAAATAACTTATACGCAAAGTATATTGCAGTGGAAACCCAAGAGTATAGTAGCCATGTAATCTCGGGTTCGAAACTACACGCCGCGCACGTAGTCAGATGGTCTGAAACTTGTCTGGGGCTGTTTGTTGACGGATGGAGACTTCACTAAGTGGCGTCAGGCGATGCGCACACACGGGACTCAATCCCGTAGCATGTTATGTGTCGTTCGAAACTCGTGCGTTCGAGATTTACGCCACATTGCCGGCTGGTCCAAGGACGTTATCTACCAGATGATACGGTCCAATTCGTAAGTTTGACTCACATAGTCGCGAACCGCGGAGCTGGAGAACAATAATTACCGGATGATTAGTTGACCATACGCACTATCATGCTCCGTGACTCAGTTTCCGCCATGGAGTTCTCACAGCCCCGTGTGTACCATAACTGCAGTAAGTAAGGACCTTGTTCGGAGGCCGACTCGTATTTCATATGATCTTAGTCTCGCCACCTTATCGCACGAATTGGGGGTGTCTTTTAGCCGACTCCGGCACGATCCGCCGGGAAGTTACTCGACCAGTTGCGGGACGCCCTAGTATGTTCGTATTACGTTCGATGCGTAAGCACCCCAGAGATTTTTGGCGGACGTTTCGGTAAATCATAGTAGAACCGGAGCGGTAAAGCTATTGATAACACGCAGGGACGAGCCAGTCGTCTAAGCTCCTCAGGGGTACCGTTCGCCGGACTACAGCCTGTCCCCGGCGGCCGCAACTGGGCTGCGATCCAGCCCCCGCTCCAAAAGGATGACTCGACCTTGCGCCTCGCGTACTCTGCTCTCGAGCTGTCTCCGTGGGCAATGCCGGCTCACGCTGTGGGGAACCCTGGACGCCCGGGCCGAGCCGACGTGGCCCCGCCCAGGCCTTTTCGTCGATCGCAGCTATGTACCCTGTGCTGGCCAGCGCTACTGCGCCGGCCATTAGCGGTGCGCTCTCGACTCGGCCCCAACGTAGACGGCGTCGCTGGCCGGATTCAAAGAAGTGAGCTACTACCATCGCGTGACGCCCTGCGGGCCTGAGTAACCGTGCACGAAGGACACCCCGTTCGTGGCGGGGGTTGCCTCCGCGACGGTCGCCAACGTTGGGGGTCGGTGCATTCAGGCGACGAGGGACCGCTGGTTTCCGGAGAGCGGCCTGTGCTCACACA
GGTGCGGTCCATGGGGCCTGTGGATCCGGTTCTCCCACGCGTAGCGCCGGCGTTAGCATGGACGCTAAATAAGTATACGCCGGCAAAGGGAGTGTAGGCCGGCCCGAGGGCAATCGCGGTTACCGGGGTGGGGGAGCTCCCCGCACCAGCCTTGATGTGGTGTGCGAGCG</pbbase:CustomSequence>
+                        </pbmeta:ControlKit>
+                        <pbmeta:TemplatePrepKit Barcode="333333100902400092215" Description="The SMRTbell® Template Prep Kit contains reagent supplies to perform SMRTbell library preparations of primer-annealed SMRTbell libraries for insert sizes ranging from 500 bp to over 20 kb." ExpirationDate="2015-09-22" IsRestricted="true" LotNumber="333333" MaxInsertSize="20000" MinInsertSize="500" Name="CRISPR/Cas9 Placeholder w/ asymmetric adapters" PartNumber="100-902-400" Tags="Template Prep Kit, TPK" Version="1.0">
+                            <pbbase:LeftAdaptorSequence>ATCTCTCTCAATTTTTTTTTTTTTTTTTTTTTTTAAGAGAGAGAT</pbbase:LeftAdaptorSequence>
+                            <pbbase:LeftPrimerSequence>aacggaggaggagga</pbbase:LeftPrimerSequence>
+                            <pbbase:RightAdaptorSequence>ATCTCTCTCAACAACAACAACGGAGGAGGAGGAAAAGAGAGAGAT</pbbase:RightAdaptorSequence>
+                            <pbbase:RightPrimerSequence>aacggaggaggagga</pbbase:RightPrimerSequence>
+                        </pbmeta:TemplatePrepKit>
+                        <pbmeta:BindingKit Barcode="777777100862200062030" ChipType="1mChip" Description="The Sequel Binding Kit 2.0 contains reagent supplies to bind prepared DNA template libraries to the Sequel Polymerase 2.0 in preparation for sequencing on the Sequel System. The result is a DNA polymerase/template complex. Sequel Binding Kit 1.0 should be used only with Sequel Sequencing Kit 2.0. Reagent quantities support 24 binding reactions." ExpirationDate="2030-06-20" LotNumber="777777" Name="Sequel® Binding Kit 2.0" PartNumber="100-862-200" Tags="Binding Kit, BDK" Version="2.0"/>
+                        <pbmeta:SequencingKitPlate Barcode="010765999861800030818" Description="The DNA Sequencing Kit contains a sequencing reagent plate with chemistry for single molecule real-time sequencing on the PacBio Sequel®. Reagent quantities support 8 sequencing reactions to be used in conjunction with SMRT® Cell 4Pac(s).  (8 Cells max/Each Row supplies reagents for 1 Sequel SMRT Cell)" ExpirationDate="2018-03-08" IsRestricted="true" LotNumber="010765" Name="Sequel® Dev Sequencing Plate" PartNumber="999-861-800" Tags="Sequencing Kit, SQK" Version="3.0">
+                            <pbrk:ReagentTubes Barcode="012197100619600033122" ExpirationDate="2022-03-31" LotNumber="012197" Name="Sequel® SMRT®Cell Oil" PartNumber="100-619-600"/>
+                        </pbmeta:SequencingKitPlate>
+                        <pbmeta:Primary>
+                            <pbmeta:AutomationName>SequelAlpha</pbmeta:AutomationName>
+                            <pbmeta:ConfigFileName>SqlPoC_SubCrf_2C2A-t2.xml</pbmeta:ConfigFileName>
+                            <pbmeta:OutputOptions>
+                                <pbmeta:ResultsFolder>326/3260208/r54075_20180905_225050/1_A01/</pbmeta:ResultsFolder>
+                                <pbmeta:CollectionPathUri>/pbi/collections/326/3260208/r54075_20180905_225050/1_A01/</pbmeta:CollectionPathUri>
+                                <pbmeta:CopyFiles>
+                                    <pbmeta:CollectionFileCopy>Trace</pbmeta:CollectionFileCopy>
+                                    <pbmeta:CollectionFileCopy>Fasta</pbmeta:CollectionFileCopy>
+                                    <pbmeta:CollectionFileCopy>Bam</pbmeta:CollectionFileCopy>
+                                    <pbmeta:CollectionFileCopy>StatsH5</pbmeta:CollectionFileCopy>
+                                </pbmeta:CopyFiles>
+                                <pbmeta:Readout>Pulses</pbmeta:Readout>
+                                <pbmeta:MetricsVerbosity>High</pbmeta:MetricsVerbosity>
+                                <pbmeta:TransferResource>
+                                    <pbmeta:Id>srs-pbi-collections</pbmeta:Id>
+                                    <pbmeta:TransferScheme>SRS</pbmeta:TransferScheme>
+                                    <pbmeta:Name>PBI Collections Xfer test using Rsync+SSH</pbmeta:Name>
+                                    <pbmeta:Description>Test Location for writing Transfer services to write to. Should be used by internal tools (PA SIM) and ICS tests</pbmeta:Description>
+                                    <pbmeta:DestPath>/pbi/collections</pbmeta:DestPath>
+                                </pbmeta:TransferResource>
+                            </pbmeta:OutputOptions>
+                        </pbmeta:Primary>
+                        <pbmeta:Secondary>
+                            <pbmeta:AutomationName>NoRS_Standard_Edna.1</pbmeta:AutomationName>
+                            <pbmeta:AutomationParameters>
+                                <pbmeta:AutomationParameter CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="Reference" SimpleValue="EColi5k_001_BsaI_AS_tc6_scr_tc6_unrolled_circular_12x_l123900" ValueDataType="String"/>
+                            </pbmeta:AutomationParameters>
+                            <pbmeta:CellCountInJob>0</pbmeta:CellCountInJob>
+                        </pbmeta:Secondary>
+                        <pbmeta:UserDefinedFields>
+                            <pbbase:DataEntities Name="LIMS_IMPORT" SimpleValue="1234TestFolder" ValueDataType="String"/>
+                            <pbbase:DataEntities Name="USER" SimpleValue="TestUser" ValueDataType="String"/>
+                            <pbbase:DataEntities ValueDataType="String"/>
+                            <pbbase:DataEntities ValueDataType="String"/>
+                            <pbbase:DataEntities ValueDataType="String"/>
+                            <pbbase:DataEntities ValueDataType="String"/>
+                        </pbmeta:UserDefinedFields>
+                        <pbmeta:ComponentVersions>
+                            <pbmeta:VersionInfo Name="ics" Version="6.0.0.SNAPSHOT39495"/>
+                            <pbmeta:VersionInfo Name="iui" Version="6.0.0.SNAPSHOT39495"/>
+                            <pbmeta:VersionInfo Name="chemistry" Version="6.0.0.SNAPSHOT39452"/>
+                            <pbmeta:VersionInfo Name="pa" Version="6.0.0.SNAPSHOT39495"/>
+                            <pbmeta:VersionInfo Name="paws" Version="6.0.0.SNAPSHOT39495"/>
+                            <pbmeta:VersionInfo Name="ppa" Version="6.0.0.SNAPSHOT39495"/>
+                            <pbmeta:VersionInfo Name="realtime" Version="6.0.0.SNAPSHOT39495"/>
+                            <pbmeta:VersionInfo Name="transfer" Version="6.0.0.SNAPSHOT39495"/>
+                            <pbmeta:VersionInfo Name="smrtlink-analysisservices-gui" Version="6.0.0.45618"/>
+                            <pbmeta:VersionInfo Name="smrtimisc" Version="6.0.0.45621"/>
+                            <pbmeta:VersionInfo Name="smrtlink" Version="6.0.0.45621"/>
+                            <pbmeta:VersionInfo Name="smrttools" Version="6.0.0.45580"/>
+                            <pbmeta:VersionInfo Name="smrtinub" Version="6.0.0.45580"/>
+                            <pbmeta:VersionInfo Name="smrtview" Version="6.0.0.45580"/>
+                        </pbmeta:ComponentVersions>
+                    </pbmeta:CollectionMetadata>
+                </pbmeta:Collections>
+            </pbds:DataSetMetadata>
+        </pbds:ConsensusReadSet>
+        <pbds:ConsensusReadSet CreatedAt="2018-09-28T08:30:49.559Z" MetaType="PacBio.DataSet.ConsensusReadSet" Name="3260208_188nM-GTAC_4xGCratio_LP7_100fps_15min_5kEColi_SP2p1_3uMSSB_BA243494-Cell1 (CCS)" Tags="subreadset,CCS" TimeStampedName="3260208_188nM-GTAC_4xGCratio_LP7_100fps_15min_5kEColi_SP2p1_3uMSSB_BA243494-Cell1 (CCS)-180928_083049559" UniqueId="a76370b8-88e7-99d5-98d7-37022ec9ffc6" Version="3.0.1" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDatasets.xsd">
+            <pbbase:ExternalResources>
+                <pbbase:ExternalResource MetaType="PacBio.ConsensusReadFile.ConsensusReadBamFile" ResourceId="m54075_180905_233034.ccs.bam" TimeStampedName="pacbio_consensusreadfile_consensusreadbamfile-180928_083049559" UniqueId="6d3b4190-ad18-4b7a-a0f0-567f0d964d2b" Version="3.0.1">
+                    <pbbase:FileIndices>
+                        <pbbase:FileIndex MetaType="PacBio.Index.PacBioIndex" ResourceId="m54075_180905_233034.ccs.bam.pbi" TimeStampedName="pacbio_index_pacbioindex-180928_083049559" UniqueId="5797a188-12b9-4b72-b779-d904289ce945" Version="3.0.1"/>
+                    </pbbase:FileIndices>
+                </pbbase:ExternalResource>
+            </pbbase:ExternalResources>
+            <pbds:DataSetMetadata>
+                <pbds:TotalLength>13118</pbds:TotalLength>
+                <pbds:NumRecords>12</pbds:NumRecords>
+                <pbmeta:Collections>
+                    <pbmeta:CollectionMetadata Context="m54075_180905_233034" CreatedAt="0001-01-01T00:00:00" InstrumentId="60001" InstrumentName="Inst60001" MetaType="CollectionMetadata" ModifiedAt="0001-01-01T00:00:00" Status="Ready" TimeStampedName="60001-CollectionMetadata-2015-07-09T09:30:47.833Z" UniqueId="ca4bfe85-3047-433f-a9ce-e8cb63d8b27f">
+                        <pbmeta:InstCtrlVer>6.0.0.SNAPSHOT39495</pbmeta:InstCtrlVer>
+                        <pbmeta:SigProcVer>6.0.0.SNAPSHOT39495</pbmeta:SigProcVer>
+                        <pbmeta:RunDetails>
+                            <pbmeta:TimeStampedName>r54075_20180905_232954</pbmeta:TimeStampedName>
+                            <pbmeta:Name>3260208_188nM-GTAC_4xGCratio_LP7_100fps_15min_5kEColi_SP2p1_3uMSSB_BA243494</pbmeta:Name>
+                            <pbmeta:CreatedBy>unknown</pbmeta:CreatedBy>
+                            <pbmeta:WhenCreated>2015-07-09T09:30:47.833Z</pbmeta:WhenCreated>
+                            <pbmeta:StartedBy>unknown</pbmeta:StartedBy>
+                        </pbmeta:RunDetails>
+                        <pbmeta:WellSample CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="3260208_188nM-GTAC_4xGCratio_LP7_100fps_15min_5kEColi_SP2p1_3uMSSB_BA243494">
+                            <pbmeta:WellName>SamplePlate-1-A-1</pbmeta:WellName>
+                            <pbmeta:Concentration>0</pbmeta:Concentration>
+                            <pbmeta:SampleReuseEnabled>false</pbmeta:SampleReuseEnabled>
+                            <pbmeta:StageHotstartEnabled>false</pbmeta:StageHotstartEnabled>
+                            <pbmeta:SizeSelectionEnabled>false</pbmeta:SizeSelectionEnabled>
+                            <pbmeta:UseCount>0</pbmeta:UseCount>
+                        </pbmeta:WellSample>
+                        <pbmeta:Automation Name="manualcellprep.py">
+                            <pbbase:AutomationParameters>
+                                <pbbase:AutomationParameter CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="CouplerLaserPower" SimpleValue="7" ValueDataType="Double"/>
+                                <pbbase:AutomationParameter CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="MovieLength" SimpleValue="15" ValueDataType="Double"/>
+                                <pbbase:AutomationParameter CreatedAt="2018-03-19T20:02:47.574Z" ModifiedAt="0001-01-01T00:00:00" Name="Exposure" SimpleValue="0.01" ValueDataType="String"/>
+                                <pbbase:AutomationParameter CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="SNRCut" SimpleValue="2.5" ValueDataType="Double"/>
+                                <pbbase:AutomationParameter CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="ActiveLaserPower" SimpleValue="False" ValueDataType="String"/>
+                                <pbbase:AutomationParameter CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="sequencingPixelROI" SimpleValue="[[0,0,1080,1920]]" ValueDataType="JSON"/>
+                                <pbbase:AutomationParameter CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="traceFilePixelROI" SimpleValue="[[0,0,8,32],[71,0,8,32],[142,0,8,32],[213,0,8,32],[284,0,8,32],[355,0,8,32],[426,0,8,32],[497,0,8,32],[568,0,8,32],[639,0,8,32],[710,0,8,32],[781,0,8,32],[852,0,8,32],[923,0,8,32],[994,0,8,32],[1072,0,8,32],[0,128,8,32],[71,128,8,32],[142,128,8,32],[213,128,8,32],[284,128,8,32],[355,128,8,32],[426,128,8,32],[497,128,8,32],[568,128,8,32],[639,128,8,32],[710,128,8,32],[781,128,8,32],[852,128,8,32],[923,128,8,32],[994,128,8,32],[1072,128,8,32],[0,256,8,32],[71,256,8,32],[142,256,8,32],[213,256,8,32],[284,256,8,32],[355,256,8,32],[426,256,8,32],[497,256,8,32],[568,256,8,32],[639,256,8,32],[710,256,8,32],[781,256,8,32],[852,256,8,32],[923,256,8,32],[994,256,8,32],[1072,256,8,32],[0,384,8,32],[71,384,8,32],[142,384,8,32],[213,384,8,32],[284,384,8,32],[355,384,8,32],[426,384,8,32],[497,384,8,32],[568,384,8,32],[639,384,8,32],[710,384,8,32],[781,384,8,32],[852,384,8,32],[923,384,8,32],[994,384,8,32],[1072,384,8,32],[0,512,8,32],[71,512,8,32],[142,512,8,32],[213,512,8,32],[284,512,8,32],[355,512,8,32],[426,512,8,32],[497,512,8,32],[568,512,8,32],[639,512,8,32],[710,512,8,32],[781,512,8,32],[852,512,8,32],[923,512,8,32],[994,512,8,32],[1072,512,8,32],[0,640,8,32],[71,640,8,32],[142,640,8,32],[213,640,8,32],[284,640,8,32],[355,640,8,32],[426,640,8,32],[497,640,8,32],[568,640,8,32],[639,640,8,32],[710,640,8,32],[781,640,8,32],[852,640,8,32],[923,640,8,32],[994,640,8,32],[1072,640,8,32],[0,768,8,32],[71,768,8,32],[142,768,8,32],[213,768,8,32],[284,768,8,32],[355,768,8,32],[426,768,8,32],[497,768,8,32],[568,768,8,32],[639,768,8,32],[710,768,8,32],[781,768,8,32],[852,768,8,32],[923,768,8,32],[994,768,8,32],[1072,768,8,32],[0,896,8,32],[71,896,8,32],[142,896,8,32],[213,896,8,32],[284,896,8,32],[355,896,8,32],[426,896,8,32],[497,896,8,32],[568,896,8,32],[639,896,8,32],[710,896,8,32],[781,896,8,32],[852,896,8,32],[923,896,
8,32],[994,896,8,32],[1072,896,8,32],[0,1024,8,32],[71,1024,8,32],[142,1024,8,32],[213,1024,8,32],[284,1024,8,32],[355,1024,8,32],[426,1024,8,32],[497,1024,8,32],[568,1024,8,32],[639,1024,8,32],[710,1024,8,32],[781,1024,8,32],[852,1024,8,32],[923,1024,8,32],[994,1024,8,32],[1072,1024,8,32],[0,1152,8,32],[71,1152,8,32],[142,1152,8,32],[213,1152,8,32],[284,1152,8,32],[355,1152,8,32],[426,1152,8,32],[497,1152,8,32],[568,1152,8,32],[639,1152,8,32],[710,1152,8,32],[781,1152,8,32],[852,1152,8,32],[923,1152,8,32],[994,1152,8,32],[1072,1152,8,32],[0,1280,8,32],[71,1280,8,32],[142,1280,8,32],[213,1280,8,32],[284,1280,8,32],[355,1280,8,32],[426,1280,8,32],[497,1280,8,32],[568,1280,8,32],[639,1280,8,32],[710,1280,8,32],[781,1280,8,32],[852,1280,8,32],[923,1280,8,32],[994,1280,8,32],[1072,1280,8,32],[0,1408,8,32],[71,1408,8,32],[142,1408,8,32],[213,1408,8,32],[284,1408,8,32],[355,1408,8,32],[426,1408,8,32],[497,1408,8,32],[568,1408,8,32],[639,1408,8,32],[710,1408,8,32],[781,1408,8,32],[852,1408,8,32],[923,1408,8,32],[994,1408,8,32],[1072,1408,8,32],[0,1536,8,32],[71,1536,8,32],[142,1536,8,32],[213,1536,8,32],[284,1536,8,32],[355,1536,8,32],[426,1536,8,32],[497,1536,8,32],[568,1536,8,32],[639,1536,8,32],[710,1536,8,32],[781,1536,8,32],[852,1536,8,32],[923,1536,8,32],[994,1536,8,32],[1072,1536,8,32],[0,1664,8,32],[71,1664,8,32],[142,1664,8,32],[213,1664,8,32],[284,1664,8,32],[355,1664,8,32],[426,1664,8,32],[497,1664,8,32],[568,1664,8,32],[639,1664,8,32],[710,1664,8,32],[781,1664,8,32],[852,1664,8,32],[923,1664,8,32],[994,1664,8,32],[1072,1664,8,32],[0,1792,8,32],[71,1792,8,32],[142,1792,8,32],[213,1792,8,32],[284,1792,8,32],[355,1792,8,32],[426,1792,8,32],[497,1792,8,32],[568,1792,8,32],[639,1792,8,32],[710,1792,8,32],[781,1792,8,32],[852,1792,8,32],[923,1792,8,32],[994,1792,8,32],[1072,1792,8,32],[0,1888,8,32],[71,1888,8,32],[142,1888,8,32],[213,1888,8,32],[284,1888,8,32],[355,1888,8,32],[426,1888,8,32],[497,1888,8,32],[568,1888,8,32],[639,1888,8,32],[710,1888,8,32],[781,1888,8,
32],[852,1888,8,32],[923,1888,8,32],[994,1888,8,32],[1072,1888,8,32]]" ValueDataType="JSON"/>
+                                <pbbase:AutomationParameter CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="CellAtStage" SimpleValue="True" ValueDataType="String"/>
+                                <pbbase:AutomationParameter CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="SkipAlignment" SimpleValue="False" ValueDataType="String"/>
+                                <pbbase:AutomationParameter CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="UseStageHotStart" SimpleValue="False" ValueDataType="String"/>
+                                <pbbase:AutomationParameter Name="CellNFCIndex" SimpleValue="1" ValueDataType="Int32"/>
+                                <pbbase:AutomationParameter Name="CollectionNumber" SimpleValue="0" ValueDataType="Int32"/>
+                                <pbbase:AutomationParameter Name="HasN2Switch" SimpleValue="False" ValueDataType="Boolean"/>
+                                <pbbase:AutomationParameter Name="TipSearchMaxDuration" SimpleValue="576" ValueDataType="Int32"/>
+                            </pbbase:AutomationParameters>
+                        </pbmeta:Automation>
+                        <pbmeta:CollectionNumber>0</pbmeta:CollectionNumber>
+                        <pbmeta:CellIndex>0</pbmeta:CellIndex>
+                        <pbmeta:SetNumber>0</pbmeta:SetNumber>
+                        <pbmeta:CellPac Barcode="BA233119" Description="The Sequel SMRT Cell 1M (4/Pack) are nanofabricated consumables each patterned with 1,000,000 wells called zero-mode waveguides (ZMWs). They are packaged together in a streamlined 4 SMRT Cell format (4/tray). One SMRT Cell is utilized in each sequencing reaction, and experiments can be run in single or in batch mode, giving your projects a greater level of flexibility." ExpirationDate="2018-06-08" LotNumber="321693" MovieTimeGrade="LR" Name="SMRT® Cell 1M v2 LR (4/Pack)" PartNumber="101-008-001" Version="2.0">
+                            <pbbase:ChipLayout>SequEL_4.0_RTO3</pbbase:ChipLayout>
+                        </pbmeta:CellPac>
+                        <pbmeta:ControlKit Barcode="444444101084300062030" ChipType="1mChip" Description="The Sequel DNA Internal Control 2.0 contains a fixed template of 2 kb length bound to Sequel Binding Kit 2.0 for use as an internal sequencing control. Reagent quantities provide spike-in controls for a minimum of 24 samples." ExpirationDate="2030-06-20" LotNumber="444444" Name="Sequel® DNA Internal Control 2.0" PartNumber="101-084-300" Tags="Control Kit, CCK" Version="2.0">
+                            <pbbase:CustomSequence>&gt;left_adapter\nTAGAGAGAGAAAAGGAGGAGGAGGCAACAACAACAACTCTCTCTA\n&gt;right_adapter\nTAGAGAGAGAAAAGGAGGAGGAGGCAACAACAACAACTCTCTCTA\n&gt;custom_sequence\nTGTCTAGGTCATCTCAACGTAGCTTTGACATATAACTTATCTAAAGTAATCCCTGCACACCTGTATGCATTATGCTTGCTATACACGCGGACACAGGCATATCATTTATTTTTTGCCATGTCCATTAATTGTTCAATAATTTTACTCACGGTATTTAATTTGATGTTGTGTTATATAGAATTGGAATTAAACTTATAAGGATGCTTAGACGTTGCATTATAAAAGTTTATGTACTAAGTATTTAAGACATTGGCATATGATTATAGCTTGACATTATTAAAAATTAATTAATTAAATCTCACACAATACTTATTCAAGACATTTTTACTAAGATAACCAAAGGAATGCGAACAAAATAATACTTAAAATATAAGACTTAGAAGTAATATGATCCAATAGTTACATATAGTACACTAAGTTCCTAAATTATATAACTTTAAAACAAAGTTACGAAATTTGGAAATAATTTTATTTAATCATATTTTCATAATAATGAAATACTGTTTATTTCAGTGGCGAAAAGAGATAATACGATTTTATAGTGATAGAATATCCTTGAAATATCTAAAGATAAAATTAGAAACTTTCTCTTTTCGCTGTAAAGCTATATGACTTAAAAATAACTTATACGCAAAGTATATTGCAGTGGAAACCCAAGAGTATAGTAGCCATGTAATCTCGGGTTCGAAACTACACGCCGCGCACGTAGTCAGATGGTCTGAAACTTGTCTGGGGCTGTTTGTTGACGGATGGAGACTTCACTAAGTGGCGTCAGGCGATGCGCACACACGGGACTCAATCCCGTAGCATGTTATGTGTCGTTCGAAACTCGTGCGTTCGAGATTTACGCCACATTGCCGGCTGGTCCAAGGACGTTATCTACCAGATGATACGGTCCAATTCGTAAGTTTGACTCACATAGTCGCGAACCGCGGAGCTGGAGAACAATAATTACCGGATGATTAGTTGACCATACGCACTATCATGCTCCGTGACTCAGTTTCCGCCATGGAGTTCTCACAGCCCCGTGTGTACCATAACTGCAGTAAGTAAGGACCTTGTTCGGAGGCCGACTCGTATTTCATATGATCTTAGTCTCGCCACCTTATCGCACGAATTGGGGGTGTCTTTTAGCCGACTCCGGCACGATCCGCCGGGAAGTTACTCGACCAGTTGCGGGACGCCCTAGTATGTTCGTATTACGTTCGATGCGTAAGCACCCCAGAGATTTTTGGCGGACGTTTCGGTAAATCATAGTAGAACCGGAGCGGTAAAGCTATTGATAACACGCAGGGACGAGCCAGTCGTCTAAGCTCCTCAGGGGTACCGTTCGCCGGACTACAGCCTGTCCCCGGCGGCCGCAACTGGGCTGCGATCCAGCCCCCGCTCCAAAAGGATGACTCGACCTTGCGCCTCGCGTACTCTGCTCTCGAGCTGTCTCCGTGGGCAATGCCGGCTCACGCTGTGGGGAACCCTGGACGCCCGGGCCGAGCCGACGTGGCCCCGCCCAGGCCTTTTCGTCGATCGCAGCTATGTACCCTGTGCTGGCCAGCGCTACTGCGCCGGCCATTAGCGGTGCGCTCTCGACTCGGCCCCAACGTAGACGGCGTCGCTGGCCGGATTCAAAGAAGTGAGCTACTACCATCGCGTGACGCCCTGCGGGCCTGAGTAACCGTGCACGAAGGACACCCCGTTCGTGGCGGGGGTTGCCTCCGCGACGGTCGCCAACGTTGGGGGTCGGTGCATTCAGGCGACGAGGGACCGCTGGTTTCCGGAGAGCGGCCTGTGCTCACACA
GGTGCGGTCCATGGGGCCTGTGGATCCGGTTCTCCCACGCGTAGCGCCGGCGTTAGCATGGACGCTAAATAAGTATACGCCGGCAAAGGGAGTGTAGGCCGGCCCGAGGGCAATCGCGGTTACCGGGGTGGGGGAGCTCCCCGCACCAGCCTTGATGTGGTGTGCGAGCG</pbbase:CustomSequence>
+                        </pbmeta:ControlKit>
+                        <pbmeta:TemplatePrepKit Barcode="333333100902400092215" Description="The SMRTbell® Template Prep Kit contains reagent supplies to perform SMRTbell library preparations of primer-annealed SMRTbell libraries for insert sizes ranging from 500 bp to over 20 kb." ExpirationDate="2015-09-22" IsRestricted="true" LotNumber="333333" MaxInsertSize="20000" MinInsertSize="500" Name="CRISPR/Cas9 Placeholder w/ asymmetric adapters" PartNumber="100-902-400" Tags="Template Prep Kit, TPK" Version="1.0">
+                            <pbbase:LeftAdaptorSequence>ATCTCTCTCAATTTTTTTTTTTTTTTTTTTTTTTAAGAGAGAGAT</pbbase:LeftAdaptorSequence>
+                            <pbbase:LeftPrimerSequence>aacggaggaggagga</pbbase:LeftPrimerSequence>
+                            <pbbase:RightAdaptorSequence>ATCTCTCTCAACAACAACAACGGAGGAGGAGGAAAAGAGAGAGAT</pbbase:RightAdaptorSequence>
+                            <pbbase:RightPrimerSequence>aacggaggaggagga</pbbase:RightPrimerSequence>
+                        </pbmeta:TemplatePrepKit>
+                        <pbmeta:BindingKit Barcode="777777100862200062030" ChipType="1mChip" Description="The Sequel Binding Kit 2.0 contains reagent supplies to bind prepared DNA template libraries to the Sequel Polymerase 2.0 in preparation for sequencing on the Sequel System. The result is a DNA polymerase/template complex. Sequel Binding Kit 1.0 should be used only with Sequel Sequencing Kit 2.0. Reagent quantities support 24 binding reactions." ExpirationDate="2030-06-20" LotNumber="777777" Name="Sequel® Binding Kit 2.0" PartNumber="100-862-200" Tags="Binding Kit, BDK" Version="2.0"/>
+                        <pbmeta:SequencingKitPlate Barcode="010765999861800030818" Description="The DNA Sequencing Kit contains a sequencing reagent plate with chemistry for single molecule real-time sequencing on the PacBio Sequel®. Reagent quantities support 8 sequencing reactions to be used in conjunction with SMRT® Cell 4Pac(s).  (8 Cells max/Each Row supplies reagents for 1 Sequel SMRT Cell)" ExpirationDate="2018-03-08" IsRestricted="true" LotNumber="010765" Name="Sequel® Dev Sequencing Plate" PartNumber="999-861-800" Tags="Sequencing Kit, SQK" Version="3.0">
+                            <pbrk:ReagentTubes Barcode="012197100619600033122" ExpirationDate="2022-03-31" LotNumber="012197" Name="Sequel® SMRT®Cell Oil" PartNumber="100-619-600"/>
+                        </pbmeta:SequencingKitPlate>
+                        <pbmeta:Primary>
+                            <pbmeta:AutomationName>SequelAlpha</pbmeta:AutomationName>
+                            <pbmeta:ConfigFileName>SqlPoC_SubCrf_2C2A-t2.xml</pbmeta:ConfigFileName>
+                            <pbmeta:OutputOptions>
+                                <pbmeta:ResultsFolder>326/3260208/r54075_20180905_232954/1_A01/</pbmeta:ResultsFolder>
+                                <pbmeta:CollectionPathUri>/pbi/collections/326/3260208/r54075_20180905_232954/1_A01/</pbmeta:CollectionPathUri>
+                                <pbmeta:CopyFiles>
+                                    <pbmeta:CollectionFileCopy>Trace</pbmeta:CollectionFileCopy>
+                                    <pbmeta:CollectionFileCopy>Fasta</pbmeta:CollectionFileCopy>
+                                    <pbmeta:CollectionFileCopy>Bam</pbmeta:CollectionFileCopy>
+                                    <pbmeta:CollectionFileCopy>StatsH5</pbmeta:CollectionFileCopy>
+                                </pbmeta:CopyFiles>
+                                <pbmeta:Readout>Pulses</pbmeta:Readout>
+                                <pbmeta:MetricsVerbosity>High</pbmeta:MetricsVerbosity>
+                                <pbmeta:TransferResource>
+                                    <pbmeta:Id>srs-pbi-collections</pbmeta:Id>
+                                    <pbmeta:TransferScheme>SRS</pbmeta:TransferScheme>
+                                    <pbmeta:Name>PBI Collections Xfer test using Rsync+SSH</pbmeta:Name>
+                                    <pbmeta:Description>Test Location for writing Transfer services to write to. Should be used by internal tools (PA SIM) and ICS tests</pbmeta:Description>
+                                    <pbmeta:DestPath>/pbi/collections</pbmeta:DestPath>
+                                </pbmeta:TransferResource>
+                            </pbmeta:OutputOptions>
+                        </pbmeta:Primary>
+                        <pbmeta:Secondary>
+                            <pbmeta:AutomationName>NoRS_Standard_Edna.1</pbmeta:AutomationName>
+                            <pbmeta:AutomationParameters>
+                                <pbmeta:AutomationParameter CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="Reference" SimpleValue="EColi5k_001_BsaI_AS_tc6_scr_tc6_unrolled_circular_12x_l123900" ValueDataType="String"/>
+                            </pbmeta:AutomationParameters>
+                            <pbmeta:CellCountInJob>0</pbmeta:CellCountInJob>
+                        </pbmeta:Secondary>
+                        <pbmeta:UserDefinedFields>
+                            <pbbase:DataEntities Name="LIMS_IMPORT" SimpleValue="1234TestFolder" ValueDataType="String"/>
+                            <pbbase:DataEntities Name="USER" SimpleValue="TestUser" ValueDataType="String"/>
+                            <pbbase:DataEntities ValueDataType="String"/>
+                            <pbbase:DataEntities ValueDataType="String"/>
+                            <pbbase:DataEntities ValueDataType="String"/>
+                            <pbbase:DataEntities ValueDataType="String"/>
+                        </pbmeta:UserDefinedFields>
+                        <pbmeta:ComponentVersions>
+                            <pbmeta:VersionInfo Name="ics" Version="6.0.0.SNAPSHOT39495"/>
+                            <pbmeta:VersionInfo Name="iui" Version="6.0.0.SNAPSHOT39495"/>
+                            <pbmeta:VersionInfo Name="chemistry" Version="6.0.0.SNAPSHOT39452"/>
+                            <pbmeta:VersionInfo Name="pa" Version="6.0.0.SNAPSHOT39495"/>
+                            <pbmeta:VersionInfo Name="paws" Version="6.0.0.SNAPSHOT39495"/>
+                            <pbmeta:VersionInfo Name="ppa" Version="6.0.0.SNAPSHOT39495"/>
+                            <pbmeta:VersionInfo Name="realtime" Version="6.0.0.SNAPSHOT39495"/>
+                            <pbmeta:VersionInfo Name="transfer" Version="6.0.0.SNAPSHOT39495"/>
+                            <pbmeta:VersionInfo Name="smrtlink-analysisservices-gui" Version="6.0.0.45618"/>
+                            <pbmeta:VersionInfo Name="smrtimisc" Version="6.0.0.45621"/>
+                            <pbmeta:VersionInfo Name="smrtlink" Version="6.0.0.45621"/>
+                            <pbmeta:VersionInfo Name="smrttools" Version="6.0.0.45580"/>
+                            <pbmeta:VersionInfo Name="smrtinub" Version="6.0.0.45580"/>
+                            <pbmeta:VersionInfo Name="smrtview" Version="6.0.0.45580"/>
+                        </pbmeta:ComponentVersions>
+                    </pbmeta:CollectionMetadata>
+                </pbmeta:Collections>
+            </pbds:DataSetMetadata>
+        </pbds:ConsensusReadSet>
+    </pbds:DataSets>
+    <pbds:DataSetMetadata>
+        <pbds:TotalLength>0</pbds:TotalLength>
+        <pbds:NumRecords>36</pbds:NumRecords>
+        <pbsample:BioSamples>
+            <pbsample:BioSample Name="test test"/>
+        </pbsample:BioSamples>
+        <pbmeta:Collections>
+            <pbmeta:CollectionMetadata Context="m54075_180905_221350" CreatedAt="0001-01-01T00:00:00" InstrumentId="60001" InstrumentName="Inst60001" MetaType="CollectionMetadata" ModifiedAt="0001-01-01T00:00:00" Status="Ready" TimeStampedName="60001-CollectionMetadata-2015-07-09T09:30:47.833Z" UniqueId="ab0a16df-1bd5-4f09-aa98-35cbedde995d">
+                <pbmeta:InstCtrlVer>6.0.0.SNAPSHOT39495</pbmeta:InstCtrlVer>
+                <pbmeta:SigProcVer>6.0.0.SNAPSHOT39495</pbmeta:SigProcVer>
+                <pbmeta:RunDetails>
+                    <pbmeta:TimeStampedName>r54075_20180905_221306</pbmeta:TimeStampedName>
+                    <pbmeta:Name>3260208_188nM-GTAC_2xGCratio_LP7_100fps_15min_5kEColi_SP2p1_3uMSSB_BA243494</pbmeta:Name>
+                    <pbmeta:CreatedBy>unknown</pbmeta:CreatedBy>
+                    <pbmeta:WhenCreated>2015-07-09T09:30:47.833Z</pbmeta:WhenCreated>
+                    <pbmeta:StartedBy>unknown</pbmeta:StartedBy>
+                </pbmeta:RunDetails>
+                <pbmeta:WellSample CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="3260208_188nM-GTAC_2xGCratio_LP7_100fps_15min_5kEColi_SP2p1_3uMSSB_BA243494">
+                    <pbmeta:WellName>SamplePlate-1-A-1</pbmeta:WellName>
+                    <pbmeta:Concentration>0</pbmeta:Concentration>
+                    <pbmeta:SampleReuseEnabled>false</pbmeta:SampleReuseEnabled>
+                    <pbmeta:StageHotstartEnabled>false</pbmeta:StageHotstartEnabled>
+                    <pbmeta:SizeSelectionEnabled>false</pbmeta:SizeSelectionEnabled>
+                    <pbmeta:UseCount>0</pbmeta:UseCount>
+                    <pbsample:BioSamples>
+                        <pbsample:BioSample Name="  UCLA 1023 "/>
+                    </pbsample:BioSamples>
+                </pbmeta:WellSample>
+                <pbmeta:Automation Name="manualcellprep.py">
+                    <pbbase:AutomationParameters>
+                        <pbbase:AutomationParameter CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="CouplerLaserPower" SimpleValue="7" ValueDataType="Double"/>
+                        <pbbase:AutomationParameter CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="MovieLength" SimpleValue="15" ValueDataType="Double"/>
+                        <pbbase:AutomationParameter CreatedAt="2018-03-19T20:02:47.574Z" ModifiedAt="0001-01-01T00:00:00" Name="Exposure" SimpleValue="0.01" ValueDataType="String"/>
+                        <pbbase:AutomationParameter CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="SNRCut" SimpleValue="2.5" ValueDataType="Double"/>
+                        <pbbase:AutomationParameter CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="ActiveLaserPower" SimpleValue="False" ValueDataType="String"/>
+                        <pbbase:AutomationParameter CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="sequencingPixelROI" SimpleValue="[[0,0,1080,1920]]" ValueDataType="JSON"/>
+                        <pbbase:AutomationParameter CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="traceFilePixelROI" SimpleValue="[[0,0,8,32],[71,0,8,32],[142,0,8,32],[213,0,8,32],[284,0,8,32],[355,0,8,32],[426,0,8,32],[497,0,8,32],[568,0,8,32],[639,0,8,32],[710,0,8,32],[781,0,8,32],[852,0,8,32],[923,0,8,32],[994,0,8,32],[1072,0,8,32],[0,128,8,32],[71,128,8,32],[142,128,8,32],[213,128,8,32],[284,128,8,32],[355,128,8,32],[426,128,8,32],[497,128,8,32],[568,128,8,32],[639,128,8,32],[710,128,8,32],[781,128,8,32],[852,128,8,32],[923,128,8,32],[994,128,8,32],[1072,128,8,32],[0,256,8,32],[71,256,8,32],[142,256,8,32],[213,256,8,32],[284,256,8,32],[355,256,8,32],[426,256,8,32],[497,256,8,32],[568,256,8,32],[639,256,8,32],[710,256,8,32],[781,256,8,32],[852,256,8,32],[923,256,8,32],[994,256,8,32],[1072,256,8,32],[0,384,8,32],[71,384,8,32],[142,384,8,32],[213,384,8,32],[284,384,8,32],[355,384,8,32],[426,384,8,32],[497,384,8,32],[568,384,8,32],[639,384,8,32],[710,384,8,32],[781,384,8,32],[852,384,8,32],[923,384,8,32],[994,384,8,32],[1072,384,8,32],[0,512,8,32],[71,512,8,32],[142,512,8,32],[213,512,8,32],[284,512,8,32],[355,512,8,32],[426,512,8,32],[497,512,8,32],[568,512,8,32],[639,512,8,32],[710,512,8,32],[781,512,8,32],[852,512,8,32],[923,512,8,32],[994,512,8,32],[1072,512,8,32],[0,640,8,32],[71,640,8,32],[142,640,8,32],[213,640,8,32],[284,640,8,32],[355,640,8,32],[426,640,8,32],[497,640,8,32],[568,640,8,32],[639,640,8,32],[710,640,8,32],[781,640,8,32],[852,640,8,32],[923,640,8,32],[994,640,8,32],[1072,640,8,32],[0,768,8,32],[71,768,8,32],[142,768,8,32],[213,768,8,32],[284,768,8,32],[355,768,8,32],[426,768,8,32],[497,768,8,32],[568,768,8,32],[639,768,8,32],[710,768,8,32],[781,768,8,32],[852,768,8,32],[923,768,8,32],[994,768,8,32],[1072,768,8,32],[0,896,8,32],[71,896,8,32],[142,896,8,32],[213,896,8,32],[284,896,8,32],[355,896,8,32],[426,896,8,32],[497,896,8,32],[568,896,8,32],[639,896,8,32],[710,896,8,32],[781,896,8,32],[852,896,8,32],[923,896,8,32],[9
94,896,8,32],[1072,896,8,32],[0,1024,8,32],[71,1024,8,32],[142,1024,8,32],[213,1024,8,32],[284,1024,8,32],[355,1024,8,32],[426,1024,8,32],[497,1024,8,32],[568,1024,8,32],[639,1024,8,32],[710,1024,8,32],[781,1024,8,32],[852,1024,8,32],[923,1024,8,32],[994,1024,8,32],[1072,1024,8,32],[0,1152,8,32],[71,1152,8,32],[142,1152,8,32],[213,1152,8,32],[284,1152,8,32],[355,1152,8,32],[426,1152,8,32],[497,1152,8,32],[568,1152,8,32],[639,1152,8,32],[710,1152,8,32],[781,1152,8,32],[852,1152,8,32],[923,1152,8,32],[994,1152,8,32],[1072,1152,8,32],[0,1280,8,32],[71,1280,8,32],[142,1280,8,32],[213,1280,8,32],[284,1280,8,32],[355,1280,8,32],[426,1280,8,32],[497,1280,8,32],[568,1280,8,32],[639,1280,8,32],[710,1280,8,32],[781,1280,8,32],[852,1280,8,32],[923,1280,8,32],[994,1280,8,32],[1072,1280,8,32],[0,1408,8,32],[71,1408,8,32],[142,1408,8,32],[213,1408,8,32],[284,1408,8,32],[355,1408,8,32],[426,1408,8,32],[497,1408,8,32],[568,1408,8,32],[639,1408,8,32],[710,1408,8,32],[781,1408,8,32],[852,1408,8,32],[923,1408,8,32],[994,1408,8,32],[1072,1408,8,32],[0,1536,8,32],[71,1536,8,32],[142,1536,8,32],[213,1536,8,32],[284,1536,8,32],[355,1536,8,32],[426,1536,8,32],[497,1536,8,32],[568,1536,8,32],[639,1536,8,32],[710,1536,8,32],[781,1536,8,32],[852,1536,8,32],[923,1536,8,32],[994,1536,8,32],[1072,1536,8,32],[0,1664,8,32],[71,1664,8,32],[142,1664,8,32],[213,1664,8,32],[284,1664,8,32],[355,1664,8,32],[426,1664,8,32],[497,1664,8,32],[568,1664,8,32],[639,1664,8,32],[710,1664,8,32],[781,1664,8,32],[852,1664,8,32],[923,1664,8,32],[994,1664,8,32],[1072,1664,8,32],[0,1792,8,32],[71,1792,8,32],[142,1792,8,32],[213,1792,8,32],[284,1792,8,32],[355,1792,8,32],[426,1792,8,32],[497,1792,8,32],[568,1792,8,32],[639,1792,8,32],[710,1792,8,32],[781,1792,8,32],[852,1792,8,32],[923,1792,8,32],[994,1792,8,32],[1072,1792,8,32],[0,1888,8,32],[71,1888,8,32],[142,1888,8,32],[213,1888,8,32],[284,1888,8,32],[355,1888,8,32],[426,1888,8,32],[497,1888,8,32],[568,1888,8,32],[639,1888,8,32],[710,1888,8,32],[781,1888,8,32],[852
,1888,8,32],[923,1888,8,32],[994,1888,8,32],[1072,1888,8,32]]" ValueDataType="JSON"/>
+                        <pbbase:AutomationParameter CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="CellAtStage" SimpleValue="True" ValueDataType="String"/>
+                        <pbbase:AutomationParameter CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="SkipAlignment" SimpleValue="False" ValueDataType="String"/>
+                        <pbbase:AutomationParameter CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="UseStageHotStart" SimpleValue="False" ValueDataType="String"/>
+                        <pbbase:AutomationParameter Name="CellNFCIndex" SimpleValue="1" ValueDataType="Int32"/>
+                        <pbbase:AutomationParameter Name="CollectionNumber" SimpleValue="0" ValueDataType="Int32"/>
+                        <pbbase:AutomationParameter Name="HasN2Switch" SimpleValue="False" ValueDataType="Boolean"/>
+                        <pbbase:AutomationParameter Name="TipSearchMaxDuration" SimpleValue="576" ValueDataType="Int32"/>
+                    </pbbase:AutomationParameters>
+                </pbmeta:Automation>
+                <pbmeta:CollectionNumber>0</pbmeta:CollectionNumber>
+                <pbmeta:CellIndex>0</pbmeta:CellIndex>
+                <pbmeta:SetNumber>0</pbmeta:SetNumber>
+                <pbmeta:CellPac Barcode="BA233119" Description="The Sequel SMRT Cell 1M (4/Pack) are nanofabricated consumables each patterned with 1,000,000 wells called zero-mode waveguides (ZMWs). They are packaged together in a streamlined 4 SMRT Cell format (4/tray). One SMRT Cell is utilized in each sequencing reaction, and experiments can be run in single or in batch mode, giving your projects a greater level of flexibility." ExpirationDate="2018-06-08" LotNumber="321693" MovieTimeGrade="LR" Name="SMRT® Cell 1M v2 LR (4/Pack)" PartNumber="101-008-001" Version="2.0">
+                    <pbbase:ChipLayout>SequEL_4.0_RTO3</pbbase:ChipLayout>
+                </pbmeta:CellPac>
+                <pbmeta:ControlKit Barcode="444444101084300062030" ChipType="1mChip" Description="The Sequel DNA Internal Control 2.0 contains a fixed template of 2 kb length bound to Sequel Binding Kit 2.0 for use as an internal sequencing control. Reagent quantities provide spike-in controls for a minimum of 24 samples." ExpirationDate="2030-06-20" LotNumber="444444" Name="Sequel® DNA Internal Control 2.0" PartNumber="101-084-300" Tags="Control Kit, CCK" Version="2.0">
+                    <pbbase:CustomSequence>&gt;left_adapter\nTAGAGAGAGAAAAGGAGGAGGAGGCAACAACAACAACTCTCTCTA\n&gt;right_adapter\nTAGAGAGAGAAAAGGAGGAGGAGGCAACAACAACAACTCTCTCTA\n&gt;custom_sequence\nTGTCTAGGTCATCTCAACGTAGCTTTGACATATAACTTATCTAAAGTAATCCCTGCACACCTGTATGCATTATGCTTGCTATACACGCGGACACAGGCATATCATTTATTTTTTGCCATGTCCATTAATTGTTCAATAATTTTACTCACGGTATTTAATTTGATGTTGTGTTATATAGAATTGGAATTAAACTTATAAGGATGCTTAGACGTTGCATTATAAAAGTTTATGTACTAAGTATTTAAGACATTGGCATATGATTATAGCTTGACATTATTAAAAATTAATTAATTAAATCTCACACAATACTTATTCAAGACATTTTTACTAAGATAACCAAAGGAATGCGAACAAAATAATACTTAAAATATAAGACTTAGAAGTAATATGATCCAATAGTTACATATAGTACACTAAGTTCCTAAATTATATAACTTTAAAACAAAGTTACGAAATTTGGAAATAATTTTATTTAATCATATTTTCATAATAATGAAATACTGTTTATTTCAGTGGCGAAAAGAGATAATACGATTTTATAGTGATAGAATATCCTTGAAATATCTAAAGATAAAATTAGAAACTTTCTCTTTTCGCTGTAAAGCTATATGACTTAAAAATAACTTATACGCAAAGTATATTGCAGTGGAAACCCAAGAGTATAGTAGCCATGTAATCTCGGGTTCGAAACTACACGCCGCGCACGTAGTCAGATGGTCTGAAACTTGTCTGGGGCTGTTTGTTGACGGATGGAGACTTCACTAAGTGGCGTCAGGCGATGCGCACACACGGGACTCAATCCCGTAGCATGTTATGTGTCGTTCGAAACTCGTGCGTTCGAGATTTACGCCACATTGCCGGCTGGTCCAAGGACGTTATCTACCAGATGATACGGTCCAATTCGTAAGTTTGACTCACATAGTCGCGAACCGCGGAGCTGGAGAACAATAATTACCGGATGATTAGTTGACCATACGCACTATCATGCTCCGTGACTCAGTTTCCGCCATGGAGTTCTCACAGCCCCGTGTGTACCATAACTGCAGTAAGTAAGGACCTTGTTCGGAGGCCGACTCGTATTTCATATGATCTTAGTCTCGCCACCTTATCGCACGAATTGGGGGTGTCTTTTAGCCGACTCCGGCACGATCCGCCGGGAAGTTACTCGACCAGTTGCGGGACGCCCTAGTATGTTCGTATTACGTTCGATGCGTAAGCACCCCAGAGATTTTTGGCGGACGTTTCGGTAAATCATAGTAGAACCGGAGCGGTAAAGCTATTGATAACACGCAGGGACGAGCCAGTCGTCTAAGCTCCTCAGGGGTACCGTTCGCCGGACTACAGCCTGTCCCCGGCGGCCGCAACTGGGCTGCGATCCAGCCCCCGCTCCAAAAGGATGACTCGACCTTGCGCCTCGCGTACTCTGCTCTCGAGCTGTCTCCGTGGGCAATGCCGGCTCACGCTGTGGGGAACCCTGGACGCCCGGGCCGAGCCGACGTGGCCCCGCCCAGGCCTTTTCGTCGATCGCAGCTATGTACCCTGTGCTGGCCAGCGCTACTGCGCCGGCCATTAGCGGTGCGCTCTCGACTCGGCCCCAACGTAGACGGCGTCGCTGGCCGGATTCAAAGAAGTGAGCTACTACCATCGCGTGACGCCCTGCGGGCCTGAGTAACCGTGCACGAAGGACACCCCGTTCGTGGCGGGGGTTGCCTCCGCGACGGTCGCCAACGTTGGGGGTCGGTGCATTCAGGCGACGAGGGACCGCTGGTTTCCGGAGAGCGGCCTGTGCTCACACAGGTGCGGT
CCATGGGGCCTGTGGATCCGGTTCTCCCACGCGTAGCGCCGGCGTTAGCATGGACGCTAAATAAGTATACGCCGGCAAAGGGAGTGTAGGCCGGCCCGAGGGCAATCGCGGTTACCGGGGTGGGGGAGCTCCCCGCACCAGCCTTGATGTGGTGTGCGAGCG</pbbase:CustomSequence>
+                </pbmeta:ControlKit>
+                <pbmeta:TemplatePrepKit Barcode="333333100902400092215" Description="The SMRTbell® Template Prep Kit contains reagent supplies to perform SMRTbell library preparations of primer-annealed SMRTbell libraries for insert sizes ranging from 500 bp to over 20 kb." ExpirationDate="2015-09-22" IsRestricted="true" LotNumber="333333" MaxInsertSize="20000" MinInsertSize="500" Name="CRISPR/Cas9 Placeholder w/ asymmetric adapters" PartNumber="100-902-400" Tags="Template Prep Kit, TPK" Version="1.0">
+                    <pbbase:LeftAdaptorSequence>ATCTCTCTCAATTTTTTTTTTTTTTTTTTTTTTTAAGAGAGAGAT</pbbase:LeftAdaptorSequence>
+                    <pbbase:LeftPrimerSequence>aacggaggaggagga</pbbase:LeftPrimerSequence>
+                    <pbbase:RightAdaptorSequence>ATCTCTCTCAACAACAACAACGGAGGAGGAGGAAAAGAGAGAGAT</pbbase:RightAdaptorSequence>
+                    <pbbase:RightPrimerSequence>aacggaggaggagga</pbbase:RightPrimerSequence>
+                </pbmeta:TemplatePrepKit>
+                <pbmeta:BindingKit Barcode="777777100862200062030" ChipType="1mChip" Description="The Sequel Binding Kit 2.0 contains reagent supplies to bind prepared DNA template libraries to the Sequel Polymerase 2.0 in preparation for sequencing on the Sequel System. The result is a DNA polymerase/template complex. Sequel Binding Kit 1.0 should be used only with Sequel Sequencing Kit 2.0. Reagent quantities support 24 binding reactions." ExpirationDate="2030-06-20" LotNumber="777777" Name="Sequel® Binding Kit 2.0" PartNumber="100-862-200" Tags="Binding Kit, BDK" Version="2.0"/>
+                <pbmeta:SequencingKitPlate Barcode="010765999861800030818" Description="The DNA Sequencing Kit contains a sequencing reagent plate with chemistry for single molecule real-time sequencing on the PacBio Sequel®. Reagent quantities support 8 sequencing reactions to be used in conjunction with SMRT® Cell 4Pac(s).  (8 Cells max/Each Row supplies reagents for 1 Sequel SMRT Cell)" ExpirationDate="2018-03-08" IsRestricted="true" LotNumber="010765" Name="Sequel® Dev Sequencing Plate" PartNumber="999-861-800" Tags="Sequencing Kit, SQK" Version="3.0">
+                    <pbrk:ReagentTubes Barcode="012197100619600033122" ExpirationDate="2022-03-31" LotNumber="012197" Name="Sequel® SMRT®Cell Oil" PartNumber="100-619-600"/>
+                </pbmeta:SequencingKitPlate>
+                <pbmeta:Primary>
+                    <pbmeta:AutomationName>SequelAlpha</pbmeta:AutomationName>
+                    <pbmeta:ConfigFileName>SqlPoC_SubCrf_2C2A-t2.xml</pbmeta:ConfigFileName>
+                    <pbmeta:OutputOptions>
+                        <pbmeta:ResultsFolder>326/3260208/r54075_20180905_221306/1_A01/</pbmeta:ResultsFolder>
+                        <pbmeta:CollectionPathUri>/pbi/collections/326/3260208/r54075_20180905_221306/1_A01/</pbmeta:CollectionPathUri>
+                        <pbmeta:CopyFiles>
+                            <pbmeta:CollectionFileCopy>Trace</pbmeta:CollectionFileCopy>
+                            <pbmeta:CollectionFileCopy>Fasta</pbmeta:CollectionFileCopy>
+                            <pbmeta:CollectionFileCopy>Bam</pbmeta:CollectionFileCopy>
+                            <pbmeta:CollectionFileCopy>StatsH5</pbmeta:CollectionFileCopy>
+                        </pbmeta:CopyFiles>
+                        <pbmeta:Readout>Pulses</pbmeta:Readout>
+                        <pbmeta:MetricsVerbosity>High</pbmeta:MetricsVerbosity>
+                        <pbmeta:TransferResource>
+                            <pbmeta:Id>srs-pbi-collections</pbmeta:Id>
+                            <pbmeta:TransferScheme>SRS</pbmeta:TransferScheme>
+                            <pbmeta:Name>PBI Collections Xfer test using Rsync+SSH</pbmeta:Name>
+                            <pbmeta:Description>Test Location for writing Transfer services to write to. Should be used by internal tools (PA SIM) and ICS tests</pbmeta:Description>
+                            <pbmeta:DestPath>/pbi/collections</pbmeta:DestPath>
+                        </pbmeta:TransferResource>
+                    </pbmeta:OutputOptions>
+                </pbmeta:Primary>
+                <pbmeta:Secondary>
+                    <pbmeta:AutomationName>NoRS_Standard_Edna.1</pbmeta:AutomationName>
+                    <pbmeta:AutomationParameters>
+                        <pbmeta:AutomationParameter CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="Reference" SimpleValue="EColi5k_001_BsaI_AS_tc6_scr_tc6_unrolled_circular_12x_l123900" ValueDataType="String"/>
+                    </pbmeta:AutomationParameters>
+                    <pbmeta:CellCountInJob>0</pbmeta:CellCountInJob>
+                </pbmeta:Secondary>
+                <pbmeta:UserDefinedFields>
+                    <pbbase:DataEntities Name="LIMS_IMPORT" SimpleValue="1234TestFolder" ValueDataType="String"/>
+                    <pbbase:DataEntities Name="USER" SimpleValue="TestUser" ValueDataType="String"/>
+                    <pbbase:DataEntities ValueDataType="String"/>
+                    <pbbase:DataEntities ValueDataType="String"/>
+                    <pbbase:DataEntities ValueDataType="String"/>
+                    <pbbase:DataEntities ValueDataType="String"/>
+                </pbmeta:UserDefinedFields>
+                <pbmeta:ComponentVersions>
+                    <pbmeta:VersionInfo Name="ics" Version="6.0.0.SNAPSHOT39495"/>
+                    <pbmeta:VersionInfo Name="iui" Version="6.0.0.SNAPSHOT39495"/>
+                    <pbmeta:VersionInfo Name="chemistry" Version="6.0.0.SNAPSHOT39452"/>
+                    <pbmeta:VersionInfo Name="pa" Version="6.0.0.SNAPSHOT39495"/>
+                    <pbmeta:VersionInfo Name="paws" Version="6.0.0.SNAPSHOT39495"/>
+                    <pbmeta:VersionInfo Name="ppa" Version="6.0.0.SNAPSHOT39495"/>
+                    <pbmeta:VersionInfo Name="realtime" Version="6.0.0.SNAPSHOT39495"/>
+                    <pbmeta:VersionInfo Name="transfer" Version="6.0.0.SNAPSHOT39495"/>
+                    <pbmeta:VersionInfo Name="smrtlink-analysisservices-gui" Version="6.0.0.45618"/>
+                    <pbmeta:VersionInfo Name="smrtimisc" Version="6.0.0.45621"/>
+                    <pbmeta:VersionInfo Name="smrtlink" Version="6.0.0.45621"/>
+                    <pbmeta:VersionInfo Name="smrttools" Version="6.0.0.45580"/>
+                    <pbmeta:VersionInfo Name="smrtinub" Version="6.0.0.45580"/>
+                    <pbmeta:VersionInfo Name="smrtview" Version="6.0.0.45580"/>
+                </pbmeta:ComponentVersions>
+            </pbmeta:CollectionMetadata>
+            <pbmeta:CollectionMetadata Context="m54075_180905_225130" CreatedAt="0001-01-01T00:00:00" InstrumentId="60001" InstrumentName="Inst60001" MetaType="CollectionMetadata" ModifiedAt="0001-01-01T00:00:00" Status="Ready" TimeStampedName="60001-CollectionMetadata-2015-07-09T09:30:47.833Z" UniqueId="cd8e9b35-e755-4b6b-9878-ba45e6700781">
+                <pbmeta:InstCtrlVer>6.0.0.SNAPSHOT39495</pbmeta:InstCtrlVer>
+                <pbmeta:SigProcVer>6.0.0.SNAPSHOT39495</pbmeta:SigProcVer>
+                <pbmeta:RunDetails>
+                    <pbmeta:TimeStampedName>r54075_20180905_225050</pbmeta:TimeStampedName>
+                    <pbmeta:Name>3260208_188nM-GTAC_3xGCratio_LP7_100fps_15min_5kEColi_SP2p1_3uMSSB_BA243494</pbmeta:Name>
+                    <pbmeta:CreatedBy>unknown</pbmeta:CreatedBy>
+                    <pbmeta:WhenCreated>2015-07-09T09:30:47.833Z</pbmeta:WhenCreated>
+                    <pbmeta:StartedBy>unknown</pbmeta:StartedBy>
+                </pbmeta:RunDetails>
+                <pbmeta:WellSample CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="3260208_188nM-GTAC_3xGCratio_LP7_100fps_15min_5kEColi_SP2p1_3uMSSB_BA243494">
+                    <pbmeta:WellName>SamplePlate-1-A-1</pbmeta:WellName>
+                    <pbmeta:Concentration>0</pbmeta:Concentration>
+                    <pbmeta:SampleReuseEnabled>false</pbmeta:SampleReuseEnabled>
+                    <pbmeta:StageHotstartEnabled>false</pbmeta:StageHotstartEnabled>
+                    <pbmeta:SizeSelectionEnabled>false</pbmeta:SizeSelectionEnabled>
+                    <pbmeta:UseCount>0</pbmeta:UseCount>
+                    <pbsample:BioSamples>
+                        <pbsample:BioSample Name="test test "/>
+                    </pbsample:BioSamples>
+                </pbmeta:WellSample>
+                <pbmeta:Automation Name="manualcellprep.py">
+                    <pbbase:AutomationParameters>
+                        <pbbase:AutomationParameter CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="CouplerLaserPower" SimpleValue="7" ValueDataType="Double"/>
+                        <pbbase:AutomationParameter CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="MovieLength" SimpleValue="15" ValueDataType="Double"/>
+                        <pbbase:AutomationParameter CreatedAt="2018-03-19T20:02:47.574Z" ModifiedAt="0001-01-01T00:00:00" Name="Exposure" SimpleValue="0.01" ValueDataType="String"/>
+                        <pbbase:AutomationParameter CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="SNRCut" SimpleValue="2.5" ValueDataType="Double"/>
+                        <pbbase:AutomationParameter CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="ActiveLaserPower" SimpleValue="False" ValueDataType="String"/>
+                        <pbbase:AutomationParameter CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="sequencingPixelROI" SimpleValue="[[0,0,1080,1920]]" ValueDataType="JSON"/>
+                        <pbbase:AutomationParameter CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="traceFilePixelROI" SimpleValue="[[0,0,8,32],[71,0,8,32],[142,0,8,32],[213,0,8,32],[284,0,8,32],[355,0,8,32],[426,0,8,32],[497,0,8,32],[568,0,8,32],[639,0,8,32],[710,0,8,32],[781,0,8,32],[852,0,8,32],[923,0,8,32],[994,0,8,32],[1072,0,8,32],[0,128,8,32],[71,128,8,32],[142,128,8,32],[213,128,8,32],[284,128,8,32],[355,128,8,32],[426,128,8,32],[497,128,8,32],[568,128,8,32],[639,128,8,32],[710,128,8,32],[781,128,8,32],[852,128,8,32],[923,128,8,32],[994,128,8,32],[1072,128,8,32],[0,256,8,32],[71,256,8,32],[142,256,8,32],[213,256,8,32],[284,256,8,32],[355,256,8,32],[426,256,8,32],[497,256,8,32],[568,256,8,32],[639,256,8,32],[710,256,8,32],[781,256,8,32],[852,256,8,32],[923,256,8,32],[994,256,8,32],[1072,256,8,32],[0,384,8,32],[71,384,8,32],[142,384,8,32],[213,384,8,32],[284,384,8,32],[355,384,8,32],[426,384,8,32],[497,384,8,32],[568,384,8,32],[639,384,8,32],[710,384,8,32],[781,384,8,32],[852,384,8,32],[923,384,8,32],[994,384,8,32],[1072,384,8,32],[0,512,8,32],[71,512,8,32],[142,512,8,32],[213,512,8,32],[284,512,8,32],[355,512,8,32],[426,512,8,32],[497,512,8,32],[568,512,8,32],[639,512,8,32],[710,512,8,32],[781,512,8,32],[852,512,8,32],[923,512,8,32],[994,512,8,32],[1072,512,8,32],[0,640,8,32],[71,640,8,32],[142,640,8,32],[213,640,8,32],[284,640,8,32],[355,640,8,32],[426,640,8,32],[497,640,8,32],[568,640,8,32],[639,640,8,32],[710,640,8,32],[781,640,8,32],[852,640,8,32],[923,640,8,32],[994,640,8,32],[1072,640,8,32],[0,768,8,32],[71,768,8,32],[142,768,8,32],[213,768,8,32],[284,768,8,32],[355,768,8,32],[426,768,8,32],[497,768,8,32],[568,768,8,32],[639,768,8,32],[710,768,8,32],[781,768,8,32],[852,768,8,32],[923,768,8,32],[994,768,8,32],[1072,768,8,32],[0,896,8,32],[71,896,8,32],[142,896,8,32],[213,896,8,32],[284,896,8,32],[355,896,8,32],[426,896,8,32],[497,896,8,32],[568,896,8,32],[639,896,8,32],[710,896,8,32],[781,896,8,32],[852,896,8,32],[923,896,8,32],[9
94,896,8,32],[1072,896,8,32],[0,1024,8,32],[71,1024,8,32],[142,1024,8,32],[213,1024,8,32],[284,1024,8,32],[355,1024,8,32],[426,1024,8,32],[497,1024,8,32],[568,1024,8,32],[639,1024,8,32],[710,1024,8,32],[781,1024,8,32],[852,1024,8,32],[923,1024,8,32],[994,1024,8,32],[1072,1024,8,32],[0,1152,8,32],[71,1152,8,32],[142,1152,8,32],[213,1152,8,32],[284,1152,8,32],[355,1152,8,32],[426,1152,8,32],[497,1152,8,32],[568,1152,8,32],[639,1152,8,32],[710,1152,8,32],[781,1152,8,32],[852,1152,8,32],[923,1152,8,32],[994,1152,8,32],[1072,1152,8,32],[0,1280,8,32],[71,1280,8,32],[142,1280,8,32],[213,1280,8,32],[284,1280,8,32],[355,1280,8,32],[426,1280,8,32],[497,1280,8,32],[568,1280,8,32],[639,1280,8,32],[710,1280,8,32],[781,1280,8,32],[852,1280,8,32],[923,1280,8,32],[994,1280,8,32],[1072,1280,8,32],[0,1408,8,32],[71,1408,8,32],[142,1408,8,32],[213,1408,8,32],[284,1408,8,32],[355,1408,8,32],[426,1408,8,32],[497,1408,8,32],[568,1408,8,32],[639,1408,8,32],[710,1408,8,32],[781,1408,8,32],[852,1408,8,32],[923,1408,8,32],[994,1408,8,32],[1072,1408,8,32],[0,1536,8,32],[71,1536,8,32],[142,1536,8,32],[213,1536,8,32],[284,1536,8,32],[355,1536,8,32],[426,1536,8,32],[497,1536,8,32],[568,1536,8,32],[639,1536,8,32],[710,1536,8,32],[781,1536,8,32],[852,1536,8,32],[923,1536,8,32],[994,1536,8,32],[1072,1536,8,32],[0,1664,8,32],[71,1664,8,32],[142,1664,8,32],[213,1664,8,32],[284,1664,8,32],[355,1664,8,32],[426,1664,8,32],[497,1664,8,32],[568,1664,8,32],[639,1664,8,32],[710,1664,8,32],[781,1664,8,32],[852,1664,8,32],[923,1664,8,32],[994,1664,8,32],[1072,1664,8,32],[0,1792,8,32],[71,1792,8,32],[142,1792,8,32],[213,1792,8,32],[284,1792,8,32],[355,1792,8,32],[426,1792,8,32],[497,1792,8,32],[568,1792,8,32],[639,1792,8,32],[710,1792,8,32],[781,1792,8,32],[852,1792,8,32],[923,1792,8,32],[994,1792,8,32],[1072,1792,8,32],[0,1888,8,32],[71,1888,8,32],[142,1888,8,32],[213,1888,8,32],[284,1888,8,32],[355,1888,8,32],[426,1888,8,32],[497,1888,8,32],[568,1888,8,32],[639,1888,8,32],[710,1888,8,32],[781,1888,8,32],[852
,1888,8,32],[923,1888,8,32],[994,1888,8,32],[1072,1888,8,32]]" ValueDataType="JSON"/>
+                        <pbbase:AutomationParameter CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="CellAtStage" SimpleValue="True" ValueDataType="String"/>
+                        <pbbase:AutomationParameter CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="SkipAlignment" SimpleValue="False" ValueDataType="String"/>
+                        <pbbase:AutomationParameter CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="UseStageHotStart" SimpleValue="False" ValueDataType="String"/>
+                        <pbbase:AutomationParameter Name="CellNFCIndex" SimpleValue="1" ValueDataType="Int32"/>
+                        <pbbase:AutomationParameter Name="CollectionNumber" SimpleValue="0" ValueDataType="Int32"/>
+                        <pbbase:AutomationParameter Name="HasN2Switch" SimpleValue="False" ValueDataType="Boolean"/>
+                        <pbbase:AutomationParameter Name="TipSearchMaxDuration" SimpleValue="576" ValueDataType="Int32"/>
+                    </pbbase:AutomationParameters>
+                </pbmeta:Automation>
+                <pbmeta:CollectionNumber>0</pbmeta:CollectionNumber>
+                <pbmeta:CellIndex>0</pbmeta:CellIndex>
+                <pbmeta:SetNumber>0</pbmeta:SetNumber>
+                <pbmeta:CellPac Barcode="BA233119" Description="The Sequel SMRT Cell 1M (4/Pack) are nanofabricated consumables each patterned with 1,000,000 wells called zero-mode waveguides (ZMWs). They are packaged together in a streamlined 4 SMRT Cell format (4/tray). One SMRT Cell is utilized in each sequencing reaction, and experiments can be run in single or in batch mode, giving your projects a greater level of flexibility." ExpirationDate="2018-06-08" LotNumber="321693" MovieTimeGrade="LR" Name="SMRT® Cell 1M v2 LR (4/Pack)" PartNumber="101-008-001" Version="2.0">
+                    <pbbase:ChipLayout>SequEL_4.0_RTO3</pbbase:ChipLayout>
+                </pbmeta:CellPac>
+                <pbmeta:ControlKit Barcode="444444101084300062030" ChipType="1mChip" Description="The Sequel DNA Internal Control 2.0 contains a fixed template of 2 kb length bound to Sequel Binding Kit 2.0 for use as an internal sequencing control. Reagent quantities provide spike-in controls for a minimum of 24 samples." ExpirationDate="2030-06-20" LotNumber="444444" Name="Sequel® DNA Internal Control 2.0" PartNumber="101-084-300" Tags="Control Kit, CCK" Version="2.0">
+                    <pbbase:CustomSequence>&gt;left_adapter\nTAGAGAGAGAAAAGGAGGAGGAGGCAACAACAACAACTCTCTCTA\n&gt;right_adapter\nTAGAGAGAGAAAAGGAGGAGGAGGCAACAACAACAACTCTCTCTA\n&gt;custom_sequence\nTGTCTAGGTCATCTCAACGTAGCTTTGACATATAACTTATCTAAAGTAATCCCTGCACACCTGTATGCATTATGCTTGCTATACACGCGGACACAGGCATATCATTTATTTTTTGCCATGTCCATTAATTGTTCAATAATTTTACTCACGGTATTTAATTTGATGTTGTGTTATATAGAATTGGAATTAAACTTATAAGGATGCTTAGACGTTGCATTATAAAAGTTTATGTACTAAGTATTTAAGACATTGGCATATGATTATAGCTTGACATTATTAAAAATTAATTAATTAAATCTCACACAATACTTATTCAAGACATTTTTACTAAGATAACCAAAGGAATGCGAACAAAATAATACTTAAAATATAAGACTTAGAAGTAATATGATCCAATAGTTACATATAGTACACTAAGTTCCTAAATTATATAACTTTAAAACAAAGTTACGAAATTTGGAAATAATTTTATTTAATCATATTTTCATAATAATGAAATACTGTTTATTTCAGTGGCGAAAAGAGATAATACGATTTTATAGTGATAGAATATCCTTGAAATATCTAAAGATAAAATTAGAAACTTTCTCTTTTCGCTGTAAAGCTATATGACTTAAAAATAACTTATACGCAAAGTATATTGCAGTGGAAACCCAAGAGTATAGTAGCCATGTAATCTCGGGTTCGAAACTACACGCCGCGCACGTAGTCAGATGGTCTGAAACTTGTCTGGGGCTGTTTGTTGACGGATGGAGACTTCACTAAGTGGCGTCAGGCGATGCGCACACACGGGACTCAATCCCGTAGCATGTTATGTGTCGTTCGAAACTCGTGCGTTCGAGATTTACGCCACATTGCCGGCTGGTCCAAGGACGTTATCTACCAGATGATACGGTCCAATTCGTAAGTTTGACTCACATAGTCGCGAACCGCGGAGCTGGAGAACAATAATTACCGGATGATTAGTTGACCATACGCACTATCATGCTCCGTGACTCAGTTTCCGCCATGGAGTTCTCACAGCCCCGTGTGTACCATAACTGCAGTAAGTAAGGACCTTGTTCGGAGGCCGACTCGTATTTCATATGATCTTAGTCTCGCCACCTTATCGCACGAATTGGGGGTGTCTTTTAGCCGACTCCGGCACGATCCGCCGGGAAGTTACTCGACCAGTTGCGGGACGCCCTAGTATGTTCGTATTACGTTCGATGCGTAAGCACCCCAGAGATTTTTGGCGGACGTTTCGGTAAATCATAGTAGAACCGGAGCGGTAAAGCTATTGATAACACGCAGGGACGAGCCAGTCGTCTAAGCTCCTCAGGGGTACCGTTCGCCGGACTACAGCCTGTCCCCGGCGGCCGCAACTGGGCTGCGATCCAGCCCCCGCTCCAAAAGGATGACTCGACCTTGCGCCTCGCGTACTCTGCTCTCGAGCTGTCTCCGTGGGCAATGCCGGCTCACGCTGTGGGGAACCCTGGACGCCCGGGCCGAGCCGACGTGGCCCCGCCCAGGCCTTTTCGTCGATCGCAGCTATGTACCCTGTGCTGGCCAGCGCTACTGCGCCGGCCATTAGCGGTGCGCTCTCGACTCGGCCCCAACGTAGACGGCGTCGCTGGCCGGATTCAAAGAAGTGAGCTACTACCATCGCGTGACGCCCTGCGGGCCTGAGTAACCGTGCACGAAGGACACCCCGTTCGTGGCGGGGGTTGCCTCCGCGACGGTCGCCAACGTTGGGGGTCGGTGCATTCAGGCGACGAGGGACCGCTGGTTTCCGGAGAGCGGCCTGTGCTCACACAGGTGCGGT
CCATGGGGCCTGTGGATCCGGTTCTCCCACGCGTAGCGCCGGCGTTAGCATGGACGCTAAATAAGTATACGCCGGCAAAGGGAGTGTAGGCCGGCCCGAGGGCAATCGCGGTTACCGGGGTGGGGGAGCTCCCCGCACCAGCCTTGATGTGGTGTGCGAGCG</pbbase:CustomSequence>
+                </pbmeta:ControlKit>
+                <pbmeta:TemplatePrepKit Barcode="333333100902400092215" Description="The SMRTbell® Template Prep Kit contains reagent supplies to perform SMRTbell library preparations of primer-annealed SMRTbell libraries for insert sizes ranging from 500 bp to over 20 kb." ExpirationDate="2015-09-22" IsRestricted="true" LotNumber="333333" MaxInsertSize="20000" MinInsertSize="500" Name="CRISPR/Cas9 Placeholder w/ asymmetric adapters" PartNumber="100-902-400" Tags="Template Prep Kit, TPK" Version="1.0">
+                    <pbbase:LeftAdaptorSequence>ATCTCTCTCAATTTTTTTTTTTTTTTTTTTTTTTAAGAGAGAGAT</pbbase:LeftAdaptorSequence>
+                    <pbbase:LeftPrimerSequence>aacggaggaggagga</pbbase:LeftPrimerSequence>
+                    <pbbase:RightAdaptorSequence>ATCTCTCTCAACAACAACAACGGAGGAGGAGGAAAAGAGAGAGAT</pbbase:RightAdaptorSequence>
+                    <pbbase:RightPrimerSequence>aacggaggaggagga</pbbase:RightPrimerSequence>
+                </pbmeta:TemplatePrepKit>
+                <pbmeta:BindingKit Barcode="777777100862200062030" ChipType="1mChip" Description="The Sequel Binding Kit 2.0 contains reagent supplies to bind prepared DNA template libraries to the Sequel Polymerase 2.0 in preparation for sequencing on the Sequel System. The result is a DNA polymerase/template complex. Sequel Binding Kit 1.0 should be used only with Sequel Sequencing Kit 2.0. Reagent quantities support 24 binding reactions." ExpirationDate="2030-06-20" LotNumber="777777" Name="Sequel® Binding Kit 2.0" PartNumber="100-862-200" Tags="Binding Kit, BDK" Version="2.0"/>
+                <pbmeta:SequencingKitPlate Barcode="010765999861800030818" Description="The DNA Sequencing Kit contains a sequencing reagent plate with chemistry for single molecule real-time sequencing on the PacBio Sequel®. Reagent quantities support 8 sequencing reactions to be used in conjunction with SMRT® Cell 4Pac(s).  (8 Cells max/Each Row supplies reagents for 1 Sequel SMRT Cell)" ExpirationDate="2018-03-08" IsRestricted="true" LotNumber="010765" Name="Sequel® Dev Sequencing Plate" PartNumber="999-861-800" Tags="Sequencing Kit, SQK" Version="3.0">
+                    <pbrk:ReagentTubes Barcode="012197100619600033122" ExpirationDate="2022-03-31" LotNumber="012197" Name="Sequel® SMRT®Cell Oil" PartNumber="100-619-600"/>
+                </pbmeta:SequencingKitPlate>
+                <pbmeta:Primary>
+                    <pbmeta:AutomationName>SequelAlpha</pbmeta:AutomationName>
+                    <pbmeta:ConfigFileName>SqlPoC_SubCrf_2C2A-t2.xml</pbmeta:ConfigFileName>
+                    <pbmeta:OutputOptions>
+                        <pbmeta:ResultsFolder>326/3260208/r54075_20180905_225050/1_A01/</pbmeta:ResultsFolder>
+                        <pbmeta:CollectionPathUri>/pbi/collections/326/3260208/r54075_20180905_225050/1_A01/</pbmeta:CollectionPathUri>
+                        <pbmeta:CopyFiles>
+                            <pbmeta:CollectionFileCopy>Trace</pbmeta:CollectionFileCopy>
+                            <pbmeta:CollectionFileCopy>Fasta</pbmeta:CollectionFileCopy>
+                            <pbmeta:CollectionFileCopy>Bam</pbmeta:CollectionFileCopy>
+                            <pbmeta:CollectionFileCopy>StatsH5</pbmeta:CollectionFileCopy>
+                        </pbmeta:CopyFiles>
+                        <pbmeta:Readout>Pulses</pbmeta:Readout>
+                        <pbmeta:MetricsVerbosity>High</pbmeta:MetricsVerbosity>
+                        <pbmeta:TransferResource>
+                            <pbmeta:Id>srs-pbi-collections</pbmeta:Id>
+                            <pbmeta:TransferScheme>SRS</pbmeta:TransferScheme>
+                            <pbmeta:Name>PBI Collections Xfer test using Rsync+SSH</pbmeta:Name>
+                            <pbmeta:Description>Test Location for writing Transfer services to write to. Should be used by internal tools (PA SIM) and ICS tests</pbmeta:Description>
+                            <pbmeta:DestPath>/pbi/collections</pbmeta:DestPath>
+                        </pbmeta:TransferResource>
+                    </pbmeta:OutputOptions>
+                </pbmeta:Primary>
+                <pbmeta:Secondary>
+                    <pbmeta:AutomationName>NoRS_Standard_Edna.1</pbmeta:AutomationName>
+                    <pbmeta:AutomationParameters>
+                        <pbmeta:AutomationParameter CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="Reference" SimpleValue="EColi5k_001_BsaI_AS_tc6_scr_tc6_unrolled_circular_12x_l123900" ValueDataType="String"/>
+                    </pbmeta:AutomationParameters>
+                    <pbmeta:CellCountInJob>0</pbmeta:CellCountInJob>
+                </pbmeta:Secondary>
+                <pbmeta:UserDefinedFields>
+                    <pbbase:DataEntities Name="LIMS_IMPORT" SimpleValue="1234TestFolder" ValueDataType="String"/>
+                    <pbbase:DataEntities Name="USER" SimpleValue="TestUser" ValueDataType="String"/>
+                    <pbbase:DataEntities ValueDataType="String"/>
+                    <pbbase:DataEntities ValueDataType="String"/>
+                    <pbbase:DataEntities ValueDataType="String"/>
+                    <pbbase:DataEntities ValueDataType="String"/>
+                </pbmeta:UserDefinedFields>
+                <pbmeta:ComponentVersions>
+                    <pbmeta:VersionInfo Name="ics" Version="6.0.0.SNAPSHOT39495"/>
+                    <pbmeta:VersionInfo Name="iui" Version="6.0.0.SNAPSHOT39495"/>
+                    <pbmeta:VersionInfo Name="chemistry" Version="6.0.0.SNAPSHOT39452"/>
+                    <pbmeta:VersionInfo Name="pa" Version="6.0.0.SNAPSHOT39495"/>
+                    <pbmeta:VersionInfo Name="paws" Version="6.0.0.SNAPSHOT39495"/>
+                    <pbmeta:VersionInfo Name="ppa" Version="6.0.0.SNAPSHOT39495"/>
+                    <pbmeta:VersionInfo Name="realtime" Version="6.0.0.SNAPSHOT39495"/>
+                    <pbmeta:VersionInfo Name="transfer" Version="6.0.0.SNAPSHOT39495"/>
+                    <pbmeta:VersionInfo Name="smrtlink-analysisservices-gui" Version="6.0.0.45618"/>
+                    <pbmeta:VersionInfo Name="smrtimisc" Version="6.0.0.45621"/>
+                    <pbmeta:VersionInfo Name="smrtlink" Version="6.0.0.45621"/>
+                    <pbmeta:VersionInfo Name="smrttools" Version="6.0.0.45580"/>
+                    <pbmeta:VersionInfo Name="smrtinub" Version="6.0.0.45580"/>
+                    <pbmeta:VersionInfo Name="smrtview" Version="6.0.0.45580"/>
+                </pbmeta:ComponentVersions>
+            </pbmeta:CollectionMetadata>
+            <pbmeta:CollectionMetadata Context="m54075_180905_233034" CreatedAt="0001-01-01T00:00:00" InstrumentId="60001" InstrumentName="Inst60001" MetaType="CollectionMetadata" ModifiedAt="0001-01-01T00:00:00" Status="Ready" TimeStampedName="60001-CollectionMetadata-2015-07-09T09:30:47.833Z" UniqueId="ca4bfe85-3047-433f-a9ce-e8cb63d8b27f">
+                <pbmeta:InstCtrlVer>6.0.0.SNAPSHOT39495</pbmeta:InstCtrlVer>
+                <pbmeta:SigProcVer>6.0.0.SNAPSHOT39495</pbmeta:SigProcVer>
+                <pbmeta:RunDetails>
+                    <pbmeta:TimeStampedName>r54075_20180905_232954</pbmeta:TimeStampedName>
+                    <pbmeta:Name>3260208_188nM-GTAC_4xGCratio_LP7_100fps_15min_5kEColi_SP2p1_3uMSSB_BA243494</pbmeta:Name>
+                    <pbmeta:CreatedBy>unknown</pbmeta:CreatedBy>
+                    <pbmeta:WhenCreated>2015-07-09T09:30:47.833Z</pbmeta:WhenCreated>
+                    <pbmeta:StartedBy>unknown</pbmeta:StartedBy>
+                </pbmeta:RunDetails>
+                <pbmeta:WellSample CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="3260208_188nM-GTAC_4xGCratio_LP7_100fps_15min_5kEColi_SP2p1_3uMSSB_BA243494">
+                    <pbmeta:WellName>SamplePlate-1-A-1</pbmeta:WellName>
+                    <pbmeta:Concentration>0</pbmeta:Concentration>
+                    <pbmeta:SampleReuseEnabled>false</pbmeta:SampleReuseEnabled>
+                    <pbmeta:StageHotstartEnabled>false</pbmeta:StageHotstartEnabled>
+                    <pbmeta:SizeSelectionEnabled>false</pbmeta:SizeSelectionEnabled>
+                    <pbmeta:UseCount>0</pbmeta:UseCount>
+                </pbmeta:WellSample>
+                <pbmeta:Automation Name="manualcellprep.py">
+                    <pbbase:AutomationParameters>
+                        <pbbase:AutomationParameter CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="CouplerLaserPower" SimpleValue="7" ValueDataType="Double"/>
+                        <pbbase:AutomationParameter CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="MovieLength" SimpleValue="15" ValueDataType="Double"/>
+                        <pbbase:AutomationParameter CreatedAt="2018-03-19T20:02:47.574Z" ModifiedAt="0001-01-01T00:00:00" Name="Exposure" SimpleValue="0.01" ValueDataType="String"/>
+                        <pbbase:AutomationParameter CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="SNRCut" SimpleValue="2.5" ValueDataType="Double"/>
+                        <pbbase:AutomationParameter CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="ActiveLaserPower" SimpleValue="False" ValueDataType="String"/>
+                        <pbbase:AutomationParameter CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="sequencingPixelROI" SimpleValue="[[0,0,1080,1920]]" ValueDataType="JSON"/>
+                        <pbbase:AutomationParameter CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="traceFilePixelROI" SimpleValue="[[0,0,8,32],[71,0,8,32],[142,0,8,32],[213,0,8,32],[284,0,8,32],[355,0,8,32],[426,0,8,32],[497,0,8,32],[568,0,8,32],[639,0,8,32],[710,0,8,32],[781,0,8,32],[852,0,8,32],[923,0,8,32],[994,0,8,32],[1072,0,8,32],[0,128,8,32],[71,128,8,32],[142,128,8,32],[213,128,8,32],[284,128,8,32],[355,128,8,32],[426,128,8,32],[497,128,8,32],[568,128,8,32],[639,128,8,32],[710,128,8,32],[781,128,8,32],[852,128,8,32],[923,128,8,32],[994,128,8,32],[1072,128,8,32],[0,256,8,32],[71,256,8,32],[142,256,8,32],[213,256,8,32],[284,256,8,32],[355,256,8,32],[426,256,8,32],[497,256,8,32],[568,256,8,32],[639,256,8,32],[710,256,8,32],[781,256,8,32],[852,256,8,32],[923,256,8,32],[994,256,8,32],[1072,256,8,32],[0,384,8,32],[71,384,8,32],[142,384,8,32],[213,384,8,32],[284,384,8,32],[355,384,8,32],[426,384,8,32],[497,384,8,32],[568,384,8,32],[639,384,8,32],[710,384,8,32],[781,384,8,32],[852,384,8,32],[923,384,8,32],[994,384,8,32],[1072,384,8,32],[0,512,8,32],[71,512,8,32],[142,512,8,32],[213,512,8,32],[284,512,8,32],[355,512,8,32],[426,512,8,32],[497,512,8,32],[568,512,8,32],[639,512,8,32],[710,512,8,32],[781,512,8,32],[852,512,8,32],[923,512,8,32],[994,512,8,32],[1072,512,8,32],[0,640,8,32],[71,640,8,32],[142,640,8,32],[213,640,8,32],[284,640,8,32],[355,640,8,32],[426,640,8,32],[497,640,8,32],[568,640,8,32],[639,640,8,32],[710,640,8,32],[781,640,8,32],[852,640,8,32],[923,640,8,32],[994,640,8,32],[1072,640,8,32],[0,768,8,32],[71,768,8,32],[142,768,8,32],[213,768,8,32],[284,768,8,32],[355,768,8,32],[426,768,8,32],[497,768,8,32],[568,768,8,32],[639,768,8,32],[710,768,8,32],[781,768,8,32],[852,768,8,32],[923,768,8,32],[994,768,8,32],[1072,768,8,32],[0,896,8,32],[71,896,8,32],[142,896,8,32],[213,896,8,32],[284,896,8,32],[355,896,8,32],[426,896,8,32],[497,896,8,32],[568,896,8,32],[639,896,8,32],[710,896,8,32],[781,896,8,32],[852,896,8,32],[923,896,8,32],[9
94,896,8,32],[1072,896,8,32],[0,1024,8,32],[71,1024,8,32],[142,1024,8,32],[213,1024,8,32],[284,1024,8,32],[355,1024,8,32],[426,1024,8,32],[497,1024,8,32],[568,1024,8,32],[639,1024,8,32],[710,1024,8,32],[781,1024,8,32],[852,1024,8,32],[923,1024,8,32],[994,1024,8,32],[1072,1024,8,32],[0,1152,8,32],[71,1152,8,32],[142,1152,8,32],[213,1152,8,32],[284,1152,8,32],[355,1152,8,32],[426,1152,8,32],[497,1152,8,32],[568,1152,8,32],[639,1152,8,32],[710,1152,8,32],[781,1152,8,32],[852,1152,8,32],[923,1152,8,32],[994,1152,8,32],[1072,1152,8,32],[0,1280,8,32],[71,1280,8,32],[142,1280,8,32],[213,1280,8,32],[284,1280,8,32],[355,1280,8,32],[426,1280,8,32],[497,1280,8,32],[568,1280,8,32],[639,1280,8,32],[710,1280,8,32],[781,1280,8,32],[852,1280,8,32],[923,1280,8,32],[994,1280,8,32],[1072,1280,8,32],[0,1408,8,32],[71,1408,8,32],[142,1408,8,32],[213,1408,8,32],[284,1408,8,32],[355,1408,8,32],[426,1408,8,32],[497,1408,8,32],[568,1408,8,32],[639,1408,8,32],[710,1408,8,32],[781,1408,8,32],[852,1408,8,32],[923,1408,8,32],[994,1408,8,32],[1072,1408,8,32],[0,1536,8,32],[71,1536,8,32],[142,1536,8,32],[213,1536,8,32],[284,1536,8,32],[355,1536,8,32],[426,1536,8,32],[497,1536,8,32],[568,1536,8,32],[639,1536,8,32],[710,1536,8,32],[781,1536,8,32],[852,1536,8,32],[923,1536,8,32],[994,1536,8,32],[1072,1536,8,32],[0,1664,8,32],[71,1664,8,32],[142,1664,8,32],[213,1664,8,32],[284,1664,8,32],[355,1664,8,32],[426,1664,8,32],[497,1664,8,32],[568,1664,8,32],[639,1664,8,32],[710,1664,8,32],[781,1664,8,32],[852,1664,8,32],[923,1664,8,32],[994,1664,8,32],[1072,1664,8,32],[0,1792,8,32],[71,1792,8,32],[142,1792,8,32],[213,1792,8,32],[284,1792,8,32],[355,1792,8,32],[426,1792,8,32],[497,1792,8,32],[568,1792,8,32],[639,1792,8,32],[710,1792,8,32],[781,1792,8,32],[852,1792,8,32],[923,1792,8,32],[994,1792,8,32],[1072,1792,8,32],[0,1888,8,32],[71,1888,8,32],[142,1888,8,32],[213,1888,8,32],[284,1888,8,32],[355,1888,8,32],[426,1888,8,32],[497,1888,8,32],[568,1888,8,32],[639,1888,8,32],[710,1888,8,32],[781,1888,8,32],[852
,1888,8,32],[923,1888,8,32],[994,1888,8,32],[1072,1888,8,32]]" ValueDataType="JSON"/>
+                        <pbbase:AutomationParameter CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="CellAtStage" SimpleValue="True" ValueDataType="String"/>
+                        <pbbase:AutomationParameter CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="SkipAlignment" SimpleValue="False" ValueDataType="String"/>
+                        <pbbase:AutomationParameter CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="UseStageHotStart" SimpleValue="False" ValueDataType="String"/>
+                        <pbbase:AutomationParameter Name="CellNFCIndex" SimpleValue="1" ValueDataType="Int32"/>
+                        <pbbase:AutomationParameter Name="CollectionNumber" SimpleValue="0" ValueDataType="Int32"/>
+                        <pbbase:AutomationParameter Name="HasN2Switch" SimpleValue="False" ValueDataType="Boolean"/>
+                        <pbbase:AutomationParameter Name="TipSearchMaxDuration" SimpleValue="576" ValueDataType="Int32"/>
+                    </pbbase:AutomationParameters>
+                </pbmeta:Automation>
+                <pbmeta:CollectionNumber>0</pbmeta:CollectionNumber>
+                <pbmeta:CellIndex>0</pbmeta:CellIndex>
+                <pbmeta:SetNumber>0</pbmeta:SetNumber>
+                <pbmeta:CellPac Barcode="BA233119" Description="The Sequel SMRT Cell 1M (4/Pack) are nanofabricated consumables each patterned with 1,000,000 wells called zero-mode waveguides (ZMWs). They are packaged together in a streamlined 4 SMRT Cell format (4/tray). One SMRT Cell is utilized in each sequencing reaction, and experiments can be run in single or in batch mode, giving your projects a greater level of flexibility." ExpirationDate="2018-06-08" LotNumber="321693" MovieTimeGrade="LR" Name="SMRT® Cell 1M v2 LR (4/Pack)" PartNumber="101-008-001" Version="2.0">
+                    <pbbase:ChipLayout>SequEL_4.0_RTO3</pbbase:ChipLayout>
+                </pbmeta:CellPac>
+                <pbmeta:ControlKit Barcode="444444101084300062030" ChipType="1mChip" Description="The Sequel DNA Internal Control 2.0 contains a fixed template of 2 kb length bound to Sequel Binding Kit 2.0 for use as an internal sequencing control. Reagent quantities provide spike-in controls for a minimum of 24 samples." ExpirationDate="2030-06-20" LotNumber="444444" Name="Sequel® DNA Internal Control 2.0" PartNumber="101-084-300" Tags="Control Kit, CCK" Version="2.0">
+                    <pbbase:CustomSequence>&gt;left_adapter\nTAGAGAGAGAAAAGGAGGAGGAGGCAACAACAACAACTCTCTCTA\n&gt;right_adapter\nTAGAGAGAGAAAAGGAGGAGGAGGCAACAACAACAACTCTCTCTA\n&gt;custom_sequence\nTGTCTAGGTCATCTCAACGTAGCTTTGACATATAACTTATCTAAAGTAATCCCTGCACACCTGTATGCATTATGCTTGCTATACACGCGGACACAGGCATATCATTTATTTTTTGCCATGTCCATTAATTGTTCAATAATTTTACTCACGGTATTTAATTTGATGTTGTGTTATATAGAATTGGAATTAAACTTATAAGGATGCTTAGACGTTGCATTATAAAAGTTTATGTACTAAGTATTTAAGACATTGGCATATGATTATAGCTTGACATTATTAAAAATTAATTAATTAAATCTCACACAATACTTATTCAAGACATTTTTACTAAGATAACCAAAGGAATGCGAACAAAATAATACTTAAAATATAAGACTTAGAAGTAATATGATCCAATAGTTACATATAGTACACTAAGTTCCTAAATTATATAACTTTAAAACAAAGTTACGAAATTTGGAAATAATTTTATTTAATCATATTTTCATAATAATGAAATACTGTTTATTTCAGTGGCGAAAAGAGATAATACGATTTTATAGTGATAGAATATCCTTGAAATATCTAAAGATAAAATTAGAAACTTTCTCTTTTCGCTGTAAAGCTATATGACTTAAAAATAACTTATACGCAAAGTATATTGCAGTGGAAACCCAAGAGTATAGTAGCCATGTAATCTCGGGTTCGAAACTACACGCCGCGCACGTAGTCAGATGGTCTGAAACTTGTCTGGGGCTGTTTGTTGACGGATGGAGACTTCACTAAGTGGCGTCAGGCGATGCGCACACACGGGACTCAATCCCGTAGCATGTTATGTGTCGTTCGAAACTCGTGCGTTCGAGATTTACGCCACATTGCCGGCTGGTCCAAGGACGTTATCTACCAGATGATACGGTCCAATTCGTAAGTTTGACTCACATAGTCGCGAACCGCGGAGCTGGAGAACAATAATTACCGGATGATTAGTTGACCATACGCACTATCATGCTCCGTGACTCAGTTTCCGCCATGGAGTTCTCACAGCCCCGTGTGTACCATAACTGCAGTAAGTAAGGACCTTGTTCGGAGGCCGACTCGTATTTCATATGATCTTAGTCTCGCCACCTTATCGCACGAATTGGGGGTGTCTTTTAGCCGACTCCGGCACGATCCGCCGGGAAGTTACTCGACCAGTTGCGGGACGCCCTAGTATGTTCGTATTACGTTCGATGCGTAAGCACCCCAGAGATTTTTGGCGGACGTTTCGGTAAATCATAGTAGAACCGGAGCGGTAAAGCTATTGATAACACGCAGGGACGAGCCAGTCGTCTAAGCTCCTCAGGGGTACCGTTCGCCGGACTACAGCCTGTCCCCGGCGGCCGCAACTGGGCTGCGATCCAGCCCCCGCTCCAAAAGGATGACTCGACCTTGCGCCTCGCGTACTCTGCTCTCGAGCTGTCTCCGTGGGCAATGCCGGCTCACGCTGTGGGGAACCCTGGACGCCCGGGCCGAGCCGACGTGGCCCCGCCCAGGCCTTTTCGTCGATCGCAGCTATGTACCCTGTGCTGGCCAGCGCTACTGCGCCGGCCATTAGCGGTGCGCTCTCGACTCGGCCCCAACGTAGACGGCGTCGCTGGCCGGATTCAAAGAAGTGAGCTACTACCATCGCGTGACGCCCTGCGGGCCTGAGTAACCGTGCACGAAGGACACCCCGTTCGTGGCGGGGGTTGCCTCCGCGACGGTCGCCAACGTTGGGGGTCGGTGCATTCAGGCGACGAGGGACCGCTGGTTTCCGGAGAGCGGCCTGTGCTCACACAGGTGCGGT
CCATGGGGCCTGTGGATCCGGTTCTCCCACGCGTAGCGCCGGCGTTAGCATGGACGCTAAATAAGTATACGCCGGCAAAGGGAGTGTAGGCCGGCCCGAGGGCAATCGCGGTTACCGGGGTGGGGGAGCTCCCCGCACCAGCCTTGATGTGGTGTGCGAGCG</pbbase:CustomSequence>
+                </pbmeta:ControlKit>
+                <pbmeta:TemplatePrepKit Barcode="333333100902400092215" Description="The SMRTbell® Template Prep Kit contains reagent supplies to perform SMRTbell library preparations of primer-annealed SMRTbell libraries for insert sizes ranging from 500 bp to over 20 kb." ExpirationDate="2015-09-22" IsRestricted="true" LotNumber="333333" MaxInsertSize="20000" MinInsertSize="500" Name="CRISPR/Cas9 Placeholder w/ asymmetric adapters" PartNumber="100-902-400" Tags="Template Prep Kit, TPK" Version="1.0">
+                    <pbbase:LeftAdaptorSequence>ATCTCTCTCAATTTTTTTTTTTTTTTTTTTTTTTAAGAGAGAGAT</pbbase:LeftAdaptorSequence>
+                    <pbbase:LeftPrimerSequence>aacggaggaggagga</pbbase:LeftPrimerSequence>
+                    <pbbase:RightAdaptorSequence>ATCTCTCTCAACAACAACAACGGAGGAGGAGGAAAAGAGAGAGAT</pbbase:RightAdaptorSequence>
+                    <pbbase:RightPrimerSequence>aacggaggaggagga</pbbase:RightPrimerSequence>
+                </pbmeta:TemplatePrepKit>
+                <pbmeta:BindingKit Barcode="777777100862200062030" ChipType="1mChip" Description="The Sequel Binding Kit 2.0 contains reagent supplies to bind prepared DNA template libraries to the Sequel Polymerase 2.0 in preparation for sequencing on the Sequel System. The result is a DNA polymerase/template complex. Sequel Binding Kit 1.0 should be used only with Sequel Sequencing Kit 2.0. Reagent quantities support 24 binding reactions." ExpirationDate="2030-06-20" LotNumber="777777" Name="Sequel® Binding Kit 2.0" PartNumber="100-862-200" Tags="Binding Kit, BDK" Version="2.0"/>
+                <pbmeta:SequencingKitPlate Barcode="010765999861800030818" Description="The DNA Sequencing Kit contains a sequencing reagent plate with chemistry for single molecule real-time sequencing on the PacBio Sequel®. Reagent quantities support 8 sequencing reactions to be used in conjunction with SMRT® Cell 4Pac(s).  (8 Cells max/Each Row supplies reagents for 1 Sequel SMRT Cell)" ExpirationDate="2018-03-08" IsRestricted="true" LotNumber="010765" Name="Sequel® Dev Sequencing Plate" PartNumber="999-861-800" Tags="Sequencing Kit, SQK" Version="3.0">
+                    <pbrk:ReagentTubes Barcode="012197100619600033122" ExpirationDate="2022-03-31" LotNumber="012197" Name="Sequel® SMRT®Cell Oil" PartNumber="100-619-600"/>
+                </pbmeta:SequencingKitPlate>
+                <pbmeta:Primary>
+                    <pbmeta:AutomationName>SequelAlpha</pbmeta:AutomationName>
+                    <pbmeta:ConfigFileName>SqlPoC_SubCrf_2C2A-t2.xml</pbmeta:ConfigFileName>
+                    <pbmeta:OutputOptions>
+                        <pbmeta:ResultsFolder>326/3260208/r54075_20180905_232954/1_A01/</pbmeta:ResultsFolder>
+                        <pbmeta:CollectionPathUri>/pbi/collections/326/3260208/r54075_20180905_232954/1_A01/</pbmeta:CollectionPathUri>
+                        <pbmeta:CopyFiles>
+                            <pbmeta:CollectionFileCopy>Trace</pbmeta:CollectionFileCopy>
+                            <pbmeta:CollectionFileCopy>Fasta</pbmeta:CollectionFileCopy>
+                            <pbmeta:CollectionFileCopy>Bam</pbmeta:CollectionFileCopy>
+                            <pbmeta:CollectionFileCopy>StatsH5</pbmeta:CollectionFileCopy>
+                        </pbmeta:CopyFiles>
+                        <pbmeta:Readout>Pulses</pbmeta:Readout>
+                        <pbmeta:MetricsVerbosity>High</pbmeta:MetricsVerbosity>
+                        <pbmeta:TransferResource>
+                            <pbmeta:Id>srs-pbi-collections</pbmeta:Id>
+                            <pbmeta:TransferScheme>SRS</pbmeta:TransferScheme>
+                            <pbmeta:Name>PBI Collections Xfer test using Rsync+SSH</pbmeta:Name>
+                            <pbmeta:Description>Test Location for writing Transfer services to write to. Should be used by internal tools (PA SIM) and ICS tests</pbmeta:Description>
+                            <pbmeta:DestPath>/pbi/collections</pbmeta:DestPath>
+                        </pbmeta:TransferResource>
+                    </pbmeta:OutputOptions>
+                </pbmeta:Primary>
+                <pbmeta:Secondary>
+                    <pbmeta:AutomationName>NoRS_Standard_Edna.1</pbmeta:AutomationName>
+                    <pbmeta:AutomationParameters>
+                        <pbmeta:AutomationParameter CreatedAt="0001-01-01T00:00:00" ModifiedAt="0001-01-01T00:00:00" Name="Reference" SimpleValue="EColi5k_001_BsaI_AS_tc6_scr_tc6_unrolled_circular_12x_l123900" ValueDataType="String"/>
+                    </pbmeta:AutomationParameters>
+                    <pbmeta:CellCountInJob>0</pbmeta:CellCountInJob>
+                </pbmeta:Secondary>
+                <pbmeta:UserDefinedFields>
+                    <pbbase:DataEntities Name="LIMS_IMPORT" SimpleValue="1234TestFolder" ValueDataType="String"/>
+                    <pbbase:DataEntities Name="USER" SimpleValue="TestUser" ValueDataType="String"/>
+                    <pbbase:DataEntities ValueDataType="String"/>
+                    <pbbase:DataEntities ValueDataType="String"/>
+                    <pbbase:DataEntities ValueDataType="String"/>
+                    <pbbase:DataEntities ValueDataType="String"/>
+                </pbmeta:UserDefinedFields>
+                <pbmeta:ComponentVersions>
+                    <pbmeta:VersionInfo Name="ics" Version="6.0.0.SNAPSHOT39495"/>
+                    <pbmeta:VersionInfo Name="iui" Version="6.0.0.SNAPSHOT39495"/>
+                    <pbmeta:VersionInfo Name="chemistry" Version="6.0.0.SNAPSHOT39452"/>
+                    <pbmeta:VersionInfo Name="pa" Version="6.0.0.SNAPSHOT39495"/>
+                    <pbmeta:VersionInfo Name="paws" Version="6.0.0.SNAPSHOT39495"/>
+                    <pbmeta:VersionInfo Name="ppa" Version="6.0.0.SNAPSHOT39495"/>
+                    <pbmeta:VersionInfo Name="realtime" Version="6.0.0.SNAPSHOT39495"/>
+                    <pbmeta:VersionInfo Name="transfer" Version="6.0.0.SNAPSHOT39495"/>
+                    <pbmeta:VersionInfo Name="smrtlink-analysisservices-gui" Version="6.0.0.45618"/>
+                    <pbmeta:VersionInfo Name="smrtimisc" Version="6.0.0.45621"/>
+                    <pbmeta:VersionInfo Name="smrtlink" Version="6.0.0.45621"/>
+                    <pbmeta:VersionInfo Name="smrttools" Version="6.0.0.45580"/>
+                    <pbmeta:VersionInfo Name="smrtinub" Version="6.0.0.45580"/>
+                    <pbmeta:VersionInfo Name="smrtview" Version="6.0.0.45580"/>
+                </pbmeta:ComponentVersions>
+            </pbmeta:CollectionMetadata>
+        </pbmeta:Collections>
+    </pbds:DataSetMetadata>
+</pbds:ConsensusReadSet>
diff --git a/tests/data/dataset/ccsread.dataset.xml b/tests/data/dataset/ccsread.dataset.xml

new file mode 100644 (file)

index 0000000..97b5943
--- /dev/null
+++ b/tests/data/dataset/ccsread.dataset.xml
@@ -0,0 +1,15 @@
+<?xml version="1.0" encoding="utf-8"?>
+<pbds:ConsensusReadSet CreatedAt="2015-01-27T09:00:01" MetaType="PacBio.DataSet.ConsensusReadSet" Name="DataSet_ConsensusReadSet" Tags="barcode moreTags mapping mytags" UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe519c" Version="2.3.0" xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd" xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDatasets.xsd">
+       <pbbase:ExternalResources>
+               <pbbase:ExternalResource Name="First ConsensusRead BAM" Description="Points to an example ConsensusRead BAM file." MetaType="PacBio.ConsensusReadFile.ConsensusReadBamFile" ResourceId="file:///mnt/path/to/ccsreads0.bam" Tags="Example">
+                       <pbbase:FileIndices>
+                               <pbbase:FileIndex MetaType="PacBio.Index.PacBioIndex" ResourceId="file:///mnt/path/to/ccsreads0.pbi"/>
+                       </pbbase:FileIndices>
+               </pbbase:ExternalResource>
+               <pbbase:ExternalResource Name="Second ConsensusRead BAM" Description="Points to another example ConsensusRead BAM file." MetaType="PacBio.ConsensusReadFile.ConsensusReadBamFile" ResourceId="file:///mnt/path/to/ccsreads1.bam" Tags="Example">
+                       <pbbase:FileIndices>
+                               <pbbase:FileIndex MetaType="PacBio.Index.PacBioIndex" ResourceId="file:///mnt/path/to/ccsreads0.pbi"/>
+                       </pbbase:FileIndices>
+               </pbbase:ExternalResource>
+       </pbbase:ExternalResources>
+</pbds:ConsensusReadSet>
diff --git a/tests/data/dataset/lambda_contigs.xml b/tests/data/dataset/lambda_contigs.xml

new file mode 100644 (file)

index 0000000..4abc8cc
--- /dev/null
+++ b/tests/data/dataset/lambda_contigs.xml
@@ -0,0 +1,6 @@
+<?xml version='1.0' encoding='UTF-8'?>
+<pbds:ReferenceSet CreatedAt="2015-05-28T10:56:36" MetaType="PacBio.DataSet.ReferenceSet" Name="" Tags="" UniqueId="596e87db-34f9-d2fd-c905-b017543170e1" Version="2.3.0" xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd" xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDatasets.xsd">
+    <pbbase:ExternalResources>
+        <pbbase:ExternalResource ResourceId="file:tests/data/lambda_contigs.fasta"/>
+    </pbbase:ExternalResources>
+</pbds:ReferenceSet>
+\ No newline at end of file
diff --git a/tests/data/dataset/malformed.xml b/tests/data/dataset/malformed.xml

new file mode 100644 (file)

index 0000000..31e0942
--- /dev/null
+++ b/tests/data/dataset/malformed.xml
@@ -0,0 +1,85 @@
+<?xml version="1.0" encoding="utf-8"?>
+<SubreadSet 
+    CreatedAt="2015-08-19T15:39:36.331"
+    Description="Merged dataset from 1 files using DatasetMerger 0.1.2" 
+    MetaType="PacBio.DataSet.HdfSubreadSet" 
+    Name="Subreads from runr000013_42267_150403" 
+    Tags="pacbio.secondary.instrument=RS" 
+    TimeStampedName="hdfsubreadset_2015-08-19T15:39:36.331-07:00" 
+    UniqueId="b4741521-2a4c-42df-8a13-0a755ca9ed1e"
+    Version="0.5" 
+    xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd" 
+    xmlns:ns0="http://pacificbiosciences.com/PacBioBaseDataModel.xsd"
+    xmlns:ns1="http://pacificbiosciences.com/PacBioSampleInfo.xsd"
+    xmlns:ns2="http://pacificbiosciences.com/PacBioCollectionMetadata.xsd"
+    xmlns:ns3="http://pacificbiosciences.com/PacBioReagentKit.xsd">
+       <ns0:ExternalResources>
+        <ns0:ExternalResource 
+            MetaType="SubreadFile.SubreadBamFile"
+            TimeStampedName="SubreadFile.SubreadBamFile_00000000000000"
+            UniqueId="251acf71-9eb0-489e-9dd1-cdbd11432753" 
+            ResourceId="file:///mnt/secondary-siv/jenkins/jenkins-bot01/workspace/Ubuntu1404_Mainline_SA3_Tiny_tests/software/smrtanalysis/siv/testkit-jobs/sa3_pipelines/mapping/tiny/job_output-ubuntu1404/tasks/pbsmrtpipe.tasks.h5_subreads_to_subread-0//mnt/secondary-siv/jenkins/jenkins-bot01/workspace/Ubuntu1404_Mainline_SA3_Tiny_tests/software/smrtanalysis/siv/testkit-jobs/sa3_pipelines/mapping/tiny/job_output-ubuntu1404/tasks/pbsmrtpipe.tasks.h5_subreads_to_subread-0/file.subreads.subreads.bam"  />
+    </ns0:ExternalResources>
+    <DataSetMetadata>
+        <TotalLength>50000000</TotalLength>
+        <NumRecords>150000</NumRecords>
+        <ns2:Collections>
+            <ns2:CollectionMetadata 
+                Context="m150404_101626_42267_c100807920800000001823174110291514_s1_p0" 
+                InstrumentId="1" 
+                InstrumentName="42267" 
+                MetaType="PacBio.Collection" 
+                TimeStampedName="m150404_101626_42267_c100807920800000001823174110291514_s1_p0" 
+                UniqueId="d66c8372-2b70-4dcf-b64f-9f8b5cc351fd">
+                <ns2:InstCtrlVer>2.3.0.1.142990</ns2:InstCtrlVer>
+                <ns2:SigProcVer>NRT@172.31.128.10:8082, SwVer=2301.142990, HwVer=1.0</ns2:SigProcVer>
+                <ns2:RunDetails>
+                    <ns2:RunId>r000013_42267_150403</ns2:RunId>
+                    <ns2:Name>Inst42267-040315-SAT-100pM-2kb-P6C4</ns2:Name>
+                </ns2:RunDetails>
+                <ns2:WellSample Name="Inst42267-040315-SAT-100pM-2kb-P6C4">
+                    <ns2:PlateId>Inst42267-040315-SAT-100pM-2kb-P6C4</ns2:PlateId>
+                    <ns2:WellName>Inst42267-040315-SAT-100pM-2kb-P6C4</ns2:WellName>
+                    <ns2:Concentration>0.0</ns2:Concentration>                         
+                    <ns2:SampleReuseEnabled>false</ns2:SampleReuseEnabled>
+                    <ns2:StageHotstartEnabled>false</ns2:StageHotstartEnabled>
+                    <ns2:SizeSelectionEnabled>false</ns2:SizeSelectionEnabled>
+                    <ns2:UseCount>1</ns2:UseCount>
+                    <ns1:BioSamplePointers>
+                        <ns1:BioSamplePointer>251acf71-9eb0-489e-9dd1-cdbd11432752</ns1:BioSamplePointer>
+                    </ns1:BioSamplePointers>
+                </ns2:WellSample>
+                <ns2:Automation>
+                    <ns0:AutomationParameters>
+                        <ns0:AutomationParameter />
+                    </ns0:AutomationParameters>
+                </ns2:Automation>
+                <ns2:CollectionNumber>7</ns2:CollectionNumber>
+                <ns2:CellIndex>4</ns2:CellIndex>
+                <ns2:CellPac Barcode="10080792080000000182317411029151" />
+                <ns2:Primary>
+                    <ns2:AutomationName>BasecallerV1</ns2:AutomationName>
+                    <ns2:ConfigFileName>2-3-0_P6-C4.xml</ns2:ConfigFileName>
+                    <ns2:SequencingCondition />
+                    <ns2:OutputOptions>
+                        <ns2:ResultsFolder>Analysis_Results</ns2:ResultsFolder>
+                        <ns2:CollectionPathUri>rsy://mp-rsync/vol55//RS_DATA_STAGING/42267/Inst42267-040315-SAT-100pM-2kb-P6C4_13/A04_7/</ns2:CollectionPathUri>
+                        <ns2:CopyFiles>
+                            <ns2:CollectionFileCopy>Fasta</ns2:CollectionFileCopy>
+                        </ns2:CopyFiles>
+                        <ns2:Readout>Bases</ns2:Readout>
+                        <ns2:MetricsVerbosity>Minimal</ns2:MetricsVerbosity>
+                     </ns2:OutputOptions>
+                 </ns2:Primary>
+             </ns2:CollectionMetadata>
+         </ns2:Collections>
+         <ns1:BioSamples>
+             <ns1:BioSample
+                 Description="Inst42267-SAT-100pM-2kbLambda-P6C4-Std120_CPS_040315"
+                 MetaType="PacBio.Sample" 
+                 Name="Inst42267-040315-SAT-100pM-2kb-P6C4" 
+                 TimeStampedName="biosample_2015-08-19T15:39:36.331-07:00" 
+                 UniqueId="251acf71-9eb0-489e-9dd1-cdbd11432752" />
+         </ns1:BioSamples>
+      </DataSetMetadata>
+</SubreadSet>
diff --git a/tests/data/dataset/pbalchemy10kbp.xml b/tests/data/dataset/pbalchemy10kbp.xml

new file mode 100644 (file)

index 0000000..96189ad
--- /dev/null
+++ b/tests/data/dataset/pbalchemy10kbp.xml
@@ -0,0 +1,17 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<pbds:DataSet CreatedAt="2015-05-22T16:56:16" MetaType="PacBio.DataSet.DataSet" Name="" Tags="" UniqueId="58e3f7c5-24c1-b58b-fbd5-37de268cc2f0" Version="2.3.0" xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd" xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDatasets.xsd">
+  <pbbase:ExternalResources>
+    <pbbase:ExternalResource ResourceId="file:tests/data/pbalchemy10kbp.pbalign.sorted.pbver1.bam">
+      <pbbase:FileIndices>
+        <pbbase:FileIndex ResourceId="file:tests/data/pbalchemy10kbp.pbalign.sorted.pbver1.bam.bai"/>
+      </pbbase:FileIndices>
+    </pbbase:ExternalResource>
+  </pbbase:ExternalResources>
+  <pbds:Filters>
+      <pbds:Filter>
+          <pbbase:Properties>
+              <pbbase:Property Name="rname" Value="E.faecalis.1" Operator="=" />
+          </pbbase:Properties>
+      </pbds:Filter>
+  </pbds:Filters>
+</pbds:DataSet>
diff --git a/tests/data/dataset/qname_filter.bam b/tests/data/dataset/qname_filter.bam

new file mode 100644 (file)

index 0000000..9e5500d

Binary files /dev/null and b/tests/data/dataset/qname_filter.bam differ
diff --git a/tests/data/dataset/qname_filter.bam.pbi b/tests/data/dataset/qname_filter.bam.pbi

new file mode 100644 (file)

index 0000000..c8e80fd

Binary files /dev/null and b/tests/data/dataset/qname_filter.bam.pbi differ
diff --git a/tests/data/dataset/reference.dataset.xml b/tests/data/dataset/reference.dataset.xml

new file mode 100644 (file)

index 0000000..3cfbe8c
--- /dev/null
+++ b/tests/data/dataset/reference.dataset.xml
@@ -0,0 +1,20 @@
+<?xml version="1.0" encoding="utf-8"?>
+<pbds:ReferenceSet CreatedAt="2015-01-27T09:00:01" MetaType="PacBio.DataSet.ReferenceSet" Name="DataSet_ReferenceSet" Tags="barcode moreTags mapping mytags" UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe519c" Version="2.3.0"  xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd" xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDatasets.xsd">
+       <pbbase:ExternalResources>
+               <pbbase:ExternalResource Name="First References FASTA" Description="Points to an example references FASTA file." MetaType="PacBio.ReferenceFile.ReferenceFastaFile" ResourceId="file:///mnt/path/to/reference.fasta" Tags="Example">
+                       <pbbase:FileIndices>
+                               <pbbase:FileIndex MetaType="PacBio.Index.SaWriterIndex" ResourceId="file:///mnt/path/to/reference.fasta.sa"/>
+                               <pbbase:FileIndex MetaType="PacBio.Index.SamIndex" ResourceId="file:///mnt/path/to/reference.fasta.fai"/>
+                       </pbbase:FileIndices>
+               </pbbase:ExternalResource>
+       </pbbase:ExternalResources>
+       <pbds:DataSetMetadata>
+               <pbds:TotalLength>5000000</pbds:TotalLength>
+               <pbds:NumRecords>500</pbds:NumRecords>
+               <pbds:Organism>Tribble</pbds:Organism>
+               <pbds:Ploidy>Diploid</pbds:Ploidy>
+               <pbds:Contigs>
+                       <pbds:Contig Name="gi|229359445|emb|AM181176.4|" Description="Pseudomonas fluorescens SBW25 complete genome|quiver" Length="6722109" Digest="f627c795efad7ce0050ed42b942d408e"/>
+               </pbds:Contigs>
+       </pbds:DataSetMetadata>
+</pbds:ReferenceSet>
diff --git a/tests/data/dataset/subread_dataset1.xml b/tests/data/dataset/subread_dataset1.xml

new file mode 100644 (file)

index 0000000..1d64e79
--- /dev/null
+++ b/tests/data/dataset/subread_dataset1.xml
@@ -0,0 +1,76 @@
+<?xml version="1.0" encoding="utf-8"?>
+<pbds:SubreadSet CreatedAt="2015-01-27T09:00:01" MetaType="PacBio.DataSet.SubreadSet" Name="DataSet_SubreadSet" Tags="barcode moreTags mapping mytags" UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe519c" Version="2.3.0"  xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd" xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd" xmlns:pbmeta="http://pacificbiosciences.com/PacBioCollectionMetadata.xsd" xmlns:pbsample="http://pacificbiosciences.com/PacBioSampleInfo.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDatasets.xsd" >
+    <pbbase:ExternalResources>
+        <pbbase:ExternalResource Name="First Subreads BAM" Description="Points to an example Subreads BAM file." MetaType="SubreadFile.SubreadBamFile" ResourceId="file:///mnt/path/to/subreads0.bam" Tags="Example">
+            <pbbase:FileIndices>
+                <pbbase:FileIndex ResourceId="file:///mnt/path/to/subreads0.pbi"/>
+            </pbbase:FileIndices>
+        </pbbase:ExternalResource>
+        <pbbase:ExternalResource Name="Second Subreads BAM" Description="Points to another example Subreads BAM file." MetaType="SubreadFile.SubreadBamFile" ResourceId="file:///mnt/path/to/subreads1.bam" Tags="Example">
+            <pbbase:FileIndices>
+                <pbbase:FileIndex ResourceId="file:///mnt/path/to/subreads0.pbi"/>
+            </pbbase:FileIndices>
+        </pbbase:ExternalResource>
+    </pbbase:ExternalResources>
+    <pbds:Filters>
+        <pbds:Filter>
+            <pbbase:Properties>
+                <pbbase:Property Name="rq" Value="0.75" Operator=">"/>
+            </pbbase:Properties>
+        </pbds:Filter>
+        <pbds:Filter>
+            <pbbase:Properties>
+                <pbbase:Property Name="QNAME" Value="100/0/0_100" Operator="=="/>
+            </pbbase:Properties>
+        </pbds:Filter>
+    </pbds:Filters>
+    <pbds:DataSetMetadata>
+        <pbds:TotalLength>500000</pbds:TotalLength>
+        <pbds:NumRecords>500</pbds:NumRecords>
+        <pbmeta:Collections>
+            <pbmeta:CollectionMetadata Context="m152720_092723_00114_c100480560100000001823075906281381_s1_p0" InstrumentName="RS" InstrumentId="43210">
+                <pbmeta:InstCtrlVer>2.3.0.0.140640</pbmeta:InstCtrlVer>
+                <pbmeta:SigProcVer>NRT@172.31.128.10:8082, SwVer=2300.140640, HwVer=1.0</pbmeta:SigProcVer>
+                <pbmeta:RunDetails>
+                    <pbmeta:RunId>e903682f-e502-465c-a2b6-9dd77c9f43fc</pbmeta:RunId>
+                    <pbmeta:Name>beta4_130726_biotin_DEV_vs_MFG_PB11K_9458p</pbmeta:Name>
+                </pbmeta:RunDetails>
+                <pbmeta:WellSample Name="Well Sample 1" UniqueId="aaa2df90-d44f-4a48-9f35-3b99473c68f5">
+                    <pbmeta:PlateId>2014-12-24_141_NGAT_Igor_bisPNA Enrichment_Mag Bead Elution Buffers</pbmeta:PlateId>
+                    <pbmeta:WellName>B01</pbmeta:WellName>
+                    <pbmeta:Concentration>10</pbmeta:Concentration>
+                    <pbmeta:SampleReuseEnabled>true</pbmeta:SampleReuseEnabled>
+                    <pbmeta:StageHotstartEnabled>true</pbmeta:StageHotstartEnabled>
+                    <pbmeta:SizeSelectionEnabled>true</pbmeta:SizeSelectionEnabled>
+                    <pbmeta:UseCount>0</pbmeta:UseCount>
+                    <pbmeta:Comments>Lorem ipsum</pbmeta:Comments>
+                    <pbsample:BioSamplePointers>
+                        <pbsample:BioSamplePointer>abc2df90-d44f-4a48-9f35-3b99473c68f5</pbsample:BioSamplePointer>
+                    </pbsample:BioSamplePointers>
+                </pbmeta:WellSample>
+                <pbmeta:Automation>
+                    <pbbase:AutomationParameters>
+                                   <pbbase:AutomationParameter/>
+                    </pbbase:AutomationParameters>
+                </pbmeta:Automation>
+                <pbmeta:CollectionNumber>0</pbmeta:CollectionNumber>
+                <pbmeta:CellIndex>0</pbmeta:CellIndex>
+                <pbmeta:CellPac Barcode="100480560100000001823075906281381"/>
+                <pbmeta:Primary>
+                    <pbmeta:AutomationName>BasecallerV1</pbmeta:AutomationName>
+                    <pbmeta:ConfigFileName>1-3-0_Standard_C2.xml</pbmeta:ConfigFileName>
+                    <pbmeta:SequencingCondition/>
+                    <pbmeta:ResultsFolder>Analysis_Results</pbmeta:ResultsFolder>
+                    <pbmeta:CollectionPathUri>rsy://mp-rsync/vol56//RS_DATA_STAGING//2014-12-24_141_NGAT_Igor_bisPNA%20Enrichment_Mag%20Bead%20Elution%20Buffers_1094/B01_1</pbmeta:CollectionPathUri>
+                    <pbmeta:CopyFiles>
+                        <pbmeta:CollectionFileCopy>Bam</pbmeta:CollectionFileCopy>
+                    </pbmeta:CopyFiles>
+                </pbmeta:Primary>
+            </pbmeta:CollectionMetadata>
+        </pbmeta:Collections>
+        <pbsample:BioSamples>
+            <pbsample:BioSample UniqueId="abc2df90-d44f-4a48-9f35-3b99473c68f5" Name="consectetur purus" Description="Risus sit amet lectus vehicula vulputate quisque porta accumsan venenatis." CreatedAt="2015-01-20T13:27:23.9271737-08:00"/>
+        </pbsample:BioSamples>
+    </pbds:DataSetMetadata>
+</pbds:SubreadSet>
+<!-- TODO what do internal references look like?-->
diff --git a/tests/data/dataset/subread_dataset2.xml b/tests/data/dataset/subread_dataset2.xml

new file mode 100644 (file)

index 0000000..a395330
--- /dev/null
+++ b/tests/data/dataset/subread_dataset2.xml
@@ -0,0 +1,76 @@
+<?xml version="1.0" encoding="utf-8"?>
+<pbds:SubreadSet CreatedAt="2015-01-27T09:00:01" MetaType="PacBio.DataSet.SubreadSet" Name="DataSet_SubreadSet" Tags="barcode moreTags mapping mytags" UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe519c" Version="2.3.0" xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd" xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd" xmlns:pbmeta="http://pacificbiosciences.com/PacBioCollectionMetadata.xsd" xmlns:pbsample="http://pacificbiosciences.com/PacBioSampleInfo.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDatasets.xsd" >
+    <pbbase:ExternalResources>
+        <pbbase:ExternalResource Name="First Subreads BAM" Description="Points to an example Subreads BAM file." MetaType="SubreadFile.SubreadBamFile" ResourceId="file:///mnt/path/to/subreads2.bam" Tags="Example">
+            <pbbase:FileIndices>
+                <pbbase:FileIndex ResourceId="file:///mnt/path/to/subreads2.pbi"/>
+            </pbbase:FileIndices>
+        </pbbase:ExternalResource>
+        <pbbase:ExternalResource Name="Second Subreads BAM" Description="Points to another example Subreads BAM file." MetaType="SubreadFile.SubreadBamFile" ResourceId="file:///mnt/path/to/subreads3.bam" Tags="Example">
+            <pbbase:FileIndices>
+                <pbbase:FileIndex ResourceId="file:///mnt/path/to/subreads3.pbi"/>
+            </pbbase:FileIndices>
+        </pbbase:ExternalResource>
+    </pbbase:ExternalResources>
+    <pbds:Filters>
+        <pbds:Filter>
+            <pbbase:Properties>
+                <pbbase:Property Name="rq" Value="0.75" Operator=">"/>
+            </pbbase:Properties>
+        </pbds:Filter>
+        <pbds:Filter>
+            <pbbase:Properties>
+                <pbbase:Property Name="QNAME" Value="100/0/0_100" Operator="=="/>
+            </pbbase:Properties>
+        </pbds:Filter>
+    </pbds:Filters>
+    <pbds:DataSetMetadata>
+        <pbds:TotalLength>500000</pbds:TotalLength>
+        <pbds:NumRecords>500</pbds:NumRecords>
+        <pbmeta:Collections>
+            <pbmeta:CollectionMetadata Context="m152720_092723_00114_c100480560100000001823075906281381_s1_p0" InstrumentName="RS" InstrumentId="43210">
+                <pbmeta:InstCtrlVer>2.3.0.0.140640</pbmeta:InstCtrlVer>
+                <pbmeta:SigProcVer>NRT@172.31.128.10:8082, SwVer=2300.140640, HwVer=1.0</pbmeta:SigProcVer>
+                <pbmeta:RunDetails>
+                    <pbmeta:RunId>e903682f-e502-465c-a2b6-9dd77c9f43fc</pbmeta:RunId>
+                    <pbmeta:Name>beta4_130726_biotin_DEV_vs_MFG_PB11K_9458p</pbmeta:Name>
+                </pbmeta:RunDetails>
+                <pbmeta:WellSample Name="Well Sample 1" UniqueId="aaa2df90-d44f-4a48-9f35-3b99473c68f5">
+                    <pbmeta:PlateId>2014-12-24_141_NGAT_Igor_bisPNA Enrichment_Mag Bead Elution Buffers</pbmeta:PlateId>
+                    <pbmeta:WellName>B01</pbmeta:WellName>
+                    <pbmeta:Concentration>10</pbmeta:Concentration>
+                    <pbmeta:SampleReuseEnabled>true</pbmeta:SampleReuseEnabled>
+                    <pbmeta:StageHotstartEnabled>true</pbmeta:StageHotstartEnabled>
+                    <pbmeta:SizeSelectionEnabled>true</pbmeta:SizeSelectionEnabled>
+                    <pbmeta:UseCount>0</pbmeta:UseCount>
+                    <pbmeta:Comments>Lorem ipsum</pbmeta:Comments>
+                    <pbsample:BioSamplePointers>
+                        <pbsample:BioSamplePointer>abc2df90-d44f-4a48-9f35-3b99473c68f5</pbsample:BioSamplePointer>
+                    </pbsample:BioSamplePointers>
+                </pbmeta:WellSample>
+                <pbmeta:Automation>
+                    <pbbase:AutomationParameters>
+                            <pbbase:AutomationParameter/>
+                    </pbbase:AutomationParameters>
+                </pbmeta:Automation>
+                <pbmeta:CollectionNumber>0</pbmeta:CollectionNumber>
+                <pbmeta:CellIndex>0</pbmeta:CellIndex>
+                <pbmeta:CellPac Barcode="100480560100000001823075906281381"/>
+                <pbmeta:Primary>
+                    <pbmeta:AutomationName>BasecallerV1</pbmeta:AutomationName>
+                    <pbmeta:ConfigFileName>1-3-0_Standard_C2.xml</pbmeta:ConfigFileName>
+                    <pbmeta:SequencingCondition/>
+                    <pbmeta:ResultsFolder>Analysis_Results</pbmeta:ResultsFolder>
+                    <pbmeta:CollectionPathUri>rsy://mp-rsync/vol56//RS_DATA_STAGING//2014-12-24_141_NGAT_Igor_bisPNA%20Enrichment_Mag%20Bead%20Elution%20Buffers_1094/B01_1</pbmeta:CollectionPathUri>
+                    <pbmeta:CopyFiles>
+                        <pbmeta:CollectionFileCopy>Bam</pbmeta:CollectionFileCopy>
+                    </pbmeta:CopyFiles>
+                </pbmeta:Primary>
+            </pbmeta:CollectionMetadata>
+        </pbmeta:Collections>
+        <pbsample:BioSamples>
+            <pbsample:BioSample UniqueId="abc2df90-d44f-4a48-9f35-3b99473c68f5" Name="consectetur purus" Description="Risus sit amet lectus vehicula vulputate quisque porta accumsan venenatis." CreatedAt="2015-01-20T13:27:23.9271737-08:00"/>
+        </pbsample:BioSamples>
+    </pbds:DataSetMetadata>
+</pbds:SubreadSet>
+<!-- TODO what do internal references look like?-->
diff --git a/tests/data/dataset/subread_dataset3.xml b/tests/data/dataset/subread_dataset3.xml

new file mode 100644 (file)

index 0000000..91923a8
--- /dev/null
+++ b/tests/data/dataset/subread_dataset3.xml
@@ -0,0 +1,76 @@
+<?xml version="1.0" encoding="utf-8"?>
+<pbds:SubreadSet CreatedAt="2015-01-27T09:00:01" MetaType="PacBio.DataSet.SubreadSet" Name="DataSet_SubreadSet" Tags="barcode moreTags mapping mytags" UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe519c" Version="2.3.0"  xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd" xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd" xmlns:pbmeta="http://pacificbiosciences.com/PacBioCollectionMetadata.xsd" xmlns:pbsample="http://pacificbiosciences.com/PacBioSampleInfo.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDatasets.xsd" >
+    <pbbase:ExternalResources>
+        <pbbase:ExternalResource Name="First Subreads BAM" Description="Points to an example Subreads BAM file." MetaType="SubreadFile.SubreadBamFile" ResourceId="file:///mnt/path/to/subreads2.bam" Tags="Example">
+            <pbbase:FileIndices>
+                <pbbase:FileIndex ResourceId="file:///mnt/path/to/subreads2.pbi"/>
+            </pbbase:FileIndices>
+        </pbbase:ExternalResource>
+        <pbbase:ExternalResource Name="Second Subreads BAM" Description="Points to another example Subreads BAM file." MetaType="SubreadFile.SubreadBamFile" ResourceId="file:///mnt/path/to/subreads3.bam" Tags="Example">
+            <pbbase:FileIndices>
+                <pbbase:FileIndex ResourceId="file:///mnt/path/to/subreads3.pbi"/>
+            </pbbase:FileIndices>
+        </pbbase:ExternalResource>
+    </pbbase:ExternalResources>
+    <pbds:Filters>
+        <pbds:Filter>
+            <pbbase:Properties>
+                <pbbase:Property Name="rq" Value="0.85" Operator=">"/>
+            </pbbase:Properties>
+        </pbds:Filter>
+        <pbds:Filter>
+            <pbbase:Properties>
+                <pbbase:Property Name="QNAME" Value="100/0/0_100" Operator="=="/>
+            </pbbase:Properties>
+        </pbds:Filter>
+    </pbds:Filters>
+    <pbds:DataSetMetadata>
+        <pbds:TotalLength>500000</pbds:TotalLength>
+        <pbds:NumRecords>500</pbds:NumRecords>
+        <pbmeta:Collections>
+            <pbmeta:CollectionMetadata Context="m152720_092723_00114_c100480560100000001823075906281381_s1_p0" InstrumentName="RS" InstrumentId="43210">
+                <pbmeta:InstCtrlVer>2.3.0.0.140640</pbmeta:InstCtrlVer>
+                <pbmeta:SigProcVer>NRT@172.31.128.10:8082, SwVer=2300.140640, HwVer=1.0</pbmeta:SigProcVer>
+                <pbmeta:RunDetails>
+                    <pbmeta:RunId>e903682f-e502-465c-a2b6-9dd77c9f43fc</pbmeta:RunId>
+                    <pbmeta:Name>beta4_130726_biotin_DEV_vs_MFG_PB11K_9458p</pbmeta:Name>
+                </pbmeta:RunDetails>
+                <pbmeta:WellSample Name="Well Sample 1" UniqueId="aaa2df90-d44f-4a48-9f35-3b99473c68f5">
+                    <pbmeta:PlateId>2014-12-24_141_NGAT_Igor_bisPNA Enrichment_Mag Bead Elution Buffers</pbmeta:PlateId>
+                    <pbmeta:WellName>B01</pbmeta:WellName>
+                    <pbmeta:Concentration>10</pbmeta:Concentration>
+                    <pbmeta:SampleReuseEnabled>true</pbmeta:SampleReuseEnabled>
+                    <pbmeta:StageHotstartEnabled>true</pbmeta:StageHotstartEnabled>
+                    <pbmeta:SizeSelectionEnabled>true</pbmeta:SizeSelectionEnabled>
+                    <pbmeta:UseCount>0</pbmeta:UseCount>
+                    <pbmeta:Comments>Lorem ipsum</pbmeta:Comments>
+                    <pbsample:BioSamplePointers>
+                        <pbsample:BioSamplePointer>abc2df90-d44f-4a48-9f35-3b99473c68f5</pbsample:BioSamplePointer>
+                    </pbsample:BioSamplePointers>
+                </pbmeta:WellSample>
+                <pbmeta:Automation>
+                    <pbbase:AutomationParameters>
+                                   <pbbase:AutomationParameter/>
+                    </pbbase:AutomationParameters>
+                </pbmeta:Automation>
+                <pbmeta:CollectionNumber>0</pbmeta:CollectionNumber>
+                <pbmeta:CellIndex>0</pbmeta:CellIndex>
+                <pbmeta:CellPac Barcode="100480560100000001823075906281381"/>
+                <pbmeta:Primary>
+                    <pbmeta:AutomationName>BasecallerV1</pbmeta:AutomationName>
+                    <pbmeta:ConfigFileName>1-3-0_Standard_C2.xml</pbmeta:ConfigFileName>
+                    <pbmeta:SequencingCondition/>
+                    <pbmeta:ResultsFolder>Analysis_Results</pbmeta:ResultsFolder>
+                    <pbmeta:CollectionPathUri>rsy://mp-rsync/vol56//RS_DATA_STAGING//2014-12-24_141_NGAT_Igor_bisPNA%20Enrichment_Mag%20Bead%20Elution%20Buffers_1094/B01_1</pbmeta:CollectionPathUri>
+                    <pbmeta:CopyFiles>
+                        <pbmeta:CollectionFileCopy>Bam</pbmeta:CollectionFileCopy>
+                    </pbmeta:CopyFiles>
+                </pbmeta:Primary>
+            </pbmeta:CollectionMetadata>
+        </pbmeta:Collections>
+        <pbsample:BioSamples>
+            <pbsample:BioSample UniqueId="abc2df90-d44f-4a48-9f35-3b99473c68f5" Name="consectetur purus" Description="Risus sit amet lectus vehicula vulputate quisque porta accumsan venenatis." CreatedAt="2015-01-20T13:27:23.9271737-08:00"/>
+        </pbsample:BioSamples>
+    </pbds:DataSetMetadata>
+</pbds:SubreadSet>
+<!-- TODO what do internal references look like?-->
diff --git a/tests/data/dataset/transformed_rs_subread_dataset.xml b/tests/data/dataset/transformed_rs_subread_dataset.xml

new file mode 100644 (file)

index 0000000..0750655
--- /dev/null
+++ b/tests/data/dataset/transformed_rs_subread_dataset.xml
@@ -0,0 +1,75 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<pbds:HdfSubreadSet 
+    xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd"
+    xmlns:bax="http://whatever"
+    xmlns:fn="http://www.w3.org/2005/xpath-functions"
+    xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd"
+    xmlns:pbmeta="http://pacificbiosciences.com/PacBioCollectionMetadata.xsd"
+    xmlns:pbsample="http://pacificbiosciences.com/PacBioSampleInfo.xsd" 
+    xmlns:uuid="java:java.util.UUID" 
+    xmlns:xs="http://www.w3.org/2001/XMLSchema"
+    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
+    xsi:schemaLocation="http://pacificbiosciences.com/PacBioDatasets.xsd"
+    Name="Subreads from run r001173_42129_130607"
+    MetaType="PacBio.DataSet.SubreadSet"
+    Tags="pacbio.secondary.instrument=RS"
+    Version="0.5"
+    UniqueId="abbc9183-b01e-4671-8c12-19efee534647">
+   <pbbase:ExternalResources>
+      <pbbase:ExternalResource MetaType="PacBio.SubreadFile.BaxFile"
+          ResourceId="file:///pbi/dept/secondary/siv/testdata/LIMS/2590727/0001/Analysis_Results/m130608_033634_42129_c100515232550000001823076608221351_s1_p0.0.bax.h5"/>
+      <pbbase:ExternalResource MetaType="PacBio.SubreadFile.BaxFile"
+          ResourceId="file:///pbi/dept/secondary/siv/testdata/LIMS/2590727/0001/Analysis_Results/m130608_033634_42129_c100515232550000001823076608221351_s1_p0.1.bax.h5"/>
+      <pbbase:ExternalResource MetaType="PacBio.SubreadFile.BaxFile"
+          ResourceId="file:///pbi/dept/secondary/siv/testdata/LIMS/2590727/0001/Analysis_Results/m130608_033634_42129_c100515232550000001823076608221351_s1_p0.2.bax.h5"/>
+   </pbbase:ExternalResources>
+   <pbds:DataSetMetadata>
+      <pbds:TotalLength>50000000</pbds:TotalLength>
+      <pbds:NumRecords>150000</pbds:NumRecords>
+      <pbmeta:Collections>
+         <pbmeta:CollectionMetadata Context="m130608_033634_42129_c100515232550000001823076608221351_s1_p0"
+                             InstrumentName="42129"
+                             InstrumentId="1">
+            <pbmeta:InstCtrlVer>2.0.1.0.124174</pbmeta:InstCtrlVer>
+            <pbmeta:SigProcVer>NRT@172.31.128.10:8082, SwVer=2010.124174, HwVer=1.0</pbmeta:SigProcVer>
+            <pbmeta:RunDetails>
+               <pbmeta:RunId>r001173_42129_130607</pbmeta:RunId>
+               <pbmeta:Name>2013-06-07_42129_10kb_Ecoli_201-validation_2</pbmeta:Name>
+            </pbmeta:RunDetails>
+            <pbmeta:WellSample Name="P4-C2_Ecoli_10kb_MBS_stageHS">
+               <pbmeta:PlateId>2013-06-07_42129_10kb_Ecoli_201-validation_2</pbmeta:PlateId>
+               <pbmeta:WellName>P4-C2_Ecoli_10kb_MBS_stageHS</pbmeta:WellName>
+               <pbmeta:Concentration>0</pbmeta:Concentration>
+               <pbmeta:SampleReuseEnabled>false</pbmeta:SampleReuseEnabled>
+               <pbmeta:StageHotstartEnabled>true</pbmeta:StageHotstartEnabled>
+               <pbmeta:SizeSelectionEnabled>
+                                    false
+                                                               </pbmeta:SizeSelectionEnabled>
+               <pbmeta:UseCount>1</pbmeta:UseCount>
+               <pbmeta:Comments>P4-C2_Ecoli_10kb_MBS_stageHS</pbmeta:Comments>
+               <pbsample:BioSamplePointers>
+                  <pbsample:BioSamplePointer>abafd4ed-5cf7-4b83-a869-1a5d239d30e2</pbsample:BioSamplePointer>
+               </pbsample:BioSamplePointers>
+            </pbmeta:WellSample>
+            <pbmeta:AutomationName>MagBead Standard Seq v2</pbmeta:AutomationName>
+            <pbmeta:CollectionNumber>2</pbmeta:CollectionNumber>
+            <pbmeta:CellIndex>1</pbmeta:CellIndex>
+            <pbmeta:CellPac Barcode="10051523255000000182307660822135"/>
+            <pbmeta:Primary>
+               <pbmeta:AutomationName>BasecallerV1</pbmeta:AutomationName>
+               <pbmeta:ConfigFileName>2-0-0_P4-C2.xml</pbmeta:ConfigFileName>
+               <pbmeta:SequencingCondition/>
+               <pbmeta:ResultsFolder>Analysis_Results</pbmeta:ResultsFolder>
+               <pbmeta:CollectionPathUri>rsy://mp-f030-io/vol54//RS_DATA_STAGING/42129/2013-06-07_42129_10kb_Ecoli_201-validation_2_1173/A01_2/</pbmeta:CollectionPathUri>
+               <pbmeta:CopyFiles>
+                  <pbmeta:CollectionFileCopy>Fasta</pbmeta:CollectionFileCopy>
+               </pbmeta:CopyFiles>
+            </pbmeta:Primary>
+         </pbmeta:CollectionMetadata>
+      </pbmeta:Collections>
+      <pbsample:BioSamples>
+         <pbsample:BioSample Name="P4-C2_Ecoli_10kb_MBS_stageHS" Description="P4-C2_Ecoli_10kb_MBS_stageHS"
+                    UniqueId="abafd4ed-5cf7-4b83-a869-1a5d239d30e2"/>
+      </pbsample:BioSamples>
+   </pbds:DataSetMetadata>
+</pbds:HdfSubreadSet>
diff --git a/tests/data/empty.bam b/tests/data/empty.bam

new file mode 100644 (file)

index 0000000..1b22456

Binary files /dev/null and b/tests/data/empty.bam differ
diff --git a/tests/data/empty.bam.pbi b/tests/data/empty.bam.pbi

new file mode 100644 (file)

index 0000000..e398d79

Binary files /dev/null and b/tests/data/empty.bam.pbi differ
diff --git a/tests/data/fastx/chunking.fa b/tests/data/fastx/chunking.fa

new file mode 100644 (file)

index 0000000..d0094ca
--- /dev/null
+++ b/tests/data/fastx/chunking.fa
@@ -0,0 +1,70 @@
+>seq/0
+AAAA
+>seq/1
+CCCC
+>seq/2
+TTTT
+>seq/3
+CCCC
+>seq/4
+GGGG
+>seq/5
+AAAA
+>seq/6
+CCCC
+>seq/7
+TTTT
+>seq/8
+CCCC
+>seq/9
+GGGG
+>seq/10
+AAAA
+>seq/11
+CCCC
+>seq/12
+TTTT
+>seq/13
+CCCC
+>seq/14
+GGGG
+>seq/15
+AAAA
+>seq/16
+CCCC
+>seq/17
+TTTT
+>seq/18
+CCCC
+>seq/19
+GGGG
+>seq/20
+CCCC
+>seq/25
+GGGG
+>seq/30
+AAAA
+>seq/35
+CCCC
+>seq/40
+TTTT
+>seq/45
+CCCC
+>seq/50
+GGGG
+>seq/100/0_100
+CCCC
+>seq/100/100_200
+AAAA
+>seq/100/200_300
+GGGG
+>seq/100/300_400
+CCCC
+>seq/110/ccs
+TTTT
+>seq/120/ccs
+AAAA
+>seq/130/transcript
+GGGG
+>seq/140/transcript
+AAAA
diff --git a/tests/data/fastx/chunking.fa.fai b/tests/data/fastx/chunking.fa.fai

new file mode 100644 (file)

index 0000000..9898868
--- /dev/null
+++ b/tests/data/fastx/chunking.fa.fai
@@ -0,0 +1,35 @@
+seq/0  4       7       4       5
+seq/1  4       19      4       5
+seq/2  4       31      4       5
+seq/3  4       43      4       5
+seq/4  4       55      4       5
+seq/5  4       67      4       5
+seq/6  4       79      4       5
+seq/7  4       91      4       5
+seq/8  4       103     4       5
+seq/9  4       115     4       5
+seq/10 4       128     4       5
+seq/11 4       141     4       5
+seq/12 4       154     4       5
+seq/13 4       167     4       5
+seq/14 4       180     4       5
+seq/15 4       193     4       5
+seq/16 4       206     4       5
+seq/17 4       219     4       5
+seq/18 4       232     4       5
+seq/19 4       245     4       5
+seq/20 4       258     4       5
+seq/25 4       271     4       5
+seq/30 4       284     4       5
+seq/35 4       297     4       5
+seq/40 4       310     4       5
+seq/45 4       323     4       5
+seq/50 4       336     4       5
+seq/100/0_100  4       356     4       5
+seq/100/100_200        4       378     4       5
+seq/100/200_300        4       400     4       5
+seq/100/300_400        4       422     4       5
+seq/110/ccs    4       440     4       5
+seq/120/ccs    4       458     4       5
+seq/130/transcript     4       483     4       5
+seq/140/transcript     4       508     4       5
diff --git a/tests/data/fastx/chunking.fq b/tests/data/fastx/chunking.fq

new file mode 100644 (file)

index 0000000..619e336
--- /dev/null
+++ b/tests/data/fastx/chunking.fq
@@ -0,0 +1,140 @@
+@seq/0
+AAAA
++
+~~~~
+@seq/1
+CCCC
++
+oooo
+@seq/2
+TTTT
++
+BBBB
+@seq/3
+CCCC
++
+$$$$
+@seq/4
+GGGG
++
+####
+@seq/5
+AAAA
++
+~~~~
+@seq/6
+CCCC
++
+oooo
+@seq/7
+TTTT
++
+BBBB
+@seq/8
+CCCC
++
+$$$$
+@seq/9
+GGGG
++
+####
+@seq/10
+AAAA
++
+~~~~
+@seq/11
+CCCC
++
+oooo
+@seq/12
+TTTT
++
+BBBB
+@seq/13
+CCCC
++
+$$$$
+@seq/14
+GGGG
++
+####
+@seq/15
+AAAA
++
+~~~~
+@seq/16
+CCCC
++
+oooo
+@seq/17
+TTTT
++
+BBBB
+@seq/18
+CCCC
++
+$$$$
+@seq/19
+GGGG
++
+####
+@seq/20
+CCCC
++
+$$$$
+@seq/25
+GGGG
++
+####
+@seq/30
+AAAA
++
+~~~~
+@seq/35
+CCCC
++
+oooo
+@seq/40
+TTTT
++
+BBBB
+@seq/45
+CCCC
++
+$$$$
+@seq/50
+GGGG
++
+####
+@seq/100/0_100
+CCCC
++
+$$$$
+@seq/100/100_200
+AAAA
++
+~~~~
+@seq/100/200_300
+GGGG
++
+####
+@seq/100/300_400
+CCCC
++
+$$$$
+@seq/110/ccs
+TTTT
++
+BBBB
+@seq/120/ccs
+AAAA
++
+~~~~
+@seq/130/transcript
+GGGG
++
+####
+@seq/140/transcript
+AAAA
++
+~~~~
diff --git a/tests/data/fastx/chunking.fq.fai b/tests/data/fastx/chunking.fq.fai

new file mode 100644 (file)

index 0000000..13ef960
--- /dev/null
+++ b/tests/data/fastx/chunking.fq.fai
@@ -0,0 +1,35 @@
+seq/0  4       7       4       5       14
+seq/1  4       26      4       5       33
+seq/2  4       45      4       5       52
+seq/3  4       64      4       5       71
+seq/4  4       83      4       5       90
+seq/5  4       102     4       5       109
+seq/6  4       121     4       5       128
+seq/7  4       140     4       5       147
+seq/8  4       159     4       5       166
+seq/9  4       178     4       5       185
+seq/10 4       198     4       5       205
+seq/11 4       218     4       5       225
+seq/12 4       238     4       5       245
+seq/13 4       258     4       5       265
+seq/14 4       278     4       5       285
+seq/15 4       298     4       5       305
+seq/16 4       318     4       5       325
+seq/17 4       338     4       5       345
+seq/18 4       358     4       5       365
+seq/19 4       378     4       5       385
+seq/20 4       398     4       5       405
+seq/25 4       418     4       5       425
+seq/30 4       438     4       5       445
+seq/35 4       458     4       5       465
+seq/40 4       478     4       5       485
+seq/45 4       498     4       5       505
+seq/50 4       518     4       5       525
+seq/100/0_100  4       545     4       5       552
+seq/100/100_200        4       574     4       5       581
+seq/100/200_300        4       603     4       5       610
+seq/100/300_400        4       632     4       5       639
+seq/110/ccs    4       657     4       5       664
+seq/120/ccs    4       682     4       5       689
+seq/130/transcript     4       714     4       5       721
+seq/140/transcript     4       746     4       5       753
diff --git a/tests/data/fastx/simple-bgzf.fa.gz b/tests/data/fastx/simple-bgzf.fa.gz

new file mode 100644 (file)

index 0000000..ecbefd6

Binary files /dev/null and b/tests/data/fastx/simple-bgzf.fa.gz differ
diff --git a/tests/data/fastx/simple-bgzf.fa.gz.fai b/tests/data/fastx/simple-bgzf.fa.gz.fai

new file mode 100644 (file)

index 0000000..16cd9c3
--- /dev/null
+++ b/tests/data/fastx/simple-bgzf.fa.gz.fai
@@ -0,0 +1,8 @@
+seq1   63      6       63      64
+seq2   63      76      63      64
+seq3   63      146     63      64
+seq4   63      216     63      64
+seq5   63      286     63      64
+seq6   63      356     63      64
+seq7   63      426     63      64
+seq8   63      496     63      64
diff --git a/tests/data/fastx/simple-bgzf.fa.gz.gzi b/tests/data/fastx/simple-bgzf.fa.gz.gzi

new file mode 100644 (file)

index 0000000..1b1cb4d

Binary files /dev/null and b/tests/data/fastx/simple-bgzf.fa.gz.gzi differ
diff --git a/tests/data/fastx/simple-bgzf.fq.gz b/tests/data/fastx/simple-bgzf.fq.gz

new file mode 100644 (file)

index 0000000..bd80362

Binary files /dev/null and b/tests/data/fastx/simple-bgzf.fq.gz differ
diff --git a/tests/data/fastx/simple-bgzf.fq.gz.fai b/tests/data/fastx/simple-bgzf.fq.gz.fai

new file mode 100644 (file)

index 0000000..2861bf5
--- /dev/null
+++ b/tests/data/fastx/simple-bgzf.fq.gz.fai
@@ -0,0 +1,8 @@
+seq1   63      6       63      64      72
+seq2   63      142     63      64      208
+seq3   63      278     63      64      344
+seq4   63      414     63      64      480
+seq5   63      550     63      64      616
+seq6   63      686     63      64      752
+seq7   63      822     63      64      888
+seq8   63      958     63      64      1024
diff --git a/tests/data/fastx/simple-bgzf.fq.gz.gzi b/tests/data/fastx/simple-bgzf.fq.gz.gzi

new file mode 100644 (file)

index 0000000..1b1cb4d

Binary files /dev/null and b/tests/data/fastx/simple-bgzf.fq.gz.gzi differ
diff --git a/tests/data/fastx/simple-gzip.fa.gz b/tests/data/fastx/simple-gzip.fa.gz

new file mode 100644 (file)

index 0000000..273af21

Binary files /dev/null and b/tests/data/fastx/simple-gzip.fa.gz differ
diff --git a/tests/data/fastx/simple-gzip.fq.gz b/tests/data/fastx/simple-gzip.fq.gz

new file mode 100644 (file)

index 0000000..45ab8e6

Binary files /dev/null and b/tests/data/fastx/simple-gzip.fq.gz differ
diff --git a/tests/data/fastx/simple.fa b/tests/data/fastx/simple.fa

new file mode 100644 (file)

index 0000000..cb3b9ca
--- /dev/null
+++ b/tests/data/fastx/simple.fa
@@ -0,0 +1,16 @@
+>seq1
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACG
+>seq2
+GCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCA
+>seq3
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACG
+>seq4
+GCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCA
+>seq5
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACG
+>seq6
+GCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCA
+>seq7
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACG
+>seq8
+GCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCA
diff --git a/tests/data/fastx/simple.fa.fai b/tests/data/fastx/simple.fa.fai

new file mode 100644 (file)

index 0000000..16cd9c3
--- /dev/null
+++ b/tests/data/fastx/simple.fa.fai
@@ -0,0 +1,8 @@
+seq1   63      6       63      64
+seq2   63      76      63      64
+seq3   63      146     63      64
+seq4   63      216     63      64
+seq5   63      286     63      64
+seq6   63      356     63      64
+seq7   63      426     63      64
+seq8   63      496     63      64
diff --git a/tests/data/fastx/simple.fq b/tests/data/fastx/simple.fq

new file mode 100644 (file)

index 0000000..aadf10f
--- /dev/null
+++ b/tests/data/fastx/simple.fq
@@ -0,0 +1,32 @@
+@seq1
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACG
++
+ZABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~
+@seq2
+GCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCA
++
+~}|{zyxwvutsrqponmlkjihgfedcba`_^]\[ZYXWVUTSRQPONMLKJIHGFEDCBA@
+@seq3
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACG
++
+!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_
+@seq4
+GCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCA
++
+_^]\[ZYXWVUTSRQPONMLKJIHGFEDCBA@?>=<;:9876543210/.-,+*)('&%$#"!
+@seq5
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACG
++
+;;>@BCEFGHJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~
+@seq6
+GCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCA
++
+~}|{zyxwvutsrqponmlkjihgfedcba`_^]\[ZYXWVUTSRQPONMLKJHGFECB@>;;
+@seq7
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACG
++
+ZABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~
+@seq8
+GCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCA
++
+~}|{zyxwvutsrqponmlkjihgfedcba`_^]\[ZYXWVUTSRQPONMLKJIHGFEDCBA@
diff --git a/tests/data/fastx/simple.fq.fai b/tests/data/fastx/simple.fq.fai

new file mode 100644 (file)

index 0000000..2861bf5
--- /dev/null
+++ b/tests/data/fastx/simple.fq.fai
@@ -0,0 +1,8 @@
+seq1   63      6       63      64      72
+seq2   63      142     63      64      208
+seq3   63      278     63      64      344
+seq4   63      414     63      64      480
+seq5   63      550     63      64      616
+seq6   63      686     63      64      752
+seq7   63      822     63      64      888
+seq8   63      958     63      64      1024
diff --git a/tests/data/fastx/windows_formatted.fasta b/tests/data/fastx/windows_formatted.fasta

new file mode 100644 (file)

index 0000000..e6dd4f6
--- /dev/null
+++ b/tests/data/fastx/windows_formatted.fasta
@@ -0,0 +1,14 @@
+>Clontech_5p\r
+AAGCAGTGGTATCAACGCAGAGTACATGGG\r
+>tissue1_3p\r
+atgacgcatcgtctgaGTACTCTGCGTTGATACCACTGCTT\r
+>tissue2_3p\r
+gcagagtcatgtatagGTACTCTGCGTTGATACCACTGCTT\r
+>tissue3_3p\r
+gagtgctactctagtaGTACTCTGCGTTGATACCACTGCTT\r
+>tissue4_3p\r
+catgtactgatacacaGTACTCTGCGTTGATACCACTGCTT\r
+>tissue5_3p\r
+gcatatagtagagatcGTACTCTGCGTTGATACCACTGCTT\r
+>tissue6_3p\r
+cagcagtatagactgtGTACTCTGCGTTGATACCACTGCTT
+\ No newline at end of file
diff --git a/tests/data/fastx/windows_formatted.fastq b/tests/data/fastx/windows_formatted.fastq

new file mode 100644 (file)

index 0000000..ef8f614
--- /dev/null
+++ b/tests/data/fastx/windows_formatted.fastq
@@ -0,0 +1,4 @@
+@C5\r
+AAGCA\r
++\r
+~~~~~\r
diff --git a/tests/data/group/group.fofn.in b/tests/data/group/group.fofn.in

new file mode 100644 (file)

index 0000000..c2621c5
--- /dev/null
+++ b/tests/data/group/group.fofn.in
@@ -0,0 +1,3 @@
+@PacBioBAM_TestsDir@/data/group/test1.bam
+@PacBioBAM_TestsDir@/data/group/test2.bam
+@PacBioBAM_TestsDir@/data/group/test3.bam
diff --git a/tests/data/group/test1.bam b/tests/data/group/test1.bam

new file mode 100644 (file)

index 0000000..2ba687b

Binary files /dev/null and b/tests/data/group/test1.bam differ
diff --git a/tests/data/group/test2.bam b/tests/data/group/test2.bam

new file mode 100644 (file)

index 0000000..9e22b30

Binary files /dev/null and b/tests/data/group/test2.bam differ
diff --git a/tests/data/group/test2.bam.pbi b/tests/data/group/test2.bam.pbi

new file mode 100644 (file)

index 0000000..761600b

Binary files /dev/null and b/tests/data/group/test2.bam.pbi differ
diff --git a/tests/data/group/test3.bam b/tests/data/group/test3.bam

new file mode 100644 (file)

index 0000000..093e93a

Binary files /dev/null and b/tests/data/group/test3.bam differ
diff --git a/tests/data/lambdaNEB.fa b/tests/data/lambdaNEB.fa

new file mode 100644 (file)

index 0000000..33011e5
--- /dev/null
+++ b/tests/data/lambdaNEB.fa
@@ -0,0 +1,608 @@
+>lambda_NEB3011
+GGGCGGCGACCTCGCGGGTTTTCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTA
+ATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGACAGGTGCTGAAAGCGAGCTTTTTGGCCTCTGTCGTTTCC
+TTTCTCTGTTTTTGTCCGTGGAATGAACAATGGAAGTCAACAAAAAGCAGCTGGCTGACATTTTCGGTGCGAGTATCCGT
+ACCATTCAGAACTGGCAGGAACAGGGAATGCCCGTTCTGCGAGGCGGTGGCAAGGGTAATGAGGTGCTTTATGACTCTGC
+CGCCGTCATAAAATGGTATGCCGAAAGGGATGCTGAAATTGAGAACGAAAAGCTGCGCCGGGAGGTTGAAGAACTGCGGC
+AGGCCAGCGAGGCAGATCTCCAGCCAGGAACTATTGAGTACGAACGCCATCGACTTACGCGTGCGCAGGCCGACGCACAG
+GAACTGAAGAATGCCAGAGACTCCGCTGAAGTGGTGGAAACCGCATTCTGTACTTTCGTGCTGTCGCGGATCGCAGGTGA
+AATTGCCAGTATTCTCGACGGGCTCCCCCTGTCGGTGCAGCGGCGTTTTCCGGAACTGGAAAACCGACATGTTGATTTCC
+TGAAACGGGATATCATCAAAGCCATGAACAAAGCAGCCGCGCTGGATGAACTGATACCGGGGTTGCTGAGTGAATATATC
+GAACAGTCAGGTTAACAGGCTGCGGCATTTTGTCCGCGCCGGGCTTCGCTCACTGTTCAGGCCGGAGCCACAGACCGCCG
+TTGAATGGGCGGATGCTAATTACTATCTCCCGAAAGAATCCGCATACCAGGAAGGGCGCTGGGAAACACTGCCCTTTCAG
+CGGGCCATCATGAATGCGATGGGCAGCGACTACATCCGTGAGGTGAATGTGGTGAAGTCTGCCCGTGTCGGTTATTCCAA
+AATGCTGCTGGGTGTTTATGCCTACTTTATAGAGCATAAGCAGCGCAACACCCTTATCTGGTTGCCGACGGATGGTGATG
+CCGAGAACTTTATGAAAACCCACGTTGAGCCGACTATTCGTGATATTCCGTCGCTGCTGGCGCTGGCCCCGTGGTATGGC
+AAAAAGCACCGGGATAACACGCTCACCATGAAGCGTTTCACTAATGGGCGTGGCTTCTGGTGCCTGGGCGGTAAAGCGGC
+AAAAAACTACCGTGAAAAGTCGGTGGATGTGGCGGGTTATGATGAACTTGCTGCTTTTGATGATGATATTGAACAGGAAG
+GCTCTCCGACGTTCCTGGGTGACAAGCGTATTGAAGGCTCGGTCTGGCCAAAGTCCATCCGTGGCTCCACGCCAAAAGTG
+AGAGGCACCTGTCAGATTGAGCGTGCAGCCAGTGAATCCCCGCATTTTATGCGTTTTCATGTTGCCTGCCCGCATTGCGG
+GGAGGAGCAGTATCTTAAATTTGGCGACAAAGAGACGCCGTTTGGCCTCAAATGGACGCCGGATGACCCCTCCAGCGTGT
+TTTATCTCTGCGAGCATAATGCCTGCGTCATCCGCCAGCAGGAGCTGGACTTTACTGATGCCCGTTATATCTGCGAAAAG
+ACCGGGATCTGGACCCGTGATGGCATTCTCTGGTTTTCGTCATCCGGTGAAGAGATTGAGCCACCTGACAGTGTGACCTT
+TCACATCTGGACAGCGTACAGCCCGTTCACCACCTGGGTGCAGATTGTCAAAGACTGGATGAAAACGAAAGGGGATACGG
+GAAAACGTAAAACCTTCGTAAACACCACGCTCGGTGAGACGTGGGAGGCGAAAATTGGCGAACGTCCGGATGCTGAAGTG
+ATGGCAGAGCGGAAAGAGCATTATTCAGCGCCCGTTCCTGACCGTGTGGCTTACCTGACCGCCGGTATCGACTCCCAGCT
+GGACCGCTACGAAATGCGCGTATGGGGATGGGGGCCGGGTGAGGAAAGCTGGCTGATTGACCGGCAGATTATTATGGGCC
+GCCACGACGATGAACAGACGCTGCTGCGTGTGGATGAGGCCATCAATAAAACCTATACCCGCCGGAATGGTGCAGAAATG
+TCGATATCCCGTATCTGCTGGGATACTGGCGGGATTGACCCGACCATTGTGTATGAACGCTCGAAAAAACATGGGCTGTT
+CCGGGTGATCCCCATTAAAGGGGCATCCGTCTACGGAAAGCCGGTGGCCAGCATGCCACGTAAGCGAAACAAAAACGGGG
+TTTACCTTACCGAAATCGGTACGGATACCGCGAAAGAGCAGATTTATAACCGCTTCACACTGACGCCGGAAGGGGATGAA
+CCGCTTCCCGGTGCCGTTCACTTCCCGAATAACCCGGATATTTTTGATCTGACCGAAGCGCAGCAGCTGACTGCTGAAGA
+GCAGGTCGAAAAATGGGTGGATGGCAGGAAAAAAATACTGTGGGACAGCAAAAAGCGACGCAATGAGGCACTCGACTGCT
+TCGTTTATGCGCTGGCGGCGCTGCGCATCAGTATTTCCCGCTGGCAGCTGGATCTCAGTGCGCTGCTGGCGAGCCTGCAG
+GAAGAGGATGGTGCAGCAACCAACAAGAAAACACTGGCAGATTACGCCCGTGCCTTATCCGGAGAGGATGAATGACGCGA
+CAGGAAGAACTTGCCGCTGCCCGTGCGGCACTGCATGACCTGATGACAGGTAAACGGGTGGCAACAGTACAGAAAGACGG
+ACGAAGGGTGGAGTTTACGGCCACTTCCGTGTCTGACCTGAAAAAATATATTGCAGAGCTGGAAGTGCAGACCGGCATGA
+CACAGCGACGCAGGGGACCTGCAGGATTTTATGTATGAAAACGCCCACCATTCCCACCCTTCTGGGGCCGGACGGCATGA
+CATCGCTGCGCGAATATGCCGGTTATCACGGCGGTGGCAGCGGATTTGGAGGGCAGTTGCGGTCGTGGAACCCACCGAGT
+GAAAGTGTGGATGCAGCCCTGTTGCCCAACTTTACCCGTGGCAATGCCCGCGCAGACGATCTGGTACGCAATAACGGCTA
+TGCCGCCAACGCCATCCAGCTGCATCAGGATCATATCGTCGGGTCTTTTTTCCGGCTCAGTCATCGCCCAAGCTGGCGCT
+ATCTGGGCATCGGGGAGGAAGAAGCCCGTGCCTTTTCCCGCGAGGTTGAAGCGGCATGGAAAGAGTTTGCCGAGGATGAC
+TGCTGCTGCATTGACGTTGAGCGAAAACGCACGTTTACCATGATGATTCGGGAAGGTGTGGCCATGCACGCCTTTAACGG
+TGAACTGTTCGTTCAGGCCACCTGGGATACCAGTTCGTCGCGGCTTTTCCGGACACAGTTCCGGATGGTCAGCCCGAAGC
+GCATCAGCAACCCGAACAATACCGGCGACAGCCGGAACTGCCGTGCCGGTGTGCAGATTAATGACAGCGGTGCGGCGCTG
+GGATATTACGTCAGCGAGGACGGGTATCCTGGCTGGATGCCGCAGAAATGGACATGGATACCCCGTGAGTTACCCGGCGG
+GCGCGCCTCGTTCATTCACGTTTTTGAACCCGTGGAGGACGGGCAGACTCGCGGTGCAAATGTGTTTTACAGCGTGATGG
+AGCAGATGAAGATGCTCGACACGCTGCAGAACACGCAGCTGCAGAGCGCCATTGTGAAGGCGATGTATGCCGCCACCATT
+GAGAGTGAGCTGGATACGCAGTCAGCGATGGATTTTATTCTGGGCGCGAACAGTCAGGAGCAGCGGGAAAGGCTGACCGG
+CTGGATTGGTGAAATTGCCGCGTATTACGCCGCAGCGCCGGTCCGGCTGGGAGGCGCAAAAGTACCGCACCTGATGCCGG
+GTGACTCACTGAACCTGCAGACGGCTCAGGATACGGATAACGGCTACTCCGTGTTTGAGCAGTCACTGCTGCGGTATATC
+GCTGCCGGGCTGGGTGTCTCGTATGAGCAGCTTTCCCGGAATTACGCCCAGATGAGCTACTCCACGGCACGGGCCAGTGC
+GAACGAGTCGTGGGCGTACTTTATGGGGCGGCGAAAATTCGTCGCATCCCGTCAGGCGAGCCAGATGTTTCTGTGCTGGC
+TGGAAGAGGCCATCGTTCGCCGCGTGGTGACGTTACCTTCAAAAGCGCGCTTCAGTTTTCAGGAAGCCCGCAGTGCCTGG
+GGGAACTGCGACTGGATAGGCTCCGGTCGTATGGCCATCGATGGTCTGAAAGAAGTTCAGGAAGCGGTGATGCTGATAGA
+AGCCGGACTGAGTACCTACGAGAAAGAGTGCGCAAAACGCGGTGACGACTATCAGGAAATTTTTGCCCAGCAGGTCCGTG
+AAACGATGGAGCGCCGTGCAGCCGGTCTTAAACCGCCCGCCTGGGCGGCTGCAGCATTTGAATCCGGGCTGCGACAATCA
+ACAGAGGAGGAGAAGAGTGACAGCAGAGCTGCGTAATCTCCCGCATATTGCCAGCATGGCCTTTAATGAGCCGCTGATGC
+TTGAACCCGCCTATGCGCGGGTTTTCTTTTGTGCGCTTGCAGGCCAGCTTGGGATCAGCAGCCTGACGGATGCGGTGTCC
+GGCGACAGCCTGACTGCCCAGGAGGCACTCGCGACGCTGGCATTATCCGGTGATGATGACGGACCACGACAGGCCCGCAG
+TTATCAGGTCATGAACGGCATCGCCGTGCTGCCGGTGTCCGGCACGCTGGTCAGCCGGACGCGGGCGCTGCAGCCGTACT
+CGGGGATGACCGGTTACAACGGCATTATCGCCCGTCTGCAACAGGCTGCCAGCGATCCGATGGTGGACGGCATTCTGCTC
+GATATGGACACGCCCGGCGGGATGGTGGCGGGGGCATTTGACTGCGCTGACATCATCGCCCGTGTGCGTGACATAAAACC
+GGTATGGGCGCTTGCCAACGACATGAACTGCAGTGCAGGTCAGTTGCTTGCCAGTGCCGCCTCCCGGCGTCTGGTCACGC
+AGACCGCCCGGACAGGCTCCATCGGCGTCATGATGGCTCACAGTAATTACGGTGCTGCGCTGGAGAAACAGGGTGTGGAA
+ATCACGCTGATTTACAGCGGCAGCCATAAGGTGGATGGCAACCCCTACAGCCATCTTCCGGATGACGTCCGGGAGACACT
+GCAGTCCCGGATGGACGCAACCCGCCAGATGTTTGCGCAGAAGGTGTCGGCATATACCGGCCTGTCCGTGCAGGTTGTGC
+TGGATACCGAGGCTGCAGTGTACAGCGGTCAGGAGGCCATTGATGCCGGACTGGCTGATGAACTTGTTAACAGCACCGAT
+GCGATCACCGTCATGCGTGATGCACTGGATGCACGTAAATCCCGTCTCTCAGGAGGGCGAATGACCAAAGAGACTCAATC
+AACAACTGTTTCAGCCACTGCTTCGCAGGCTGACGTTACTGACGTGGTGCCAGCGACGGAGGGCGAGAACGCCAGCGCGG
+CGCAGCCGGACGTGAACGCGCAGATCACCGCAGCGGTTGCGGCAGAAAACAGCCGCATTATGGGGATCCTCAACTGTGAG
+GAGGCTCACGGACGCGAAGAACAGGCACGCGTGCTGGCAGAAACCCCCGGTATGACCGTGAAAACGGCCCGCCGCATTCT
+GGCCGCAGCACCACAGAGTGCACAGGCGCGCAGTGACACTGCGCTGGATCGTCTGATGCAGGGGGCACCGGCACCGCTGG
+CTGCAGGTAACCCGGCATCTGATGCCGTTAACGATTTGCTGAACACACCAGTGTAAGGGATGTTTATGACGAGCAAAGAA
+ACCTTTACCCATTACCAGCCGCAGGGCAACAGTGACCCGGCTCATACCGCAACCGCGCCCGGCGGATTGAGTGCGAAAGC
+GCCTGCAATGACCCCGCTGATGCTGGACACCTCCAGCCGTAAGCTGGTTGCGTGGGATGGCACCACCGACGGTGCTGCCG
+TTGGCATTCTTGCGGTTGCTGCTGACCAGACCAGCACCACGCTGACGTTCTACAAGTCCGGCACGTTCCGTTATGAGGAT
+GTGCTCTGGCCGGAGGCTGCCAGCGACGAGACGAAAAAACGGACCGCGTTTGCCGGAACGGCAATCAGCATCGTTTAACT
+TTACCCTTCATCACTAAAGGCCGCCTGTGCGGCTTTTTTTACGGGATTTTTTTATGTCGATGTACACAACCGCCCAACTG
+CTGGCGGCAAATGAGCAGAAATTTAAGTTTGATCCGCTGTTTCTGCGTCTCTTTTTCCGTGAGAGCTATCCCTTCACCAC
+GGAGAAAGTCTATCTCTCACAAATTCCGGGACTGGTAAACATGGCGCTGTACGTTTCGCCGATTGTTTCCGGTGAGGTTA
+TCCGTTCCCGTGGCGGCTCCACCTCTGAATTTACGCCGGGATATGTCAAGCCGAAGCATGAAGTGAATCCGCAGATGACC
+CTGCGTCGCCTGCCGGATGAAGATCCGCAGAATCTGGCGGACCCGGCTTACCGCCGCCGTCGCATCATCATGCAGAACAT
+GCGTGACGAAGAGCTGGCCATTGCTCAGGTCGAAGAGATGCAGGCAGTTTCTGCCGTGCTTAAGGGCAAATACACCATGA
+CCGGTGAAGCCTTCGATCCGGTTGAGGTGGATATGGGCCGCAGTGAGGAGAATAACATCACGCAGTCCGGCGGCACGGAG
+TGGAGCAAGCGTGACAAGTCCACGTATGACCCGACCGACGATATCGAAGCCTACGCGCTGAACGCCAGCGGTGTGGTGAA
+TATCATCGTGTTCGATCCGAAAGGCTGGGCGCTGTTCCGTTCCTTCAAAGCCGTCAAGGAGAAGCTGGATACCCGTCGTG
+GCTCTAATTCCGAGCTGGAGACAGCGGTGAAAGACCTGGGCAAAGCGGTGTCCTATAAGGGGATGTATGGCGATGTGGCC
+ATCGTCGTGTATTCCGGACAGTACGTGGAAAACGGCGTCAAAAAGAACTTCCTGCCGGACAACACGATGGTGCTGGGGAA
+CACTCAGGCACGCGGTCTGCGCACCTATGGCTGCATTCAGGATGCGGACGCACAGCGCGAAGGCATTAACGCCTCTGCCC
+GTTACCCGAAAAACTGGGTGACCACCGGCGATCCGGCGCGTGAGTTCACCATGATTCAGTCAGCACCGCTGATGCTGCTG
+GCTGACCCTGATGAGTTCGTGTCCGTACAACTGGCGTAATCATGGCCCTTCGGGGCCATTGTTTCTCTGTGGAGGAGTCC
+ATGACGAAAGATGAACTGATTGCCCGTCTCCGCTCGCTGGGTGAACAACTGAACCGTGATGTCAGCCTGACGGGGACGAA
+AGAAGAACTGGCGCTCCGTGTGGCAGAGCTGAAAGAGGAGCTTGATGACACGGATGAAACTGCCGGTCAGGACACCCCTC
+TCAGCCGGGAAAATGTGCTGACCGGACATGAAAATGAGGTGGGATCAGCGCAGCCGGATACCGTGATTCTGGATACGTCT
+GAACTGGTCACGGTCGTGGCACTGGTGAAGCTGCATACTGATGCACTTCACGCCACGCGGGATGAACCTGTGGCATTTGT
+GCTGCCGGGAACGGCGTTTCGTGTCTCTGCCGGTGTGGCAGCCGAAATGACAGAGCGCGGCCTGGCCAGAATGCAATAAC
+GGGAGGCGCTGTGGCTGATTTCGATAACCTGTTCGATGCTGCCATTGCCCGCGCCGATGAAACGATACGCGGGTACATGG
+GAACGTCAGCCACCATTACATCCGGTGAGCAGTCAGGTGCGGTGATACGTGGTGTTTTTGATGACCCTGAAAATATCAGC
+TATGCCGGACAGGGCGTGCGCGTTGAAGGCTCCAGCCCGTCCCTGTTTGTCCGGACTGATGAGGTGCGGCAGCTGCGGCG
+TGGAGACACGCTGACCATCGGTGAGGAAAATTTCTGGGTAGATCGGGTTTCGCCGGATGATGGCGGAAGTTGTCATCTCT
+GGCTTGGACGGGGCGTACCGCCTGCCGTTAACCGTCGCCGCTGAAAGGGGGATGTATGGCCATAAAAGGTCTTGAGCAGG
+CCGTTGAAAACCTCAGCCGTATCAGCAAAACGGCGGTGCCTGGTGCCGCCGCAATGGCCATTAACCGCGTTGCTTCATCC
+GCGATATCGCAGTCGGCGTCACAGGTTGCCCGTGAGACAAAGGTACGCCGGAAACTGGTAAAGGAAAGGGCCAGGCTGAA
+AAGGGCCACGGTCAAAAATCCGCAGGCCAGAATCAAAGTTAACCGGGGGGATTTGCCCGTAATCAAGCTGGGTAATGCGC
+GGGTTGTCCTTTCGCGCCGCAGGCGTCGTAAAAAGGGGCAGCGTTCATCCCTGAAAGGTGGCGGCAGCGTGCTTGTGGTG
+GGTAACCGTCGTATTCCCGGCGCGTTTATTCAGCAACTGAAAAATGGCCGGTGGCATGTCATGCAGCGTGTGGCTGGGAA
+AAACCGTTACCCCATTGATGTGGTGAAAATCCCGATGGCGGTGCCGCTGACCACGGCGTTTAAACAAAATATTGAGCGGA
+TACGGCGTGAACGTCTTCCGAAAGAGCTGGGCTATGCGCTGCAGCATCAACTGAGGATGGTAATAAAGCGATGAAACATA
+CTGAACTCCGTGCAGCCGTACTGGATGCACTGGAGAAGCATGACACCGGGGCGACGTTTTTTGATGGTCGCCCCGCTGTT
+TTTGATGAGGCGGATTTTCCGGCAGTTGCCGTTTATCTCACCGGCGCTGAATACACGGGCGAAGAGCTGGACAGCGATAC
+CTGGCAGGCGGAGCTGCATATCGAAGTTTTCCTGCCTGCTCAGGTGCCGGATTCAGAGCTGGATGCGTGGATGGAGTCCC
+GGATTTATCCGGTGATGAGCGATATCCCGGCACTGTCAGATTTGATCACCAGTATGGTGGCCAGCGGCTATGACTACCGG
+CGCGACGATGATGCGGGCTTGTGGAGTTCAGCCGATCTGACTTATGTCATTACCTATGAAATGTGAGGACGCTATGCCTG
+TACCAAATCCTACAATGCCGGTGAAAGGTGCCGGGACCACCCTGTGGGTTTATAAGGGGAGCGGTGACCCTTACGCGAAT
+CCGCTTTCAGACGTTGACTGGTCGCGTCTGGCAAAAGTTAAAGACCTGACGCCCGGCGAACTGACCGCTGAGTCCTATGA
+CGACAGCTATCTCGATGATGAAGATGCAGACTGGACTGCGACCGGGCAGGGGCAGAAATCTGCCGGAGATACCAGCTTCA
+CGCTGGCGTGGATGCCCGGAGAGCAGGGGCAGCAGGCGCTGCTGGCGTGGTTTAATGAAGGCGATACCCGTGCCTATAAA
+ATCCGCTTCCCGAACGGCACGGTCGATGTGTTCCGTGGCTGGGTCAGCAGTATCGGTAAGGCGGTGACGGCGAAGGAAGT
+GATCACCCGCACGGTGAAAGTCACCAATGTGGGACGTCCGTCGATGGCAGAAGATCGCAGCACGGTAACAGCGGCAACCG
+GCATGACCGTGACGCCTGCCAGCACCTCGGTGGTGAAAGGGCAGAGCACCACGCTGACCGTGGCCTTCCAGCCGGAGGGC
+GTAACCGACAAGAGCTTTCGTGCGGTGTCTGCGGATAAAACAAAAGCCACCGTGTCGGTCAGTGGTATGACCATCACCGT
+GAACGGCGTTGCTGCAGGCAAGGTCAACATTCCGGTTGTATCCGGTAATGGTGAGTTTGCTGCGGTTGCAGAAATTACCG
+TCACCGCCAGTTAATCCGGAGAGTCAGCGATGTTCCTGAAAACCGAATCATTTGAACATAACGGTGTGACCGTCACGCTT
+TCTGAACTGTCAGCCCTGCAGCGCATTGAGCATCTCGCCCTGATGAAACGGCAGGCAGAACAGGCGGAGTCAGACAGCAA
+CCGGAAGTTTACTGTGGAAGACGCCATCAGAACCGGCGCGTTTCTGGTGGCGATGTCCCTGTGGCATAACCATCCGCAGA
+AGACGCAGATGCCGTCCATGAATGAAGCCGTTAAACAGATTGAGCAGGAAGTGCTTACCACCTGGCCCACGGAGGCAATT
+TCTCATGCTGAAAACGTGGTGTACCGGCTGTCTGGTATGTATGAGTTTGTGGTGAATAATGCCCCTGAACAGACAGAGGA
+CGCCGGGCCCGCAGAGCCTGTTTCTGCGGGAAAGTGTTCGACGGTGAGCTGAGTTTTGCCCTGAAACTGGCGCGTGAGAT
+GGGGCGACCCGACTGGCGTGCCATGCTTGCCGGGATGTCATCCACGGAGTATGCCGACTGGCACCGCTTTTACAGTACCC
+ATTATTTTCATGATGTTCTGCTGGATATGCACTTTTCCGGGCTGACGTACACCGTGCTCAGCCTGTTTTTCAGCGATCCG
+GATATGCATCCGCTGGATTTCAGTCTGCTGAACCGGCGCGAGGCTGACGAAGAGCCTGAAGATGATGTGCTGATGCAGAA
+AGCGGCAGGGCTTGCCGGAGGTGTCCGCTTTGGCCCGGACGGGAATGAAGTTATCCCCGCTTCCCCGGATGTGGCGGACA
+TGACGGAGGATGACGTAATGCTGATGACAGTATCAGAAGGGATCGCAGGAGGAGTCCGGTATGGCTGAACCGGTAGGCGA
+TCTGGTCGTTGATTTGAGTCTGGATGCGGCCAGATTTGACGAGCAGATGGCCAGAGTCAGGCGTCATTTTTCTGGTACGG
+AAAGTGATGCGAAAAAAACAGCGGCAGTCGTTGAACAGTCGCTGAGCCGACAGGCGCTGGCTGCACAGAAAGCGGGGATT
+TCCGTCGGGCAGTATAAAGCCGCCATGCGTATGCTGCCTGCACAGTTCACCGACGTGGCCACGCAGCTTGCAGGCGGGCA
+AAGTCCGTGGCTGATCCTGCTGCAACAGGGGGGGCAGGTGAAGGACTCCTTCGGCGGGATGATCCCCATGTTCAGGGGGC
+TTGCCGGTGCGATCACCCTGCCGATGGTGGGGGCCACCTCGCTGGCGGTGGCGACCGGTGCGCTGGCGTATGCCTGGTAT
+CAGGGCAACTCAACCCTGTCCGATTTCAACAAAACGCTGGTCCTTTCCGGCAATCAGGCGGGACTGACGGCAGATCGTAT
+GCTGGTCCTGTCCAGAGCCGGGCAGGCGGCAGGGCTGACGTTTAACCAGACCAGCGAGTCACTCAGCGCACTGGTTAAGG
+CGGGGGTAAGCGGTGAGGCTCAGATTGCGTCCATCAGCCAGAGTGTGGCGCGTTTCTCCTCTGCATCCGGCGTGGAGGTG
+GACAAGGTCGCTGAAGCCTTCGGGAAGCTGACCACAGACCCGACGTCGGGGCTGACGGCGATGGCTCGCCAGTTCCATAA
+CGTGTCGGCGGAGCAGATTGCGTATGTTGCTCAGTTGCAGCGTTCCGGCGATGAAGCCGGGGCATTGCAGGCGGCGAACG
+AGGCCGCAACGAAAGGGTTTGATGACCAGACCCGCCGCCTGAAAGAGAACATGGGCACGCTGGAGACCTGGGCAGACAGG
+ACTGCGCGGGCATTCAAATCCATGTGGGATGCGGTGCTGGATATTGGTCGTCCTGATACCGCGCAGGAGATGCTGATTAA
+GGCAGAGGCTGCGTATAAGAAAGCAGACGACATCTGGAATCTGCGCAAGGATGATTATTTTGTTAACGATGAAGCGCGGG
+CGCGTTACTGGGATGATCGTGAAAAGGCCCGTCTTGCGCTTGAAGCCGCCCGAAAGAAGGCTGAGCAGCAGACTCAACAG
+GACAAAAATGCGCAGCAGCAGAGCGATACCGAAGCGTCACGGCTGAAATATACCGAAGAGGCGCAGAAGGCTTACGAACG
+GCTGCAGACGCCGCTGGAGAAATATACCGCCCGTCAGGAAGAACTGAACAAGGCACTGAAAGACGGGAAAATCCTGCAGG
+CGGATTACAACACGCTGATGGCGGCGGCGAAAAAGGATTATGAAGCGACGCTGAAAAAGCCGAAACAGTCCAGCGTGAAG
+GTGTCTGCGGGCGATCGTCAGGAAGACAGTGCTCATGCTGCCCTGCTGACGCTTCAGGCAGAACTCCGGACGCTGGAGAA
+GCATGCCGGAGCAAATGAGAAAATCAGCCAGCAGCGCCGGGATTTGTGGAAGGCGGAGAGTCAGTTCGCGGTACTGGAGG
+AGGCGGCGCAACGTCGCCAGCTGTCTGCACAGGAGAAATCCCTGCTGGCGCATAAAGATGAGACGCTGGAGTACAAACGC
+CAGCTGGCTGCACTTGGCGACAAGGTTACGTATCAGGAGCGCCTGAACGCGCTGGCGCAGCAGGCGGATAAATTCGCACA
+GCAGCAACGGGCAAAACGGGCCGCCATTGATGCGAAAAGCCGGGGGCTGACTGACCGGCAGGCAGAACGGGAAGCCACGG
+AACAGCGCCTGAAGGAACAGTATGGCGATAATCCGCTGGCGCTGAATAACGTCATGTCAGAGCAGAAAAAGACCTGGGCG
+GCTGAAGACCAGCTTCGCGGGAACTGGATGGCAGGCCTGAAGTCCGGCTGGAGTGAGTGGGAAGAGAGCGCCACGGACAG
+TATGTCGCAGGTAAAAAGTGCAGCCACGCAGACCTTTGATGGTATTGCACAGAATATGGCGGCGATGCTGACCGGCAGTG
+AGCAGAACTGGCGCAGCTTCACCCGTTCCGTGCTGTCCATGATGACAGAAATTCTGCTTAAGCAGGCAATGGTGGGGATT
+GTCGGGAGTATCGGCAGCGCCATTGGCGGGGCTGTTGGTGGCGGCGCATCCGCGTCAGGCGGTACAGCCATTCAGGCCGC
+TGCGGCGAAATTCCATTTTGCAACCGGAGGATTTACGGGAACCGGCGGCAAATATGAGCCAGCGGGGATTGTTCACCGTG
+GTGAGTTTGTCTTCACGAAGGAGGCAACCAGCCGGATTGGCGTGGGGAATCTTTACCGGCTGATGCGCGGCTATGCCACC
+GGCGGTTATGTCGGTACACCGGGCAGCATGGCAGACAGCCGGTCGCAGGCGTCCGGGACGTTTGAGCAGAATAACCATGT
+GGTGATTAACAACGACGGCACGAACGGGCAGATAGGTCCGGCTGCTCTGAAGGCGGTGTATGACATGGCCCGCAAGGGTG
+CCCGTGATGAAATTCAGACACAGATGCGTGATGGTGGCCTGTTCTCCGGAGGTGGACGATGAAGACCTTCCGCTGGAAAG
+TGAAACCCGGTATGGATGTGGCTTCGGTCCCTTCTGTAAGAAAGGTGCGCTTTGGTGATGGCTATTCTCAGCGAGCGCCT
+GCCGGGCTGAATGCCAACCTGAAAACGTACAGCGTGACGCTTTCTGTCCCCCGTGAGGAGGCCACGGTACTGGAGTCGTT
+TCTGGAAGAGCACGGGGGCTGGAAATCCTTTCTGTGGACGCCGCCTTATGAGTGGCGGCAGATAAAGGTGACCTGCGCAA
+AATGGTCGTCGCGGGTCAGTATGCTGCGTGTTGAGTTCAGCGCAGAGTTTGAACAGGTGGTGAACTGATGCAGGATATCC
+GGCAGGAAACACTGAATGAATGCACCCGTGCGGAGCAGTCGGCCAGCGTGGTGCTCTGGGAAATCGACCTGACAGAGGTC
+GGTGGAGAACGTTATTTTTTCTGTAATGAGCAGAACGAAAAAGGTGAGCCGGTCACCTGGCAGGGGCGACAGTATCAGCC
+GTATCCCATTCAGGGGAGCGGTTTTGAACTGAATGGCAAAGGCACCAGTACGCGCCCCACGCTGACGGTTTCTAACCTGT
+ACGGTATGGTCACCGGGATGGCGGAAGATATGCAGAGTCTGGTCGGCGGAACGGTGGTCCGGCGTAAGGTTTACGCCCGT
+TTTCTGGATGCGGTGAACTTCGTCAACGGAAACAGTTACGCCGATCCGGAGCAGGAGGTGATCAGCCGCTGGCGCATTGA
+GCAGTGCAGCGAACTGAGCGCGGTGAGTGCCTCCTTTGTACTGTCCACGCCGACGGAAACGGATGGCGCTGTTTTTCCGG
+GACGTATCATGCTGGCCAACACCTGCACCTGGACCTATCGCGGTGACGAGTGCGGTTATAGCGGTCCGGCTGTCGCGGAT
+GAATATGACCAGCCAACGTCCGATATCACGAAGGATAAATGCAGCAAATGCCTGAGCGGTTGTAAGTTCCGCAATAACGT
+CGGCAACTTTGGCGGCTTCCTTTCCATTAACAAACTTTCGCAGTAAATCCCATGACACAGACAGAATCAGCGATTCTGGC
+GCACGCCCGGCGATGTGCGCCAGCGGAGTCGTGCGGCTTCGTGGTAAGCACGCCGGAGGGGGAAAGATATTTCCCCTGCG
+TGAATATCTCCGGTGAGCCGGAGGCGTATTTCCGTATGTCGCCGGAAGACTGGCTGCAGGCAGAAATGCAGGGTGAGATT
+GTGGCGCTGGTCCACAGCCACCCCGGTGGTCTGCCCTGGCTGAGTGAGGCCGACCGGCGGCTGCAGGTGCAGAGTGATTT
+GCCGTGGTGGCTGGTCTGCCGGGGGACGATTCATAAGTTCCGCTGTGTGCCGCATCTCACCGGGCGGCGCTTTGAGCACG
+GTGTGACGGACTGTTACACACTGTTCCGGGATGCTTATCATCTGGCGGGGATTGAGATGCCGGACTTTCATCGTGAGGAT
+GACTGGTGGCGTAACGGCCAGAATCTCTATCTGGATAATCTGGAGGCGACGGGGCTGTATCAGGTGCCGTTGTCAGCGGC
+ACAGCCGGGCGATGTGCTGCTGTGCTGTTTTGGTTCATCAGTGCCGAATCACGCCGCAATTTACTGCGGCGACGGCGAGC
+TGCTGCACCATATTCCTGAACAACTGAGCAAACGAGAGAGGTACACCGACAAATGGCAGCGACGCACACACTCCCTCTGG
+CGTCACCGGGCATGGCGCGCATCTGCCTTTACGGGGATTTACAACGATTTGGTCGCCGCATCGACCTTCGTGTGAAAACG
+GGGGCTGAAGCCATCCGGGCACTGGCCACACAGCTCCCGGCGTTTCGTCAGAAACTGAGCGACGGCTGGTATCAGGTACG
+GATTGCCGGGCGGGACGTCAGCACGTCCGGGTTAACGGCGCAGTTACATGAGACTCTGCCTGATGGCGCTGTAATTCATA
+TTGTTCCCAGAGTCGCCGGGGCCAAGTCAGGTGGCGTATTCCAGATTGTCCTGGGGGCTGCCGCCATTGCCGGATCATTC
+TTTACCGCCGGAGCCACCCTTGCAGCATGGGGGGCAGCCATTGGGGCCGGTGGTATGACCGGCATCCTGTTTTCTCTCGG
+TGCCAGTATGGTGCTCGGTGGTGTGGCGCAGATGCTGGCACCGAAAGCCAGAACTCCCCGTATACAGACAACGGATAACG
+GTAAGCAGAACACCTATTTCTCCTCACTGGATAACATGGTTGCCCAGGGCAATGTTCTGCCTGTTCTGTACGGGGAAATG
+CGCGTGGGGTCACGCGTGGTTTCTCAGGAGATCAGCACGGCAGACGAAGGGGACGGTGGTCAGGTTGTGGTGATTGGTCG
+CTGATGCAAAATGTTTTATGTGAAACCGCCTGCGGGCGGTTTTGTCATTTATGGAGCGTGAGGAATGGGTAAAGGAAGCA
+GTAAGGGGCATACCCCGCGCGAAGCGAAGGACAACCTGAAGTCCACGCAGTTGCTGAGTGTGATCGATGCCATCAGCGAA
+GGGCCGATTGAAGGTCCGGTGGATGGCTTAAAAAGCGTGCTGCTGAACAGTACGCCGGTGCTGGACACTGAGGGGAATAC
+CAACATATCCGGTGTCACGGTGGTGTTCCGGGCTGGTGAGCAGGAGCAGACTCCGCCGGAGGGATTTGAATCCTCCGGCT
+CCGAGACGGTGCTGGGTACGGAAGTGAAATATGACACGCCGATCACCCGCACCATTACGTCTGCAAACATCGACCGTCTG
+CGCTTTACCTTCGGTGTACAGGCACTGGTGGAAACCACCTCAAAGGGTGACAGGAATCCGTCGGAAGTCCGCCTGCTGGT
+TCAGATACAACGTAACGGTGGCTGGGTGACGGAAAAAGACATCACCATTAAGGGCAAAACCACCTCGCAGTATCTGGCCT
+CGGTGGTGATGGGTAACCTGCCGCCGCGCCCGTTTAATATCCGGATGCGCAGGATGACGCCGGACAGCACCACAGACCAG
+CTGCAGAACAAAACGCTCTGGTCGTCATACACTGAAATCATCGATGTGAAACAGTGCTACCCGAACACGGCACTGGTCGG
+CGTGCAGGTGGACTCGGAGCAGTTCGGCAGCCAGCAGGTGAGCCGTAATTATCATCTGCGCGGGCGTATTCTGCAGGTGC
+CGTCGAACTATAACCCGCAGACGCGGCAATACAGCGGTATCTGGGACGGAACGTTTAAACCGGCATACAGCAACAACATG
+GCCTGGTGTCTGTGGGATATGCTGACCCATCCGCGCTACGGCATGGGGAAACGTCTTGGTGCGGCGGATGTGGATAAATG
+GGCGCTGTATGTCATCGGCCAGTACTGCGACCAGTCAGTGCCGGACGGCTTTGGCGGCACGGAGCCGCGCATCACCTGTA
+ATGCGTACCTGACCACACAGCGTAAGGCGTGGGATGTGCTCAGCGATTTCTGCTCGGCGATGCGCTGTATGCCGGTATGG
+AACGGGCAGACGCTGACGTTCGTGCAGGACCGACCGTCGGATAAGACGTGGACCTATAACCGCAGTAATGTGGTGATGCC
+GGATGATGGCGCGCCGTTCCGCTACAGCTTCAGCGCCCTGAAGGACCGCCATAATGCCGTTGAGGTGAACTGGATTGACC
+CGAACAACGGCTGGGAGACGGCGACAGAGCTTGTTGAAGATACGCAGGCCATTGCCCGTTACGGTCGTAATGTTACGAAG
+ATGGATGCCTTTGGCTGTACCAGCCGGGGGCAGGCACACCGCGCCGGGCTGTGGCTGATTAAAACAGAACTGCTGGAAAC
+GCAGACCGTGGATTTCAGCGTCGGCGCAGAAGGGCTTCGCCATGTACCGGGCGATGTTATTGAAATCTGCGATGATGACT
+ATGCCGGTATCAGCACCGGTGGTCGTGTGCTGGCGGTGAACAGCCAGACCCGGACGCTGACGCTCGACCGTGAAATCACG
+CTGCCATCCTCCGGTACCGCGCTGATAAGCCTGGTTGACGGAAGTGGCAATCCGGTCAGCGTGGAGGTTCAGTCCGTCAC
+CGACGGCGTGAAGGTAAAAGTGAGCCGTGTTCCTGACGGTGTTGCTGAATACAGCGTATGGGAGCTGAAGCTGCCGACGC
+TGCGCCAGCGACTGTTCCGCTGCGTGAGTATCCGTGAGAACGACGACGGCACGTATGCCATCACCGCCGTGCAGCATGTG
+CCGGAAAAAGAGGCCATCGTGGATAACGGGGCGCACTTTGACGGCGAACAGAGTGGCACGGTGAATGGTGTCACGCCGCC
+AGCGGTGCAGCACCTGACCGCAGAAGTCACTGCAGACAGCGGGGAATATCAGGTGCTGGCGCGATGGGACACACCGAAGG
+TGGTGAAGGGCGTGAGTTTCCTGCTCCGTCTGACCGTAACAGCGGACGACGGCAGTGAGCGGCTGGTCAGCACGGCCCGG
+ACGACGGAAACCACATACCGCTTCACGCAACTGGCGCTGGGGAACTACAGGCTGACAGTCCGGGCGGTAAATGCGTGGGG
+GCAGCAGGGCGATCCGGCGTCGGTATCGTTCCGGATTGCCGCACCGGCAGCACCGTCGAGGATTGAGCTGACGCCGGGCT
+ATTTTCAGATAACCGCCACGCCGCATCTTGCCGTTTATGACCCGACGGTACAGTTTGAGTTCTGGTTCTCGGAAAAGCAG
+ATTGCGGATATCAGACAGGTTGAAACCAGCACGCGTTATCTTGGTACGGCGCTGTACTGGATAGCCGCCAGTATCAATAT
+CAAACCGGGCCATGATTATTACTTTTATATCCGCAGTGTGAACACCGTTGGCAAATCGGCATTCGTGGAGGCCGTCGGTC
+GGGCGAGCGATGATGCGGAAGGTTACCTGGATTTTTTCAAAGGCAAGATAACCGAATCCCATCTCGGCAAGGAGCTGCTG
+GAAAAAGTCGAGCTGACGGAGGATAACGCCAGCAGACTGGAGGAGTTTTCGAAAGAGTGGAAGGATGCCAGTGATAAGTG
+GAATGCCATGTGGGCTGTCAAAATTGAGCAGACCAAAGACGGCAAACATTATGTCGCGGGTATTGGCCTCAGCATGGAGG
+ACACGGAGGAAGGCAAACTGAGCCAGTTTCTGGTTGCCGCCAATCGTATCGCATTTATTGACCCGGCAAACGGGAATGAA
+ACGCCGATGTTTGTGGCGCAGGGCAACCAGATATTCATGAACGACGTGTTCCTGAAGCGCCTGACGGCCCCCACCATTAC
+CAGCGGCGGCAATCCTCCGGCCTTTTCCCTGACACCGGACGGAAAGCTGACCGCTAAAAATGCGGATATCAGTGGCAGTG
+TGAATGCGAACTCCGGGACGCTCAGTAATGTGACGATAGCTGAAAACTGTACGATAAACGGTACGCTGAGGGCGGAAAAA
+ATCGTCGGGGACATTGTAAAGGCGGCGAGCGCGGCTTTTCCGCGCCAGCGTGAAAGCAGTGTGGACTGGCCGTCAGGTAC
+CCGTACTGTCACCGTGACCGATGACCATCCTTTTGATCGCCAGATAGTGGTGCTTCCGCTGACGTTTCGCGGAAGTAAGC
+GTACTGTCAGCGGCAGGACAACGTATTCGATGTGTTATCTGAAAGTACTGATGAACGGTGCGGTGATTTATGATGGCGCG
+GCGAACGAGGCGGTACAGGTGTTCTCCCGTATTGTTGACATGCCAGCGGGTCGGGGAAACGTGATCCTGACGTTCACGCT
+TACGTCCACACGGCATTCGGCAGATATTCCGCCGTATACGTTTGCCAGCGATGTGCAGGTTATGGTGATTAAGAAACAGG
+CGCTGGGCATCAGCGTGGTCTGAGTGTGTTACAGAGGTTCGTCCGGGAACGGGCGTTTTATTATAAAACAGTGAGAGGTG
+AACGATGCGTAATGTGTGTATTGCCGTTGCTGTCTTTGCCGCACTTGCGGTGACAGTCACTCCGGCCCGTGCGGAAGGTG
+GACATGGTACGTTTACGGTGGGCTATTTTCAAGTGAAACCGGGTACATTGCCGTCGTTGTCGGGCGGGGATACCGGTGTG
+AGTCATCTGAAAGGGATTAACGTGAAGTACCGTTATGAGCTGACGGACAGTGTGGGGGTGATGGCTTCCCTGGGGTTCGC
+CGCGTCGAAAAAGAGCAGCACAGTGATGACCGGGGAGGATACGTTTCACTATGAGAGCCTGCGTGGACGTTATGTGAGCG
+TGATGGCCGGACCGGTTTTACAAATCAGTAAGCAGGTCAGTGCGTACGCCATGGCCGGAGTGGCTCACAGTCGGTGGTCC
+GGCAGTACAATGGATTACCGTAAGACGGAAATCACTCCCGGGTATATGAAAGAGACGACCACTGCCAGGGACGAAAGTGC
+AATGCGGCATACCTCAGTGGCGTGGAGTGCAGGTATACAGATTAATCCGGCAGCGTCCGTCGTTGTTGATATTGCTTATG
+AAGGCTCCGGCAGTGGCGACTGGCGTACTGACGGATTCATCGTTGGGGTCGGTTATAAATTCTGATTAGCCAGGTAACAC
+AGTGTTATGACAGCCCGCCGGAACCGGTGGGCTTTTTTGTGGGGTGAATATGGCAGTAAAGATTTCAGGAGTCCTGAAAG
+ACGGCACAGGAAAACCGGTACAGAACTGCACCATTCAGCTGAAAGCCAGACGTAACAGCACCACGGTGGTGGTGAACACG
+GTGGGCTCAGAGAATCCGGATGAAGCCGGGCGTTACAGCATGGATGTGGAGTACGGTCAGTACAGTGTCATCCTGCAGGT
+TGACGGTTTTCCACCATCGCACGCCGGGACCATCACCGTGTATGAAGATTCACAACCGGGGACGCTGAATGATTTTCTCT
+GTGCCATGACGGAGGATGATGCCCGGCCGGAGGTGCTGCGTCGTCTTGAACTGATGGTGGAAGAGGTGGCGCGTAACGCG
+TCCGTGGTGGCACAGAGTACGGCAGACGCGAAGAAATCAGCCGGCGATGCCAGTGCATCAGCTGCTCAGGTCGCGGCCCT
+TGTGACTGATGCAACTGACTCAGCACGCGCCGCCAGCACGTCCGCCGGACAGGCTGCATCGTCAGCTCAGGAAGCGTCCT
+CCGGCGCAGAAGCGGCATCAGCAAAGGCCACTGAAGCGGAAAAAAGTGCCGCAGCCGCAGAGTCCTCAAAAAACGCGGCG
+GCCACCAGTGCCGGTGCGGCGAAAACGTCAGAAACGAATGCTGCAGCGTCACAACAATCAGCCGCCACGTCTGCCTCCAC
+CGCGGCCACGAAAGCGTCAGAGGCCGCCACTTCAGCACGAGATGCGGTGGCCTCAAAAGAGGCAGCAAAATCATCAGAAA
+CGAACGCATCATCAAGTGCCGGTCGTGCAGCTTCCTCGGCAACGGCGGCAGAAAATTCTGCCAGGGCGGCAAAAACGTCC
+GAGACGAATGCCAGGTCATCTGAAACAGCAGCGGAACGGAGCGCCTCTGCCGCGGCAGACGCAAAAACAGCGGCGGCGGG
+GAGTGCGTCAACGGCATCCACGAAGGCGACAGAGGCTGCGGGAAGTGCGGTATCAGCATCGCAGAGCAAAAGTGCGGCAG
+AAGCGGCGGCAATACGTGCAAAAAATTCGGCAAAACGTGCAGAAGATATAGCTTCAGCTGTCGCGCTTGAGGATGCGGAC
+ACAACGAGAAAGGGGATAGTGCAGCTCAGCAGTGCAACCAACAGCACGTCTGAAACGCTTGCTGCAACGCCAAAGGCGGT
+TAAGGTGGTAATGGATGAAACGAACAGAAAAGCCCACTGGACAGTCCGGCACTGACCGGAACGCCAACAGCACCAACCGC
+GCTCAGGGGAACAAACAATACCCAGATTGCGAACACCGCTTTTGTACTGGCCGCGATTGCAGATGTTATCGACGCGTCAC
+CTGACGCACTGAATACGCTGAATGAACTGGCCGCAGCGCTCGGGAATGATCCAGATTTTGCTACCACCATGACTAACGCG
+CTTGCGGGTAAACAACCGAAGAATGCGACACTGACGGCGCTGGCAGGGCTTTCCACGGCGAAAAATAAATTACCGTATTT
+TGCGGAAAATGATGCCGCCAGCCTGACTGAACTGACTCAGGTTGGCAGGGATATTCTGGCAAAAAATTCCGTTGCAGATG
+TTCTTGAATACCTTGGGGCCGGTGAGAATTCGGCCTTTCCGGCAGGTGCGCCGATCCCGTGGCCATCAGATATCGTTCCG
+TCTGGCTACGTCCTGATGCAGGGGCAGGCGTTTGACAAATCAGCCTACCCAAAACTTGCTGTCGCGTATCCATCGGGTGT
+GCTTCCTGATATGCGAGGCTGGACAATCAAGGGGAAACCCGCCAGCGGTCGTGCTGTATTGTCTCAGGAACAGGATGGAA
+TTAAGTCGCACACCCACAGTGCCAGTGCATCCGGTACGGATTTGGGGACGAAAACCACATCGTCGTTTGATTACGGGACG
+AAAACAACAGGCAGTTTCGATTACGGCACCAAATCGACGAATAACACGGGGGCTCATGCTCACAGTCTGAGCGGTTCAAC
+AGGGGCCGCGGGTGCTCATGCCCACACAAGTGGTTTAAGGATGAACAGTTCTGGCTGGAGTCAGTATGGAACAGCAACCA
+TTACAGGAAGTTTATCCACAGTTAAAGGAACCAGCACACAGGGTATTGCTTATTTATCGAAAACGGACAGTCAGGGCAGC
+CACAGTCACTCATTGTCCGGTACAGCCGTGAGTGCCGGTGCACATGCGCATACAGTTGGTATTGGTGCGCACCAGCATCC
+GGTTGTTATCGGTGCTCATGCCCATTCTTTCAGTATTGGTTCACACGGACACACCATCACCGTTAACGCTGCGGGTAACG
+CGGAAAACACCGTCAAAAACATTGCATTTAACTATATTGTGAGGCTTGCATAATGGCATTCAGAATGAGTGAACAACCAC
+GGACCATAAAAATTTATAATCTGCTGGCCGGAACTAATGAATTTATTGGTGAAGGTGACGCATATATTCCGCCTCATACC
+GGTCTGCCTGCAAACAGTACCGATATTGCACCGCCAGATATTCCGGCTGGCTTTGTGGCTGTTTTCAACAGTGATGAGGC
+ATCGTGGCATCTCGTTGAAGACCATCGGGGTAAAACCGTCTATGACGTGGCTTCCGGCGACGCGTTATTTATTTCTGAAC
+TCGGTCCGTTACCGGAAAATTTTACCTGGTTATCGCCGGGAGGGGAATATCAGAAGTGGAACGGCACAGCCTGGGTGAAG
+GATACGGAAGCAGAAAAACTGTTCCGGATCCGGGAGGCGGAAGAAACAAAAAAAAGCCTGATGCAGGTAGCCAGTGAGCA
+TATTGCGCCGCTTCAGGATGCTGCAGATCTGGAAATTGCAACGAAGGAAGAAACCTCGTTGCTGGAAGCCTGGAAGAAGT
+ATCGGGTGTTGCTGAACCGTGTTGATACATCAACTGCACCTGATATTGAGTGGCCTGCTGTCCCTGTTATGGAGTAATCG
+TTTTGTGATATGCCGCAGAAACGTTGTATGAAATAACGTTCTGCGGTTAGTTAGTATATTGTAAAGCTGAGTATTGGTTT
+ATTTGGCGATTATTATCTTCAGGAGAATAATGGAAGTTCTATGACTCAATTGTTCATAGTGTTTACATCACCGCCAATTG
+CTTTTAAGACTGAACGCATGAAATATGGTTTTTCGTCATGTTTTGAGTCTGCTGTTGATATTTCTAAAGTCGGTTTTTTT
+TCTTCGTTTTCTCTAACTATTTTCCATGAAATACATTTTTGATTATTATTTGAATCAATTCCAATTACCTGAAGTCTTTC
+ATCTATAATTGGCATTGTATGTATTGGTTTATTGGAGTAGATGCTTGCTTTTCTGAGCCATAGCTCTGATATCCAAATGA
+AGCCATAGGCATTTGTTATTTTGGCTCTGTCAGCTGCATAACGCCAAAAAATATATTTATCTGCTTGATCTTCAAATGTT
+GTATTGATTAAATCAATTGGATGGAATTGTTTATCATAAAAAATTAATGTTTGAATGTGATAACCGTCCTTTAAAAAAGT
+CGTTTCTGCAAGCTTGGCTGTATAGTCAACTAACTCTTCTGTCGAAGTGATATTTTTAGGCTTATCTACCAGTTTTAGAC
+GCTCTTTAATATCTTCAGGAATTATTTTATTGTCATATTGTATCATGCTAAATGACAATTTGCTTATGGAGTAATCTTTT
+AATTTTAAATAAGTTATTCTCCTGGCTTCATCAAATAAAGAGTCGAATGATGTTGGCGAAATCACATCGTCACCCATTGG
+ATTGTTTATTTGTATGCCAAGAGAGTTACAGCAGTTATACATTCTGCCATAGATTATAGCTAAGGCATGTAATAATTCGT
+AATCTTTTAGCGTATTAGCGACCCATCGTCTTTCTGATTTAATAATAGATGATTCAGTTAAATATGAAGGTAATTTCTTT
+TGTGCAAGTCTGACTAACTTTTTTATACCAATGTTTAACATACTTTCATTTGTAATAAACTCAATGTCATTTTCTTCAAT
+GTAAGATGAAATAAGAGTAGCCTTTGCCTCGCTATACATTTCTAAATCGCCTTGTTTTTCTATCGTATTGCGAGAATTTT
+TAGCCCAAGCCATTAATGGATCATTTTTCCATTTTTCAATAACATTATTGTTATACCAAATGTCATATCCTATAATCTGG
+TTTTTGTTTTTTTGAATAATAAATGTTACTGTTCTTGCGGTTTGGAGGAATTGATTCAAATTCAAGCGAAATAATTCAGG
+GTCAAAATATGTATCAATGCAGCATTTGAGCAAGTGCGATAAATCTTTAAGTCTTCTTTCCCATGGTTTTTTAGTCATAA
+AACTCTCCATTTTGATAGGTTGCATGCTAGATGCTGATATATTTTAGAGGTGATAAAATTAACTGCTTAACTGTCAATGT
+AATACAAGTTGTTTGATCTTTGCAATGATTCTTATCAGAAACCATATAGTAAATTAGTTACACAGGAAATTTTTAATATT
+ATTATTATCATTCATTATGTATTAAAATTAGAGTTGTGGCTTGGCTCTGCTAACACGTTGCTCATAGGAGATATGGTAGA
+GCCGCAGACACGTCGTATGCAGGAACGTGCTGCGGCTGGCTGGTGAACTTCCGATAGTGCGGGTGTTGAATGATTTCCAG
+TTGCTACCGATTTTACATATTTTTTGCATGAGAGAATTTGTACCACCTCCCACCGACCATCTATGACTGTACGCCACTGT
+CCCTAGGACTGCTATGTGCCGGAGCGGACATTACAAACGTCCTTCTCGGTGCATGCCACTGTTGCCAATGACCTGCCTAG
+GAATTGGTTAGCAAGTTACTACCGGATTTTGTAAAAACAGCCCTCCTCATATAAAAAGTATTCGTTCACTTCCGATAAGC
+GTCGTAATTTTCTATCTTTCATCATATTCTAGATCCCTCTGAAAAAATCTTCCGAGTTTGCTAGGCACTGATACATAACT
+CTTTTCCAATAATTGGGGAAGTCATTCAAATCTATAATAGGTTTCAGATTTGCTTCAATAAATTCTGACTGTAGCTGCTG
+AAACGTTGCGGTTGAACTATATTTCCTTATAACTTTTACGAAAGAGTTTCTTTGAGTAATCACTTCACTCAAGTGCTTCC
+CTGCCTCCAAACGATACCTGTTAGCAATATTTAATAGCTTGAAATGATGAAGAGCTCTGTGTTTGTCTTCCTGCCTCCAG
+TTCGCCGGGCATTCAACATAAAAACTGATAGCACCCGGAGTTCCGGAAACGAAATTTGCATATACCCATTGCTCACGAAA
+AAAAATGTCCTTGTCGATATAGGGATGAATCGCTTGGTGTACCTCATCTACTGCGAAAACTTGACCTTTCTCTCCCATAT
+TGCAGTCGCGGCACGATGGAACTAAATTAATAGGCATCACCGAAAATTCAGGATAATGTGCAATAGGAAGAAAATGATCT
+ATATTTTTTGTCTGTCCTATATCACCACAAAATGGACATTTTTCACCTGATGAAACAAGCATGTCATCGTAATATGTTCT
+AGCGGGTTTGTTTTTATCTCGGAGATTATTTTCATAAAGCTTTTCTAATTTAACCTTTGTCAGGTTACCAACTACTAAGG
+TTGTAGGCTCAAGAGGGTGTGTCCTGTCGTAGGTAAATAACTGACCTGTCGAGCTTAATATTCTATATTGTTGTTCTTTC
+TGCAAAAAAGTGGGGAAGTGAGTAATGAAATTATTTCTAACATTTATCTGCATCATACCTTCCGAGCATTTATTAAGCAT
+TTCGCTATAAGTTCTCGCTGGAAGAGGTAGTTTTTTCATTGTACTTTACCTTCATCTCTGTTCATTATCATCGCTTTTAA
+AACGGTTCGACCTTCTAATCCTATCTGACCATTATAATTTTTTAGAATGGTTTCATAAGAAAGCTCTGAATCAACGGACT
+GCGATAATAAGTGGTGGTATCCAGAATTTGTCACTTCAAGTAAAAACACCTCACGAGTTAAAACACCTAAGTTCTCACCG
+AATGTCTCAATATCCGGACGGATAATATTTATTGCTTCTCTTGACCGTAGGACTTTCCACATGCAGGATTTTGGAACCTC
+TTGCAGTACTACTGGGGAATGAGTTGCAATTATTGCTACACCATTGCGTGCATCGAGTAAGTCGCTTAATGTTCGTAAAA
+AAGCAGAGAGCAAAGGTGGATGCAGATGAACCTCTGGTTCATCGAATAAAACTAATGACTTTTCGCCAACGACATCTACT
+AATCTTGTGATAGTAAATAAAACAATTGCATGTCCAGAGCTCATTCGAAGCAGATATTTCTGGATATTGTCATAAAACAA
+TTTAGTGAATTTATCATCGTCCACTTGAATCTGTGGTTCATTACGTCTTAACTCTTCATATTTAGAAATGAGGCTGATGA
+GTTCCATATTTGAAAAGTTTTCATCACTACTTAGTTTTTTGATAGCTTCAAGCCAGAGTTGTCTTTTTCTATCTACTCTC
+ATACAACCAATAAATGCTGAAATGAATTCTAAGCGGAGATCGCCTAGTGATTTTAAACTATTGCTGGCAGCATTCTTGAG
+TCCAATATAAAAGTATTGTGTACCTTTTGCTGGGTCAGGTTGTTCTTTAGGAGGAGTAAAAGGATCAAATGCACTAAACG
+AAACTGAAACAAGCGATCGAAAATATCCCTTTGGGATTCTTGACTCGATAAGTCTATTATTTTCAGAGAAAAAATATTCA
+TTGTTTTCTGGGTTGGTGATTGCACCAATCATTCCATTCAAAATTGTTGTTTTACCACACCCATTCCGCCCGATAAAAGC
+ATGAATGTTCGTGCTGGGCATAGAATTAACCGTCACCTCAAAAGGTATAGTTAAATCACTGAATCCGGGAGCACTTTTTC
+TATTAAATGAAAAGTGGAAATCTGACAATTCTGGCAAACCATTTAACACACGTGCGAACTGTCCATGAATTTCTGAAAGA
+GTTACCCCTCTAAGTAATGAGGTGTTAAGGACGCTTTCATTTTCAATGTCGGCTAATCGATTTGGCCATACTACTAAATC
+CTGAATAGCTTTAAGAAGGTTATGTTTAAAACCATCGCTTAATTTGCTGAGATTAACATAGTAGTCAATGCTTTCACCTA
+AGGAAAAAAACATTTCAGGGAGTTGACTGAATTTTTTATCTATTAATGAATAAGTGCTTACTTCTTCTTTTTGACCTACA
+AAACCAATTTTAACATTTCCGATATCGCATTTTTCACCATGCTCATCAAAGACAGTAAGATAAAACATTGTAACAAAGGA
+ATAGTCATTCCAACCATCTGCTCGTAGGAATGCCTTATTTTTTTCTACTGCAGGAATATACCCGCCTCTTTCAATAACAC
+TAAACTCCAACATATAGTAACCCTTAATTTTATTAAAATAACCGCAATTTATTTGGCGGCAACACAGGATCTCTCTTTTA
+AGTTACTCTCTATTACATACGTTTTCCATCTAAAAATTAGTAGTATTGAACTTAACGGGGCATCGTATTGTAGTTTTCCA
+TATTTAGCTTTCTGCTTCCTTTTGGATAACCCACTGTTATTCATGTTGCATGGTGCACTGTTTATACCAACGATATAGTC
+TATTAATGCATATATAGTATCGCCGAACGATTAGCTCTTCAGGCTTCTGAAGAAGCGTTTCAAGTACTAATAAGCCGATA
+GATAGCCACGGACTTCGTAGCCATTTTTCATAAGTGTTAACTTCCGCTCCTCGCTCATAACAGACATTCACTACAGTTAT
+GGCGGAAAGGTATGCATGCTGGGTGTGGGGAAGTCGTGAAAGAAAAGAAGTCAGCTGCGTCGTTTGACATCACTGCTATC
+TTCTTACTGGTTATGCAGGTCGTAGTGGGTGGCACACAAAGCTTTGCACTGGATTGCGAGGCTTTGTGCTTCTCTGGAGT
+GCGACAGGTTTGATGACAAAAAATTAGCGCAAGAAGACAAAAATCACCTTGCGCTAATGCTCTGTTACAGGTCACTAATA
+CCATCTAAGTAGTTGATTCATAGTGACTGCATATGTTGTGTTTTACAGTATTATGTAGTCTGTTTTTTATGCAAAATCTA
+ATTTAATATATTGATATTTATATCATTTTACGTTTCTCGTTCAGCTTTTTTATACTAAGTTGGCATTATAAAAAAGCATT
+GCTTATCAATTTGTTGCAACGAACAGGTCACTATCAGTCAAAATAAAATCATTATTTGATTTCAATTTTGTCCCACTCCC
+TGCCTCTGTCATCACGATACTGTGATGCCATGGTGTCCGACTTATGCCCGAGAAGATGTTGAGCAAACTTATCGCTTATC
+TGCTTCTCATAGAGTCTTGCAGACAAACTGCGCAACTCGTGAAAGGTAGGCGGATCCCCTTCGAAGGAAAGACCTGATGC
+TTTTCGTGCGCGCATAAAATACCTTGATACTGTGCCGGATGAAAGCGGTTCGCGACGAGTAGATGCAATTATGGTTTCTC
+CGCCAAGAATCTCTTTGCATTTATCAAGTGTTTCCTTCATTGATATTCCGAGAGCATCAATATGCAATGCTGTTGGGATG
+GCAATTTTTACGCCTGTTTTGCTTTGCTCGACATAAAGATATCCATCTACGATATCAGACCACTTCATTTCGCATAAATC
+ACCAACTCGTTGCCCGGTAACAACAGCCAGTTCCATTGCAAGTCTGAGCCAACATGGTGATGATTCTGCTGCTTGATAAA
+TTTTCAGGTATTCGTCAGCCGTAAGTCTTGATCTCCTTACCTCTGATTTTGCTGCGCGAGTGGCAGCGACATGGTTTGTT
+GTTATATGGCCTTCAGCTATTGCCTCTCGGAATGCATCGCTCAGTGTTGATCTGATTAACTTGGCTGACGCCGCCTTGCC
+CTCGTCTATGTATCCATTGAGCATTGCCGCAATTTCTTTTGTGGTGATGTCTTCAAGTGGAGCATCAGGCAGACCCCTCC
+TTATTGCTTTAATTTTGCTCATGTAATTTATGAGTGTCTTCTGCTTGATTCCTCTGCTGGCCAGGATTTTTTCGTAGCGA
+TCAAGCCATGAATGTAACGTAACGGAATTATCACTGTTGATTCTCGCTGTCAGAGGCTTGTGTTTGTGTCCTGAAAATAA
+CTCAATGTTGGCCTGTATAGCTTCAGTGATTGCGATTCGCCTGTCTCTGCCTAATCCAAACTCTTTACCCGTCCTTGGGT
+CCCTGTAGCAGTAATATCCATTGTTTCTTATATAAAGGTTAGGGGGTAAATCCCGGCGCTCATGACTTCGCCTTCTTCCC
+ATTTCTGATCCTCTTCAAAAGGCCACCTGTTACTGGTCGATTTAAGTCAACCTTTACCGCTGATTCGTGGAACAGATACT
+CTCTTCCATCCTTAACCGGAGGTGGGAATATCCTGCATTCCCGAACCCATCGACGAACTGTTTCAAGGCTTCTTGGACGT
+CGCTGGCGTGCGTTCCACTCCTGAAGTGTCAAGTACATCGCAAAGTCTCCGCAATTACACGCAAGAAAAAACCGCCATCA
+GGCGGCTTGGTGTTCTTTCAGTTCTTCAATTCGAATATTGGTTACGTCTGCATGTGCTATCTGCGCCCATATCATCCAGT
+GGTCGTAGCAGTCGTTGATGTTCTCCGCTTCGATAACTCTGTTGAATGGCTCTCCATTCCATTCTCCTGTGACTCGGAAG
+TGCATTTATCATCTCCATAAAACAAAACCCGCCGTAGCGAGTTCAGATAAAATAAATCCCCGCGAGTGCGAGGATTGTTA
+TGTAATATTGGGTTTAATCATCTATATGTTTTGTACAGAGAGGGCAAGTATCGTTTCCACCGTACTCGTGATAATAATTT
+TGCACGGTATCAGTCATTTCTCGCACATTGCAGAATGGGGATTTGTCTTCATTAGACTTATAAACCTTCATGGAATATTT
+GTATGCCGACTCTATATCTATACCTTCATCTACATAAACACCTTCGTGATGTCTGCATGGAGACAAGACACCGGATCTGC
+ACAACATTGATAACGCCCAATCTTTTTGCTCAGACTCTAACTCATTGATACTCATTTATAAACTCCTTGCAATGTATGTC
+GTTTCAGCTAAACGGTATCAGCAATGTTTATGTAAAGAAACAGTAAGATAATACTCAACCCGATGTTTGAGTACGGTCAT
+CATCTGACACTACAGACTCTGGCATCGCTGTGAAGACGACGCGAAATTCAGCATTTTCACAAGCGTTATCTTTTACAAAA
+CCGATCTCACTCTCCTTTGATGCGAATGCCAGCGTCAGACATCATATGCAGATACTCACCTGCATCCTGAACCCATTGAC
+CTCCAACCCCGTAATAGCGATGCGTAATGATGTCGATAGTTACTAACGGGTCTTGTTCGATTAACTGCCGCAGAAACTCT
+TCCAGGTCACCAGTGCAGTGCTTGATAACAGGAGTCTTCCCAGGATGGCGAACAACAAGAAACTGGTTTCCGTCTTCACG
+GACTTCGTTGCTTTCCAGTTTAGCAATACGCTTACTCCCATCCGAGATAACACCTTCGTAATACTCACGCTGCTCGTTGA
+GTTTTGATTTTGCTGTTTCAAGCTCAACACGCAGTTTCCCTACTGTTAGCGCAATATCCTCGTTCTCCTGGTCGCGGCGT
+TTGATGTATTGCTGGTTTCTTTCCCGTTCATCCAGCAGTTCCAGCACAATCGATGGTGTTACCAATTCATGGAAAAGGTC
+TGCGTCAAATCCCCAGTCGTCATGCATTGCCTGCTCTGCCGCTTCACGCAGTGCCTGAGAGTTAATTTCGCTCACTTCGA
+ACCTCTCTGTTTACTGATAAGTTCCAGATCCTCCTGGCAACTTGCACAAGTCCGACAACCCTGAACGACCAGGCGTCTTC
+GTTCATCTATCGGATCGCCACACTCACAACAATGAGTGGCAGATATAGCCTGGTGGTTCAGGCGGCGCATTTTTATTGCT
+GTGTTGCGCTGTAATTCTTCTATTTCTGATGCTGAATCAATGATGTCTGCCATCTTTCATTAATCCCTGAACTGTTGGTT
+AATACGCTTGAGGGTGAATGCGAATAATAAAAAAGGAGCCTGTAGCTCCCTGATGATTTTGCTTTTCATGTTCATCGTTC
+CTTAAAGACGCCGTTTAACATGCCGATTGCCAGGCTTAAATGAGTCGGTGTGAATCCCATCAGCGTTACCGTTTCGCGGT
+GCTTCTTCAGTACGCTACGGCAAATGTCATCGACGTTTTTATCCGGAAACTGCTGTCTGGCTTTTTTTGATTTCAGAATT
+AGCCTGACGGGCAATGCTGCGAAGGGCGTTTTCCTGCTGAGGTGTCATTGAACAAGTCCCATGTCGGCAAGCATAAGCAC
+ACAGAATATGAAGCCCGCTGCCAGAAAAATGCATTCCGTGGTTGTCATACCTGGTTTCTCTCATCTGCTTCTGCTTTCGC
+CACCATCATTTCCAGCTTTTGTGAAAGGGATGCGGCTAACGTATGAAATTCTTCGTCTGTTTCTACTGGTATTGGCACAA
+ACCTGATTCCAATTTGAGCAAGGCTATGTGCCATCTCGATACTCGTTCTTAACTCAACAGAAGATGCTTTGTGCATACAG
+CCCCTCGTTTATTATTTATCTCCTCAGCCAGCCGCTGTGCTTTCAGTGGATTTCGGATAACAGAAAGGCCGGGAAATACC
+CAGCCTCGCTTTGTAACGGAGTAGACGAAAGTGATTGCGCCTACCCGGATATTATCGTGAGGATGCGTCATCGCCATTGC
+TCCCCAAATACAAAACCAATTTCAGCCAGTGCCTCGTCCATTTTTTCGATGAACTCCGGCACGATCTCGTCAAAACTCGC
+CATGTACTTTTCATCCCGCTCAATCACGACATAATGCAGGCCTTCACGCTTCATACGCGGGTCATAGTTGGCAAAGTACC
+AGGCATTTTTTCGCGTCACCCACATGCTGTACTGCACCTGGGCCATGTAAGCTGACTTTATGGCCTCGAAACCACCGAGC
+CGGAACTTCATGAAATCCCGGGAGGTAAACGGGCATTTCAGTTCAAGGCCGTTGCCGTCACTGCATAAACCATCGGGAGA
+GCAGGCGGTACGCATACTTTCGTCGCGATAGATGATCGGGGATTCAGTAACATTCACGCCGGAAGTGAATTCAAACAGGG
+TTCTGGCGTCGTTCTCGTACTGTTTTCCCCAGGCCAGTGCTTTAGCGTTAACTTCCGGAGCCACACCGGTGCAAACCTCA
+GCAAGCAGGGTGTGGAAGTAGGACATTTTCATGTCAGGCCACTTCTTTCCGGAGCGGGGTTTTGCTATCACGTTGTGAAC
+TTCTGAAGCGGTGATGACGCCGAGCCGTAATTTGTGCCACGCATCATCCCCCTGTTCGACAGCTCTCACATCGATCCCGG
+TACGCTGCAGGATAATGTCCGGTGTCATGCTGCCACCTTCTGCTCTGCGGCTTTCTGTTTCAGGAATCCAAGAGCTTTTA
+CTGCTTCGGCCTGTGTCAGTTCTGACGATGCACGAATGTCGCGGCGAAATATCTGGGAACAGAGCGGCAATAAGTCGTCA
+TCCCATGTTTTATCCAGGGCGATCAGCAGAGTGTTAATCTCCTGCATGGTTTCATCGTTAACCGGAGTGATGTCGCGTTC
+CGGCTGACGTTCTGCAGTGTATGCAGTATTTTCGACAATGCGCTCGGCTTCATCCTTGTCATAGATACCAGCAAATCCGA
+AGGCCAGACGGGCACACTGAATCATGGCTTTATGACGTAACATCCGTTTGGGATGCGACTGCCACGGCCCCGTGATTTCT
+CTGCCTTCGCGAGTTTTGAATGGTTCGCGGCGGCATTCATCCATCCATTCGGTAACGCAGATCGGATGATTACGGTCCTT
+GCGGTAAATCCGGCATGTACAGGATTCATTGTCCTGCTCAAAGTCCATGCCATCAAACTGCTGGTTTTCATTGATGATGC
+GGGACCAGCCATCAACGCCCACCACCGGAACGATGCCATTCTGCTTATCAGGAAAGGCGTAAATTTCTTTCGTCCACGGA
+TTAAGGCCGTACTGGTTGGCAACGATCAGTAATGCGATGAACTGCGCATCGCTGGCATCACCTTTAAATGCCGTCTGGCG
+AAGAGTGGTGATCAGTTCCTGTGGGTCGACAGAATCCATGCCGACACGTTCAGCCAGCTTCCCAGCCAGCGTTGCGAGTG
+CAGTACTCATTCGTTTTATACCTCTGAATCAATATCAACCTGGTGGTGAGCAATGGTTTCAACCATGTACCGGATGTGTT
+CTGCCATGCGCTCCTGAAACTCAACATCGTCATCAAACGCACGGGTAATGGATTTTTTGCTGGCCCCGTGGCGTTGCAAA
+TGATCGATGCATAGCGATTCAAACAGGTGCTGGGGCAGGCCTTTTTCCATGTCGTCTGCCAGTTCTGCCTCTTTCTCTTC
+ACGGGCGAGCTGCTGGTAGTGACGCGCCCAGCTCTGAGCCTCAAGACGATCCTGAATGTAATAAGCGTTCATGGCTGAAC
+TCCTGAAATAGCTGTGAAAATATCGCCCGCGAAATGCCGGGCTGATTAGGAAAACAGGAAAGGGGGTTAGTGAATGCTTT
+TGCTTGATCTCAGTTTCAGTATTAATATCCATTTTTTATAAGCGTCGACGGCTTCACGAAACATCTTTTCATCGCCAATA
+AAAGTGGCGATAGTGAATTTAGTCTGGATAGCCATAAGTGTTTGATCCATTCTTTGGGACTCCTGGCTGATTAAGTATGT
+CGATAAGGCGTTTCCATCCGTCACGTAATTTACGGGTGATTCGTTCAAGTAAAGATTCGGAAGGGCAGCCAGCAACAGGC
+CACCCTGCAATGGCATATTGCATGGTGTGCTCCTTATTTATACATAACGAAAAACGCCTCGAGTGAAGCGTTATTGGTAT
+GCGGTAAAACCGCACTCAGGCGGCCTTGATAGTCATATCATCTGAATCAAATATTCCTGATGTATCGATATCGGTAATTC
+TTATTCCTTCGCTACCATCCATTGGAGGCCATCCTTCCTGACCATTTCCATCATTCCAGTCGAACTCACACACAACACCA
+TATGCATTTAAGTCGCTTGAAATTGCTATAAGCAGAGCATGTTGCGCCAGCATGATTAATACAGCATTTAATACAGAGCC
+GTGTTTATTGAGTCGGTATTCAGAGTCTGACCAGAAATTATTAATCTGGTGAAGTTTTTCCTCTGTCATTACGTCATGGT
+CGATTTCAATTTCTATTGATGCTTTCCAGTCGTAATCAATGATGTATTTTTTGATGTTTGACATCTGTTCATATCCTCAC
+AGATAAAAAATCGCCCTCACACTGGAGGGCAAAGAAGATTTCCAATAATCAGAACAAGTCGGCTCCTGTTTAGTTACGAG
+CGACATTGCTCCGTGTATTCACTCGTTGGAATGAATACACAGTGCAGTGTTTATTCTGTTATTTATGCCAAAAATAAAGG
+CCACTATCAGGCAGCTTTGTTGTTCTGTTTACCAAGTTCTCTGGCAATCATTGCCGTCGTTCGTATTGCCCATTTATCGA
+CATATTTCCCATCTTCCATTACAGGAAACATTTCTTCAGGCTTAACCATGCATTCCGATTGCAGCTTGCATCCATTGCAT
+CGCTTGAATTGTCCACACCATTGATTTTTATCAATAGTCGTAGTCATACGGATAGTCCTGGTATTGTTCCATCACATCCT
+GAGGATGCTCTTCGAACTCTTCAAATTCTTCTTCCATATATCACCTTAAATAGTGGATTGCGGTAGTAAAGATTGTGCCT
+GTCTTTTAACCACATCAGGCTCGGTGGTTCTCGTGTACCCCTACAGCGAGAAATCGGATAAACTATTACAACCCCTACAG
+TTTGATGAGTATAGAAATGGATCCACTCGTTATTCTCGGACGAGTGTTCAGTAATGAACCTCTGGAGAGAACCATGTATA
+TGATCGTTATCTGGGTTGGACTTCTGCTTTTAAGCCCAGATAACTGGCCTGAATATGTTAATGAGAGAATCGGTATTCCT
+CATGTGTGGCATGTTTTCGTCTTTGCTCTTGCATTTTCGCTAGCAATTAATGTGCATCGATTATCAGCTATTGCCAGCGC
+CAGATATAAGCGATTTAAGCTAAGAAAACGCATTAAGATGCAAAACGATAAAGTGCGATCAGTAATTCAAAACCTTACAG
+AAGAGCAATCTATGGTTTTGTGCGCAGCCCTTAATGAAGGCAGGAAGTATGTGGTTACATCAAAACAATTCCCATACATT
+AGTGAGTTGATTGAGCTTGGTGTGTTGAACAAAACTTTTTCCCGATGGAATGGAAAGCATATATTATTCCCTATTGAGGA
+TATTTACTGGACTGAATTAGTTGCCAGCTATGATCCATATAATATTGAGATAAAGCCAAGGCCAATATCTAAGTAACTAG
+ATAAGAGGAATCGATTTTCCCTTAATTTTCTGGCGTCCACTGCATGTTATGCCGCGTTCGCCAGGCTTGCTGTACCATGT
+GCGCTGATTCTTGCGCTCAATACGTTGCAGGTTGCTTTCAATCTGTTTGTGGTATTCAGCCAGCACTGTAAGGTCTATCG
+GATTTAGTGCGCTTTCTACTCGTGATTTCGGTTTGCGATTCAGCGAGAGAATAGGGCGGTTAACTGGTTTTGCGCTTACC
+CCAACCAACAGGGGATTTGCTGCTTTCCATTGAGCCTGTTTCTCTGCGCGACGTTCGCGGCGGCGTGTTTGTGCATCCAT
+CTGGATTCTCCTGTCAGTTAGCTTTGGTGGTGTGTGGCAGTTGTAGTCCTGAACGAAAACCCCCCGCGATTGGCACATTG
+GCAGCTAATCCGGAATCGCACTTACGGCCAATGCTTCGTTTCGTATCACACACCCCAAAGCCTTCTGCTTTGAATGCTGC
+CCTTCTTCAGGGCTTAATTTTTAAGAGCGTCACCTTCATGGTGGTCAGTGCGTCCTGCTGATGTGCTCAGTATCACCGCC
+AGTGGTATTTATGTCAACACCGCCAGAGATAATTTATCACCGCAGATGGTTATCTGTATGTTTTTTATATGAATTTATTT
+TTTGCAGGGGGGCATTGTTTGGTAGGTGAGAGATCTGAATTGCTATGTTTAGTGAGTTGTATCTATTTATTTTTCAATAA
+ATACAATTGGTTATGTGTTTTGGGGGCGATCGTGAGGCAAAGAAAACCCGGCGCTGAGGCCGGGTTATTCTTGTTCTCTG
+GTCAAATTATATAGTTGGAAAACAAGGATGCATATATGAATGAACGATGCAGAGGCAATGCCGATGGCGATAGTGGGTAT
+CATGTAGCCGCTTATGCTGGAAAGAAGCAATAACCCGCAGAAAAACAAAGCTCCAAGCTCAACAAAACTAAGGGCATAGA
+CAATAACTACCGATGTCATATACCCATACTCTCTAATCTTGGCCAGTCGGCGCGTTCTGCTTCCGATTAGAAACGTCAAG
+GCAGCAATCAGGATTGCAATCATGGTTCCTGCATATGATGACAATGTCGCCCCAAGACCATCTCTATGAGCTGAAAAAGA
+AACACCAGGAATGTAGTGGCGGAAAAGGAGATAGCAAATGCTTACGATAACGTAAGGAATTATTACTATGTAAACACCAG
+GCATGATTCTGTTCCGCATAATTACTCCTGATAATTAATCCTTAACTTTGCCCACCTGCCTTTTAAAACATTCCAGTATA
+TCACTTTTCATTCTTGCGTAGCAATATGCCATCTCTTCAGCTATCTCAGCATTGGTGACCTTGTTCAGAGGCGCTGAGAG
+ATGGCCTTTTTCTGATAGATAATGTTCTGTTAAAATATCTCCGGCCTCATCTTTTGCCCGCAGGCTAATGTCTGAAAATT
+GAGGTGACGGGTTAAAAATAATATCCTTGGCAACCTTTTTTATATCCCTTTTAAATTTTGGCTTAATGACTATATCCAAT
+GAGTCAAAAAGCTCCCCTTCAATATCTGTTGCCCCTAAGACCTTTAATATATCGCCAAATACAGGTAGCTTGGCTTCTAC
+CTTCACCGTTGTTCGGCCGATGAAATGCATATGCATAACATCGTCTTTGGTGGTTCCCCTCATCAGTGGCTCTATCTGAA
+CGCGCTCTCCACTGCTTAATGACATTCCTTTCCCGATTAAAAAATCTGTCAGATCGGATGTGGTCGGCCCGAAAACAGTT
+CTGGCAAAACCAATGGTGTCGCCTTCAACAAACAAAAAAGATGGGAATCCCAATGATTCGTCATCTGCGAGGCTGTTCTT
+AATATCTTCAACTGAAGCTTTAGAGCGATTTATCTTCTGAACCAGACTCTTGTCATTTGTTTTGGTAAAGAGAAAAGTTT
+TTCCATCGATTTTATGAATATACAAATAATTGGAGCCAACCTGCAGGTGATGATTATCAGCCAGCAGAGAATTAAGGAAA
+ACAGACAGGTTTATTGAGCGCTTATCTTTCCCTTTATTTTTGCTGCGGTAAGTCGCATAAAAACCATTCTTCATAATTCA
+ATCCATTTACTATGTTATGTTCTGAGGGGAGTGAAAATTCCCCTAATTCGATGAAGATTCTTGCTCAATTGTTATCAGCT
+ATGCGCCGACCAGAACACCTTGCCGATCAGCCAAACGTCTCTTCAGGCCACTGACTAGCGATAACTTTCCCCACAACGGA
+ACAACTCTCATTGCATGGGATCATTGGGTACTGTGGGTTTAGTGGTTGTAAAAACACCTGACCGCTATCCCTGATCAGTT
+TCTTGAAGGTAAACTCATCACCCCCAAGTCTGGCTATGCAGAAATCACCTGGCTCAACAGCCTGCTCAGGGTCAACGAGA
+ATTAACATTCCGTCAGGAAAGCTTGGCTTGGAGCCTGTTGGTGCGGTCATGGAATTACCTTCAACCTCAAGCCAGAATGC
+AGAATCACTGGCTTTTTTGGTTGTGCTTACCCATCTCTCCGCATCACCTTTGGTAAAGGTTCTAAGCTTAGGTGAGAACA
+TCCCTGCCTGAACATGAGAAAAAACAGGGTACTCATACTCACTTCTAAGTGACGGCTGCATACTAACCGCTTCATACATC
+TCGTAGATTTCTCTGGCGATTGAAGGGCTAAATTCTTCAACGCTAACTTTGAGAATTTTTGTAAGCAATGCGGCGTTATA
+AGCATTTAATGCATTGATGCCATTAAATAAAGCACCAACGCCTGACTGCCCCATCCCCATCTTGTCTGCGACAGATTCCT
+GGGATAAGCCAAGTTCATTTTTCTTTTTTTCATAAATTGCTTTAAGGCGACGTGCGTCCTCAAGCTGCTCTTGTGTTAAT
+GGTTTCTTTTTTGTGCTCATACGTTAAATCTATCACCGCAAGGGATAAATATCTAACACCGTGCGTGTTGACTATTTTAC
+CTCTGGCGGTGATAATGGTTGCATGTACTAAGGAGGTTGTATGGAACAACGCATAACCCTGAAAGATTATGCAATGCGCT
+TTGGGCAAACCAAGACAGCTAAAGATCTCGGCGTATATCAAAGCGCGATCAACAAGGCCATTCATGCAGGCCGAAAGATT
+TTTTTAACTATAAACGCTGATGGAAGCGTTTATGCGGAAGAGGTAAAGCCCTTCCCGAGTAACAAAAAAACAACAGCATA
+AATAACCCCGCTCTTACACATTCCAGCCCTGAAAAAGGGCATCAAATTAAACCACACCTATGGTGTATGCATTTATTTGC
+ATACATTCAATCAATTGTTATCTAAGGAAATACTTACATATGGTTCGTGCAAACAAACGCAACGAGGCTCTACGAATCGA
+GAGTGCGTTGCTTAACAAAATCGCAATGCTTGGAACTGAGAAGACAGCGGAAGCTGTGGGCGTTGATAAGTCGCAGATCA
+GCAGGTGGAAGAGGGACTGGATTCCAAAGTTCTCAATGCTGCTTGCTGTTCTTGAATGGGGGGTCGTTGACGACGACATG
+GCTCGATTGGCGCGACAAGTTGCTGCGATTCTCACCAATAAAAAACGCCCGGCGGCAACCGAGCGTTCTGAACAAATCCA
+GATGGAGTTCTGAGGTCATTACTGGATCTATCAACAGGAGTCATTATGACAAATACAGCAAAAATACTCAACTTCGGCAG
+AGGTAACTTTGCCGGACAGGAGCGTAATGTGGCAGATCTCGATGATGGTTACGCCAGACTATCAAATATGCTGCTTGAGG
+CTTATTCGGGCGCAGATCTGACCAAGCGACAGTTTAAAGTGCTGCTTGCCATTCTGCGTAAAACCTATGGGTGGAATAAA
+CCAATGGACAGAATCACCGATTCTCAACTTAGCGAGATTACAAAGTTACCTGTCAAACGGTGCAATGAAGCCAAGTTAGA
+ACTCGTCAGAATGAATATTATCAAGCAGCAAGGCGGCATGTTTGGACCAAATAAAAACATCTCAGAATGGTGCATCCCTC
+AAAACGAGGGAAAATCCCCTAAAACGAGGGATAAAACATCCCTCAAATTGGGGGATTGCTATCCCTCAAAACAGGGGGAC
+ACAAAAGACACTATTACAAAAGAAAAAAGAAAAGATTATTCGTCAGAGAATTCTGGCGAATCCTCTGACCAGCCAGAAAA
+CGACCTTTCTGTGGTGAAACCGGATGCTGCAATTCAGAGCGGCAGCAAGTGGGGGACAGCAGAAGACCTGACCGCCGCAG
+AGTGGATGTTTGACATGGTGAAGACTATCGCACCATCAGCCAGAAAACCGAATTTTGCTGGGTGGGCTAACGATATCCGC
+CTGATGCGTGAACGTGACGGACGTAACCACCGCGACATGTGTGTGCTGTTCCGCTGGGCATGCCAGGACAACTTCTGGTC
+CGGTAACGTGCTGAGCCCGGCCAAACTCCGCGATAAGTGGACCCAACTCGAAATCAACCGTAACAAGCAACAGGCAGGCG
+TGACAGCCAGCAAACCAAAACTCGACCTGACAAACACAGACTGGATTTACGGGGTGGATCTATGAAAAACATCGCCGCAC
+AGATGGTTAACTTTGACCGTGAGCAGATGCGTCGGATCGCCAACAACATGCCGGAACAGTACGACGAAAAGCCGCAGGTA
+CAGCAGGTAGCGCAGATCATCAACGGTGTGTTCAGCCAGTTACTGGCAACTTTCCCGGCGAGCCTGGCTAACCGTGACCA
+GAACGAAGTGAACGAAATCCGTCGCCAGTGGGTTCTGGCTTTTCGGGAAAACGGGATCACCACGATGGAACAGGTTAACG
+CAGGAATGCGCGTAGCCCGTCGGCAGAATCGACCATTTCTGCCATCACCCGGGCAGTTTGTTGCATGGTGCCGGGAAGAA
+GCATCCGTTACCGCCGGACTGCCAAACGTCAGCGAGCTGGTTGATATGGTTTACGAGTATTGCCGGAAGCGAGGCCTGTA
+TCCGGATGCGGAGTCTTATCCGTGGAAATCAAACGCGCACTACTGGCTGGTTACCAACCTGTATCAGAACATGCGGGCCA
+ATGCGCTTACTGATGCGGAATTACGCCGTAAGGCCGCAGATGAGCTTGTCCATATGACTGCGAGAATTAACCGTGGTGAG
+GCGATCCCTGAACCAGTAAAACAACTTCCTGTCATGGGCGGTAGACCTCTAAATCGTGCACAGGCTCTGGCGAAGATCGC
+AGAAATCAAAGCTAAGTTCGGACTGAAAGGAGCAAGTGTATGACGGGCAAAGAGGCAATTATTCATTACCTGGGGACGCA
+TAATAGCTTCTGTGCGCCGGACGTTGCCGCGCTAACAGGCGCAACAGTAACCAGCATAAATCAGGCCGCGGCTAAAATGG
+CACGGGCAGGTCTTCTGGTTATCGAAGGTAAGGTCTGGCGAACGGTGTATTACCGGTTTGCTACCAGGGAAGAACGGGAA
+GGAAAGATGAGCACGAACCTGGTTTTTAAGGAGTGTCGCCAGAGTGCCGCGATGAAACGGGTATTGGCGGTATATGGAGT
+TAAAAGATGACCATCTACATTACTGAGCTAATAACAGGCCTGCTGGTAATCGCAGGCCTTTTTATTTGGGGGAGAGGGAA
+GTCATGAAAAAACTAACCTTTGAAATTCGATCTCCAGCACATCAGCAAAACGCTATTCACGCAGTACAGCAAATCCTTCC
+AGACCCAACCAAACCAATCGTAGTAACCATTCAGGAACGCAACCGCAGCTTAGACCAAAACAGGAAGCTATGGGCCTGCT
+TAGGTGACGTCTCTCGTCAGGTTGAATGGCATGGTCGCTGGCTGGATGCAGAAAGCTGGAAGTGTGTGTTTACCGCAGCA
+TTAAAGCAGCAGGATGTTGTTCCTAACCTTGCCGGGAATGGCTTTGTGGTAATAGGCCAGTCAACCAGCAGGATGCGTGT
+AGGCGAATTTGCGGAGCTATTAGAGCTTATACAGGCATTCGGTACAGAGCGTGGCGTTAAGTGGTCAGACGAAGCGAGAC
+TGGCTCTGGAGTGGAAAGCGAGATGGGGAGACAGGGCTGCATGATAAATGTCGTTAGTTTCTCCGGTGGCAGGACGTCAG
+CATATTTGCTCTGGCTAATGGAGCAAAAGCGACGGGCAGGTAAAGACGTGCATTACGTTTTCATGGATACAGGTTGTGAA
+CATCCAATGACATATCGGTTTGTCAGGGAAGTTGTGAAGTTCTGGGATATACCGCTCACCGTATTGCAGGTTGATATCAA
+CCCGGAGCTTGGACAGCCAAATGGTTATACGGTATGGGAACCAAAGGATATTCAGACGCGAATGCCTGTTCTGAAGCCAT
+TTATCGATATGGTAAAGAAATATGGCACTCCATACGTCGGCGGCGCGTTCTGCACTGACAGATTAAAACTCGTTCCCTTC
+ACCAAATACTGTGATGACCATTTCGGGCGAGGGAATTACACCACGTGGATTGGCATCAGAGCTGATGAACCGAAGCGGCT
+AAAGCCAAAGCCTGGAATCAGATATCTTGCTGAACTGTCAGACTTTGAGAAGGAAGATATCCTCGCATGGTGGAAGCAAC
+AACCATTCGATTTGCAAATACCGGAACATCTCGGTAACTGCATATTCTGCATTAAAAAATCAACGCAAAAAATCGGACTT
+GCCTGCAAAGATGAGGAGGGATTGCAGCGTGTTTTTAATGAGGTCATCACGGGATCCCATGTGCGTGACGGACATCGGGA
+AACGCCAAAGGAGATTATGTACCGAGGAAGAATGTCGCTGGACGGTATCGCGAAAATGTATTCAGAAAATGATTATCAAG
+CCCTGTATCAGGACATGGTACGAGCTAAAAGATTCGATACCGGCTCTTGTTCTGAGTCATGCGAAATATTTGGAGGGCAG
+CTTGATTTCGACTTCGGGAGGGAAGCTGCATGATGCGATGTTATCGGTGCGGTGAATGCAAAGAAGATAACCGCTTCCGA
+CCAAATCAACCTTACTGGAATCGATGGTGTCTCCGGTGTGAAAGAACACCAACAGGGGTGTTACCACTACCGCAGGAAAA
+GGAGGACGTGTGGCGAGACAGCGACGAAGTATCACCGACATAATCTGCGAAAACTGCAAATACCTTCCAACGAAACGCAC
+CAGAAATAAACCCAAGCCAATCCCAAAAGAATCTGACGTAAAAACCTTCAACTACACGGCTCACCTGTGGGATATCCGGT
+GGCTAAGACGTCGTGCGAGGAAAACAAGGTGATTGACCAAAATCGAAGTTACGAACAAGAAAGCGTCGAGCGAGCTTTAA
+CGTGCGCTAACTGCGGTCAGAAGCTGCATGTGCTGGAAGTTCACGTGTGTGAGCACTGCTGCGCAGAACTGATGAGCGAT
+CCGAATAGCTCGATGCACGAGGAAGAAGATGATGGCTAAACCAGCGCGAAGACGATGTAAAAACGATGAATGCCGGGAAT
+GGTTTCACCCTGCATTCGCTAATCAGTGGTGGTGCTCTCCAGAGTGTGGAACCAAGATAGCACTCGAACGACGAAGTAAA
+GAACGCGAAAAAGCGGAAAAAGCAGCAGAGAAGAAACGACGACGAGAGGAGCAGAAACAGAAAGATAAACTTAAGATTCG
+AAAACTCGCCTTAAAGCCCCGCAGTTACTGGATTAAACAAGCCCAACAAGCCGTAAACGCCTTCATCAGAGAAAGAGACC
+GCGACTTACCATGTATCTCGTGCGGAACGCTCACGTCTGCTCAGTGGGATGCCGGACATTACCGGACAACTGCTGCGGCA
+CCTCAACTCCGATTTAATGAACGCAATATTCACAAGCAATGCGTGGTGTGCAACCAGCACAAAAGCGGAAATCTCGTTCC
+GTATCGCGTCGAACTGATTAGCCGCATCGGGCAGGAAGCAGTAGACGAAATCGAATCAAACCATAACCGCCATCGCTGGA
+CTATCGAAGAGTGCAAGGCGATCAAGGCAGAGTACCAACAGAAACTCAAAGACCTGCGAAATAGCAGAAGTGAGGCCGCA
+TGACGTTCTCAGTAAAAACCATTCCAGACATGCTCGTTGAAACATACGGAAATCAGACAGAAGTAGCACGCAGACTGAAA
+TGTAGTCGCGGTACGGTCAGAAAATACGTTGATGATAAAGACGGGAAAATGCACGCCATCGTCAACGACGTTCTCATGGT
+TCATCGCGGATGGAGTGAAAGAGATGCGCTATTACGAAAAAATTGATGGCAGCAAATACCGAAATATTTGGGTAGTTGGC
+GATCTGCACGGATGCTACACGAACCTGATGAACAAACTGGATACGATTGGATTCGACAACAAAAAAGACCTGCTTATCTC
+GGTGGGCGATTTGGTTGATCGTGGTGCAGAGAACGTTGAATGCCTGGAATTAATCACATTCCCCTGGTTCAGAGCTGTAC
+GTGGAAACCATGAGCAAATGATGATTGATGGCTTATCAGAGCGTGGAAACGTTAATCACTGGCTGCTTAATGGCGGTGGC
+TGGTTCTTTAATCTCGATTACGACAAAGAAATTCTGGCTAAAGCTCTTGCCCATAAAGCAGATGAACTTCCGTTAATCAT
+CGAACTGGTGAGCAAAGATAAAAAATATGTTATCTGCCACGCCGATTATCCCTTTGACGAATACGAGTTTGGAAAGCCAG
+TTGATCATCAGCAGGTAATCTGGAACCGCGAACGAATCAGCAACTCACAAAACGGGATCGTGAAAGAAATCAAAGGCGCG
+GACACGTTCATCTTTGGTCATACGCCAGCAGTGAAACCACTCAAGTTTGCCAACCAAATGTATATCGATACCGGCGCAGT
+GTTCTGCGGAAACCTAACATTGATTCAGGTACAGGGAGAAGGCGCATGAGACTCGAAAGCGTAGCTAAATTTCATTCGCC
+AAAAAGCCCGATGATGAGCGACTCACCACGGGCCACGGCTTCTGACTCTCTTTCCGGTACTGATGTGATGGCTGCTATGG
+GGATGGCGCAATCACAAGCCGGATTCGGTATGGCTGCATTCTGCGGTAAGCACGAACTCAGCCAGAACGACAAACAAAAG
+GCTATCAACTATCTGATGCAATTTGCACACAAGGTATCGGGGAAATACCGTGGTGTGGCAAAGCTTGAAGGAAATACTAA
+GGCAAAGGTACTGCAAGTGCTCGCAACATTCGCTTATGCGGATTATTGCCGTAGTGCCGCGACGCCGGGGGCAAGATGCA
+GAGATTGCCATGGTACAGGCCGTGCGGTTGATATTGCCAAAACAGAGCTGTGGGGGAGAGTTGTCGAGAAAGAGTGCGGA
+AGATGCAAAGGCGTCGGCTATTCAAGGATGCCAGCAAGCGCAGCATATCGCGCTGTGACGATGCTAATCCCAAACCTTAC
+CCAACCCACCTGGTCACGCACTGTTAAGCCGCTGTATGACGCTCTGGTGGTGCAATGCCACAAAGAAGAGTCAATCGCAG
+ACAACATTTTGAATGCGGTCACACGTTAGCAGCATGATTGCCACGGATGGCAACATATTAACGGCATGATATTGACTTAT
+TGAATAAAATTGGGTAAATTTGACTCAACGATGGGTTAATTCGCTCGTTGTGGTAGTGAGATGAAAAGAGGCGGCGCTTA
+CTACCGATTCCGCCTAGTTGGTCACTTCGACGTATCGTCTGGAACTCCAACCATCGCAGGCAGAGAGGTCTGCAAAATGC
+AATCCCGAAACAGTTCGCAGGTAATAGTTAGAGCCTGCATAACGGTTTCGGGATTTTTTATATCTGCACAACAGGTAAGA
+GCATTGAGTCGATAATCGTGAAGAGTCGGCGAGCCTGGTTAGCCAGTGCTCTTTCCGTTGTGCTGAATTAAGCGAATACC
+GGAAGCAGAACCGGATCACCAAATGCGTACAGGCGTCATCGCCGCCCAGCAACAGCACAACCCAAACTGAGCCGTAGCCA
+CTGTCTGTCCTGAATTCATTAGTAATAGTTACGCTGCGGCCTTTTACACATGACCTTCGTGAAAGCGGGTGGCAGGAGGT
+CGCGCTAACAACCTCCTGCCGTTTTGCCCGTGCATATCGGTCACGAACAAATCTGATTACTAAACACAGTAGCCTGGATT
+TGTTCTATCAGTAATCGACCTTATTCCTAATTAAATAGAGCAAATCCCCTTATTGGGGGTAAGACATGAAGATGCCAGAA
+AAACATGACCTGTTGGCCGCCATTCTCGCGGCAAAGGAACAAGGCATCGGGGCAATCCTTGCGTTTGCAATGGCGTACCT
+TCGCGGCAGATATAATGGCGGTGCGTTTACAAAAACAGTAATCGACGCAACGATGTGCGCCATTATCGCCTAGTTCATTC
+GTGACCTTCTCGACTTCGCCGGACTAAGTAGCAATCTCGCTTATATAACGAGCGTGTTTATCGGCTACATCGGTACTGAC
+TCGATTGGTTCGCTTATCAAACGCTTCGCTGCTAAAAAAGCCGGAGTAGAAGATGGTAGAAATCAATAATCAACGTAAGG
+CGTTCCTCGATATGCTGGCGTGGTCGGAGGGAACTGATAACGGACGTCAGAAAACCAGAAATCATGGTTATGACGTCATT
+GTAGGCGGAGAGCTATTTACTGATTACTCCGATCACCCTCGCAAACTTGTCACGCTAAACCCAAAACTCAAATCAACAGG
+CGCCGGACGCTACCAGCTTCTTTCCCGTTGGTGGGATGCCTACCGCAAGCAGCTTGGCCTGAAAGACTTCTCTCCGAAAA
+GTCAGGACGCTGTGGCATTGCAGCAGATTAAGGAGCGTGGCGCTTTACCTATGATTGATCGTGGTGATATCCGTCAGGCA
+ATCGACCGTTGCAGCAATATCTGGGCTTCACTGCCGGGCGCTGGTTATGGTCAGTTCGAGCATAAGGCTGACAGCCTGAT
+TGCAAAATTCAAAGAAGCGGGCGGAACGGTCAGAGAGATTGATGTATGAGCAGAGTCACCGCGATTATCTCCGCTCTGGT
+TATCTGCATCATCGTCTGCCTGTCATGGGCTGTTAATCATTACCGTGATAACGCCATTACCTACAAAGCCCAGCGCGACA
+AAAATGCCAGAGAACTGAAGCTGGCGAACGCGGCAATTACTGACATGCAGATGCGTCAGCGTGATGTTGCTGCGCTCGAT
+GCAAAATACACGAAGGAGTTAGCTGATGCTAAAGCTGAAAATGATGCTCTGCGTGATGATGTTGCCGCTGGTCGTCGTCG
+GTTGCACATCAAAGCAGTCTGTCAGTCAGTGCGTGAAGCCACCACCGCCTCCGGCGTGGATAATGCAGCCTCCCCCCGAC
+TGGCAGACACCGCTGAACGGGATTATTTCACCCTCAGAGAGAGGCTGATCACTATGCAAAAACAACTGGAAGGAACCCAG
+AAGTATATTAATGAGCAGTGCAGATAGAGTTGCCCATATCGATGGGCAACTCATGCAATTATTGTGAGCAATACACACGC
+GCTTCCAGCGGAGTATAAATGCCTAAAGTAATAAAACCGAGCAATCCATTTACGAATGTTTGCTGGGTTTCTGTTTTAAC
+AACATTTTCTGCGCCGCCACAAATTTTGGCTGCATCGACAGTTTTCTTCTGCCCAATTCCAGAAACGAAGAAATGATGGG
+TGATGGTTTCCTTTGGTGCTACTGCTGCCGGTTTGTTTTGAACAGTAAACGTCTGTTGAGCACATCCTGTAATAAGCAGG
+GCCAGCGCAGTAGCGAGTAGCATTTTTTTCATGGTGTTATTCCCGATGCTTTTTGAAGTTCGCAGAATCGTATGTGTAGA
+AAATTAAACAAACCCTAAACAATGAGTTGAAATTTCATATTGTTAATATTTATTAATGTATGTCAGGTGCGATGAATCGT
+CATTGTATTCCCGGATTAACTATGTCCACAGCCCTGACGGGGAACTTCTCTGCGGGAGTGTCCGGGAATAATTAAAACGA
+TGCACACAGGGTTTAGCGCGTACACGTATTGCATTATGCCAACGCCCCGGTGCTGACACGGAAGAAACCGGACGTTATGA
+TTTAGCGTGGAAAGATTTGTGTAGTGTTCTGAATGCTCTCAGTAAATAGTAATGAATTATCAAAGGTATAGTAATATCTT
+TTATGTTCATGGATATTTGTAACCCATCGGAAAACTCCTGCTTTAGCAAGATTTTCCCTGTATTGCTGAAATGTGATTTC
+TCTTGATTTCAACCTATCATAGGACGTTTCTATAAGATGCGTGTTTCTTGAGAATTTAACATTTACAACCTTTTTAAGTC
+CTTTTATTAACACGGTGTTATCGTTTTCTAACACGATGTGAATATTATCTGTGGCTAGATAGTAAATATAATGTGAGACG
+TTGTGACGTTTTAGTTCAGAATAAAACAATTCACAGTCTAAATCTTTTCGCACTTGATCGAATATTTCTTTAAAAATGGC
+AACCTGAGCCATTGGTAAAACCTTCCATGTGATACGAGGGCGCGTAGTTTGCATTATCGTTTTTATCGTTTCAATCTGGT
+CTGACCTCCTTGTGTTTTGTTGATGATTTATGTCAAATATTAGGAATGTTTTCACTTAATAGTATTGGTTGCGTAACAAA
+GTGCGGTCCTGCTGGCATTCTGGAGGGAAATACAACCGACAGATGTATGTAAGGCCAACGTGCTCAAATCTTCATACAGA
+AAGATTTGAAGTAATATTTTAACCGCTAGATGAAGAGCAAGCGCATGGAGCGACAAAATGAATAAAGAACAATCTGCTGA
+TGATCCCTCCGTGGATCTGATTCGTGTAAAAAATATGCTTAATAGCACCATTTCTATGAGTTACCCTGATGTTGTAATTG
+CATGTATAGAACATAAGGTGTCTCTGGAAGCATTCAGAGCAATTGAGGCAGCGTTGGTGAAGCACGATAATAATATGAAG
+GATTATTCCCTGGTGGTTGACTGATCACCATAACTGCTAATCATTCAAACTATTTAGTCTGTGACAGAGCCAACACGCAG
+TCTGTCACTGTCAGGAAAGTGGTAAAACTGCAACTCAATTACTGCAATGCCCTCGTAATTAAGTGAATTTACAATATCGT
+CCTGTTCGGAGGGAAGAACGCGGGATGTTCATTCTTCATCACTTTTAATTGATGTATATGCTCTCTTTTCTGACGTTAGT
+CTCCGACGGCAGGCTTCAATGACCCAGGCTGAGAAATTCCCGGACCCTTTTTGCTCAAGAGCGATGTTAATTTGTTCAAT
+CATTTGGTTAGGAAAGCGGATGTTGCGGGTTGTTGTTCTGCGGGTTCTGTTCTTCGTTGACATGAGGTTGCCCCGTATTC
+AGTGTCGCTGATTTGTATTGTCTGAAGTTGTTTTTACGTTAAGTTGATGCAGATCAATTAATACGATACCTGCGTCATAA
+TTGATTATTTGACGTGGTTTGATGGCCTCCACGCACGTTGTGATATGTAGATGATAATCATTATCACTTTACGGGTCCTT
+TCCGGTGATCCGACAGGTTACG
diff --git a/tests/data/lambdaNEB.fa.fai b/tests/data/lambdaNEB.fa.fai

new file mode 100644 (file)

index 0000000..064af36
--- /dev/null
+++ b/tests/data/lambdaNEB.fa.fai
@@ -0,0 +1 @@
+lambda_NEB3011 48502   16      80      81
diff --git a/tests/data/long-cigar-1.7.bam b/tests/data/long-cigar-1.7.bam

new file mode 100644 (file)

index 0000000..480c776

Binary files /dev/null and b/tests/data/long-cigar-1.7.bam differ
diff --git a/tests/data/long_reads.bam b/tests/data/long_reads.bam

new file mode 100644 (file)

index 0000000..cdbeb5e

Binary files /dev/null and b/tests/data/long_reads.bam differ
diff --git a/tests/data/merge.fofn b/tests/data/merge.fofn

new file mode 100644 (file)

index 0000000..8a79dff
--- /dev/null
+++ b/tests/data/merge.fofn
@@ -0,0 +1,2 @@
+aligned.bam
+aligned2.bam
diff --git a/tests/data/pbbamify/input-aligned-1.bam b/tests/data/pbbamify/input-aligned-1.bam

new file mode 100644 (file)

index 0000000..b46607c

Binary files /dev/null and b/tests/data/pbbamify/input-aligned-1.bam differ
diff --git a/tests/data/pbbamify/input-aligned-2.bam b/tests/data/pbbamify/input-aligned-2.bam

new file mode 100644 (file)

index 0000000..345f5ac

Binary files /dev/null and b/tests/data/pbbamify/input-aligned-2.bam differ
diff --git a/tests/data/pbbamify/input-aligned-3.bam b/tests/data/pbbamify/input-aligned-3.bam

new file mode 100644 (file)

index 0000000..59b1e38

Binary files /dev/null and b/tests/data/pbbamify/input-aligned-3.bam differ
diff --git a/tests/data/pbbamify/input-aligned-all.bam b/tests/data/pbbamify/input-aligned-all.bam

new file mode 100644 (file)

index 0000000..7160327

Binary files /dev/null and b/tests/data/pbbamify/input-aligned-all.bam differ
diff --git a/tests/data/pbbamify/synthetic-ref-1.fa b/tests/data/pbbamify/synthetic-ref-1.fa

new file mode 100644 (file)

index 0000000..9e49565
--- /dev/null
+++ b/tests/data/pbbamify/synthetic-ref-1.fa
@@ -0,0 +1,2 @@
+>synthetic_ref_1
+GGGCGGCGACCTCGCGGGTTTTCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGACAGGTGCTGAAAGCGAGGCTTTTTGGCCT
diff --git a/tests/data/pbbamify/synthetic-ref-1.fa.fai b/tests/data/pbbamify/synthetic-ref-1.fa.fai

new file mode 100644 (file)

index 0000000..d9d41c0
--- /dev/null
+++ b/tests/data/pbbamify/synthetic-ref-1.fa.fai
@@ -0,0 +1 @@
+synthetic_ref_1        150     17      150     151
diff --git a/tests/data/pbbamify/synthetic_movie_1.subreads.bam b/tests/data/pbbamify/synthetic_movie_1.subreads.bam

new file mode 100644 (file)

index 0000000..22da3d3

Binary files /dev/null and b/tests/data/pbbamify/synthetic_movie_1.subreads.bam differ
diff --git a/tests/data/pbbamify/synthetic_movie_1.subreads.bam.bai b/tests/data/pbbamify/synthetic_movie_1.subreads.bam.bai

new file mode 100644 (file)

index 0000000..bfd4731

Binary files /dev/null and b/tests/data/pbbamify/synthetic_movie_1.subreads.bam.bai differ
diff --git a/tests/data/pbbamify/synthetic_movie_1.subreads.bam.pbi b/tests/data/pbbamify/synthetic_movie_1.subreads.bam.pbi

new file mode 100644 (file)

index 0000000..06fb614

Binary files /dev/null and b/tests/data/pbbamify/synthetic_movie_1.subreads.bam.pbi differ
diff --git a/tests/data/pbbamify/synthetic_movie_2.subreads.bam b/tests/data/pbbamify/synthetic_movie_2.subreads.bam

new file mode 100644 (file)

index 0000000..9526f60

Binary files /dev/null and b/tests/data/pbbamify/synthetic_movie_2.subreads.bam differ
diff --git a/tests/data/pbbamify/synthetic_movie_2.subreads.bam.bai b/tests/data/pbbamify/synthetic_movie_2.subreads.bam.bai

new file mode 100644 (file)

index 0000000..ac6f2ae

Binary files /dev/null and b/tests/data/pbbamify/synthetic_movie_2.subreads.bam.bai differ
diff --git a/tests/data/pbbamify/synthetic_movie_2.subreads.bam.pbi b/tests/data/pbbamify/synthetic_movie_2.subreads.bam.pbi

new file mode 100644 (file)

index 0000000..2f08904

Binary files /dev/null and b/tests/data/pbbamify/synthetic_movie_2.subreads.bam.pbi differ
diff --git a/tests/data/pbbamify/synthetic_movie_3.subreads.bam b/tests/data/pbbamify/synthetic_movie_3.subreads.bam

new file mode 100644 (file)

index 0000000..d19da7b

Binary files /dev/null and b/tests/data/pbbamify/synthetic_movie_3.subreads.bam differ
diff --git a/tests/data/pbbamify/synthetic_movie_3.subreads.bam.bai b/tests/data/pbbamify/synthetic_movie_3.subreads.bam.bai

new file mode 100644 (file)

index 0000000..8e3bba4

Binary files /dev/null and b/tests/data/pbbamify/synthetic_movie_3.subreads.bam.bai differ
diff --git a/tests/data/pbbamify/synthetic_movie_3.subreads.bam.pbi b/tests/data/pbbamify/synthetic_movie_3.subreads.bam.pbi

new file mode 100644 (file)

index 0000000..89b0adc

Binary files /dev/null and b/tests/data/pbbamify/synthetic_movie_3.subreads.bam.pbi differ
diff --git a/tests/data/pbbamify/synthetic_movie_all.subreadset.xml.in b/tests/data/pbbamify/synthetic_movie_all.subreadset.xml.in

new file mode 100644 (file)

index 0000000..508d167
--- /dev/null
+++ b/tests/data/pbbamify/synthetic_movie_all.subreadset.xml.in
@@ -0,0 +1,27 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<pbds:SubreadSet CreatedAt="2017-12-05T04:48:31" MetaType="PacBio.DataSet.SubreadSet" Name="pacbio_dataset_subreadset-171205_124831161" Tags="" TimeStampedName="pacbio_dataset_subreadset-171205_124831161" UniqueId="a9e4086c-db3a-bded-923b-3c84fbeae52b" Version="3.0.1" xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd" xmlns:pbds="http://pacificbiosciences.com/PacBioDatasets.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDataModel.xsd">
+       <pbbase:ExternalResources>
+               <pbbase:ExternalResource MetaType="PacBio.SubreadFile.SubreadBamFile" ResourceId="@PacBioBAM_TestsDir@/data/pbbamify/synthetic_movie_1.subreads.bam" TimeStampedName="pacbio_subreadfile_subreadbamfile-171205_124831158" UniqueId="f44a1fe8-236d-442a-80bb-50733b70cd36">
+                       <pbbase:FileIndices>
+                               <pbbase:FileIndex MetaType="PacBio.Index.PacBioIndex" ResourceId="@PacBioBAM_TestsDir@/data/pbbamify/synthetic_movie_1.subreads.bam.pbi" TimeStampedName="pacbio_index_pacbioindex-171205_124831159" UniqueId="5de6a1ac-ffbd-41a8-b104-2d7ef4f4ff13"/>
+                               <pbbase:FileIndex MetaType="PacBio.Index.BamIndex" ResourceId="@PacBioBAM_TestsDir@/data/pbbamify/synthetic_movie_1.subreads.bam.bai" TimeStampedName="pacbio_index_bamindex-171205_124831158" UniqueId="30a69956-c1ff-4e39-a901-f35041c1071a"/>
+                       </pbbase:FileIndices>
+               </pbbase:ExternalResource>
+               <pbbase:ExternalResource MetaType="PacBio.SubreadFile.SubreadBamFile" ResourceId="@PacBioBAM_TestsDir@/data/pbbamify/synthetic_movie_2.subreads.bam" TimeStampedName="pacbio_subreadfile_subreadbamfile-171205_124831158" UniqueId="80a97293-bb25-4cc5-a0b2-5189335beecf">
+                       <pbbase:FileIndices>
+                               <pbbase:FileIndex MetaType="PacBio.Index.PacBioIndex" ResourceId="@PacBioBAM_TestsDir@/data/pbbamify/synthetic_movie_2.subreads.bam.pbi" TimeStampedName="pacbio_index_pacbioindex-171205_124831158" UniqueId="5482eae8-7c35-48c4-9a6f-dca2c1ee3181"/>
+                               <pbbase:FileIndex MetaType="PacBio.Index.BamIndex" ResourceId="@PacBioBAM_TestsDir@/data/pbbamify/synthetic_movie_2.subreads.bam.bai" TimeStampedName="pacbio_index_bamindex-171205_124831158" UniqueId="7e75f3b5-29ee-4c90-b0a6-7bc5e917d41f"/>
+                       </pbbase:FileIndices>
+               </pbbase:ExternalResource>
+               <pbbase:ExternalResource MetaType="PacBio.SubreadFile.SubreadBamFile" ResourceId="@PacBioBAM_TestsDir@/data/pbbamify/synthetic_movie_3.subreads.bam" TimeStampedName="pacbio_subreadfile_subreadbamfile-171205_124831158" UniqueId="4ddcb97d-e879-455f-bc99-97b24efe7d65">
+                       <pbbase:FileIndices>
+                               <pbbase:FileIndex MetaType="PacBio.Index.PacBioIndex" ResourceId="@PacBioBAM_TestsDir@/data/pbbamify/synthetic_movie_3.subreads.bam.pbi" TimeStampedName="pacbio_index_pacbioindex-171205_124831158" UniqueId="d9d1d76b-2a63-4b39-b3bb-709c93c64114"/>
+                               <pbbase:FileIndex MetaType="PacBio.Index.BamIndex" ResourceId="@PacBioBAM_TestsDir@/data/pbbamify/synthetic_movie_3.subreads.bam.bai" TimeStampedName="pacbio_index_bamindex-171205_124831158" UniqueId="5310cd63-9f1f-47ee-992d-d60c7b16e278"/>
+                       </pbbase:FileIndices>
+               </pbbase:ExternalResource>
+       </pbbase:ExternalResources>
+       <pbds:DataSetMetadata>
+               <pbds:TotalLength>502</pbds:TotalLength>
+               <pbds:NumRecords>6</pbds:NumRecords>
+       </pbds:DataSetMetadata>
+</pbds:SubreadSet>
diff --git a/tests/data/phi29.bam b/tests/data/phi29.bam

new file mode 100644 (file)

index 0000000..46176b6

Binary files /dev/null and b/tests/data/phi29.bam differ
diff --git a/tests/data/phi29.bam.pbi b/tests/data/phi29.bam.pbi

new file mode 100644 (file)

index 0000000..5282b94

Binary files /dev/null and b/tests/data/phi29.bam.pbi differ
diff --git a/tests/data/polymerase/consolidate.subread.dataset.xml b/tests/data/polymerase/consolidate.subread.dataset.xml

new file mode 100644 (file)

index 0000000..ca85a7a
--- /dev/null
+++ b/tests/data/polymerase/consolidate.subread.dataset.xml
@@ -0,0 +1,38 @@
+<?xml version="1.0" encoding="utf-8"?>
+<pbds:SubreadSet 
+    xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd" 
+    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
+    xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd"
+    xmlns:pbsample="http://pacificbiosciences.com/PacBioSampleInfo.xsd"
+    xmlns:pbmeta="http://pacificbiosciences.com/PacBioCollectionMetadata.xsd"
+    xmlns:pbds="http://pacificbiosciences.com/PacBioDatasets.xsd"
+    xsi:schemaLocation="http://pacificbiosciences.com/PacBioDataModel.xsd" 
+    UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe519c" 
+    TimeStampedName="subreadset_150304_231155" 
+    MetaType="PacBio.DataSet.SubreadSet" 
+    Name="DataSet_SubreadSet" 
+    Version="3.0.0" 
+    CreatedAt="2015-01-27T09:00:01"> 
+<pbbase:ExternalResources>
+    <pbbase:ExternalResource 
+        UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe5193" 
+        TimeStampedName="subread_bam_150304_231155" 
+        MetaType="PacBio.SubreadFile.SubreadBamFile" 
+        ResourceId="production.subreads.bam">
+    </pbbase:ExternalResource>
+    <pbbase:ExternalResource
+        UniqueId="b096d0a3-94b8-4918-b3af-a3f81bbe5195"
+        TimeStampedName="scraps_bam_150304_231155"
+        MetaType="PacBio.SubreadFile.ScrapsBamFile"
+        ResourceId="production.scraps.bam">
+    </pbbase:ExternalResource>
+</pbbase:ExternalResources>
+<pbds:Filters>
+    <pbds:Filter>
+        <pbbase:Properties>
+            <pbbase:Property Name="qStart" Value="4000" Operator=">"/>
+            <pbbase:Property Name="qStart" Value="5000" Operator="<"/>
+        </pbbase:Properties>
+    </pbds:Filter>
+</pbds:Filters>
+</pbds:SubreadSet>
diff --git a/tests/data/polymerase/filtered_resources.subread.dataset.xml b/tests/data/polymerase/filtered_resources.subread.dataset.xml

new file mode 100644 (file)

index 0000000..e414e00
--- /dev/null
+++ b/tests/data/polymerase/filtered_resources.subread.dataset.xml
@@ -0,0 +1,67 @@
+<?xml version="1.0" encoding="utf-8"?>
+<pbds:SubreadSet 
+    xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd" 
+    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
+    xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd"
+    xmlns:pbsample="http://pacificbiosciences.com/PacBioSampleInfo.xsd"
+    xmlns:pbmeta="http://pacificbiosciences.com/PacBioCollectionMetadata.xsd"
+    xmlns:pbds="http://pacificbiosciences.com/PacBioDatasets.xsd"
+    xsi:schemaLocation="http://pacificbiosciences.com/PacBioDataModel.xsd" 
+    UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe519c" 
+    TimeStampedName="subreadset_150304_231155" 
+    MetaType="PacBio.DataSet.SubreadSet" 
+    Name="DataSet_SubreadSet" 
+    Version="3.0.0" 
+    CreatedAt="2015-01-27T09:00:01"> 
+<pbbase:ExternalResources>
+    <pbbase:ExternalResource 
+        UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe5193" 
+        TimeStampedName="subread_bam_150304_231155" 
+        MetaType="PacBio.SubreadFile.SubreadBamFile" 
+        ResourceId="./production.subreads.bam">
+        <pbbase:ExternalResources>
+            <pbbase:ExternalResource 
+                UniqueId="b096d0a3-94b8-4918-b3af-a3f81bbe5195" 
+                TimeStampedName="scraps_bam_150304_231155" 
+                MetaType="PacBio.SubreadFile.ScrapsBamFile" 
+                ResourceId="./production.scraps.bam">
+            </pbbase:ExternalResource>
+        </pbbase:ExternalResources>
+    </pbbase:ExternalResource>
+    <pbbase:ExternalResource 
+        UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe5193" 
+        TimeStampedName="subread_bam_150304_231155" 
+        MetaType="PacBio.SubreadFile.SubreadBamFile" 
+        ResourceId="./internal.subreads.bam">
+        <pbbase:ExternalResources>
+            <pbbase:ExternalResource 
+                UniqueId="b096d0a3-94b8-4918-b3af-a3f81bbe5195" 
+                TimeStampedName="scraps_bam_150304_231155" 
+                MetaType="PacBio.SubreadFile.ScrapsBamFile" 
+                ResourceId="./internal.scraps.bam">
+            </pbbase:ExternalResource>
+        </pbbase:ExternalResources>
+    </pbbase:ExternalResource>
+    <pbbase:ExternalResource 
+        UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe5197" 
+        TimeStampedName="subread_bam_150304_231155" 
+        MetaType="PacBio.SubreadFile.HqRegionBamFile" 
+        ResourceId="./production_hq.hqregion.bam">
+        <pbbase:ExternalResources>
+            <pbbase:ExternalResource 
+                UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe5199" 
+                TimeStampedName="scraps_bam_150304_231155" 
+                MetaType="PacBio.SubreadFile.HqScrapsBamFile" 
+                ResourceId="./production_hq.scraps.bam">
+            </pbbase:ExternalResource>
+        </pbbase:ExternalResources>
+    </pbbase:ExternalResource>
+</pbbase:ExternalResources>
+<pbds:Filters>
+    <pbds:Filter>
+        <pbbase:Properties>
+            <pbbase:Property Name="zm" Value="100000" Operator="=="/>
+        </pbbase:Properties>
+    </pbds:Filter>
+</pbds:Filters>
+</pbds:SubreadSet>
+\ No newline at end of file
diff --git a/tests/data/polymerase/internal.hqregions.bam b/tests/data/polymerase/internal.hqregions.bam

new file mode 100644 (file)

index 0000000..8e31e6b

Binary files /dev/null and b/tests/data/polymerase/internal.hqregions.bam differ
diff --git a/tests/data/polymerase/internal.hqregions.bam.pbi b/tests/data/polymerase/internal.hqregions.bam.pbi

new file mode 100644 (file)

index 0000000..b79e661

Binary files /dev/null and b/tests/data/polymerase/internal.hqregions.bam.pbi differ
diff --git a/tests/data/polymerase/internal.lqregions.bam b/tests/data/polymerase/internal.lqregions.bam

new file mode 100644 (file)

index 0000000..96878a3

Binary files /dev/null and b/tests/data/polymerase/internal.lqregions.bam differ
diff --git a/tests/data/polymerase/internal.lqregions.bam.pbi b/tests/data/polymerase/internal.lqregions.bam.pbi

new file mode 100644 (file)

index 0000000..a4b7237

Binary files /dev/null and b/tests/data/polymerase/internal.lqregions.bam.pbi differ
diff --git a/tests/data/polymerase/internal.polymerase.bam b/tests/data/polymerase/internal.polymerase.bam

new file mode 100644 (file)

index 0000000..8f293c1

Binary files /dev/null and b/tests/data/polymerase/internal.polymerase.bam differ
diff --git a/tests/data/polymerase/internal.polymerase.bam.pbi b/tests/data/polymerase/internal.polymerase.bam.pbi

new file mode 100644 (file)

index 0000000..c423905

Binary files /dev/null and b/tests/data/polymerase/internal.polymerase.bam.pbi differ
diff --git a/tests/data/polymerase/internal.scraps.bam b/tests/data/polymerase/internal.scraps.bam

new file mode 100644 (file)

index 0000000..47c1689

Binary files /dev/null and b/tests/data/polymerase/internal.scraps.bam differ
diff --git a/tests/data/polymerase/internal.scraps.bam.pbi b/tests/data/polymerase/internal.scraps.bam.pbi

new file mode 100644 (file)

index 0000000..9db21f2

Binary files /dev/null and b/tests/data/polymerase/internal.scraps.bam.pbi differ
diff --git a/tests/data/polymerase/internal.subreads.bam b/tests/data/polymerase/internal.subreads.bam

new file mode 100644 (file)

index 0000000..00ad171

Binary files /dev/null and b/tests/data/polymerase/internal.subreads.bam differ
diff --git a/tests/data/polymerase/internal.subreads.bam.pbi b/tests/data/polymerase/internal.subreads.bam.pbi

new file mode 100644 (file)

index 0000000..b0d7e28

Binary files /dev/null and b/tests/data/polymerase/internal.subreads.bam.pbi differ
diff --git a/tests/data/polymerase/multiple_resources.subread.dataset.xml b/tests/data/polymerase/multiple_resources.subread.dataset.xml

new file mode 100644 (file)

index 0000000..109535d
--- /dev/null
+++ b/tests/data/polymerase/multiple_resources.subread.dataset.xml
@@ -0,0 +1,46 @@
+<?xml version="1.0" encoding="utf-8"?>
+<pbds:SubreadSet 
+    xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd" 
+    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
+    xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd"
+    xmlns:pbsample="http://pacificbiosciences.com/PacBioSampleInfo.xsd"
+    xmlns:pbmeta="http://pacificbiosciences.com/PacBioCollectionMetadata.xsd"
+    xmlns:pbds="http://pacificbiosciences.com/PacBioDatasets.xsd"
+    xsi:schemaLocation="http://pacificbiosciences.com/PacBioDataModel.xsd" 
+    UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe519c" 
+    TimeStampedName="subreadset_150304_231155" 
+    MetaType="PacBio.DataSet.SubreadSet" 
+    Name="DataSet_SubreadSet" 
+    Version="3.0.0" 
+    CreatedAt="2015-01-27T09:00:01"> 
+<pbbase:ExternalResources>
+    <pbbase:ExternalResource 
+        UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe5193" 
+        TimeStampedName="subread_bam_150304_231155" 
+        MetaType="PacBio.SubreadFile.SubreadBamFile" 
+        ResourceId="./production.subreads.bam">
+        <pbbase:ExternalResources>
+            <pbbase:ExternalResource 
+                UniqueId="b096d0a3-94b8-4918-b3af-a3f81bbe5195" 
+                TimeStampedName="scraps_bam_150304_231155" 
+                MetaType="PacBio.SubreadFile.ScrapsBamFile" 
+                ResourceId="./production.scraps.bam">
+            </pbbase:ExternalResource>
+        </pbbase:ExternalResources>
+    </pbbase:ExternalResource>
+    <pbbase:ExternalResource 
+        UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe5197" 
+        TimeStampedName="subread_bam_150304_231155" 
+        MetaType="PacBio.SubreadFile.HqRegionBamFile" 
+        ResourceId="./production_hq.hqregion.bam">
+        <pbbase:ExternalResources>
+            <pbbase:ExternalResource 
+                UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe5199" 
+                TimeStampedName="scraps_bam_150304_231155" 
+                MetaType="PacBio.SubreadFile.HqScrapsBamFile" 
+                ResourceId="./production_hq.scraps.bam">
+            </pbbase:ExternalResource>
+        </pbbase:ExternalResources>
+    </pbbase:ExternalResource>
+</pbbase:ExternalResources>
+</pbds:SubreadSet>
+\ No newline at end of file
diff --git a/tests/data/polymerase/production.polymerase.bam b/tests/data/polymerase/production.polymerase.bam

new file mode 100644 (file)

index 0000000..4c84b23

Binary files /dev/null and b/tests/data/polymerase/production.polymerase.bam differ
diff --git a/tests/data/polymerase/production.scraps.bam b/tests/data/polymerase/production.scraps.bam

new file mode 100644 (file)

index 0000000..a32bdfb

Binary files /dev/null and b/tests/data/polymerase/production.scraps.bam differ
diff --git a/tests/data/polymerase/production.scraps.bam.pbi b/tests/data/polymerase/production.scraps.bam.pbi

new file mode 100644 (file)

index 0000000..5ef119d

Binary files /dev/null and b/tests/data/polymerase/production.scraps.bam.pbi differ
diff --git a/tests/data/polymerase/production.subreads.bam b/tests/data/polymerase/production.subreads.bam

new file mode 100644 (file)

index 0000000..452aad5

Binary files /dev/null and b/tests/data/polymerase/production.subreads.bam differ
diff --git a/tests/data/polymerase/production.subreads.bam.pbi b/tests/data/polymerase/production.subreads.bam.pbi

new file mode 100644 (file)

index 0000000..7ff2fcc

Binary files /dev/null and b/tests/data/polymerase/production.subreads.bam.pbi differ
diff --git a/tests/data/polymerase/production_hq.hqregion.bam b/tests/data/polymerase/production_hq.hqregion.bam

new file mode 100644 (file)

index 0000000..66d436b

Binary files /dev/null and b/tests/data/polymerase/production_hq.hqregion.bam differ
diff --git a/tests/data/polymerase/production_hq.hqregion.bam.pbi b/tests/data/polymerase/production_hq.hqregion.bam.pbi

new file mode 100644 (file)

index 0000000..ec8f166

Binary files /dev/null and b/tests/data/polymerase/production_hq.hqregion.bam.pbi differ
diff --git a/tests/data/polymerase/production_hq.scraps.bam b/tests/data/polymerase/production_hq.scraps.bam

new file mode 100644 (file)

index 0000000..716e098

Binary files /dev/null and b/tests/data/polymerase/production_hq.scraps.bam differ
diff --git a/tests/data/polymerase/production_hq.scraps.bam.pbi b/tests/data/polymerase/production_hq.scraps.bam.pbi

new file mode 100644 (file)

index 0000000..1017562

Binary files /dev/null and b/tests/data/polymerase/production_hq.scraps.bam.pbi differ
diff --git a/tests/data/polymerase/qnameFiltered.subreads.dataset.xml b/tests/data/polymerase/qnameFiltered.subreads.dataset.xml

new file mode 100644 (file)

index 0000000..c200ded
--- /dev/null
+++ b/tests/data/polymerase/qnameFiltered.subreads.dataset.xml
@@ -0,0 +1,37 @@
+<?xml version="1.0" encoding="utf-8"?>
+<pbds:SubreadSet
+    xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd"
+    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+    xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd"
+    xmlns:pbsample="http://pacificbiosciences.com/PacBioSampleInfo.xsd"
+    xmlns:pbmeta="http://pacificbiosciences.com/PacBioCollectionMetadata.xsd"
+    xmlns:pbds="http://pacificbiosciences.com/PacBioDatasets.xsd"
+    xsi:schemaLocation="http://pacificbiosciences.com/PacBioDataModel.xsd"
+    UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe519c"
+    TimeStampedName="subreadset_150304_231155"
+    MetaType="PacBio.DataSet.SubreadSet"
+    Name="DataSet_SubreadSet"
+    Version="3.0.0"
+    CreatedAt="2015-01-27T09:00:01">
+<pbbase:ExternalResources>
+    <pbbase:ExternalResource
+        UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe5193"
+        TimeStampedName="subread_bam_150304_231155"
+        MetaType="PacBio.SubreadFile.SubreadBamFile"
+        ResourceId="production.subreads.bam">
+    </pbbase:ExternalResource>
+    <pbbase:ExternalResource
+        UniqueId="b096d0a3-94b8-4918-b3af-a3f81bbe5195"
+        TimeStampedName="scraps_bam_150304_231155"
+        MetaType="PacBio.SubreadFile.ScrapsBamFile"
+        ResourceId="production.scraps.bam">
+    </pbbase:ExternalResource>
+</pbbase:ExternalResources>
+<pbds:Filters>
+    <pbds:Filter>
+        <pbbase:Properties>
+            <pbbase:Property Name="qname_file" Value="qname_whitelist.txt" Operator="="/>
+        </pbbase:Properties>
+    </pbds:Filter>
+</pbds:Filters>
+</pbds:SubreadSet>
diff --git a/tests/data/polymerase/qname_whitelist.txt b/tests/data/polymerase/qname_whitelist.txt

new file mode 100644 (file)

index 0000000..0004061
--- /dev/null
+++ b/tests/data/polymerase/qname_whitelist.txt
@@ -0,0 +1,3 @@
+ArminsFakeMovie/0/3116_3628
+ArminsFakeMovie/0/3722_4267
+ArminsFakeMovie/0/6812_7034
diff --git a/tests/data/polymerase/scrapless.scraps.bam b/tests/data/polymerase/scrapless.scraps.bam

new file mode 100644 (file)

index 0000000..7b989c4

Binary files /dev/null and b/tests/data/polymerase/scrapless.scraps.bam differ
diff --git a/tests/data/polymerase/scrapless.scraps.bam.pbi b/tests/data/polymerase/scrapless.scraps.bam.pbi

new file mode 100644 (file)

index 0000000..140af8a

Binary files /dev/null and b/tests/data/polymerase/scrapless.scraps.bam.pbi differ
diff --git a/tests/data/polymerase/scrapless.subreads.bam b/tests/data/polymerase/scrapless.subreads.bam

new file mode 100644 (file)

index 0000000..739b3b4

Binary files /dev/null and b/tests/data/polymerase/scrapless.subreads.bam differ
diff --git a/tests/data/polymerase/scrapless.subreads.bam.pbi b/tests/data/polymerase/scrapless.subreads.bam.pbi

new file mode 100644 (file)

index 0000000..a20a00f

Binary files /dev/null and b/tests/data/polymerase/scrapless.subreads.bam.pbi differ
diff --git a/tests/data/referenceset.xml b/tests/data/referenceset.xml

new file mode 100644 (file)

index 0000000..3099906
--- /dev/null
+++ b/tests/data/referenceset.xml
@@ -0,0 +1,21 @@
+<?xml version='1.0' encoding='UTF-8'?>
+<pbds:ReferenceSet TimeStampedName="referenceset_150304_231155" MetaType="PacBio.DataSet.ReferenceSet" Name="lambdaNEB" CreatedAt="2016-01-04T18:02:13.181-08:00" UniqueId="3ad75a14-f43a-48bd-9dc1-8a08af29f587" Version="3.0.1" Author="pbscala 0.50.1-SNAPSHOT reference_info_dataset_0.4.2" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:pbds="http://pacificbiosciences.com/PacBioDatasets.xsd" xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd">
+    <pbbase:ExternalResources>
+        <pbbase:ExternalResource Name="First References FASTA" Description="Points to an example references FASTA file." MetaType="PacBio.ReferenceFile.ReferenceFastaFile" ResourceId="./lambdaNEB.fa" UniqueId="fc549593-3c6b-4d21-82a6-5236fbbdf1c8" TimeStampedName="pacbio_dataset_index-fc549593-3c6b-4d21-82a6-5236fbbdf1c8" Tags="converted">
+            <pbbase:FileIndices>
+                <pbbase:FileIndex UniqueId="3a83d609-f6f1-49df-84d6-ace38fd6e9c2" TimeStampedName="pacbio_dataset_index-3a83d609-f6f1-49df-84d6-ace38fd6e9c2" MetaType="PacBio.Index.SamIndex" ResourceId="./lambdaNEB.fa.fai"/>
+            </pbbase:FileIndices>
+        </pbbase:ExternalResource>
+        <pbbase:ExternalResource Name="Second References FASTA" Description="Points to an example references FASTA file." MetaType="PacBio.ReferenceFile.ReferenceFastaFile" ResourceId="./chimera_minimal.fasta" UniqueId="fc549593-3c6b-4d21-82a6-5236fbbdf1c8" TimeStampedName="pacbio_dataset_index-fc549593-3c6b-4d21-82a6-5236fbbdf1c8" Tags="converted">
+        </pbbase:ExternalResource>
+    </pbbase:ExternalResources>
+    <pbds:DataSetMetadata>
+        <pbds:TotalLength>48502</pbds:TotalLength>
+        <pbds:NumRecords>1</pbds:NumRecords>
+        <pbds:Organism>lambdaNEB</pbds:Organism>
+        <pbds:Ploidy>haploid</pbds:Ploidy>
+        <pbds:Contigs>
+            <pbds:Contig Name="ref000001" Description="lambda_NEB3011" Length="48502" Digest="a1319ff90e994c8190a4fe6569d0822a"/>
+        </pbds:Contigs>
+    </pbds:DataSetMetadata>
+</pbds:ReferenceSet>
diff --git a/tests/data/refskip.bam b/tests/data/refskip.bam

new file mode 100644 (file)

index 0000000..47804a0

Binary files /dev/null and b/tests/data/refskip.bam differ
diff --git a/tests/data/relative/a/test.bam b/tests/data/relative/a/test.bam

new file mode 100644 (file)

index 0000000..bd06b8a

Binary files /dev/null and b/tests/data/relative/a/test.bam differ
diff --git a/tests/data/relative/b/test1.bam b/tests/data/relative/b/test1.bam

new file mode 100644 (file)

index 0000000..bd06b8a

Binary files /dev/null and b/tests/data/relative/b/test1.bam differ
diff --git a/tests/data/relative/b/test2.bam b/tests/data/relative/b/test2.bam

new file mode 100644 (file)

index 0000000..bd06b8a

Binary files /dev/null and b/tests/data/relative/b/test2.bam differ
diff --git a/tests/data/relative/relative.fofn b/tests/data/relative/relative.fofn

new file mode 100644 (file)

index 0000000..755c589
--- /dev/null
+++ b/tests/data/relative/relative.fofn
@@ -0,0 +1,3 @@
+a/test.bam
+b/test1.bam
+b/test2.bam
diff --git a/tests/data/relative/relative.xml b/tests/data/relative/relative.xml

new file mode 100644 (file)

index 0000000..0e78fe4
--- /dev/null
+++ b/tests/data/relative/relative.xml
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="utf-8"?>
+<pbds:SubreadSet xmlns="http://pacificbiosciences.com/PacBioDataModel.xsd" MetaType="PacBio.DataSet.AlignmentSet" Name="DataSet_AlignmentSet" Tags="barcode moreTags mapping mytags" UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe519c" Version="2.3.0" CreatedAt="2015-01-27T09:00:01" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDataModel.xsd">
+        <pbbase:ExternalResources>
+                <pbbase:ExternalResource MetaType="SubreadFile.SubreadBamFile" ResourceId="./a/test.bam" />
+                <pbbase:ExternalResource MetaType="SubreadFile.SubreadBamFile" ResourceId="./b/test1.bam" />
+                <pbbase:ExternalResource MetaType="SubreadFile.SubreadBamFile" ResourceId="./b/test2.bam"/>
+        </pbbase:ExternalResources>
+</pbds:SubreadSet>
diff --git a/tests/data/relative/relative2.fofn b/tests/data/relative/relative2.fofn

new file mode 100644 (file)

index 0000000..f1969ac
--- /dev/null
+++ b/tests/data/relative/relative2.fofn
@@ -0,0 +1,4 @@
+a/test.bam
+b/test1.bam
+b/test2.bam
+relative.xml
diff --git a/tests/data/segfault.bam b/tests/data/segfault.bam

new file mode 100644 (file)

index 0000000..755c7eb

Binary files /dev/null and b/tests/data/segfault.bam differ
diff --git a/tests/data/softclip_deletions.bam b/tests/data/softclip_deletions.bam

new file mode 100644 (file)

index 0000000..af72134

Binary files /dev/null and b/tests/data/softclip_deletions.bam differ
diff --git a/tests/data/stitching/test_qstart.scraps.bam b/tests/data/stitching/test_qstart.scraps.bam

new file mode 100644 (file)

index 0000000..663836e

Binary files /dev/null and b/tests/data/stitching/test_qstart.scraps.bam differ
diff --git a/tests/data/stitching/test_qstart.scraps.bam.pbi b/tests/data/stitching/test_qstart.scraps.bam.pbi

new file mode 100644 (file)

index 0000000..144a3df

Binary files /dev/null and b/tests/data/stitching/test_qstart.scraps.bam.pbi differ
diff --git a/tests/data/stitching/test_qstart.subreads.bam b/tests/data/stitching/test_qstart.subreads.bam

new file mode 100644 (file)

index 0000000..98f8feb

Binary files /dev/null and b/tests/data/stitching/test_qstart.subreads.bam differ
diff --git a/tests/data/stitching/test_qstart.subreads.bam.pbi b/tests/data/stitching/test_qstart.subreads.bam.pbi

new file mode 100644 (file)

index 0000000..860efc5

Binary files /dev/null and b/tests/data/stitching/test_qstart.subreads.bam.pbi differ
diff --git a/tests/data/test_GenomicIntervals/adjacent_intervals.alignmentset.xml b/tests/data/test_GenomicIntervals/adjacent_intervals.alignmentset.xml

new file mode 100644 (file)

index 0000000..2ef2ce3
--- /dev/null
+++ b/tests/data/test_GenomicIntervals/adjacent_intervals.alignmentset.xml
@@ -0,0 +1,31 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<pbds:AlignmentSet CreatedAt="2018-12-11T21:24:26" MetaType="PacBio.DataSet.AlignmentSet" Name="pacbio_dataset_alignmentset-181211_202425989" Tags="" TimeStampedName="pacbio_dataset_alignmentset-181211_202425989" UniqueId="e836d9c0-8b18-4159-fd44-9d49a1400fc5" Version="3.0.1" xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd" xmlns:pbds="http://pacificbiosciences.com/PacBioDatasets.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDataModel.xsd">
+       <pbbase:ExternalResources>
+               <pbbase:ExternalResource MetaType="PacBio.AlignmentFile.AlignmentBamFile" ResourceId="test_contigs.bam" TimeStampedName="pacbio_alignmentfile_alignmentbamfile-181211_202425989" UniqueId="4828832c-c33b-4f0b-a9ec-e9dba2a84e30">
+                       <pbbase:FileIndices>
+                               <pbbase:FileIndex MetaType="PacBio.Index.PacBioIndex" ResourceId="test_contigs.bam.pbi" TimeStampedName="pacbio_index_pacbioindex-181211_202426034" UniqueId="8044e806-405a-44b5-9539-665ff7dbd394"/>
+                               <pbbase:FileIndex MetaType="PacBio.Index.BamIndex" ResourceId="test_contigs.bam.bai" TimeStampedName="pacbio_index_bamindex-181211_202426034" UniqueId="bb782ecb-a9f4-44a6-be0c-07b3fb6e733e"/>
+                       </pbbase:FileIndices>
+               </pbbase:ExternalResource>
+       </pbbase:ExternalResources>
+       <pbds:Filters>
+               <pbds:Filter>
+                       <pbbase:Properties>
+                               <pbbase:Property Name="rname" Operator="=" Value="contig1"/>
+                               <pbbase:Property Name="tstart" Operator="&lt;" Value="10"/>
+                               <pbbase:Property Name="tend" Operator="&gt;" Value="3"/>
+                       </pbbase:Properties>
+               </pbds:Filter>
+               <pbds:Filter>
+                       <pbbase:Properties>
+                               <pbbase:Property Name="rname" Operator="=" Value="contig1"/>
+                               <pbbase:Property Name="tstart" Operator="&lt;" Value="17"/>
+                               <pbbase:Property Name="tend" Operator="&gt;" Value="10"/>
+                       </pbbase:Properties>
+               </pbds:Filter>
+       </pbds:Filters>
+       <pbds:DataSetMetadata>
+               <pbds:TotalLength>0</pbds:TotalLength>
+               <pbds:NumRecords>0</pbds:NumRecords>
+       </pbds:DataSetMetadata>
+</pbds:AlignmentSet>
diff --git a/tests/data/test_GenomicIntervals/contig_name_only.alignmentset.xml b/tests/data/test_GenomicIntervals/contig_name_only.alignmentset.xml

new file mode 100644 (file)

index 0000000..4105c54
--- /dev/null
+++ b/tests/data/test_GenomicIntervals/contig_name_only.alignmentset.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<pbds:AlignmentSet CreatedAt="2018-12-11T21:24:26" MetaType="PacBio.DataSet.AlignmentSet" Name="pacbio_dataset_alignmentset-181211_202425989" Tags="" TimeStampedName="pacbio_dataset_alignmentset-181211_202425989" UniqueId="e836d9c0-8b18-4159-fd44-9d49a1400fc5" Version="3.0.1" xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd" xmlns:pbds="http://pacificbiosciences.com/PacBioDatasets.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDataModel.xsd">
+       <pbbase:ExternalResources>
+               <pbbase:ExternalResource MetaType="PacBio.AlignmentFile.AlignmentBamFile" ResourceId="test_contigs.bam" TimeStampedName="pacbio_alignmentfile_alignmentbamfile-181211_202425989" UniqueId="4828832c-c33b-4f0b-a9ec-e9dba2a84e30">
+                       <pbbase:FileIndices>
+                               <pbbase:FileIndex MetaType="PacBio.Index.PacBioIndex" ResourceId="test_contigs.bam.pbi" TimeStampedName="pacbio_index_pacbioindex-181211_202426034" UniqueId="8044e806-405a-44b5-9539-665ff7dbd394"/>
+                               <pbbase:FileIndex MetaType="PacBio.Index.BamIndex" ResourceId="test_contigs.bam.bai" TimeStampedName="pacbio_index_bamindex-181211_202426034" UniqueId="bb782ecb-a9f4-44a6-be0c-07b3fb6e733e"/>
+                       </pbbase:FileIndices>
+               </pbbase:ExternalResource>
+       </pbbase:ExternalResources>
+       <pbds:Filters>
+               <pbds:Filter>
+                       <pbbase:Properties>
+                               <pbbase:Property Name="rname" Operator="=" Value="contig1"/>
+                       </pbbase:Properties>
+               </pbds:Filter>
+       </pbds:Filters>
+       <pbds:DataSetMetadata>
+               <pbds:TotalLength>0</pbds:TotalLength>
+               <pbds:NumRecords>0</pbds:NumRecords>
+       </pbds:DataSetMetadata>
+</pbds:AlignmentSet>
diff --git a/tests/data/test_GenomicIntervals/disjoint_intervals.alignmentset.xml b/tests/data/test_GenomicIntervals/disjoint_intervals.alignmentset.xml

new file mode 100644 (file)

index 0000000..8fba1be
--- /dev/null
+++ b/tests/data/test_GenomicIntervals/disjoint_intervals.alignmentset.xml
@@ -0,0 +1,31 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<pbds:AlignmentSet CreatedAt="2018-12-11T21:24:26" MetaType="PacBio.DataSet.AlignmentSet" Name="pacbio_dataset_alignmentset-181211_202425989" Tags="" TimeStampedName="pacbio_dataset_alignmentset-181211_202425989" UniqueId="e836d9c0-8b18-4159-fd44-9d49a1400fc5" Version="3.0.1" xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd" xmlns:pbds="http://pacificbiosciences.com/PacBioDatasets.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDataModel.xsd">
+       <pbbase:ExternalResources>
+               <pbbase:ExternalResource MetaType="PacBio.AlignmentFile.AlignmentBamFile" ResourceId="test_contigs.bam" TimeStampedName="pacbio_alignmentfile_alignmentbamfile-181211_202425989" UniqueId="4828832c-c33b-4f0b-a9ec-e9dba2a84e30">
+                       <pbbase:FileIndices>
+                               <pbbase:FileIndex MetaType="PacBio.Index.PacBioIndex" ResourceId="test_contigs.bam.pbi" TimeStampedName="pacbio_index_pacbioindex-181211_202426034" UniqueId="8044e806-405a-44b5-9539-665ff7dbd394"/>
+                               <pbbase:FileIndex MetaType="PacBio.Index.BamIndex" ResourceId="test_contigs.bam.bai" TimeStampedName="pacbio_index_bamindex-181211_202426034" UniqueId="bb782ecb-a9f4-44a6-be0c-07b3fb6e733e"/>
+                       </pbbase:FileIndices>
+               </pbbase:ExternalResource>
+       </pbbase:ExternalResources>
+       <pbds:Filters>
+               <pbds:Filter>
+                       <pbbase:Properties>
+                               <pbbase:Property Name="rname" Operator="=" Value="contig1"/>
+                               <pbbase:Property Name="tstart" Operator="&lt;" Value="7"/>
+                               <pbbase:Property Name="tend" Operator="&gt;" Value="3"/>
+                       </pbbase:Properties>
+               </pbds:Filter>
+               <pbds:Filter>
+                       <pbbase:Properties>
+                               <pbbase:Property Name="rname" Operator="=" Value="contig1"/>
+                               <pbbase:Property Name="tstart" Operator="&lt;" Value="17"/>
+                               <pbbase:Property Name="tend" Operator="&gt;" Value="13"/>
+                       </pbbase:Properties>
+               </pbds:Filter>
+       </pbds:Filters>
+       <pbds:DataSetMetadata>
+               <pbds:TotalLength>0</pbds:TotalLength>
+               <pbds:NumRecords>0</pbds:NumRecords>
+       </pbds:DataSetMetadata>
+</pbds:AlignmentSet>
diff --git a/tests/data/test_GenomicIntervals/empty.alignmentset.xml b/tests/data/test_GenomicIntervals/empty.alignmentset.xml

new file mode 100644 (file)

index 0000000..33967ea
--- /dev/null
+++ b/tests/data/test_GenomicIntervals/empty.alignmentset.xml
@@ -0,0 +1,24 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<pbds:AlignmentSet CreatedAt="2018-12-11T21:24:26" MetaType="PacBio.DataSet.AlignmentSet" Name="pacbio_dataset_alignmentset-181211_202425989" Tags="" TimeStampedName="pacbio_dataset_alignmentset-181211_202425989" UniqueId="e836d9c0-8b18-4159-fd44-9d49a1400fc5" Version="3.0.1" xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd" xmlns:pbds="http://pacificbiosciences.com/PacBioDatasets.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDataModel.xsd">
+       <pbbase:ExternalResources>
+               <pbbase:ExternalResource MetaType="PacBio.AlignmentFile.AlignmentBamFile" ResourceId="test_contigs.bam" TimeStampedName="pacbio_alignmentfile_alignmentbamfile-181211_202425989" UniqueId="4828832c-c33b-4f0b-a9ec-e9dba2a84e30">
+                       <pbbase:FileIndices>
+                               <pbbase:FileIndex MetaType="PacBio.Index.PacBioIndex" ResourceId="test_contigs.bam.pbi" TimeStampedName="pacbio_index_pacbioindex-181211_202426034" UniqueId="8044e806-405a-44b5-9539-665ff7dbd394"/>
+                               <pbbase:FileIndex MetaType="PacBio.Index.BamIndex" ResourceId="test_contigs.bam.bai" TimeStampedName="pacbio_index_bamindex-181211_202426034" UniqueId="bb782ecb-a9f4-44a6-be0c-07b3fb6e733e"/>
+                       </pbbase:FileIndices>
+               </pbbase:ExternalResource>
+       </pbbase:ExternalResources>
+       <pbds:Filters>
+               <pbds:Filter>
+                       <pbbase:Properties>
+                               <pbbase:Property Name="rname" Operator="=" Value="contig1"/>
+                               <pbbase:Property Name="tstart" Operator="&lt;" Value="5"/>
+                               <pbbase:Property Name="tend" Operator="&gt;" Value="5"/>
+                       </pbbase:Properties>
+               </pbds:Filter>
+       </pbds:Filters>
+       <pbds:DataSetMetadata>
+               <pbds:TotalLength>0</pbds:TotalLength>
+               <pbds:NumRecords>0</pbds:NumRecords>
+       </pbds:DataSetMetadata>
+</pbds:AlignmentSet>
diff --git a/tests/data/test_GenomicIntervals/invalid_missing_rname.alignmentset.xml b/tests/data/test_GenomicIntervals/invalid_missing_rname.alignmentset.xml

new file mode 100644 (file)

index 0000000..d24f5b7
--- /dev/null
+++ b/tests/data/test_GenomicIntervals/invalid_missing_rname.alignmentset.xml
@@ -0,0 +1,23 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<pbds:AlignmentSet CreatedAt="2018-12-11T21:24:26" MetaType="PacBio.DataSet.AlignmentSet" Name="pacbio_dataset_alignmentset-181211_202425989" Tags="" TimeStampedName="pacbio_dataset_alignmentset-181211_202425989" UniqueId="e836d9c0-8b18-4159-fd44-9d49a1400fc5" Version="3.0.1" xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd" xmlns:pbds="http://pacificbiosciences.com/PacBioDatasets.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDataModel.xsd">
+       <pbbase:ExternalResources>
+               <pbbase:ExternalResource MetaType="PacBio.AlignmentFile.AlignmentBamFile" ResourceId="test_contigs.bam" TimeStampedName="pacbio_alignmentfile_alignmentbamfile-181211_202425989" UniqueId="4828832c-c33b-4f0b-a9ec-e9dba2a84e30">
+                       <pbbase:FileIndices>
+                               <pbbase:FileIndex MetaType="PacBio.Index.PacBioIndex" ResourceId="test_contigs.bam.pbi" TimeStampedName="pacbio_index_pacbioindex-181211_202426034" UniqueId="8044e806-405a-44b5-9539-665ff7dbd394"/>
+                               <pbbase:FileIndex MetaType="PacBio.Index.BamIndex" ResourceId="test_contigs.bam.bai" TimeStampedName="pacbio_index_bamindex-181211_202426034" UniqueId="bb782ecb-a9f4-44a6-be0c-07b3fb6e733e"/>
+                       </pbbase:FileIndices>
+               </pbbase:ExternalResource>
+       </pbbase:ExternalResources>
+       <pbds:Filters>
+               <pbds:Filter>
+                       <pbbase:Properties>
+                               <pbbase:Property Name="tstart" Operator="&lt;" Value="10"/>
+                               <pbbase:Property Name="tend" Operator="&gt;" Value="3"/>
+                       </pbbase:Properties>
+               </pbds:Filter>
+       </pbds:Filters>
+       <pbds:DataSetMetadata>
+               <pbds:TotalLength>0</pbds:TotalLength>
+               <pbds:NumRecords>0</pbds:NumRecords>
+       </pbds:DataSetMetadata>
+</pbds:AlignmentSet>
diff --git a/tests/data/test_GenomicIntervals/invalid_rname_operator.alignmentset.xml b/tests/data/test_GenomicIntervals/invalid_rname_operator.alignmentset.xml

new file mode 100644 (file)

index 0000000..30f7a8c
--- /dev/null
+++ b/tests/data/test_GenomicIntervals/invalid_rname_operator.alignmentset.xml
@@ -0,0 +1,24 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<pbds:AlignmentSet CreatedAt="2018-12-11T21:24:26" MetaType="PacBio.DataSet.AlignmentSet" Name="pacbio_dataset_alignmentset-181211_202425989" Tags="" TimeStampedName="pacbio_dataset_alignmentset-181211_202425989" UniqueId="e836d9c0-8b18-4159-fd44-9d49a1400fc5" Version="3.0.1" xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd" xmlns:pbds="http://pacificbiosciences.com/PacBioDatasets.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDataModel.xsd">
+       <pbbase:ExternalResources>
+               <pbbase:ExternalResource MetaType="PacBio.AlignmentFile.AlignmentBamFile" ResourceId="test_contigs.bam" TimeStampedName="pacbio_alignmentfile_alignmentbamfile-181211_202425989" UniqueId="4828832c-c33b-4f0b-a9ec-e9dba2a84e30">
+                       <pbbase:FileIndices>
+                               <pbbase:FileIndex MetaType="PacBio.Index.PacBioIndex" ResourceId="test_contigs.bam.pbi" TimeStampedName="pacbio_index_pacbioindex-181211_202426034" UniqueId="8044e806-405a-44b5-9539-665ff7dbd394"/>
+                               <pbbase:FileIndex MetaType="PacBio.Index.BamIndex" ResourceId="test_contigs.bam.bai" TimeStampedName="pacbio_index_bamindex-181211_202426034" UniqueId="bb782ecb-a9f4-44a6-be0c-07b3fb6e733e"/>
+                       </pbbase:FileIndices>
+               </pbbase:ExternalResource>
+       </pbbase:ExternalResources>
+       <pbds:Filters>
+               <pbds:Filter>
+                       <pbbase:Properties>
+                               <pbbase:Property Name="rname" Operator="&lt;" Value="contig1"/>
+                               <pbbase:Property Name="tstart" Operator="&lt;" Value="10"/>
+                               <pbbase:Property Name="tend" Operator="&gt;" Value="3"/>
+                       </pbbase:Properties>
+               </pbds:Filter>
+       </pbds:Filters>
+       <pbds:DataSetMetadata>
+               <pbds:TotalLength>0</pbds:TotalLength>
+               <pbds:NumRecords>0</pbds:NumRecords>
+       </pbds:DataSetMetadata>
+</pbds:AlignmentSet>
diff --git a/tests/data/test_GenomicIntervals/invalid_tstart_operator.alignmentset.xml b/tests/data/test_GenomicIntervals/invalid_tstart_operator.alignmentset.xml

new file mode 100644 (file)

index 0000000..72e2f92
--- /dev/null
+++ b/tests/data/test_GenomicIntervals/invalid_tstart_operator.alignmentset.xml
@@ -0,0 +1,24 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<pbds:AlignmentSet CreatedAt="2018-12-11T21:24:26" MetaType="PacBio.DataSet.AlignmentSet" Name="pacbio_dataset_alignmentset-181211_202425989" Tags="" TimeStampedName="pacbio_dataset_alignmentset-181211_202425989" UniqueId="e836d9c0-8b18-4159-fd44-9d49a1400fc5" Version="3.0.1" xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd" xmlns:pbds="http://pacificbiosciences.com/PacBioDatasets.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDataModel.xsd">
+       <pbbase:ExternalResources>
+               <pbbase:ExternalResource MetaType="PacBio.AlignmentFile.AlignmentBamFile" ResourceId="test_contigs.bam" TimeStampedName="pacbio_alignmentfile_alignmentbamfile-181211_202425989" UniqueId="4828832c-c33b-4f0b-a9ec-e9dba2a84e30">
+                       <pbbase:FileIndices>
+                               <pbbase:FileIndex MetaType="PacBio.Index.PacBioIndex" ResourceId="test_contigs.bam.pbi" TimeStampedName="pacbio_index_pacbioindex-181211_202426034" UniqueId="8044e806-405a-44b5-9539-665ff7dbd394"/>
+                               <pbbase:FileIndex MetaType="PacBio.Index.BamIndex" ResourceId="test_contigs.bam.bai" TimeStampedName="pacbio_index_bamindex-181211_202426034" UniqueId="bb782ecb-a9f4-44a6-be0c-07b3fb6e733e"/>
+                       </pbbase:FileIndices>
+               </pbbase:ExternalResource>
+       </pbbase:ExternalResources>
+       <pbds:Filters>
+               <pbds:Filter>
+                       <pbbase:Properties>
+                               <pbbase:Property Name="rname" Operator="=" Value="contig1"/>
+                               <pbbase:Property Name="tstart" Operator="eq" Value="10"/>
+                               <pbbase:Property Name="tend" Operator="&gt;" Value="3"/>
+                       </pbbase:Properties>
+               </pbds:Filter>
+       </pbds:Filters>
+       <pbds:DataSetMetadata>
+               <pbds:TotalLength>0</pbds:TotalLength>
+               <pbds:NumRecords>0</pbds:NumRecords>
+       </pbds:DataSetMetadata>
+</pbds:AlignmentSet>
diff --git a/tests/data/test_GenomicIntervals/no_filter.alignmentset.xml b/tests/data/test_GenomicIntervals/no_filter.alignmentset.xml

new file mode 100644 (file)

index 0000000..71a0a76
--- /dev/null
+++ b/tests/data/test_GenomicIntervals/no_filter.alignmentset.xml
@@ -0,0 +1,15 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<pbds:AlignmentSet CreatedAt="2018-12-11T21:24:26" MetaType="PacBio.DataSet.AlignmentSet" Name="pacbio_dataset_alignmentset-181211_202425989" Tags="" TimeStampedName="pacbio_dataset_alignmentset-181211_202425989" UniqueId="e836d9c0-8b18-4159-fd44-9d49a1400fc5" Version="3.0.1" xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd" xmlns:pbds="http://pacificbiosciences.com/PacBioDatasets.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDataModel.xsd">
+       <pbbase:ExternalResources>
+               <pbbase:ExternalResource MetaType="PacBio.AlignmentFile.AlignmentBamFile" ResourceId="test_contigs.bam" TimeStampedName="pacbio_alignmentfile_alignmentbamfile-181211_202425989" UniqueId="4828832c-c33b-4f0b-a9ec-e9dba2a84e30">
+                       <pbbase:FileIndices>
+                               <pbbase:FileIndex MetaType="PacBio.Index.PacBioIndex" ResourceId="test_contigs.bam.pbi" TimeStampedName="pacbio_index_pacbioindex-181211_202426034" UniqueId="8044e806-405a-44b5-9539-665ff7dbd394"/>
+                               <pbbase:FileIndex MetaType="PacBio.Index.BamIndex" ResourceId="test_contigs.bam.bai" TimeStampedName="pacbio_index_bamindex-181211_202426034" UniqueId="bb782ecb-a9f4-44a6-be0c-07b3fb6e733e"/>
+                       </pbbase:FileIndices>
+               </pbbase:ExternalResource>
+       </pbbase:ExternalResources>
+       <pbds:DataSetMetadata>
+               <pbds:TotalLength>0</pbds:TotalLength>
+               <pbds:NumRecords>0</pbds:NumRecords>
+       </pbds:DataSetMetadata>
+</pbds:AlignmentSet>
diff --git a/tests/data/test_GenomicIntervals/out_of_range.alignmentset.xml b/tests/data/test_GenomicIntervals/out_of_range.alignmentset.xml

new file mode 100644 (file)

index 0000000..61cd5c0
--- /dev/null
+++ b/tests/data/test_GenomicIntervals/out_of_range.alignmentset.xml
@@ -0,0 +1,24 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<pbds:AlignmentSet CreatedAt="2018-12-11T21:24:26" MetaType="PacBio.DataSet.AlignmentSet" Name="pacbio_dataset_alignmentset-181211_202425989" Tags="" TimeStampedName="pacbio_dataset_alignmentset-181211_202425989" UniqueId="e836d9c0-8b18-4159-fd44-9d49a1400fc5" Version="3.0.1" xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd" xmlns:pbds="http://pacificbiosciences.com/PacBioDatasets.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDataModel.xsd">
+       <pbbase:ExternalResources>
+               <pbbase:ExternalResource MetaType="PacBio.AlignmentFile.AlignmentBamFile" ResourceId="test_contigs.bam" TimeStampedName="pacbio_alignmentfile_alignmentbamfile-181211_202425989" UniqueId="4828832c-c33b-4f0b-a9ec-e9dba2a84e30">
+                       <pbbase:FileIndices>
+                               <pbbase:FileIndex MetaType="PacBio.Index.PacBioIndex" ResourceId="test_contigs.bam.pbi" TimeStampedName="pacbio_index_pacbioindex-181211_202426034" UniqueId="8044e806-405a-44b5-9539-665ff7dbd394"/>
+                               <pbbase:FileIndex MetaType="PacBio.Index.BamIndex" ResourceId="test_contigs.bam.bai" TimeStampedName="pacbio_index_bamindex-181211_202426034" UniqueId="bb782ecb-a9f4-44a6-be0c-07b3fb6e733e"/>
+                       </pbbase:FileIndices>
+               </pbbase:ExternalResource>
+       </pbbase:ExternalResources>
+       <pbds:Filters>
+               <pbds:Filter>
+                       <pbbase:Properties>
+                               <pbbase:Property Name="rname" Operator="=" Value="contig1"/>
+                               <pbbase:Property Name="tstart" Operator="&lt;" Value="10000"/>
+                               <pbbase:Property Name="tend" Operator="&gt;" Value="1000"/>
+                       </pbbase:Properties>
+               </pbds:Filter>
+       </pbds:Filters>
+       <pbds:DataSetMetadata>
+               <pbds:TotalLength>0</pbds:TotalLength>
+               <pbds:NumRecords>0</pbds:NumRecords>
+       </pbds:DataSetMetadata>
+</pbds:AlignmentSet>
diff --git a/tests/data/test_GenomicIntervals/single_interval.alignmentset.xml b/tests/data/test_GenomicIntervals/single_interval.alignmentset.xml

new file mode 100644 (file)

index 0000000..fbcfb88
--- /dev/null
+++ b/tests/data/test_GenomicIntervals/single_interval.alignmentset.xml
@@ -0,0 +1,24 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<pbds:AlignmentSet CreatedAt="2018-12-11T21:24:26" MetaType="PacBio.DataSet.AlignmentSet" Name="pacbio_dataset_alignmentset-181211_202425989" Tags="" TimeStampedName="pacbio_dataset_alignmentset-181211_202425989" UniqueId="e836d9c0-8b18-4159-fd44-9d49a1400fc5" Version="3.0.1" xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd" xmlns:pbds="http://pacificbiosciences.com/PacBioDatasets.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDataModel.xsd">
+       <pbbase:ExternalResources>
+               <pbbase:ExternalResource MetaType="PacBio.AlignmentFile.AlignmentBamFile" ResourceId="test_contigs.bam" TimeStampedName="pacbio_alignmentfile_alignmentbamfile-181211_202425989" UniqueId="4828832c-c33b-4f0b-a9ec-e9dba2a84e30">
+                       <pbbase:FileIndices>
+                               <pbbase:FileIndex MetaType="PacBio.Index.PacBioIndex" ResourceId="test_contigs.bam.pbi" TimeStampedName="pacbio_index_pacbioindex-181211_202426034" UniqueId="8044e806-405a-44b5-9539-665ff7dbd394"/>
+                               <pbbase:FileIndex MetaType="PacBio.Index.BamIndex" ResourceId="test_contigs.bam.bai" TimeStampedName="pacbio_index_bamindex-181211_202426034" UniqueId="bb782ecb-a9f4-44a6-be0c-07b3fb6e733e"/>
+                       </pbbase:FileIndices>
+               </pbbase:ExternalResource>
+       </pbbase:ExternalResources>
+       <pbds:Filters>
+               <pbds:Filter>
+                       <pbbase:Properties>
+                               <pbbase:Property Name="rname" Operator="=" Value="contig1"/>
+                               <pbbase:Property Name="tstart" Operator="&lt;" Value="10"/>
+                               <pbbase:Property Name="tend" Operator="&gt;" Value="3"/>
+                       </pbbase:Properties>
+               </pbds:Filter>
+       </pbds:Filters>
+       <pbds:DataSetMetadata>
+               <pbds:TotalLength>0</pbds:TotalLength>
+               <pbds:NumRecords>0</pbds:NumRecords>
+       </pbds:DataSetMetadata>
+</pbds:AlignmentSet>
diff --git a/tests/data/test_GenomicIntervals/single_interval_end_gte.alignmentset.xml b/tests/data/test_GenomicIntervals/single_interval_end_gte.alignmentset.xml

new file mode 100644 (file)

index 0000000..1585065
--- /dev/null
+++ b/tests/data/test_GenomicIntervals/single_interval_end_gte.alignmentset.xml
@@ -0,0 +1,24 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<pbds:AlignmentSet CreatedAt="2018-12-11T21:24:26" MetaType="PacBio.DataSet.AlignmentSet" Name="pacbio_dataset_alignmentset-181211_202425989" Tags="" TimeStampedName="pacbio_dataset_alignmentset-181211_202425989" UniqueId="e836d9c0-8b18-4159-fd44-9d49a1400fc5" Version="3.0.1" xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd" xmlns:pbds="http://pacificbiosciences.com/PacBioDatasets.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDataModel.xsd">
+       <pbbase:ExternalResources>
+               <pbbase:ExternalResource MetaType="PacBio.AlignmentFile.AlignmentBamFile" ResourceId="test_contigs.bam" TimeStampedName="pacbio_alignmentfile_alignmentbamfile-181211_202425989" UniqueId="4828832c-c33b-4f0b-a9ec-e9dba2a84e30">
+                       <pbbase:FileIndices>
+                               <pbbase:FileIndex MetaType="PacBio.Index.PacBioIndex" ResourceId="test_contigs.bam.pbi" TimeStampedName="pacbio_index_pacbioindex-181211_202426034" UniqueId="8044e806-405a-44b5-9539-665ff7dbd394"/>
+                               <pbbase:FileIndex MetaType="PacBio.Index.BamIndex" ResourceId="test_contigs.bam.bai" TimeStampedName="pacbio_index_bamindex-181211_202426034" UniqueId="bb782ecb-a9f4-44a6-be0c-07b3fb6e733e"/>
+                       </pbbase:FileIndices>
+               </pbbase:ExternalResource>
+       </pbbase:ExternalResources>
+       <pbds:Filters>
+               <pbds:Filter>
+                       <pbbase:Properties>
+                               <pbbase:Property Name="rname" Operator="=" Value="contig1"/>
+                               <pbbase:Property Name="tstart" Operator="&lt;" Value="10"/>
+                               <pbbase:Property Name="tend" Operator="&gt;=" Value="3"/>
+                       </pbbase:Properties>
+               </pbds:Filter>
+       </pbds:Filters>
+       <pbds:DataSetMetadata>
+               <pbds:TotalLength>0</pbds:TotalLength>
+               <pbds:NumRecords>0</pbds:NumRecords>
+       </pbds:DataSetMetadata>
+</pbds:AlignmentSet>
diff --git a/tests/data/test_GenomicIntervals/single_interval_start_lte.alignmentset.xml b/tests/data/test_GenomicIntervals/single_interval_start_lte.alignmentset.xml

new file mode 100644 (file)

index 0000000..b47bb68
--- /dev/null
+++ b/tests/data/test_GenomicIntervals/single_interval_start_lte.alignmentset.xml
@@ -0,0 +1,24 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<pbds:AlignmentSet CreatedAt="2018-12-11T21:24:26" MetaType="PacBio.DataSet.AlignmentSet" Name="pacbio_dataset_alignmentset-181211_202425989" Tags="" TimeStampedName="pacbio_dataset_alignmentset-181211_202425989" UniqueId="e836d9c0-8b18-4159-fd44-9d49a1400fc5" Version="3.0.1" xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd" xmlns:pbds="http://pacificbiosciences.com/PacBioDatasets.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDataModel.xsd">
+       <pbbase:ExternalResources>
+               <pbbase:ExternalResource MetaType="PacBio.AlignmentFile.AlignmentBamFile" ResourceId="test_contigs.bam" TimeStampedName="pacbio_alignmentfile_alignmentbamfile-181211_202425989" UniqueId="4828832c-c33b-4f0b-a9ec-e9dba2a84e30">
+                       <pbbase:FileIndices>
+                               <pbbase:FileIndex MetaType="PacBio.Index.PacBioIndex" ResourceId="test_contigs.bam.pbi" TimeStampedName="pacbio_index_pacbioindex-181211_202426034" UniqueId="8044e806-405a-44b5-9539-665ff7dbd394"/>
+                               <pbbase:FileIndex MetaType="PacBio.Index.BamIndex" ResourceId="test_contigs.bam.bai" TimeStampedName="pacbio_index_bamindex-181211_202426034" UniqueId="bb782ecb-a9f4-44a6-be0c-07b3fb6e733e"/>
+                       </pbbase:FileIndices>
+               </pbbase:ExternalResource>
+       </pbbase:ExternalResources>
+       <pbds:Filters>
+               <pbds:Filter>
+                       <pbbase:Properties>
+                               <pbbase:Property Name="rname" Operator="=" Value="contig1"/>
+                               <pbbase:Property Name="tstart" Operator="&lt;=" Value="10"/>
+                               <pbbase:Property Name="tend" Operator="&gt;" Value="3"/>
+                       </pbbase:Properties>
+               </pbds:Filter>
+       </pbds:Filters>
+       <pbds:DataSetMetadata>
+               <pbds:TotalLength>0</pbds:TotalLength>
+               <pbds:NumRecords>0</pbds:NumRecords>
+       </pbds:DataSetMetadata>
+</pbds:AlignmentSet>
diff --git a/tests/data/test_GenomicIntervals/test_contigs.bam b/tests/data/test_GenomicIntervals/test_contigs.bam

new file mode 100644 (file)

index 0000000..b1958a7

Binary files /dev/null and b/tests/data/test_GenomicIntervals/test_contigs.bam differ
diff --git a/tests/data/test_GenomicIntervals/test_contigs.bam.pbi b/tests/data/test_GenomicIntervals/test_contigs.bam.pbi

new file mode 100644 (file)

index 0000000..2b593d1

Binary files /dev/null and b/tests/data/test_GenomicIntervals/test_contigs.bam.pbi differ
diff --git a/tests/data/test_GenomicIntervals/test_contigs.fasta b/tests/data/test_GenomicIntervals/test_contigs.fasta

new file mode 100644 (file)

index 0000000..19d40af
--- /dev/null
+++ b/tests/data/test_GenomicIntervals/test_contigs.fasta
@@ -0,0 +1,4 @@
+>contig1
+CTGTTATCTCGAGCGTTATG
+>contig2
+TGTCAAATGG
diff --git a/tests/data/test_GenomicIntervals/two_contigs.alignmentset.xml b/tests/data/test_GenomicIntervals/two_contigs.alignmentset.xml

new file mode 100644 (file)

index 0000000..baac8bb
--- /dev/null
+++ b/tests/data/test_GenomicIntervals/two_contigs.alignmentset.xml
@@ -0,0 +1,31 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<pbds:AlignmentSet CreatedAt="2018-12-11T21:24:26" MetaType="PacBio.DataSet.AlignmentSet" Name="pacbio_dataset_alignmentset-181211_202425989" Tags="" TimeStampedName="pacbio_dataset_alignmentset-181211_202425989" UniqueId="e836d9c0-8b18-4159-fd44-9d49a1400fc5" Version="3.0.1" xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd" xmlns:pbds="http://pacificbiosciences.com/PacBioDatasets.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDataModel.xsd">
+       <pbbase:ExternalResources>
+               <pbbase:ExternalResource MetaType="PacBio.AlignmentFile.AlignmentBamFile" ResourceId="test_contigs.bam" TimeStampedName="pacbio_alignmentfile_alignmentbamfile-181211_202425989" UniqueId="4828832c-c33b-4f0b-a9ec-e9dba2a84e30">
+                       <pbbase:FileIndices>
+                               <pbbase:FileIndex MetaType="PacBio.Index.PacBioIndex" ResourceId="test_contigs.bam.pbi" TimeStampedName="pacbio_index_pacbioindex-181211_202426034" UniqueId="8044e806-405a-44b5-9539-665ff7dbd394"/>
+                               <pbbase:FileIndex MetaType="PacBio.Index.BamIndex" ResourceId="test_contigs.bam.bai" TimeStampedName="pacbio_index_bamindex-181211_202426034" UniqueId="bb782ecb-a9f4-44a6-be0c-07b3fb6e733e"/>
+                       </pbbase:FileIndices>
+               </pbbase:ExternalResource>
+       </pbbase:ExternalResources>
+       <pbds:Filters>
+               <pbds:Filter>
+                       <pbbase:Properties>
+                               <pbbase:Property Name="rname" Operator="=" Value="contig1"/>
+                               <pbbase:Property Name="tstart" Operator="&lt;=" Value="10"/>
+                               <pbbase:Property Name="tend" Operator="&gt;" Value="3"/>
+                       </pbbase:Properties>
+               </pbds:Filter>
+               <pbds:Filter>
+                       <pbbase:Properties>
+                               <pbbase:Property Name="rname" Operator="=" Value="contig2"/>
+                               <pbbase:Property Name="tstart" Operator="&lt;" Value="7"/>
+                               <pbbase:Property Name="tend" Operator="&gt;=" Value="3"/>
+                       </pbbase:Properties>
+               </pbds:Filter>
+       </pbds:Filters>
+       <pbds:DataSetMetadata>
+               <pbds:TotalLength>0</pbds:TotalLength>
+               <pbds:NumRecords>0</pbds:NumRecords>
+       </pbds:DataSetMetadata>
+</pbds:AlignmentSet>
diff --git a/tests/data/test_GenomicIntervals/whole_contig.alignmentset.xml b/tests/data/test_GenomicIntervals/whole_contig.alignmentset.xml

new file mode 100644 (file)

index 0000000..f272358
--- /dev/null
+++ b/tests/data/test_GenomicIntervals/whole_contig.alignmentset.xml
@@ -0,0 +1,24 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<pbds:AlignmentSet CreatedAt="2018-12-11T21:24:26" MetaType="PacBio.DataSet.AlignmentSet" Name="pacbio_dataset_alignmentset-181211_202425989" Tags="" TimeStampedName="pacbio_dataset_alignmentset-181211_202425989" UniqueId="e836d9c0-8b18-4159-fd44-9d49a1400fc5" Version="3.0.1" xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd" xmlns:pbds="http://pacificbiosciences.com/PacBioDatasets.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDataModel.xsd">
+       <pbbase:ExternalResources>
+               <pbbase:ExternalResource MetaType="PacBio.AlignmentFile.AlignmentBamFile" ResourceId="test_contigs.bam" TimeStampedName="pacbio_alignmentfile_alignmentbamfile-181211_202425989" UniqueId="4828832c-c33b-4f0b-a9ec-e9dba2a84e30">
+                       <pbbase:FileIndices>
+                               <pbbase:FileIndex MetaType="PacBio.Index.PacBioIndex" ResourceId="test_contigs.bam.pbi" TimeStampedName="pacbio_index_pacbioindex-181211_202426034" UniqueId="8044e806-405a-44b5-9539-665ff7dbd394"/>
+                               <pbbase:FileIndex MetaType="PacBio.Index.BamIndex" ResourceId="test_contigs.bam.bai" TimeStampedName="pacbio_index_bamindex-181211_202426034" UniqueId="bb782ecb-a9f4-44a6-be0c-07b3fb6e733e"/>
+                       </pbbase:FileIndices>
+               </pbbase:ExternalResource>
+       </pbbase:ExternalResources>
+       <pbds:Filters>
+               <pbds:Filter>
+                       <pbbase:Properties>
+                               <pbbase:Property Name="rname" Operator="=" Value="contig1"/>
+                               <pbbase:Property Name="tstart" Operator="&lt;" Value="20"/>
+                               <pbbase:Property Name="tend" Operator="&gt;" Value="0"/>
+                       </pbbase:Properties>
+               </pbds:Filter>
+       </pbds:Filters>
+       <pbds:DataSetMetadata>
+               <pbds:TotalLength>0</pbds:TotalLength>
+               <pbds:NumRecords>0</pbds:NumRecords>
+       </pbds:DataSetMetadata>
+</pbds:AlignmentSet>
diff --git a/tests/data/test_group_query/group.fofn.in b/tests/data/test_group_query/group.fofn.in

new file mode 100644 (file)

index 0000000..4af9e82
--- /dev/null
+++ b/tests/data/test_group_query/group.fofn.in
@@ -0,0 +1,3 @@
+@PacBioBAM_TestsDir@/data/test_group_query/test1.bam
+@PacBioBAM_TestsDir@/data/test_group_query/test2.bam
+@PacBioBAM_TestsDir@/data/test_group_query/test3.bam
diff --git a/tests/data/test_group_query/test1.bam b/tests/data/test_group_query/test1.bam

new file mode 100644 (file)

index 0000000..5673abc

Binary files /dev/null and b/tests/data/test_group_query/test1.bam differ
diff --git a/tests/data/test_group_query/test2.bam b/tests/data/test_group_query/test2.bam

new file mode 100644 (file)

index 0000000..565b224

Binary files /dev/null and b/tests/data/test_group_query/test2.bam differ
diff --git a/tests/data/test_group_query/test2.bam.pbi b/tests/data/test_group_query/test2.bam.pbi

new file mode 100644 (file)

index 0000000..384ad28

Binary files /dev/null and b/tests/data/test_group_query/test2.bam.pbi differ
diff --git a/tests/data/test_group_query/test3.bam b/tests/data/test_group_query/test3.bam

new file mode 100644 (file)

index 0000000..3b1e21b

Binary files /dev/null and b/tests/data/test_group_query/test3.bam differ
diff --git a/tests/data/transcript.subreads.bam b/tests/data/transcript.subreads.bam

new file mode 100644 (file)

index 0000000..13f5efd

Binary files /dev/null and b/tests/data/transcript.subreads.bam differ
diff --git a/tests/data/transcript.subreads.bam.pbi b/tests/data/transcript.subreads.bam.pbi

new file mode 100644 (file)

index 0000000..2729f5d

Binary files /dev/null and b/tests/data/transcript.subreads.bam.pbi differ
diff --git a/tests/data/transcriptset.xml b/tests/data/transcriptset.xml

new file mode 100644 (file)

index 0000000..60e68c7
--- /dev/null
+++ b/tests/data/transcriptset.xml
@@ -0,0 +1,21 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<pbds:TranscriptSet CreatedAt="2018-04-20T17:33:59.218Z" MetaType="PacBio.DataSet.TranscriptSet" Name="2016-08-30_AppsInst_SMS_Flea_IsoSeq_RC0_largeSizeTranscripts_2 (P5--P3) (unpolished)" Tags="barcoded,unpolished" TimeStampedName="2016-08-30_AppsInst_SMS_Flea_IsoSeq_RC0_largeSizeTranscripts_2 (P5--P3) (unpolished)-180420_173359218" UniqueId="66b978ae-27bb-a9e9-abde-5a4e00d256e3" Version="3.0.1" xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd" xmlns:pbds="http://pacificbiosciences.com/PacBioDatasets.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDatasets.xsd">
+       <pbbase:ExternalResources>
+               <pbbase:ExternalResource MetaType="PacBio.TranscriptReadFile.TranscriptReadBamFile" ResourceId="./transcript.subreads.bam" TimeStampedName="pacbio_transcriptreadfile_transcriptreadbamfile-180420_173359218" UniqueId="47874301-b8c8-415e-bcb9-d547a8527a78" Version="3.0.1">
+                       <pbbase:FileIndices>
+                               <pbbase:FileIndex MetaType="PacBio.Index.PacBioIndex" ResourceId="./transcript.subreads.bam.pbi" TimeStampedName="pacbio_index_pacbioindex-180420_173359218" UniqueId="097702db-54b9-4b7b-aa87-f21bfb820097" Version="3.0.1"/>
+                       </pbbase:FileIndices>
+               </pbbase:ExternalResource>
+       </pbbase:ExternalResources>
+       <pbds:Filters>
+               <pbds:Filter>
+                       <pbbase:Properties>
+                               <pbbase:Property Name="movie" Operator="=" Value="transcript"/>
+                       </pbbase:Properties>
+               </pbds:Filter>
+       </pbds:Filters>
+       <pbds:DataSetMetadata>
+               <pbds:TotalLength>0</pbds:TotalLength>
+               <pbds:NumRecords>4</pbds:NumRecords>
+       </pbds:DataSetMetadata>
+</pbds:TranscriptSet>
diff --git a/tests/data/unmap1.bam b/tests/data/unmap1.bam

new file mode 100644 (file)

index 0000000..3fe2af5

Binary files /dev/null and b/tests/data/unmap1.bam differ
diff --git a/tests/data/unmap1.bam.bai b/tests/data/unmap1.bam.bai

new file mode 100644 (file)

index 0000000..dd19971

Binary files /dev/null and b/tests/data/unmap1.bam.bai differ
diff --git a/tests/data/unmap2.bam b/tests/data/unmap2.bam

new file mode 100644 (file)

index 0000000..8feed79

Binary files /dev/null and b/tests/data/unmap2.bam differ
diff --git a/tests/data/unmap2.bam.bai b/tests/data/unmap2.bam.bai

new file mode 100644 (file)

index 0000000..f495714

Binary files /dev/null and b/tests/data/unmap2.bam.bai differ
diff --git a/tests/data/vcf/structural_variants.vcf b/tests/data/vcf/structural_variants.vcf

new file mode 100644 (file)

index 0000000..8a05b60
--- /dev/null
+++ b/tests/data/vcf/structural_variants.vcf
@@ -0,0 +1,32 @@
+##fileformat=VCFv4.2
+##fileDate=20180509
+##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description="Imprecise structural variant">
+##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">
+##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the structural variant described in this record">
+##INFO=<ID=SVLEN,Number=.,Type=Integer,Description="Difference in length between REF and ALT alleles">
+##INFO=<ID=SVANN,Number=.,Type=String,Description="Repeat annotation of structural variant">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=AD,Number=1,Type=Integer,Description="Per-sample read depth of this structural variant">
+##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read depth at this position for this sample">
+#CHROM POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  UnnamedSample
+chrI   12573   pbsv.INS.1      A       ATGGGTAACAGGTGGTAATGAAGACGTAATTTCTGACTTGTTGATTGTATGGAAAGGTGGTTAATGATGAAGTAATTTCTGATTGGTGTTGTATGGTAAACAGGTGGGTAATGAAGAAGTATTTCGGACTTTGTTGCCACGGTAACGGTGGAAGATGAAGTAAGTAATTTCATGACTTGTTGTTGTCTTGTACTGGGTAACAAGGTAGGTAATGATGAAGTAATTTCTGACTTGTTGTTTACTGGTAACAGGTGGTACTGAAGAAAGTAATGCCTGACTGTTGTTGCACGGGTAACCGGTGGTAATGATGGAAGTTGAAGTAATTCCTGATTGTTGTTGTACTGGAACAGGGTGTAAGAAGAAAGAAGTAATTTCCTGACTTGTTGTTGTA .       PASS    IMPRECISE;SVTYPE=INS;END=12573;SVLEN=390;SVANN=TANDEM   GT:AD:DP        1/1:13:13
+chrIII 91471   pbsv.DEL.2      GCTATCGATGCTACAGGTGTTCCACTTCCAGATGAGGCGCTGGAAGCCTCCAAGAAGGCTGATGCCGTTTTGTTAGGTGCTGTGGGTGGTCCTAAATGGGGTACCGGTAGTGTTAGACCTGAACAAGGTTTACTAAAAATCCGTAAAGAACTTCAATTGTACGCCAACTTAAGACCATGTAACTTTGCATCCGACTCTCTTTTAGACTTATCTCCAATCAAGCCACAATTTGCTAAAGGTACTGACTTCGTTGTTGTCAGAGAATTAGTGGGAGGTATTTACTTTGGTAAGAGAAAGGAAGACGATGGTGATGGTGTCGCTTGGGATAGTGAACAATACACCGTTCCAGAAGTGCAAAGAATCACAAGAATGGCCGCTTTCATGGCCCTACAACATGAGCCACCATTGCCTATTTGGTCCTTGGATAAAGCTAATGTTTTGGCCTCTTCAAGATTATGGAGAAAAACTGTGGAGGAAACCATC     G       .       PASS    IMPRECISE;SVTYPE=DEL;END=91953;SVLEN=-482       GT:AD:DP        0/1:2:5
+chrIII 169209  pbsv.INS.3      T       TTGCCACACGGTGACTGGTTATTCAATAGCGGTATAGCTTCACTGACTGCGTGTTTTCTGCCTTCTGTAGTTTGAAGTGCCTGTTAACGAGCCCTTTGTCGACTTTCCTCCAATCACTTTTCCGTTGAGTAGGAAAATGTTCCCAATTTGTGACTTGTAATATGGTTTTTACCATATGACAGCATCGCTTTGCGACTAGTTTATTATCTGTTGGTAGGTTTGTTTTGTGCCATATACTTGTTTATCTCTAGTGTCCCACTGAATTGTATTACACTCATATGTCATGTCTAAAACTTGCCTAAGGGGAAAGTATTGTTGAGCAAGTGTGTTTGATGTAGTATATAAGTCAATCTACATGTTTATATCCAACATATGAATCTAGACCCAATCAACTTTTGCATTTTCATGTACCTTCTCTTTGTATTCATCTCATCTCTTTCAGTTCATCCTGTCATATAAAACCTGTTGCCCTGAGCGCTAATTTTTCTTCCTTTTGGAGTCAAAGTACGTTTAATTTGGGGTAATTTTCTCAGTTAATGAGTTTTCCATACCTAAATTTCATGTATTTACCTCTTTGATATTCGTTTCTAAGGCCAAGTATGTATATTGCATTTCCTCATCACTTCGCCCTAGATTTATAATCTTGGTGTCGTATTGCCTCTTAAGCTTTCTTCTCTAAATTCTTTGTTTGAATTTATATTTTATGCTAAACAATACCATACATATCATCCTACCGACTAAACAAATTGTCACTTGACTGTTTTTTAAATACGCAATGACCATCCACGAACTTCTTCCCTACCAATTGTTGTTCAGGTAATGATTTGATAGTTTCGTACCGTTCGCTCCACTTTGTTTTCAATCCATAAAGTTGATTTCTTTCTTCAAACGTATAACTTATCATTCATCCTAATGTGGTGGAGGTCTTATGTATATTCTTCTTTATTCCTGCATACAAATATGCCGAAGATATGTCTAATTGGTACTATAGTAGTTATTGTCTAAGCAAGTGACAGGGTGTCATTAATCTAGTGAGTACGGGTATTGGATTGCATGCCTGAGTCGTAAGTGTCGGATGCTGAATATCACCTCTTGCAACAAATCTAGCTTTATGAGGTACCAGTCACGTTTCTTGTTAAGATAAAACTGAGTTTATTACTCTTTTAGGGTCTATTTCGTTCTGTCCATAATATTCGTCAGTGTCCCCAAGTTTTCACTTCAACAGTTTGATTGACGTCTTCTTTGTGGTTATGCCTCGATCTATTTTTCTTTTTCTTTAATATCTTTATTATAGGTGATTGCCCTCATCGTATCGTAAGGTTGTCCGTATTGGTTTGATTGATTTTACTGCTTTTAACAGCTGGCAATCGGGAATTCGTTTCTCTCTCGATCTCGGAGGTTCTAACTACGGCATTTCTAGTATTCCCATGTGTCTCGTGATACCTTACTTTCAGTTTCTTATTTCTAATGATCTTTTCTTCTCTTTTCTTACTGTTGATAGTAGTATAGGCCATTAGCGTCACCATACCACCCACACTGGAATTAGTTTGACGAGAATTTATCGGTGGGAGTTTCTTTACATGGGTCAGGGACTTCGGTAAGGAGATTCTGGAGGTAGATCAGGAGTGGGAGATCAGCGATGATAGATTCTCGGTATTCTGTTTAGAAACATGTTGGCGTTTTGATAGGACAATATTGGTGCGATGAATTTTTTCCCGGTGGAGAAGCATCGATTGAAGGTGAAACGGTGGGGTATAATCCCTTTTTCTCAGTCTCTTCTTGGTCACTTATCTTGCGGACATTTTGTTGTTGGTAACCCCGTACTGGATATTGGTACGTTTGTATGATTAGTCCCATTTTCACTGTACGAGTTCTGAATGTCGAAATCTTAGATTACTGGCGTGCGACGACCATGTGTGTTAGATTGGACATGGGAGCAAGTAAAGGAACATTTAATTTA
TGCCATACCACCCAACCGGTACTTACTCTCGATATTGGAAATTTGGGGGGTGCTAGATCTCTTCTTTGAATGGAAGCAATATTAAGATTCAATAGTGGGGGTCCAACTCTCTGGTGCAACAAGATTGATTTCAGATATAGGTTGGAATCTTCAGTATGAGTTGACGGAGGTGTGGAATCGGTTGGACTCACAGCTTTTGACAGACATTTCTCGGTTGCCTCAGGATGTAGTTCAATGTCAGATTGGAAGTCAGGTCAGATTCTAATGTAAAAAACATTGGATCTTGATCTCCTTTGACGCAATACGAATGATATGAAAGTCATTTCAACGGTTTAAGTCTTCATCGAAAGTGAGTGCTCGTAAGTGAATGATCTTCTGGATTCCTTGCCCTGAAGAATAACATGTAGTTGTCTCTACTGTCTTCTTTAAGGAGGAAGATAGATGAATATATCCATAAGATTTCAGACGGATGTAGAGCTTAACCTGGATGCCACGAGGGATGTATTTGGAGTTAGGGTTGTATCATTGACTATAACAGGTTGACCGAAGTAACACTACTGATATCAAGTCCTCCAAGCCAGCATGTGGCTTGGCAGATTTTTTGCTTTAGGTGAAGCTAAGTGACTTCCTCACAATAGTAGAAAATCGATTGGCAGAAACCATAAATGGTTCGGTAAACCACTACCATGCAGTTGAGTAACGGCAGTCATCTAATAAGGTACTGTTGAGGCCGTTCAGCGACTCCAGTCTCGGGATCCGCTGTGGGTTGTATAGCAATGGATTTACCATTTTTTTCAAGGAATTTATGGAGAGTTCTGATAGTATACTCAGAACCACGGTCCTTTGTATAACCAAGACACTGCCTGAATTGGTTTTAATGAAAGCTAGTAATCGTGGTAAATAAAATCGAGGTATAGTCCTCCGCGACGGTCGTGCAATGGATCAACCCAACGGAATTTTGTTGTCTCATCAGACTGAGATGAAATAGGATGGTGCACTTTTTTGGTAGGTTGTGACTGGACCAAATATGTCATAGTAGGTATTGAAGGGTTCGTAAAATGACATTTTTGGATTTTATCGTGAACCTTGATATGTCTGTGGTTTGGTGCTTTTGCCGATTTAAACAATCAGGACATTGATAGTCAATAGCACTAACCAGTTCGACATCTGATTCGTTAACAATACGTGATTGGTGTTATTTTTTAAGTGATATCGAATTGTCTTGCTTGGTCATATGCGCAAGCATTCGATGAATGAAAGATAAGGATATTTGCGTGTACTTTCACTTGTATGACAGTATTGAGGTGGGTAACGGAGATATTTGGACTGAAGCAAGTACTTTTTAGATACCCGTAAAAGTCTCCAATTGTACATAGTGCAAGTCCATGCCGTTAGATCGTTCTAAGACGTTTTTGGTAAAGCTGCTTGCTATCTACTGCAGCAATTCATCAAACTGAGTAAGTCATAGGCTAGGTTAGGAGTTTGCAATACCTTTATTGATGTTTTTGGGTTGTTGTCCTGGAAAGTGAAATTGTAGGTCACCAATAGCGTTAATTGGTAATTTCGTTTTTTAGATCACTACGTTTATGCAGGGATTAGATGATGCTGATGTATGTGATGAGCAGATCTTTAAGGGTTCGTGATGCTCCTGAATCGAGAAAGAGGTGTCATGGATTTCATCTCAGAATGAATTAGTGTGATTTACCGTATATTCAGTAAAGTTCCTTGCCTAAGGTGAAGTCGGCTTATTGTTCAAGGAATCGTTCAGTAGTTTATTTACTGATGGAATCGTTGTCCGTGCTGGGAGAGTTATTAGATGTGGATACTTGTGGAGCCTGGCTGTTTCGATTTCGAATTATTTGTTTTTTGAGGATTCCGAGCTATAACTTTTTTGGGGTTGGTTGTATTCGTATAGCTGCAGAGAATCCTTTCTTCTCATCCTCAGATTTCTCCTGTATTAGGTTTGCTGTTTCTCGATCCCTTTTGTTCTTCATAAATAG
CATGGATTCTAGAACAGTTCAGCGACTGTCATATTTAAGATGTCGAGAGTGTTAGCGTAAAAAATTTATATTCGCCAGATAGACCTCTTCCATAATTATTCATGCGACCTTGTTATTGATATGAATCCTTATTGTTTCAGTTCTGTCGATAATGTTTGTGACTTTTGTTTCAAATGCCTCTGCAGGTGTACCTGCCATTATATTGCAAATTTGCCAGGGTCACAATGTCGTTTTGCCTCTTGGTCATCAGATTGCTTTTTTCAATGACTTTTTGAAAGAATTTTCATGAGTATCCGTATAATCAACGGATAGGATGTCTTTGAACCCGGTAGGTTATGAATGAGAGGAGCAAATATTTGAAAAAGTGTTATACAAGAAGGGTATTCAATCCTCAGTGATCTTGACTACGGTTTTCGTTTACTGTCGGAATAAACCACCGAGATTCGAGTTTTTGTAAAATTTGATGTATGTTTTAACCCAATTTGGGAAAGTCCTTAGGTGAGGTTACCATTGGGGTGGTCTGACTATTTTTTAGGTGGATGCCATATCAGAGTCCGCGTGGGATGAATCAGTAAATGTAGTACCTGCTCAGTTGATGGAGTGCTCAGAGTCGTTCCACCAACTGATGATGGATACTGCGGAAACGTGATTGTGGCCAGGTGGGAAAGTAACCATAGGCGACATTTTGATAAGTGTATACGGAATCCTAGATGGGTGTCCCCTAAAATGACCAACCAGATGGATTTGCTTGGTTTTTGGGTCATCAAGCACTGCTGTGGGTACGGCCCATTCTGTGGAGTGGTACTGAAGCAGTGAGGAGAGGCATGATGGGGGTTCTCTCTGGAACAGCTGATGACGCAGTGTTGTTGTCTGTTTGAGAGTTTAGCCTTATGGAAGCCCTTATCATATTCTGGAATTTTGAAGCTGAAAACGTCTAATCGGATCTTGATTTGTGTTGGACTTCCTTATAATCACCGAAGCACAGGCGCTACATGAGAATTTTTGTGGGTAATTAGTAATTAGTTGGGATTCTTGTTGATAAAGCTATAATAATTATGTATCGAATATACTAGAAGTTCTCCTCGATGATTTAGGATCCAGAAAAGGGAATCTGCAATTCTACACAATTCTATAAATATTATTATCACGTTTTATATGTTAATATTCATTGATCTATGACACTTATCATCCTTGCGTTTTCAGCTTCCACTAATTTAGATACTATTTCTCAATCATTTGCGTCATCTTCTAACACCGTATATGATCATATACTCGTAACGTAAATACTAGTTAGTAATGATGTATTTTTTATCCACATTATATCCATATATAAGAAATAACATTCCGTGAATAATCTGATAACTGTTTTGAAAACTGGTTACTTCCCTAAAGACTGTTTATAATTAGGATTTCAAGACACTCCGGTATTACTCGAGCCTCGTAATACAACATAATATCCTACATTCTAGATCCCGACATATCAATCATAAATCTGAATATTGAGACATCTTTTAATTACGTTTGGTTGCCGCAAACCAAAAATGCACTTTACATGGGTGAATAATTGAATGTTAGAAATCTTTTGTTTTTGAATAAAAATCCACTTCGCTATCAACTAATAGTTATATTCTCATATATTAATCATATATACGGTGTTAAGATGATGACATAGTTAATAGAAAGCTGTCATCGTGTTAGAGGAAGCTGAAACGCAAGGATTGATACTGTAATAGGATCAATGAATATAAACATATAAACGGAATGAGGAATAATCGTAATATTAGTATGTAAAAATATAGATTCCATTTTGAGGATTCCTATATCCTCGCCGGAGAACTTCTAGTATATTCGGTTACCTAATATTATAGCCTTTATTCACACTGGAATCCCAACAATTATCTAATTACCCACACACACATTTCCTTTCTCATGGTAGCGCTGTGCTTCGGTTATTCTAAGGAAGGCCACACAATCAAGATCCGTTAGAACGTTTCGCTTCC
AAAACCAAGAATGTGAGAAGGCCTCCACTAAGGCTACCTCTCAACAACAACAACACCTGCTTCATCGCTGTTCCATAGAACCCCCATCATGCCTCTCCTCAAACTGCTCAGTCACATTCACCACAGAATGGCCGTACCCACACAGTGCATGATGACCAAAACCAAGCCAATCCATCTGGTTGGTCATTTTACGGACACCCACTATGATTCCGTATACAACCTGATCAAATGTCGCCTATGTACTTCCACCTGGGGCCACAATCACAGTTTCCGCCGTCCCTCATCATTTGGAACGCCTCTGAGCACTCCATCACCTGAGTCAGGTAATACATTTACTGATTCATCCTCAGCAGACTCTGATATGACATCCCTAACAAATATGTCAGACCACCACCAATGTTAACCTCACCTAATGACTTTCCGATTGGTTAAAACATACCCAATTTTTACAAAACGTCGAACTCGTGGTATTAATTTCCCCCCCGAACAGTAACGAAAACCCGTACGTCGATCACTGCTGATGAAACTACTTTCTGTATAACACTTTTCAATATTGCTCCCTCTCACTTCCTACCTACCTGTCAAAGACATCCTATCCGTTGATTATACGGATCTCATGAAAATTCTTTCCAAGTATTGAAAAATGCAATCTGATACCCAAAGCAAACGACTTTGACCCTGGCACATTTTGCCATATAATGGCAAATACACCGCAGATGCCATTTGAACAAAGTCCCAAACATTTACTCGACAGACTACAATAATGGCATTCATATCAATAACACGGTCCGCATGCCAATTAATTATTAAGGTCTATCGGGCGAATATAAATTTTTACGCTAACACCGTCTCGACATCTAATATGACAGTCGCTGAACTGTCTTAATACCATGCTATTTATGAAGACAACAGGGATCGAAAACAGAAACTAATTCAGGAGAAATCGAGTGATGAGAAGAATGATTCTCGCAGCTCTACGAATTACCACCAAACCCAAATTTTAGCTCGGATCCTCAAAAAACACATAATTCGAAAATCGAAAACAGCAGGGCCCACAATGTATCCACATCTATCACTCTCCCAGCACGGACAACGATTCCATCATAAATCAACTAACGGAACCTTCAAGTTGAACAATTAAGCCCCGACCTTCATCTTATGCCAGAAACTTACTGAATCTACCGTAAATCATACTCATCTCTGATAGAACTCCCTGGACACCTCCTTCTCTGATTCAGAGCATCACGAACCCTTATAAGATCTGCTCATCACATACACTCCAGCATCATCAATCCTTACCTAAACGTAGTTGATGCTCAAAAAAAAATATACCAATTACGCTAATTGGTGACCCACAATTCACTTCAGGACAACACCAAAACATCAATAAAGGTATTGACACCTCCTAACATAGCCTATGACTTACTCAGTTTGAATTATTGGCTCAGTAGATATCACAGCATGGCTTACCAAAAACGTCTTAGACGGTCTGAGGCACTTACTTGCACCTATCGTAAAATAGGAGACGTTTACTGGGTACTAAAAAGTAACTTTGCTTCTCAATATCTCCTACCCCACCATCATATGTCCATCCCAGTGAAAGTTACACGCAAATATCCTTATCCTTTCATTCATCGAATGCTTGCGCATGCCCATGCATCAGACAATTCGATCTCACTTAAAATAACACCCCATCACGTAATTTAACGAATCAGATGTACGACTGGCTATGCTATTGACTATCAATGTCCTGATTGTTTTAATCGGCAAAAGCACCAAACACAGACATATCAAAGTTCACGACCTAAAATACAAAATTCATACGAACCCGTTCAATACCTACATACTGACCATTTGGTCAGTTTCACACCCTACCAAAAGTGCACCATCCTATTTCATCTCATTACTTTAACAACAAATTCCGTTGGTTTATCCTTGCACCCGTCCGAGGACTCATCCTCGATGTTTTACCCACGATACTAGCTTTTCATTAAAAACTC
ACTTTTCAGCCATGTCTGGTTATACAAATGGACCGTGGTTGCCTGAGTAACTACAGAACCTCCATACCTTCCTTGAAAAAAATGGGTATAACTCCATGCTATAACAACCACAGCGGATCCCGAGCACATGGAGTCGCTGACGGCTCAACCGTACCTTATTATATGACTGCCGTACTCACTGCACTGTAGTGGTTTACCGAACCATTTAGGGTTCTCTGCAATCGAATTTTCTACTCTTGTGAGAAATCACTAGCTTCACCAAAAGCAAAAATCTGCAAGCCAACATGCTTGGCTGGCAGACTTGATATCCAGTACTTTGTACCTTTCGGTCAACACTGTTATCGTCAATTATCACAACCCTAACTCCAAAATACATCCTCGTGGCTCCCAGGTTACGCTCTACATCCGTCTCGAAACTCTTATTGATATATATCTATCTTCCTCCTAAAGAAGACAGTAGATCAACTAACTATTTTATTCTTCCGGGGCAAGGATCCAATTAGATCAATTCAATCTCGACGCACTCACTTTCGATATGAAAGGACTTCACCGTTTACAATGCTTCCTATCATTCGTTTTCATTTGCGTCAAAATGAGATCCAAGCTGCCAATGATCTAACATAGAATCTGACATGACTTCCCATCTGACTTGAACACAATCCTGACAAAAAACCGAGAAATGTCCTTTCAAAAGCTGTAGTCCCAACCGATTCCACACCTCCGTCAACCATATGAAGATTTCAAACCTATTCTGAATCATCTTCGTGCACCCAGAGAAGTTGACCCCAACATATTGAATCTAATATTCTCCATCAAAAGCGATCTAAGCACCCCCCAAATTTCCAATATCGAGAGTACCCGGTTCGGGTGGTATGGCCATCAATTAAATGTTCCTTTACTGCTCCCAATGGTCCCAATCTAACACACATGAGTCTTTCGCAACGCCATAAATCTAAAGATTTCAGACACTCAAACTCGTAACATTGAAAATGAACTATCATACAAACGTCCAATTCCAGTACGGTGGTACCCAACACAAACTTTCCGCCGATAGTGGACCAAGAGACTGAGAAACGGATATAACACCGTTCAACCTTCAATCGATGCTTCTCTCCAACCGGACCAATAATTCACCAATCGCCCAATATTGTTCCTAGCAAAACGCCAACTCTTGTTTCTGACAGAAACCGAGGGAACTATCACATCGTGATCTCCACCCCTATCTACCTCCAGAATCTCCTACCGCAATTCCCTGACCCATTTAAAGAACCCCCCCCCCGATAAATTCCTCGTCAAACTATGCCAGTTTTTTTTTGGGTGGTATTGTTGACTCTAATGCTATACTAACTATCAACCGTAAGAAAAGATCAATTAGAAGATAATGAAACTGAAAATTACGGTATCACGAACACTGGAATACTAATAATATGCGTCGTTTAGAACCTCCGAGATCGAAGAAACGAATCCCCTGTTGCAGCTATAAAAGCAGTACACTCAATCAACCAATACGGACCCCCTTACGATACGATGATGCACACCTATAATAAAGATATTAAAAGAAAAAGAAAAATATATCAGGCTACCAACAAAGAAGTCAATCAACTGTTGAAGATGAAAACTGGGACACTGACGAATATTATGACAGAAAAGAAATAGACCTAAAAGAGTAATAAACTCAATGTTTTATCTTCAACACAAACGTGACGGGTACCATAAAGCTAGATTTGTTGCAAGCAGGGTATATTCAGCATCCTGAACACTACGACTCAGGCATGCAATCCACTACCGTACATCACTATCCTTAATGACATCCCTGTCACTTGCATCAGACCATAACTACTATATTACACACTTAGACCTAGTCTTCGGCAGATTTGTATGCAGACCTCAAAGAAGAAATTCTAATACATAAGACCTCCACCACATTTAGGAATGAATGATAAAAGTTGATACGTTTGACGAAATACTTATGGATTGAACAAAGTGAGCGAACCTGGTAC
GAAACTATCAAATCCATACTGATAACAAGTGTATGGAAGCGCAAGTTCGTGGATGTATGCTAAAATTAAACAGTCAGTGACCATTTGTTTTATTCGTCGATGATATGGTATTGTTTCGCAAAAATCTAATTCAAACAACAGAATTATAGAGGCTTAAGATGCAATACGACACCAAGATATCAAATCTAGGCGAAAGTGATGAGGAATCATCTGACATACTTGGCTTAGAATCAAATATCAAAGAGGTACATACATGAAATTAGGTAGGGCACATCTTTGACCGCCAAAAATTTAACCAAATTAAACGTAACCTTTGACTCCAAAAGAAAAACTTACGTCCAGGTCAACCAGGTCTTAATAACCAGATGAAACTAGAAATAGATGAAGAGAATACAAAGAGAAGTACATGAAATGCAAAAGTTTGATTGGTCGAGCTTCATATTTGGATATAAATTTAGATTTGACTTACTATACTACATCAACCCACTGCTCACATATACTATTCCCCTTCCCCTCTAGGCAAGTTTTAGACATGACATATGAGTTAATACAATTCATGTGGACACTAAGAAATACAACACTGATATTTCAACCAAAAAACAAACCGACGAGCCAGATAATAAACTTCCGCAATAAGTGGAGGCTTCCTATGGTAAACCAACCATATACAAGTCCAACATTGTCAACCATATATTTACTTATGGAAAGGTAATTGGAGAAGTCCCCCAGAGGCTTCATTCACATGTACTACTTCACAACTAGGACGCAGAAATACACCGCGATAAGTGATCTGCCCCATTATTAAAATAACCTCATCCACCTTGTGCAAGAACTTAACAAAAACCAATTACTAAAGGATTACTAACCGACAGTAAATCCCCCCCCCCTCCGTCTCCTTCTCCTCCCTCCCTGCCGCGCCCCCTTTCGCACCCCTTTTTTGGTACCTACGCAATGAGACTCGCCAGTGATATCAGGAACCTCCGCTGCACGTATGCTCCTATCGCCCCCCAAAACAGAATACTGCAGACGTATGACCAAACCTCTTCCGATAACAAATTCAAACTATTAACAAACAAATGGATCATTAGATTATTACATTATGGGTGGTAATG        .       PASS    IMPRECISE;SVTYPE=INS;END=169209;SVLEN=11071     GT:AD:DP        0/1:2:6
+chrIII 169539  pbsv.INS.4      T       TCTATGGTAGCGATGTGCTTCGGGACTCTAAGGAAGTCCACACAAATTCAAGCATCCGTTTTAGAACGTTTCAGCTTCAAAACAGAAGAATGTGAGAGGCGTCCCATAGGCCTTAACTCTCAACAGACAACAAACATGCTCATCAGCTTGTTCCAGAGGAACCCCCCATCATGCCTTCCTAAAACTGCGCAGTCACATTCACACAGAATGCGGACGTACCCAAGAGGCATATGACCAAAACAAGGGCAAATCCATCTGGTTGGTCATTTACGGACACCCATCTATGATTCGTATACACCTATCAAAATGTTCGCGATGTACTTTCCACATGGGCCACCCAATCACAGTTTCCGCAGTATCAATATCAGGTTGGAACGCTCGAGCACTCAATCACCGAGTCAGGTACTACATGTACTGATTCATCCTCAGCAGACTCTGATATGACATCACTAAAAAATATGTCAGACCCACCCACCAATTAAACCCACTATGACTTTCCAAATTGGGTTAAAAACATACATCAAATTTTACAAAACTCGAATCTCGGTGGTATTATTCCGACAGTAACGGAAAACCCGTACGTAGATCACTGGATGATGAACTCACCTTCTTGTATAAACACTTTTCACAATATTTGCTCCCTCTCAATTCCTACTACCGGGGTCAAAGACATCTAGCCGTTTGATTATACGGATATTCATGAAAATTCTTTCCAAAGTATTGAAAAAAATGCAATCGATACCCAAAGAGCAAACGACATGTGACCTGGCAAATTTGCCAATATAATGGCAGTCACCTGCAGAGGCTTTGAAACAAAAGTCAAAAACATTTCGACCAGGGCAGACTGAACAATAATGGCATTCATATCAATAACAGGGTCGCATGCCAATTAATTATGAGAGGTCTATCTGGCGAATATAAATTTTACGCTACACACGTCCTCGACATCTAAAAGACAGTCGCTGAACTGGTCTTAGATATCCATGCTATTTATGAAAGAAAACAGGGATCGGAAACAGGCAACCTAATTACAGGAGAAATCGAGTGGAGAAAGTGATGAGAAGAATGATTCCGCAGTATACATACAACCAACCCAAAGGTATAGCTCGGAATCCTCAAAAAAACAATAATTCGAAATCGAGACCACAGCCAGGGCTCACAATGTATCCACATGCTAATAACTCTCCCAGCCGGACAACGATTCTATCAGTAAATCAACTACTGAACCGATCAATTGGAACAATAAGCAACACCTTCCCATCTTCGGCCAGAAACTTACTGCATCTACAGTAAATCATATAATCATTCTGATGATGAACTCCCTGCGACACCTCTTCTCGATTCAGGGCATTCCGAACCCTATAAGATCTGCTCATCACATTACACTCAGCATAATCTATCCGAATGAAACGTAGGTTGATGCTCAAAAAGAAATATACCAATTAAACGCATTGGTGACTACCAATTTAACTTCAGGACAACACCAAACATCAATAAAGGTATTGCCACACTCCTAACATAGCCTATTTGCTTACTCAGTTTGAATGAATTGGCTCAGTTAGATATCACAGCATGCGTTACACCAAAAACGTATTAGAACGGTCGACGGCACGTGTACTTGCACCTATAGTAAATTGGAGACTTTTATGGGTATCTAAAAAGTACTTGCTGCCATCAAAGATCTCCGGTACCACCATCAATAATGTCAGTACAAGTGCCAGTACACGCAAAATCCTTATCCTTTCATCATCGAATGCTTGCGCATGCAATGCATAGACCAATGTACGATACTTCACTAAAATAACACCCATCAACGTATTTAACGAATCGAGTCGACTGGTCTATGCTATTGACTATCATGTCTGATTGTTTAACGGCAAAAAGCACCAAACACAGGACATATCAAAGTTCACGACTCTAAAAAATACCAAATTATTACGAACCCTTTCAATACCTACAT
ACTGAATATTTGGTTCAGTCCCAAAACCTACCCAACAAAGTGCACCATCCTATTTCATCTTATTACTGATGAGACAACAAAATTCCGTTGGGTTTATCATGCCACGACCGTCGCGAGGACTCTATCCTCGATGTGTATTTACCACCGATACTAGCTTCATTAAAAACCAATTAGGCCAGGTCTTGGGATACAAATGGACGTGGTCTGGAGTATTACTAACAGAACTCTCCATAAATTCTTGGAAAAAAATGGAAAACTCCATTGCTATACAAACCACAGCGATTCCCGAGCACATGGAGTCGTGAACGGCTCAACCGTACCTTTTAGAGGACTGCCGTACGCAACTGCAAATGTGTGGTTTACCGAACCATTATGGTTCTCTGCATCGAATTTTCTACTATTGTGAGAAATTTCACTAGGCTTCACCTAAAAGCAAAAAATCTGCAAGCACATGCTGGTTGGCAGGACTTGATTATCAGTTTACTTGTTACCTTTCGGTCAACCTGTTATCGGCAATGATCAAAACCCTAACTCCAAAATACATCCTCGGGGGCAATCCCAGGTTACGCTTCTACATCCGTCTCGAAACTCTTATGGATATCTCATCTATCTTACATCTTAAAGAAGACAGTGAGATACAACTCACTATGTATTCTTCAGGGCAAGGGAATCCAGAATAAGAATCAATTCAATTACGACGCACTCATTTCGATGAAGACTTAAAACAAACCGTTTAACTGCTTCATATCATTCGTCAATTGCGTCAAATGAGATCCAAGAATCCAATGATATAACATAGATCTGACCATGACTTCCAATCTTGACATTGAATAATCCTGAGCAACCGAGGAAATGTCCTTCAAAAGCTGTGAGTCCACCCGATTCAACTACCGGCGTCAAACTCATACTGAAGATCAAACATAAACCATATCTGAAATCCAATTCGTGCACCCAGGAAGTGACCCCAACAATCTGAATCTAATATTCTTCCATCAAAGAGAGATCTAGCACCCCCCAAAGTTCCAGGATCGAGAGTACAGGTTCGGGTGGAGGCATAAATAAATGTTCCTTTACTTGCTCCATGTCCCAATATACACACATGAGTACCGCACGCCAGTAAATCTAAAGATTTCAGACACGCAGACTGTTACAGTGAAATGAGACTAATCATACAAACGTACCAATATCCAGTACGGGTGGTACCAACAACACAACTGTTCCGCAGATAAGTGACCAAAGAGACTGAGAACAGGATTATACACCGTTTGCACCTTCAATCGATGCTTATCCACCGTGAAAATATTCATCGCACAATATTGTTCTGATCAAAACGCAACTACTGTTTTGGAACAGAAATACGAGGAATCTATAATCGTGATCTCCCTCCCTGATTACCTACGAATCTCCTACCGAAATTCCTGACCCATTTAAAGAACTCCCACCGATAAATTCTCGGTCAAACTAATTCCAGTTTGGGTGGTATTGTGACTCTAATGCCTAGACTACTATCAACAGTAAGAAAAAGATCATTAGAGATAAGTGAAAACTGAAATTAAGGTATACGAGACACATGGGAATCCGAAGAATATGCGTACGTAGTTTAAACTCCGAGATCGAAGAAGAAACGATTCACCTGATTGCAGCTGTAAAGCCGTAAAATTCAATCAAACCAAACGGACAACCTTGACGATACGATGAGGCACTCACCTAGAATAAGGATATTAAAGAAAAAGAAATATATCGAGGCATACACAAAGAAGTCAATCAAACTGTGGAAGATGAAAACTTGGGACGACACGGACGAATATTATGACAGAAAAGAAATAGACCCAAAGAGTAATAACTCAAGGTTATCTCAACAAGGAACGTGACGGTACTCATAAAGTAGATGTGTTGCAAGAGGTGATATTCAGCATCCTGACCCTGACGACTCAGGCATTGCAATCCAATACCGACATCACTATGCATAATGACATCCCTGTCATGATTAGCAATTAAAC
TCAATATATTACACAATTAGCCCTATCTTCGGCATATTGTATGCGACATCAAAGAAGAAATTATACATAAGACCTCACCACATTTAGGAATGATGATAAGTTGATACGTTTGAAGAATCACTTGATGGATTGAACAACGTGGAGCGAGATGGTACGAACTATCCAAATCATACCTGATAAAACAATGTGGTGATGGAAGAGTTCGTGGATGGTCATGCGTATTTAAAAACAGTCAGTGGACAATTGTTTATTCGTAGATGATATGGTATTGTTTAGCAAAAATCTAAATTCAAACAAAAAGAATTATAGAGAAGCTTAAGATGCAATACGACACGAAGATTATAAATCTAGGCGAAAGTGTGAGAAAATTTCAAATGACATACTTGGCTTAGAAATCAACTATCAAAGAGGTAAATACAGGGAAATTAGCGCTATGGAAAAGCTTTGACAGAAAAATTACCCAAATTAAACGTACCTTTTGAATCAAAGGAAGAAAACTTAGCGCGCCAGTCAACAGGTCTTTCTAATAGACCAGATGTGAAACTAGAAATAGATGAAGATGAATACAAAGAGGAAGGTACATGAAATGAAAAGTTGATGGTTAGCTTCATATTTGATATATAAATTTAGATTTGACTTACTATACTACATCAAACACACTTGACTCACAATTAATCCCCTCTAGGCAAGTTTTGACTATGACATATGAGTTAATACAATTCATGTGGGACACTAGAGATAAACAACTGATATGGCACAAAAACAAAACCTACCGAGCCAGATAATAACTAGTCGCAATAAGTGAGCTGCATATGGTACCAAAACCATATAAGTCACAAATTGGTAAAAATTGGCAAAAATATATTTACTTAATGGAAAGGTAATGGAGGAAAGTTCCACCAAGGCTTCATTACATGTACTTCAAAATACGGAAGCAGAAATAACACGCGGTAAGTGGAAATCTGTCCCCCATTATTAAATAACCCTCAGTCCACTTGTGCAAGAACTTGAACAAGAAACCAATTACTAAGGATTACACGACAGTAAATCTACAAGTCAGTATAGTATAATTATATCCATAATGAAGAGAAAAATTGAGAAAAGATTTTGTGGTATAAAGCAATGAGACTAAGAGATGAGTATCAGGAAATCATCTGCACGTATGCTCTATCGAAACCCAAAAGAATATGCAGACGTATGACCAAACCTCTTCCGATAAACACATTCAAACTATTAACAAACAAATGGATCTAGATCTATTACATTATGGGTGGTATGTGGAATAAAAATCCACTATCGTCTATACTACAGGAGTTATATTATCAATATCTTATCATATCGTGTTAAGATGATGACATAAGTTATGAGACGCTGTCATCGAGTTTGTTAGAGGAAAGCTGAAACGCTAAGGATGATAATGTAATAGGATCAATGAATATAAACATTATAAAAAACGGAATGAGAATAATCGTAATATTAGTATGAGGAAATATAGATTCCATTTGGAGGATCCTATATCCTCGGAGGAGAACGTGAGTATATCTGTATACCTAATATTATAGCCTTATCAACAATGGAAATCCCAACACTTATCTAATTACCCACAACATT     .       PASS    IMPRECISE;SVTYPE=INS;END=169539;SVLEN=5570      GT:AD:DP        0/1:2:10
+chrIII 200765  pbsv.DEL.5      CGGTGTAAAACAAAATGTCTTGTCTTCTCTGCTCGCTGAAGAATGGCACGCGGACAAAATGCAGCACGGAATATGGGACTAC      C       .       PASS    IMPRECISE;SVTYPE=DEL;END=200846;SVLEN=-81       GT:AD:DP        0/1:7:13
+chrIV  461745  pbsv.DEL.6      GGTCGAAAAAAGAAAAGGAGAGGGCCAAGAGGGAGGGCATTGGTGACTATTGAGCACGTGAGTATACGTGATTAAGCACACAAAGGCAGCTTGGAGTATGTCTGTTATTAATTTCACAGGTAGTTCTGGTCCATTGGTGAAAGTTTGCGGCTTGCAGAGCACAGAGGCCGCAGAATGTGCTCTAGATTCCGATGCTGACTTGCTGGGTATTATATGTGTGCCCAATAGAAAGAGAACAATTGACCCGGTTATTGCAAGGAAAATTTCAAGTCTTGTAAAAGCATATAAAAATAGTTCAGGCACTCCGAAATACTTGGTTGGCGTGTTTCGTAATCAACCTAAGGAGGATGTTTTGGCTCTGGTCAATGATTACGGCATTGATATCGTCCAACTGCATGGAGATGAGTCGTGGCAAGAATACCAAGAGTTCCTCGGTTTGCCAGTTATTAAAAGACTCGTATTTCCAAAAGACTGCAACATACTACTCAGTGCAGCTTCACAGAAACCTCATTCGTTTATTCCCTTGTTTGATTCAGAAGCAGGTGGGACAGGTGAACTTTTGGATTGGAACTCGATTTCTGACTGGGTTGGAAGGCAAGAGAGCCCCGAA      G       .       PASS    IMPRECISE;SVTYPE=DEL;END=462354;SVLEN=-609      GT:AD:DP        0/1:8:9
+chrIV  1307718 pbsv.DEL.7      TGCAGTATCCTCGACGTACACGTCTTCACCATCAGCACCTGCTGCAATATCCTCAACGTACACGTCTTCACCATCGGCACCTGTTGCAGTATCCTCGACGTACACGTCTTCACCATCGGCACCTGCTGCAATATCCTCAACGTACACGTCTTCACCATCGGCACCTG T       .       PASS    IMPRECISE;SVTYPE=DEL;END=1307884;SVLEN=-166;SVANN=TANDEM        GT:AD:DP        1/1:6:6
+chrIX  25576   pbsv.INS.8      G       GCTTGAGACACCAGAAGGAGAGGAAGCGATTGACTGACAGAGTTGAGACATCAGAAGCTGAGGAAGAAGATTGATGAAAGAGGTTGATACATCAGAGGCTGAAGAAGAAGATTGACTGACAGAAGCTTGGACATCAGAAGTAGAGGAAGCTGATGGACTGCAGACCCCCGAG    .       PASS    IMPRECISE;SVTYPE=INS;END=25576;SVLEN=171;SVANN=TANDEM   GT:AD:DP        0/1:5:8
+chrV   116286  pbsv.INS.9      C       CTGTGGAAATAGAAATCACTATCATCTACCTAACCTAGTATTTACATTACTAGTATATATCATATACGTGTTAGCATATGACGCAAATGATGAAAATAGTCCATCCTAAATTAGTGGAAGGCTTAAACGCAAGGATTGATAAGTATAGGACAATGAATATAAACATATAAAAATGAAATGTAATAATATTTATAAAATTGTGTGAATGTGCAGATCCCTTTTATGGATTCCTAAATCCTTGAGGAGACTTCTAGTATATTCTGGGGTTACCTAATATTATAGGCCTTTATTCAACATGGAATCCCAACAATTATCTCAACACATTCACATATTTCTCATGGTAGCGAGCGCCTGTGCTCGGTTACGTCTAAGGAAGTCCAACACAAATCAAATCCGTGTAGACTTTTCAGCTTCCAAACAGAAGAATGTGAGAATGCTTCCACTAAGCGCTAACTCCAACAGACAACAACACCTGCTTCATCAGCTGTTCCAGAGAACCCCCATCATGGCCTCTCCTCAAACTGCTCAGTCACATGCACCACAGAATGGCCGTAACCCACAGCATGCATGATGACCCAAACCAAGCCAATCCTCGGTTGGTCTTTAACGGACACCCATCTCCATCTATGATCCGTATCAACACTTATCAAAATGTCGCCTATGTACTTTCCCCCTGGCCACCTCCAGTTTCCGCAGTACCCATCATCAGTTGGAACTGCCTCTGCAAGCACTCCATCACCTGAGTCAGGTATACATTTTACTGATTCATCCTTCAGCGGACTCTGAGATGACATCCACTATAAAAATATGTCAGACCACCACCCTTTTAACCTCACCTATGACTTTCCAAATTGGGTTAAACATACATCAAAAATTTTTACAAAACTCGAATCTCGGTGGTATTATTCCGACAGTAAACGGAAAACCGTACGTCAGATCACTGATGAATGACCTCACTTCTTGTATCACACTTTCAATATTTGCTCCCTCTCAAATTCCTACCTACCCTGGGTCAAGACATCCTAATCCGTTGATTATACGGTATCATATGAAAATTCTTTCCAAAAGTATGAAAAAATGCCATCTGATACCAAGAGGCAAACGACCATTGTGACCCTGGCAATTTGCAATATATGCATACACCTGCAGATGCATTTGAAACAAAAGTCACAAACATTTCGAAAAAAACAGACTGAACAATAAGGCATTCATATCATAACAAGTTCGCATGCCATTAATTATGAGAGGTCTATCTGGCGAATATAAATTTTTACGCTACAACACGTCATCGACATCCTAAATAGGACAGTCGCTGAAACTGTTCTTAGATATCCATGGCTATTTAATGAAAACAACAGGGATCGAGAACATTAAAACCTAATGACAGGAGAATCCGAAGTGATGATAAGAATGATTCTCGCAGCTATAACGAATACAAACCAAACCCAAATTTCTATCTCGGCGGAATCCTCAAAAAACAAATATTTCACTCGAAACAGCCATGGCTCACAATTATCCACATCCAATAACTCTCCCCAGCACGGACAAAACGATTCCATCAGTAAATCAACTACTGAACACCGTTCAAGTGAACAATAACACGACCGTTCATCTAGGCCAGAAACCTTACTGATCTACAGTAAATCATCTAATCATTCTGATGATGAACTCCCTGGACACCTCCTTTCGATTCAGGCGCATCACGCACCCTTGAAAGATCTGCTCACCACATCCTCAGCATCATCTAATCCTGACATAACGTATTGATGCTCAAACAAAGAAATATACATTAACGCTATTGGTGACCTACAATTCACTCAGGACACCAAAACATCAATAAAGTTATTGCACACTCCTAACATAGCCTATTACTTACTCAGTTTGAATGAATTGGCTGGCAGTAGATATCACAGCATGCTTACCAAAACGTCTTAGCACGTCTGACGGCAACTGTACTTGC
ACCTATCGTAAAATATGGAGACTTTTACTGGTCTCTAAAAAGTACTTGCTTCCATCAATATCTGTCCGTACCACCCTCAATAATGTCCCATACAGTGAAGTACAACACGCAAATATCCTATCCTTTCCTTTCTCATCGAATGCTTGCGCAATGCAATGCACAGACACTTCGATATCACTAAAAATAACACATCACGTACTTTTACGAATATGTCGACTGGTCTAGTGCTATTGACTATCAATGCCTGATTTTTTAATCGGCACAACCAACACAGACCTATCAAAGTTTCACGACTCAAATACCAAAATTAATACGACCCTTTCAATACCTACATACTGACATATTTGTCCAGTTCACAACCTACCCAATAGTGGCATCAACCATCCTATTTCAATCTCATTACTGTGAGACAACAAAATTCCGTTTGGTTTATCCTTACACGACCGTCGCGAAGGACCTATCCTCGTGTTTGTTTACTACGATACTAGCTTTATTAAGAACCATTTTTCAGGCCAGTGGTCTGGTTATACAAATGGACCGTGGTTCTGAGTATACTAACAGAACTCTCCATAAAATTCCTTGAAAAACAAATGGGTATAACTCCATGCTATACACCACAGCGGATTCCGAGCACTGGAGTCGCTGAACGGCTCAACCTACCTTATTAGATGACTGGCCGTACTCAACTGCAATGTAGTGGTTTTAACCGAACGCATTTTTATGGTTCTCTGCAATCGAATTTCTAACTATGTGAGAAATTCCACTAGCTTCACCTAAAGCAAAAATCTGCAAGACAACATGCTGGCTTGGCGGACTTGATATCAGTACTTTGTTACCTTCGGTCAACCTGTTATCGTCAAATGATCCACAACCCTAACTAACTCTCCAAAATACATCCTCGTGGCATCCCCGCTACCTCTACATCCGTCTCGAAACTCTTCTGGATCTATCATCTATCTTCATCCTTAGAAGAATAGATACACTAACTATGTTATTCTTCCAGGCAGGAATCCAGATTAATCAATTCATTAACGAACCGCCCTCACTTTCGATGAAGACTTAAACCGTTTAAACTGCTTCATATCAATCGTTCATTGCGTCAAATGAATTCCACAATCCGATGATCTTACAAAAATCTGACCATGACTTCCAATCTCCATCGAACTACATCCTGAGCAACCGAGAACTGTCCTTTTCAAAAGCTTGTGAGTCCAACCGATTCACACCTCCGTCAACTCATACTGAATATCGAAACGTGGTTTCAAAACCAATATCGATTCGCCATTCGCGCACCAGTAAGTTGGAACCCCCACATATCTGATCTAATTTCTTCCGATCAAAGAAGAGTTAGCACCCCCCAAATTTCCAATATCGAGAGTACCGGGTCGGGTGGTATGCATAAATTAATGTTCCTTTGTTACTTGCTCCCATGTCCCAATCTAACACACATGAGTCGTCGCACGCCAGTAAAATTAAGATTCAGACACTCAACTCGTACAGTGAAAATGAACTAATCATCAAACGTACCAAGACCCTACGGGTGGTAACCAACAACAAAACTGTTCCGCGATAAGTGACCAAGAGACTGAGAAAATGATTATACAACACCGTTCACTTCACCTTCAATCAATCGTCTTCTCCACCGGAACATAATTCATCGCCATATTGTTCCTTCAAACCGCCAACTACTGTTTCTGAACGAATACCGAGAATCTATCATCGCTGATCTCCCACTCCCGATCTACCTCCAGAATCTACCACCTACGAATTCCCTGACGCCATTTAAAGAACTCCACCGATCATTCTCGTCAACTAATTCCAGTTTGGGTGGTATTGGTGACTCTAATGCTATACTACTATCAACAGTAAGAAAAGATCATTAGAAGATAATGAACTGAAATTAAGGTATCACGAGACACATGGAATACTAAGAATATGGCGTAGTTTAGACCTCCGAGATCGAGAACGAATTCACCTGATTGGCAGCTGTAAAAGCAGTT
CAAAATCAATCAAACCAATACGGACACCTTACGATACGATGAGGCAAGCACCTATAATAAGATATTAAAGAAAAGAAAAATATATCGAGGCAACCACAAAGAAGTCATCAACTGTTGAAGATGAAACTTGGGACACTGACGACTATTTGACAGAAAGAAAGACTCCTCAAAGAGTACTAACTCAATGTTTTATCTTCAACCAAGAAACGTGACGGTACTCTAAAGCTAGATTTTTGCAAGAGGTGATTATTTCAGCATCCTGACACTTACGACTCAGGCATGGCAATCCAATATACCGTACATCACTATGCATAATGACCTCCCTGTCATTGCATTAGACAATAAACACTATATTACCCAATAGACATATCTTCGGCATATTTGTATGCAGACATCAAAGAAGAATAACATAAGACCCCTCCACCACATTTTAGGAATGAATGATAAGTTGATATCGTTGAAGAAATCCTTAGGATTAAACAAAAGTGGAGCGAACTGGTACGAAACTATCAATCATACCTGATACACACAATTGTATGGAAGAAGTTCGTGGATGTCATGCTATTTAAAAACAGTCAAGTGCTCAATTTGTTCTATTCGTAGATGATATGTATTGTTAGCAAAAATCTAAATTCAAACAAAAGATTATAGAAAGCTTAGATGCAATACACACCAAGATTATAAATCTAGGCGAAAGTGGATAAGGAAATTCCCTATGACATTTGGCTAGAATCAAATATCAAAGAGGTAAATACTGAAATTAGGTATGGAAAACTCATTAACTGAGAAAATACCAAATTAAAACGTAACCTTTGAATCCAAAAGAAGAAACTTAGCGCTTCCAGGTCAACGCAGGTCTTTATATAGACCAGGATGAACTAGAAATAGAAATGAAGATGAATACAAAGAGAAGGTACATGAAATGCCAGTTGATTGGTCTCTTCATATGTTTTGGATATAAAGTTAGATTTTGACTTACTATCTACATCAACACACTGCTCAACATATCACTTTCCCCTCTAGGCAGTTTTAGAACAGACTATTAGTTTAATTACAATTCATGTGGGCCACTAGAGATAACAACTGATATTGCCAAAAACAAAACCTACCGAGCCAGATATAAACTCGTCGCAATAAGCGTGCTTCTATGGTACCAACCATATTACAAGTCACAAATTGGTAACTTTTTTCCTACTCAACGGAAAGTGATTGGAGGAAAGTCGACAAAGGCTCATAACATGTACTTCAACTACGAAGGGAGAAGAATACACGATAAGTGAATCTGTCCATTAAAATAATCTAAGTTACCTGATACAAAGAACTTAACAAGAAACCAATTATTAAAGGCTTACTTACTGATAGTTAAAAAAGAATTTCCCAAAAAACGATCAGTATACTAGTCTACAAATGAAGAGAAATTTAGAAAAAAGATTTTTTGGCACAAAGCAATGAGACTTCGAATGAAGTATCAGGTAATAATTTATACGTATCCTAACTACATCGAGACCAAGAGAACATTGGCTGATTGATGACAAACCTCTTCCGATAAAAACAATTTAAACTATTAACTACATGGATTCATTAGATCTATTACATTATGGGTGGTAATGTTTGGAATAGAAATCCAACTATCATCTCTAACTAGTATTTCTTACTAGTATATTATCATATACGGTGTTAGTAAGATGAACGCAAAATGATTGAGAATAGTCCATCTAAATTAGTGGAAGCTGAAACGCAAGGATTGATATGTAATAGGATCATGAATATACATTAAAATGGATGACATAATATTTTAGAATTGTGTAGAATTGCAGATTCCCTTGTTGGATCCTAAATCCTTGAGGAGAACGTCTAGTATATTCTGTATACTCATATTATAGCCTTTATCAACAATTGACTCCCAACATTATCTCAAACCTTCACATATTCTCAGTACC    .       
PASS    IMPRECISE;SVTYPE=INS;END=116286;SVLEN=5899      GT:AD:DP        0/1:6:7
+chrV   444238  pbsv.INS.10     T       TGTTGTGCATATCAGGTGGTTATCTCTAGTGTCCACATGAATTGATATTCCAACTATAGGTCATGTCTAAAACTTGCCCTAGAGGGGAATAGTATATGTGAGCAAGGTGTTGAGTAGTATAGTAAGTCAAATCTAAATTTATATCCAACATATGAAGGGGAGACCAATCCAACTTTTGCAGTTATGTACCTTCTCTTGTAGTCATCTCAGCTATTTCTAGTTCATCCCTGGTCGATATAAGACGCCTGGTTGACCTGGAGCGTAAGTTTTTCTTCCTTTGGATGCAAGGTACGTTTAATTGGGGTATTTCTCAGTTGAATGGTTTCCATACCGAATTCAGGTATTTACCTCTGTGATATTTGATTCTAGCCAAGTATGTCGCGTATGAATTTGTGATCCACTTGCACCCAATTTTATTATCTTTGTATCGTACTTGTTTCTGAGTGGTGTGATGATTTTCTGATGTGCATTTAAGTCGTTGCTGAATAAAATCAGTCGTCAACAAAATAAACAGATTGTTACGATAGTTTAAATACGCATGACCATCCACGAAAACTTATTCCCAACCAACTGTTTTATCAGGTATGATTTGATAGTTTTGTACCCAGTTCGCGCCACTTTGTTCAATCCATAAAGTGATTTCTTCAAACGTTCAACTTATCATTAATTCCGAAATGTGGGGGGAGGTCTTATTGTGATAATTCTTCTTTGATGTCTCATACAAATATGCCCGAAGATGATGTCTAATTGTGTTAATATAGTAGTATTGTCTAATGCAAGTGACAGGGATGTTCATTAATGCATAGGGATGTGACGGGTATTGGATGCAGCCTGAGTCGTAAGGGTCTAGGATGCTGAATATCACCCTTGCAACAAATTAGCTTTAGTGAGTACGTCACGTCTTGTTGAAGCTACACATGAGTTTATTACTCTTTTAGGGTCTATTCTTTTCTGTCATAATCTTCGTCAGGTCCCAAGGTTTTCCTCTTCAAAGTATGACGGGTCTTGTGGTATGCCTCGAGATATTGTGCTTTCTTTAATACTTTATTTATAGGTGATTTGCTATCTATCGTAAGGTTGGCCGTAATTGGTTTGATTGGATTTTTAGGCGTTACAGCTGCAATCAGGTGAATTCGTTCGTCGATCTCGGAGGTTCTAAACTACGCATAGTATTAGTATTCCATGTGTCTGTGATACCTTAATTTCAGTTTCATATCTTCTAATGCTCTTTTCTTACTGTTGATGTAGTATAGGCATTTAGAGTCACCACTACACCAAACTGGAATGTTGATGAGAATTTACGGGGGGAGTTTCTTTAACATGGGTCAGGGGAATCGGTTAGGAGATCGGGAGGTAAGATCAGGGCGTGGGAGATCAGCGAGGGATAGATTCCTCGGTATTCTGTTCAGAAAAACAGTAGTTGGCGTTCAGTTTTGGGATAGGAAAACAAGGATTGGCGATGAATTATTTTCCGGTTTGGAGATGGAGAAGCATCGATTGAAGGTGAACGGTTTGGATAATCCTTTTCTCAGTCGCTTGGTCCCTTATCTGCGGAAAACAGTTTTGTTGTTGGTACCACCCGTATGGATATTGGTTACGTTGTGTATGATTAGTCTCATTTTCACTGTACGAGTCTGGTGTCATGAAAAAATCGTGAGATTTACTGGCGTGCGACGACTCTGTGTGTTAGTTGGGACATGGGGCAAGTAAAGGAACATTTAAGTTAGCATACCACCCGACACCGGTACTCTCGATATGGAAATTTGGGGGGTGCTAGAGGCTCGTCTTTGATGGAAGCATTTAGATTCAGATGATGTTGGGGGTGGTAAACTTCTCTGGGTGCGCGGGAATATTGGTTTTAGAAACACGTTTCGAATCTTCAGTAGGAGTGACGAGGTGTGGAATCGGTTGGGACTCCAACAGAGCTTGTGAAAGGAAACATTTCTCGGTTGCTCAGATG
TCGGTCGAATGTCAGATGGAAAGTCATGGTCAGATGTCTATTAAGATATCGATTTGTTGGATCTATTTGACGCAATGAACGATTGATATGAAGCAGTTAACGGTTTAATCTTAATGGAAAGGAGGGCGTCGTAATTGAATTGATCTAATGGATTCCTTGCCCTGAAGCAATAACATATTAGTTGATCTACTGCTTCTTTAGGATGGAAGGATTAGAGATATATCATTAAGAGTTTCGAGACGGATGTAGAGCGTAGCTGGGATGCCACGATGAATGTATTTGGAGTTAGGTTGTGATCATTGACGATAACAGGTTGACCGAAAGGAACAAAAGTATGATAATAAGTCTGCCAAGCCAGCATGTTGTCTTGCTTTGCGATTTTTGCTTTGAGGTTGAAGCTAGGTGGATGAATGTTCACAATAGTAGAAAAATTCGATTGCAGAGAACCAAAATGGTTCGGTTGGTTAAACCTACATTGCAGGGGAGTACGGGCAGTCATTAATAAGGTAAGGTTGAGCAGGTTCAGCGACTCCATGTGCTAGGGATCGCTTGTGGTTGTATAGCATGGAGTTATTTTATACCATTTTTTCAAGGAATTGTATGGAGAGTCTGTTAGTATATCAGAAACCCGGTCCATGTATAACCAAGACACTGGCGACTGAAACTGGTTCTTCATAAAAGCTAGTACGTAGTAAAAACATCGAGGAATAGAGATCCTCGCGACGGTCGTTTGTAAAGATACACCCAACGGAATTTTTGTTGTGCATCAGTAAAGGAGATGAAATAGGTGGTTGCAACCTTTGTGGTAGGTTGTGAACGGGACCGCAAATAGGTCAGTATGTAGTATTGAAGGGTCGTATGAAATTTTGGTATTTTAGTCGGTTGAACCCTTGAGATGTCTGTGTTGTGCTTTCCGAGTAAACAATCGGACATTGATAGTCATAGCCATAGAGTCCAGTCGACATCTGATCGTTTAAAATAGTGATGGTGGTATTTTTAAGTGAGTATCGAATTGTATGGGCATGGGCATGTGCAAGCATTGTAATATCCGATGATGAAAGGATAAGGATATTGCGTGTGTATTTCACTTGTATGGACTTATTGATGGTGGGTACGGCGGATTTGATTGGAAGCAAGTACTTTTAGAGACCCAGTAAAAGTCTCCATATTTTACGATAGGTGCAAGTGGACCGTGCCGTCAGATCGTTCTAAGAGTTTTTGGGAAGCATGCTGTGATATCTATGCAGCCAATCATTCAAACTGAGTAAGTCAAGTCATAGGCTAGGTTAGGAGTGTGGCAATAAACCTTTATTGATGTTTTGTGTTGTCTGGGAAGTGAATTGTAGGCACCAATAGGTTAATTGGGATATTTCTTTTGAGCATCAACTACGTTTATGTAAGGATTAGTGATGCTGAGTGTAGTGATGAGCAAGATCTTATAAGGTTTCGTGATGCTCCTGAATCGAGAAGGAGGTGTCCAGGGGAGTTCATCTCAGAAATGATTAGGTGATTAACCGTTAGATTCAGTAAGGTTCCGGCCCTAAGGTGAAGGTCGTGCTTAGTGTTCAATTGAATCGGTTCAAGTAGGTGATTTACTGATGGAATCGTTGTCCGTGGCTGGAGAGAGGAGTTATTAAGTGGTACAGTTGAGCCCTGGTGTTTTCCGATTTCGATTATTGGTTTTTGAGGATTCCGAGCTTAACTTTGGGTTTGTTGTATTCGTATAGCTGCGAGAATCATTTTTCTTCATCACTCAGATTTCTCCTGTAATTAGGTTGTGTTTCTCGATCCCTGTTGTTCTTCATAAATAGCATGGTATCTAAGAACAGTTCAGCGATGTCAATGTAGATGTCGATGACGTGTTGTTAGCGTAAAAATTTATAGTCGCAAGATAGACCCTTCTCATTAATTAATTGGCATGCGACCTTGTTATTGATAGGGGAGCCATTATTGTTCAGTCGTCGGATAATGTTTGTGACTTTTGTTTAAATGCATCTGCA
GGTGTTAACTGCCATTACATTATTTGCAAATTCCGGGTACATGTCGTTTGCCTCTTGGTATCAGATTGCATTGTTTTCAATACTTTGGAAAGAATTTCATGAGCCGTATAATCAAAGGATAGGATGTCTTTGGACCCAGGTAGGTAGGAATTGAGGGAGCAAATTTGAAAAGTGTTATACAAAGAGGTGAGTTATCATCAGTGATTGACGTTACGGGTTTTCCGTTTACTGTCGAATAATACACACCGAGATTGCGAGTTTTGTAAAATTTGAGTGGTTTAACCCAATTTGGAAGTCATGAGGTGAGGTTACATTGGTGGGGGTCTGACATATTTTTAGTGATGTCATATCAGAGTCCGCTGAGGATGAAGCAGTAAATGTATTACCTGGATCAGGTGATGGAGTGCTTCAGAGGCGTTAACTGATGATGGATACTGCGGAAACACTGTTGTTGTGGCAGGTGGAAAGTACATAGGCGCATTGATAAGGTGTATACGGATCATAGATGGGTTGTCCGTTAAATGACCACCAGATGGATTGGCTTGGGTGTGGGTCTCATGCACTGCTGTGGGACGGCCCATTCTTGGAGTGGTATGAAGCCAGGTTGGGAGAGCATGATGGGGGTTCTCTGGAACAGCTGATTGAAGCAGGTTGTTGTTGTCTTCTGTTGAGAGTTAGCCTTAGGGAAGCTTCTCACATTCTTCTGTTTGGAAGCTGGAAACGTCTAAACGGATCTTGATTGTTGGATTCCTAGAAGTAACCGAAGCACAGGCGCTACCATGAGAAATGGGTTGAATGTTGAGATAGATAATTGTTGGATTCCATTGTTGATAAAGGCCTTATAAGATTAGGTATACAGAATATACTAGAAGTTCTCCTCGAGGCTAATAGGAATCCTAAAATGGCATCTATATTTCGTACTTAATATTACGATTTTCCTCATTCCGTTTTATATGTTTATATTCATTGATCTATTACAGTATCAATCCTGGGTTCAGCTTCACTAATTTAGATGGACTATTTCTCATAATTGTGCGATCTTCTAACACCGTATATGATATATACTAGTAATGTAAATACTAGTAGTAGATGATAGTGAGTTCGATCCAACAACCACCCATAAGTGTAATAGATCTAATGAATCCATTGGTTAGTTAATAGTTTAAATGTTTTTATCGGAGAGGTTTTTGTCATCACATCCAGCAATGTTTCTGGTTCGAATGTAGGATACGTATAATGATTACCTGATACTTCATCTCTAAGTCTCATTGCTTTGTGCCAAAAAAATCTGTTTCTAAATTTTCTTCAGTGTAGACTTAATTATTACTGATCGTTGATTATACAGTTAAGGAAGCCTTAATAAGTGGTTTCATTGTTAAGTTCTGGTATCAGGTAACTTAGATTAGTAATAATGGACAGATTTCCTTACGCGTGTATTTCTGCTTCCGTAGGTGAAAAAGTACATGTTTAATGAAGCCTTGGGGGATTTCTCCAAGTACCGTTCCATTAAGTAAATATATGTGCCAATTGTGATTTATAATCGGTTGAGTTGCCCTACGAAGCATCAATTATGAGATCGTTATTATCTGCTCGGTAGGTTTTGT    .       PASS    IMPRECISE;SVTYPE=INS;END=444238;SVLEN=5563      GT:AD:DP        0/1:2:7
+chrVII 530040  pbsv.INS.11     A       AGGAGCAGGCAGCAGGCAGCAGCAGCAGAAGCAGCAGGCAGCAGGCCAGAGCGGGAGCGCAGCCGCGAGAGAGACCGAGCCGCAGGCA        .       PASS    IMPRECISE;SVTYPE=INS;END=530040;SVLEN=87;SVANN=TANDEM   GT:AD:DP        0/1:3:5
+chrVII 530453  pbsv.INS.12     C       CGGAAGGGGTTGGGGGGGGGGGGGGGGTTGGGGGGGGGGTTGGGGGGGGTGGGGTGGGGTGGGGGGG     .       PASS    IMPRECISE;SVTYPE=INS;END=530453;SVLEN=66;SVANN=TANDEM   GT:AD:DP        0/1:2:5
+chrVIII        1807    pbsv.INS.13     A       AGGGTAGTCGCACTAGTCCGGAAGGGGGAGGAGTTTTTGGCAGTAGTAGTAGCACTAGTCCTGACGTTGGTGATGGCAGTTGGTAGTAGCATGAGTGCTGAGTTGGTACTTTCAGTGGTAGTGCACTAGTGTTGGAGTGGTACTTCA     .       PASS    IMPRECISE;SVTYPE=INS;END=1807;SVLEN=146;SVANN=TANDEM    GT:AD:DP        1/1:12:13
+chrX   715090  pbsv.INS.14     T       TGGAGAAGTTGTAGAAAGTTGTAGAAGTTGTAGAAGTTGTAGAAGTGGTTCAGAGGTT      .       PASS    IMPRECISE;SVTYPE=INS;END=715090;SVLEN=57;SVANN=TANDEM   GT:AD:DP        0/1:8:14
+chrXII 319384  pbsv.INS.15     A       AGAAACCTATATGGGGATCTTAAGGTTAGAGGGTTGAAGACATGCTGAAAATTTTAAGTGTCAGTCAGAAAAATGAATTGGGGGG   .       PASS    IMPRECISE;SVTYPE=INS;END=319384;SVLEN=84        GT:AD:DP        0/1:2:9
+chrXIII        202197  pbsv.INS.16     T       TATAGCTTTATCAACATGGAATCCCCACACTTATCTCACTCACATTCACCCCCCATTTCTCACTAGAATAGTACCTGAAAAGGTGAATTTTTGAAATTGTTTGGGATTCCATTGTTGATAAAGGCTATAATATCAAGCTCTACAGAATAC  .       PASS    IMPRECISE;SVTYPE=INS;END=202197;SVLEN=149       GT:AD:DP        1/1:3:3
+chrXIII        908459  pbsv.INS.17     A       AGCTCAGTAAGTTCGGAAAGCCCATTGGCAACGTCTAGCGTAGTGAGGTTTCAGAAGCTCCATCGTCAAACATCTAGCTCAGTGAGGTTCAGAAGATCCATGTCAAACAACATCTAGTCAGTGATGTCAGAAGCTCCATCGTCAACGTCTAGCTCAGTAAGTTCAGAACTAAATCGGCAACGTCTTAGCGTAATTAGTTCAGAAGCTTCATGGGAACGTTCTAGCTCAGTGAGCTCGGAAGTACCATTGGCAACGTCTAGCGTAGTAGTTAGAAAGCTCCATCGGCAAACTAGTTCAGTGGTTCGAAATTCGTCAACAACAGCTA   .       PASS    IMPRECISE;SVTYPE=INS;END=908459;SVLEN=324;SVANN=TANDEM  GT:AD:DP        0/1:3:8
+chrXIII        908705  pbsv.INS.18     A       ACGTCTAGCTAGTGAGTTCAGAAGCTCCCATCGTCAACCATCTAGCTCGTGAGTTCCGAAATTTCGTCAACAACATCTAGCTCAGTAAGTTCGAAGCTTCCATTGGCAAACGTCGAGCGTAGTGAGTTCAGGAAGCTCCATCGTCAAACATCTCGCTCAGTGAGTTCAGAGCTCCATCGTCAACATCTAGCTCCGTGAGGTTCAGAAGCTCCATCGGCAAACGTCAGGCTCAGTGAGTTCGGAAATTAGTCACAAAATCTAGCGTAATGAGTTCAGAAGTGTCATTCGGCAACGTTTCTGCATAGGAGTCGGGAAGCTCATCGGCAA .       PASS    IMPRECISE;SVTYPE=INS;END=908705;SVLEN=326;SVANN=TANDEM  GT:AD:DP        0/1:4:8
+chrXV  31152   pbsv.INS.19     G       GCTGACCTGGATGTAATGGAAGTAGAGGAACCGGAGATGGAGCCCGATTCCAGTGGGGAGG   .       PASS    IMPRECISE;SVTYPE=INS;END=31152;SVLEN=60 GT:AD:DP        0/1:8:9
+chrXV  721730  pbsv.DEL.20     TTTCTTTTTCTATTACTCTTGGCCTCCTCTAGTACACTCTATATTTTTTTATGCCTCGGTAATGATTTTCATTTTTTTTTTTCCACCTAGCGGATGACTCTTTTTTTTTCTTAGCGATTGGCATTATCACATAATGAATTATACATTATATAAAGTAATGTGATTTCTTCGAAGAATATACTAAAAAATGAGCAGGCAAGATAAACGAAGGCAAAGATGACAGAGCAGAAAGCCCTAGTAAAGCGTATTACAAATGAAACCAAGATTCAGATTGCGATCTCTTTAAAGGGTGGTCCCCTAGCGATAGAGCACTCGATCTTCCCAGAAAAAGAGGCAGAAGCAGTAGCAGAACAGGCCACACAATCGCAAGTGATTAACGTCCACACAGGTATAGGGTTTCTGGACCATATGATACATGCTCTGGCCAAGCATTCCGGCTGGTCGCTAATCGTTGAGTGCATTGGTGACTTACACATAGACGACCATCACACCACTGAAGACTGCGGGATTGCTCTCGGTCAAGCTTTTAAAGAGGCCCTAGGGGCCGTGCGTGGAGTAAAAAGGTTTGGATCAGGATTTGCGCCTTTGGATGAGGCACTTTCCAGAGCGGTGGTAGATCTTTCGAACAGGCCGTACGCAGTTGTCGAACTTGGTTTGCAAAGGGAGAAAGTAGGAGATCTCTCTTGCGAGATGATCCCGCATTTTCTTGAAAGCTTTGCAGAGGCTAGCAGAATTACCCTCCACGTTGATTGTCTGCGAGGCAAGAATGATCATCACCGTAGTGAGAGTGCGTTCAAGGCTCTTGCGGTTGCCATAAGAGAAGCCACCTCGCCCAATGGTACCAACGATGTTCCCTCCACCAAAGGTGTTCTTATGTAGTGACACCGATTATTTAAAGCTGCAGCATACGATATATATACATGTGTATATATGTATACCTATGAATGTCAGTAAGTATGTATACGAACAGTATGATACTGAAGATGACAAGGTAATGCATCATTCTATACGTGTCATTCTGAACGAGGCGCGC       T       .       PASS    IMPRECISE;SVTYPE=DEL;END=722762;SVLEN=-1032     GT:AD:DP        0/1:3:6
+chrXVI 660831  pbsv.INS.21     C       CAAAGGAATGGTAAAGATGGGGGGTCAACGGACAAGGGAAAGGATCCATGGGGGCA        .       PASS    IMPRECISE;SVTYPE=INS;END=660831;SVLEN=55        GT:AD:DP        0/1:2:5
diff --git a/tests/data/vcf/unsorted.vcf b/tests/data/vcf/unsorted.vcf

new file mode 100644 (file)

index 0000000..62933f4
--- /dev/null
+++ b/tests/data/vcf/unsorted.vcf
@@ -0,0 +1,12 @@
+##fileformat=VCFv4.2
+##fileDate=20180531
+##contig=<ID=ctg1,length=4200,assembly=foo,md5=dead>
+##contig=<ID=ctg2,length=5000,assembly=foo,md5=beef>
+##contig=<ID=ctg3,length=3000,assembly=foo,md5=1234>
+#CHROM POS     ID      REF     ALT     QUAL    FILTER  INFO
+ctg1   1       variant0        A       T       .       PASS
+ctg1   10      variant1        A       T       .       PASS
+ctg3   50      variant2        A       T       .       PASS
+ctg2   20      variant3        A       T       .       PASS
+ctg3   10      variant4        A       T       .       PASS
+ctg1   5       variant5        A       T       .       PASS
diff --git a/tests/meson.build b/tests/meson.build

new file mode 100644 (file)

index 0000000..c5c6403
--- /dev/null
+++ b/tests/meson.build
@@ -0,0 +1,70 @@
+# Process tests/src first. pbbam_test_cpp_sources (extended below) is not
+# defined in this file, so it is presumably set up there -- confirm.
+subdir('src')
+
+# Prefer a 'cram' program found on the system; otherwise fall back to the
+# bundled scripts/cram.py, which is then required to exist.
+pbbam_cram_script = find_program('cram', required : false)
+if not pbbam_cram_script.found()
+  warning('Using bundled cram script')
+  pbbam_cram_script = find_program('scripts/cram.py', required : true)
+endif
+
+# gtest with its main(); the 'gtest' subproject is used as a fallback.
+pbbam_gtest_dep = dependency('gtest_main', fallback : ['gtest', 'gtest_main_dep'])
+
+# Values substituted for the @...@ placeholders in the *.in templates
+# configured below. Every "generated"/binary directory points at this
+# directory's build dir; PacBioBAM_BinDir points at '../tools' next to it.
+pbbam_PbbamTestData_h_config = configuration_data()
+pbbam_PbbamTestData_h_config.set('PacBioBAM_BinDir', join_paths([meson.current_build_dir(), '../tools']))
+pbbam_PbbamTestData_h_config.set('PacBioBAM_TestsDir', meson.current_source_dir())
+pbbam_PbbamTestData_h_config.set('CMAKE_CURRENT_BINARY_DIR', meson.current_build_dir())
+pbbam_PbbamTestData_h_config.set('GeneratedDir', meson.current_build_dir())
+pbbam_PbbamTestData_h_config.set('GeneratedTestDataDir', meson.current_build_dir())
+
+# Instantiate the templates into the build directory.
+pbbam_group_fofn_in = configure_file(
+  input : files('data/group/group.fofn.in'),
+  output : 'group.fofn',
+  configuration : pbbam_PbbamTestData_h_config)
+pbbam_PbbamTestData_h = configure_file(
+  input : files('src/PbbamTestData.h.in'),
+  output : 'PbbamTestData.h',
+  configuration : pbbam_PbbamTestData_h_config)
+# The generated header is appended to the test sources.
+pbbam_test_cpp_sources += pbbam_PbbamTestData_h
+
+# Unit-test executable; it is not installed.
+pbbam_test = executable(
+  'pbbam_test',
+  pbbam_test_cpp_sources,
+  dependencies : [pbbam_gtest_dep, pbbam_boost_dep, pbbam_htslib_dep, pbbam_zlib_dep, pbbam_pbcopper_dep],
+  include_directories : pbbam_include_directories,
+  link_with : pbbam_lib,
+  cpp_args : pbbam_warning_flags,
+  install : false)
+
+# Runs scripts/generate_data.py <tests/data dir> <this build dir> as part of
+# the default build.
+# NOTE(review): the declared output 'input.fa' is not among the files that
+# generate_data.py lists as its outputs -- presumably a placeholder so the
+# target is always considered out of date; confirm.
+custom_target('pbbam_generate_data',
+  output : 'input.fa',
+  command : [
+    pbbam_python,
+    files('scripts/generate_data.py'),
+    join_paths([meson.current_source_dir(), 'data']),
+    meson.current_build_dir()],
+  build_by_default : true,
+  install : false)
+
+# Configured from the same placeholder values as the templates above.
+pbbamify_synthetic_dataset = configure_file(
+  input : files('data/pbbamify/synthetic_movie_all.subreadset.xml.in'),
+  output : 'synthetic_movie_all.subreadset.xml',
+  configuration : pbbam_PbbamTestData_h_config)
+
+#########
+# tests #
+#########
+
+# Formatting check: runs pbbam_clang_formatter (defined outside this file)
+# with '--all' from the source root.
+test(
+  'pbbam formatting check',
+  pbbam_clang_formatter,
+  args : [
+    '--all'],
+  workdir : meson.source_root())
+
+# Unit tests; gtest additionally writes an XML report into the build root.
+test(
+  'pbbam gtest unittests',
+  pbbam_test,
+  args : [
+    '--gtest_output=xml:' + join_paths(meson.build_root(), 'pbbam-gtest-unittests.xml')],
+  env : [
+    'ARGS=-V',
+    'VERBOSE=1'])
diff --git a/tests/scripts/generate_data.py b/tests/scripts/generate_data.py

new file mode 100755 (executable)

index 0000000..278acb5
--- /dev/null
+++ b/tests/scripts/generate_data.py
@@ -0,0 +1,188 @@
+#!/usr/bin/env python
+
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import os, shutil, sys
+from io import StringIO
+
+# FASTA generation
+fastaSeq_1 = """TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC
+AACGCAGCTCCGCCCTCGCGGTGCTCTCCGGGTCTGTGCTGAGGAGAACGCAACTCCGCCGGCGCAGGCG"""
+
+fastaSeq_2 = """TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC
+AACGCAGCTCCGCCCTCGCGGTGCTCTCCGGGTCTGTGCTGAGGAGAACGCAAC"""
+
+fastaSeq_3 = """TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC
+ACCCTAACCCCAACCCCAACCCCAACCCCAACCCCAACCCCAACCCTAACCCCTAACCCTAACCCT"""
+
+# FASTQ generation
+
+fastqSeq_1   = """TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACAACGCAGCTCCGCCCTCGCGGTGCTCTCCGGGTCTGTGCTGAGGAGAACGCAACTCCGCCGGCGCAGGCG"""
+fastqQuals_1 = """[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[["""
+
+fastqSeq_2   = """TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACAACGCAGCTCCGCCCTCGCGGTGCTCTCCGGGTCTGTGCTGAGGAGAACGCAAC"""
+fastqQuals_2 = """[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[["""
+
+fastqSeq_3   = """TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACACCCTAACCCCAACCCCAACCCCAACCCCAACCCCAACCCCAACCCTAACCCCTAACCCTAACCCT"""
+fastqQuals_3 = """]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]"""
+
+
+# file creation decorator
+def fileMaker(func):
+    """Wrap a 'maker' callable so progress is reported around the call.
+
+    The wrapper expects to be invoked as (self, outputFn, ...): the second
+    positional argument is the name shown in the progress message.
+    Positional and keyword arguments are both forwarded to func, and its
+    return value is passed back unchanged.
+    """
+    def inner(*args, **kwargs):
+        print(" - Creating file: %s..." % args[1], end='')
+        sys.stdout.flush()
+        # forward keyword arguments too; they used to be accepted but dropped
+        retval = func(*args, **kwargs)
+        print("done.")
+        sys.stdout.flush()
+        return retval
+    return inner
+
+# symlink creation decorator
+def fileLinker(func):
+    """Wrap a symlink 'maker' callable so progress is reported around the call.
+
+    The wrapper expects to be invoked as (self, outputFn, ...): the second
+    positional argument is the name shown in the progress message.
+    Positional and keyword arguments are both forwarded to func, and its
+    return value is passed back unchanged.
+    """
+    def inner(*args, **kwargs):
+        print(" - Creating symlink: %s..." % args[1], end='')
+        sys.stdout.flush()
+        # forward keyword arguments too; they used to be accepted but dropped
+        retval = func(*args, **kwargs)
+        print("done.")
+        sys.stdout.flush()
+        return retval
+    return inner
+
+# return a copy of original, minus any lines that contain an entry in blacklist
+def trimXmlElements(original, blacklist):
+    """Drop every line of original that mentions any blacklist entry.
+
+    Surviving lines are each terminated with a single newline in the result.
+    """
+    kept = [line for line in original.splitlines()
+            if not any(entry in line for entry in blacklist)]
+    return ''.join(line + '\n' for line in kept)
+
+class TestDataGenerator:
+    """Creates the generated test files and symlinks used by the test suite.
+
+    source is the directory holding the checked-in test data; dest is the
+    directory that receives the generated files and symlinks.
+    """
+
+    def __init__(self, source, dest):
+
+        # source/destination directories
+        # (made absolute: generate() changes the working directory, and a
+        # relative symlink target is resolved from the link's own location)
+        self.testDataDir      = os.path.abspath(source)
+        self.generatedDataDir = os.path.abspath(dest)
+
+        # generated output files/symlinks & 'maker' functions
+        self.outputFiles = {
+            'truncated.bam' : self.makeTruncatedBam,
+            'chunking_emptyfilters.subreadset.xml'   : self.makeChunkingXml,
+            'chunking_missingfilters.subreadset.xml' : self.makeChunkingXml,
+            'normal.fa' : self.makeNormalFasta,
+            'normal.fq' : self.makeNormalFastq
+        }
+        self.outputSymlinks = {
+            'aligned.bam'      : self.makeAlignedBamCopy,
+            'aligned.bam.bai'  : self.makeAlignedBamCopy,
+            'aligned.bam.pbi'  : self.makeAlignedBamCopy,
+            'aligned2.bam'     : self.makeAlignedBamCopy,
+            'aligned2.bam.bai' : self.makeAlignedBamCopy,
+            'aligned2.bam.pbi' : self.makeAlignedBamCopy,
+            'm150404_101626_42267_c100807920800000001823174110291514_s1_p0.1.subreads.bam'     : self.makeChunkingSymlink,
+            'm150404_101626_42267_c100807920800000001823174110291514_s1_p0.1.subreads.bam.pbi' : self.makeChunkingSymlink,
+            'm150404_101626_42267_c100807920800000001823174110291514_s1_p0.2.subreads.bam'     : self.makeChunkingSymlink,
+            'm150404_101626_42267_c100807920800000001823174110291514_s1_p0.2.subreads.bam.pbi' : self.makeChunkingSymlink,
+            'm150404_101626_42267_c100807920800000001823174110291514_s1_p0.3.subreads.bam'     : self.makeChunkingSymlink,
+            'm150404_101626_42267_c100807920800000001823174110291514_s1_p0.3.subreads.bam.pbi' : self.makeChunkingSymlink,
+            'missing_pbi.bam' : self.makeMissingPbiBam,
+        }
+
+    def editChunkingXml(self, outputFn, removeFiltersNode):
+        """Write a trimmed copy of chunking/chunking.subreadset.xml to outputFn.
+
+        Lines mentioning filter/property elements are always removed; lines
+        mentioning the enclosing 'pbds:Filters>' element are removed as well
+        when removeFiltersNode is true.
+        """
+        inputXmlFn  = os.path.join(self.testDataDir,'chunking','chunking.subreadset.xml')
+        outputXmlFn = os.path.join(self.generatedDataDir,outputFn)
+
+        blacklist = ['pbds:Filter>', 'pbbase:Properties>', '<pbbase:Property']
+        if removeFiltersNode:
+            blacklist.append('pbds:Filters>')
+
+        with open(inputXmlFn, 'r') as xml_infile:
+            inputXml = xml_infile.read()
+        outputXml = trimXmlElements(inputXml, blacklist)
+        with open(outputXmlFn, 'w') as xml_outfile:
+            xml_outfile.write(outputXml)
+
+    @fileLinker
+    def makeAlignedBamCopy(self, outputFn):
+        """Symlink outputFn in the destination to the same name in the source dir."""
+        source = os.path.join(self.testDataDir,outputFn)
+        dest   = os.path.join(self.generatedDataDir, outputFn)
+        os.symlink(source, dest)
+
+    @fileLinker
+    def makeChunkingSymlink(self, outputFn):
+        """Symlink outputFn in the destination to the source's 'chunking' subdir."""
+        source = os.path.join(self.testDataDir,'chunking', outputFn)
+        dest   = os.path.join(self.generatedDataDir, outputFn)
+        os.symlink(source, dest)
+
+    @fileLinker
+    def makeMissingPbiBam(self, outputFn):
+        """Symlink outputFn in the destination to the source's phi29.bam."""
+        source = os.path.join(self.testDataDir, 'phi29.bam')
+        dest   = os.path.join(self.generatedDataDir, outputFn)
+        os.symlink(source, dest)
+
+    @fileMaker
+    def makeChunkingXml(self, outputFn):
+        """Generate one of the chunking XML variants, selected by file name."""
+        # only the 'emptyfilters' variant keeps the enclosing Filters lines
+        removeFiltersNode = (outputFn != 'chunking_emptyfilters.subreadset.xml')
+        self.editChunkingXml(outputFn, removeFiltersNode)
+
+    @fileMaker
+    def makeNormalFasta(self, outputFn):
+        """Write the three module-level FASTA sequences as records 1, 2 and 3."""
+        content = ">1\n" + fastaSeq_1 + "\n>2\n" + fastaSeq_2 + "\n>3\n" + fastaSeq_3
+        dest = os.path.join(self.generatedDataDir, outputFn)
+        # write to the destination directory, not to whatever the cwd is
+        with open(dest, 'w') as fasta_out:
+            fasta_out.write(content)
+
+    @fileMaker
+    def makeNormalFastq(self, outputFn):
+        """Write the three module-level FASTQ records (bases plus qualities)."""
+        content = ("@1\n" + fastqSeq_1 + "\n+\n" + fastqQuals_1 + "\n" +
+                   "@2\n" + fastqSeq_2 + "\n+\n" + fastqQuals_2 + "\n" +
+                   "@3\n" + fastqSeq_3 + "\n+\n" + fastqQuals_3 + "\n")
+        dest = os.path.join(self.generatedDataDir, outputFn)
+        # write to the destination directory, not to whatever the cwd is
+        with open(dest, 'w') as fastq_out:
+            fastq_out.write(content)
+
+    @fileMaker
+    def makeTruncatedBam(self, outputFn):
+        """Copy the source's phi29.bam to outputFn and cut the copy to 200 bytes."""
+        source = os.path.join(self.testDataDir, 'phi29.bam')
+        dest   = os.path.join(self.generatedDataDir, outputFn)
+        shutil.copyfile(source, dest)
+        with open(dest, 'r+b') as in_file:
+            in_file.truncate(200)
+
+    # main entry point
+    def generate(self):
+        """Create every registered file/symlink that does not exist yet.
+
+        Changes the process working directory to the destination directory.
+        """
+
+        # the existence checks below use bare names, so work from the destination
+        os.chdir(self.generatedDataDir)
+
+        # skip file if it exists
+        for name in list(self.outputFiles.keys()):
+            if os.path.exists(name):
+                del self.outputFiles[name]
+
+        # skip symlink if it exists (lexists also counts a dangling link)
+        for link in list(self.outputSymlinks.keys()):
+            if os.path.lexists(link):
+                del self.outputSymlinks[link]
+
+        # only print message & run makers, if any files/symlinks to be created
+        # else silent success
+        if self.outputFiles or self.outputSymlinks:
+            print('Generating test data in %s ' % self.generatedDataDir)
+            for name, func in self.outputFiles.items():
+                func(name)
+            for link, func in self.outputSymlinks.items():
+                func(link)
+
+# script entry point
+# usage: generate_data.py <source test-data dir> <destination dir for generated data>
+if __name__ == '__main__':
+    g = TestDataGenerator(sys.argv[1], sys.argv[2])
+    g.generate()
diff --git a/tests/src/FastxTests.h b/tests/src/FastxTests.h

new file mode 100644 (file)

index 0000000..76fa3b8
--- /dev/null
+++ b/tests/src/FastxTests.h
@@ -0,0 +1,92 @@
+// Author: Derek Barnett
+
+#ifndef FASTXTESTS_H
+#define FASTXTESTS_H
+
+#include <string>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include <pbbam/FastaSequence.h>
+#include <pbbam/FastqSequence.h>
+
+#include "PbbamTestData.h"
+
+using FastaSequence = PacBio::BAM::FastaSequence;
+using FastqSequence = PacBio::BAM::FastqSequence;
+
+namespace FastxTests {
+
+// clang-format off
+
+// Directory holding the FASTA/FASTQ fixtures: <tests data dir>/fastx/
+const std::string fastxDataDir = PacBio::BAM::PbbamTestsConfig::Data_Dir + "/fastx/";
+
+// Paths of the "simple" FASTA fixture and its companion files.
+const std::string simpleFastaFn        = fastxDataDir + "simple.fa";
+const std::string simpleFastaFaiFn     = fastxDataDir + "simple.fa.fai";
+const std::string simpleFastaGzipFn    = fastxDataDir + "simple-gzip.fa.gz";
+const std::string simpleFastaBgzfFn    = fastxDataDir + "simple-bgzf.fa.gz";
+const std::string simpleFastaBgzfGziFn = fastxDataDir + "simple-bgzf.fa.gz.gzi";
+
+// Paths of the "simple" FASTQ fixture and its companion files.
+const std::string simpleFastqFn        = fastxDataDir + "simple.fq";
+const std::string simpleFastqFaiFn     = fastxDataDir + "simple.fq.fai";
+const std::string simpleFastqGzipFn    = fastxDataDir + "simple-gzip.fq.gz";
+const std::string simpleFastqBgzfFn    = fastxDataDir + "simple-bgzf.fq.gz";
+const std::string simpleFastqBgzfGziFn = fastxDataDir + "simple-bgzf.fq.gz.gzi";
+
+// Paths of the "chunking" FASTA/FASTQ fixtures and their .fai files.
+const std::string chunkingFastaFn    = fastxDataDir + "chunking.fa";
+const std::string chunkingFastaFaiFn = fastxDataDir + "chunking.fa.fai";
+const std::string chunkingFastqFn    = fastxDataDir + "chunking.fq";
+const std::string chunkingFastqFaiFn = fastxDataDir + "chunking.fq.fai";
+
+// Expected FASTA records seq1..seq8: odd-numbered records share one base
+// string, even-numbered records share another.
+// NOTE(review): presumably mirrors the contents of the "simple" FASTA
+// fixtures above -- confirm against the tests that use it.
+const std::vector<FastaSequence> ExpectedFasta {
+    FastaSequence{ "seq1", "ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACG"},
+    FastaSequence{ "seq2", "GCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCA"},
+    FastaSequence{ "seq3", "ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACG"},
+    FastaSequence{ "seq4", "GCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCA"},
+    FastaSequence{ "seq5", "ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACG"},
+    FastaSequence{ "seq6", "GCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCA"},
+    FastaSequence{ "seq7", "ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACG"},
+    FastaSequence{ "seq8", "GCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCA"}
+};
+
+// Expected FASTQ records: same names and bases as ExpectedFasta, plus a
+// quality string per record (raw literals, so backslashes and quotes are
+// taken verbatim). seq7 and seq8 reuse the quality strings of seq1 and seq2.
+const std::vector<FastqSequence> ExpectedFastq {
+    FastqSequence{
+        "seq1",
+        "ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACG",
+      R"(ZABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~)"},
+    FastqSequence{
+        "seq2",
+        "GCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCA",
+      R"(~}|{zyxwvutsrqponmlkjihgfedcba`_^]\[ZYXWVUTSRQPONMLKJIHGFEDCBA@)"},
+    FastqSequence{
+        "seq3",
+        "ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACG",
+      R"(!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_)"},
+    FastqSequence{
+        "seq4",
+        "GCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCA",
+      R"(_^]\[ZYXWVUTSRQPONMLKJIHGFEDCBA@?>=<;:9876543210/.-,+*)('&%$#"!)"},
+    FastqSequence{
+        "seq5",
+        "ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACG",
+      R"(;;>@BCEFGHJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~)"},
+    FastqSequence{
+        "seq6",
+        "GCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCA",
+      R"(~}|{zyxwvutsrqponmlkjihgfedcba`_^]\[ZYXWVUTSRQPONMLKJHGFECB@>;;)"},
+    FastqSequence{
+        "seq7",
+        "ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACG",
+      R"(ZABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~)"},
+    FastqSequence{
+        "seq8",
+        "GCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCA",
+      R"(~}|{zyxwvutsrqponmlkjihgfedcba`_^]\[ZYXWVUTSRQPONMLKJIHGFEDCBA@)"},
+};
+
+
+
+} // namespace FastxTests
+
+#endif  // FASTXTESTS_H
+\ No newline at end of file
diff --git a/tests/src/PbbamTestData.h.in b/tests/src/PbbamTestData.h.in

new file mode 100644 (file)

index 0000000..3a620af
--- /dev/null
+++ b/tests/src/PbbamTestData.h.in
@@ -0,0 +1,23 @@
+// Author: Derek Barnett
+
+#ifndef PBBAMTESTDATA_H
+#define PBBAMTESTDATA_H
+
+#include <string>
+
+namespace PacBio {
+namespace BAM {
+namespace PbbamTestsConfig {
+
+// The @...@ placeholders are filled in when this template is configured by
+// the build (see the configure_file() call in tests/meson.build).
+
+// tests source directory
+const std::string Source_Dir = std::string("@PacBioBAM_TestsDir@");
+// tests build directory (the placeholder keeps a CMake-style name;
+// tests/meson.build sets it to the tests' build directory)
+const std::string Bin_Dir    = std::string("@CMAKE_CURRENT_BINARY_DIR@");
+// checked-in test data: the 'data' directory under the tests source directory
+const std::string Data_Dir   = std::string("@PacBioBAM_TestsDir@/data");
+// locations of build-time generated files and generated test data
+const std::string Generated_Dir     = std::string("@GeneratedDir@");
+const std::string GeneratedData_Dir = std::string("@GeneratedTestDataDir@");
+// path of the bam2sam program inside the configured tools directory
+const std::string Bam2Sam    = std::string("@PacBioBAM_BinDir@/bam2sam");
+
+} // namespace PbbamTestsConfig
+} // namespace BAM
+} // namespace PacBio
+
+#endif // PBBAMTESTDATA_H
diff --git a/tests/src/cram/bam2sam.t.in b/tests/src/cram/bam2sam.t.in

new file mode 100644 (file)

index 0000000..9b93994
--- /dev/null
+++ b/tests/src/cram/bam2sam.t.in
@@ -0,0 +1,47 @@
+Setup:
+
+  $ BAM2SAM="@PacBioBAM_BinDir@/bam2sam" && export BAM2SAM
+
+  $ DATADIR="@PacBioBAM_TestsDir@/data" && export DATADIR
+
+Normal:
+
+  $ $BAM2SAM < $DATADIR/phi29.bam | head -n 5
+  @HD\tVN:3.0.0\tSO:unknown\tpb:3.0.1 (esc)
+  @RG\tID:a955def6\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;Ipd:CodecV1=ip;BINDINGKIT=100356300;SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3.0.0.140018;FRAMERATEHZ=75.000000\tPU:m140918_150013_42139_c100697631700000001823144703261565_s1_p0 (esc)
+  @PG\tID:bam2bam-0.20.0\tPN:bam2bam\tVN:0.20.0 (esc)
+  @PG\tID:bax2bam-0.0.2\tPN:bax2bam\tVN:0.0.2 (esc)
+  m140918_150013_42139_c100697631700000001823144703261565_s1_p0/30422/2067_4072\t4\t*\t0\t255\t*\t*\t0\t0\tAAGTCATGTATAGAGTTATTGGCTCAGCGGTGGCAAGCAGCCAACTCAGCTCCTTTCGGGCTTGTTAGCAGCCGGATCCACACTCTGAAATTCCTGCAGCTCGAGTTATTTGATAGTAAAAGTGGGTCATCAAACCGCAACTACGCCACCCCGGTACCTGAACAGGCTTCGGTTTCATTTTGAGACGAGAAAAACCCACTTTGAAGTTTTCGAAAATCACTTCCTTTTTTGATTTTGTCCGTCATGCCTGCGCATTTCACAAGAGAACTTGATGTCAGTGTTAGTCGTCAGGAGAGCCTCTACCAGTTTGCCGTCTACTTCTTTCATGTAAATATCCTGGATGTAAGGTTTTCTGACGCAGATATTGTTGGCAACGCTTAAAAAGTGGGATTCGTGTGGCCAGTAGCCCAGGTTATCTTTCGGGTCCACGATGTCTTTGATAACGTCCAGAATTTCAGTACCAGTTCAGGTGAATAGAACGGTATCGCAGTTTCAGATAATACGATCATGACAGCCTGCGCTGGCTGTGATGGTCGTATAGCGTGCCCAGGCCGTGATAAAGACACCCATCCGGGGTATATACAGGGTCTTTCGTTTCCTCCCTAGCCTAAGAAAGAAAACCCAGAGCACCGTTCTCTTCAAGGTATGGCACTTTACCGTAACGTCCGGGTTGGAAAGCGAAATTTACCGTACAGGCCTGGTTCAGCATCAGCTTCCCGCCAGCTGTTTGATGGCGTCCCTTCAGAGGTAGTTTTGATATACGTCCATTTTGTCGATAAAGTCCTTGAACAGGCAAGTGGTTCCTTTGAAACTTCAGACCAAGAGGATAATATTCAACATTGTACAGGTCGTAGTGTCTTTCATCAGTTCCAGATCAAACATTAGAACAGCCACAAGGTCAGCAAATTTCACCGCCGGAAGATTTCAGGTAATTCGTTACGCTTGTAAAGAAATTGGGACGCTTGATCTGGGATGGTCGGGATCTAGCCTCCTTCAGTTCAAATTCACTTAACGAATGTGCTGAATGTGCCAGAGGGTAATCCTCGTCCAAAGACGGTATTTACCCTCGAAACGATCGGCTCGCCGTACGGCAACGCAGGCGAGAGTACATTCTGCAGCTGGGTACAGGGAATTAAACATCAAAAACCATACCTTCGCCGATCTCTTTTTCCTTTAAAACGTCATTCAGCCACGGTGAAGACCAACCACGGTAAGGCATAACGAACTTCCTTGTCCAGACCCAGGCTCAGGGTCCGGGAAAACCTTTTTAAACTTCTTGTGGTGATAATGTCTTTGAAGCCTTTCAGAAGTCAGAACCTGCCGTCATACGATCCAAGACCACTGCTTGAACCTGGATCAGCAGACTTCTGCGATAATCTGAATATCGTTTTTAATGTAGGCAATATTTCTTCCGGGTGATTTTGTAACCGACCGGGCGCTCATTATGATAAGTCGATGTCGCCTTTCAGTAGCCGTCAGTTAAAAGTCTTTCGCAATTTTCTTAACGCGGAAAGGCAAGTTTCTAGAGAGTCGTAAGATCACGTGTGGAATCTTGCGTTTACCCTTGTAACCCAGGCAATACAATCATATAACCACTGGCCCAATGCGAGAAATGATGGTGTTGTAGGTATTTTGGCAGACCATCTGCGGACCGCATTTAAAACGTTACGTTCCAGCCAGTTGATGATGAAATGCGCCATCAAAATTTCAGATTTGTGGAAGTACAGGTCCCAGCCTGAAAACTTTGCAGAAACCCAAGCCATAAAATTCATCCAGGGAGTTACCAATCTTATACTCGGAGTGGTCTTCGATGTTCATTGTAACCGTATGCCCATACGCGGCAATCTTCAAGCCTTGGTGTAGTCTCAAAGTCGCAGGAATACATTTACGTG
GCATGTGTTTCATATGTCATTCCTTCTTAAAGTTAAACAAAATTATTTCTAGAGGGGAATTGTTTATCCGCCTCACATTTCCCTATAGTGAGTCGTATTAATTTCGCGGGATC\t*\tRG:Z:a955def6\tbc:B:S,1,1\tbq:i:1\tcx:i:31\tnp:i:1\tqe:i:4072\tqs:i:2067\trq:f:0.88458\tsn:B:f,22.8448,13.8689,14.6461,14.3552\tzm:i:30422 (esc)
+
+Explicit Filename (not stdin):
+
+  $ $BAM2SAM $DATADIR/phi29.bam | head -n 5
+  @HD\tVN:3.0.0\tSO:unknown\tpb:3.0.1 (esc)
+  @RG\tID:a955def6\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;Ipd:CodecV1=ip;BINDINGKIT=100356300;SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3.0.0.140018;FRAMERATEHZ=75.000000\tPU:m140918_150013_42139_c100697631700000001823144703261565_s1_p0 (esc)
+  @PG\tID:bam2bam-0.20.0\tPN:bam2bam\tVN:0.20.0 (esc)
+  @PG\tID:bax2bam-0.0.2\tPN:bax2bam\tVN:0.0.2 (esc)
+  m140918_150013_42139_c100697631700000001823144703261565_s1_p0/30422/2067_4072\t4\t*\t0\t255\t*\t*\t0\t0\tAAGTCATGTATAGAGTTATTGGCTCAGCGGTGGCAAGCAGCCAACTCAGCTCCTTTCGGGCTTGTTAGCAGCCGGATCCACACTCTGAAATTCCTGCAGCTCGAGTTATTTGATAGTAAAAGTGGGTCATCAAACCGCAACTACGCCACCCCGGTACCTGAACAGGCTTCGGTTTCATTTTGAGACGAGAAAAACCCACTTTGAAGTTTTCGAAAATCACTTCCTTTTTTGATTTTGTCCGTCATGCCTGCGCATTTCACAAGAGAACTTGATGTCAGTGTTAGTCGTCAGGAGAGCCTCTACCAGTTTGCCGTCTACTTCTTTCATGTAAATATCCTGGATGTAAGGTTTTCTGACGCAGATATTGTTGGCAACGCTTAAAAAGTGGGATTCGTGTGGCCAGTAGCCCAGGTTATCTTTCGGGTCCACGATGTCTTTGATAACGTCCAGAATTTCAGTACCAGTTCAGGTGAATAGAACGGTATCGCAGTTTCAGATAATACGATCATGACAGCCTGCGCTGGCTGTGATGGTCGTATAGCGTGCCCAGGCCGTGATAAAGACACCCATCCGGGGTATATACAGGGTCTTTCGTTTCCTCCCTAGCCTAAGAAAGAAAACCCAGAGCACCGTTCTCTTCAAGGTATGGCACTTTACCGTAACGTCCGGGTTGGAAAGCGAAATTTACCGTACAGGCCTGGTTCAGCATCAGCTTCCCGCCAGCTGTTTGATGGCGTCCCTTCAGAGGTAGTTTTGATATACGTCCATTTTGTCGATAAAGTCCTTGAACAGGCAAGTGGTTCCTTTGAAACTTCAGACCAAGAGGATAATATTCAACATTGTACAGGTCGTAGTGTCTTTCATCAGTTCCAGATCAAACATTAGAACAGCCACAAGGTCAGCAAATTTCACCGCCGGAAGATTTCAGGTAATTCGTTACGCTTGTAAAGAAATTGGGACGCTTGATCTGGGATGGTCGGGATCTAGCCTCCTTCAGTTCAAATTCACTTAACGAATGTGCTGAATGTGCCAGAGGGTAATCCTCGTCCAAAGACGGTATTTACCCTCGAAACGATCGGCTCGCCGTACGGCAACGCAGGCGAGAGTACATTCTGCAGCTGGGTACAGGGAATTAAACATCAAAAACCATACCTTCGCCGATCTCTTTTTCCTTTAAAACGTCATTCAGCCACGGTGAAGACCAACCACGGTAAGGCATAACGAACTTCCTTGTCCAGACCCAGGCTCAGGGTCCGGGAAAACCTTTTTAAACTTCTTGTGGTGATAATGTCTTTGAAGCCTTTCAGAAGTCAGAACCTGCCGTCATACGATCCAAGACCACTGCTTGAACCTGGATCAGCAGACTTCTGCGATAATCTGAATATCGTTTTTAATGTAGGCAATATTTCTTCCGGGTGATTTTGTAACCGACCGGGCGCTCATTATGATAAGTCGATGTCGCCTTTCAGTAGCCGTCAGTTAAAAGTCTTTCGCAATTTTCTTAACGCGGAAAGGCAAGTTTCTAGAGAGTCGTAAGATCACGTGTGGAATCTTGCGTTTACCCTTGTAACCCAGGCAATACAATCATATAACCACTGGCCCAATGCGAGAAATGATGGTGTTGTAGGTATTTTGGCAGACCATCTGCGGACCGCATTTAAAACGTTACGTTCCAGCCAGTTGATGATGAAATGCGCCATCAAAATTTCAGATTTGTGGAAGTACAGGTCCCAGCCTGAAAACTTTGCAGAAACCCAAGCCATAAAATTCATCCAGGGAGTTACCAATCTTATACTCGGAGTGGTCTTCGATGTTCATTGTAACCGTATGCCCATACGCGGCAATCTTCAAGCCTTGGTGTAGTCTCAAAGTCGCAGGAATACATTTACGTG
GCATGTGTTTCATATGTCATTCCTTCTTAAAGTTAAACAAAATTATTTCTAGAGGGGAATTGTTTATCCGCCTCACATTTCCCTATAGTGAGTCGTATTAATTTCGCGGGATC\t*\tRG:Z:a955def6\tbc:B:S,1,1\tbq:i:1\tcx:i:31\tnp:i:1\tqe:i:4072\tqs:i:2067\trq:f:0.88458\tsn:B:f,22.8448,13.8689,14.6461,14.3552\tzm:i:30422 (esc)
+
+Header-Only:
+
+  $ $BAM2SAM --header-only < $DATADIR/phi29.bam | head -n 5
+  @HD\tVN:3.0.0\tSO:unknown\tpb:3.0.1 (esc)
+  @RG\tID:a955def6\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;Ipd:CodecV1=ip;BINDINGKIT=100356300;SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3.0.0.140018;FRAMERATEHZ=75.000000\tPU:m140918_150013_42139_c100697631700000001823144703261565_s1_p0 (esc)
+  @PG\tID:bam2bam-0.20.0\tPN:bam2bam\tVN:0.20.0 (esc)
+  @PG\tID:bax2bam-0.0.2\tPN:bax2bam\tVN:0.0.2 (esc)
+
+No-Header:
+
+  $ $BAM2SAM --no-header < $DATADIR/phi29.bam | head -n 5
+  m140918_150013_42139_c100697631700000001823144703261565_s1_p0/30422/2067_4072\t4\t*\t0\t255\t*\t*\t0\t0\tAAGTCATGTATAGAGTTATTGGCTCAGCGGTGGCAAGCAGCCAACTCAGCTCCTTTCGGGCTTGTTAGCAGCCGGATCCACACTCTGAAATTCCTGCAGCTCGAGTTATTTGATAGTAAAAGTGGGTCATCAAACCGCAACTACGCCACCCCGGTACCTGAACAGGCTTCGGTTTCATTTTGAGACGAGAAAAACCCACTTTGAAGTTTTCGAAAATCACTTCCTTTTTTGATTTTGTCCGTCATGCCTGCGCATTTCACAAGAGAACTTGATGTCAGTGTTAGTCGTCAGGAGAGCCTCTACCAGTTTGCCGTCTACTTCTTTCATGTAAATATCCTGGATGTAAGGTTTTCTGACGCAGATATTGTTGGCAACGCTTAAAAAGTGGGATTCGTGTGGCCAGTAGCCCAGGTTATCTTTCGGGTCCACGATGTCTTTGATAACGTCCAGAATTTCAGTACCAGTTCAGGTGAATAGAACGGTATCGCAGTTTCAGATAATACGATCATGACAGCCTGCGCTGGCTGTGATGGTCGTATAGCGTGCCCAGGCCGTGATAAAGACACCCATCCGGGGTATATACAGGGTCTTTCGTTTCCTCCCTAGCCTAAGAAAGAAAACCCAGAGCACCGTTCTCTTCAAGGTATGGCACTTTACCGTAACGTCCGGGTTGGAAAGCGAAATTTACCGTACAGGCCTGGTTCAGCATCAGCTTCCCGCCAGCTGTTTGATGGCGTCCCTTCAGAGGTAGTTTTGATATACGTCCATTTTGTCGATAAAGTCCTTGAACAGGCAAGTGGTTCCTTTGAAACTTCAGACCAAGAGGATAATATTCAACATTGTACAGGTCGTAGTGTCTTTCATCAGTTCCAGATCAAACATTAGAACAGCCACAAGGTCAGCAAATTTCACCGCCGGAAGATTTCAGGTAATTCGTTACGCTTGTAAAGAAATTGGGACGCTTGATCTGGGATGGTCGGGATCTAGCCTCCTTCAGTTCAAATTCACTTAACGAATGTGCTGAATGTGCCAGAGGGTAATCCTCGTCCAAAGACGGTATTTACCCTCGAAACGATCGGCTCGCCGTACGGCAACGCAGGCGAGAGTACATTCTGCAGCTGGGTACAGGGAATTAAACATCAAAAACCATACCTTCGCCGATCTCTTTTTCCTTTAAAACGTCATTCAGCCACGGTGAAGACCAACCACGGTAAGGCATAACGAACTTCCTTGTCCAGACCCAGGCTCAGGGTCCGGGAAAACCTTTTTAAACTTCTTGTGGTGATAATGTCTTTGAAGCCTTTCAGAAGTCAGAACCTGCCGTCATACGATCCAAGACCACTGCTTGAACCTGGATCAGCAGACTTCTGCGATAATCTGAATATCGTTTTTAATGTAGGCAATATTTCTTCCGGGTGATTTTGTAACCGACCGGGCGCTCATTATGATAAGTCGATGTCGCCTTTCAGTAGCCGTCAGTTAAAAGTCTTTCGCAATTTTCTTAACGCGGAAAGGCAAGTTTCTAGAGAGTCGTAAGATCACGTGTGGAATCTTGCGTTTACCCTTGTAACCCAGGCAATACAATCATATAACCACTGGCCCAATGCGAGAAATGATGGTGTTGTAGGTATTTTGGCAGACCATCTGCGGACCGCATTTAAAACGTTACGTTCCAGCCAGTTGATGATGAAATGCGCCATCAAAATTTCAGATTTGTGGAAGTACAGGTCCCAGCCTGAAAACTTTGCAGAAACCCAAGCCATAAAATTCATCCAGGGAGTTACCAATCTTATACTCGGAGTGGTCTTCGATGTTCATTGTAACCGTATGCCCATACGCGGCAATCTTCAAGCCTTGGTGTAGTCTCAAAGTCGCAGGAATACATTTACGTG
GCATGTGTTTCATATGTCATTCCTTCTTAAAGTTAAACAAAATTATTTCTAGAGGGGAATTGTTTATCCGCCTCACATTTCCCTATAGTGAGTCGTATTAATTTCGCGGGATC\t*\tRG:Z:a955def6\tbc:B:S,1,1\tbq:i:1\tcx:i:31\tnp:i:1\tqe:i:4072\tqs:i:2067\trq:f:0.88458\tsn:B:f,22.8448,13.8689,14.6461,14.3552\tzm:i:30422 (esc)
+  m140918_150013_42139_c100697631700000001823144703261565_s1_p0/30422/4151_6183\t4\t*\t0\t255\t*\t*\t0\t0\tGATCCCGCGAATTAATTACGACTCACTATAGGGGAATTGTGAGCGGATAACAATTCCCGCCTCTAGAAATAATTTTGTTTAAACTTTTAAGAAAGGAGATATTACATATGAAACACAGCCACGTAAAATGTATTCCTGCGACTTGGAGACTACCACCAAGGTGAAGATTTGCCGCGTAATGGGCATACGGTTTACATGAAACATCGAAGAACAAACTCGAGTATAAGATTGGTAACTCCCCTGGATGAATTATGGCTTGGGTTACTGAAAGTTCGAGGTCTGACCTGTACTTCGCACAAATCTGAAAATTTGATGGCCGCAAATTTCAATTCATCACTGGCTGGAACGTAAACGGTTTTAAATGGTCCGCAGATCGGTCTGTGCCAAATACCCTGATCAACACATCATTTCTTCGCAATGCGGCCAGTGTAATGATTGATATCTTGCCCTGGGTTGACAAGGGGTAAACGCAAGATCCACACCTGTGATCTACGACCTCTCTGAAGAAAACTGCGTTTCCGGTTAAGAAAATTGCGAAAGACTTTAAGCTGAACGGTACTGAAAAGCGACATGACTATCATAATGAGCGCCCGGTCGTTACAAAATCACCCCGGAAAGAATATGCCTACATTTAAAAACGATATTCAGATTATCGCAAGAACTCTGCTGATCAGTTCAAGCAAGGGTCTGGATCGTAAATGACGGCAGGTTCTGACTCTCCTGAAAGGCTTCAAAAGACATTATCACCACCTAAAAGAAGTTTAAAAAGGTTTTTTCACCGACCCTGAGCCTAGGGCTGGACAAGGAAAGTTGTTAATGCCCATACCGTGGTGGTTTCACCTGGCTGAAAGACCGTTTTAAAGAAAAAGAGATCGGCGAAGGTATGGTTTTTGATGTTAATTCCCTGTAACCAAGCCTTCAATGTACTCTCGCCTGCTTGCCGTCACACGGGCGAGCGACGTATTCGAAAGGGTAAAATACGTTCTGGGACGGAGGATTTACCCTCTGCAATTCGGCACATTCCGTTGTGAATTTGGAACTGAAAGGAAGGCTTAGATCCCGACCATCCCAGATCAAGCGTTCCCATTTCTAACAAAGGGTAACGAATACCTGAAATCTTCCAGGCGGTGAAATTGCTGACCTGTGGCTGTCTAAATGTTTGATCTTGGAAACTGATGAAAGAGCACTACGACCTGGTACAATGTTGAATATATCTCTGGTCTGAAGTTCAAAGCAACCACTGGCCTGTTCAAGGACTTTATCGACAATGGACGTATATCAAAAGACTACCTCTGAAGACGCCATCAAACAGCTGGCGAAGCTGATGCTGACAAGCCTGTACGGTAAATTCGCGTCCCACCCGGACGTTTACCGGGTAAAGTGCCATATGCTGAAAGAGAAAGCGGTGCTCTGGTTTTTCGTCTAGGTGGAAGGAGGAAACGAAGACACTGTATATACCGCCGAATGGGTGTCTTTATCCAAGCGGCCTGGCACGCTATACGACCATCACAGGCAAGCGCAGGCTTTTGTTAATGATCGTATTATCTACTGCGATTACCGATTCTACTTCACTGACTGGTACTGAAATCTGGACGTTATCAAAGACATCGTAGACCCGAAGAAACTGGGCTACTGGCACACGAATTCCACTTTAAGCGTGCAAAATATCTGCGTCAGAAAACCTACATCCCAGGATATTTACATGAAAGAAGTAGACGGCAAACTGGTAGAGGGCTCTCCGTGACGACTACACTGACATCAAGTTCTCTGTGAAATGCGCAGGCAAATGACGGCACAAAATCCAAAAAAGGAAGTGACTTTCGAAAACTTCAAAAGTGGGTTCTCGTAAAATGAAACCGAAAGCTGTTCAGGTTTAAACCCGGGTGGCGTAG
TGCCTGGTTGATGAACACTTTTTACTATCAAAATAACTTCGAAAGCTGCAGGAATTCAAGCTGATCCGGCTGCTAACAAAGCCCGAAGGAAGCTGAGTTGGCTGCTGCCACCGTGAGCAATACTCTAAATACATGACTCT\t*\tRG:Z:a955def6\tbc:B:S,1,1\tbq:i:1\tcx:i:31\tnp:i:1\tqe:i:6183\tqs:i:4151\trq:f:0.88458\tsn:B:f,22.8448,13.8689,14.6461,14.3552\tzm:i:30422 (esc)
+  m140918_150013_42139_c100697631700000001823144703261565_s1_p0/30422/6234_8214\t4\t*\t0\t255\t*\t*\t0\t0\tAGAGTCATGTATAAGAGTTATTGCTCAGCGGTGGCAGCAGACAACTCAGCTTCCTTTCGGGCCTTTGTTAGCAGCCGGATCCAAGCTTGAATTCCTGCAAGCTCGAGTTATTTGATAGTAAAAGTGTCATCAAACCAGCACTACGGCCGAACCCGGTACCTGAACAGATTCGTTTCATTTTACGAGAAAAACCCACTTTGAAGTTTTGCCGAAAGTCACTTCTTTTTGATTTGTCCGTCATGCTGCGCATTTCACAGAGACTTGAATGTCAGTGTAGTCGTCATCGGGGGGGGGAAGAGCCCTCTACCAGTTTTGCCGTCTACTTCTTTCATGTAAATATCTGGATGTAGGTTTTCTGAACGCAGATATTTGCAGCTTAAAAGTGGATTCGTGTGCCCAGTAGCCCGTTTTCTTCGGGTCCTACGATGTCTTTTGATAACGTCCAGAATTTCAGTACCAGTCAGGTGAATAGAATCGGTATCGCAGTAGATAAATACGATCATAACAAGCCTGCGCTGCCTGTTGATGGTCGTATAGCGTGCCCAGGCCCGTGATAAAGAACCATCGGGGTATATAACAGGGTCTTTCGTTCCTCCTCACCTAGACGAAAAACCCAGAGCACCGTTCTCTTTCAAGGTATGGCCTTTACCGGTAACGTCCGGGTTGGACGCGAATTTAAGCCGTAACAGGCTGTCTCAGCATACAGCTTTCGCCAGCCTGTTTGATGGCCGTCTTCAGAGGTAGTTTTGATATACGTTCCATTTGTCGATAAAGTCCCTTGAGCAGGCCCAGTGGGTTGGCTTTGAACTCAGACGCAGAATATATTCAACATTGTAACAGGTCGTAGTGCTCTTTCATCAGTTCAGATTCAACATTAGACAAGCCACAGGTCAGCATTTCACCGCCGGGAAGAATTTCAAGGTATTCGTTTACCCTTGGTAGAAATGGAACGCTTGTAATCTGGATGGTCGGGATCTAGCCTTTTCAGTTCAAAATTCACACGAATGTGCTGAATGTGCAGAGGGTAATACCTCGTCCCAGACGTATTTACCCTCGAATAAGCGAATCGGCTCGCCGTATCGCAGCAGGCGAGAGTAAACATTTGAGCTGGGTAACAGGGAATTACATCCAAAACCATACCCTTTCGCACGATCTCTTTTTCTTTAAAACGTCATTCAGCCAGGTGAAACCACCAGGTAGGCATAACGAACTTCCTGTCCAGACCCAAGGCTCAGGTCGGGAAAACTTTTTAAACTTCCTTGTGGTGATAATGTCTTTTGAAGCTTTCAGAGAGTCAGAACCTGCGTCATACGATCCAGACCGCTGCTTGAAGCTGGATCAGCAAGCTTCTGCGATAATCTGAATATCGTTTATTAATTAGGCATATTCTTCCGGGTGATTTTGTAACCGACCGGGCGCTCATTATGATAGTCGATGTCGCCCCTTTCAAGTACCGTCAGCTAAAGTCTTTCGCATTTTCTTACCGGAAACGGCAGTTTCTTCAGAGAGTCGTAAGATCACGTGTGGATCTTGCGTTTACCGCTTGTAACCCAGGCAAATATCAATCATATACCACTGGCCATGCGAGAAATGATGGTGTTGTAGGTTATTGGCAGACCATCTGCGGACCATTTTAAAACCGTTACGTTCAGCCAGTTGATGAATGAATGCGCCATGCAAATTTCAGATTGTGGAAGTACAAGGTCAGCCTGACTTTCAGAACCCAAAGCCATAAATTCATCCAGGGAGTTACCATCTTATACTCCGGAGTTGGTCTTCGATGTTCATGTAACCGTATGCCCATACGCGGCAAATCTTCAACCTTGGTGTGTAGTCCTCAAAGTCGCAGGAATACATTTACGTGGCATGTGTTTTCATAATGTATATCTCCT
TCTTAAAGTTAAACAAAATTATTTCTAGAGGGGAATTGTTATCCCGCTCACAATCCCCTATCAGTGAGTCGTATTAATTTCGCGGATC\t*\tRG:Z:a955def6\tbc:B:S,1,1\tbq:i:1\tcx:i:31\tnp:i:1\tqe:i:8214\tqs:i:6234\trq:f:0.88458\tsn:B:f,22.8448,13.8689,14.6461,14.3552\tzm:i:30422 (esc)
+  m140918_150013_42139_c100697631700000001823144703261565_s1_p0/30422/8294_10277\t4\t*\t0\t255\t*\t*\t0\t0\tGATTCCCGCGAAATTAATACGAATCACTATAAGGGGAATTGTGAGCGGATAACAATTCCCCTCTAGAAATAATTTTGTTTAACTTTAAGAGGGACGATATACATATGAACACATGCCTACGTAAAATGTATTCCTGCGAACTGTTGAGACTACCACCAAGGTTGAAGATTTGCCGCGTAATGGGCATACGGTTACATGAACATCGAAGACCACTCCGATATGAAGATTGGTTAACCCCTGGATGAATTTATGGCTTGGGTTCTGAAAGTTCAGGCTGACCTGTACTTCACAATCTGAAATTTGATGGCCGCATTCATCAATCACTGGCTGGAACGTAAAACGGTTTAAAAATGGTCCCGCAGATGGTCTGACAAATTAACTACAACACCATCATTTCTCGCATGGGCCCAGTGGTATATGAAATTGATATTTGCCTGGGTTACAAGGAGGTAAACGCAAGATCCACACGTGGATCTACGACTCTTCTGAAGAAACCTGGCCGTTTCCGTTAAGAAAATGCGAAAGAACTTAAGCTGACGGTAACTGAAAGGCGACATCGACTATCATATAATGAAGCGCCCGTCGTTACAAAATCACCCCGGAAGAATATGCCTTACATTAAAAAACGATATTCAGATTTCGCAGAAGCTCTGCTGATCCAGTTCAAAGCAGGGTCCTGGATCGTAATGACGGCAGGTTCTGACTCTCTGAAAGGCTTCAAAGAACATTATCACCCACCAAGAAGTTTAAAAAGGTTTTCCCGACAACTGAGCCTGGGTCTGGACAAGGAAGTTTCGTTTGCCTACCGTGGTGGTTTTCAACCTGCTGACTGAACCGTTTTAAAAGAAAATAGAGATCGGCGGAAAGGTATGGTTTTTGATGTTAATTCCTGTAACCAGCCTCAAAATGTACTCTCGCCTGCTGCCGTACGGCGGCCGATCGTATTCGAAGGGTAAATACGTCTGGGACCGAGGATAGCCCTCTGCACATTCAGCACATTCGTTGTGAAATTTGAACTGAAGGAAGCTGATCCCGACGCATCCAGATCAAGCGTTCCCATTTTCTACAAGGTAACGAATACCTGAAATCTTCCCGGCGGTGAAATTGCTGCCTGTGGCTGTCTAATGTTGATCTGGAAACTGATGAAAGAGCACTACGAGACCTGTACAATGTTGAATATATCTCTGGTCTGAAGTTCAAAGCAACCACTGGCCTGTTCAAGGACTTTATCGACAAATGGCGTATTATCAAAACTACCTCTGAAGACGCCATCAAACAGCTGGCGAAGCTGATGCTGACAGCCTGTACGGTAAATTCGCGTCGCAACCCGGACGTTTCCGTAAAGTGCCCATACCTGAAAGAGAAACGGTGCTCTGGGTTTTCGTCTAGGTGAGGAGGAAACGAAAGACCCTGTAATATACCCGATGGTGTCTTTTATCACGGCCTGGGCACGCTAGTACGACCAATCACAGCAGCGCAGGCTTGTTATGATCGTATTTCTACTGCGGATACCGATTCTATTCCACCTGACTGGTACTGAAATTCTGGAACGTTATCAAAGACATCGTAGACCCGAAGAAACTGGGCTACTGGGGCACCACGAATCCACTTTTAAGCGTGGCAAAATATCTGACGTCAGAAAACCTACATCCAGGATATTTACATGAAAGAAGTAGACGGCAACTGTAGAGGGCTCTTCCTGACGAACCTACACTGACATCAAGTTCTCTGTGAAATGCGCAGGCATGACGGACCAAAATCAAAAAGGAAGTGAACTTTTCGAAAACTTCAAAGTGGGTTTTCTCGTAAAATGAAACCGAAGCCTGTCAGGTACCGGGTGGCGTAGTGCTGGTTGATCGGACACTTTACTATCAATAACTCGAGCTGCAGA
ATTCCAAGCTTGGATTCCGGCTGCTAACAAAGCCCGAAAGGAAGCTGAGTTGGCTGCTGCACCGCTGAGCAATAACTCTATACATGACTCAT\t*\tRG:Z:a955def6\tbc:B:S,1,1\tbq:i:1\tcx:i:31\tnp:i:1\tqe:i:10277\tqs:i:8294\trq:f:0.88458\tsn:B:f,22.8448,13.8689,14.6461,14.3552\tzm:i:30422 (esc)
+  m140918_150013_42139_c100697631700000001823144703261565_s1_p0/30422/10327_12283\t4\t*\t0\t255\t*\t*\t0\t0\tAGAGTCATGTATAGAGTTATTGCTCAGCGGTGGCAGCACCAACTCAGCTTCCTTTCGGCTTTGTTAGCAGCCGATCCAAGCTTGAATTCCTGCAGCTCGGAGTTATTTGATAGTAAAAGTTGTCATCCAAACGCAGCACTACGCCCACCCGTACCTGAACAGGCTTTCGGTTTCATTTTACGAGAAAAACACTTTTGAAAGTTTTCGAAAGTCACTTCCTTTTTTGATTTTGTCCGTCATGCCTGCGCATTTCACAGAGAACTTGATGTCAGTGTAGTCGTCAGGAGAGCCCTCTACCAGTTTGCCGTCTACTTCTTTCATGTAAATATCCTGGAATGTAGGTTTTTCTGACGCAGATTATTTTGCACGCTTAAAAGTGGATTCGTGTGGCCCCAGTAGCCCAGTTTCTTCGGTCTACGATGTCTTTGATACGTCCAGAATTTCAGTAAACAGTCAGGTGAATAGAAATCCGGTATCGCAGTAGAATAATACGATCATAACAACCTGCGCTGCTGTGTGGTCGTATAGCGTGCCCAGGCCGTGATAACAGACACCTCGGGGTAATATACAGGGTCTTTCCGTTCCTCCTCAACCTAGACGAAACCCAGAGCACCGTTCTCTTTTCAGGTATGGCACTTTAACCGGTACGTCCGGGTTGGACGCGAATTTACCGTAGCAGGCTGTTCAGCATCAGCTTTCGCCAGCCTGTTTGATGGCGCTCTTCAGAGGTAGTTTGAATATACGTCCATTTGTCGAATAAAGTCCTTGGAACAGGCCCAGTGGTTGCTTTGAACTTCCAGACCAGAGATATATTTCAACATTGTACAGGTCGTAGTGCTCTTTCCACTCAGTTCCAGATCAACATTAAGACAGCCACAGGTCAGATTTCCCCGCCGGAAGATTCAGGTAATTCTAGTTACCCTTGTAGAAATGGCGACGCTTGATCTGGATGGTCGGGATCCTAGCTTCCCTTCAGTTCAAATTCACAACGAATGTTGCTGAATCTGTGCAGAGGGTAATCCTCGGTCCAGACGTATTTACCCTCGAATACGATGCTCGCCGTACGGCAGCAGCGAGAGTACATTTGAGCTGGTACAGGGAATTAACATCAAAAAACATACTTCGCCGATCTCTTTTTCTTTAAAACGGTCATTCAGCCAGGTGAAACCACCACGGTAGGCATAACGAAACTTCCTGTCCAGACCCAGGCTCAGGTCGGAAAACTTGTTAAACTTCTTGGTGGTGATAATGTCTTTGAAAGCCTTTCAGGAAGTCAGAACCATGCCGTCATCCGATCCAGACCCCTGCTTTGAACTGGAATCAGCAGAGGCTCTGCGATAATCGAATATCGTTTTTAAATGTAGGCATATTTTCTTCGGGGTGATTTGTAACGCGACCGGGCGCTCATTATGATAGTCGATGTCGCCTTTCAGTACCGTCAGCTTAAAGTCTTTCGCAATTTTCTTAACCGACGGCAGTTTCTTCAGAGAGGTCGTAGATCACGGTGTGGATCTTGCGTTTACCCTTGTAACCAGGCAAATATCAATCATATACCACTGGCCCATGCGAGAATGATGGTGTTGTAGGTATTTGGCAGACGCATCTGCGGACCATTTAAACCGTTACGTTCCAGCCAGTTGATGATGAATGCGCCCATCATTTCAGATTTGTGGAAGGTACAGGTCAGCCTGAACTTGTCAGAAACCCAAGCCATAAATTCATCCAGGGAGTACATCTTATAATCTCGAAGTGGTCTTCGATGTTCATGTAACCGTATGCCCATACGCGCAATCTTCACCTTGGTGGTAGTCTGCAGTCGCAGAATAATTTTACGTGGCATGTGTTTCATATGTTATTAGTCTCCTTCTTAAAGTTAAACAAAATTATTTT
TAGAAGGGGAATTGTTATCCGCTCACAATTCCCCTATAGTGGAGTCGTATTAATTTCGCGGGTATC\t*\tRG:Z:a955def6\tbc:B:S,1,1\tbq:i:1\tcx:i:31\tnp:i:1\tqe:i:12283\tqs:i:10327\trq:f:0.88458\tsn:B:f,22.8448,13.8689,14.6461,14.3552\tzm:i:30422 (esc)
+
+Invalid-Args:
+
+  $ $BAM2SAM --header-only --no-header < $DATADIR/phi29.bam
+  bam2sam ERROR: conflicting arguments requested '--no-header' and '--header-only'
+  [1]
+
diff --git a/tests/src/cram/pbbamify.t.in b/tests/src/cram/pbbamify.t.in

new file mode 100644 (file)

index 0000000..c68ba43
--- /dev/null
+++ b/tests/src/cram/pbbamify.t.in
@@ -0,0 +1,147 @@
+Setup:
+
+  $ PBBAMIFY="@PacBioBAM_BinDir@/pbbamify" && export PBBAMIFY
+
+  $ DATADIR="@PacBioBAM_TestsDir@/data" && export DATADIR
+
+  $ GENERATEDDATADIR="@GeneratedDir@/" && export GENERATEDDATADIR
+
+Forward alignments with and without user-specified tags, one alignment with undefined mapq, some alignments with basic CIGAR operations, 2 alignments with hard clipping, and several invalid alignments (1 without a seq field and 1 not present in the dataset) which should be skipped:
+
+  $ $PBBAMIFY --log-file=pbbamify.log --input=$DATADIR/pbbamify/input-aligned-1.bam $DATADIR/pbbamify/synthetic-ref-1.fa $DATADIR/pbbamify/synthetic_movie_1.subreads.bam | $SAMTOOLS view -h
+  @HD\tVN:1.5\tSO:unknown\tpb:3.0.3 (esc)
+  @SQ\tSN:synthetic_ref_1\tLN:150\tM5:e1e940d621d949c9617566ddf3055922 (esc)
+  @RG\tID:8d2370c0\tPL:PACBIO\tDS:READTYPE=SUBREAD;Ipd:CodecV1=ip;PulseWidth:CodecV1=pw;BINDINGKIT=100-862-200;SEQUENCINGKIT=100-861-800;BASECALLERVERSION=5.0.0.5552;FRAMERATEHZ=80.000000\tPU:synthetic_movie_1\tPM:SEQUEL (esc)
+  @PG\tID:Handmade\tPN:Handmade (esc)
+  @PG\tID:baz2bam\tPN:baz2bam\tVN:5.0.0.5552\tCL:/opt/pacbio/ppa-5.0.0/bin/baz2bam /data/pb/synthetic_movie_1.baz -o /data/pb/synthetic_movie_1 --metadata /data/pb/.synthetic_movie_1.metadata.xml -j 12 -b 12 --progress --silent --minSubLength 50 --minSnr 3.750000 --adapters /data/pb/synthetic_movie_1.adapters.fasta (esc)
+  @PG\tID:bazFormat\tPN:bazformat\tVN:1.3.0 (esc)
+  @PG\tID:bazwriter\tPN:bazwriter\tVN:5.0.0.5552 (esc)
+  synthetic_movie_1/1/0_100\t0\tsynthetic_ref_1\t23\t60\t100=\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tqe:i:100\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1 (esc)
+  synthetic_movie_1/1/0_100\t256\tsynthetic_ref_1\t23\t254\t100=\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tqe:i:100\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1 (esc)
+  synthetic_movie_1/1/0_100\t256\tsynthetic_ref_1\t23\t60\t100=\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tZE:f:1\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tqe:i:100\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1 (esc)
+  synthetic_movie_1/1/0_100\t256\tsynthetic_ref_1\t23\t60\t100=\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tqe:i:100\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1 (esc)
+  synthetic_movie_1/1/0_100\t256\tsynthetic_ref_1\t23\t60\t100=\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tqe:i:100\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1 (esc)
+  synthetic_movie_1/2/0_101\t0\tsynthetic_ref_1\t23\t60\t8=1X21=3I1X3D13=1D1=1X1=1I9=1X32=1I7=\t*\t0\t0\tCGCTATTTTTGAAAATTTTCCGGTTTAAGGAAATTCCGTTCTTCTTCTGAATAACTTAATCTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tqe:i:101\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:2 (esc)
+  synthetic_movie_1/2/0_101\t256\tsynthetic_ref_1\t23\t60\t8=1X21=3I1X3D13=1D1=1X1=1I9=1X32=1I7=\t*\t0\t0\tCGCTATTTTTGAAAATTTTCCGGTTTAAGGAAATTCCGTTCTTCTTCTGAATAACTTAATCTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tqe:i:101\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:2 (esc)
+  synthetic_movie_1/2/0_101\t256\tsynthetic_ref_1\t30\t60\t7S1=1X21=3I1X3D13=1D1=1X1=1I9=1X32=1I4=3S\t*\t0\t0\tCGCTATTTTTGAAAATTTTCCGGTTTAAGGAAATTCCGTTCTTCTTCTGAATAACTTAATCTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tqe:i:101\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:2 (esc)
+  synthetic_movie_1/2/0_101\t256\tsynthetic_ref_1\t30\t60\t7S1=1X21=3I1X3D13=1D1=1X1=1I9=1X32=1I4=3S\t*\t0\t0\tCGCTATTTTTGAAAATTTTCCGGTTTAAGGAAATTCCGTTCTTCTTCTGAATAACTTAATCTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tqe:i:101\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:2 (esc)
+  synthetic_movie_1/3/0_100\t4\t*\t0\t255\t*\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tqe:i:100\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:3\tRG:Z:8d2370c0 (esc)
+  synthetic_movie_1/1/0_100\t256\tsynthetic_ref_1\t23\t60\t100=\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\tAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tqe:i:100\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1 (esc)
+  synthetic_movie_1/2/0_101\t256\tsynthetic_ref_1\t30\t60\t8S1X21=3I1X3D13=1D1=1X1=1I9=1X32=1I1=6S\t*\t0\t0\tCGCTATTTTTGAAAATTTTCCGGTTTAAGGAAATTCCGTTCTTCTTCTGAATAACTTAATCTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tqe:i:101\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:2 (esc)
+
+  $ grep -c "WARN" pbbamify.log
+  3
+  $ grep -c "INFO" pbbamify.log
+  1
+  $ rm pbbamify.log
+
+Reverse alignments: 2 primary alignments and 7 secondary, 6 alignments with extended CIGAR and 2 with basic CIGAR strings, 1 alignment with undefined (255) mapq, 2 alignments with hard clipping, 1 alignment with user-defined tag. All alignments have a read group assigned which differs from that of the dataset.
+  $ $PBBAMIFY --log-file=pbbamify.log --input=$DATADIR/pbbamify/input-aligned-2.bam $DATADIR/pbbamify/synthetic-ref-1.fa $DATADIR/pbbamify/synthetic_movie_2.subreads.bam | $SAMTOOLS view -h
+  @HD\tVN:1.5\tSO:unknown\tpb:3.0.3 (esc)
+  @SQ\tSN:synthetic_ref_1\tLN:150\tM5:e1e940d621d949c9617566ddf3055922 (esc)
+  @RG\tID:7a515ee0\tPL:PACBIO\tDS:READTYPE=SUBREAD;Ipd:CodecV1=ip;PulseWidth:CodecV1=pw;BINDINGKIT=100-862-200;SEQUENCINGKIT=100-861-800;BASECALLERVERSION=5.0.0.5552;FRAMERATEHZ=80.000000\tPU:synthetic_movie_2\tPM:SEQUEL (esc)
+  @PG\tID:Handmade\tPN:Handmade (esc)
+  @PG\tID:baz2bam\tPN:baz2bam\tVN:5.0.0.5552\tCL:/opt/pacbio/ppa-5.0.0/bin/baz2bam /data/pb/synthetic_movie_2.baz -o /data/pb/synthetic_movie_2 --metadata /data/pb/.synthetic_movie_2.metadata.xml -j 12 -b 12 --progress --silent --minSubLength 50 --minSnr 3.750000 --adapters /data/pb/synthetic_movie_2.adapters.fasta (esc)
+  @PG\tID:bazFormat\tPN:bazformat\tVN:1.3.0 (esc)
+  @PG\tID:bazwriter\tPN:bazwriter\tVN:5.0.0.5552 (esc)
+  synthetic_movie_2/1000001/0_100\t16\tsynthetic_ref_1\t23\t60\t100=\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:7a515ee0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tqe:i:100\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1000001 (esc)
+  synthetic_movie_2/1000001/0_100\t272\tsynthetic_ref_1\t23\t254\t100=\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:7a515ee0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tqe:i:100\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1000001 (esc)
+  synthetic_movie_2/1000001/0_100\t272\tsynthetic_ref_1\t23\t60\t100=\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tZE:f:1\tRG:Z:7a515ee0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tqe:i:100\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1000001 (esc)
+  synthetic_movie_2/1000001/0_100\t272\tsynthetic_ref_1\t23\t60\t100=\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:7a515ee0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tqe:i:100\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1000001 (esc)
+  synthetic_movie_2/1000001/0_100\t272\tsynthetic_ref_1\t23\t60\t100=\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:7a515ee0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tqe:i:100\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1000001 (esc)
+  synthetic_movie_2/1000002/0_101\t16\tsynthetic_ref_1\t23\t60\t8=1X21=3I1X3D13=1D1=1X1=1I9=1X32=1I7=\t*\t0\t0\tCGCTATTTTTGAAAATTTTCCGGTTTAAGGAAATTCCGTTCTTCTTCTGAATAACTTAATCTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:7a515ee0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tqe:i:101\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1000002 (esc)
+  synthetic_movie_2/1000002/0_101\t272\tsynthetic_ref_1\t23\t60\t8=1X21=3I1X3D13=1D1=1X1=1I9=1X32=1I7=\t*\t0\t0\tCGCTATTTTTGAAAATTTTCCGGTTTAAGGAAATTCCGTTCTTCTTCTGAATAACTTAATCTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:7a515ee0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tqe:i:101\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1000002 (esc)
+  synthetic_movie_2/1000002/0_101\t272\tsynthetic_ref_1\t30\t60\t7S1=1X21=3I1X3D13=1D1=1X1=1I9=1X32=1I4=3S\t*\t0\t0\tCGCTATTTTTGAAAATTTTCCGGTTTAAGGAAATTCCGTTCTTCTTCTGAATAACTTAATCTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:7a515ee0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tqe:i:101\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1000002 (esc)
+  synthetic_movie_2/1000002/0_101\t272\tsynthetic_ref_1\t30\t60\t7S1=1X21=3I1X3D13=1D1=1X1=1I9=1X32=1I4=3S\t*\t0\t0\tCGCTATTTTTGAAAATTTTCCGGTTTAAGGAAATTCCGTTCTTCTTCTGAATAACTTAATCTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:7a515ee0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tqe:i:101\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1000002 (esc)
+
+  $ grep -c "INFO" pbbamify.log
+  1
+  $ rm pbbamify.log
+
+CCS read:
+
+  $ $PBBAMIFY --log-file=pbbamify.log --input=$DATADIR/pbbamify/input-aligned-3.bam $DATADIR/pbbamify/synthetic-ref-1.fa $DATADIR/pbbamify/synthetic_movie_3.subreads.bam | $SAMTOOLS view -h
+  @HD\tVN:1.5\tSO:unknown\tpb:3.0.1 (esc)
+  @SQ\tSN:synthetic_ref_1\tLN:150\tM5:e1e940d621d949c9617566ddf3055922 (esc)
+  @RG\tID:67e06f58\tPL:PACBIO\tDS:READTYPE=CCS;BINDINGKIT=100-862-200;SEQUENCINGKIT=101-093-700;BASECALLERVERSION=5.0.0.5049;FRAMERATEHZ=80.000000\tPU:synthetic_movie_3\tPM:SEQUEL (esc)
+  @PG\tID:Handmade\tPN:Handmade (esc)
+  @PG\tID:ccs-3.0.0\tPN:ccs\tVN:3.0.0\tDS:Generate circular consensus sequences (ccs) from subreads.\tCL:ccs (esc)
+  synthetic_movie_3/3000001/ccs\t0\tsynthetic_ref_1\t23\t60\t100=\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\tCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC\tRG:Z:67e06f58\tnp:i:6\trq:f:0.993687\trs:B:i,8,1,0,0,0,0\tsn:B:f,6.21632,11.7596,4.35394,8.45458\tza:f:nan\tzm:i:3000001\tzs:B:f,nan,nan,nan,nan,nan,nan,-inf,nan,nan (esc)
+
+  $ grep -c "INFO" pbbamify.log
+  1
+  $ rm pbbamify.log
+
+No verbose output:
+
+  $ $PBBAMIFY --log-file=pbbamify.log --input=$DATADIR/pbbamify/input-aligned-1.bam --verbose-level=0 $DATADIR/pbbamify/synthetic-ref-1.fa $DATADIR/pbbamify/synthetic_movie_1.subreads.bam | $SAMTOOLS view -h
+  @HD\tVN:1.5\tSO:unknown\tpb:3.0.3 (esc)
+  @SQ\tSN:synthetic_ref_1\tLN:150\tM5:e1e940d621d949c9617566ddf3055922 (esc)
+  @RG\tID:8d2370c0\tPL:PACBIO\tDS:READTYPE=SUBREAD;Ipd:CodecV1=ip;PulseWidth:CodecV1=pw;BINDINGKIT=100-862-200;SEQUENCINGKIT=100-861-800;BASECALLERVERSION=5.0.0.5552;FRAMERATEHZ=80.000000\tPU:synthetic_movie_1\tPM:SEQUEL (esc)
+  @PG\tID:Handmade\tPN:Handmade (esc)
+  @PG\tID:baz2bam\tPN:baz2bam\tVN:5.0.0.5552\tCL:/opt/pacbio/ppa-5.0.0/bin/baz2bam /data/pb/synthetic_movie_1.baz -o /data/pb/synthetic_movie_1 --metadata /data/pb/.synthetic_movie_1.metadata.xml -j 12 -b 12 --progress --silent --minSubLength 50 --minSnr 3.750000 --adapters /data/pb/synthetic_movie_1.adapters.fasta (esc)
+  @PG\tID:bazFormat\tPN:bazformat\tVN:1.3.0 (esc)
+  @PG\tID:bazwriter\tPN:bazwriter\tVN:5.0.0.5552 (esc)
+  synthetic_movie_1/1/0_100\t0\tsynthetic_ref_1\t23\t60\t100=\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tqe:i:100\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1 (esc)
+  synthetic_movie_1/1/0_100\t256\tsynthetic_ref_1\t23\t254\t100=\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tqe:i:100\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1 (esc)
+  synthetic_movie_1/1/0_100\t256\tsynthetic_ref_1\t23\t60\t100=\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tZE:f:1\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tqe:i:100\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1 (esc)
+  synthetic_movie_1/1/0_100\t256\tsynthetic_ref_1\t23\t60\t100=\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tqe:i:100\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1 (esc)
+  synthetic_movie_1/1/0_100\t256\tsynthetic_ref_1\t23\t60\t100=\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tqe:i:100\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1 (esc)
+  synthetic_movie_1/2/0_101\t0\tsynthetic_ref_1\t23\t60\t8=1X21=3I1X3D13=1D1=1X1=1I9=1X32=1I7=\t*\t0\t0\tCGCTATTTTTGAAAATTTTCCGGTTTAAGGAAATTCCGTTCTTCTTCTGAATAACTTAATCTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tqe:i:101\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:2 (esc)
+  synthetic_movie_1/2/0_101\t256\tsynthetic_ref_1\t23\t60\t8=1X21=3I1X3D13=1D1=1X1=1I9=1X32=1I7=\t*\t0\t0\tCGCTATTTTTGAAAATTTTCCGGTTTAAGGAAATTCCGTTCTTCTTCTGAATAACTTAATCTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tqe:i:101\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:2 (esc)
+  synthetic_movie_1/2/0_101\t256\tsynthetic_ref_1\t30\t60\t7S1=1X21=3I1X3D13=1D1=1X1=1I9=1X32=1I4=3S\t*\t0\t0\tCGCTATTTTTGAAAATTTTCCGGTTTAAGGAAATTCCGTTCTTCTTCTGAATAACTTAATCTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tqe:i:101\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:2 (esc)
+  synthetic_movie_1/2/0_101\t256\tsynthetic_ref_1\t30\t60\t7S1=1X21=3I1X3D13=1D1=1X1=1I9=1X32=1I4=3S\t*\t0\t0\tCGCTATTTTTGAAAATTTTCCGGTTTAAGGAAATTCCGTTCTTCTTCTGAATAACTTAATCTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tqe:i:101\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:2 (esc)
+  synthetic_movie_1/3/0_100\t4\t*\t0\t255\t*\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tqe:i:100\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:3\tRG:Z:8d2370c0 (esc)
+  synthetic_movie_1/1/0_100\t256\tsynthetic_ref_1\t23\t60\t100=\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\tAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tqe:i:100\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1 (esc)
+  synthetic_movie_1/2/0_101\t256\tsynthetic_ref_1\t30\t60\t8S1X21=3I1X3D13=1D1=1X1=1I9=1X32=1I1=6S\t*\t0\t0\tCGCTATTTTTGAAAATTTTCCGGTTTAAGGAAATTCCGTTCTTCTTCTGAATAACTTAATCTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tqe:i:101\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:2 (esc)
+
+  $ grep -n "INFO"  pbbamify.log
+  [1]
+  $ rm pbbamify.log
+
+Test on a dataset; the input contains alignments from all subread sets.
+
+  $ $PBBAMIFY --log-file=pbbamify.log --input=$DATADIR/pbbamify/input-aligned-all.bam $DATADIR/pbbamify/synthetic-ref-1.fa $GENERATEDDATADIR/synthetic_movie_all.subreadset.xml | $SAMTOOLS view -h
+  @HD\tVN:1.5\tSO:unknown\tpb:3.0.3 (esc)
+  @SQ\tSN:synthetic_ref_1\tLN:150\tM5:e1e940d621d949c9617566ddf3055922 (esc)
+  @RG\tID:67e06f58\tPL:PACBIO\tDS:READTYPE=CCS;BINDINGKIT=100-862-200;SEQUENCINGKIT=101-093-700;BASECALLERVERSION=5.0.0.5049;FRAMERATEHZ=80.000000\tPU:synthetic_movie_3\tPM:SEQUEL (esc)
+  @RG\tID:7a515ee0\tPL:PACBIO\tDS:READTYPE=SUBREAD;Ipd:CodecV1=ip;PulseWidth:CodecV1=pw;BINDINGKIT=100-862-200;SEQUENCINGKIT=100-861-800;BASECALLERVERSION=5.0.0.5552;FRAMERATEHZ=80.000000\tPU:synthetic_movie_2\tPM:SEQUEL (esc)
+  @RG\tID:8d2370c0\tPL:PACBIO\tDS:READTYPE=SUBREAD;Ipd:CodecV1=ip;PulseWidth:CodecV1=pw;BINDINGKIT=100-862-200;SEQUENCINGKIT=100-861-800;BASECALLERVERSION=5.0.0.5552;FRAMERATEHZ=80.000000\tPU:synthetic_movie_1\tPM:SEQUEL (esc)
+  @PG\tID:Handmade\tPN:Handmade (esc)
+  @PG\tID:baz2bam\tPN:baz2bam\tVN:5.0.0.5552\tCL:/opt/pacbio/ppa-5.0.0/bin/baz2bam /data/pb/synthetic_movie_1.baz -o /data/pb/synthetic_movie_1 --metadata /data/pb/.synthetic_movie_1.metadata.xml -j 12 -b 12 --progress --silent --minSubLength 50 --minSnr 3.750000 --adapters /data/pb/synthetic_movie_1.adapters.fasta (esc)
+  @PG\tID:bazFormat\tPN:bazformat\tVN:1.3.0 (esc)
+  @PG\tID:bazwriter\tPN:bazwriter\tVN:5.0.0.5552 (esc)
+  @PG\tID:ccs-3.0.0\tPN:ccs\tVN:3.0.0\tDS:Generate circular consensus sequences (ccs) from subreads.\tCL:ccs (esc)
+  synthetic_movie_1/1/0_100\t0\tsynthetic_ref_1\t23\t60\t100=\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tqe:i:100\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1 (esc)
+  synthetic_movie_1/1/0_100\t256\tsynthetic_ref_1\t23\t254\t100=\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tqe:i:100\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1 (esc)
+  synthetic_movie_1/1/0_100\t256\tsynthetic_ref_1\t23\t60\t100=\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tZE:f:1\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tqe:i:100\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1 (esc)
+  synthetic_movie_1/1/0_100\t256\tsynthetic_ref_1\t23\t60\t100=\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tqe:i:100\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1 (esc)
+  synthetic_movie_1/1/0_100\t256\tsynthetic_ref_1\t23\t60\t100=\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tqe:i:100\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1 (esc)
+  synthetic_movie_1/2/0_101\t0\tsynthetic_ref_1\t23\t60\t8=1X21=3I1X3D13=1D1=1X1=1I9=1X32=1I7=\t*\t0\t0\tCGCTATTTTTGAAAATTTTCCGGTTTAAGGAAATTCCGTTCTTCTTCTGAATAACTTAATCTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tqe:i:101\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:2 (esc)
+  synthetic_movie_1/2/0_101\t256\tsynthetic_ref_1\t23\t60\t8=1X21=3I1X3D13=1D1=1X1=1I9=1X32=1I7=\t*\t0\t0\tCGCTATTTTTGAAAATTTTCCGGTTTAAGGAAATTCCGTTCTTCTTCTGAATAACTTAATCTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tqe:i:101\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:2 (esc)
+  synthetic_movie_1/2/0_101\t256\tsynthetic_ref_1\t30\t60\t7S1=1X21=3I1X3D13=1D1=1X1=1I9=1X32=1I4=3S\t*\t0\t0\tCGCTATTTTTGAAAATTTTCCGGTTTAAGGAAATTCCGTTCTTCTTCTGAATAACTTAATCTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tqe:i:101\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:2 (esc)
+  synthetic_movie_1/2/0_101\t256\tsynthetic_ref_1\t30\t60\t7S1=1X21=3I1X3D13=1D1=1X1=1I9=1X32=1I4=3S\t*\t0\t0\tCGCTATTTTTGAAAATTTTCCGGTTTAAGGAAATTCCGTTCTTCTTCTGAATAACTTAATCTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tqe:i:101\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:2 (esc)
+  synthetic_movie_1/3/0_100\t4\t*\t0\t255\t*\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tqe:i:100\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:3\tRG:Z:8d2370c0 (esc)
+  synthetic_movie_1/1/0_100\t256\tsynthetic_ref_1\t23\t60\t100=\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\tAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tqe:i:100\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1 (esc)
+  synthetic_movie_1/2/0_101\t256\tsynthetic_ref_1\t30\t60\t8S1X21=3I1X3D13=1D1=1X1=1I9=1X32=1I1=6S\t*\t0\t0\tCGCTATTTTTGAAAATTTTCCGGTTTAAGGAAATTCCGTTCTTCTTCTGAATAACTTAATCTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tqe:i:101\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:2 (esc)
+  synthetic_movie_2/1000001/0_100\t16\tsynthetic_ref_1\t23\t60\t100=\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:7a515ee0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tqe:i:100\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1000001 (esc)
+  synthetic_movie_2/1000001/0_100\t272\tsynthetic_ref_1\t23\t254\t100=\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:7a515ee0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tqe:i:100\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1000001 (esc)
+  synthetic_movie_2/1000001/0_100\t272\tsynthetic_ref_1\t23\t60\t100=\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tZE:f:1\tRG:Z:7a515ee0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tqe:i:100\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1000001 (esc)
+  synthetic_movie_2/1000001/0_100\t272\tsynthetic_ref_1\t23\t60\t100=\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:7a515ee0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tqe:i:100\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1000001 (esc)
+  synthetic_movie_2/1000001/0_100\t272\tsynthetic_ref_1\t23\t60\t100=\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:7a515ee0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tqe:i:100\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1000001 (esc)
+  synthetic_movie_2/1000002/0_101\t16\tsynthetic_ref_1\t23\t60\t8=1X21=3I1X3D13=1D1=1X1=1I9=1X32=1I7=\t*\t0\t0\tCGCTATTTTTGAAAATTTTCCGGTTTAAGGAAATTCCGTTCTTCTTCTGAATAACTTAATCTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:7a515ee0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tqe:i:101\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1000002 (esc)
+  synthetic_movie_2/1000002/0_101\t272\tsynthetic_ref_1\t23\t60\t8=1X21=3I1X3D13=1D1=1X1=1I9=1X32=1I7=\t*\t0\t0\tCGCTATTTTTGAAAATTTTCCGGTTTAAGGAAATTCCGTTCTTCTTCTGAATAACTTAATCTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:7a515ee0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tqe:i:101\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1000002 (esc)
+  synthetic_movie_2/1000002/0_101\t272\tsynthetic_ref_1\t30\t60\t7S1=1X21=3I1X3D13=1D1=1X1=1I9=1X32=1I4=3S\t*\t0\t0\tCGCTATTTTTGAAAATTTTCCGGTTTAAGGAAATTCCGTTCTTCTTCTGAATAACTTAATCTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:7a515ee0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tqe:i:101\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1000002 (esc)
+  synthetic_movie_2/1000002/0_101\t272\tsynthetic_ref_1\t30\t60\t7S1=1X21=3I1X3D13=1D1=1X1=1I9=1X32=1I4=3S\t*\t0\t0\tCGCTATTTTTGAAAATTTTCCGGTTTAAGGAAATTCCGTTCTTCTTCTGAATAACTTAATCTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:7a515ee0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tqe:i:101\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1000002 (esc)
+  synthetic_movie_3/3000001/ccs\t0\tsynthetic_ref_1\t23\t60\t100=\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\tCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC\tRG:Z:67e06f58\tnp:i:6\trq:f:0.993687\trs:B:i,8,1,0,0,0,0\tsn:B:f,6.21632,11.7596,4.35394,8.45458\tza:f:nan\tzm:i:3000001\tzs:B:f,nan,nan,nan,nan,nan,nan,-inf,nan,nan (esc)
+
+  $ grep -c "WARN" pbbamify.log
+  3
+  $ grep -c "INFO" pbbamify.log
+  1
+  $ rm pbbamify.log
+
+No-args:
+
+  $ $PBBAMIFY | grep -c "pbbamify - pbbamify converts an arbitray aligned BAM file"
+  1
diff --git a/tests/src/cram/pbindexdump_cpp.t.in b/tests/src/cram/pbindexdump_cpp.t.in

new file mode 100644 (file)

index 0000000..3ae93d9
--- /dev/null
+++ b/tests/src/cram/pbindexdump_cpp.t.in
@@ -0,0 +1,34 @@
+Setup:
+
+  $ PBINDEXDUMP="@PacBioBAM_BinDir@/pbindexdump" && export PBINDEXDUMP
+
+  $ DATADIR="@PacBioBAM_TestsDir@/data" && export DATADIR
+
+Normal C++:
+
+  $ $PBINDEXDUMP --format=cpp $DATADIR/polymerase/production_hq.hqregion.bam.pbi
+  PbiRawData rawData;
+  rawData.Version(PbiFile::Version_3_0_1);
+  rawData.FileSections(PbiFile::BASIC);
+  rawData.NumReads(1);
+  
+  PbiRawBasicData& basicData = rawData.BasicData();
+  basicData.rgId_       = {-898246524};
+  basicData.qStart_     = {2659};
+  basicData.qEnd_       = {7034};
+  basicData.holeNumber_ = {0};
+  basicData.readQual_   = {0.01};
+  basicData.ctxtFlag_   = {0};
+  basicData.fileOffset_ = {20054016};
+
+--(leave the blank lines above this)--
+
+Request C++, with JSON options (stdout includes usage/help, so we just want to check stderr):
+
+  $ $PBINDEXDUMP --format=cpp --json-indent-level=2 $DATADIR/polymerase/production_hq.hqregion.bam.pbi > /dev/null
+  pbindexdump ERROR: JSON formatting options are not valid on non-JSON output
+  [1]
+
+  $ $PBINDEXDUMP --format=cpp --json-raw $DATADIR/polymerase/production_hq.hqregion.bam.pbi > /dev/null
+  pbindexdump ERROR: JSON formatting options are not valid on non-JSON output
+  [1]
diff --git a/tests/src/cram/pbindexdump_json.t.in b/tests/src/cram/pbindexdump_json.t.in

new file mode 100644 (file)

index 0000000..0bccdee
--- /dev/null
+++ b/tests/src/cram/pbindexdump_json.t.in
@@ -0,0 +1,83 @@
+Setup:
+
+  $ PBINDEXDUMP="@PacBioBAM_BinDir@/pbindexdump" && export PBINDEXDUMP
+
+  $ DATADIR="@PacBioBAM_TestsDir@/data" && export DATADIR
+
+Default settings (JSON):
+
+  $ $PBINDEXDUMP $DATADIR/polymerase/production_hq.hqregion.bam.pbi
+  {
+      "fileSections": [
+          "BasicData"
+      ],
+      "numReads": 1,
+      "reads": [
+          {
+              "contextFlag": 0,
+              "fileOffset": 20054016,
+              "holeNumber": 0,
+              "qEnd": 7034,
+              "qStart": 2659,
+              "readQuality": 0.009999999776482582,
+              "rgId": -898246524
+          }
+      ],
+      "version": "3.0.1"
+  }
+
+JSON indent level(2):
+
+  $ $PBINDEXDUMP --json-indent-level=2 $DATADIR/polymerase/production_hq.hqregion.bam.pbi
+  {
+    "fileSections": [
+      "BasicData"
+    ],
+    "numReads": 1,
+    "reads": [
+      {
+        "contextFlag": 0,
+        "fileOffset": 20054016,
+        "holeNumber": 0,
+        "qEnd": 7034,
+        "qStart": 2659,
+        "readQuality": 0.009999999776482582,
+        "rgId": -898246524
+      }
+    ],
+    "version": "3.0.1"
+  }
+
+JSON raw:
+
+  $ $PBINDEXDUMP --json-raw $DATADIR/polymerase/production_hq.hqregion.bam.pbi
+  {
+      "basicData": {
+          "ctxtFlag": [
+              0
+          ],
+          "fileOffset": [
+              20054016
+          ],
+          "holeNumber": [
+              0
+          ],
+          "qEnd": [
+              7034
+          ],
+          "qStart": [
+              2659
+          ],
+          "readQual": [
+              0.009999999776482582
+          ],
+          "rgId": [
+              -898246524
+          ]
+      },
+      "fileSections": [
+          "BasicData"
+      ],
+      "numReads": 1,
+      "version": "3.0.1"
+  }
diff --git a/tests/src/cram/pbmerge_aligned_ordering.t.in b/tests/src/cram/pbmerge_aligned_ordering.t.in

new file mode 100644 (file)

index 0000000..58171bb
--- /dev/null
+++ b/tests/src/cram/pbmerge_aligned_ordering.t.in
@@ -0,0 +1,197 @@
+Setup:
+
+  $ TOOLS_BIN="@PacBioBAM_BinDir@" && export TOOLS_BIN
+  $ PBMERGE="$TOOLS_BIN/pbmerge" && export PBMERGE
+  $ BAM2SAM="$TOOLS_BIN/bam2sam" && export BAM2SAM
+
+  $ DATADIR="@PacBioBAM_TestsDir@/data" && export DATADIR
+  $ INPUT_1="$DATADIR/dataset/bam_mapping_1.bam" && export INPUT_1
+  $ INPUT_2="$DATADIR/dataset/bam_mapping_2.bam" && export INPUT_2
+
+  $ MERGED_BAM="@GeneratedTestDataDir@/aligned_ordering_merged.bam" && export MERGED_BAM
+  $ MERGED_BAM_PBI="@GeneratedTestDataDir@/aligned_ordering_merged.bam.pbi" && export MERGED_BAM_PBI
+
+Sanity Check:
+
+  $ $BAM2SAM --header-only $INPUT_1
+  @HD\tVN:1.3.1\tSO:coordinate\tpb:3.0.1 (esc)
+  @SQ\tSN:lambda_NEB3011\tLN:48502\tM5:a1319ff90e994c8190a4fe6569d0822a (esc)
+  @RG\tID:a9a22406c5\tDS:READTYPE=SUBREAD;BINDINGKIT=100356300;SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3;InsertionQV=iq;DeletionQV=dq;SubstitutionQV=sq;MergeQV=mq;SubstitutionTag=st;DeletionTag=dt\tPL:PACBIO\tPU:m140905_042212_sidney_c100564852550000001823085912221377_s1_X0\tSM:c100564852550000001823085912221377 (esc)
+  @PG\tID:BLASR\tVN:1.3.1.141565\tCL:/home/UNIXHOME/yli/for_the_people/blasr_bam_out/blasr m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.1.bax.h5 lambdaNEB.fa -out tmp.bam -bam -bestn 10 -minMatch 12 -nproc 8 -minSubreadLength 50 -minReadLength 50 -randomSeed 1 -clipping subread  (esc)
+
+  $ $BAM2SAM --no-header $INPUT_1 | cut -f 1,3,4 | head -n 10
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/49050/48_1132\tlambda_NEB3011\t1 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/32328/0_344\tlambda_NEB3011\t676 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/9936_10187\tlambda_NEB3011\t2171 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/10232_10394\tlambda_NEB3011\t2204 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7468_8906\tlambda_NEB3011\t3573 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/5557_7235\tlambda_NEB3011\t4507 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/7285_8657\tlambda_NEB3011\t4508 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/426_1045\tlambda_NEB3011\t4593 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7064_7421\tlambda_NEB3011\t4670 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/0_382\tlambda_NEB3011\t4843 (esc)
+
+  $ $BAM2SAM --header-only $INPUT_2
+  @HD\tVN:1.3.1\tSO:coordinate\tpb:3.0.1 (esc)
+  @SQ\tSN:lambda_NEB3011\tLN:48502\tM5:a1319ff90e994c8190a4fe6569d0822a (esc)
+  @RG\tID:a9a22406c5\tDS:READTYPE=SUBREAD;BINDINGKIT=100356300;SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3;InsertionQV=iq;DeletionQV=dq;SubstitutionQV=sq;MergeQV=mq;SubstitutionTag=st;DeletionTag=dt\tPL:PACBIO\tPU:m140905_042212_sidney_c100564852550000001823085912221377_s1_X0\tSM:c100564852550000001823085912221377 (esc)
+  @PG\tID:BLASR\tVN:1.3.1.141565\tCL:/home/UNIXHOME/yli/for_the_people/blasr_bam_out/blasr m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.1.bax.h5 lambdaNEB.fa -out tmp.bam -bam -bestn 10 -minMatch 12 -nproc 8 -minSubreadLength 50 -minReadLength 50 -randomSeed 1 -clipping subread  (esc)
+
+  $ $BAM2SAM --no-header $INPUT_2 | cut -f 1,3,4 | head -n 10
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/32328/387_1134\tlambda_NEB3011\t303 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/9936_10187\tlambda_NEB3011\t2171 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/10232_10394\tlambda_NEB3011\t2204 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7468_8906\tlambda_NEB3011\t3573 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/5557_7235\tlambda_NEB3011\t4507 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/7285_8657\tlambda_NEB3011\t4508 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/426_1045\tlambda_NEB3011\t4593 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7064_7421\tlambda_NEB3011\t4670 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/0_382\tlambda_NEB3011\t4843 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/7247/7338_7831\tlambda_NEB3011\t4904 (esc)
+
+Normal Merge:
+
+  $ $PBMERGE $INPUT_1 $INPUT_2 > $MERGED_BAM
+
+  $ $BAM2SAM --header-only $MERGED_BAM
+  @HD\tVN:1.3.1\tSO:coordinate\tpb:3.0.1 (esc)
+  @SQ\tSN:lambda_NEB3011\tLN:48502\tM5:a1319ff90e994c8190a4fe6569d0822a (esc)
+  @RG\tID:a9a22406c5\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;BINDINGKIT=100356300;SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3\tPU:m140905_042212_sidney_c100564852550000001823085912221377_s1_X0\tSM:c100564852550000001823085912221377\tPM:SEQUEL (esc)
+  @PG\tID:BLASR\tVN:1.3.1.141565\tCL:/home/UNIXHOME/yli/for_the_people/blasr_bam_out/blasr m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.1.bax.h5 lambdaNEB.fa -out tmp.bam -bam -bestn 10 -minMatch 12 -nproc 8 -minSubreadLength 50 -minReadLength 50 -randomSeed 1 -clipping subread  (esc)
+  @PG\tID:pbmerge-@PacBioBAM_VERSION@\tPN:pbmerge\tVN:@PacBioBAM_VERSION@ (esc)
+
+  $ $BAM2SAM --no-header $MERGED_BAM | cut -f 1,3,4 | head -n 20
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/49050/48_1132\tlambda_NEB3011\t1 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/32328/387_1134\tlambda_NEB3011\t303 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/32328/0_344\tlambda_NEB3011\t676 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/9936_10187\tlambda_NEB3011\t2171 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/9936_10187\tlambda_NEB3011\t2171 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/10232_10394\tlambda_NEB3011\t2204 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/10232_10394\tlambda_NEB3011\t2204 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7468_8906\tlambda_NEB3011\t3573 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7468_8906\tlambda_NEB3011\t3573 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/5557_7235\tlambda_NEB3011\t4507 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/5557_7235\tlambda_NEB3011\t4507 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/7285_8657\tlambda_NEB3011\t4508 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/7285_8657\tlambda_NEB3011\t4508 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/426_1045\tlambda_NEB3011\t4593 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/426_1045\tlambda_NEB3011\t4593 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7064_7421\tlambda_NEB3011\t4670 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7064_7421\tlambda_NEB3011\t4670 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/0_382\tlambda_NEB3011\t4843 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/0_382\tlambda_NEB3011\t4843 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/7247/7338_7831\tlambda_NEB3011\t4904 (esc)
+
+  $ rm $MERGED_BAM
+
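+Shuffle Input (NOTE: the command below passes INPUT_2 twice, not INPUT_2 followed by INPUT_1; the expected output shows each record only once):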
+
+  $ $PBMERGE $INPUT_2 $INPUT_2 > $MERGED_BAM
+
+  $ $BAM2SAM --header-only $MERGED_BAM
+  @HD\tVN:1.3.1\tSO:coordinate\tpb:3.0.1 (esc)
+  @SQ\tSN:lambda_NEB3011\tLN:48502\tM5:a1319ff90e994c8190a4fe6569d0822a (esc)
+  @RG\tID:a9a22406c5\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;BINDINGKIT=100356300;SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3\tPU:m140905_042212_sidney_c100564852550000001823085912221377_s1_X0\tSM:c100564852550000001823085912221377\tPM:SEQUEL (esc)
+  @PG\tID:BLASR\tVN:1.3.1.141565\tCL:/home/UNIXHOME/yli/for_the_people/blasr_bam_out/blasr m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.1.bax.h5 lambdaNEB.fa -out tmp.bam -bam -bestn 10 -minMatch 12 -nproc 8 -minSubreadLength 50 -minReadLength 50 -randomSeed 1 -clipping subread  (esc)
+  @PG\tID:pbmerge-@PacBioBAM_VERSION@\tPN:pbmerge\tVN:@PacBioBAM_VERSION@ (esc)
+
+  $ $BAM2SAM --no-header $MERGED_BAM | cut -f 1,3,4 | head -n 20
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/32328/387_1134\tlambda_NEB3011\t303 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/9936_10187\tlambda_NEB3011\t2171 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/10232_10394\tlambda_NEB3011\t2204 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7468_8906\tlambda_NEB3011\t3573 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/5557_7235\tlambda_NEB3011\t4507 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/7285_8657\tlambda_NEB3011\t4508 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/426_1045\tlambda_NEB3011\t4593 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7064_7421\tlambda_NEB3011\t4670 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/0_382\tlambda_NEB3011\t4843 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/7247/7338_7831\tlambda_NEB3011\t4904 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/7247/7046_7293\tlambda_NEB3011\t5136 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/38025/6255_7894\tlambda_NEB3011\t5427 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/5311_5508\tlambda_NEB3011\t5943 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/36363/899_1197\tlambda_NEB3011\t6258 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/36363/605_853\tlambda_NEB3011\t6312 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/31174/0_1029\tlambda_NEB3011\t6487 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/31174/1075_1271\tlambda_NEB3011\t6499 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/38025/5743_6211\tlambda_NEB3011\t6606 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/50257/6944_7361\tlambda_NEB3011\t6942 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/50257/6546_6903\tlambda_NEB3011\t7010 (esc)
+
+  $ rm $MERGED_BAM
+
+Explicit Output Filename (also enables PBI):
+
+  $ $PBMERGE -o $MERGED_BAM $INPUT_1 $INPUT_2
+
+  $ $BAM2SAM --header-only $MERGED_BAM
+  @HD\tVN:1.3.1\tSO:coordinate\tpb:3.0.1 (esc)
+  @SQ\tSN:lambda_NEB3011\tLN:48502\tM5:a1319ff90e994c8190a4fe6569d0822a (esc)
+  @RG\tID:a9a22406c5\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;BINDINGKIT=100356300;SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3\tPU:m140905_042212_sidney_c100564852550000001823085912221377_s1_X0\tSM:c100564852550000001823085912221377\tPM:SEQUEL (esc)
+  @PG\tID:BLASR\tVN:1.3.1.141565\tCL:/home/UNIXHOME/yli/for_the_people/blasr_bam_out/blasr m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.1.bax.h5 lambdaNEB.fa -out tmp.bam -bam -bestn 10 -minMatch 12 -nproc 8 -minSubreadLength 50 -minReadLength 50 -randomSeed 1 -clipping subread  (esc)
+  @PG\tID:pbmerge-@PacBioBAM_VERSION@\tPN:pbmerge\tVN:@PacBioBAM_VERSION@ (esc)
+
+  $ $BAM2SAM --no-header $MERGED_BAM | cut -f 1,3,4 | head -n 20
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/49050/48_1132\tlambda_NEB3011\t1 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/32328/387_1134\tlambda_NEB3011\t303 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/32328/0_344\tlambda_NEB3011\t676 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/9936_10187\tlambda_NEB3011\t2171 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/9936_10187\tlambda_NEB3011\t2171 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/10232_10394\tlambda_NEB3011\t2204 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/10232_10394\tlambda_NEB3011\t2204 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7468_8906\tlambda_NEB3011\t3573 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7468_8906\tlambda_NEB3011\t3573 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/5557_7235\tlambda_NEB3011\t4507 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/5557_7235\tlambda_NEB3011\t4507 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/7285_8657\tlambda_NEB3011\t4508 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/7285_8657\tlambda_NEB3011\t4508 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/426_1045\tlambda_NEB3011\t4593 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/426_1045\tlambda_NEB3011\t4593 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7064_7421\tlambda_NEB3011\t4670 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7064_7421\tlambda_NEB3011\t4670 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/0_382\tlambda_NEB3011\t4843 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/0_382\tlambda_NEB3011\t4843 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/7247/7338_7831\tlambda_NEB3011\t4904 (esc)
+
+  $ [ -f $MERGED_BAM_PBI ] && echo "Found" || echo "Not found"
+  Found
+
+  $ rm $MERGED_BAM
+  $ rm $MERGED_BAM_PBI
+
+Explicit Output Filename (with disabled PBI):
+
+  $ $PBMERGE -o $MERGED_BAM --no-pbi $INPUT_1 $INPUT_2
+
+  $ $BAM2SAM --header-only $MERGED_BAM
+  @HD\tVN:1.3.1\tSO:coordinate\tpb:3.0.1 (esc)
+  @SQ\tSN:lambda_NEB3011\tLN:48502\tM5:a1319ff90e994c8190a4fe6569d0822a (esc)
+  @RG\tID:a9a22406c5\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;BINDINGKIT=100356300;SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3\tPU:m140905_042212_sidney_c100564852550000001823085912221377_s1_X0\tSM:c100564852550000001823085912221377\tPM:SEQUEL (esc)
+  @PG\tID:BLASR\tVN:1.3.1.141565\tCL:/home/UNIXHOME/yli/for_the_people/blasr_bam_out/blasr m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.1.bax.h5 lambdaNEB.fa -out tmp.bam -bam -bestn 10 -minMatch 12 -nproc 8 -minSubreadLength 50 -minReadLength 50 -randomSeed 1 -clipping subread  (esc)
+  @PG\tID:pbmerge-@PacBioBAM_VERSION@\tPN:pbmerge\tVN:@PacBioBAM_VERSION@ (esc)
+
+  $ $BAM2SAM --no-header $MERGED_BAM | cut -f 1,3,4 | head -n 20
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/49050/48_1132\tlambda_NEB3011\t1 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/32328/387_1134\tlambda_NEB3011\t303 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/32328/0_344\tlambda_NEB3011\t676 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/9936_10187\tlambda_NEB3011\t2171 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/9936_10187\tlambda_NEB3011\t2171 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/10232_10394\tlambda_NEB3011\t2204 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/10232_10394\tlambda_NEB3011\t2204 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7468_8906\tlambda_NEB3011\t3573 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7468_8906\tlambda_NEB3011\t3573 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/5557_7235\tlambda_NEB3011\t4507 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/5557_7235\tlambda_NEB3011\t4507 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/7285_8657\tlambda_NEB3011\t4508 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/7285_8657\tlambda_NEB3011\t4508 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/426_1045\tlambda_NEB3011\t4593 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/426_1045\tlambda_NEB3011\t4593 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7064_7421\tlambda_NEB3011\t4670 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7064_7421\tlambda_NEB3011\t4670 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/0_382\tlambda_NEB3011\t4843 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/0_382\tlambda_NEB3011\t4843 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/7247/7338_7831\tlambda_NEB3011\t4904 (esc)
+
+  $ [ -f $MERGED_BAM_PBI ] && echo "Found" || echo "Not found"
+  Not found
+
+  $ rm $MERGED_BAM
diff --git a/tests/src/cram/pbmerge_dataset.t.in b/tests/src/cram/pbmerge_dataset.t.in

new file mode 100644 (file)

index 0000000..1c7cb7a
--- /dev/null
+++ b/tests/src/cram/pbmerge_dataset.t.in
@@ -0,0 +1,144 @@
+Setup:
+
+  $ TOOLS_BIN="@PacBioBAM_BinDir@" && export TOOLS_BIN
+  $ PBMERGE="$TOOLS_BIN/pbmerge" && export PBMERGE
+  $ BAM2SAM="$TOOLS_BIN/bam2sam" && export BAM2SAM
+
+  $ DATADIR="@PacBioBAM_TestsDir@/data" && export DATADIR
+  $ INPUT_XML="$DATADIR/polymerase/consolidate.subread.dataset.xml" && export INPUT_XML
+  $ BAM_1="$DATADIR/polymerase/production.subreads.bam" && export BAM_1
+  $ BAM_2="$DATADIR/polymerase/production.scraps.bam" && export BAM_2
+
+  $ MERGED_BAM="@GeneratedTestDataDir@/merged.bam" && export MERGED_BAM
+  $ MERGED_BAM_PBI="@GeneratedTestDataDir@/merged.bam.pbi" && export MERGED_BAM_PBI
+
+Sanity Check:
+
+  $ $BAM2SAM --no-header $BAM_1 | cut -f 1
+  ArminsFakeMovie/0/2659_3025
+  ArminsFakeMovie/0/3116_3628
+  ArminsFakeMovie/0/3722_4267
+  ArminsFakeMovie/0/4356_4864
+  ArminsFakeMovie/0/4960_5477
+  ArminsFakeMovie/0/5571_6087
+  ArminsFakeMovie/0/6199_6719
+  ArminsFakeMovie/0/6812_7034
+
+  $ $BAM2SAM --no-header $BAM_2  | cut -f 1
+  ArminsFakeMovie/0/0_2659
+  ArminsFakeMovie/0/3025_3047
+  ArminsFakeMovie/0/3047_3095
+  ArminsFakeMovie/0/3095_3116
+  ArminsFakeMovie/0/3628_3650
+  ArminsFakeMovie/0/3650_3700
+  ArminsFakeMovie/0/3700_3722
+  ArminsFakeMovie/0/4267_4289
+  ArminsFakeMovie/0/4289_4335
+  ArminsFakeMovie/0/4335_4356
+  ArminsFakeMovie/0/4864_4888
+  ArminsFakeMovie/0/4888_4939
+  ArminsFakeMovie/0/4939_4960
+  ArminsFakeMovie/0/5477_5498
+  ArminsFakeMovie/0/5498_5546
+  ArminsFakeMovie/0/5546_5571
+  ArminsFakeMovie/0/6087_6116
+  ArminsFakeMovie/0/6116_6173
+  ArminsFakeMovie/0/6173_6199
+  ArminsFakeMovie/0/6719_6740
+  ArminsFakeMovie/0/6740_6790
+  ArminsFakeMovie/0/6790_6812
+  ArminsFakeMovie/0/7034_7035
+
+Normal Merge from XML:
+
+  $ $PBMERGE -o $MERGED_BAM $INPUT_XML
+
+  $ [ -f $MERGED_BAM ] && echo "Found" || echo "Not found"
+  Found
+
+  $ [ -f $MERGED_BAM_PBI ] && echo "Found" || echo "Not found"
+  Found
+
+  $ $BAM2SAM --header-only $MERGED_BAM
+  @HD\tVN:1.1\tSO:unknown\tpb:3.0.1 (esc)
+  @RG\tID:8aaede36\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:CodecV1=ip;BINDINGKIT=FakeBindKit;SEQUENCINGKIT=FakeSeqKit;BASECALLERVERSION=0.2.0;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\tPM:SEQUEL (esc)
+  @RG\tID:e83fc9c6\tPL:PACBIO\tDS:READTYPE=SCRAP;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:CodecV1=ip;BINDINGKIT=FakeBindKit;SEQUENCINGKIT=FakeSeqKit;BASECALLERVERSION=0.2.0;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\tPM:SEQUEL (esc)
+  @PG\tID:BAZ_FORMAT\tVN:0.3.0 (esc)
+  @PG\tID:PPA-BAZ2BAM\tVN:0.1.0 (esc)
+  @PG\tID:PPA-BAZWRITER\tVN:0.2.0 (esc)
+  @PG\tID:pbmerge-@PacBioBAM_VERSION@\tPN:pbmerge\tVN:@PacBioBAM_VERSION@ (esc)
+
+  $ $BAM2SAM --no-header $MERGED_BAM | cut -f 1
+  ArminsFakeMovie/0/4267_4289
+  ArminsFakeMovie/0/4289_4335
+  ArminsFakeMovie/0/4335_4356
+  ArminsFakeMovie/0/4356_4864
+  ArminsFakeMovie/0/4864_4888
+  ArminsFakeMovie/0/4888_4939
+  ArminsFakeMovie/0/4939_4960
+  ArminsFakeMovie/0/4960_5477
+
+  $ rm $MERGED_BAM
+  $ rm $MERGED_BAM_PBI
+
+Normal Merge from XML (disabled PBI):
+
+  $ $PBMERGE --no-pbi -o $MERGED_BAM $INPUT_XML
+
+  $ [ -f $MERGED_BAM ] && echo "Found" || echo "Not found"
+  Found
+
+  $ [ -f $MERGED_BAM_PBI ] && echo "Found" || echo "Not found"
+  Not found
+
+  $ $BAM2SAM --header-only $MERGED_BAM
+  @HD\tVN:1.1\tSO:unknown\tpb:3.0.1 (esc)
+  @RG\tID:8aaede36\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:CodecV1=ip;BINDINGKIT=FakeBindKit;SEQUENCINGKIT=FakeSeqKit;BASECALLERVERSION=0.2.0;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\tPM:SEQUEL (esc)
+  @RG\tID:e83fc9c6\tPL:PACBIO\tDS:READTYPE=SCRAP;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:CodecV1=ip;BINDINGKIT=FakeBindKit;SEQUENCINGKIT=FakeSeqKit;BASECALLERVERSION=0.2.0;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\tPM:SEQUEL (esc)
+  @PG\tID:BAZ_FORMAT\tVN:0.3.0 (esc)
+  @PG\tID:PPA-BAZ2BAM\tVN:0.1.0 (esc)
+  @PG\tID:PPA-BAZWRITER\tVN:0.2.0 (esc)
+  @PG\tID:pbmerge-@PacBioBAM_VERSION@\tPN:pbmerge\tVN:@PacBioBAM_VERSION@ (esc)
+
+  $ $BAM2SAM --no-header $MERGED_BAM | cut -f 1
+  ArminsFakeMovie/0/4267_4289
+  ArminsFakeMovie/0/4289_4335
+  ArminsFakeMovie/0/4335_4356
+  ArminsFakeMovie/0/4356_4864
+  ArminsFakeMovie/0/4864_4888
+  ArminsFakeMovie/0/4888_4939
+  ArminsFakeMovie/0/4939_4960
+  ArminsFakeMovie/0/4960_5477
+
+  $ rm $MERGED_BAM
+
+Write to stdout:
+
+  $ $PBMERGE --no-pbi $INPUT_XML > $MERGED_BAM
+
+  $ [ -f $MERGED_BAM ] && echo "Found" || echo "Not found"
+  Found
+
+  $ [ -f $MERGED_BAM_PBI ] && echo "Found" || echo "Not found"
+  Not found
+
+  $ $BAM2SAM --header-only $MERGED_BAM
+  @HD\tVN:1.1\tSO:unknown\tpb:3.0.1 (esc)
+  @RG\tID:8aaede36\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:CodecV1=ip;BINDINGKIT=FakeBindKit;SEQUENCINGKIT=FakeSeqKit;BASECALLERVERSION=0.2.0;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\tPM:SEQUEL (esc)
+  @RG\tID:e83fc9c6\tPL:PACBIO\tDS:READTYPE=SCRAP;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:CodecV1=ip;BINDINGKIT=FakeBindKit;SEQUENCINGKIT=FakeSeqKit;BASECALLERVERSION=0.2.0;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\tPM:SEQUEL (esc)
+  @PG\tID:BAZ_FORMAT\tVN:0.3.0 (esc)
+  @PG\tID:PPA-BAZ2BAM\tVN:0.1.0 (esc)
+  @PG\tID:PPA-BAZWRITER\tVN:0.2.0 (esc)
+  @PG\tID:pbmerge-@PacBioBAM_VERSION@\tPN:pbmerge\tVN:@PacBioBAM_VERSION@ (esc)
+
+  $ $BAM2SAM --no-header $MERGED_BAM | cut -f 1
+  ArminsFakeMovie/0/4267_4289
+  ArminsFakeMovie/0/4289_4335
+  ArminsFakeMovie/0/4335_4356
+  ArminsFakeMovie/0/4356_4864
+  ArminsFakeMovie/0/4864_4888
+  ArminsFakeMovie/0/4888_4939
+  ArminsFakeMovie/0/4939_4960
+  ArminsFakeMovie/0/4960_5477
+
+  $ rm $MERGED_BAM
diff --git a/tests/src/cram/pbmerge_fofn.t.in b/tests/src/cram/pbmerge_fofn.t.in

new file mode 100644 (file)

index 0000000..34e9af6
--- /dev/null
+++ b/tests/src/cram/pbmerge_fofn.t.in
@@ -0,0 +1,120 @@
+Setup:
+
+  $ TOOLS_BIN="@PacBioBAM_BinDir@" && export TOOLS_BIN
+  $ PBMERGE="$TOOLS_BIN/pbmerge" && export PBMERGE
+  $ BAM2SAM="$TOOLS_BIN/bam2sam" && export BAM2SAM
+
+  $ DATADIR="@PacBioBAM_TestsDir@/data" && export DATADIR
+  $ INPUT_FOFN="$DATADIR/merge.fofn" && export INPUT_FOFN
+  $ INPUT_1="$DATADIR/aligned.bam" && export INPUT_1
+  $ INPUT_2="$DATADIR/aligned2.bam" && export INPUT_2
+
+  $ MERGED_BAM="@GeneratedTestDataDir@/aligned_ordering_merged.bam" && export MERGED_BAM
+  $ MERGED_BAM_PBI="@GeneratedTestDataDir@/aligned_ordering_merged.bam.pbi" && export MERGED_BAM_PBI
+
+Sanity Check:
+
+  $ $BAM2SAM --header-only $INPUT_1
+  @HD\tVN:1.3.1\tSO:coordinate\tpb:3.0.3 (esc)
+  @SQ\tSN:lambda_NEB3011\tLN:48502\tM5:a1319ff90e994c8190a4fe6569d0822a (esc)
+  @RG\tID:0d7b28fa\tPL:PACBIO\tDS:READTYPE=SUBREAD;BINDINGKIT=100-619-300;SEQUENCINGKIT=100-619-400;BASECALLERVERSION=3.0;FRAMERATEHZ=100\tPU:singleInsertion\tPM:SEQUEL (esc)
+  @PG\tID:bwa\tPN:bwa\tVN:0.7.10-r1017-dirty\tCL:bwa mem lambdaNEB.fa singleInsertion.fasta (esc)
+
+  $ $BAM2SAM --no-header $INPUT_1 | cut -f 1,3,4 | head -n 10
+  singleInsertion/100/0_49\tlambda_NEB3011\t5211 (esc)
+  singleInsertion/200/0_49\tlambda_NEB3011\t5211 (esc)
+  singleInsertion/100/0_111\tlambda_NEB3011\t9378 (esc)
+  singleInsertion/100/0_111\tlambda_NEB3011\t9378 (esc)
+
+  $ $BAM2SAM --header-only $INPUT_2
+  @HD\tVN:1.3.1\tSO:coordinate\tpb:3.0.3 (esc)
+  @SQ\tSN:lambda_NEB3011\tLN:48502\tM5:a1319ff90e994c8190a4fe6569d0822a (esc)
+  @RG\tID:b89a4406\tDS:READTYPE=SUBREAD;BINDINGKIT=100356300;SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3;FRAMERATEHZ=100;InsertionQV=iq;DeletionQV=dq;SubstitutionQV=sq;MergeQV=mq;SubstitutionTag=st;DeletionTag=dt\tPL:PACBIO\tPU:m140905_042212_sidney_c100564852550000001823085912221377_s1_X0\tSM:c100564852550000001823085912221377\tPM:SEQUEL (esc)
+  @PG\tID:BLASR\tVN:1.3.1.141565\tCL:/home/UNIXHOME/yli/for_the_people/blasr_bam_out/blasr m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.1.bax.h5 lambdaNEB.fa -out tmp.bam -bam -bestn 10 -minMatch 12 -nproc 8 -minSubreadLength 50 -minReadLength 50 -randomSeed 1 -clipping subread  (esc)
+
+  $ $BAM2SAM --no-header $INPUT_2 | cut -f 1,3,4 | head -n 10
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/49050/48_1132\tlambda_NEB3011\t1 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/32328/387_1134\tlambda_NEB3011\t303 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/32328/0_344\tlambda_NEB3011\t676 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/9936_10187\tlambda_NEB3011\t2171 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/10232_10394\tlambda_NEB3011\t2204 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7468_8906\tlambda_NEB3011\t3573 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/5557_7235\tlambda_NEB3011\t4507 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/7285_8657\tlambda_NEB3011\t4508 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/426_1045\tlambda_NEB3011\t4593 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7064_7421\tlambda_NEB3011\t4670 (esc)
+
+Normal Merge from FOFN:
+
+  $ $PBMERGE -o $MERGED_BAM $INPUT_FOFN
+
+  $ [ -f $MERGED_BAM ] && echo "Found" || echo "Not found"
+  Found
+
+  $ [ -f $MERGED_BAM_PBI ] && echo "Found" || echo "Not found"
+  Found
+
+  $ $BAM2SAM --header-only $MERGED_BAM
+  @HD\tVN:1.3.1\tSO:coordinate\tpb:3.0.3 (esc)
+  @SQ\tSN:lambda_NEB3011\tLN:48502\tM5:a1319ff90e994c8190a4fe6569d0822a (esc)
+  @RG\tID:0d7b28fa\tPL:PACBIO\tDS:READTYPE=SUBREAD;BINDINGKIT=100-619-300;SEQUENCINGKIT=100-619-400;BASECALLERVERSION=3.0;FRAMERATEHZ=100\tPU:singleInsertion\tPM:SEQUEL (esc)
+  @RG\tID:b89a4406\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;BINDINGKIT=100356300;SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3;FRAMERATEHZ=100\tPU:m140905_042212_sidney_c100564852550000001823085912221377_s1_X0\tSM:c100564852550000001823085912221377\tPM:SEQUEL (esc)
+  @PG\tID:BLASR\tVN:1.3.1.141565\tCL:/home/UNIXHOME/yli/for_the_people/blasr_bam_out/blasr m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.1.bax.h5 lambdaNEB.fa -out tmp.bam -bam -bestn 10 -minMatch 12 -nproc 8 -minSubreadLength 50 -minReadLength 50 -randomSeed 1 -clipping subread  (esc)
+  @PG\tID:bwa\tPN:bwa\tVN:0.7.10-r1017-dirty\tCL:bwa mem lambdaNEB.fa singleInsertion.fasta (esc)
+  @PG\tID:pbmerge-@PacBioBAM_VERSION@\tPN:pbmerge\tVN:@PacBioBAM_VERSION@ (esc)
+
+  $ $BAM2SAM --no-header $MERGED_BAM | cut -f 1,3,4 | head -n 20
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/49050/48_1132\tlambda_NEB3011\t1 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/32328/387_1134\tlambda_NEB3011\t303 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/32328/0_344\tlambda_NEB3011\t676 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/9936_10187\tlambda_NEB3011\t2171 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/10232_10394\tlambda_NEB3011\t2204 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7468_8906\tlambda_NEB3011\t3573 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/5557_7235\tlambda_NEB3011\t4507 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/7285_8657\tlambda_NEB3011\t4508 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/426_1045\tlambda_NEB3011\t4593 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7064_7421\tlambda_NEB3011\t4670 (esc)
+  singleInsertion/100/0_49\tlambda_NEB3011\t5211 (esc)
+  singleInsertion/200/0_49\tlambda_NEB3011\t5211 (esc)
+  singleInsertion/100/0_111\tlambda_NEB3011\t9378 (esc)
+  singleInsertion/100/0_111\tlambda_NEB3011\t9378 (esc)
+
+  $ rm $MERGED_BAM
+  $ rm $MERGED_BAM_PBI
+
+Normal Merge from FOFN (disabled PBI):
+
+  $ $PBMERGE --no-pbi -o $MERGED_BAM $INPUT_FOFN
+
+  $ [ -f $MERGED_BAM ] && echo "Found" || echo "Not found"
+  Found
+
+  $ [ -f $MERGED_BAM_PBI ] && echo "Found" || echo "Not found"
+  Not found
+
+  $ $BAM2SAM --header-only $MERGED_BAM
+  @HD\tVN:1.3.1\tSO:coordinate\tpb:3.0.3 (esc)
+  @SQ\tSN:lambda_NEB3011\tLN:48502\tM5:a1319ff90e994c8190a4fe6569d0822a (esc)
+  @RG\tID:0d7b28fa\tPL:PACBIO\tDS:READTYPE=SUBREAD;BINDINGKIT=100-619-300;SEQUENCINGKIT=100-619-400;BASECALLERVERSION=3.0;FRAMERATEHZ=100\tPU:singleInsertion\tPM:SEQUEL (esc)
+  @RG\tID:b89a4406\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;BINDINGKIT=100356300;SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3;FRAMERATEHZ=100\tPU:m140905_042212_sidney_c100564852550000001823085912221377_s1_X0\tSM:c100564852550000001823085912221377\tPM:SEQUEL (esc)
+  @PG\tID:BLASR\tVN:1.3.1.141565\tCL:/home/UNIXHOME/yli/for_the_people/blasr_bam_out/blasr m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.1.bax.h5 lambdaNEB.fa -out tmp.bam -bam -bestn 10 -minMatch 12 -nproc 8 -minSubreadLength 50 -minReadLength 50 -randomSeed 1 -clipping subread  (esc)
+  @PG\tID:bwa\tPN:bwa\tVN:0.7.10-r1017-dirty\tCL:bwa mem lambdaNEB.fa singleInsertion.fasta (esc)
+  @PG\tID:pbmerge-@PacBioBAM_VERSION@\tPN:pbmerge\tVN:@PacBioBAM_VERSION@ (esc)
+
+  $ $BAM2SAM --no-header $MERGED_BAM | cut -f 1,3,4 | head -n 20
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/49050/48_1132\tlambda_NEB3011\t1 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/32328/387_1134\tlambda_NEB3011\t303 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/32328/0_344\tlambda_NEB3011\t676 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/9936_10187\tlambda_NEB3011\t2171 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/10232_10394\tlambda_NEB3011\t2204 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7468_8906\tlambda_NEB3011\t3573 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/5557_7235\tlambda_NEB3011\t4507 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/7285_8657\tlambda_NEB3011\t4508 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/426_1045\tlambda_NEB3011\t4593 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7064_7421\tlambda_NEB3011\t4670 (esc)
+  singleInsertion/100/0_49\tlambda_NEB3011\t5211 (esc)
+  singleInsertion/200/0_49\tlambda_NEB3011\t5211 (esc)
+  singleInsertion/100/0_111\tlambda_NEB3011\t9378 (esc)
+  singleInsertion/100/0_111\tlambda_NEB3011\t9378 (esc)
+
+  $ rm $MERGED_BAM
diff --git a/tests/src/cram/pbmerge_mixed_ordering.t.in b/tests/src/cram/pbmerge_mixed_ordering.t.in

new file mode 100644 (file)

index 0000000..f558def
--- /dev/null
+++ b/tests/src/cram/pbmerge_mixed_ordering.t.in
@@ -0,0 +1,57 @@
+Setup:
+
+  $ TOOLS_BIN="@PacBioBAM_BinDir@" && export TOOLS_BIN
+  $ PBMERGE="$TOOLS_BIN/pbmerge" && export PBMERGE
+  $ BAM2SAM="$TOOLS_BIN/bam2sam" && export BAM2SAM
+
+  $ DATADIR="@PacBioBAM_TestsDir@/data" && export DATADIR
+  $ UNALIGNED_BAM="$DATADIR/polymerase/internal.hqregions.bam" && export UNALIGNED_BAM
+  $ ALIGNED_BAM="$DATADIR/dataset/bam_mapping_1.bam" && export ALIGNED_BAM
+
+  $ MERGED_BAM="@GeneratedTestDataDir@/mixed_ordering_merged.bam" && export MERGED_BAM
+
+Sanity Check:
+
+  $ $BAM2SAM --header-only $UNALIGNED_BAM
+  @HD\tVN:1.1\tSO:unknown\tpb:3.0.1 (esc)
+  @RG\tID:ca75d884\tPL:PACBIO\tDS:READTYPE=HQREGION;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:Frames=ip;PulseWidth:Frames=pw;PkMid=pm;PkMean=pa;LabelQV=pq;AltLabel=pt;AltLabelQV=pv;PulseMergeQV=pg;PulseCall=pc;PrePulseFrames=pd;PulseCallWidth=px;BINDINGKIT=100372700;SEQUENCINGKIT=100356200;BASECALLERVERSION=0.1;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie (esc)
+  @PG\tID:baz2bam-0.15.0\tPN:baz2bam\tVN:0.15.0 (esc)
+  @PG\tID:bazFormat-0.3.0\tPN:bazFormat\tVN:0.3.0 (esc)
+  @PG\tID:bazwriter-0.15.0\tPN:bazwriter\tVN:0.15.0 (esc)
+
+  $ $BAM2SAM --no-header $UNALIGNED_BAM | cut -f 1
+  ArminsFakeMovie/100000/2659_7034
+
+  $ $BAM2SAM --header-only $ALIGNED_BAM
+  @HD\tVN:1.3.1\tSO:coordinate\tpb:3.0.1 (esc)
+  @SQ\tSN:lambda_NEB3011\tLN:48502\tM5:a1319ff90e994c8190a4fe6569d0822a (esc)
+  @RG\tID:a9a22406c5\tDS:READTYPE=SUBREAD;BINDINGKIT=100356300;SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3;InsertionQV=iq;DeletionQV=dq;SubstitutionQV=sq;MergeQV=mq;SubstitutionTag=st;DeletionTag=dt\tPL:PACBIO\tPU:m140905_042212_sidney_c100564852550000001823085912221377_s1_X0\tSM:c100564852550000001823085912221377 (esc)
+  @PG\tID:BLASR\tVN:1.3.1.141565\tCL:/home/UNIXHOME/yli/for_the_people/blasr_bam_out/blasr m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.1.bax.h5 lambdaNEB.fa -out tmp.bam -bam -bestn 10 -minMatch 12 -nproc 8 -minSubreadLength 50 -minReadLength 50 -randomSeed 1 -clipping subread  (esc)
+
+  $ $BAM2SAM --no-header $ALIGNED_BAM | cut -f 1,3,4 | head -n 10
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/49050/48_1132\tlambda_NEB3011\t1 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/32328/0_344\tlambda_NEB3011\t676 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/9936_10187\tlambda_NEB3011\t2171 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/10232_10394\tlambda_NEB3011\t2204 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7468_8906\tlambda_NEB3011\t3573 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/5557_7235\tlambda_NEB3011\t4507 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/7285_8657\tlambda_NEB3011\t4508 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/426_1045\tlambda_NEB3011\t4593 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7064_7421\tlambda_NEB3011\t4670 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/0_382\tlambda_NEB3011\t4843 (esc)
+
+Normal Merge (unaligned + aligned input) - should fail, dataset types differ:
+
+  $ $PBMERGE $UNALIGNED_BAM $ALIGNED_BAM > $MERGED_BAM
+  pbmerge ERROR: DataSet: cannot merge different dataset types
+  [1]
+
+Shuffled Input (aligned first) - should fail the same way:
+
+  $ $PBMERGE $ALIGNED_BAM $UNALIGNED_BAM > $MERGED_BAM
+  pbmerge ERROR: DataSet: cannot merge different dataset types
+  [1]
+
+Cleanup:
+
+  $ rm $MERGED_BAM
diff --git a/tests/src/cram/pbmerge_pacbio_ordering.t.in b/tests/src/cram/pbmerge_pacbio_ordering.t.in

new file mode 100644 (file)

index 0000000..f52759f
--- /dev/null
+++ b/tests/src/cram/pbmerge_pacbio_ordering.t.in
@@ -0,0 +1,457 @@
+Setup:
+
+  $ TOOLS_BIN="@PacBioBAM_BinDir@" && export TOOLS_BIN
+  $ PBMERGE="$TOOLS_BIN/pbmerge" && export PBMERGE
+  $ BAM2SAM="$TOOLS_BIN/bam2sam" && export BAM2SAM
+
+  $ DATADIR="@PacBioBAM_TestsDir@/data" && export DATADIR
+  $ HQREGION_BAM="$DATADIR/polymerase/internal.hqregions.bam" && export HQREGION_BAM
+  $ SCRAPS_BAM="$DATADIR/polymerase/internal.scraps.bam" && export SCRAPS_BAM
+
+  $ MERGED_BAM="@GeneratedTestDataDir@/pacbio_ordering_merged.bam" && export MERGED_BAM
+  $ MERGED_BAM_PBI="@GeneratedTestDataDir@/pacbio_ordering_merged.bam.pbi" && export MERGED_BAM_PBI
+
+Sanity Check:
+
+  $ $BAM2SAM --header-only $HQREGION_BAM
+  @HD\tVN:1.1\tSO:unknown\tpb:3.0.1 (esc)
+  @RG\tID:ca75d884\tPL:PACBIO\tDS:READTYPE=HQREGION;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:Frames=ip;PulseWidth:Frames=pw;PkMid=pm;PkMean=pa;LabelQV=pq;AltLabel=pt;AltLabelQV=pv;PulseMergeQV=pg;PulseCall=pc;PrePulseFrames=pd;PulseCallWidth=px;BINDINGKIT=100372700;SEQUENCINGKIT=100356200;BASECALLERVERSION=0.1;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie (esc)
+  @PG\tID:baz2bam-0.15.0\tPN:baz2bam\tVN:0.15.0 (esc)
+  @PG\tID:bazFormat-0.3.0\tPN:bazFormat\tVN:0.3.0 (esc)
+  @PG\tID:bazwriter-0.15.0\tPN:bazwriter\tVN:0.15.0 (esc)
+
+  $ $BAM2SAM --no-header $HQREGION_BAM | cut -f 1
+  ArminsFakeMovie/100000/2659_7034
+
+  $ $BAM2SAM --header-only $SCRAPS_BAM
+  @HD\tVN:1.1\tSO:unknown\tpb:3.0.1 (esc)
+  @RG\tID:e83fc9c6\tPL:PACBIO\tDS:READTYPE=SCRAP;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:Frames=ip;PulseWidth:Frames=pw;PkMid=pm;PkMean=pa;LabelQV=pq;AltLabel=pt;AltLabelQV=pv;PulseMergeQV=pg;PulseCall=pc;PrePulseFrames=pd;PulseCallWidth=px;BINDINGKIT=100372700;SEQUENCINGKIT=100356200;BASECALLERVERSION=0.1;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie (esc)
+  @PG\tID:baz2bam-0.15.0\tPN:baz2bam\tVN:0.15.0 (esc)
+  @PG\tID:bazFormat-0.3.0\tPN:bazFormat\tVN:0.3.0 (esc)
+  @PG\tID:bazwriter-0.15.0\tPN:bazwriter\tVN:0.15.0 (esc)
+
+  $ $BAM2SAM --no-header $SCRAPS_BAM | cut -f 1
+  ArminsFakeMovie/100000/0_2659
+  ArminsFakeMovie/100000/3025_3047
+  ArminsFakeMovie/100000/3047_3095
+  ArminsFakeMovie/100000/3095_3116
+  ArminsFakeMovie/100000/3628_3650
+  ArminsFakeMovie/100000/3650_3700
+  ArminsFakeMovie/100000/3700_3722
+  ArminsFakeMovie/100000/4267_4289
+  ArminsFakeMovie/100000/4289_4335
+  ArminsFakeMovie/100000/4335_4356
+  ArminsFakeMovie/100000/4864_4888
+  ArminsFakeMovie/100000/4888_4939
+  ArminsFakeMovie/100000/4939_4960
+  ArminsFakeMovie/100000/5477_5498
+  ArminsFakeMovie/100000/5498_5546
+  ArminsFakeMovie/100000/5546_5571
+  ArminsFakeMovie/100000/6087_6116
+  ArminsFakeMovie/100000/6116_6173
+  ArminsFakeMovie/100000/6173_6199
+  ArminsFakeMovie/100000/6719_6740
+  ArminsFakeMovie/100000/6740_6790
+  ArminsFakeMovie/100000/6790_6812
+  ArminsFakeMovie/100000/7034_7035
+  ArminsFakeMovie/200000/0_2659
+  ArminsFakeMovie/200000/3025_3047
+  ArminsFakeMovie/200000/3047_3095
+  ArminsFakeMovie/200000/3095_3116
+  ArminsFakeMovie/200000/3628_3650
+  ArminsFakeMovie/200000/3650_3700
+  ArminsFakeMovie/200000/3700_3722
+  ArminsFakeMovie/200000/4267_4289
+  ArminsFakeMovie/200000/4289_4335
+  ArminsFakeMovie/200000/4335_4356
+  ArminsFakeMovie/200000/4864_4888
+  ArminsFakeMovie/200000/4888_4939
+  ArminsFakeMovie/200000/4939_4960
+  ArminsFakeMovie/200000/5477_5498
+  ArminsFakeMovie/200000/5498_5546
+  ArminsFakeMovie/200000/5546_5571
+  ArminsFakeMovie/200000/6087_6116
+  ArminsFakeMovie/200000/6116_6173
+  ArminsFakeMovie/200000/6173_6199
+  ArminsFakeMovie/200000/6719_6740
+  ArminsFakeMovie/200000/6740_6790
+  ArminsFakeMovie/200000/6790_6812
+  ArminsFakeMovie/200000/7034_7035
+  ArminsFakeMovie/300000/0_2659
+  ArminsFakeMovie/300000/3025_3047
+  ArminsFakeMovie/300000/3047_3095
+  ArminsFakeMovie/300000/3095_3116
+  ArminsFakeMovie/300000/3628_3650
+  ArminsFakeMovie/300000/3650_3700
+  ArminsFakeMovie/300000/3700_3722
+  ArminsFakeMovie/300000/4267_4289
+  ArminsFakeMovie/300000/4289_4335
+  ArminsFakeMovie/300000/4335_4356
+  ArminsFakeMovie/300000/4864_4888
+  ArminsFakeMovie/300000/4888_4939
+  ArminsFakeMovie/300000/4939_4960
+  ArminsFakeMovie/300000/5477_5498
+  ArminsFakeMovie/300000/5498_5546
+  ArminsFakeMovie/300000/5546_5571
+  ArminsFakeMovie/300000/6087_6116
+  ArminsFakeMovie/300000/6116_6173
+  ArminsFakeMovie/300000/6173_6199
+  ArminsFakeMovie/300000/6719_6740
+  ArminsFakeMovie/300000/6740_6790
+  ArminsFakeMovie/300000/6790_6812
+  ArminsFakeMovie/300000/7034_7035
+
+Normal Merge:
+
+  $ $PBMERGE $HQREGION_BAM $SCRAPS_BAM > $MERGED_BAM
+
+  $ $BAM2SAM --header-only $MERGED_BAM
+  @HD\tVN:1.1\tSO:unknown\tpb:3.0.1 (esc)
+  @RG\tID:ca75d884\tPL:PACBIO\tDS:READTYPE=HQREGION;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:Frames=ip;PulseWidth:Frames=pw;PkMid=pm;PkMean=pa;LabelQV=pq;AltLabel=pt;AltLabelQV=pv;PulseMergeQV=pg;PulseCall=pc;PrePulseFrames=pd;PulseCallWidth=px;BINDINGKIT=100372700;SEQUENCINGKIT=100356200;BASECALLERVERSION=0.1;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\tPM:SEQUEL (esc)
+  @RG\tID:e83fc9c6\tPL:PACBIO\tDS:READTYPE=SCRAP;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:Frames=ip;PulseWidth:Frames=pw;PkMid=pm;PkMean=pa;LabelQV=pq;AltLabel=pt;AltLabelQV=pv;PulseMergeQV=pg;PulseCall=pc;PrePulseFrames=pd;PulseCallWidth=px;BINDINGKIT=100372700;SEQUENCINGKIT=100356200;BASECALLERVERSION=0.1;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\tPM:SEQUEL (esc)
+  @PG\tID:baz2bam-0.15.0\tPN:baz2bam\tVN:0.15.0 (esc)
+  @PG\tID:bazFormat-0.3.0\tPN:bazFormat\tVN:0.3.0 (esc)
+  @PG\tID:bazwriter-0.15.0\tPN:bazwriter\tVN:0.15.0 (esc)
+  @PG\tID:pbmerge-@PacBioBAM_VERSION@\tPN:pbmerge\tVN:@PacBioBAM_VERSION@ (esc)
+
+  $ $BAM2SAM --no-header $MERGED_BAM | cut -f 1
+  ArminsFakeMovie/100000/0_2659
+  ArminsFakeMovie/100000/2659_7034
+  ArminsFakeMovie/100000/3025_3047
+  ArminsFakeMovie/100000/3047_3095
+  ArminsFakeMovie/100000/3095_3116
+  ArminsFakeMovie/100000/3628_3650
+  ArminsFakeMovie/100000/3650_3700
+  ArminsFakeMovie/100000/3700_3722
+  ArminsFakeMovie/100000/4267_4289
+  ArminsFakeMovie/100000/4289_4335
+  ArminsFakeMovie/100000/4335_4356
+  ArminsFakeMovie/100000/4864_4888
+  ArminsFakeMovie/100000/4888_4939
+  ArminsFakeMovie/100000/4939_4960
+  ArminsFakeMovie/100000/5477_5498
+  ArminsFakeMovie/100000/5498_5546
+  ArminsFakeMovie/100000/5546_5571
+  ArminsFakeMovie/100000/6087_6116
+  ArminsFakeMovie/100000/6116_6173
+  ArminsFakeMovie/100000/6173_6199
+  ArminsFakeMovie/100000/6719_6740
+  ArminsFakeMovie/100000/6740_6790
+  ArminsFakeMovie/100000/6790_6812
+  ArminsFakeMovie/100000/7034_7035
+  ArminsFakeMovie/200000/0_2659
+  ArminsFakeMovie/200000/3025_3047
+  ArminsFakeMovie/200000/3047_3095
+  ArminsFakeMovie/200000/3095_3116
+  ArminsFakeMovie/200000/3628_3650
+  ArminsFakeMovie/200000/3650_3700
+  ArminsFakeMovie/200000/3700_3722
+  ArminsFakeMovie/200000/4267_4289
+  ArminsFakeMovie/200000/4289_4335
+  ArminsFakeMovie/200000/4335_4356
+  ArminsFakeMovie/200000/4864_4888
+  ArminsFakeMovie/200000/4888_4939
+  ArminsFakeMovie/200000/4939_4960
+  ArminsFakeMovie/200000/5477_5498
+  ArminsFakeMovie/200000/5498_5546
+  ArminsFakeMovie/200000/5546_5571
+  ArminsFakeMovie/200000/6087_6116
+  ArminsFakeMovie/200000/6116_6173
+  ArminsFakeMovie/200000/6173_6199
+  ArminsFakeMovie/200000/6719_6740
+  ArminsFakeMovie/200000/6740_6790
+  ArminsFakeMovie/200000/6790_6812
+  ArminsFakeMovie/200000/7034_7035
+  ArminsFakeMovie/300000/0_2659
+  ArminsFakeMovie/300000/3025_3047
+  ArminsFakeMovie/300000/3047_3095
+  ArminsFakeMovie/300000/3095_3116
+  ArminsFakeMovie/300000/3628_3650
+  ArminsFakeMovie/300000/3650_3700
+  ArminsFakeMovie/300000/3700_3722
+  ArminsFakeMovie/300000/4267_4289
+  ArminsFakeMovie/300000/4289_4335
+  ArminsFakeMovie/300000/4335_4356
+  ArminsFakeMovie/300000/4864_4888
+  ArminsFakeMovie/300000/4888_4939
+  ArminsFakeMovie/300000/4939_4960
+  ArminsFakeMovie/300000/5477_5498
+  ArminsFakeMovie/300000/5498_5546
+  ArminsFakeMovie/300000/5546_5571
+  ArminsFakeMovie/300000/6087_6116
+  ArminsFakeMovie/300000/6116_6173
+  ArminsFakeMovie/300000/6173_6199
+  ArminsFakeMovie/300000/6719_6740
+  ArminsFakeMovie/300000/6740_6790
+  ArminsFakeMovie/300000/6790_6812
+  ArminsFakeMovie/300000/7034_7035
+
+  $ rm $MERGED_BAM
+
+Shuffled Input (scraps first) - merged output should match the normal merge:
+
+  $ $PBMERGE $SCRAPS_BAM $HQREGION_BAM  > $MERGED_BAM
+
+  $ $BAM2SAM --header-only $MERGED_BAM
+  @HD\tVN:1.1\tSO:unknown\tpb:3.0.1 (esc)
+  @RG\tID:ca75d884\tPL:PACBIO\tDS:READTYPE=HQREGION;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:Frames=ip;PulseWidth:Frames=pw;PkMid=pm;PkMean=pa;LabelQV=pq;AltLabel=pt;AltLabelQV=pv;PulseMergeQV=pg;PulseCall=pc;PrePulseFrames=pd;PulseCallWidth=px;BINDINGKIT=100372700;SEQUENCINGKIT=100356200;BASECALLERVERSION=0.1;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\tPM:SEQUEL (esc)
+  @RG\tID:e83fc9c6\tPL:PACBIO\tDS:READTYPE=SCRAP;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:Frames=ip;PulseWidth:Frames=pw;PkMid=pm;PkMean=pa;LabelQV=pq;AltLabel=pt;AltLabelQV=pv;PulseMergeQV=pg;PulseCall=pc;PrePulseFrames=pd;PulseCallWidth=px;BINDINGKIT=100372700;SEQUENCINGKIT=100356200;BASECALLERVERSION=0.1;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\tPM:SEQUEL (esc)
+  @PG\tID:baz2bam-0.15.0\tPN:baz2bam\tVN:0.15.0 (esc)
+  @PG\tID:bazFormat-0.3.0\tPN:bazFormat\tVN:0.3.0 (esc)
+  @PG\tID:bazwriter-0.15.0\tPN:bazwriter\tVN:0.15.0 (esc)
+  @PG\tID:pbmerge-@PacBioBAM_VERSION@\tPN:pbmerge\tVN:@PacBioBAM_VERSION@ (esc)
+
+  $ $BAM2SAM --no-header $MERGED_BAM | cut -f 1
+  ArminsFakeMovie/100000/0_2659
+  ArminsFakeMovie/100000/2659_7034
+  ArminsFakeMovie/100000/3025_3047
+  ArminsFakeMovie/100000/3047_3095
+  ArminsFakeMovie/100000/3095_3116
+  ArminsFakeMovie/100000/3628_3650
+  ArminsFakeMovie/100000/3650_3700
+  ArminsFakeMovie/100000/3700_3722
+  ArminsFakeMovie/100000/4267_4289
+  ArminsFakeMovie/100000/4289_4335
+  ArminsFakeMovie/100000/4335_4356
+  ArminsFakeMovie/100000/4864_4888
+  ArminsFakeMovie/100000/4888_4939
+  ArminsFakeMovie/100000/4939_4960
+  ArminsFakeMovie/100000/5477_5498
+  ArminsFakeMovie/100000/5498_5546
+  ArminsFakeMovie/100000/5546_5571
+  ArminsFakeMovie/100000/6087_6116
+  ArminsFakeMovie/100000/6116_6173
+  ArminsFakeMovie/100000/6173_6199
+  ArminsFakeMovie/100000/6719_6740
+  ArminsFakeMovie/100000/6740_6790
+  ArminsFakeMovie/100000/6790_6812
+  ArminsFakeMovie/100000/7034_7035
+  ArminsFakeMovie/200000/0_2659
+  ArminsFakeMovie/200000/3025_3047
+  ArminsFakeMovie/200000/3047_3095
+  ArminsFakeMovie/200000/3095_3116
+  ArminsFakeMovie/200000/3628_3650
+  ArminsFakeMovie/200000/3650_3700
+  ArminsFakeMovie/200000/3700_3722
+  ArminsFakeMovie/200000/4267_4289
+  ArminsFakeMovie/200000/4289_4335
+  ArminsFakeMovie/200000/4335_4356
+  ArminsFakeMovie/200000/4864_4888
+  ArminsFakeMovie/200000/4888_4939
+  ArminsFakeMovie/200000/4939_4960
+  ArminsFakeMovie/200000/5477_5498
+  ArminsFakeMovie/200000/5498_5546
+  ArminsFakeMovie/200000/5546_5571
+  ArminsFakeMovie/200000/6087_6116
+  ArminsFakeMovie/200000/6116_6173
+  ArminsFakeMovie/200000/6173_6199
+  ArminsFakeMovie/200000/6719_6740
+  ArminsFakeMovie/200000/6740_6790
+  ArminsFakeMovie/200000/6790_6812
+  ArminsFakeMovie/200000/7034_7035
+  ArminsFakeMovie/300000/0_2659
+  ArminsFakeMovie/300000/3025_3047
+  ArminsFakeMovie/300000/3047_3095
+  ArminsFakeMovie/300000/3095_3116
+  ArminsFakeMovie/300000/3628_3650
+  ArminsFakeMovie/300000/3650_3700
+  ArminsFakeMovie/300000/3700_3722
+  ArminsFakeMovie/300000/4267_4289
+  ArminsFakeMovie/300000/4289_4335
+  ArminsFakeMovie/300000/4335_4356
+  ArminsFakeMovie/300000/4864_4888
+  ArminsFakeMovie/300000/4888_4939
+  ArminsFakeMovie/300000/4939_4960
+  ArminsFakeMovie/300000/5477_5498
+  ArminsFakeMovie/300000/5498_5546
+  ArminsFakeMovie/300000/5546_5571
+  ArminsFakeMovie/300000/6087_6116
+  ArminsFakeMovie/300000/6116_6173
+  ArminsFakeMovie/300000/6173_6199
+  ArminsFakeMovie/300000/6719_6740
+  ArminsFakeMovie/300000/6740_6790
+  ArminsFakeMovie/300000/6790_6812
+  ArminsFakeMovie/300000/7034_7035
+
+  $ rm $MERGED_BAM
+
+Explicit Output Filename (also enables PBI):
+
+  $ $PBMERGE -o $MERGED_BAM $HQREGION_BAM $SCRAPS_BAM
+
+  $ $BAM2SAM --header-only $MERGED_BAM
+  @HD\tVN:1.1\tSO:unknown\tpb:3.0.1 (esc)
+  @RG\tID:ca75d884\tPL:PACBIO\tDS:READTYPE=HQREGION;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:Frames=ip;PulseWidth:Frames=pw;PkMid=pm;PkMean=pa;LabelQV=pq;AltLabel=pt;AltLabelQV=pv;PulseMergeQV=pg;PulseCall=pc;PrePulseFrames=pd;PulseCallWidth=px;BINDINGKIT=100372700;SEQUENCINGKIT=100356200;BASECALLERVERSION=0.1;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\tPM:SEQUEL (esc)
+  @RG\tID:e83fc9c6\tPL:PACBIO\tDS:READTYPE=SCRAP;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:Frames=ip;PulseWidth:Frames=pw;PkMid=pm;PkMean=pa;LabelQV=pq;AltLabel=pt;AltLabelQV=pv;PulseMergeQV=pg;PulseCall=pc;PrePulseFrames=pd;PulseCallWidth=px;BINDINGKIT=100372700;SEQUENCINGKIT=100356200;BASECALLERVERSION=0.1;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\tPM:SEQUEL (esc)
+  @PG\tID:baz2bam-0.15.0\tPN:baz2bam\tVN:0.15.0 (esc)
+  @PG\tID:bazFormat-0.3.0\tPN:bazFormat\tVN:0.3.0 (esc)
+  @PG\tID:bazwriter-0.15.0\tPN:bazwriter\tVN:0.15.0 (esc)
+  @PG\tID:pbmerge-@PacBioBAM_VERSION@\tPN:pbmerge\tVN:@PacBioBAM_VERSION@ (esc)
+
+  $ $BAM2SAM --no-header $MERGED_BAM | cut -f 1
+  ArminsFakeMovie/100000/0_2659
+  ArminsFakeMovie/100000/2659_7034
+  ArminsFakeMovie/100000/3025_3047
+  ArminsFakeMovie/100000/3047_3095
+  ArminsFakeMovie/100000/3095_3116
+  ArminsFakeMovie/100000/3628_3650
+  ArminsFakeMovie/100000/3650_3700
+  ArminsFakeMovie/100000/3700_3722
+  ArminsFakeMovie/100000/4267_4289
+  ArminsFakeMovie/100000/4289_4335
+  ArminsFakeMovie/100000/4335_4356
+  ArminsFakeMovie/100000/4864_4888
+  ArminsFakeMovie/100000/4888_4939
+  ArminsFakeMovie/100000/4939_4960
+  ArminsFakeMovie/100000/5477_5498
+  ArminsFakeMovie/100000/5498_5546
+  ArminsFakeMovie/100000/5546_5571
+  ArminsFakeMovie/100000/6087_6116
+  ArminsFakeMovie/100000/6116_6173
+  ArminsFakeMovie/100000/6173_6199
+  ArminsFakeMovie/100000/6719_6740
+  ArminsFakeMovie/100000/6740_6790
+  ArminsFakeMovie/100000/6790_6812
+  ArminsFakeMovie/100000/7034_7035
+  ArminsFakeMovie/200000/0_2659
+  ArminsFakeMovie/200000/3025_3047
+  ArminsFakeMovie/200000/3047_3095
+  ArminsFakeMovie/200000/3095_3116
+  ArminsFakeMovie/200000/3628_3650
+  ArminsFakeMovie/200000/3650_3700
+  ArminsFakeMovie/200000/3700_3722
+  ArminsFakeMovie/200000/4267_4289
+  ArminsFakeMovie/200000/4289_4335
+  ArminsFakeMovie/200000/4335_4356
+  ArminsFakeMovie/200000/4864_4888
+  ArminsFakeMovie/200000/4888_4939
+  ArminsFakeMovie/200000/4939_4960
+  ArminsFakeMovie/200000/5477_5498
+  ArminsFakeMovie/200000/5498_5546
+  ArminsFakeMovie/200000/5546_5571
+  ArminsFakeMovie/200000/6087_6116
+  ArminsFakeMovie/200000/6116_6173
+  ArminsFakeMovie/200000/6173_6199
+  ArminsFakeMovie/200000/6719_6740
+  ArminsFakeMovie/200000/6740_6790
+  ArminsFakeMovie/200000/6790_6812
+  ArminsFakeMovie/200000/7034_7035
+  ArminsFakeMovie/300000/0_2659
+  ArminsFakeMovie/300000/3025_3047
+  ArminsFakeMovie/300000/3047_3095
+  ArminsFakeMovie/300000/3095_3116
+  ArminsFakeMovie/300000/3628_3650
+  ArminsFakeMovie/300000/3650_3700
+  ArminsFakeMovie/300000/3700_3722
+  ArminsFakeMovie/300000/4267_4289
+  ArminsFakeMovie/300000/4289_4335
+  ArminsFakeMovie/300000/4335_4356
+  ArminsFakeMovie/300000/4864_4888
+  ArminsFakeMovie/300000/4888_4939
+  ArminsFakeMovie/300000/4939_4960
+  ArminsFakeMovie/300000/5477_5498
+  ArminsFakeMovie/300000/5498_5546
+  ArminsFakeMovie/300000/5546_5571
+  ArminsFakeMovie/300000/6087_6116
+  ArminsFakeMovie/300000/6116_6173
+  ArminsFakeMovie/300000/6173_6199
+  ArminsFakeMovie/300000/6719_6740
+  ArminsFakeMovie/300000/6740_6790
+  ArminsFakeMovie/300000/6790_6812
+  ArminsFakeMovie/300000/7034_7035
+
+  $ [ -f $MERGED_BAM_PBI ] && echo "Found" || echo "Not found"
+  Found
+
+  $ rm $MERGED_BAM
+  $ rm $MERGED_BAM_PBI
+
+Explicit Output Filename (with disabled PBI):
+
+  $ $PBMERGE -o $MERGED_BAM --no-pbi $HQREGION_BAM $SCRAPS_BAM
+
+  $ $BAM2SAM --header-only $MERGED_BAM
+  @HD\tVN:1.1\tSO:unknown\tpb:3.0.1 (esc)
+  @RG\tID:ca75d884\tPL:PACBIO\tDS:READTYPE=HQREGION;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:Frames=ip;PulseWidth:Frames=pw;PkMid=pm;PkMean=pa;LabelQV=pq;AltLabel=pt;AltLabelQV=pv;PulseMergeQV=pg;PulseCall=pc;PrePulseFrames=pd;PulseCallWidth=px;BINDINGKIT=100372700;SEQUENCINGKIT=100356200;BASECALLERVERSION=0.1;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\tPM:SEQUEL (esc)
+  @RG\tID:e83fc9c6\tPL:PACBIO\tDS:READTYPE=SCRAP;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:Frames=ip;PulseWidth:Frames=pw;PkMid=pm;PkMean=pa;LabelQV=pq;AltLabel=pt;AltLabelQV=pv;PulseMergeQV=pg;PulseCall=pc;PrePulseFrames=pd;PulseCallWidth=px;BINDINGKIT=100372700;SEQUENCINGKIT=100356200;BASECALLERVERSION=0.1;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\tPM:SEQUEL (esc)
+  @PG\tID:baz2bam-0.15.0\tPN:baz2bam\tVN:0.15.0 (esc)
+  @PG\tID:bazFormat-0.3.0\tPN:bazFormat\tVN:0.3.0 (esc)
+  @PG\tID:bazwriter-0.15.0\tPN:bazwriter\tVN:0.15.0 (esc)
+  @PG\tID:pbmerge-@PacBioBAM_VERSION@\tPN:pbmerge\tVN:@PacBioBAM_VERSION@ (esc)
+
+  $ $BAM2SAM --no-header $MERGED_BAM | cut -f 1
+  ArminsFakeMovie/100000/0_2659
+  ArminsFakeMovie/100000/2659_7034
+  ArminsFakeMovie/100000/3025_3047
+  ArminsFakeMovie/100000/3047_3095
+  ArminsFakeMovie/100000/3095_3116
+  ArminsFakeMovie/100000/3628_3650
+  ArminsFakeMovie/100000/3650_3700
+  ArminsFakeMovie/100000/3700_3722
+  ArminsFakeMovie/100000/4267_4289
+  ArminsFakeMovie/100000/4289_4335
+  ArminsFakeMovie/100000/4335_4356
+  ArminsFakeMovie/100000/4864_4888
+  ArminsFakeMovie/100000/4888_4939
+  ArminsFakeMovie/100000/4939_4960
+  ArminsFakeMovie/100000/5477_5498
+  ArminsFakeMovie/100000/5498_5546
+  ArminsFakeMovie/100000/5546_5571
+  ArminsFakeMovie/100000/6087_6116
+  ArminsFakeMovie/100000/6116_6173
+  ArminsFakeMovie/100000/6173_6199
+  ArminsFakeMovie/100000/6719_6740
+  ArminsFakeMovie/100000/6740_6790
+  ArminsFakeMovie/100000/6790_6812
+  ArminsFakeMovie/100000/7034_7035
+  ArminsFakeMovie/200000/0_2659
+  ArminsFakeMovie/200000/3025_3047
+  ArminsFakeMovie/200000/3047_3095
+  ArminsFakeMovie/200000/3095_3116
+  ArminsFakeMovie/200000/3628_3650
+  ArminsFakeMovie/200000/3650_3700
+  ArminsFakeMovie/200000/3700_3722
+  ArminsFakeMovie/200000/4267_4289
+  ArminsFakeMovie/200000/4289_4335
+  ArminsFakeMovie/200000/4335_4356
+  ArminsFakeMovie/200000/4864_4888
+  ArminsFakeMovie/200000/4888_4939
+  ArminsFakeMovie/200000/4939_4960
+  ArminsFakeMovie/200000/5477_5498
+  ArminsFakeMovie/200000/5498_5546
+  ArminsFakeMovie/200000/5546_5571
+  ArminsFakeMovie/200000/6087_6116
+  ArminsFakeMovie/200000/6116_6173
+  ArminsFakeMovie/200000/6173_6199
+  ArminsFakeMovie/200000/6719_6740
+  ArminsFakeMovie/200000/6740_6790
+  ArminsFakeMovie/200000/6790_6812
+  ArminsFakeMovie/200000/7034_7035
+  ArminsFakeMovie/300000/0_2659
+  ArminsFakeMovie/300000/3025_3047
+  ArminsFakeMovie/300000/3047_3095
+  ArminsFakeMovie/300000/3095_3116
+  ArminsFakeMovie/300000/3628_3650
+  ArminsFakeMovie/300000/3650_3700
+  ArminsFakeMovie/300000/3700_3722
+  ArminsFakeMovie/300000/4267_4289
+  ArminsFakeMovie/300000/4289_4335
+  ArminsFakeMovie/300000/4335_4356
+  ArminsFakeMovie/300000/4864_4888
+  ArminsFakeMovie/300000/4888_4939
+  ArminsFakeMovie/300000/4939_4960
+  ArminsFakeMovie/300000/5477_5498
+  ArminsFakeMovie/300000/5498_5546
+  ArminsFakeMovie/300000/5546_5571
+  ArminsFakeMovie/300000/6087_6116
+  ArminsFakeMovie/300000/6116_6173
+  ArminsFakeMovie/300000/6173_6199
+  ArminsFakeMovie/300000/6719_6740
+  ArminsFakeMovie/300000/6740_6790
+  ArminsFakeMovie/300000/6790_6812
+  ArminsFakeMovie/300000/7034_7035
+
+  $ [ -f $MERGED_BAM_PBI ] && echo "Found" || echo "Not found"
+  Not found
+
+  $ rm $MERGED_BAM
diff --git a/tests/src/meson.build b/tests/src/meson.build

new file mode 100644 (file)

index 0000000..c853955
--- /dev/null
+++ b/tests/src/meson.build
@@ -0,0 +1,86 @@
+pbbam_test_cpp_sources = files([  # C++ unit-test sources (test_*.cpp next to this meson.build)
+  'test_AlignmentPrinter.cpp',
+  'test_BamFile.cpp',
+  'test_BamHeader.cpp',
+  'test_BamRecord.cpp',
+  'test_BamRecordBuilder.cpp',
+  'test_BamRecordClipping.cpp',
+  'test_BamRecordImplCore.cpp',
+  'test_BamRecordImplTags.cpp',
+  'test_BamRecordImplVariableData.cpp',
+  'test_BamRecordMapping.cpp',
+  'test_BamWriter.cpp',
+  'test_BarcodeQuery.cpp',
+  'test_BedReader.cpp',
+  'test_BedWriter.cpp',
+  'test_BgzipFastaWriter.cpp',
+  'test_BgzipFastqWriter.cpp',
+  'test_BgzipWriter.cpp',
+  'test_CCSPbiBuilder.cpp',
+  'test_CCSRecordIO.cpp',
+  'test_Cigar.cpp',
+  'test_Compare.cpp',
+  'test_DataSetCore.cpp',
+  'test_DataSetIO.cpp',
+  'test_DataSetQuery.cpp',
+  'test_DataSetXsd.cpp',
+  'test_EndToEnd.cpp',
+  'test_EntireFileQuery.cpp',
+  'test_FaiIndex.cpp',
+  'test_FaiZmwChunker.cpp',
+  'test_FastaReader.cpp',
+  'test_FastaSequence.cpp',
+  'test_FastaSequenceQuery.cpp',
+  'test_FastaWriter.cpp',
+  'test_FastqReader.cpp',
+  'test_FastqSequence.cpp',
+  'test_FastqWriter.cpp',
+  'test_FileUtils.cpp',
+  'test_Frames.cpp',
+  'test_GenomicIntervalQuery.cpp',
+  'test_GenomicIntervals.cpp',
+  'test_IndexedBamWriter.cpp',
+  'test_IndexedFastaReader.cpp',
+  'test_IndexedFastqReader.cpp',
+  'test_LongCigar.cpp',
+  'test_PacBioIndex.cpp',
+  'test_PbiFilter.cpp',
+  'test_PbiFilterQuery.cpp',
+  'test_QNameQuery.cpp',
+  'test_QualityValues.cpp',
+  'test_Pulse2BaseCache.cpp',
+  'test_ReadAccuracyQuery.cpp',
+  'test_ReadGroupInfo.cpp',
+  'test_SamWriter.cpp',
+  'test_SequenceUtils.cpp',
+  'test_StringUtils.cpp',
+  'test_SubreadLengthQuery.cpp',
+  'test_Tags.cpp',
+  'test_TextFileReader.cpp',
+  'test_TextFileWriter.cpp',
+  'test_TimeUtils.cpp',
+  'test_Validator.cpp',
+  'test_VcfFile.cpp',
+  'test_VcfFormat.cpp',
+  'test_VcfHeader.cpp',
+  'test_VcfReader.cpp',
+  'test_VcfSort.cpp',
+  'test_VcfQuery.cpp',
+  'test_VcfVariant.cpp',
+  'test_VcfWriter.cpp',
+  'test_Version.cpp',
+  'test_WhitelistedZmwReadStitcher.cpp',
+  'test_ZmwChunkedFastxReader.cpp',
+  'test_ZmwReadStitcher.cpp',
+  'test_ZmwQuery.cpp'])
+
+# cram test inputs (*.t.in; presumably templates configured by the build -- TODO confirm)
+pbbam_cram_bam2sam_t_in = files('cram/bam2sam.t.in')
+pbbam_cram_pbindexdump_json_t_in = files('cram/pbindexdump_json.t.in')
+pbbam_cram_pbindexdump_cpp_t_in = files('cram/pbindexdump_cpp.t.in')
+pbbam_cram_pbmerge_pacbio_ordering_t_in = files('cram/pbmerge_pacbio_ordering.t.in')
+pbbam_cram_pbmerge_aligned_ordering_t_in = files('cram/pbmerge_aligned_ordering.t.in')
+pbbam_cram_pbmerge_mixed_ordering_t_in = files('cram/pbmerge_mixed_ordering.t.in')
+pbbam_cram_pbmerge_dataset_t_in = files('cram/pbmerge_dataset.t.in')
+pbbam_cram_pbmerge_fofn_t_in = files('cram/pbmerge_fofn.t.in')
+pbbam_cram_pbbamify_t_in = files('cram/pbbamify.t.in')
diff --git a/tests/src/test_AlignmentPrinter.cpp b/tests/src/test_AlignmentPrinter.cpp

new file mode 100644 (file)

index 0000000..a88deab
--- /dev/null
+++ b/tests/src/test_AlignmentPrinter.cpp
@@ -0,0 +1,120 @@
+// Author: Derek Barnett
+
+#include <string>
+
+#include <gtest/gtest.h>
+
+#include "PbbamTestData.h"
+
+#include <pbbam/AlignmentPrinter.h>
+#include <pbbam/BamFile.h>
+#include <pbbam/BamRecord.h>
+#include <pbbam/EntireFileQuery.h>
+#include <pbbam/IndexedFastaReader.h>
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+namespace AlignmentPrinterTests {
+// Fixture paths for the Print test; note the "singleInsertion" BAM is the file aligned.bam.
+const std::string lambdaFasta = PbbamTestsConfig::Data_Dir + "/lambdaNEB.fa";
+const std::string singleInsertionBam = PbbamTestsConfig::Data_Dir + "/aligned.bam";
+
+}  // namespace AlignmentPrinterTests
+
+TEST(AlignmentPrinterTest, Print)
+{
+    IndexedFastaReader r(AlignmentPrinterTests::lambdaFasta);
+    AlignmentPrinter pretty(r);
+
+    BamFile bamFile(AlignmentPrinterTests::singleInsertionBam);
+    EntireFileQuery bamQuery(bamFile);
+    auto it = bamQuery.begin();
+
+    // expected text wraps some '|' bars in ANSI escapes: ESC[1m ESC[31m (bold, red) ... reset
+    auto expected = std::string{
+        "Read        : singleInsertion/100/0_49\n"
+        "Reference   : lambda_NEB3011\n"
+        "\n"
+        "Read-length : 49\n"
+        "Concordance : 0.96\n"
+        "\n"
+        "5210 : GGCTGCAGTGTACAGCGGTCAGGAGGCC-ATTGATGCCGG : 5249\n"
+        "       \x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m||||||| "
+        "|\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m|||||||||\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m||||||| "
+        "||\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m||||||||\n"
+        "   0 : GGCTGCAG-GTACAGCGGTCAGGAGGCCAATTGATGCCGG :   39\n"
+        "\n"
+        "5249 : ACTGGCTGAT : 5259\n"
+        "       |\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m||||||||\n"
+        "  39 : ACTGGCTGAT :   49\n"
+        "\n"};
+
+    auto record = *it++;  // take the current record, then advance the iterator
+    EXPECT_EQ(expected, pretty.Print(record, Orientation::GENOMIC));
+
+    expected = std::string{  // expectation for the 2nd record
+        "Read        : singleInsertion/200/0_49\n"
+        "Reference   : lambda_NEB3011\n"
+        "\n"
+        "Read-length : 49\n"
+        "Concordance : 0.96\n"
+        "\n"
+        "5210 : GGCTGCAGTGTACAGCGGTCAGGAGGCC-ATTGATGCCGG : 5249\n"
+        "       \x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m||||||| "
+        "|\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m|||||||||\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m||||||| "
+        "||\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m||||||||\n"
+        "   0 : GGCTGCAG-GTACAGCGGTCAGGAGGCCAATTGATGCCGG :   39\n"
+        "\n"
+        "5249 : ACTGGCTGAT : 5259\n"
+        "       |\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m||||||||\n"
+        "  39 : ACTGGCTGAT :   49\n"
+        "\n"};
+
+    record = *it++;
+    EXPECT_EQ(expected, pretty.Print(record, Orientation::GENOMIC));
+
+    expected = std::string{  // expectation for the 3rd record
+        "Read        : singleInsertion/100/0_111\n"
+        "Reference   : lambda_NEB3011\n"
+        "\n"
+        "Read-length : 59\n"
+        "Concordance : 0.951\n"
+        "\n"
+        "9377 : AAGTCACCAATGTGGGACGTCCGTCGATGGCAGAAGATCG : 9417\n"
+        "       "
+        "|||\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m|||||||||\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m|||||||||"
+        "\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m|||||||||\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m|||  |\n"
+        "   0 : AAGTCACCAATGTGGGACGTCCGTCGATGGCAGAAGA--G :   38\n"
+        "\n"
+        "9417 : CAGCACGGT-AACAGCGGCAA : 9437\n"
+        "       |||\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m||||| "
+        "||||\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m||||||\n"
+        "  38 : CAGCACGGTAAACAGCGGCAA :   59\n"
+        "\n"};
+
+    record = *it++;
+    EXPECT_EQ(expected, pretty.Print(record, Orientation::GENOMIC));
+
+    expected = std::string{  // expectation for the 4th record (same text as the 3rd)
+        "Read        : singleInsertion/100/0_111\n"
+        "Reference   : lambda_NEB3011\n"
+        "\n"
+        "Read-length : 59\n"
+        "Concordance : 0.951\n"
+        "\n"
+        "9377 : AAGTCACCAATGTGGGACGTCCGTCGATGGCAGAAGATCG : 9417\n"
+        "       "
+        "|||\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m|||||||||\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m|||||||||"
+        "\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m|||||||||\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m|||  |\n"
+        "   0 : AAGTCACCAATGTGGGACGTCCGTCGATGGCAGAAGA--G :   38\n"
+        "\n"
+        "9417 : CAGCACGGT-AACAGCGGCAA : 9437\n"
+        "       |||\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m||||| "
+        "||||\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m||||||\n"
+        "  38 : CAGCACGGTAAACAGCGGCAA :   59\n"
+        "\n"};
+
+    record = *it++;
+    EXPECT_EQ(expected, pretty.Print(record, Orientation::GENOMIC));
+}
diff --git a/tests/src/test_BamFile.cpp b/tests/src/test_BamFile.cpp

new file mode 100644 (file)

index 0000000..5ac71dd
--- /dev/null
+++ b/tests/src/test_BamFile.cpp
@@ -0,0 +1,105 @@
+// Author: Derek Barnett
+
+#include <unistd.h>
+#include <cstddef>
+#include <cstdlib>
+#include <stdexcept>
+
+#include <gtest/gtest.h>
+
+#include "PbbamTestData.h"
+
+#include <pbbam/../../src/FileUtils.h>
+#include <pbbam/BamFile.h>
+#include <pbbam/EntireFileQuery.h>
+#include <pbbam/Unused.h>
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+namespace BamFileTests {
+
+template <typename T>
+void CheckFile(const T& input, const size_t expectedCount)
+{
+    // Runs an EntireFileQuery over 'input' and checks that it yields exactly
+    // 'expectedCount' records. The records themselves are not inspected.
+    EntireFileQuery query(input);
+
+    size_t observedCount = 0;
+    for (const BamRecord& record : query) {
+        UNUSED(record);
+        observedCount += 1;
+    }
+
+    EXPECT_EQ(expectedCount, observedCount);
+}
+
+}  // namespace BamFileTests
+
+TEST(BamFileTest, NonExistentFileThrows)
+{
+    // constructing a BamFile from this filename is expected to fail
+    const std::string missingFn{"does_not_exist.bam"};
+    EXPECT_THROW(BamFile{missingFn}, std::runtime_error);
+}
+
+TEST(BamFileTest, NonBamFileThrows)
+{
+    // a file from the test data directory that is not a BAM
+    const std::string nonBamFn = PbbamTestsConfig::Data_Dir + "/lambdaNEB.fa.fai";
+    EXPECT_THROW(BamFile{nonBamFn}, std::runtime_error);
+}
+
+namespace BamFileTests {
+
+// Captures the current working directory on construction and changes back to
+// it on destruction. Used by the relative-path tests below so that a failed
+// ASSERT_* or an exception thrown mid-test cannot leave the process (and any
+// test that runs afterwards) in the wrong directory.
+class ScopedWorkingDirectory
+{
+public:
+    ScopedWorkingDirectory() : original_(FileUtils::CurrentWorkingDirectory()) {}
+
+    ScopedWorkingDirectory(const ScopedWorkingDirectory&) = delete;
+    ScopedWorkingDirectory& operator=(const ScopedWorkingDirectory&) = delete;
+
+    // non-fatal check: ASSERT_* may not be used inside a destructor
+    ~ScopedWorkingDirectory() { EXPECT_EQ(0, chdir(original_.c_str())); }
+
+private:
+    const std::string original_;
+};
+
+}  // namespace BamFileTests
+
+TEST(BamFileTest, RelativePathBamOk)
+{
+    // cache current working directory (restored when the guard goes out of
+    // scope), then drill down so we can point to BAMs using relative path
+    const BamFileTests::ScopedWorkingDirectory restoreCwd;
+    ASSERT_EQ(0, chdir(PbbamTestsConfig::Data_Dir.c_str()));
+    ASSERT_EQ(0, chdir("relative/a"));
+
+    // BamFile from relative BAM fn
+    BamFileTests::CheckFile(BamFile{"../b/test1.bam"}, 3);
+
+    // dataset from relative BAM fn
+    BamFileTests::CheckFile(DataSet{"../b/test1.bam"}, 3);
+
+    // dataset from BamFile object (itself from relative BAM fn)
+    {
+        auto file = BamFile{"../b/test1.bam"};
+        BamFileTests::CheckFile(DataSet{file}, 3);
+    }
+}
+
+TEST(BamFileTest, RelativePathXmlOk)
+{
+    // cache current working directory (restored when the guard goes out of
+    // scope), then move to the test data directory
+    const BamFileTests::ScopedWorkingDirectory restoreCwd;
+    ASSERT_EQ(0, chdir(PbbamTestsConfig::Data_Dir.c_str()));
+
+    // dataset from XML containing relative paths
+    BamFileTests::CheckFile(DataSet{"relative/relative.xml"}, 9);
+}
+
+TEST(BamFileTest, RelativePathFofnOk)
+{
+    // cache current working directory (restored when the guard goes out of
+    // scope), then move to the test data directory
+    const BamFileTests::ScopedWorkingDirectory restoreCwd;
+    ASSERT_EQ(0, chdir(PbbamTestsConfig::Data_Dir.c_str()));
+
+    // dataset from FOFN containing relative paths
+    BamFileTests::CheckFile(DataSet{"relative/relative.fofn"}, 9);
+
+    // NOTE: doesn't yet support a FOFN containing an XML with relative paths
+    //       BamFileTests::CheckFile(DataSet{ "relative/relative2.fofn" }, 60);
+}
+
+TEST(BamFileTest, TruncatedFileThrowsOk)
+{
+    // "truncated.bam" lives in the generated-data directory
+    const std::string truncatedFn = PbbamTestsConfig::GeneratedData_Dir + "/truncated.bam";
+    EXPECT_THROW(BamFile{truncatedFn}, std::runtime_error);
+}
diff --git a/tests/src/test_BamHeader.cpp b/tests/src/test_BamHeader.cpp

new file mode 100644 (file)

index 0000000..43aa01d
--- /dev/null
+++ b/tests/src/test_BamHeader.cpp
@@ -0,0 +1,392 @@
+// Author: Derek Barnett
+
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include <gtest/gtest.h>
+#include <htslib/sam.h>
+
+#include <pbbam/BamHeader.h>
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+namespace BamHeaderTests {
+
+// Deleter for smart pointers that own an htslib bam_hdr_t.
+struct BamHdrDeleter
+{
+    // Releases 'hdr' via bam_hdr_destroy(); a null pointer is left untouched.
+    // The pointer is received by value, so there is nothing to reset afterwards.
+    void operator()(bam_hdr_t* hdr) const
+    {
+        if (hdr) bam_hdr_destroy(hdr);
+    }
+};
+
+}  // namespace BamHeaderTests
+
+TEST(BamHeaderTest, DefaultConstruction)
+{
+    BamHeader header;
+
+    // a fresh header carries no data at all
+    EXPECT_TRUE(header.Version().empty());
+    EXPECT_TRUE(header.SortOrder().empty());  // default to unknown ?
+    EXPECT_TRUE(header.ReadGroups().empty());
+    EXPECT_TRUE(header.Sequences().empty());
+    EXPECT_TRUE(header.Programs().empty());
+    EXPECT_TRUE(header.Comments().empty());
+
+    // ... so every lookup, by ID/name or by index, must throw
+    const std::string unknownKey{"foo"};
+    const int unknownIndex = 42;
+    EXPECT_THROW(header.Program(unknownKey), std::exception);
+    EXPECT_THROW(header.ReadGroup(unknownKey), std::exception);
+    EXPECT_THROW(header.SequenceId(unknownKey), std::exception);
+    EXPECT_THROW(header.SequenceLength(unknownIndex), std::exception);
+    EXPECT_THROW(header.SequenceName(unknownIndex), std::exception);
+}
+
+TEST(BamHeaderTest, DecodeTest)
+{
+    // SAM header text with one @HD line plus @SQ, @RG, @PG and @CO entries
+    const std::string text{
+        "@HD\tVN:1.1\tSO:queryname\tpb:3.0.1\n"
+        "@SQ\tSN:chr1\tLN:2038\tSP:chocobo\n"
+        "@SQ\tSN:chr2\tLN:3042\tSP:chocobo\n"
+        "@RG\tID:rg1\tSM:control\n"
+        "@RG\tID:rg2\tSM:condition1\n"
+        "@RG\tID:rg3\tSM:condition1\n"
+        "@PG\tID:_foo_\tPN:ide\n"
+        "@CO\tipsum and so on\n"
+        "@CO\tcitation needed\n"};
+
+    BamHeader header{text};
+
+    // @HD
+    EXPECT_EQ("1.1", header.Version());
+    EXPECT_EQ("queryname", header.SortOrder());
+    EXPECT_EQ("3.0.1", header.PacBioBamVersion());
+
+    // @RG
+    EXPECT_EQ(3u, header.ReadGroups().size());
+    EXPECT_TRUE(header.HasReadGroup("rg1"));
+    EXPECT_TRUE(header.HasReadGroup("rg2"));
+    EXPECT_TRUE(header.HasReadGroup("rg3"));
+
+    EXPECT_EQ("control", header.ReadGroup("rg1").Sample());
+    EXPECT_EQ("condition1", header.ReadGroup("rg2").Sample());
+    EXPECT_EQ("condition1", header.ReadGroup("rg3").Sample());
+
+    // @SQ
+    EXPECT_EQ(2u, header.Sequences().size());
+    EXPECT_TRUE(header.HasSequence("chr1"));
+    EXPECT_TRUE(header.HasSequence("chr2"));
+    EXPECT_EQ("chocobo", header.Sequence("chr1").Species());
+    EXPECT_EQ("chocobo", header.Sequence("chr2").Species());
+    EXPECT_EQ("2038", header.Sequence("chr1").Length());
+    EXPECT_EQ("3042", header.Sequence("chr2").Length());
+
+    // @PG
+    EXPECT_EQ(1u, header.Programs().size());
+    EXPECT_TRUE(header.HasProgram("_foo_"));
+    EXPECT_EQ("ide", header.Program("_foo_").Name());
+
+    // @CO (in input order)
+    EXPECT_EQ(2u, header.Comments().size());
+    EXPECT_EQ("ipsum and so on", header.Comments().at(0));
+    EXPECT_EQ("citation needed", header.Comments().at(1));
+}
+
+TEST(BamHeaderTest, VersionCheckOk)
+{
+    // (trace label, header text) pairs: constructing a BamHeader from each
+    // text is expected to throw because of its "pb" version value
+    const std::vector<std::pair<std::string, std::string>> rejected{
+        {"empty version", "@HD\tVN:1.1\tSO:queryname\tpb:\n"},
+        {"old beta version", "@HD\tVN:1.1\tSO:queryname\tpb:3.0b3\n"},
+        {"old beta version", "@HD\tVN:1.1\tSO:queryname\tpb:3.0b7\n"},
+        {"invalid value", "@HD\tVN:1.1\tSO:queryname\tpb:3.0.should_not_work\n"},
+        {"earlier than minimum", "@HD\tVN:1.1\tSO:queryname\tpb:3.0.0\n"}};
+
+    for (const auto& entry : rejected) {
+        SCOPED_TRACE(entry.first);
+        EXPECT_THROW(BamHeader{entry.second}, std::runtime_error);
+    }
+
+    // correct version syntax, number
+    EXPECT_NO_THROW(BamHeader{"@HD\tVN:1.1\tSO:queryname\tpb:3.0.1\n"});
+}
+
+TEST(BamHeaderTest, EncodeTest)
+{
+    // read groups
+    ReadGroupInfo rg1("rg1");
+    ReadGroupInfo rg2("rg2");
+    ReadGroupInfo rg3("rg3");
+    rg1.Sample("control");
+    rg2.Sample("condition1");
+    rg3.Sample("condition1");
+
+    // reference sequences
+    SequenceInfo seq1("chr1");
+    SequenceInfo seq2("chr2");
+    seq1.Length("2038").Species("chocobo");
+    seq2.Length("3042").Species("chocobo");
+
+    // program
+    ProgramInfo prog1("_foo_");
+    prog1.Name("ide");
+
+    // assemble the header, one entry at a time
+    BamHeader header;
+    header.Version("1.1");
+    header.SortOrder("queryname");
+    header.PacBioBamVersion("3.0.1");
+    header.AddReadGroup(rg1);
+    header.AddReadGroup(rg2);
+    header.AddReadGroup(rg3);
+    header.AddSequence(seq1);
+    header.AddSequence(seq2);
+    header.AddProgram(prog1);
+    header.AddComment("ipsum and so on");
+    header.AddComment("citation needed");
+
+    // SAM text the header is expected to serialize to
+    const std::string expectedText{
+        "@HD\tVN:1.1\tSO:queryname\tpb:3.0.1\n"
+        "@SQ\tSN:chr1\tLN:2038\tSP:chocobo\n"
+        "@SQ\tSN:chr2\tLN:3042\tSP:chocobo\n"
+        "@RG\tID:rg1\tPL:PACBIO\tDS:READTYPE=UNKNOWN\tSM:control\tPM:SEQUEL\n"
+        "@RG\tID:rg2\tPL:PACBIO\tDS:READTYPE=UNKNOWN\tSM:condition1\tPM:SEQUEL\n"
+        "@RG\tID:rg3\tPL:PACBIO\tDS:READTYPE=UNKNOWN\tSM:condition1\tPM:SEQUEL\n"
+        "@PG\tID:_foo_\tPN:ide\n"
+        "@CO\tipsum and so on\n"
+        "@CO\tcitation needed\n"};
+
+    EXPECT_EQ(expectedText, header.ToSam());
+}
+
+// Serializes a BamHeader to SAM text, parses that text into an htslib
+// bam_hdr_t, stores the text on the raw header and checks it reads back equal
+// to the expected SAM text.
+TEST(BamHeaderTest, ConvertToRawDataOk)
+{
+    ReadGroupInfo rg1("rg1");
+    rg1.Sample("control");
+    ReadGroupInfo rg2("rg2");
+    rg2.Sample("condition1");
+    ReadGroupInfo rg3("rg3");
+    rg3.Sample("condition1");
+
+    SequenceInfo seq1("chr1");
+    seq1.Length("2038").Species("chocobo");
+    SequenceInfo seq2("chr2");
+    seq2.Length("3042").Species("chocobo");
+
+    ProgramInfo prog1("_foo_");
+    prog1.Name("ide");
+
+    BamHeader header;
+    header.Version("1.1")
+        .SortOrder("queryname")
+        .PacBioBamVersion("3.0.1")
+        .AddReadGroup(rg1)
+        .AddReadGroup(rg2)
+        .AddReadGroup(rg3)
+        .AddSequence(seq1)
+        .AddSequence(seq2)
+        .AddProgram(prog1)
+        .AddComment("ipsum and so on")
+        .AddComment("citation needed");
+
+    const std::string expectedText{
+        "@HD\tVN:1.1\tSO:queryname\tpb:3.0.1\n"
+        "@SQ\tSN:chr1\tLN:2038\tSP:chocobo\n"
+        "@SQ\tSN:chr2\tLN:3042\tSP:chocobo\n"
+        "@RG\tID:rg1\tPL:PACBIO\tDS:READTYPE=UNKNOWN\tSM:control\tPM:SEQUEL\n"
+        "@RG\tID:rg2\tPL:PACBIO\tDS:READTYPE=UNKNOWN\tSM:condition1\tPM:SEQUEL\n"
+        "@RG\tID:rg3\tPL:PACBIO\tDS:READTYPE=UNKNOWN\tSM:condition1\tPM:SEQUEL\n"
+        "@PG\tID:_foo_\tPN:ide\n"
+        "@CO\tipsum and so on\n"
+        "@CO\tcitation needed\n"};
+
+    const std::string text = header.ToSam();
+    std::shared_ptr<bam_hdr_t> rawData(sam_hdr_parse(text.size(), text.c_str()),
+                                       BamHeaderTests::BamHdrDeleter());
+    // stop here (rather than dereference null) if htslib returned no header
+    ASSERT_TRUE(static_cast<bool>(rawData));
+    rawData->ignore_sam_err = 0;
+    rawData->cigar_tab = nullptr;
+    rawData->l_text = text.size();
+    // +1 with calloc: keeps the stored copy NULL-terminated
+    rawData->text = static_cast<char*>(calloc(rawData->l_text + 1, 1));
+    ASSERT_TRUE(rawData->text != nullptr);
+    memcpy(rawData->text, text.c_str(), rawData->l_text);
+
+    const std::string rawText(rawData->text, rawData->l_text);
+    EXPECT_EQ(expectedText, rawText);
+}
+
+// Round trip: BamHeader -> SAM text -> htslib bam_hdr_t (text stored on it)
+// -> new BamHeader built from that stored text. The new header must agree
+// with the original on the @HD fields and serialize to the expected SAM text.
+TEST(BamHeaderTest, ExtractFromRawDataOk)
+{
+    ReadGroupInfo rg1("rg1");
+    rg1.Sample("control");
+    ReadGroupInfo rg2("rg2");
+    rg2.Sample("condition1");
+    ReadGroupInfo rg3("rg3");
+    rg3.Sample("condition1");
+
+    SequenceInfo seq1("chr1");
+    seq1.Length("2038").Species("chocobo");
+    SequenceInfo seq2("chr2");
+    seq2.Length("3042").Species("chocobo");
+
+    ProgramInfo prog1("_foo_");
+    prog1.Name("ide");
+
+    BamHeader header;
+    header.Version("1.1")
+        .SortOrder("queryname")
+        .PacBioBamVersion("3.0.1")
+        .AddReadGroup(rg1)
+        .AddReadGroup(rg2)
+        .AddReadGroup(rg3)
+        .AddSequence(seq1)
+        .AddSequence(seq2)
+        .AddProgram(prog1)
+        .AddComment("ipsum and so on")
+        .AddComment("citation needed");
+
+    const std::string expectedText{
+        "@HD\tVN:1.1\tSO:queryname\tpb:3.0.1\n"
+        "@SQ\tSN:chr1\tLN:2038\tSP:chocobo\n"
+        "@SQ\tSN:chr2\tLN:3042\tSP:chocobo\n"
+        "@RG\tID:rg1\tPL:PACBIO\tDS:READTYPE=UNKNOWN\tSM:control\tPM:SEQUEL\n"
+        "@RG\tID:rg2\tPL:PACBIO\tDS:READTYPE=UNKNOWN\tSM:condition1\tPM:SEQUEL\n"
+        "@RG\tID:rg3\tPL:PACBIO\tDS:READTYPE=UNKNOWN\tSM:condition1\tPM:SEQUEL\n"
+        "@PG\tID:_foo_\tPN:ide\n"
+        "@CO\tipsum and so on\n"
+        "@CO\tcitation needed\n"};
+
+    std::string text = header.ToSam();
+    std::shared_ptr<bam_hdr_t> rawData(sam_hdr_parse(text.size(), text.c_str()),
+                                       BamHeaderTests::BamHdrDeleter());
+    // stop here (rather than dereference null) if htslib returned no header
+    ASSERT_TRUE(static_cast<bool>(rawData));
+    rawData->ignore_sam_err = 0;
+    rawData->cigar_tab = nullptr;
+    rawData->l_text = text.size();
+    // +1 with calloc: keeps the stored copy NULL-terminated
+    rawData->text = static_cast<char*>(calloc(rawData->l_text + 1, 1));
+    ASSERT_TRUE(rawData->text != nullptr);
+    memcpy(rawData->text, text.c_str(), rawData->l_text);
+
+    const BamHeader newHeader = BamHeader(std::string(rawData->text, rawData->l_text));
+
+    EXPECT_EQ(header.Version(), newHeader.Version());
+    EXPECT_EQ(header.SortOrder(), newHeader.SortOrder());
+    EXPECT_EQ(header.PacBioBamVersion(), newHeader.PacBioBamVersion());
+
+    text = newHeader.ToSam();
+    EXPECT_EQ(expectedText, text);
+}
+
+TEST(BamHeaderTest, MergeOk)
+{
+    // first input header
+    const std::string textA{
+        "@HD\tVN:1.1\tSO:unknown\tpb:3.0.1\n"
+        "@RG\tID:a955def6\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;"
+        "InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;Ipd:CodecV1=ip;BINDINGKIT=100356300;"
+        "SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3.0.0.140018;FRAMERATEHZ=75.000000\t"
+        "PU:m140918_150013_42139_c100697631700000001823144703261565_s1_p0\t"
+        "PM:SEQUEL\n"
+        "@PG\tID:bam2bam-0.20.0\tPN:bam2bam\tVN:0.20.0\n"
+        "@PG\tID:bax2bam-0.0.2\tPN:bax2bam\tVN:0.0.2\n"
+        "@CO\tcomment1\n"};
+
+    // second input header
+    const std::string textB{
+        "@HD\tVN:1.1\tSO:unknown\tpb:3.0.1\n"
+        "@RG\tID:e83fc9c6\tPL:PACBIO\tDS:READTYPE=SCRAP;DeletionQV=dq;DeletionTag=dt;"
+        "InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:Frames=ip;"
+        "PulseWidth:Frames=pw;PkMid=pm;PkMean=pa;LabelQV=pq;AltLabel=pt;AltLabelQV=pv;"
+        "PulseMergeQV=pg;PulseCall=pc;PrePulseFrames=pd;PulseCallWidth=px;"
+        "BINDINGKIT=100372700;SEQUENCINGKIT=100356200;BASECALLERVERSION=0.1;"
+        "FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\t"
+        "PM:SEQUEL\n"
+        "@PG\tID:baz2bam-0.15.0\tPN:baz2bam\tVN:0.15.0\n"
+        "@PG\tID:bazFormat-0.3.0\tPN:bazFormat\tVN:0.3.0\n"
+        "@PG\tID:bazwriter-0.15.0\tPN:bazwriter\tVN:0.15.0\n"
+        "@CO\tcomment2\n"};
+
+    // expected merge result: one @HD line, then the @RG, @PG and @CO entries
+    // of both inputs
+    const std::string expectedMerged{
+        "@HD\tVN:1.1\tSO:unknown\tpb:3.0.1\n"
+        "@RG\tID:a955def6\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;"
+        "InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;Ipd:CodecV1=ip;BINDINGKIT=100356300;"
+        "SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3.0.0.140018;FRAMERATEHZ=75.000000\t"
+        "PU:m140918_150013_42139_c100697631700000001823144703261565_s1_p0\t"
+        "PM:SEQUEL\n"
+        "@RG\tID:e83fc9c6\tPL:PACBIO\tDS:READTYPE=SCRAP;DeletionQV=dq;DeletionTag=dt;"
+        "InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:Frames=ip;"
+        "PulseWidth:Frames=pw;PkMid=pm;PkMean=pa;LabelQV=pq;AltLabel=pt;AltLabelQV=pv;"
+        "PulseMergeQV=pg;PulseCall=pc;PrePulseFrames=pd;PulseCallWidth=px;"
+        "BINDINGKIT=100372700;SEQUENCINGKIT=100356200;BASECALLERVERSION=0.1;"
+        "FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\t"
+        "PM:SEQUEL\n"
+        "@PG\tID:bam2bam-0.20.0\tPN:bam2bam\tVN:0.20.0\n"
+        "@PG\tID:bax2bam-0.0.2\tPN:bax2bam\tVN:0.0.2\n"
+        "@PG\tID:baz2bam-0.15.0\tPN:baz2bam\tVN:0.15.0\n"
+        "@PG\tID:bazFormat-0.3.0\tPN:bazFormat\tVN:0.3.0\n"
+        "@PG\tID:bazwriter-0.15.0\tPN:bazwriter\tVN:0.15.0\n"
+        "@CO\tcomment1\n"
+        "@CO\tcomment2\n"};
+
+    // operator+ : yields the merged header, leaves both operands as they were
+    {
+        const BamHeader lhs(textA);
+        const BamHeader rhs(textB);
+        const BamHeader merged = lhs + rhs;
+        EXPECT_EQ(expectedMerged, merged.ToSam());
+
+        EXPECT_EQ(textA, lhs.ToSam());
+        EXPECT_EQ(textB, rhs.ToSam());
+    }
+
+    // operator+= : merges into the left-hand header
+    {
+        BamHeader accumulated(textA);
+        accumulated += BamHeader(textB);
+        EXPECT_EQ(expectedMerged, accumulated.ToSam());
+    }
+}
+
+TEST(BamHeaderTest, MergeHandlesDuplicateReadGroups)
+{
+    // header text used for BOTH merge operands
+    const std::string sharedText{
+        "@HD\tVN:1.1\tSO:unknown\tpb:3.0.1\n"
+        "@RG\tID:a955def6\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;"
+        "InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;Ipd:CodecV1=ip;BINDINGKIT=100356300;"
+        "SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3.0.0.140018;FRAMERATEHZ=75.000000\t"
+        "PU:m140918_150013_42139_c100697631700000001823144703261565_s1_p0\tPM:SEQUEL\n"
+        "@PG\tID:bam2bam-0.20.0\tPN:bam2bam\tVN:0.20.0\n"
+        "@PG\tID:bax2bam-0.0.2\tPN:bax2bam\tVN:0.0.2\n"};
+
+    const BamHeader lhs(sharedText);
+    const BamHeader rhs(sharedText);
+
+    // duplicate @RG:IDs handled ok (i.e. not duplicated in output):
+    // merging the header with an identical one reproduces the input text
+    const BamHeader merged = lhs + rhs;
+    EXPECT_EQ(sharedText, merged.ToSam());
+}
+
+TEST(BamHeaderTest, MergeCompatibilityOk)
+{
+    // NOTE: in every case the headers are constructed outside the EXPECT_*
+    //       statement, so only operator+ itself is under test.
+
+    {  // different @HD:VN - this IS allowed (as of SAT-465, pbbam v0.7.2)
+        const BamHeader lhs{"@HD\tVN:1.1\tSO:unknown\tpb:3.0.1\n"};
+        const BamHeader rhs{"@HD\tVN:1.0\tSO:unknown\tpb:3.0.1\n"};
+        EXPECT_NO_THROW(lhs + rhs);
+    }
+
+    {  // different @HD:SO
+        const BamHeader lhs{"@HD\tVN:1.1\tSO:unknown\tpb:3.0.1\n"};
+        const BamHeader rhs{"@HD\tVN:1.1\tSO:coordinate\tpb:3.0.1\n"};
+        EXPECT_THROW(lhs + rhs, std::runtime_error);
+    }
+
+    {  // different @HD:pb - this IS allowed (as of SAT-529, pbbam 0.7.4)
+        const BamHeader lhs{"@HD\tVN:1.1\tSO:unknown\tpb:3.0.1\n"};
+        const BamHeader rhs{"@HD\tVN:1.1\tSO:unknown\tpb:3.0.3\n"};
+        EXPECT_NO_THROW(lhs + rhs);
+    }
+
+    {  // @SQ list clash
+        const std::string lhsText{
+            "@HD\tVN:1.1\tSO:coordinate\tpb:3.0.1\n"
+            "@SQ\tSN:foo\tLN:42\n"
+            "@SQ\tSN:bar\tLN:24\n"};
+        const std::string rhsText{
+            "@HD\tVN:1.1\tSO:coordinate\tpb:3.0.1\n"
+            "@SQ\tSN:foo\tLN:42\n"
+            "@SQ\tSN:baz\tLN:99\n"};
+        const BamHeader lhs(lhsText);
+        const BamHeader rhs(rhsText);
+        EXPECT_THROW(lhs + rhs, std::runtime_error);
+    }
+}
diff --git a/tests/src/test_BamRecord.cpp b/tests/src/test_BamRecord.cpp

new file mode 100644 (file)

index 0000000..56acae1
--- /dev/null
+++ b/tests/src/test_BamRecord.cpp
@@ -0,0 +1,2761 @@
+// Author: Derek Barnett
+
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <initializer_list>
+#include <string>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include "PbbamTestData.h"
+
+#include <pbbam/BamReader.h>
+#include <pbbam/BamRecord.h>
+#include <pbbam/BamTagCodec.h>
+#include "../src/MemoryUtils.h"
+
+// clang-format off
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+namespace BamRecordTests {
+
+// Builds a BamRecordImpl with every numeric core field set to 42 and three
+// tags attached: HX (hex string), CA (uint8_t array) and XY (int32_t).
+static
+BamRecordImpl CreateBamImpl()
+{
+    TagCollection tags;
+
+    // string payload, then flagged as a hex string
+    Tag& hexTag = tags["HX"];
+    hexTag = std::string("1abc75");
+    hexTag.Modifier(TagModifier::HEX_STRING);
+
+    tags["CA"] = std::vector<uint8_t>{34, 5, 125};
+    tags["XY"] = static_cast<int32_t>(-42);
+
+    BamRecordImpl bam;
+    bam.Bin(42);
+    bam.Flag(42);
+    bam.InsertSize(42);
+    bam.MapQuality(42);
+    bam.MatePosition(42)
+       .MateReferenceId(42)
+       .Position(42)
+       .ReferenceId(42);
+    bam.Tags(tags);
+    return bam;
+}
+
+// BamRecord wrapper around the record built by CreateBamImpl().
+static inline
+BamRecord CreateBam()
+{
+    return BamRecord{ CreateBamImpl() };
+}
+
+// Checks that the lengths stored in the raw record behind 'bam' agree with
+// the lengths implied by its API-facing data (contents are not compared).
+static
+void CheckRawData(const BamRecordImpl& bam)
+{
+    // name: characters + NULL terminator, followed by 1-4 extra NULLs that
+    // bring the stored length up to a multiple of 4
+    const uint32_t expectedNameBytes = bam.Name().size() + 1;
+    const uint32_t expectedNameNulls = 4 - (expectedNameBytes % 4);
+    const uint32_t expectedNameLength = expectedNameBytes + expectedNameNulls;
+
+    const uint32_t expectedNumCigarOps = bam.CigarData().size();
+    const int32_t expectedSeqLength = bam.Sequence().length();
+
+    const auto encodedTags = BamTagCodec::Encode(bam.Tags());
+    const size_t expectedTagsLength = encodedTags.size();
+
+    //  Name        CIGAR         Sequence       Quals      Tags
+    // l_qname + (n_cigar * 4) + (l_qseq+1)/2 + l_qseq + <encoded length>
+    const int expectedTotalDataLength = expectedNameLength + (expectedNumCigarOps * 4) +
+                                        (expectedSeqLength + 1) / 2 + expectedSeqLength +
+                                        expectedTagsLength;
+
+    // nothing to compare against if the raw record is missing
+    const auto rawData = PacBio::BAM::BamRecordMemory::GetRawData(bam);
+    ASSERT_TRUE(static_cast<bool>(rawData));
+
+    const auto& core = rawData->core;
+    EXPECT_EQ(expectedNameNulls, core.l_extranul);
+    EXPECT_EQ(expectedNameLength, core.l_qname);
+    EXPECT_EQ(expectedNumCigarOps, core.n_cigar);
+    EXPECT_EQ(expectedSeqLength, core.l_qseq);
+    EXPECT_EQ(expectedTotalDataLength, rawData->l_data);
+}
+
+// Convenience overload: runs the raw-data check on the record's Impl().
+static inline
+void CheckRawData(const BamRecord& bam)
+{
+    CheckRawData(bam.Impl());
+}
+
+// Builds a mapped BamRecordImpl (reference 0, position 0, no mate info) with
+// the given sequence & CIGAR, placed on the requested strand.
+static
+BamRecordImpl MakeCigaredImpl(const std::string& seq,
+                              const std::string& cigar,
+                              const Strand strand)
+{
+    // placeholder qualities: one '*' per base
+    const std::string quals(seq.size(), '*');
+    const bool isReverse = (strand == Strand::REVERSE);
+
+    BamRecordImpl impl;
+
+    // mapping info
+    impl.SetMapped(true);
+    impl.ReferenceId(0);
+    impl.Position(0);
+    impl.MapQuality(0);
+    impl.CigarData(Cigar::FromStdString(cigar));
+
+    // mate info
+    impl.MateReferenceId(-1);
+    impl.MatePosition(-1);
+    impl.InsertSize(0);
+
+    impl.SetSequenceAndQualities(seq, quals);
+    impl.SetReverseStrand(isReverse);
+    return impl;
+}
+
+// BamRecord wrapper around MakeCigaredImpl(seq, cigar, strand).
+static inline
+BamRecord MakeCigaredRecord(const std::string& seq,
+                            const std::string& cigar,
+                            const Strand strand)
+{
+    return BamRecord{ MakeCigaredImpl(seq, cigar, strand) };
+}
+
+// Aligned record whose "dt" and "st" tags both hold 'bases'; the sequence
+// itself is a run of 'N' of the same length.
+static
+BamRecord MakeCigaredBaseRecord(const std::string& bases,
+                                const std::string& cigar,
+                                const Strand strand)
+{
+    const std::string seq(bases.size(), 'N');
+    BamRecordImpl impl = MakeCigaredImpl(seq, cigar, strand);
+
+    TagCollection tags;
+    for (const char* tagName : {"dt", "st"})
+        tags[tagName] = bases;
+    impl.Tags(tags);
+
+    return BamRecord(std::move(impl));
+}
+
+// Aligned record whose "ip" and "pw" tags both hold 'frames'; the sequence
+// is a run of 'N' with one base per frame value.
+static
+BamRecord MakeCigaredFrameRecord(const std::vector<uint16_t>& frames,
+                                 const std::string& cigar,
+                                 const Strand strand)
+{
+    const std::string seq(frames.size(), 'N');
+    BamRecordImpl impl = MakeCigaredImpl(seq, cigar, strand);
+
+    TagCollection tags;
+    for (const char* tagName : {"ip", "pw"})
+        tags[tagName] = frames;
+    impl.Tags(tags);
+
+    return BamRecord(std::move(impl));
+}
+
+// Aligned record whose "dq", "iq", "mq" and "sq" tags all hold 'quals'; the
+// sequence is a run of 'N' with one base per quality character.
+static
+BamRecord MakeCigaredQualRecord(const std::string& quals,
+                                const std::string& cigar,
+                                const Strand strand)
+{
+    const std::string seq(quals.size(), 'N');
+    BamRecordImpl impl = MakeCigaredImpl(seq, cigar, strand);
+
+    TagCollection tags;
+    for (const char* tagName : {"dq", "iq", "mq", "sq"})
+        tags[tagName] = quals;
+    impl.Tags(tags);
+
+    return BamRecord(std::move(impl));
+}
+
+// Aligned record over 'seqBases' carrying pulse calls ("pc") plus per-pulse
+// bases ("pt").
+static
+BamRecord MakeCigaredPulseBaseRecord(const std::string& seqBases,
+                                     const std::string& pulseCalls,
+                                     const std::string& pulseBases,
+                                     const std::string& cigar,
+                                     const Strand strand)
+{
+    BamRecordImpl impl = MakeCigaredImpl(seqBases, cigar, strand);
+
+    TagCollection tags;
+    tags["pc"] = pulseCalls; // PulseCall
+    tags["pt"] = pulseBases; // AltLabelTag
+    impl.Tags(tags);
+
+    return BamRecord(std::move(impl));
+}
+
+// Aligned record over 'seqBases' carrying pulse calls ("pc") plus three
+// per-pulse quality tags that all hold 'pulseQuals'.
+static
+BamRecord MakeCigaredPulseQualRecord(const std::string& seqBases,
+                                     const std::string& pulseCalls,
+                                     const std::string& pulseQuals,
+                                     const std::string& cigar,
+                                     const Strand strand)
+{
+    BamRecordImpl impl = MakeCigaredImpl(seqBases, cigar, strand);
+
+    TagCollection tags;
+    tags["pc"] = pulseCalls;
+
+    // AltLabelQV (pv), LabelQV (pq), PulseMergeQV (pg)
+    for (const char* tagName : {"pv", "pq", "pg"})
+        tags[tagName] = pulseQuals;
+    impl.Tags(tags);
+
+    return BamRecord(std::move(impl));
+}
+
+// Aligned record over 'seqBases' carrying pulse calls ("pc") plus two
+// per-pulse frame tags that both hold 'pulseFrames'.
+static
+BamRecord MakeCigaredPulseFrameRecord(const std::string& seqBases,
+                                      const std::string& pulseCalls,
+                                      const std::vector<uint16_t>& pulseFrames,
+                                      const std::string& cigar,
+                                      const Strand strand)
+{
+    BamRecordImpl impl = MakeCigaredImpl(seqBases, cigar, strand);
+
+    TagCollection tags;
+    tags["pc"] = pulseCalls;
+
+    // PrePulseFrames (pd), PulseCallWidth (px)
+    for (const char* tagName : {"pd", "px"})
+        tags[tagName] = pulseFrames;
+    impl.Tags(tags);
+
+    return BamRecord(std::move(impl));
+}
+
+// Aligned record over 'seqBases' carrying pulse calls ("pc") plus per-pulse
+// 32-bit values in "sf".
+static
+BamRecord MakeCigaredPulseUIntRecord(const std::string& seqBases,
+                                     const std::string& pulseCalls,
+                                     const std::vector<uint32_t>& pulseUInts,
+                                     const std::string& cigar,
+                                     const Strand strand)
+{
+    BamRecordImpl impl = MakeCigaredImpl(seqBases, cigar, strand);
+
+    TagCollection tags;
+    tags["pc"] = pulseCalls;
+    tags["sf"] = pulseUInts; // StartFrame
+    impl.Tags(tags);
+
+    return BamRecord(std::move(impl));
+}
+
+// ----------------------------------------------------------
+// helper structs and methods for checking combinations of:
+//   aligned strand, orientation requested, alignment, clipping
+// ----------------------------------------------------------
+
+// generic result holder for various requested states
+template<typename T>
+struct ExpectedResult
+{
+public:
+    // 'init' must list exactly 12 values, in the order of the accessors below:
+    // the 6 forward-strand states first, then the same 6 for the reverse strand.
+    ExpectedResult(std::initializer_list<T> init)
+        : d_(init.begin(), init.end())
+    {
+        assert(init.size() == 12);
+    }
+
+    // record mapped to forward strand
+    T ForwardGenomic() const               { return Fetch(0); }
+    T ForwardNative() const                { return Fetch(1); }
+    T ForwardGenomicAligned() const        { return Fetch(2); }
+    T ForwardNativeAligned() const         { return Fetch(3); }
+    T ForwardGenomicAlignedClipped() const { return Fetch(4); }
+    T ForwardNativeAlignedClipped() const  { return Fetch(5); }
+
+    // record mapped to reverse strand
+    T ReverseGenomic() const               { return Fetch(6); }
+    T ReverseNative() const                { return Fetch(7); }
+    T ReverseGenomicAligned() const        { return Fetch(8); }
+    T ReverseNativeAligned() const         { return Fetch(9); }
+    T ReverseGenomicAlignedClipped() const { return Fetch(10); }
+    T ReverseNativeAlignedClipped() const  { return Fetch(11); }
+
+private:
+    // bounds-checked copy of the stored value at 'index'
+    T Fetch(const size_t index) const { return d_.at(index); }
+
+    std::vector<T> d_;
+};
+
+// generic data type checker on the various requested states
+// Builds one record per strand with makeRecord(input, cigar, strand) and, for
+// each of the 6 (orientation, aligned, exciseSoftClips) combinations listed
+// below, compares fetchData(record, ...) against the matching entry of 'e'.
+//
+// makeRecord : callable as (input, cigar, Strand), result used as a BamRecord
+// fetchData  : callable as (BamRecord, Orientation, bool aligned, bool exciseSoftClips)
+template<typename DataType, typename MakeRecordType, typename FetchDataType>
+void CheckAlignAndClip(const std::string& cigar,
+                       const DataType& input,
+                       const BamRecordTests::ExpectedResult<DataType>& e,
+                       const MakeRecordType& makeRecord,
+                       const FetchDataType& fetchData)
+{
+    {   // map to forward strand
+        const BamRecord b = makeRecord(input, cigar, Strand::FORWARD);
+        EXPECT_EQ(e.ForwardGenomic(),               fetchData(b, Orientation::GENOMIC, false, false));
+        EXPECT_EQ(e.ForwardNative(),                fetchData(b, Orientation::NATIVE,  false, false));
+        EXPECT_EQ(e.ForwardGenomicAligned(),        fetchData(b, Orientation::GENOMIC, true,  false));
+        EXPECT_EQ(e.ForwardNativeAligned(),         fetchData(b, Orientation::NATIVE,  true,  false));
+        EXPECT_EQ(e.ForwardGenomicAlignedClipped(), fetchData(b, Orientation::GENOMIC, true,  true));
+        EXPECT_EQ(e.ForwardNativeAlignedClipped(),  fetchData(b, Orientation::NATIVE,  true,  true));
+    }
+    {   // map to reverse strand
+        const BamRecord b = makeRecord(input, cigar, Strand::REVERSE);
+        EXPECT_EQ(e.ReverseGenomic(),               fetchData(b, Orientation::GENOMIC, false, false));
+        EXPECT_EQ(e.ReverseNative(),                fetchData(b, Orientation::NATIVE,  false, false));
+        EXPECT_EQ(e.ReverseGenomicAligned(),        fetchData(b, Orientation::GENOMIC, true,  false));
+        EXPECT_EQ(e.ReverseNativeAligned(),         fetchData(b, Orientation::NATIVE,  true,  false));
+        EXPECT_EQ(e.ReverseGenomicAlignedClipped(), fetchData(b, Orientation::GENOMIC, true,  true));
+        EXPECT_EQ(e.ReverseNativeAlignedClipped(),  fetchData(b, Orientation::NATIVE,  true,  true));
+    }
+}
+
+// Pulse-data counterpart of CheckAlignAndClip. Builds one record per strand
+// with makeRecord(seqBases, pulseCalls, input, cigar, strand), then checks:
+//   - PulseBehavior::ALL            against 'allPulses'     (unaligned only)
+//   - PulseBehavior::BASECALLS_ONLY against 'basecallsOnly' (all 6 states)
+//
+// fetchData : callable as (BamRecord, Orientation, bool aligned,
+//                          bool exciseSoftClips, PulseBehavior)
+template<typename DataType, typename MakeRecordType, typename FetchDataType>
+void CheckPulseDataAlignAndClip(const std::string& cigar,
+                                const std::string& seqBases,
+                                const std::string& pulseCalls,
+                                const DataType& input,
+                                const BamRecordTests::ExpectedResult<DataType>& allPulses,
+                                const BamRecordTests::ExpectedResult<DataType>& basecallsOnly,
+                                const MakeRecordType& makeRecord,
+                                const FetchDataType& fetchData)
+{
+    {   // map to forward strand
+        const BamRecord b = makeRecord(seqBases, pulseCalls, input, cigar, Strand::FORWARD);
+
+        EXPECT_EQ(allPulses.ForwardGenomic(),               fetchData(b, Orientation::GENOMIC, false, false, PulseBehavior::ALL));
+        EXPECT_EQ(allPulses.ForwardNative(),                fetchData(b, Orientation::NATIVE,  false, false, PulseBehavior::ALL));
+        // no align/clipping operations available on ALL pulses
+
+        EXPECT_EQ(basecallsOnly.ForwardGenomic(),               fetchData(b, Orientation::GENOMIC, false, false, PulseBehavior::BASECALLS_ONLY));
+        EXPECT_EQ(basecallsOnly.ForwardNative(),                fetchData(b, Orientation::NATIVE,  false, false, PulseBehavior::BASECALLS_ONLY));
+        EXPECT_EQ(basecallsOnly.ForwardGenomicAligned(),        fetchData(b, Orientation::GENOMIC, true,  false, PulseBehavior::BASECALLS_ONLY));
+        EXPECT_EQ(basecallsOnly.ForwardNativeAligned(),         fetchData(b, Orientation::NATIVE,  true,  false, PulseBehavior::BASECALLS_ONLY));
+        EXPECT_EQ(basecallsOnly.ForwardGenomicAlignedClipped(), fetchData(b, Orientation::GENOMIC, true,  true,  PulseBehavior::BASECALLS_ONLY));
+        EXPECT_EQ(basecallsOnly.ForwardNativeAlignedClipped(),  fetchData(b, Orientation::NATIVE,  true,  true,  PulseBehavior::BASECALLS_ONLY));
+    }
+    {   // map to reverse strand
+        const BamRecord b = makeRecord(seqBases, pulseCalls, input, cigar, Strand::REVERSE);
+
+        EXPECT_EQ(allPulses.ReverseGenomic(),               fetchData(b, Orientation::GENOMIC, false, false, PulseBehavior::ALL));
+        EXPECT_EQ(allPulses.ReverseNative(),                fetchData(b, Orientation::NATIVE,  false, false, PulseBehavior::ALL));
+        // no align/clipping operations available on ALL pulses
+
+        EXPECT_EQ(basecallsOnly.ReverseGenomic(),               fetchData(b, Orientation::GENOMIC, false, false, PulseBehavior::BASECALLS_ONLY));
+        EXPECT_EQ(basecallsOnly.ReverseNative(),                fetchData(b, Orientation::NATIVE,  false, false, PulseBehavior::BASECALLS_ONLY));
+        EXPECT_EQ(basecallsOnly.ReverseGenomicAligned(),        fetchData(b, Orientation::GENOMIC, true,  false, PulseBehavior::BASECALLS_ONLY));
+        EXPECT_EQ(basecallsOnly.ReverseNativeAligned(),         fetchData(b, Orientation::NATIVE,  true,  false, PulseBehavior::BASECALLS_ONLY));
+        EXPECT_EQ(basecallsOnly.ReverseGenomicAlignedClipped(), fetchData(b, Orientation::GENOMIC, true,  true,  PulseBehavior::BASECALLS_ONLY));
+        EXPECT_EQ(basecallsOnly.ReverseNativeAlignedClipped(),  fetchData(b, Orientation::NATIVE,  true,  true,  PulseBehavior::BASECALLS_ONLY));
+    }
+}
+
+static
+void CheckBaseTagsClippedAndAligned(const std::string& cigar,
+                                    const std::string& input,
+                                    const ExpectedResult<std::string>& e)
+{   // runs CheckAlignAndClip with the same cigar/input/expected set, once per base-tag accessor
+    // record factory: aligned record + DeletionTag, SubstitutionTag
+    auto makeRecord = [](const std::string& newBases,
+                         const std::string& newCigar,
+                         const Strand newStrand)
+    { return MakeCigaredBaseRecord(newBases, newCigar, newStrand); };
+
+    // DeletionTag
+    CheckAlignAndClip(cigar, input, e, makeRecord,
+                      [](const BamRecord& b,   // fetcher: reads the tag under test from the record
+                         Orientation orientation,
+                         bool aligned,
+                         bool exciseSoftClips)
+                      { return b.DeletionTag(orientation, aligned, exciseSoftClips); }
+    );
+
+    // SubstitutionTag
+    CheckAlignAndClip(cigar, input, e, makeRecord,
+                      [](const BamRecord& b,
+                         Orientation orientation,
+                         bool aligned,
+                         bool exciseSoftClips)
+                      { return b.SubstitutionTag(orientation, aligned, exciseSoftClips); }
+    );
+}
+
+static
+void CheckFrameTagsClippedAndAligned(const std::string& cigar,
+                                     const std::vector<uint16_t>& input,
+                                     const ExpectedResult<std::vector<uint16_t> >& e)
+{   // runs CheckAlignAndClip with the same cigar/input/expected set, once per frame accessor
+
+    // record factory: aligned record + IPD, PulseWidth
+    auto makeRecord = [](const std::vector<uint16_t>& newFrames,
+                         const std::string& newCigar,
+                         const Strand newStrand)
+    { return BamRecordTests::MakeCigaredFrameRecord(newFrames, newCigar, newStrand); };
+
+    // IPD
+    CheckAlignAndClip(cigar, input, e, makeRecord,
+                      [](const BamRecord& b,
+                         Orientation orientation,
+                         bool aligned,
+                         bool exciseSoftClips)
+                      { return b.IPD(orientation, aligned, exciseSoftClips).Data(); }   // .Data(): compared as vector<uint16_t>
+    );
+
+    // PulseWidth
+    CheckAlignAndClip(cigar, input, e, makeRecord,
+                      [](const BamRecord& b,
+                         Orientation orientation,
+                         bool aligned,
+                         bool exciseSoftClips)
+                      { return b.PulseWidth(orientation, aligned, exciseSoftClips).Data(); }
+    );
+}
+
+static
+void CheckQualityTagsClippedAndAligned(const std::string& cigar,
+                                       const std::string& input,
+                                       const ExpectedResult<std::string>& e)
+{   // runs CheckAlignAndClip with the same cigar/input/expected set, once per QV-tag accessor
+    // record factory: aligned record + DeletionQV, InsertionQV, MergeQV, SubstitutionQV
+    auto makeRecord = [](const std::string& newQuals,
+                         const std::string& newCigar,
+                         const Strand newStrand)
+    { return BamRecordTests::MakeCigaredQualRecord(newQuals, newCigar, newStrand); };
+
+    // DeletionQV
+    CheckAlignAndClip(cigar, input, e, makeRecord,
+                      [](const BamRecord& b,
+                         Orientation orientation,
+                         bool aligned,
+                         bool exciseSoftClips)
+                      { return b.DeletionQV(orientation, aligned, exciseSoftClips).Fastq(); }   // .Fastq(): compared as a string
+    );
+
+    // InsertionQV
+    CheckAlignAndClip(cigar, input, e, makeRecord,
+                      [](const BamRecord& b,
+                         Orientation orientation,
+                         bool aligned,
+                         bool exciseSoftClips)
+                      { return b.InsertionQV(orientation, aligned, exciseSoftClips).Fastq(); }
+    );
+
+    // MergeQV
+    CheckAlignAndClip(cigar, input, e, makeRecord,
+                      [](const BamRecord& b,
+                         Orientation orientation,
+                         bool aligned,
+                         bool exciseSoftClips)
+                      { return b.MergeQV(orientation, aligned, exciseSoftClips).Fastq(); }
+    );
+
+    // SubstitutionQV
+    CheckAlignAndClip(cigar, input, e, makeRecord,
+                      [](const BamRecord& b,
+                         Orientation orientation,
+                         bool aligned,
+                         bool exciseSoftClips)
+                      { return b.SubstitutionQV(orientation, aligned, exciseSoftClips).Fastq(); }
+    );
+}
+
+static
+void CheckQualitiesClippedAndAligned(const std::string& cigar,
+                                     const std::string& input,
+                                     const ExpectedResult<std::string>& e)
+{   // runs CheckAlignAndClip on the record's QUAL field (not a tag), fetched via Qualities()
+    // record factory: aligned record w/ dummy SEQ & QUALs under test
+    auto makeRecord = [](const std::string& newQuals,
+                         const std::string& newCigar,
+                         const Strand newStrand)
+    {
+        const std::string seq = std::string(newQuals.size(), 'N');   // placeholder SEQ of 'N's, same length as the qualities
+        auto record = BamRecordTests::MakeCigaredRecord(seq, newCigar, newStrand);
+        record.Impl().SetSequenceAndQualities(seq, newQuals);   // install the QUALs under test alongside the dummy SEQ
+        return record;
+    };
+
+    // QUAL
+    CheckAlignAndClip(cigar, input, e, makeRecord,
+                      [](const BamRecord& b,
+                         Orientation orientation,
+                         bool aligned,
+                         bool exciseSoftClips)
+                      { return b.Qualities(orientation, aligned, exciseSoftClips).Fastq(); }
+    );
+}
+
+static
+void CheckSequenceClippedAndAligned(const std::string& cigar,
+                                    const std::string& input,
+                                    const ExpectedResult<std::string>& e)
+{   // runs CheckAlignAndClip on the record's SEQ field, fetched via Sequence()
+    // record factory: aligned record w/ SEQ
+    auto makeRecord = [](const std::string& newSeq,
+                         const std::string& newCigar,
+                         const Strand newStrand)
+    { return BamRecordTests::MakeCigaredRecord(newSeq, newCigar, newStrand); };
+
+    // SEQ
+    CheckAlignAndClip(cigar, input, e, makeRecord,
+                      [](const BamRecord& b,
+                         Orientation orientation,
+                         bool aligned,
+                         bool exciseSoftClips)
+                      { return b.Sequence(orientation, aligned, exciseSoftClips); }
+    );
+}
+
+static
+void CheckPulseBaseTags(const std::string& cigar,
+                        const std::string& seqBases,
+                        const std::string& pulseCalls,
+                        const std::string& pulseBases,
+                        const ExpectedResult<std::string>& allPulses,
+                        const ExpectedResult<std::string>& basecallsOnly)
+{   // runs CheckPulseDataAlignAndClip with identical inputs and expectations, once per pulse base accessor
+    // record factory: aligned record + pulse base data (read back below via AltLabelTag and PulseCall)
+    auto makeRecord = [](const std::string& newSeqBases,
+                         const std::string& newPulseCalls,
+                         const std::string& newPulseBases,
+                         const std::string& newCigar,
+                         const Strand newStrand)
+    { return MakeCigaredPulseBaseRecord(newSeqBases, newPulseCalls, newPulseBases, newCigar, newStrand); };
+
+    // AltLabelTag
+    CheckPulseDataAlignAndClip(cigar, seqBases, pulseCalls, pulseBases, allPulses, basecallsOnly, makeRecord,
+                              [](const BamRecord& b,
+                                 Orientation orientation,
+                                 bool aligned,
+                                 bool exciseSoftClips,
+                                 PulseBehavior pulseBehavior)
+                              { return b.AltLabelTag(orientation, aligned, exciseSoftClips, pulseBehavior); }
+    );
+    // PulseCall
+    CheckPulseDataAlignAndClip(cigar, seqBases, pulseCalls, pulseBases, allPulses, basecallsOnly, makeRecord,
+                              [](const BamRecord& b,
+                                 Orientation orientation,
+                                 bool aligned,
+                                 bool exciseSoftClips,
+                                 PulseBehavior pulseBehavior)
+                              { return b.PulseCall(orientation, aligned, exciseSoftClips, pulseBehavior); }
+    );
+}
+
+static
+void CheckPulseFrameTags(const std::string& cigar,
+                         const std::string& seqBases,
+                         const std::string& pulseCalls,
+                         const std::vector<uint16_t>& pulseFrames,
+                         const ExpectedResult<std::vector<uint16_t>>& allPulses,
+                         const ExpectedResult<std::vector<uint16_t>>& basecallsOnly)
+{   // runs CheckPulseDataAlignAndClip with identical inputs and expectations, once per pulse frame accessor
+    // record factory: aligned record + pulse frame data (read back below via PrePulseFrames and PulseCallWidth)
+    auto makeRecord = [](const std::string& newSeqBases,
+                         const std::string& newPulseCalls,
+                         const std::vector<uint16_t>& newPulseFrames,
+                         const std::string& newCigar,
+                         const Strand newStrand)
+    { return MakeCigaredPulseFrameRecord(newSeqBases, newPulseCalls, newPulseFrames, newCigar, newStrand); };
+
+    // PrePulseFrames
+    CheckPulseDataAlignAndClip(cigar, seqBases, pulseCalls, pulseFrames, allPulses, basecallsOnly, makeRecord,
+                              [](const BamRecord& b,
+                                 Orientation orientation,
+                                 bool aligned,
+                                 bool exciseSoftClips,
+                                 PulseBehavior pulseBehavior)
+                              { return b.PrePulseFrames(orientation, aligned, exciseSoftClips, pulseBehavior).Data(); }
+    );
+    // PulseCallWidth
+    CheckPulseDataAlignAndClip(cigar, seqBases, pulseCalls, pulseFrames, allPulses, basecallsOnly, makeRecord,
+                              [](const BamRecord& b,
+                                 Orientation orientation,
+                                 bool aligned,
+                                 bool exciseSoftClips,
+                                 PulseBehavior pulseBehavior)
+                              { return b.PulseCallWidth(orientation, aligned, exciseSoftClips, pulseBehavior).Data(); }
+    );
+}
+
+/*
+
+    { BamRecordTag::PKMEAN,            {"pa", true}  },   photons (vector<float>)
+    { BamRecordTag::PKMEAN_2,          {"ps", true}  },   photons
+    { BamRecordTag::PKMID,             {"pm", true}  },   photons
+    { BamRecordTag::PKMID_2,           {"pi", true}  },   photons
+*/
+
+static
+void CheckPulseQualityTags(const std::string& cigar,
+                           const std::string& seqBases,
+                           const std::string& pulseCalls,
+                           const std::string& pulseQuals,
+                           const ExpectedResult<std::string>& allPulses,
+                           const ExpectedResult<std::string>& basecallsOnly)
+{   // runs CheckPulseDataAlignAndClip with identical inputs and expectations, once per pulse QV accessor
+    // record factory: aligned record + pulse QV data (read back below via AltLabelQV, LabelQV, PulseMergeQV)
+    auto makeRecord = [](const std::string& newSeqBases,
+                         const std::string& newPulseCalls,
+                         const std::string& newPulseQuals,
+                         const std::string& newCigar,
+                         const Strand newStrand)
+    { return MakeCigaredPulseQualRecord(newSeqBases, newPulseCalls, newPulseQuals, newCigar, newStrand); };
+
+    // AltLabelQV
+    CheckPulseDataAlignAndClip(cigar, seqBases, pulseCalls, pulseQuals, allPulses, basecallsOnly, makeRecord,
+                              [](const BamRecord& b,
+                                 Orientation orientation,
+                                 bool aligned,
+                                 bool exciseSoftClips,
+                                 PulseBehavior pulseBehavior)
+                              { return b.AltLabelQV(orientation, aligned, exciseSoftClips, pulseBehavior).Fastq(); }
+    );
+    // LabelQV
+    CheckPulseDataAlignAndClip(cigar, seqBases, pulseCalls, pulseQuals, allPulses, basecallsOnly, makeRecord,
+                              [](const BamRecord& b,
+                                 Orientation orientation,
+                                 bool aligned,
+                                 bool exciseSoftClips,
+                                 PulseBehavior pulseBehavior)
+                              { return b.LabelQV(orientation, aligned, exciseSoftClips, pulseBehavior).Fastq(); }
+    );
+    // PulseMergeQV
+    CheckPulseDataAlignAndClip(cigar, seqBases, pulseCalls, pulseQuals, allPulses, basecallsOnly, makeRecord,
+                              [](const BamRecord& b,
+                                 Orientation orientation,
+                                 bool aligned,
+                                 bool exciseSoftClips,
+                                 PulseBehavior pulseBehavior)
+                              { return b.PulseMergeQV(orientation, aligned, exciseSoftClips, pulseBehavior).Fastq(); }
+    );
+}
+
+static
+void CheckPulseUIntTags(const std::string& cigar,
+                        const std::string& seqBases,
+                        const std::string& pulseCalls,
+                        const std::vector<uint32_t>& startFrames,
+                        const ExpectedResult<std::vector<uint32_t>>& allPulses,
+                        const ExpectedResult<std::vector<uint32_t>>& basecallsOnly)
+{  // runs CheckPulseDataAlignAndClip on the uint32_t pulse data, read back via StartFrame
+   // record factory: aligned record + StartFrame
+   auto makeRecord = [](const std::string& newSeqBases,
+                        const std::string& newPulseCalls,
+                        const std::vector<uint32_t>& newStartFrames,
+                        const std::string& newCigar,
+                        const Strand newStrand)
+   { return MakeCigaredPulseUIntRecord(newSeqBases, newPulseCalls, newStartFrames, newCigar, newStrand); };
+
+   // StartFrame
+   CheckPulseDataAlignAndClip(cigar, seqBases, pulseCalls, startFrames, allPulses, basecallsOnly, makeRecord,
+                             [](const BamRecord& b,
+                                Orientation orientation,
+                                bool aligned,
+                                bool exciseSoftClips,
+                                PulseBehavior pulseBehavior)
+                             { return b.StartFrame(orientation, aligned, exciseSoftClips, pulseBehavior); }   // result is compared directly (no .Data()/.Fastq() conversion)
+   );
+}
+
+
+
+} // namespace BamRecordTests
+
+TEST(BamRecordTest, DefaultValues)
+{   // pins the state of a default-constructed BamRecord: core fields, flags, and absent PacBio data
+    BamRecord bam;
+    const std::string emptyString;
+
+    // BamRecordImpl data
+    EXPECT_EQ(0, bam.Impl().Bin());
+    EXPECT_EQ(BamRecordImpl::UNMAPPED, bam.Impl().Flag());  // forced init unmapped
+    EXPECT_EQ(0, bam.Impl().InsertSize());
+    EXPECT_EQ(255, bam.Impl().MapQuality());
+    EXPECT_EQ(-1, bam.Impl().MateReferenceId());
+    EXPECT_EQ(-1, bam.Impl().MatePosition());
+    EXPECT_EQ(-1, bam.Impl().Position());
+    EXPECT_EQ(-1, bam.Impl().ReferenceId());
+    EXPECT_EQ(0, bam.Impl().Tags().size());
+
+    EXPECT_FALSE(bam.Impl().IsDuplicate());
+    EXPECT_FALSE(bam.Impl().IsFailedQC());
+    EXPECT_FALSE(bam.Impl().IsFirstMate());
+    EXPECT_FALSE(bam.Impl().IsMapped());             // forced init unmapped
+    EXPECT_TRUE(bam.Impl().IsMateMapped());
+    EXPECT_FALSE(bam.Impl().IsMateReverseStrand());
+    EXPECT_FALSE(bam.Impl().IsPaired());
+    EXPECT_TRUE(bam.Impl().IsPrimaryAlignment());
+    EXPECT_FALSE(bam.Impl().IsProperPair());
+    EXPECT_FALSE(bam.Impl().IsReverseStrand());
+    EXPECT_FALSE(bam.Impl().IsSecondMate());
+    EXPECT_FALSE(bam.Impl().IsSupplementaryAlignment());
+
+    EXPECT_EQ(emptyString, bam.Impl().Name());
+    EXPECT_EQ(emptyString, bam.Impl().CigarData().ToStdString());
+    EXPECT_EQ(emptyString, bam.Impl().Sequence());
+    EXPECT_EQ(emptyString, bam.Impl().Qualities().Fastq());
+
+    // PacBio data
+    EXPECT_EQ(-1, bam.AlignedStart());
+    EXPECT_EQ(-1, bam.AlignedEnd());
+
+    EXPECT_FALSE(bam.HasHoleNumber());
+    EXPECT_FALSE(bam.HasNumPasses());
+    EXPECT_FALSE(bam.HasQueryEnd());
+    EXPECT_FALSE(bam.HasQueryStart());
+    EXPECT_FALSE(bam.HasReadAccuracy());
+
+    EXPECT_THROW(bam.HoleNumber(), std::exception);
+    EXPECT_THROW(bam.NumPasses(), std::exception);
+    EXPECT_EQ(int32_t{0}, bam.QueryEnd());     // expected to yield 0 rather than throw, even though HasQueryEnd() is false
+    EXPECT_EQ(int32_t{0}, bam.QueryStart());   // likewise for QueryStart()
+    EXPECT_THROW(bam.ReadAccuracy(), std::exception);
+
+    EXPECT_FALSE(bam.HasDeletionQV());
+    EXPECT_FALSE(bam.HasDeletionTag());
+    EXPECT_FALSE(bam.HasInsertionQV());
+    EXPECT_FALSE(bam.HasMergeQV());
+    EXPECT_FALSE(bam.HasSubstitutionQV());
+    EXPECT_FALSE(bam.HasSubstitutionTag());
+
+    EXPECT_THROW(bam.DeletionQV(),      std::exception);   // accessors for absent tags are expected to throw
+    EXPECT_THROW(bam.DeletionTag(),     std::exception);
+    EXPECT_THROW(bam.InsertionQV(),     std::exception);
+    EXPECT_THROW(bam.MergeQV(),         std::exception);
+    EXPECT_THROW(bam.SubstitutionQV(),  std::exception);
+    EXPECT_THROW(bam.SubstitutionTag(), std::exception);
+
+    // raw data
+    BamRecordTests::CheckRawData(bam);
+}
+
+TEST(BamRecordTest, FromBamRecordImpl)
+{   // a BamRecord built from a BamRecordImpl (copy/move ctor, copy/move assignment) must carry the same core fields and tags
+    // check generic data
+    BamRecordImpl genericBam = BamRecordTests::CreateBamImpl();
+
+    EXPECT_EQ(42, genericBam.Bin());
+    EXPECT_EQ(42, genericBam.Flag());
+    EXPECT_EQ(42, genericBam.InsertSize());
+    EXPECT_EQ(42, genericBam.MapQuality());
+    EXPECT_EQ(42, genericBam.MateReferenceId());
+    EXPECT_EQ(42, genericBam.MatePosition());
+    EXPECT_EQ(42, genericBam.Position());
+    EXPECT_EQ(42, genericBam.ReferenceId());
+
+    const TagCollection genericTags = genericBam.Tags();
+    EXPECT_TRUE(genericTags.at("HX").HasModifier(TagModifier::HEX_STRING));
+    EXPECT_EQ(std::string("1abc75"), genericTags.at("HX").ToString());
+    EXPECT_EQ(int32_t{-42}, genericTags.at("XY").ToInt32());
+    EXPECT_EQ(std::vector<uint8_t>({34, 5, 125}), genericTags.at("CA").ToUInt8Array());
+
+    // copy ctor
+    BamRecord bam1(genericBam);
+
+    EXPECT_EQ(42, bam1.Impl().Bin());
+    EXPECT_EQ(42, bam1.Impl().Flag());
+    EXPECT_EQ(42, bam1.Impl().InsertSize());
+    EXPECT_EQ(42, bam1.Impl().MapQuality());
+    EXPECT_EQ(42, bam1.Impl().MateReferenceId());
+    EXPECT_EQ(42, bam1.Impl().MatePosition());
+    EXPECT_EQ(42, bam1.Impl().Position());
+    EXPECT_EQ(42, bam1.Impl().ReferenceId());
+
+    const TagCollection bam1Tags = bam1.Impl().Tags();
+    EXPECT_TRUE(bam1Tags.at("HX").HasModifier(TagModifier::HEX_STRING));
+    EXPECT_EQ(std::string("1abc75"), bam1Tags.at("HX").ToString());
+    EXPECT_EQ(int32_t{-42}, bam1Tags.at("XY").ToInt32());
+    EXPECT_EQ(std::vector<uint8_t>({34, 5, 125}), bam1Tags.at("CA").ToUInt8Array());
+
+    // copy assignment
+    BamRecord bam2;
+    bam2 = genericBam;
+
+    EXPECT_EQ(42, bam2.Impl().Bin());
+    EXPECT_EQ(42, bam2.Impl().Flag());
+    EXPECT_EQ(42, bam2.Impl().InsertSize());
+    EXPECT_EQ(42, bam2.Impl().MapQuality());
+    EXPECT_EQ(42, bam2.Impl().MateReferenceId());
+    EXPECT_EQ(42, bam2.Impl().MatePosition());
+    EXPECT_EQ(42, bam2.Impl().Position());
+    EXPECT_EQ(42, bam2.Impl().ReferenceId());
+
+    const TagCollection bam2Tags = bam2.Impl().Tags();
+    EXPECT_TRUE(bam2Tags.at("HX").HasModifier(TagModifier::HEX_STRING));
+    EXPECT_EQ(std::string("1abc75"), bam2Tags.at("HX").ToString());
+    EXPECT_EQ(int32_t{-42}, bam2Tags.at("XY").ToInt32());
+    EXPECT_EQ(std::vector<uint8_t>({34, 5, 125}), bam2Tags.at("CA").ToUInt8Array());
+
+    // change genericBam, make sure we deep copied bam1 & bam2
+    genericBam.Position(2000);
+
+    EXPECT_EQ(2000, genericBam.Position());
+    EXPECT_EQ(42, bam1.Impl().Position());
+    EXPECT_EQ(42, bam2.Impl().Position());
+
+    // move ctor (pragma guards silence clang's -Wpessimizing-move for the explicit std::move of a temporary)
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wpessimizing-move"
+#endif
+    BamRecord bam3(std::move(BamRecordTests::CreateBamImpl()));
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+
+    EXPECT_EQ(42, bam3.Impl().Bin());
+    EXPECT_EQ(42, bam3.Impl().Flag());
+    EXPECT_EQ(42, bam3.Impl().InsertSize());
+    EXPECT_EQ(42, bam3.Impl().MapQuality());
+    EXPECT_EQ(42, bam3.Impl().MateReferenceId());
+    EXPECT_EQ(42, bam3.Impl().MatePosition());
+    EXPECT_EQ(42, bam3.Impl().Position());
+    EXPECT_EQ(42, bam3.Impl().ReferenceId());
+
+    const TagCollection bam3Tags = bam3.Impl().Tags();
+    EXPECT_TRUE(bam3Tags.at("HX").HasModifier(TagModifier::HEX_STRING));
+    EXPECT_EQ(std::string("1abc75"), bam3Tags.at("HX").ToString());
+    EXPECT_EQ(int32_t{-42}, bam3Tags.at("XY").ToInt32());
+    EXPECT_EQ(std::vector<uint8_t>({34, 5, 125}), bam3Tags.at("CA").ToUInt8Array());
+
+    // move assignment (same -Wpessimizing-move suppression as for the move ctor)
+    BamRecord bam4;
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wpessimizing-move"
+#endif
+    bam4 = std::move(BamRecordTests::CreateBamImpl());
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+
+    EXPECT_EQ(42, bam4.Impl().Bin());
+    EXPECT_EQ(42, bam4.Impl().Flag());
+    EXPECT_EQ(42, bam4.Impl().InsertSize());
+    EXPECT_EQ(42, bam4.Impl().MapQuality());
+    EXPECT_EQ(42, bam4.Impl().MateReferenceId());
+    EXPECT_EQ(42, bam4.Impl().MatePosition());
+    EXPECT_EQ(42, bam4.Impl().Position());
+    EXPECT_EQ(42, bam4.Impl().ReferenceId());
+
+    const TagCollection bam4Tags = bam4.Impl().Tags();
+    EXPECT_TRUE(bam4Tags.at("HX").HasModifier(TagModifier::HEX_STRING));
+    EXPECT_EQ(std::string("1abc75"), bam4Tags.at("HX").ToString());
+    EXPECT_EQ(int32_t{-42}, bam4Tags.at("XY").ToInt32());
+    EXPECT_EQ(std::vector<uint8_t>({34, 5, 125}), bam4Tags.at("CA").ToUInt8Array());
+}
+
+TEST(BamRecordTest, SelfAssignmentTolerated)
+{   // populates a record, assigns it to itself, then verifies every field and tag is unchanged
+    BamRecord bam1;
+    bam1.Impl().Bin(42);
+    bam1.Impl().Flag(42);
+    bam1.Impl().InsertSize(42);
+    bam1.Impl().MapQuality(42);
+    bam1.Impl().MatePosition(42);
+    bam1.Impl().MateReferenceId(42);
+    bam1.Impl().Position(42);
+    bam1.Impl().ReferenceId(42);
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+    bam1.Impl().Tags(tags);
+
+    bam1 = bam1;   // deliberate self-assignment: the operation under test
+
+    EXPECT_EQ(42, bam1.Impl().Bin());
+    EXPECT_EQ(42, bam1.Impl().Flag());
+    EXPECT_EQ(42, bam1.Impl().InsertSize());
+    EXPECT_EQ(42, bam1.Impl().MapQuality());
+    EXPECT_EQ(42, bam1.Impl().MateReferenceId());
+    EXPECT_EQ(42, bam1.Impl().MatePosition());
+    EXPECT_EQ(42, bam1.Impl().Position());
+    EXPECT_EQ(42, bam1.Impl().ReferenceId());
+
+    const TagCollection fetchedTags1 = bam1.Impl().Tags();
+    EXPECT_TRUE(fetchedTags1.at("HX").HasModifier(TagModifier::HEX_STRING));
+    EXPECT_EQ(std::string("1abc75"), fetchedTags1.at("HX").ToString());
+    EXPECT_EQ(int32_t{-42}, fetchedTags1.at("XY").ToInt32());
+    EXPECT_EQ(std::vector<uint8_t>({34, 5, 125}), fetchedTags1.at("CA").ToUInt8Array());
+
+    BamRecordTests::CheckRawData(bam1);
+}
+
+TEST(BamRecordTest, CoreSetters)
+{   // as written, only verifies the generic fields and tags from CreateBam(); the PacBio setter calls and checks are commented out
+    // create basic BAM with (generic) data
+    BamRecord bam = BamRecordTests::CreateBam();
+
+    QualityValues testQVs;   // only referenced by the commented-out PacBio code below
+    testQVs.push_back(0);
+    testQVs.push_back(1);
+
+    const std::string testTags = "GATTACA";   // only referenced by the commented-out PacBio code below
+
+    // now set PacBio data
+//    bam.AlignedStart(42);
+//    bam.AlignedEnd(42);
+//    bam.DeletionQVs(testQVs);
+//    bam.DeletionTags(testTags);
+//    bam.HoleNumber(42);
+//    bam.InsertionQVs(testQVs);
+//    bam.MergeQVs(testQVs);
+//    bam.NumPasses(42);
+//    bam.QueryEnd(42);
+//    bam.QueryStart(42);
+//    bam.ReadAccuracy(42);
+//    bam.ReferenceEnd(42);
+//    bam.ReferenceStart(42);
+//    bam.SubstitutionQVs(testQVs);
+//    bam.SubstitutionTags(testTags);
+
+    // check generic data
+    EXPECT_EQ(42, bam.Impl().Bin());
+    EXPECT_EQ(42, bam.Impl().Flag());
+    EXPECT_EQ(42, bam.Impl().InsertSize());
+    EXPECT_EQ(42, bam.Impl().MapQuality());
+    EXPECT_EQ(42, bam.Impl().MateReferenceId());
+    EXPECT_EQ(42, bam.Impl().MatePosition());
+    EXPECT_EQ(42, bam.Impl().Position());
+    EXPECT_EQ(42, bam.Impl().ReferenceId());
+
+    // check PacBio data
+//    EXPECT_EQ(42, bam.AlignedStart());
+//    EXPECT_EQ(42, bam.AlignedEnd());
+//    EXPECT_EQ(testQVs, bam.DeletionQVs());
+//    EXPECT_EQ(testTags, bam.DeletionTags());
+//    EXPECT_EQ(42, bam.HoleNumber());
+//    EXPECT_EQ(testQVs, bam.InsertionQVs());
+//    EXPECT_EQ(testQVs, bam.MergeQVs());
+
+//    EXPECT_EQ(42, bam.NumPasses());
+//    EXPECT_EQ(42, bam.QueryEnd());
+//    EXPECT_EQ(42, bam.QueryStart());
+//    EXPECT_EQ(42, bam.ReadAccuracy());
+//    EXPECT_EQ(42, bam.ReferenceEnd());
+//    EXPECT_EQ(42, bam.ReferenceStart());
+//    EXPECT_EQ(testQVs, bam.SubstitutionQVs());
+//    EXPECT_EQ(testTags, bam.SubstitutionTags());
+
+    // check tags
+    const TagCollection fetchedTags = bam.Impl().Tags();
+    EXPECT_TRUE(fetchedTags.at("HX").HasModifier(TagModifier::HEX_STRING));
+    EXPECT_EQ(std::string("1abc75"), fetchedTags.at("HX").ToString());
+    EXPECT_EQ(int32_t{-42}, fetchedTags.at("XY").ToInt32());
+    EXPECT_EQ(std::vector<uint8_t>({34, 5, 125}), fetchedTags.at("CA").ToUInt8Array());
+
+    BamRecordTests::CheckRawData(bam);
+}
+
+TEST(BamRecordTest, SequenceOrientation)
+{   // SEQ, all-match CIGAR: reverse-strand genomic equals the input; reverse-strand native is its reverse complement
+    {
+        SCOPED_TRACE("Simple CIGAR Sequence");
+        BamRecordTests::CheckSequenceClippedAndAligned(
+            "13=",                  // CIGAR
+            "ATATATCCCGGCG",        // input
+            {
+                "ATATATCCCGGCG",    // forward strand, genomic
+                "ATATATCCCGGCG",    // forward strand, native
+                "ATATATCCCGGCG",    // forward strand, genomic, aligned
+                "ATATATCCCGGCG",    // forward strand, native,  aligned
+                "ATATATCCCGGCG",    // forward strand, genomic, aligned + clipped
+                "ATATATCCCGGCG",    // forward strand, native,  aligned + clipped
+                "ATATATCCCGGCG",    // reverse strand, genomic
+                "CGCCGGGATATAT",    // reverse strand, native
+                "ATATATCCCGGCG",    // reverse strand, genomic, aligned
+                "CGCCGGGATATAT",    // reverse strand, native,  aligned
+                "ATATATCCCGGCG",    // reverse strand, genomic, aligned + clipped
+                "CGCCGGGATATAT"     // reverse strand, native,  aligned + clipped
+            }
+        );
+    }
+}
+
+TEST(BamRecordTest, QualitiesOrientation)
+{   // QUAL, all-match CIGAR: reverse-strand genomic equals the input; reverse-strand native is the input reversed
+    {
+        SCOPED_TRACE("Simple CIGAR Qualities");
+        BamRecordTests::CheckQualitiesClippedAndAligned(
+            "13=",                  // CIGAR
+            "?]?]?]?]?]?]*",        // input
+            {
+                "?]?]?]?]?]?]*",    // forward strand, genomic
+                "?]?]?]?]?]?]*",    // forward strand, native
+                "?]?]?]?]?]?]*",    // forward strand, genomic, aligned
+                "?]?]?]?]?]?]*",    // forward strand, native,  aligned
+                "?]?]?]?]?]?]*",    // forward strand, genomic, aligned + clipped
+                "?]?]?]?]?]?]*",    // forward strand, native,  aligned + clipped
+                "?]?]?]?]?]?]*",    // reverse strand, genomic
+                "*]?]?]?]?]?]?",    // reverse strand, native
+                "?]?]?]?]?]?]*",    // reverse strand, genomic, aligned
+                "*]?]?]?]?]?]?",    // reverse strand, native,  aligned
+                "?]?]?]?]?]?]*",    // reverse strand, genomic, aligned + clipped
+                "*]?]?]?]?]?]?"     // reverse strand, native,  aligned + clipped
+            }
+        );
+    }
+}
+
+TEST(BamRecordTest, SequenceTagsOrientation)
+{   // base tags, all-match CIGAR: reverse-strand native equals the input; reverse-strand genomic is its reverse complement
+    {
+        SCOPED_TRACE("Simple CIGAR Base Tags");
+        BamRecordTests::CheckBaseTagsClippedAndAligned(
+            "13=",                  // CIGAR
+            "ATATATCCCGGCG",        // input
+            {
+                "ATATATCCCGGCG",    // forward strand, genomic
+                "ATATATCCCGGCG",    // forward strand, native
+                "ATATATCCCGGCG",    // forward strand, genomic, aligned
+                "ATATATCCCGGCG",    // forward strand, native, aligned
+                "ATATATCCCGGCG",    // forward strand, genomic, aligned, clipped
+                "ATATATCCCGGCG",    // forward strand, native, aligned, clipped
+                "CGCCGGGATATAT",    // reverse strand, genomic
+                "ATATATCCCGGCG",    // reverse strand, native
+                "CGCCGGGATATAT",    // reverse strand, genomic, aligned
+                "ATATATCCCGGCG",    // reverse strand, native, aligned
+                "CGCCGGGATATAT",    // reverse strand, genomic, aligned, clipped
+                "ATATATCCCGGCG"     // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+}
+
+TEST(BamRecordTest, FrameTagsOrientation)
+{   // frame tags, all-match CIGAR: reverse-strand native equals the input; reverse-strand genomic is the input reversed
+    {
+        SCOPED_TRACE("Simple CIGAR Frames");
+        BamRecordTests::CheckFrameTagsClippedAndAligned(
+            "5=",                   // CIGAR
+            { 0, 1, 2, 3, 4 },      // input
+            {
+                { 0, 1, 2, 3, 4 },  // forward strand, genomic
+                { 0, 1, 2, 3, 4 },  // forward strand, native
+                { 0, 1, 2, 3, 4 },  // forward strand, genomic, aligned
+                { 0, 1, 2, 3, 4 },  // forward strand, native, aligned
+                { 0, 1, 2, 3, 4 },  // forward strand, genomic, aligned, clipped
+                { 0, 1, 2, 3, 4 },  // forward strand, native, aligned, clipped
+                { 4, 3, 2, 1, 0 },  // reverse strand, genomic
+                { 0, 1, 2, 3, 4 },  // reverse strand, native
+                { 4, 3, 2, 1, 0 },  // reverse strand, genomic, aligned
+                { 0, 1, 2, 3, 4 },  // reverse strand, native, aligned
+                { 4, 3, 2, 1, 0 },  // reverse strand, genomic, aligned, clipped
+                { 0, 1, 2, 3, 4 }   // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+}
+
+TEST(BamRecordTest, QualityTagsOrientation)
+{   // QV tags, all-match CIGAR: reverse-strand native equals the input; reverse-strand genomic is the input reversed
+    {
+        SCOPED_TRACE("Simple CIGAR Quality Tags");
+        BamRecordTests::CheckQualityTagsClippedAndAligned(
+            "13=",                  // CIGAR
+            "?]?]?]?]?]?]*",        // input
+            {
+                "?]?]?]?]?]?]*",    // forward strand, genomic
+                "?]?]?]?]?]?]*",    // forward strand, native
+                "?]?]?]?]?]?]*",    // forward strand, genomic, aligned
+                "?]?]?]?]?]?]*",    // forward strand, native,  aligned
+                "?]?]?]?]?]?]*",    // forward strand, genomic, aligned + clipped
+                "?]?]?]?]?]?]*",    // forward strand, native,  aligned + clipped
+                "*]?]?]?]?]?]?",    // reverse strand, genomic
+                "?]?]?]?]?]?]*",    // reverse strand, native
+                "*]?]?]?]?]?]?",    // reverse strand, genomic, aligned
+                "?]?]?]?]?]?]*",    // reverse strand, native,  aligned
+                "*]?]?]?]?]?]?",    // reverse strand, genomic, aligned + clipped
+                "?]?]?]?]?]?]*"     // reverse strand, native,  aligned + clipped
+            }
+        );
+    }
+}
+
+TEST(BamRecordTest, SequenceClippedAndAligned)  // sequence rows for match, N, I and S/H-clipped CIGARs
+{  // each case: CIGAR, input bases, then 12 expected strings in the order labelled on each row
+    {  // all-match: every row keeps all 10 bases; reverse-native rows are the reverse complement
+        SCOPED_TRACE("CIGAR: 10=");
+        BamRecordTests::CheckSequenceClippedAndAligned(
+            "10=",              // CIGAR
+            "ATCCGCGGTT",       // input
+            {
+                "ATCCGCGGTT",   // forward strand, genomic
+                "ATCCGCGGTT",   // forward strand, native
+                "ATCCGCGGTT",   // forward strand, genomic, aligned
+                "ATCCGCGGTT",   // forward strand, native,  aligned
+                "ATCCGCGGTT",   // forward strand, genomic, aligned + clipped
+                "ATCCGCGGTT",   // forward strand, native,  aligned + clipped
+                "ATCCGCGGTT",   // reverse strand, genomic
+                "AACCGCGGAT",   // reverse strand, native
+                "ATCCGCGGTT",   // reverse strand, genomic, aligned
+                "AACCGCGGAT",   // reverse strand, native,  aligned
+                "ATCCGCGGTT",   // reverse strand, genomic, aligned + clipped
+                "AACCGCGGAT"    // reverse strand, native,  aligned + clipped
+            }
+        );
+    }
+    {  // N op: the aligned rows carry no gap characters; they equal the unaligned rows
+        SCOPED_TRACE("CIGAR: 3=4N3=");
+        BamRecordTests::CheckSequenceClippedAndAligned(
+            "3=4N3=",       // CIGAR
+            "ACGTTT",        // input
+            {
+                "ACGTTT",    // forward strand, genomic
+                "ACGTTT",    // forward strand, native
+                "ACGTTT",    // forward strand, genomic, aligned
+                "ACGTTT",    // forward strand, native,  aligned
+                "ACGTTT",    // forward strand, genomic, aligned + clipped
+                "ACGTTT",    // forward strand, native,  aligned + clipped
+                "ACGTTT",    // reverse strand, genomic
+                "AAACGT",    // reverse strand, native
+                "ACGTTT",    // reverse strand, genomic, aligned
+                "AAACGT",    // reverse strand, native,  aligned
+                "ACGTTT",    // reverse strand, genomic, aligned + clipped
+                "AAACGT"     // reverse strand, native,  aligned + clipped
+            }
+        );
+    }
+    {  // 1-base soft clips: one base is dropped from each end only in the "aligned + clipped" rows
+        SCOPED_TRACE("CIGAR: 1S8=1S");
+        BamRecordTests::CheckSequenceClippedAndAligned(
+            "1S8=1S",           // CIGAR
+            "ACCCGCGGTT",       // input
+            {
+                "ACCCGCGGTT",   // forward strand, genomic
+                "ACCCGCGGTT",   // forward strand, native
+                "ACCCGCGGTT",   // forward strand, genomic, aligned
+                "ACCCGCGGTT",   // forward strand, native,  aligned
+                "CCCGCGGT",     // forward strand, genomic, aligned + clipped
+                "CCCGCGGT",     // forward strand, native,  aligned + clipped
+                "ACCCGCGGTT",   // reverse strand, genomic
+                "AACCGCGGGT",   // reverse strand, native
+                "ACCCGCGGTT",   // reverse strand, genomic, aligned
+                "AACCGCGGGT",   // reverse strand, native,  aligned
+                "CCCGCGGT",     // reverse strand, genomic, aligned + clipped
+                "ACCGCGGG"      // reverse strand, native,  aligned + clipped
+            }
+        );
+    }
+    {  // 1-base hard clips: input holds only the 8 matched bases, so no row is shortened
+        SCOPED_TRACE("CIGAR: 1H8=1H");
+        BamRecordTests::CheckSequenceClippedAndAligned(
+            "1H8=1H",           // CIGAR
+            "ATCGCGGT",         // input
+            {
+                "ATCGCGGT",     // forward strand, genomic
+                "ATCGCGGT",     // forward strand, native
+                "ATCGCGGT",     // forward strand, genomic, aligned
+                "ATCGCGGT",     // forward strand, native,  aligned
+                "ATCGCGGT",     // forward strand, genomic, aligned + clipped
+                "ATCGCGGT",     // forward strand, native,  aligned + clipped
+                "ATCGCGGT",     // reverse strand, genomic
+                "ACCGCGAT",     // reverse strand, native
+                "ATCGCGGT",     // reverse strand, genomic, aligned
+                "ACCGCGAT",     // reverse strand, native,  aligned
+                "ATCGCGGT",     // reverse strand, genomic, aligned + clipped
+                "ACCGCGAT"      // reverse strand, native,  aligned + clipped
+            }
+        );
+    }
+    {  // 2-base soft clips: two bases are dropped from each end in the "aligned + clipped" rows
+        SCOPED_TRACE("CIGAR: 2S6=2S");
+        BamRecordTests::CheckSequenceClippedAndAligned(
+            "2S6=2S",           // CIGAR
+            "AGCCGCGGTT",       // input
+            {
+                "AGCCGCGGTT",   // forward strand, genomic
+                "AGCCGCGGTT",   // forward strand, native
+                "AGCCGCGGTT",   // forward strand, genomic, aligned
+                "AGCCGCGGTT",   // forward strand, native,  aligned
+                "CCGCGG",       // forward strand, genomic, aligned + clipped
+                "CCGCGG",       // forward strand, native,  aligned + clipped
+                "AGCCGCGGTT",   // reverse strand, genomic
+                "AACCGCGGCT",   // reverse strand, native
+                "AGCCGCGGTT",   // reverse strand, genomic, aligned
+                "AACCGCGGCT",   // reverse strand, native,  aligned
+                "CCGCGG",       // reverse strand, genomic, aligned + clipped
+                "CCGCGG"        // reverse strand, native,  aligned + clipped
+            }
+        );
+    }
+    {  // insertion inside soft clips: the inserted "NN" stays in every row; only the clips are trimmed
+        SCOPED_TRACE("CIGAR: 2S3=2I3=2S");
+        BamRecordTests::CheckSequenceClippedAndAligned(
+            "2S3=2I3=2S",           // CIGAR
+            "ATCCGNNCGGTT",         // input
+            {
+                "ATCCGNNCGGTT",     // forward strand, genomic
+                "ATCCGNNCGGTT",     // forward strand, native
+                "ATCCGNNCGGTT",     // forward strand, genomic, aligned
+                "ATCCGNNCGGTT",     // forward strand, native,  aligned
+                "CCGNNCGG",         // forward strand, genomic, aligned + clipped
+                "CCGNNCGG",         // forward strand, native,  aligned + clipped
+                "ATCCGNNCGGTT",     // reverse strand, genomic
+                "AACCGNNCGGAT",     // reverse strand, native
+                "ATCCGNNCGGTT",     // reverse strand, genomic, aligned
+                "AACCGNNCGGAT",     // reverse strand, native,  aligned
+                "CCGNNCGG",         // reverse strand, genomic, aligned + clipped
+                "CCGNNCGG"          // reverse strand, native,  aligned + clipped
+            }
+        );
+    }
+    {  // 2-base hard clips: input holds only the 6 matched bases, so no row is shortened
+        SCOPED_TRACE("CIGAR: 2H6=2H");
+        BamRecordTests::CheckSequenceClippedAndAligned(
+            "2H6=2H",       // CIGAR
+            "CAGCGG",       // input
+            {
+                "CAGCGG",   // forward strand, genomic
+                "CAGCGG",   // forward strand, native
+                "CAGCGG",   // forward strand, genomic, aligned
+                "CAGCGG",   // forward strand, native,  aligned
+                "CAGCGG",   // forward strand, genomic, aligned + clipped
+                "CAGCGG",   // forward strand, native,  aligned + clipped
+                "CAGCGG",   // reverse strand, genomic
+                "CCGCTG",   // reverse strand, native
+                "CAGCGG",   // reverse strand, genomic, aligned
+                "CCGCTG",   // reverse strand, native,  aligned
+                "CAGCGG",   // reverse strand, genomic, aligned + clipped
+                "CCGCTG"    // reverse strand, native,  aligned + clipped
+            }
+        );
+    }
+}
+
+TEST(BamRecordTest, ClippingOrientationAndAlignment)  // sequence rows for CIGARs with D, I and P ops
+{  // each case: CIGAR, input bases, then 12 expected strings; aligned rows use '-' for D and '*' for P
+    {  // single deletion: aligned rows gain "---" between the two 4-base match runs
+        SCOPED_TRACE("CIGAR: 4=3D4=");
+        BamRecordTests::CheckSequenceClippedAndAligned(
+            "4=3D4=",           // CIGAR
+            "AACCGTTA",         // input
+            {
+                "AACCGTTA",     // forward strand, genomic
+                "AACCGTTA",     // forward strand, native
+                "AACC---GTTA",  // forward strand, genomic, aligned
+                "AACC---GTTA",  // forward strand, native,  aligned
+                "AACC---GTTA",  // forward strand, genomic, aligned + clipped
+                "AACC---GTTA",  // forward strand, native,  aligned + clipped
+                "AACCGTTA",     // reverse strand, genomic
+                "TAACGGTT",     // reverse strand, native
+                "AACC---GTTA",  // reverse strand, genomic, aligned
+                "TAAC---GGTT",  // reverse strand, native,  aligned
+                "AACC---GTTA",  // reverse strand, genomic, aligned + clipped
+                "TAAC---GGTT"   // reverse strand, native,  aligned + clipped
+            }
+        );
+    }
+    {  // asymmetric gaps (1D / 2D): reverse-native aligned rows mirror the layout to "--TA-"
+        SCOPED_TRACE("CIGAR: 4=1D2I2D4=");
+        BamRecordTests::CheckSequenceClippedAndAligned(
+            "4=1D2I2D4=",           // CIGAR
+            "ATCCTAGGTT",           // input
+            {
+                "ATCCTAGGTT",       // forward strand, genomic
+                "ATCCTAGGTT",       // forward strand, native
+                "ATCC-TA--GGTT",    // forward strand, genomic, aligned
+                "ATCC-TA--GGTT",    // forward strand, native,  aligned
+                "ATCC-TA--GGTT",    // forward strand, genomic, aligned + clipped
+                "ATCC-TA--GGTT",    // forward strand, native,  aligned + clipped
+                "ATCCTAGGTT",       // reverse strand, genomic
+                "AACCTAGGAT",       // reverse strand, native
+                "ATCC-TA--GGTT",    // reverse strand, genomic, aligned
+                "AACC--TA-GGAT",    // reverse strand, native,  aligned
+                "ATCC-TA--GGTT",    // reverse strand, genomic, aligned + clipped
+                "AACC--TA-GGAT"     // reverse strand, native,  aligned + clipped
+            }
+        );
+    }
+    {  // padding around the insertion: each 2P shows up as "**" on either side of "TA"
+        SCOPED_TRACE("CIGAR: 4=1D2P2I2P2D4=");
+        BamRecordTests::CheckSequenceClippedAndAligned(
+            "4=1D2P2I2P2D4=",           // CIGAR
+            "ATCCTAGGTT",               // input
+            {
+                "ATCCTAGGTT",           // forward strand, genomic
+                "ATCCTAGGTT",           // forward strand, native
+                "ATCC-**TA**--GGTT",    // forward strand, genomic, aligned
+                "ATCC-**TA**--GGTT",    // forward strand, native,  aligned
+                "ATCC-**TA**--GGTT",    // forward strand, genomic, aligned + clipped
+                "ATCC-**TA**--GGTT",    // forward strand, native,  aligned + clipped
+                "ATCCTAGGTT",           // reverse strand, genomic
+                "AACCTAGGAT",           // reverse strand, native
+                "ATCC-**TA**--GGTT",    // reverse strand, genomic, aligned
+                "AACC--**TA**-GGAT",    // reverse strand, native,  aligned
+                "ATCC-**TA**--GGTT",    // reverse strand, genomic, aligned + clipped
+                "AACC--**TA**-GGAT"     // reverse strand, native,  aligned + clipped
+            }
+        );
+    }
+    {  // soft clips (2 / 3 bases) around a deletion: clips stay in aligned rows, go in clipped rows
+        SCOPED_TRACE("CIGAR: 2S4=3D4=3S");
+        BamRecordTests::CheckSequenceClippedAndAligned(
+            "2S4=3D4=3S",               // CIGAR
+            "TTAACCGTTACCG",            // input
+            {
+                "TTAACCGTTACCG",        // forward strand, genomic
+                "TTAACCGTTACCG",        // forward strand, native
+                "TTAACC---GTTACCG",     // forward strand, genomic, aligned
+                "TTAACC---GTTACCG",     // forward strand, native,  aligned
+                "AACC---GTTA",          // forward strand, genomic, aligned + clipped
+                "AACC---GTTA",          // forward strand, native,  aligned + clipped
+                "TTAACCGTTACCG",        // reverse strand, genomic
+                "CGGTAACGGTTAA",        // reverse strand, native
+                "TTAACC---GTTACCG",     // reverse strand, genomic, aligned
+                "CGGTAAC---GGTTAA",     // reverse strand, native,  aligned
+                "AACC---GTTA",          // reverse strand, genomic, aligned + clipped
+                "TAAC---GGTT"           // reverse strand, native,  aligned + clipped
+            }
+        );
+    }
+    {  // hard clips around a deletion: input holds only the 8 matched bases, nothing is trimmed
+        SCOPED_TRACE("CIGAR: 2H4=3D4=3H");
+        BamRecordTests::CheckSequenceClippedAndAligned(
+            "2H4=3D4=3H",       // CIGAR
+            "AACCGTTA",         // input
+            {
+                "AACCGTTA",     // forward strand, genomic
+                "AACCGTTA",     // forward strand, native
+                "AACC---GTTA",  // forward strand, genomic, aligned
+                "AACC---GTTA",  // forward strand, native,  aligned
+                "AACC---GTTA",  // forward strand, genomic, aligned + clipped
+                "AACC---GTTA",  // forward strand, native,  aligned + clipped
+                "AACCGTTA",     // reverse strand, genomic
+                "TAACGGTT",     // reverse strand, native
+                "AACC---GTTA",  // reverse strand, genomic, aligned
+                "TAAC---GGTT",  // reverse strand, native,  aligned
+                "AACC---GTTA",  // reverse strand, genomic, aligned + clipped
+                "TAAC---GGTT"   // reverse strand, native,  aligned + clipped
+            }
+        );
+    }
+    {  // hard + soft clips combined: expected rows are identical to the "2S4=3D4=3S" case in this test
+        SCOPED_TRACE("CIGAR: 2H2S4=3D4=3S3H");
+        BamRecordTests::CheckSequenceClippedAndAligned(
+            "2H2S4=3D4=3S3H",           // CIGAR
+            "TTAACCGTTACCG",            // input
+            {
+                "TTAACCGTTACCG",        // forward strand, genomic
+                "TTAACCGTTACCG",        // forward strand, native
+                "TTAACC---GTTACCG",     // forward strand, genomic, aligned
+                "TTAACC---GTTACCG",     // forward strand, native,  aligned
+                "AACC---GTTA",          // forward strand, genomic, aligned + clipped
+                "AACC---GTTA",          // forward strand, native,  aligned + clipped
+                "TTAACCGTTACCG",        // reverse strand, genomic
+                "CGGTAACGGTTAA",        // reverse strand, native
+                "TTAACC---GTTACCG",     // reverse strand, genomic, aligned
+                "CGGTAAC---GGTTAA",     // reverse strand, native,  aligned
+                "AACC---GTTA",          // reverse strand, genomic, aligned + clipped
+                "TAAC---GGTT"           // reverse strand, native,  aligned + clipped
+            }
+        );
+    }
+}
+
+TEST(BamRecordTest, QualityTagsClippedAndAligned)  // quality-string rows for CIGARs with D, I, P, S, H
+{  // each case: CIGAR, input qualities, 12 expected strings; aligned rows use '!' at D and P positions
+    {  // single deletion: aligned rows gain "!!!"; reverse-genomic rows are the input reversed
+        SCOPED_TRACE("CIGAR: 4=3D4=");
+        BamRecordTests::CheckQualityTagsClippedAndAligned(
+            "4=3D4=",           // CIGAR
+            "?]?]?]?@",         // input
+            {
+                "?]?]?]?@",     // forward strand, genomic
+                "?]?]?]?@",     // forward strand, native
+                "?]?]!!!?]?@",  // forward strand, genomic, aligned
+                "?]?]!!!?]?@",  // forward strand, native,  aligned
+                "?]?]!!!?]?@",  // forward strand, genomic, aligned + clipped
+                "?]?]!!!?]?@",  // forward strand, native,  aligned + clipped
+                "@?]?]?]?",     // reverse strand, genomic
+                "?]?]?]?@",     // reverse strand, native
+                "@?]?!!!]?]?",  // reverse strand, genomic, aligned
+                "?]?]!!!?]?@",  // reverse strand, native,  aligned
+                "@?]?!!!]?]?",  // reverse strand, genomic, aligned + clipped
+                "?]?]!!!?]?@"   // reverse strand, native,  aligned + clipped
+            }
+        );
+    }
+    {  // asymmetric gaps (1D / 2D): reverse-native aligned rows mirror the layout to "!!87!"
+        SCOPED_TRACE("CIGAR: 4=1D2I2D4=");
+        BamRecordTests::CheckQualityTagsClippedAndAligned(
+            "4=1D2I2D4=",           // CIGAR
+            "?]?]87?]?@",           // input
+            {
+                "?]?]87?]?@",       // forward strand, genomic
+                "?]?]87?]?@",       // forward strand, native
+                "?]?]!87!!?]?@",    // forward strand, genomic, aligned
+                "?]?]!87!!?]?@",    // forward strand, native,  aligned
+                "?]?]!87!!?]?@",    // forward strand, genomic, aligned + clipped
+                "?]?]!87!!?]?@",    // forward strand, native,  aligned + clipped
+                "@?]?78]?]?",       // reverse strand, genomic
+                "?]?]87?]?@",       // reverse strand, native
+                "@?]?!78!!]?]?",    // reverse strand, genomic, aligned
+                "?]?]!!87!?]?@",    // reverse strand, native,  aligned
+                "@?]?!78!!]?]?",    // reverse strand, genomic, aligned + clipped
+                "?]?]!!87!?]?@"     // reverse strand, native,  aligned + clipped
+            }
+        );
+    }
+    {  // padding: D and P both appear as '!', giving 3 before and 4 after the inserted "87"
+        SCOPED_TRACE("CIGAR: 4=1D2P2I2P2D4=");
+        BamRecordTests::CheckQualityTagsClippedAndAligned(
+            "4=1D2P2I2P2D4=",       // CIGAR
+            "?]?]87?]?@",           // input
+        {
+            "?]?]87?]?@",           // forward strand, genomic
+            "?]?]87?]?@",           // forward strand, native
+            "?]?]!!!87!!!!?]?@",    // forward strand, genomic, aligned
+            "?]?]!!!87!!!!?]?@",    // forward strand, native,  aligned
+            "?]?]!!!87!!!!?]?@",    // forward strand, genomic, aligned + clipped
+            "?]?]!!!87!!!!?]?@",    // forward strand, native,  aligned + clipped
+            "@?]?78]?]?",           // reverse strand, genomic
+            "?]?]87?]?@",           // reverse strand, native
+            "@?]?!!!78!!!!]?]?",    // reverse strand, genomic, aligned
+            "?]?]!!!!87!!!?]?@",    // reverse strand, native,  aligned
+            "@?]?!!!78!!!!]?]?",    // reverse strand, genomic, aligned + clipped
+            "?]?]!!!!87!!!?]?@"     // reverse strand, native,  aligned + clipped
+        }
+        );
+    }
+    {  // 3-base soft clips ("vvv" / "xxx"): kept in aligned rows, removed in the clipped rows
+        SCOPED_TRACE("CIGAR: 3S4=3D4=3S");
+        BamRecordTests::CheckQualityTagsClippedAndAligned(
+            "3S4=3D4=3S",               // CIGAR
+            "vvv?]?]?]?@xxx",           // input
+            {
+                "vvv?]?]?]?@xxx",       // forward strand, genomic
+                "vvv?]?]?]?@xxx",       // forward strand, native
+                "vvv?]?]!!!?]?@xxx",    // forward strand, genomic, aligned
+                "vvv?]?]!!!?]?@xxx",    // forward strand, native, aligned
+                "?]?]!!!?]?@",          // forward strand, genomic, aligned, clipped
+                "?]?]!!!?]?@",          // forward strand, native, aligned, clipped
+                "xxx@?]?]?]?vvv",       // reverse strand, genomic
+                "vvv?]?]?]?@xxx",       // reverse strand, native
+                "xxx@?]?!!!]?]?vvv",    // reverse strand, genomic, aligned
+                "vvv?]?]!!!?]?@xxx",    // reverse strand, native, aligned
+                "@?]?!!!]?]?",          // reverse strand, genomic, aligned, clipped
+                "?]?]!!!?]?@"           // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {  // hard clips around a deletion: input holds only the 8 matched qualities, nothing is trimmed
+        SCOPED_TRACE("CIGAR: 2H4=3D4=3H");
+        BamRecordTests::CheckQualityTagsClippedAndAligned(
+            "2H4=3D4=3H",       // CIGAR
+            "?]?]?]?@",         // input
+            {
+                "?]?]?]?@",     // forward strand, genomic
+                "?]?]?]?@",     // forward strand, native
+                "?]?]!!!?]?@",  // forward strand, genomic, aligned
+                "?]?]!!!?]?@",  // forward strand, native, aligned
+                "?]?]!!!?]?@",  // forward strand, genomic, aligned, clipped
+                "?]?]!!!?]?@",  // forward strand, native, aligned, clipped
+                "@?]?]?]?",     // reverse strand, genomic
+                "?]?]?]?@",     // reverse strand, native
+                "@?]?!!!]?]?",  // reverse strand, genomic, aligned
+                "?]?]!!!?]?@",  // reverse strand, native, aligned
+                "@?]?!!!]?]?",  // reverse strand, genomic, aligned, clipped
+                "?]?]!!!?]?@"   // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {  // hard + soft clips combined: expected rows are identical to the "3S4=3D4=3S" case in this test
+        SCOPED_TRACE("CIGAR: 2H3S4=3D4=3S3H");
+        BamRecordTests::CheckQualityTagsClippedAndAligned(
+            "2H3S4=3D4=3S3H",           // CIGAR
+            "vvv?]?]?]?@xxx",           // input
+            {
+                "vvv?]?]?]?@xxx",       // forward strand, genomic
+                "vvv?]?]?]?@xxx",       // forward strand, native
+                "vvv?]?]!!!?]?@xxx",    // forward strand, genomic, aligned
+                "vvv?]?]!!!?]?@xxx",    // forward strand, native, aligned
+                "?]?]!!!?]?@",          // forward strand, genomic, aligned, clipped
+                "?]?]!!!?]?@",          // forward strand, native, aligned, clipped
+                "xxx@?]?]?]?vvv",       // reverse strand, genomic
+                "vvv?]?]?]?@xxx",       // reverse strand, native
+                "xxx@?]?!!!]?]?vvv",    // reverse strand, genomic, aligned
+                "vvv?]?]!!!?]?@xxx",    // reverse strand, native, aligned
+                "@?]?!!!]?]?",          // reverse strand, genomic, aligned, clipped
+                "?]?]!!!?]?@"           // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+}
+
+TEST(BamRecordTest, BaseTagsClippedAndAligned)  // base-tag rows for CIGARs with D, I, P, S and H ops
+{  // each case: CIGAR, input bases, 12 expected strings; here reverse-native rows equal the input
+    {  // single deletion: aligned rows gain "---"; reverse-genomic rows are the reverse complement
+        SCOPED_TRACE("CIGAR: 4=3D4=");
+        BamRecordTests::CheckBaseTagsClippedAndAligned(
+            "4=3D4=",           // CIGAR
+            "AACCGTTA",         // input
+            {
+                "AACCGTTA",     // forward strand, genomic
+                "AACCGTTA",     // forward strand, native
+                "AACC---GTTA",  // forward strand, genomic, aligned
+                "AACC---GTTA",  // forward strand, native, aligned
+                "AACC---GTTA",  // forward strand, genomic, aligned, clipped
+                "AACC---GTTA",  // forward strand, native, aligned, clipped
+                "TAACGGTT",     // reverse strand, genomic
+                "AACCGTTA",     // reverse strand, native
+                "TAAC---GGTT",  // reverse strand, genomic, aligned
+                "AACC---GTTA",  // reverse strand, native, aligned
+                "TAAC---GGTT",  // reverse strand, genomic, aligned, clipped
+                "AACC---GTTA"   // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {  // asymmetric gaps (1D / 2D): reverse-native aligned rows mirror the layout to "--TA-"
+        SCOPED_TRACE("CIGAR: 4=1D2I2D4=");
+        BamRecordTests::CheckBaseTagsClippedAndAligned(
+            "4=1D2I2D4=",           // CIGAR
+            "ATCCTAGGTT",           // input
+            {
+                "ATCCTAGGTT",       // forward strand, genomic
+                "ATCCTAGGTT",       // forward strand, native
+                "ATCC-TA--GGTT",    // forward strand, genomic, aligned
+                "ATCC-TA--GGTT",    // forward strand, native, aligned
+                "ATCC-TA--GGTT",    // forward strand, genomic, aligned, clipped
+                "ATCC-TA--GGTT",    // forward strand, native, aligned, clipped
+                "AACCTAGGAT",       // reverse strand, genomic
+                "ATCCTAGGTT",       // reverse strand, native
+                "AACC-TA--GGAT",    // reverse strand, genomic, aligned
+                "ATCC--TA-GGTT",    // reverse strand, native, aligned
+                "AACC-TA--GGAT",    // reverse strand, genomic, aligned, clipped
+                "ATCC--TA-GGTT"     // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {  // padding around the insertion: each 2P shows up as "**" on either side of "TA"
+        SCOPED_TRACE("CIGAR: 4=1D2P2I2P2D4=");
+        BamRecordTests::CheckBaseTagsClippedAndAligned(
+            "4=1D2P2I2P2D4=",           // CIGAR
+            "ATCCTAGGTT",               // input
+            {
+                "ATCCTAGGTT",           // forward strand, genomic
+                "ATCCTAGGTT",           // forward strand, native
+                "ATCC-**TA**--GGTT",    // forward strand, genomic, aligned
+                "ATCC-**TA**--GGTT",    // forward strand, native, aligned
+                "ATCC-**TA**--GGTT",    // forward strand, genomic, aligned, clipped
+                "ATCC-**TA**--GGTT",    // forward strand, native, aligned, clipped
+                "AACCTAGGAT",           // reverse strand, genomic
+                "ATCCTAGGTT",           // reverse strand, native
+                "AACC-**TA**--GGAT",    // reverse strand, genomic, aligned
+                "ATCC--**TA**-GGTT",    // reverse strand, native, aligned
+                "AACC-**TA**--GGAT",    // reverse strand, genomic, aligned, clipped
+                "ATCC--**TA**-GGTT"     // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {  // 3-base soft clips around a deletion: clips stay in aligned rows, go in the clipped rows
+        SCOPED_TRACE("CIGAR: 3S4=3D4=3S");
+        BamRecordTests::CheckBaseTagsClippedAndAligned(
+            "3S4=3D4=3S",               // CIGAR
+            "TTTAACCGTTACCG",           // input
+            {
+                "TTTAACCGTTACCG",       // forward strand, genomic
+                "TTTAACCGTTACCG",       // forward strand, native
+                "TTTAACC---GTTACCG",    // forward strand, genomic, aligned
+                "TTTAACC---GTTACCG",    // forward strand, native, aligned
+                "AACC---GTTA",          // forward strand, genomic, aligned, clipped
+                "AACC---GTTA",          // forward strand, native, aligned, clipped
+                "CGGTAACGGTTAAA",       // reverse strand, genomic
+                "TTTAACCGTTACCG",       // reverse strand, native
+                "CGGTAAC---GGTTAAA",    // reverse strand, genomic, aligned
+                "TTTAACC---GTTACCG",    // reverse strand, native, aligned
+                "TAAC---GGTT",          // reverse strand, genomic, aligned, clipped
+                "AACC---GTTA"           // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {  // hard clips around a deletion: input holds only the 8 matched bases, nothing is trimmed
+        SCOPED_TRACE("CIGAR: 2H4=3D4=3H");
+        BamRecordTests::CheckBaseTagsClippedAndAligned(
+            "2H4=3D4=3H",       // CIGAR
+            "AACCGTTA",         // input
+            {
+                "AACCGTTA",     // forward strand, genomic
+                "AACCGTTA",     // forward strand, native
+                "AACC---GTTA",  // forward strand, genomic, aligned
+                "AACC---GTTA",  // forward strand, native, aligned
+                "AACC---GTTA",  // forward strand, genomic, aligned, clipped
+                "AACC---GTTA",  // forward strand, native, aligned, clipped
+                "TAACGGTT",     // reverse strand, genomic
+                "AACCGTTA",     // reverse strand, native
+                "TAAC---GGTT",  // reverse strand, genomic, aligned
+                "AACC---GTTA",  // reverse strand, native, aligned
+                "TAAC---GGTT",  // reverse strand, genomic, aligned, clipped
+                "AACC---GTTA"   // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {  // hard + soft clips combined: expected rows are identical to the "3S4=3D4=3S" case in this test
+        SCOPED_TRACE("CIGAR: 2H3S4=3D4=3S3H");
+        BamRecordTests::CheckBaseTagsClippedAndAligned(
+            "2H3S4=3D4=3S3H",           // CIGAR
+            "TTTAACCGTTACCG",           // input
+            {
+                "TTTAACCGTTACCG",       // forward strand, genomic
+                "TTTAACCGTTACCG",       // forward strand, native
+                "TTTAACC---GTTACCG",    // forward strand, genomic, aligned
+                "TTTAACC---GTTACCG",    // forward strand, native, aligned
+                "AACC---GTTA",          // forward strand, genomic, aligned, clipped
+                "AACC---GTTA",          // forward strand, native, aligned, clipped
+                "CGGTAACGGTTAAA",       // reverse strand, genomic
+                "TTTAACCGTTACCG",       // reverse strand, native
+                "CGGTAAC---GGTTAAA",    // reverse strand, genomic, aligned
+                "TTTAACC---GTTACCG",    // reverse strand, native, aligned
+                "TAAC---GGTT",          // reverse strand, genomic, aligned, clipped
+                "AACC---GTTA"           // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+}
+
+TEST(BamRecordTest, FrameTagsClippedAndAligned)
+{
+    {
+        SCOPED_TRACE("CIGAR: 4=3D4=");
+        BamRecordTests::CheckFrameTagsClippedAndAligned(
+            "4=3D4=",                                           // CIGAR
+            { 10, 20, 10, 20, 10, 20, 10, 30 },                 // input
+            {
+                { 10, 20, 10, 20, 10, 20, 10, 30 },             // forward strand, genomic
+                { 10, 20, 10, 20, 10, 20, 10, 30 },             // forward strand, native
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned, clipped
+                { 30, 10, 20, 10, 20, 10, 20, 10 },             // reverse strand, genomic
+                { 10, 20, 10, 20, 10, 20, 10, 30 },             // reverse strand, native
+                { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // reverse strand, native, aligned
+                { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }     // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 4=1D2I2D4=");
+        BamRecordTests::CheckFrameTagsClippedAndAligned(
+            "4=1D2I2D4=",                                               // CIGAR
+            { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 },                 // input
+            {
+                { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 },             // forward strand, genomic
+                { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 },             // forward strand, native
+                { 10, 20, 10, 20, 0, 80, 70, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 80, 70, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned
+                { 10, 20, 10, 20, 0, 80, 70, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 80, 70, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned, clipped
+                { 30, 10, 20, 10, 70, 80, 20, 10, 20, 10 },             // reverse strand, genomic
+                { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 },             // reverse strand, native
+                { 30, 10, 20, 10, 0, 70, 80, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 0, 80, 70, 0, 10, 20, 10, 30 },    // reverse strand, native, aligned
+                { 30, 10, 20, 10, 0, 70, 80, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 80, 70, 0, 10, 20, 10, 30 }     // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 4=1D2P2I2P2D4=");
+        BamRecordTests::CheckFrameTagsClippedAndAligned(
+            "4=1D2P2I2P2D4=",                                                   // CIGAR
+            { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 },                         // input
+        {
+            { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 },                         // forward strand, genomic
+            { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 },                         // forward strand, native
+            { 10, 20, 10, 20, 0, 0, 0, 80, 70, 0, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned
+            { 10, 20, 10, 20, 0, 0, 0, 80, 70, 0, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned
+            { 10, 20, 10, 20, 0, 0, 0, 80, 70, 0, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned, clipped
+            { 10, 20, 10, 20, 0, 0, 0, 80, 70, 0, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned, clipped
+            { 30, 10, 20, 10, 70, 80, 20, 10, 20, 10 },                         // reverse strand, genomic
+            { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 },                         // reverse strand, native
+            { 30, 10, 20, 10, 0, 0, 0, 70, 80, 0, 0, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned
+            { 10, 20, 10, 20, 0, 0, 0, 0, 80, 70, 0, 0, 0, 10, 20, 10, 30 },    // reverse strand, native, aligned
+            { 30, 10, 20, 10, 0, 0, 0, 70, 80, 0, 0, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned, clipped
+            { 10, 20, 10, 20, 0, 0, 0, 0, 80, 70, 0, 0, 0, 10, 20, 10, 30 }     // reverse strand, native, aligned, clipped
+        }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 3S4=3D4=3S");
+        BamRecordTests::CheckFrameTagsClippedAndAligned(
+            "3S4=3D4=3S",                                                               // CIGAR
+            { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 },                 // input
+            {
+                { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 },             // forward strand, genomic
+                { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 },             // forward strand, native
+                { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 },    // forward strand, genomic, aligned
+                { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 },    // forward strand, native, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },                            // forward strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },                            // forward strand, native, aligned, clipped
+                { 50, 50, 50, 30, 10, 20, 10, 20, 10, 20, 10, 40, 40, 40 },             // reverse strand, genomic
+                { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 },             // reverse strand, native
+                { 50, 50, 50, 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10, 40, 40, 40 },    // reverse strand, genomic, aligned
+                { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 },    // reverse strand, native, aligned
+                { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 },                            // reverse strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }                             // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 2H4=3D4=3H");
+        BamRecordTests::CheckFrameTagsClippedAndAligned(
+            "2H4=3D4=3H",                                       // CIGAR
+            { 10, 20, 10, 20, 10, 20, 10, 30 },                 // input
+            {
+                { 10, 20, 10, 20, 10, 20, 10, 30 },             // forward strand, genomic
+                { 10, 20, 10, 20, 10, 20, 10, 30 },             // forward strand, native
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned, clipped
+                { 30, 10, 20, 10, 20, 10, 20, 10 },             // reverse strand, genomic
+                { 10, 20, 10, 20, 10, 20, 10, 30 },             // reverse strand, native
+                { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // reverse strand, native, aligned
+                { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }     // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 2H3S4=3D4=3S3H");
+        BamRecordTests::CheckFrameTagsClippedAndAligned(
+            "2H3S4=3D4=3S3H",                                                           // CIGAR
+            { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 },                 // input
+            {
+                { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 },             // forward strand, genomic
+                { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 },             // forward strand, native
+                { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 },    // forward strand, genomic, aligned
+                { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 },    // forward strand, native, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },                            // forward strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },                            // forward strand, native, aligned, clipped
+                { 50, 50, 50, 30, 10, 20, 10, 20, 10, 20, 10, 40, 40, 40 },             // reverse strand, genomic
+                { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 },             // reverse strand, native
+                { 50, 50, 50, 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10, 40, 40, 40 },    // reverse strand, genomic, aligned
+                { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 },    // reverse strand, native, aligned
+                { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 },                            // reverse strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }                             // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+}
+
+TEST(BamRecordTest, PulseBaseTags)
+{   // Table-driven: one scoped case per CIGAR, each delegating to BamRecordTests::CheckPulseBaseTags.
+    {
+        SCOPED_TRACE("CIGAR: 4=3D4=");
+        BamRecordTests::CheckPulseBaseTags(
+            "4=3D4=",           // CIGAR
+            "AACCGTTA",         // seqBases
+            "AAaaCCGggTTA",     // pulseCalls
+            "AAaaCCGggTTA",     // tag data
+            // Dropping the lowercase pulseCalls letters yields seqBases (presumably non-basecalled pulses - confirm).
+            {   // all pulses
+                // Every aligned / clipped expectation in the all-pulses lists is an empty string.
+                "AAaaCCGggTTA",     // forward strand, genomic
+                "AAaaCCGggTTA",     // forward strand, native
+                "",  // forward strand, genomic, aligned
+                "",  // forward strand, native, aligned
+                "",  // forward strand, genomic, aligned, clipped
+                "",  // forward strand, native, aligned, clipped
+                "TAAccCGGttTT",     // reverse strand, genomic
+                "AAaaCCGggTTA",     // reverse strand, native
+                "",  // reverse strand, genomic, aligned
+                "",  // reverse strand, native, aligned
+                "",  // reverse strand, genomic, aligned, clipped
+                ""   // reverse strand, native, aligned, clipped
+            },
+            {   // basecalls only
+                // '-' fills each deleted (D) position; reverse-strand genomic is the reverse complement of native.
+                "AACCGTTA",     // forward strand, genomic
+                "AACCGTTA",     // forward strand, native
+                "AACC---GTTA",  // forward strand, genomic, aligned
+                "AACC---GTTA",  // forward strand, native, aligned
+                "AACC---GTTA",  // forward strand, genomic, aligned, clipped
+                "AACC---GTTA",  // forward strand, native, aligned, clipped
+                "TAACGGTT",     // reverse strand, genomic
+                "AACCGTTA",     // reverse strand, native
+                "TAAC---GGTT",  // reverse strand, genomic, aligned
+                "AACC---GTTA",  // reverse strand, native, aligned
+                "TAAC---GGTT",  // reverse strand, genomic, aligned, clipped
+                "AACC---GTTA"   // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 4=1D2I2D4=");
+        BamRecordTests::CheckPulseBaseTags(
+            "4=1D2I2D4=",       // CIGAR
+            "ATCCTAGGTT",       // seqBases
+            "ATttCCTtAGGggTT",  // pulseCalls
+            "ATttCCTtAGGggTT",  // tag data
+            // Asymmetric gaps: reverse-strand native aligned strings carry the gap runs mirrored (2 then 1).
+            {   // all pulses
+
+                "ATttCCTtAGGggTT",       // forward strand, genomic
+                "ATttCCTtAGGggTT",       // forward strand, native
+                "",    // forward strand, genomic, aligned
+                "",    // forward strand, native, aligned
+                "",    // forward strand, genomic, aligned, clipped
+                "",    // forward strand, native, aligned, clipped
+                "AAccCCTaAGGaaAT",       // reverse strand, genomic
+                "ATttCCTtAGGggTT",       // reverse strand, native
+                "",    // reverse strand, genomic, aligned
+                "",    // reverse strand, native, aligned
+                "",    // reverse strand, genomic, aligned, clipped
+                ""     // reverse strand, native, aligned, clipped
+            },
+            {   // basecalls only
+
+                "ATCCTAGGTT",       // forward strand, genomic
+                "ATCCTAGGTT",       // forward strand, native
+                "ATCC-TA--GGTT",    // forward strand, genomic, aligned
+                "ATCC-TA--GGTT",    // forward strand, native, aligned
+                "ATCC-TA--GGTT",    // forward strand, genomic, aligned, clipped
+                "ATCC-TA--GGTT",    // forward strand, native, aligned, clipped
+                "AACCTAGGAT",       // reverse strand, genomic
+                "ATCCTAGGTT",       // reverse strand, native
+                "AACC-TA--GGAT",    // reverse strand, genomic, aligned
+                "ATCC--TA-GGTT",    // reverse strand, native, aligned
+                "AACC-TA--GGAT",    // reverse strand, genomic, aligned, clipped
+                "ATCC--TA-GGTT"     // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 4=1D2P2I2P2D4=");
+        BamRecordTests::CheckPulseBaseTags(
+            "4=1D2P2I2P2D4=",   // CIGAR
+            "ATCCTAGGTT",       // seqBases
+            "ATttCCTtAGGggTT",  // pulseCalls
+            "ATttCCTtAGGggTT",  // tag data
+            {   // all pulses
+                "ATttCCTtAGGggTT",           // forward strand, genomic
+                "ATttCCTtAGGggTT",           // forward strand, native
+                "",    // forward strand, genomic, aligned
+                "",    // forward strand, native, aligned
+                "",    // forward strand, genomic, aligned, clipped
+                "",    // forward strand, native, aligned, clipped
+                "AAccCCTaAGGaaAT",           // reverse strand, genomic
+                "ATttCCTtAGGggTT",           // reverse strand, native
+                "",    // reverse strand, genomic, aligned
+                "",    // reverse strand, native, aligned
+                "",    // reverse strand, genomic, aligned, clipped
+                ""     // reverse strand, native, aligned, clipped
+            },
+            {   // basecalls only; '*' fills each padding (P) position, '-' each deleted (D) position
+                "ATCCTAGGTT",           // forward strand, genomic
+                "ATCCTAGGTT",           // forward strand, native
+                "ATCC-**TA**--GGTT",    // forward strand, genomic, aligned
+                "ATCC-**TA**--GGTT",    // forward strand, native, aligned
+                "ATCC-**TA**--GGTT",    // forward strand, genomic, aligned, clipped
+                "ATCC-**TA**--GGTT",    // forward strand, native, aligned, clipped
+                "AACCTAGGAT",           // reverse strand, genomic
+                "ATCCTAGGTT",           // reverse strand, native
+                "AACC-**TA**--GGAT",    // reverse strand, genomic, aligned
+                "ATCC--**TA**-GGTT",    // reverse strand, native, aligned
+                "AACC-**TA**--GGAT",    // reverse strand, genomic, aligned, clipped
+                "ATCC--**TA**-GGTT"     // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 3S4=3D4=3S");
+        BamRecordTests::CheckPulseBaseTags(
+            "3S4=3D4=3S",               // CIGAR
+            "TTTAACCGTTACCG",           // seqBases
+            "TTTttAACCccGTTAaaCCG",     // pulseCalls
+            "TTTttAACCccGTTAaaCCG",     // tag data
+            // Soft-clipped (S) bases stay in the aligned expectations and are absent from the clipped ones.
+            {   // all pulses
+
+                "TTTttAACCccGTTAaaCCG",       // forward strand, genomic
+                "TTTttAACCccGTTAaaCCG",       // forward strand, native
+                "",         // forward strand, genomic, aligned
+                "",         // forward strand, native, aligned
+                "",          // forward strand, genomic, aligned, clipped
+                "",          // forward strand, native, aligned, clipped
+                "CGGttTAACggGGTTaaAAA",       // reverse strand, genomic
+                "TTTttAACCccGTTAaaCCG",       // reverse strand, native
+                "",    // reverse strand, genomic, aligned
+                "",    // reverse strand, native, aligned
+                "",     // reverse strand, genomic, aligned, clipped
+                ""           // reverse strand, native, aligned, clipped
+            },
+            {   // basecalls only
+
+                "TTTAACCGTTACCG",       // forward strand, genomic
+                "TTTAACCGTTACCG",       // forward strand, native
+                "TTTAACC---GTTACCG",    // forward strand, genomic, aligned
+                "TTTAACC---GTTACCG",    // forward strand, native, aligned
+                "AACC---GTTA",          // forward strand, genomic, aligned, clipped
+                "AACC---GTTA",          // forward strand, native, aligned, clipped
+                "CGGTAACGGTTAAA",       // reverse strand, genomic
+                "TTTAACCGTTACCG",       // reverse strand, native
+                "CGGTAAC---GGTTAAA",    // reverse strand, genomic, aligned
+                "TTTAACC---GTTACCG",    // reverse strand, native, aligned
+                "TAAC---GGTT",          // reverse strand, genomic, aligned, clipped
+                "AACC---GTTA"           // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 2H4=3D4=3H");
+        BamRecordTests::CheckPulseBaseTags(
+            "2H4=3D4=3H",       // CIGAR
+            "AACCGTTA",         // seqBases
+            "AAaaCCGggTTA",     // pulseCalls
+            "AAaaCCGggTTA",     // tag data
+            // Hard clips (H) add no bases to the inputs; expectations equal those of the plain 4=3D4= case.
+            {   // all pulses
+
+                "AAaaCCGggTTA",     // forward strand, genomic
+                "AAaaCCGggTTA",     // forward strand, native
+                "",  // forward strand, genomic, aligned
+                "",  // forward strand, native, aligned
+                "",  // forward strand, genomic, aligned, clipped
+                "",  // forward strand, native, aligned, clipped
+                "TAAccCGGttTT",     // reverse strand, genomic
+                "AAaaCCGggTTA",     // reverse strand, native
+                "",  // reverse strand, genomic, aligned
+                "",  // reverse strand, native, aligned
+                "",  // reverse strand, genomic, aligned, clipped
+                ""   // reverse strand, native, aligned, clipped
+            },
+            {   // basecalls only
+
+                "AACCGTTA",     // forward strand, genomic
+                "AACCGTTA",     // forward strand, native
+                "AACC---GTTA",  // forward strand, genomic, aligned
+                "AACC---GTTA",  // forward strand, native, aligned
+                "AACC---GTTA",  // forward strand, genomic, aligned, clipped
+                "AACC---GTTA",  // forward strand, native, aligned, clipped
+                "TAACGGTT",     // reverse strand, genomic
+                "AACCGTTA",     // reverse strand, native
+                "TAAC---GGTT",  // reverse strand, genomic, aligned
+                "AACC---GTTA",  // reverse strand, native, aligned
+                "TAAC---GGTT",  // reverse strand, genomic, aligned, clipped
+                "AACC---GTTA"   // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 2H3S4=3D4=3S3H");
+        BamRecordTests::CheckPulseBaseTags(
+            "2H3S4=3D4=3S3H",           // CIGAR
+            "TTTAACCGTTACCG",           // seqBases
+            "TTTttAACCccGTTAaaCCG",     // pulseCalls
+            "TTTttAACCccGTTAaaCCG",     // tag data
+            // Same inputs and expectations as the 3S4=3D4=3S case; only the surrounding hard clips differ.
+            {   // all pulses
+
+                "TTTttAACCccGTTAaaCCG",       // forward strand, genomic
+                "TTTttAACCccGTTAaaCCG",       // forward strand, native
+                "",         // forward strand, genomic, aligned
+                "",         // forward strand, native, aligned
+                "",          // forward strand, genomic, aligned, clipped
+                "",          // forward strand, native, aligned, clipped
+                "CGGttTAACggGGTTaaAAA",       // reverse strand, genomic
+                "TTTttAACCccGTTAaaCCG",       // reverse strand, native
+                "",         // reverse strand, genomic, aligned
+                "",         // reverse strand, native, aligned
+                "",          // reverse strand, genomic, aligned, clipped
+                ""           // reverse strand, native, aligned, clipped
+            },
+            {   // basecalls only
+
+                "TTTAACCGTTACCG",       // forward strand, genomic
+                "TTTAACCGTTACCG",       // forward strand, native
+                "TTTAACC---GTTACCG",    // forward strand, genomic, aligned
+                "TTTAACC---GTTACCG",    // forward strand, native, aligned
+                "AACC---GTTA",          // forward strand, genomic, aligned, clipped
+                "AACC---GTTA",          // forward strand, native, aligned, clipped
+                "CGGTAACGGTTAAA",       // reverse strand, genomic
+                "TTTAACCGTTACCG",       // reverse strand, native
+                "CGGTAAC---GGTTAAA",    // reverse strand, genomic, aligned
+                "TTTAACC---GTTACCG",    // reverse strand, native, aligned
+                "TAAC---GGTT",          // reverse strand, genomic, aligned, clipped
+                "AACC---GTTA"           // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+}
+
+TEST(BamRecordTest, PulseQualityTags)
+{   // Table-driven: one scoped case per CIGAR, each delegating to BamRecordTests::CheckPulseQualityTags.
+    {
+        SCOPED_TRACE("CIGAR: 4=3D4=");
+        BamRecordTests::CheckPulseQualityTags(
+            "4=3D4=",           // CIGAR
+            "AACCGTTA",         // seqBases
+            "AAaaCCGggTTA",     // pulseCalls
+            "?]!!?]?!!]?@",     // tag data
+            // Tag data has one quality character per pulse; the lowercase pulses all carry '!' here.
+            {   // all pulses
+                // Every aligned / clipped expectation in the all-pulses lists is an empty string.
+                "?]!!?]?!!]?@",     // forward strand, genomic
+                "?]!!?]?!!]?@",     // forward strand, native
+                "",  // forward strand, genomic, aligned
+                "",  // forward strand, native,  aligned
+                "",  // forward strand, genomic, aligned + clipped
+                "",  // forward strand, native,  aligned + clipped
+                "@?]!!?]?!!]?",     // reverse strand, genomic
+                "?]!!?]?!!]?@",     // reverse strand, native
+                "",  // reverse strand, genomic, aligned
+                "",  // reverse strand, native,  aligned
+                "",  // reverse strand, genomic, aligned + clipped
+                ""   // reverse strand, native,  aligned + clipped
+            },
+            {   // basecalls only
+                // '!' fills each deleted (D) position; reverse-strand genomic is the plain reversal of native.
+                "?]?]?]?@",     // forward strand, genomic
+                "?]?]?]?@",     // forward strand, native
+                "?]?]!!!?]?@",  // forward strand, genomic, aligned
+                "?]?]!!!?]?@",  // forward strand, native,  aligned
+                "?]?]!!!?]?@",  // forward strand, genomic, aligned + clipped
+                "?]?]!!!?]?@",  // forward strand, native,  aligned + clipped
+                "@?]?]?]?",     // reverse strand, genomic
+                "?]?]?]?@",     // reverse strand, native
+                "@?]?!!!]?]?",  // reverse strand, genomic, aligned
+                "?]?]!!!?]?@",  // reverse strand, native,  aligned
+                "@?]?!!!]?]?",  // reverse strand, genomic, aligned + clipped
+                "?]?]!!!?]?@"   // reverse strand, native,  aligned + clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 4=1D2I2D4=");
+        BamRecordTests::CheckPulseQualityTags(
+            "4=1D2I2D4=",       // CIGAR
+            "ATCCTAGGTT",       // seqBases
+            "ATttCCTtAGGggTT",  // pulseCalls
+            "?]!!?]8!7?]!!?@",  // tag data
+            // Asymmetric gaps: reverse-strand native aligned strings carry the '!' runs mirrored (2 then 1).
+            {   // all pulses
+
+                "?]!!?]8!7?]!!?@",       // forward strand, genomic
+                "?]!!?]8!7?]!!?@",       // forward strand, native
+                "",    // forward strand, genomic, aligned
+                "",    // forward strand, native,  aligned
+                "",    // forward strand, genomic, aligned + clipped
+                "",    // forward strand, native,  aligned + clipped
+                "@?!!]?7!8]?!!]?",       // reverse strand, genomic
+                "?]!!?]8!7?]!!?@",       // reverse strand, native
+                "",    // reverse strand, genomic, aligned
+                "",    // reverse strand, native,  aligned
+                "",    // reverse strand, genomic, aligned + clipped
+                ""     // reverse strand, native,  aligned + clipped
+            },
+            {   // basecalls only
+
+                "?]?]87?]?@",       // forward strand, genomic
+                "?]?]87?]?@",       // forward strand, native
+                "?]?]!87!!?]?@",    // forward strand, genomic, aligned
+                "?]?]!87!!?]?@",    // forward strand, native,  aligned
+                "?]?]!87!!?]?@",    // forward strand, genomic, aligned + clipped
+                "?]?]!87!!?]?@",    // forward strand, native,  aligned + clipped
+                "@?]?78]?]?",       // reverse strand, genomic
+                "?]?]87?]?@",       // reverse strand, native
+                "@?]?!78!!]?]?",    // reverse strand, genomic, aligned
+                "?]?]!!87!?]?@",    // reverse strand, native,  aligned
+                "@?]?!78!!]?]?",    // reverse strand, genomic, aligned + clipped
+                "?]?]!!87!?]?@"     // reverse strand, native,  aligned + clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 4=1D2P2I2P2D4=");
+        BamRecordTests::CheckPulseQualityTags(
+            "4=1D2P2I2P2D4=",   // CIGAR
+            "ATCCTAGGTT",       // seqBases
+            "ATttCCTtAGGggTT",  // pulseCalls
+            "?]!!?]8!7?]!!?@",  // tag data
+        {   // all pulses
+            "?]!!?]8!7?]!!?@",           // forward strand, genomic
+            "?]!!?]8!7?]!!?@",           // forward strand, native
+            "",    // forward strand, genomic, aligned
+            "",    // forward strand, native,  aligned
+            "",    // forward strand, genomic, aligned + clipped
+            "",    // forward strand, native,  aligned + clipped
+            "@?!!]?7!8]?!!]?",           // reverse strand, genomic
+            "?]!!?]8!7?]!!?@",           // reverse strand, native
+            "",    // reverse strand, genomic, aligned
+            "",    // reverse strand, native,  aligned
+            "",    // reverse strand, genomic, aligned + clipped
+            ""     // reverse strand, native,  aligned + clipped
+        },
+        {   // basecalls only; '!' fills padding (P) positions as well as deleted (D) ones
+            "?]?]87?]?@",           // forward strand, genomic
+            "?]?]87?]?@",           // forward strand, native
+            "?]?]!!!87!!!!?]?@",    // forward strand, genomic, aligned
+            "?]?]!!!87!!!!?]?@",    // forward strand, native,  aligned
+            "?]?]!!!87!!!!?]?@",    // forward strand, genomic, aligned + clipped
+            "?]?]!!!87!!!!?]?@",    // forward strand, native,  aligned + clipped
+            "@?]?78]?]?",           // reverse strand, genomic
+            "?]?]87?]?@",           // reverse strand, native
+            "@?]?!!!78!!!!]?]?",    // reverse strand, genomic, aligned
+            "?]?]!!!!87!!!?]?@",    // reverse strand, native,  aligned
+            "@?]?!!!78!!!!]?]?",    // reverse strand, genomic, aligned + clipped
+            "?]?]!!!!87!!!?]?@"     // reverse strand, native,  aligned + clipped
+        }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 3S4=3D4=3S");
+        BamRecordTests::CheckPulseQualityTags(
+            "3S4=3D4=3S",               // CIGAR
+            "TTTAACCGTTACCG",           // seqBases
+            "TTTttAACCccGTTAaaCCG",     // pulseCalls
+            "vvv!!?]?]!!?]?@!!xxx",     // tag data
+            // Soft-clip qualities ('v' / 'x') stay in the aligned expectations and are absent from the clipped ones.
+            {   // all pulses
+
+                "vvv!!?]?]!!?]?@!!xxx",       // forward strand, genomic
+                "vvv!!?]?]!!?]?@!!xxx",       // forward strand, native
+                "",    // forward strand, genomic, aligned
+                "",    // forward strand, native, aligned
+                "",          // forward strand, genomic, aligned, clipped
+                "",          // forward strand, native, aligned, clipped
+                "xxx!!@?]?!!]?]?!!vvv",       // reverse strand, genomic
+                "vvv!!?]?]!!?]?@!!xxx",       // reverse strand, native
+                "",    // reverse strand, genomic, aligned
+                "",    // reverse strand, native, aligned
+                "",          // reverse strand, genomic, aligned, clipped
+                ""           // reverse strand, native, aligned, clipped
+            },
+            {   // basecalls only
+
+                "vvv?]?]?]?@xxx",       // forward strand, genomic
+                "vvv?]?]?]?@xxx",       // forward strand, native
+                "vvv?]?]!!!?]?@xxx",    // forward strand, genomic, aligned
+                "vvv?]?]!!!?]?@xxx",    // forward strand, native, aligned
+                "?]?]!!!?]?@",          // forward strand, genomic, aligned, clipped
+                "?]?]!!!?]?@",          // forward strand, native, aligned, clipped
+                "xxx@?]?]?]?vvv",       // reverse strand, genomic
+                "vvv?]?]?]?@xxx",       // reverse strand, native
+                "xxx@?]?!!!]?]?vvv",    // reverse strand, genomic, aligned
+                "vvv?]?]!!!?]?@xxx",    // reverse strand, native, aligned
+                "@?]?!!!]?]?",          // reverse strand, genomic, aligned, clipped
+                "?]?]!!!?]?@"           // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 2H4=3D4=3H");
+        BamRecordTests::CheckPulseQualityTags(
+            "2H4=3D4=3H",       // CIGAR
+            "AACCGTTA",         // seqBases
+            "AAaaCCGggTTA",     // pulseCalls
+            "?]!!?]?!!]?@",     // tag data
+            // Hard clips (H) add nothing to the inputs; expectations equal those of the plain 4=3D4= case.
+            {   // all pulses
+
+                "?]!!?]?!!]?@",     // forward strand, genomic
+                "?]!!?]?!!]?@",     // forward strand, native
+                "",  // forward strand, genomic, aligned
+                "",  // forward strand, native, aligned
+                "",  // forward strand, genomic, aligned, clipped
+                "",  // forward strand, native, aligned, clipped
+                "@?]!!?]?!!]?",     // reverse strand, genomic
+                "?]!!?]?!!]?@",     // reverse strand, native
+                "",  // reverse strand, genomic, aligned
+                "",  // reverse strand, native, aligned
+                "",  // reverse strand, genomic, aligned, clipped
+                ""   // reverse strand, native, aligned, clipped
+            },
+            {   // basecalls only
+
+                "?]?]?]?@",     // forward strand, genomic
+                "?]?]?]?@",     // forward strand, native
+                "?]?]!!!?]?@",  // forward strand, genomic, aligned
+                "?]?]!!!?]?@",  // forward strand, native, aligned
+                "?]?]!!!?]?@",  // forward strand, genomic, aligned, clipped
+                "?]?]!!!?]?@",  // forward strand, native, aligned, clipped
+                "@?]?]?]?",     // reverse strand, genomic
+                "?]?]?]?@",     // reverse strand, native
+                "@?]?!!!]?]?",  // reverse strand, genomic, aligned
+                "?]?]!!!?]?@",  // reverse strand, native, aligned
+                "@?]?!!!]?]?",  // reverse strand, genomic, aligned, clipped
+                "?]?]!!!?]?@"   // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 2H3S4=3D4=3S3H");
+        BamRecordTests::CheckPulseQualityTags(
+            "2H3S4=3D4=3S3H",           // CIGAR
+            "TTTAACCGTTACCG",           // seqBases
+            "TTTttAACCccGTTAaaCCG",     // pulseCalls
+            "vvv!!?]?]!!?]?@!!xxx",     // tag data
+            // Same inputs and expectations as the 3S4=3D4=3S case; only the surrounding hard clips differ.
+            {   // all pulses
+
+                "vvv!!?]?]!!?]?@!!xxx",       // forward strand, genomic
+                "vvv!!?]?]!!?]?@!!xxx",       // forward strand, native
+                "",    // forward strand, genomic, aligned
+                "",    // forward strand, native, aligned
+                "",          // forward strand, genomic, aligned, clipped
+                "",          // forward strand, native, aligned, clipped
+                "xxx!!@?]?!!]?]?!!vvv",       // reverse strand, genomic
+                "vvv!!?]?]!!?]?@!!xxx",       // reverse strand, native
+                "",    // reverse strand, genomic, aligned
+                "",    // reverse strand, native, aligned
+                "",          // reverse strand, genomic, aligned, clipped
+                ""           // reverse strand, native, aligned, clipped
+            },
+            {   // basecalls only
+
+                "vvv?]?]?]?@xxx",       // forward strand, genomic
+                "vvv?]?]?]?@xxx",       // forward strand, native
+                "vvv?]?]!!!?]?@xxx",    // forward strand, genomic, aligned
+                "vvv?]?]!!!?]?@xxx",    // forward strand, native, aligned
+                "?]?]!!!?]?@",          // forward strand, genomic, aligned, clipped
+                "?]?]!!!?]?@",          // forward strand, native, aligned, clipped
+                "xxx@?]?]?]?vvv",       // reverse strand, genomic
+                "vvv?]?]?]?@xxx",       // reverse strand, native
+                "xxx@?]?!!!]?]?vvv",    // reverse strand, genomic, aligned
+                "vvv?]?]!!!?]?@xxx",    // reverse strand, native, aligned
+                "@?]?!!!]?]?",          // reverse strand, genomic, aligned, clipped
+                "?]?]!!!?]?@"           // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+}
+
+TEST(BamRecordTest, PulseFrameTags)
+{   // Calls BamRecordTests::CheckPulseFrameTags once per CIGAR fixture; each braced scope below is one fixture.
+    {
+        SCOPED_TRACE("CIGAR: 4=3D4=");
+        BamRecordTests::CheckPulseFrameTags(
+            "4=3D4=",       // CIGAR
+            "AACCGTTA",     // seqBases
+            "AAaaCCGggTTA", // pulseCalls
+            { 10, 20, 0,0, 10, 20, 10, 0,0, 20, 10, 30 },   // tag data
+            // Two expectation groups follow; each row's trailing comment names its strand/orientation/aligned/clipped case.
+            {   // all pulses
+                // In this group every "aligned" row is given as an empty list.
+                { 10, 20, 0,0, 10, 20, 10, 0,0, 20, 10, 30 },             // forward strand, genomic
+                { 10, 20, 0,0, 10, 20, 10, 0,0, 20, 10, 30 },             // forward strand, native
+                { },    // forward strand, genomic, aligned
+                { },    // forward strand, native, aligned
+                { },    // forward strand, genomic, aligned, clipped
+                { },    // forward strand, native, aligned, clipped
+                { 30, 10, 20, 0,0, 10, 20, 10, 0,0, 20, 10 },             // reverse strand, genomic
+                { 10, 20, 0,0, 10, 20, 10, 0,0, 20, 10, 30 },             // reverse strand, native
+                { },    // reverse strand, genomic, aligned
+                { },    // reverse strand, native, aligned
+                { },    // reverse strand, genomic, aligned, clipped
+                { }     // reverse strand, native, aligned, clipped
+            },
+            {   // basecalls only
+                // Values are the tag data entries at upper-case pulseCalls positions; "aligned" rows also hold a 0 per D/P CIGAR column.
+                { 10, 20, 10, 20, 10, 20, 10, 30 },             // forward strand, genomic
+                { 10, 20, 10, 20, 10, 20, 10, 30 },             // forward strand, native
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned, clipped
+                { 30, 10, 20, 10, 20, 10, 20, 10 },             // reverse strand, genomic
+                { 10, 20, 10, 20, 10, 20, 10, 30 },             // reverse strand, native
+                { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // reverse strand, native, aligned
+                { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }     // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 4=1D2I2D4=");
+        BamRecordTests::CheckPulseFrameTags(
+            "4=1D2I2D4=",       // CIGAR
+            "ATCCTAGGTT",       // seqBases
+            "ATttCCTtAGGggTT",  // pulseCalls
+            { 10, 20, 0, 0, 10, 20, 80, 0, 70, 10, 20, 0, 0, 10, 30 }, // tag data
+
+            {   // all pulses
+
+                { 10, 20, 0, 0, 10, 20, 80, 0, 70, 10, 20, 0, 0, 10, 30 },             // forward strand, genomic
+                { 10, 20, 0, 0, 10, 20, 80, 0, 70, 10, 20, 0, 0, 10, 30 },             // forward strand, native
+                { },    // forward strand, genomic, aligned
+                { },    // forward strand, native, aligned
+                { },    // forward strand, genomic, aligned, clipped
+                { },    // forward strand, native, aligned, clipped
+                { 30, 10, 0, 0, 20, 10, 70, 0, 80, 20, 10, 0, 0, 20, 10 },             // reverse strand, genomic
+                { 10, 20, 0, 0, 10, 20, 80, 0, 70, 10, 20, 0, 0, 10, 30 },             // reverse strand, native
+                { },    // reverse strand, genomic, aligned
+                { },    // reverse strand, native, aligned
+                { },    // reverse strand, genomic, aligned, clipped
+                { }     // reverse strand, native, aligned, clipped
+            },
+            {   // basecalls only
+
+                { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 },             // forward strand, genomic
+                { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 },             // forward strand, native
+                { 10, 20, 10, 20, 0, 80, 70, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 80, 70, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned
+                { 10, 20, 10, 20, 0, 80, 70, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 80, 70, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned, clipped
+                { 30, 10, 20, 10, 70, 80, 20, 10, 20, 10 },             // reverse strand, genomic
+                { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 },             // reverse strand, native
+                { 30, 10, 20, 10, 0, 70, 80, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 0, 80, 70, 0, 10, 20, 10, 30 },    // reverse strand, native, aligned
+                { 30, 10, 20, 10, 0, 70, 80, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 80, 70, 0, 10, 20, 10, 30 }     // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 4=1D2P2I2P2D4=");
+        BamRecordTests::CheckPulseFrameTags(
+            "4=1D2P2I2P2D4=",   // CIGAR
+            "ATCCTAGGTT",       // seqBases
+            "ATttCCTtAGGggTT",  // pulseCalls
+            { 10, 20, 0, 0, 10, 20, 80, 0, 70, 10, 20, 0, 0, 10, 30 }, // tag data
+
+            {   // all pulses
+
+                { 10, 20, 0, 0, 10, 20, 80, 0, 70, 10, 20, 0, 0, 10, 30 },             // forward strand, genomic
+                { 10, 20, 0, 0, 10, 20, 80, 0, 70, 10, 20, 0, 0, 10, 30 },             // forward strand, native
+                { },    // forward strand, genomic, aligned
+                { },    // forward strand, native, aligned
+                { },    // forward strand, genomic, aligned, clipped
+                { },    // forward strand, native, aligned, clipped
+                { 30, 10, 0, 0, 20, 10, 70, 0, 80, 20, 10, 0, 0, 20, 10 },             // reverse strand, genomic
+                { 10, 20, 0, 0, 10, 20, 80, 0, 70, 10, 20, 0, 0, 10, 30 },             // reverse strand, native
+                { },    // reverse strand, genomic, aligned
+                { },    // reverse strand, native, aligned
+                { },    // reverse strand, genomic, aligned, clipped
+                { }     // reverse strand, native, aligned, clipped
+            },
+            {   // basecalls only
+
+                { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 },                         // forward strand, genomic
+                { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 },                         // forward strand, native
+                { 10, 20, 10, 20, 0, 0, 0, 80, 70, 0, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 80, 70, 0, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 80, 70, 0, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 80, 70, 0, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned, clipped
+                { 30, 10, 20, 10, 70, 80, 20, 10, 20, 10 },                         // reverse strand, genomic
+                { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 },                         // reverse strand, native
+                { 30, 10, 20, 10, 0, 0, 0, 70, 80, 0, 0, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 0, 80, 70, 0, 0, 0, 10, 20, 10, 30 },    // reverse strand, native, aligned
+                { 30, 10, 20, 10, 0, 0, 0, 70, 80, 0, 0, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 0, 80, 70, 0, 0, 0, 10, 20, 10, 30 }     // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 3S4=3D4=3S");
+        BamRecordTests::CheckPulseFrameTags(
+            "3S4=3D4=3S",               // CIGAR
+            "TTTAACCGTTACCG",           // seqBases
+            "TTTttAACCccGTTAaaCCG",     // pulseCalls
+            { 40, 40, 40, 0, 0, 10, 20, 10, 20, 0, 0, 10, 20, 10, 30, 0, 0, 50, 50, 50 },   // tag data
+
+            {   // all pulses
+
+                { 40, 40, 40, 0, 0, 10, 20, 10, 20, 0, 0, 10, 20, 10, 30, 0, 0, 50, 50, 50 },             // forward strand, genomic
+                { 40, 40, 40, 0, 0, 10, 20, 10, 20, 0, 0, 10, 20, 10, 30, 0, 0, 50, 50, 50 },             // forward strand, native
+                { },    // forward strand, genomic, aligned
+                { },    // forward strand, native, aligned
+                { },    // forward strand, genomic, aligned, clipped
+                { },    // forward strand, native, aligned, clipped
+                { 50, 50, 50, 0, 0, 30, 10, 20, 10, 0, 0, 20, 10, 20, 10, 0, 0, 40, 40, 40 },             // reverse strand, genomic
+                { 40, 40, 40, 0, 0, 10, 20, 10, 20, 0, 0, 10, 20, 10, 30, 0, 0, 50, 50, 50 },             // reverse strand, native
+                { },    // reverse strand, genomic, aligned
+                { },    // reverse strand, native, aligned
+                { },    // reverse strand, genomic, aligned, clipped
+                { }     // reverse strand, native, aligned, clipped
+            },
+            {   // basecalls only
+
+                { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 },             // forward strand, genomic
+                { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 },             // forward strand, native
+                { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 },    // forward strand, genomic, aligned
+                { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 },    // forward strand, native, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },                            // forward strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },                            // forward strand, native, aligned, clipped
+                { 50, 50, 50, 30, 10, 20, 10, 20, 10, 20, 10, 40, 40, 40 },             // reverse strand, genomic
+                { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 },             // reverse strand, native
+                { 50, 50, 50, 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10, 40, 40, 40 },    // reverse strand, genomic, aligned
+                { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 },    // reverse strand, native, aligned
+                { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 },                            // reverse strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }                             // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 2H4=3D4=3H");
+        BamRecordTests::CheckPulseFrameTags(
+            "2H4=3D4=3H",       // CIGAR
+            "AACCGTTA",         // seqBases
+            "AAaaCCGggTTA",     // pulseCalls
+            { 10, 20, 0, 0, 10, 20, 10, 0, 0, 20, 10, 30 }, // tag data
+
+            {   // all pulses
+
+                { 10, 20, 0, 0, 10, 20, 10, 0, 0, 20, 10, 30 },             // forward strand, genomic
+                { 10, 20, 0, 0, 10, 20, 10, 0, 0, 20, 10, 30 },             // forward strand, native
+                { },    // forward strand, genomic, aligned
+                { },    // forward strand, native, aligned
+                { },    // forward strand, genomic, aligned, clipped
+                { },    // forward strand, native, aligned, clipped
+                { 30, 10, 20, 0, 0, 10, 20, 10, 0, 0, 20, 10 },             // reverse strand, genomic
+                { 10, 20, 0, 0, 10, 20, 10, 0, 0, 20, 10, 30 },             // reverse strand, native
+                { },    // reverse strand, genomic, aligned
+                { },    // reverse strand, native, aligned
+                { },    // reverse strand, genomic, aligned, clipped
+                { }     // reverse strand, native, aligned, clipped
+            },
+            {   // basecalls only
+
+                { 10, 20, 10, 20, 10, 20, 10, 30 },             // forward strand, genomic
+                { 10, 20, 10, 20, 10, 20, 10, 30 },             // forward strand, native
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned, clipped
+                { 30, 10, 20, 10, 20, 10, 20, 10 },             // reverse strand, genomic
+                { 10, 20, 10, 20, 10, 20, 10, 30 },             // reverse strand, native
+                { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // reverse strand, native, aligned
+                { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }     // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 2H3S4=3D4=3S3H");
+        BamRecordTests::CheckPulseFrameTags(
+            "2H3S4=3D4=3S3H",           // CIGAR
+            "TTTAACCGTTACCG",           // seqBases
+            "TTTttAACCccGTTAaaCCG",     // pulseCalls
+            { 40, 40, 40, 0, 0, 10, 20, 10, 20, 0, 0, 10, 20, 10, 30, 0, 0, 50, 50, 50 },                 // tag data
+
+            {   // all pulses
+
+                { 40, 40, 40, 0, 0, 10, 20, 10, 20, 0, 0, 10, 20, 10, 30, 0, 0, 50, 50, 50 },             // forward strand, genomic
+                { 40, 40, 40, 0, 0, 10, 20, 10, 20, 0, 0, 10, 20, 10, 30, 0, 0, 50, 50, 50 },             // forward strand, native
+                { },    // forward strand, genomic, aligned
+                { },    // forward strand, native, aligned
+                { },    // forward strand, genomic, aligned, clipped
+                { },    // forward strand, native, aligned, clipped
+                { 50, 50, 50, 0, 0, 30, 10, 20, 10, 0, 0, 20, 10, 20, 10, 0, 0, 40, 40, 40 },             // reverse strand, genomic
+                { 40, 40, 40, 0, 0, 10, 20, 10, 20, 0, 0, 10, 20, 10, 30, 0, 0, 50, 50, 50 },             // reverse strand, native
+                { },    // reverse strand, genomic, aligned
+                { },    // reverse strand, native, aligned
+                { },    // reverse strand, genomic, aligned, clipped
+                { }     // reverse strand, native, aligned, clipped
+            },
+            {   // basecalls only
+
+                { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 },             // forward strand, genomic
+                { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 },             // forward strand, native
+                { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 },    // forward strand, genomic, aligned
+                { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 },    // forward strand, native, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },                            // forward strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },                            // forward strand, native, aligned, clipped
+                { 50, 50, 50, 30, 10, 20, 10, 20, 10, 20, 10, 40, 40, 40 },             // reverse strand, genomic
+                { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 },             // reverse strand, native
+                { 50, 50, 50, 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10, 40, 40, 40 },    // reverse strand, genomic, aligned
+                { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 },    // reverse strand, native, aligned
+                { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 },                            // reverse strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }                             // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+}
+
+TEST(BamRecordTest, PulseUIntTags)
+{   // Calls BamRecordTests::CheckPulseUIntTags once per CIGAR fixture; each braced scope below is one fixture.
+    {
+        SCOPED_TRACE("CIGAR: 4=3D4=");
+        BamRecordTests::CheckPulseUIntTags(
+            "4=3D4=",       // CIGAR
+            "AACCGTTA",     // seqBases
+            "AAaaCCGggTTA", // pulseCalls
+            { 10, 20, 0,0, 10, 20, 10, 0,0, 20, 10, 30 },   // tag data
+            // Two expectation groups follow; each row's trailing comment names its strand/orientation/aligned/clipped case.
+            {   // all pulses
+                // In this group every "aligned" row is given as an empty list.
+                { 10, 20, 0,0, 10, 20, 10, 0,0, 20, 10, 30 },             // forward strand, genomic
+                { 10, 20, 0,0, 10, 20, 10, 0,0, 20, 10, 30 },             // forward strand, native
+                { },    // forward strand, genomic, aligned
+                { },    // forward strand, native, aligned
+                { },    // forward strand, genomic, aligned, clipped
+                { },    // forward strand, native, aligned, clipped
+                { 30, 10, 20, 0,0, 10, 20, 10, 0,0, 20, 10 },             // reverse strand, genomic
+                { 10, 20, 0,0, 10, 20, 10, 0,0, 20, 10, 30 },             // reverse strand, native
+                { },    // reverse strand, genomic, aligned
+                { },    // reverse strand, native, aligned
+                { },    // reverse strand, genomic, aligned, clipped
+                { }     // reverse strand, native, aligned, clipped
+            },
+            {   // basecalls only
+                // Values are the tag data entries at upper-case pulseCalls positions; "aligned" rows also hold a 0 per D/P CIGAR column.
+                { 10, 20, 10, 20, 10, 20, 10, 30 },             // forward strand, genomic
+                { 10, 20, 10, 20, 10, 20, 10, 30 },             // forward strand, native
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned, clipped
+                { 30, 10, 20, 10, 20, 10, 20, 10 },             // reverse strand, genomic
+                { 10, 20, 10, 20, 10, 20, 10, 30 },             // reverse strand, native
+                { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // reverse strand, native, aligned
+                { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }     // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 4=1D2I2D4=");
+        BamRecordTests::CheckPulseUIntTags(
+            "4=1D2I2D4=",       // CIGAR
+            "ATCCTAGGTT",       // seqBases
+            "ATttCCTtAGGggTT",  // pulseCalls
+            { 10, 20, 0, 0, 10, 20, 80, 0, 70, 10, 20, 0, 0, 10, 30 }, // tag data
+
+            {   // all pulses
+
+                { 10, 20, 0, 0, 10, 20, 80, 0, 70, 10, 20, 0, 0, 10, 30 },             // forward strand, genomic
+                { 10, 20, 0, 0, 10, 20, 80, 0, 70, 10, 20, 0, 0, 10, 30 },             // forward strand, native
+                { },    // forward strand, genomic, aligned
+                { },    // forward strand, native, aligned
+                { },    // forward strand, genomic, aligned, clipped
+                { },    // forward strand, native, aligned, clipped
+                { 30, 10, 0, 0, 20, 10, 70, 0, 80, 20, 10, 0, 0, 20, 10 },             // reverse strand, genomic
+                { 10, 20, 0, 0, 10, 20, 80, 0, 70, 10, 20, 0, 0, 10, 30 },             // reverse strand, native
+                { },    // reverse strand, genomic, aligned
+                { },    // reverse strand, native, aligned
+                { },    // reverse strand, genomic, aligned, clipped
+                { }     // reverse strand, native, aligned, clipped
+            },
+            {   // basecalls only
+
+                { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 },             // forward strand, genomic
+                { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 },             // forward strand, native
+                { 10, 20, 10, 20, 0, 80, 70, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 80, 70, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned
+                { 10, 20, 10, 20, 0, 80, 70, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 80, 70, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned, clipped
+                { 30, 10, 20, 10, 70, 80, 20, 10, 20, 10 },             // reverse strand, genomic
+                { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 },             // reverse strand, native
+                { 30, 10, 20, 10, 0, 70, 80, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 0, 80, 70, 0, 10, 20, 10, 30 },    // reverse strand, native, aligned
+                { 30, 10, 20, 10, 0, 70, 80, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 80, 70, 0, 10, 20, 10, 30 }     // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 4=1D2P2I2P2D4=");
+        BamRecordTests::CheckPulseUIntTags(
+            "4=1D2P2I2P2D4=",   // CIGAR
+            "ATCCTAGGTT",       // seqBases
+            "ATttCCTtAGGggTT",  // pulseCalls
+            { 10, 20, 0, 0, 10, 20, 80, 0, 70, 10, 20, 0, 0, 10, 30 }, // tag data
+
+            {   // all pulses
+
+                { 10, 20, 0, 0, 10, 20, 80, 0, 70, 10, 20, 0, 0, 10, 30 },             // forward strand, genomic
+                { 10, 20, 0, 0, 10, 20, 80, 0, 70, 10, 20, 0, 0, 10, 30 },             // forward strand, native
+                { },    // forward strand, genomic, aligned
+                { },    // forward strand, native, aligned
+                { },    // forward strand, genomic, aligned, clipped
+                { },    // forward strand, native, aligned, clipped
+                { 30, 10, 0, 0, 20, 10, 70, 0, 80, 20, 10, 0, 0, 20, 10 },             // reverse strand, genomic
+                { 10, 20, 0, 0, 10, 20, 80, 0, 70, 10, 20, 0, 0, 10, 30 },             // reverse strand, native
+                { },    // reverse strand, genomic, aligned
+                { },    // reverse strand, native, aligned
+                { },    // reverse strand, genomic, aligned, clipped
+                { }     // reverse strand, native, aligned, clipped
+            },
+            {   // basecalls only
+
+                { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 },                         // forward strand, genomic
+                { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 },                         // forward strand, native
+                { 10, 20, 10, 20, 0, 0, 0, 80, 70, 0, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 80, 70, 0, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 80, 70, 0, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 80, 70, 0, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned, clipped
+                { 30, 10, 20, 10, 70, 80, 20, 10, 20, 10 },                         // reverse strand, genomic
+                { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 },                         // reverse strand, native
+                { 30, 10, 20, 10, 0, 0, 0, 70, 80, 0, 0, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 0, 80, 70, 0, 0, 0, 10, 20, 10, 30 },    // reverse strand, native, aligned
+                { 30, 10, 20, 10, 0, 0, 0, 70, 80, 0, 0, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 0, 80, 70, 0, 0, 0, 10, 20, 10, 30 }     // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 3S4=3D4=3S");
+        BamRecordTests::CheckPulseUIntTags(
+            "3S4=3D4=3S",               // CIGAR
+            "TTTAACCGTTACCG",           // seqBases
+            "TTTttAACCccGTTAaaCCG",     // pulseCalls
+            { 40, 40, 40, 0, 0, 10, 20, 10, 20, 0, 0, 10, 20, 10, 30, 0, 0, 50, 50, 50 },   // tag data
+
+            {   // all pulses
+
+                { 40, 40, 40, 0, 0, 10, 20, 10, 20, 0, 0, 10, 20, 10, 30, 0, 0, 50, 50, 50 },             // forward strand, genomic
+                { 40, 40, 40, 0, 0, 10, 20, 10, 20, 0, 0, 10, 20, 10, 30, 0, 0, 50, 50, 50 },             // forward strand, native
+                { },    // forward strand, genomic, aligned
+                { },    // forward strand, native, aligned
+                { },    // forward strand, genomic, aligned, clipped
+                { },    // forward strand, native, aligned, clipped
+                { 50, 50, 50, 0, 0, 30, 10, 20, 10, 0, 0, 20, 10, 20, 10, 0, 0, 40, 40, 40 },             // reverse strand, genomic
+                { 40, 40, 40, 0, 0, 10, 20, 10, 20, 0, 0, 10, 20, 10, 30, 0, 0, 50, 50, 50 },             // reverse strand, native
+                { },    // reverse strand, genomic, aligned
+                { },    // reverse strand, native, aligned
+                { },    // reverse strand, genomic, aligned, clipped
+                { }     // reverse strand, native, aligned, clipped
+            },
+            {   // basecalls only
+
+                { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 },             // forward strand, genomic
+                { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 },             // forward strand, native
+                { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 },    // forward strand, genomic, aligned
+                { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 },    // forward strand, native, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },                            // forward strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },                            // forward strand, native, aligned, clipped
+                { 50, 50, 50, 30, 10, 20, 10, 20, 10, 20, 10, 40, 40, 40 },             // reverse strand, genomic
+                { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 },             // reverse strand, native
+                { 50, 50, 50, 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10, 40, 40, 40 },    // reverse strand, genomic, aligned
+                { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 },    // reverse strand, native, aligned
+                { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 },                            // reverse strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }                             // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 2H4=3D4=3H");
+        BamRecordTests::CheckPulseUIntTags(
+            "2H4=3D4=3H",       // CIGAR
+            "AACCGTTA",         // seqBases
+            "AAaaCCGggTTA",     // pulseCalls
+            { 10, 20, 0, 0, 10, 20, 10, 0, 0, 20, 10, 30 }, // tag data
+
+            {   // all pulses
+
+                { 10, 20, 0, 0, 10, 20, 10, 0, 0, 20, 10, 30 },             // forward strand, genomic
+                { 10, 20, 0, 0, 10, 20, 10, 0, 0, 20, 10, 30 },             // forward strand, native
+                { },    // forward strand, genomic, aligned
+                { },    // forward strand, native, aligned
+                { },    // forward strand, genomic, aligned, clipped
+                { },    // forward strand, native, aligned, clipped
+                { 30, 10, 20, 0, 0, 10, 20, 10, 0, 0, 20, 10 },             // reverse strand, genomic
+                { 10, 20, 0, 0, 10, 20, 10, 0, 0, 20, 10, 30 },             // reverse strand, native
+                { },    // reverse strand, genomic, aligned
+                { },    // reverse strand, native, aligned
+                { },    // reverse strand, genomic, aligned, clipped
+                { }     // reverse strand, native, aligned, clipped
+            },
+            {   // basecalls only
+
+                { 10, 20, 10, 20, 10, 20, 10, 30 },             // forward strand, genomic
+                { 10, 20, 10, 20, 10, 20, 10, 30 },             // forward strand, native
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned, clipped
+                { 30, 10, 20, 10, 20, 10, 20, 10 },             // reverse strand, genomic
+                { 10, 20, 10, 20, 10, 20, 10, 30 },             // reverse strand, native
+                { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // reverse strand, native, aligned
+                { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }     // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 2H3S4=3D4=3S3H");
+        BamRecordTests::CheckPulseUIntTags(
+            "2H3S4=3D4=3S3H",           // CIGAR
+            "TTTAACCGTTACCG",           // seqBases
+            "TTTttAACCccGTTAaaCCG",     // pulseCalls
+            { 40, 40, 40, 0, 0, 10, 20, 10, 20, 0, 0, 10, 20, 10, 30, 0, 0, 50, 50, 50 },                 // tag data
+
+            {   // all pulses
+
+                { 40, 40, 40, 0, 0, 10, 20, 10, 20, 0, 0, 10, 20, 10, 30, 0, 0, 50, 50, 50 },             // forward strand, genomic
+                { 40, 40, 40, 0, 0, 10, 20, 10, 20, 0, 0, 10, 20, 10, 30, 0, 0, 50, 50, 50 },             // forward strand, native
+                { },    // forward strand, genomic, aligned
+                { },    // forward strand, native, aligned
+                { },    // forward strand, genomic, aligned, clipped
+                { },    // forward strand, native, aligned, clipped
+                { 50, 50, 50, 0, 0, 30, 10, 20, 10, 0, 0, 20, 10, 20, 10, 0, 0, 40, 40, 40 },             // reverse strand, genomic
+                { 40, 40, 40, 0, 0, 10, 20, 10, 20, 0, 0, 10, 20, 10, 30, 0, 0, 50, 50, 50 },             // reverse strand, native
+                { },    // reverse strand, genomic, aligned
+                { },    // reverse strand, native, aligned
+                { },    // reverse strand, genomic, aligned, clipped
+                { }     // reverse strand, native, aligned, clipped
+            },
+            {   // basecalls only
+
+                { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 },             // forward strand, genomic
+                { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 },             // forward strand, native
+                { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 },    // forward strand, genomic, aligned
+                { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 },    // forward strand, native, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },                            // forward strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },                            // forward strand, native, aligned, clipped
+                { 50, 50, 50, 30, 10, 20, 10, 20, 10, 20, 10, 40, 40, 40 },             // reverse strand, genomic
+                { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 },             // reverse strand, native
+                { 50, 50, 50, 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10, 40, 40, 40 },    // reverse strand, genomic, aligned
+                { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 },    // reverse strand, native, aligned
+                { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 },                            // reverse strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }                             // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+}
+
+TEST(BamRecordTest, PulseExclusionTag)
+{   // Round-trips a list of PulseExclusionReason values through the record's setter and getter.
+    const std::vector<PacBio::BAM::PulseExclusionReason> reasons =
+    {
+        PulseExclusionReason::BASE
+      , PulseExclusionReason::PAUSE
+      , PulseExclusionReason::SHORT_PULSE
+      , PulseExclusionReason::BURST
+      , PulseExclusionReason::BASE
+      , PulseExclusionReason::PAUSE
+    };
+    // store the reasons on a record built by BamRecordTests::CreateBam()
+    auto bam = BamRecordTests::CreateBam();
+    bam.PulseExclusionReason(reasons);
+    // the record must now report the tag and return the same sequence it was given
+    EXPECT_TRUE(bam.HasPulseExclusion());
+    auto result = bam.PulseExclusionReason();
+    EXPECT_EQ(reasons, result);
+
+}
+
+TEST(BamRecordTest, TranscriptRecord)
+{   // A record named "transcript/1234" must report TRANSCRIPT type and hole number 1234, and throw on QueryStart/QueryEnd.
+    const std::string readTypeStr{"TRANSCRIPT"};
+    const auto readGroupId = MakeReadGroupId("transcript", readTypeStr);
+
+    ReadGroupInfo rg{readGroupId};
+    rg.ReadType(readTypeStr);
+    // NOTE(review): rg is configured above but never attached to header or bam in this test — confirm that is intended.
+    BamHeader header;
+    header.Version("1.1")
+        .SortOrder("queryname")
+        .PacBioBamVersion("3.0.1");
+
+    BamRecord bam{header};
+    bam.Impl().Name("transcript/1234");
+    // expectations: type, hole number, then std::runtime_error from both query-position accessors
+    EXPECT_EQ(RecordType::TRANSCRIPT, bam.Type());
+    EXPECT_EQ(1234, bam.HoleNumber());
+    EXPECT_THROW({bam.QueryStart();}, std::runtime_error);
+    EXPECT_THROW({bam.QueryEnd();}, std::runtime_error);
+}
+
+TEST(BamRecordTest, NumDeletedBasesExcludesRefskips)
+{   // Reads the first record of refskip.bam and checks NumDeletedBases() against a fixed count.
+    const std::string file{PbbamTestsConfig::Data_Dir + "/refskip.bam"};
+
+    BamRecord record;
+    BamReader reader{file};
+    ASSERT_TRUE(reader.GetNext(record));  // fatal assertion: nothing below runs if no record was read
+
+    // Buggy version had (numDel:4882) due to miscounting of ref-skip CIGAR ops.
+
+    const size_t expectedNumDel = 73;
+    EXPECT_EQ(expectedNumDel, record.NumDeletedBases());
+}
+
+TEST(BamRecordTest, SupportWallStartEndTags)
+{   // Sets query start/end frame numbers and checks the "ws"/"we" tags plus the typed Has*/getter accessors.
+    const int ws = 100;
+    const int we = 500;
+
+    auto bam = BamRecordTests::CreateBam();
+    bam.QueryStartFrameNumber(ws);
+    bam.QueryEndFrameNumber(we);
+    // after the setters, the record must expose tags named "ws" and "we"
+    EXPECT_TRUE(bam.Impl().HasTag("ws"));
+    EXPECT_TRUE(bam.Impl().HasTag("we"));
+    // the typed presence checks must agree
+    EXPECT_TRUE(bam.HasQueryStartFrameNumber());
+    EXPECT_TRUE(bam.HasQueryEndFrameNumber());
+    // and the getters must return the values that were set
+    EXPECT_EQ(ws, bam.QueryStartFrameNumber());
+    EXPECT_EQ(we, bam.QueryEndFrameNumber());
+}
+
+// clang-format on
diff --git a/tests/src/test_BamRecordBuilder.cpp b/tests/src/test_BamRecordBuilder.cpp

new file mode 100644 (file)

index 0000000..69a3880
--- /dev/null
+++ b/tests/src/test_BamRecordBuilder.cpp
@@ -0,0 +1,174 @@
+// Author: Derek Barnett
+
+#include <chrono>
+#include <cstddef>
+#include <cstdint>
+#include <string>
+
+#include <gtest/gtest.h>
+
+#include <pbbam/BamRecordBuilder.h>
+#include <pbbam/BamTagCodec.h>
+#include "../src/MemoryUtils.h"
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+namespace BamRecordBuilderTests {
+
+static void CheckRawData(const BamRecordImpl& bam)
+{
+    // ensure raw data (lengths at least) matches API-facing data
+    // expected lengths are all derived from the API-level accessors
+    const uint32_t expectedNameLength = bam.Name().size() + 1;  // +1: presumably the NULL terminator
+    const uint32_t expectedNumCigarOps = bam.CigarData().size();
+    const int32_t expectedSeqLength = bam.Sequence().length();
+    const size_t expectedTagsLength = BamTagCodec::Encode(bam.Tags()).size();
+
+    //  Name        CIGAR         Sequence       Quals      Tags
+    // l_qname + (n_cigar * 4) + (l_qseq+1)/2 + l_qseq + << TAGS >>
+    // (the sum is stored as a plain int for the comparison against l_data below)
+    const int expectedTotalDataLength = expectedNameLength + (expectedNumCigarOps * 4) +
+                                        (expectedSeqLength + 1) / 2 + expectedSeqLength +
+                                        expectedTagsLength;
+    // fetch the raw record; the remaining checks are skipped if it is not available
+    const auto rawData = PacBio::BAM::BamRecordMemory::GetRawData(bam);
+    ASSERT_TRUE(static_cast<bool>(rawData));
+    EXPECT_EQ(expectedNameLength, rawData->core.l_qname);
+    EXPECT_EQ(expectedNumCigarOps, rawData->core.n_cigar);
+    EXPECT_EQ(expectedSeqLength, rawData->core.l_qseq);
+    EXPECT_EQ(expectedTotalDataLength, rawData->l_data);
+}
+
+static void CheckRawData(const BamRecord& bam) { CheckRawData(bam.Impl()); }  // overload: forwards the record's Impl()
+
+}  // namespace BamRecordBuilderTests
+
+TEST(BamRecordBuilderTest, DefaultValues)
+{   // A record built from an untouched BamRecordBuilder: checks raw fields, then the API view.
+    BamRecordBuilder builder;
+    BamRecord bam = builder.Build();
+    // raw view of the same record; the test stops here if it is not available
+    const auto rawData = PacBio::BAM::BamRecordMemory::GetRawData(bam);
+    ASSERT_TRUE(static_cast<bool>(rawData));
+
+    // fixed-length (core) data
+    EXPECT_EQ(0, rawData->core.tid);
+    EXPECT_EQ(0, rawData->core.pos);
+    EXPECT_EQ(0, rawData->core.bin);
+    EXPECT_EQ(0, rawData->core.qual);
+    EXPECT_EQ(1, rawData->core.l_qname);  // initialized w/ NULL-term
+    EXPECT_EQ(0, rawData->core.flag);
+    EXPECT_EQ(0, rawData->core.n_cigar);
+    EXPECT_EQ(0, rawData->core.l_qseq);
+    EXPECT_EQ(0, rawData->core.mtid);
+    EXPECT_EQ(0, rawData->core.mpos);
+    EXPECT_EQ(0, rawData->core.isize);
+
+    // variable length data
+    EXPECT_TRUE(rawData->data != nullptr);
+    EXPECT_EQ(1, rawData->l_data);
+    EXPECT_EQ(int{0x800}, rawData->m_data);  // check this if we change or tune later
+
+    // -------------------------------
+    // check data via API calls
+    // -------------------------------
+
+    EXPECT_EQ(0, bam.Impl().Bin());
+    EXPECT_EQ(0, bam.Impl().Flag());
+    EXPECT_EQ(0, bam.Impl().InsertSize());
+    EXPECT_EQ(0, bam.Impl().MapQuality());
+    EXPECT_EQ(0, bam.Impl().MateReferenceId());
+    EXPECT_EQ(0, bam.Impl().MatePosition());
+    EXPECT_EQ(0, bam.Impl().Position());
+    EXPECT_EQ(0, bam.Impl().ReferenceId());
+    EXPECT_EQ(0, bam.Impl().Tags().size());
+    // boolean predicates expected on a default record (Flag() is 0, see above)
+    EXPECT_FALSE(bam.Impl().IsDuplicate());
+    EXPECT_FALSE(bam.Impl().IsFailedQC());
+    EXPECT_FALSE(bam.Impl().IsFirstMate());
+    EXPECT_TRUE(bam.Impl().IsMapped());
+    EXPECT_TRUE(bam.Impl().IsMateMapped());
+    EXPECT_FALSE(bam.Impl().IsMateReverseStrand());
+    EXPECT_FALSE(bam.Impl().IsPaired());
+    EXPECT_TRUE(bam.Impl().IsPrimaryAlignment());
+    EXPECT_FALSE(bam.Impl().IsProperPair());
+    EXPECT_FALSE(bam.Impl().IsReverseStrand());
+    EXPECT_FALSE(bam.Impl().IsSecondMate());
+    EXPECT_FALSE(bam.Impl().IsSupplementaryAlignment());
+    // name, CIGAR, sequence and qualities are all expected to be empty
+    const std::string emptyString = "";
+    EXPECT_EQ(emptyString, bam.Impl().Name());
+    EXPECT_EQ(emptyString, bam.Impl().CigarData().ToStdString());
+    EXPECT_EQ(emptyString, bam.Impl().Sequence());
+    EXPECT_EQ(emptyString, bam.Impl().Qualities().Fastq());
+    BamRecordBuilderTests::CheckRawData(bam);  // raw lengths must agree with the API view
+}
+
+TEST(BamRecordBuilderTest, CheckSetters)
+{   // Sets every fixed-length field to 42 plus three tags, then checks raw fields and the API view.
+    // should be 28 bytes, encoded
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+    // chained setters: all numeric fields get 42, and the tags above are attached
+    BamRecordBuilder builder;
+    builder.Bin(42)
+        .Flag(42)
+        .InsertSize(42)
+        .MapQuality(42)
+        .MatePosition(42)
+        .MateReferenceId(42)
+        .Position(42)
+        .ReferenceId(42)
+        .Tags(tags);
+
+    BamRecord bam = builder.Build();
+
+    // -------------------------------
+    // check raw data
+    // -------------------------------
+
+    const auto rawData = PacBio::BAM::BamRecordMemory::GetRawData(bam);
+    ASSERT_TRUE(static_cast<bool>(rawData));
+
+    // fixed-length (core) data
+    EXPECT_EQ(42, rawData->core.tid);
+    EXPECT_EQ(42, rawData->core.pos);
+    EXPECT_EQ(42, rawData->core.bin);
+    EXPECT_EQ(42, rawData->core.qual);
+    EXPECT_EQ(1, rawData->core.l_qname);  // initialized w/ NULL-term
+    EXPECT_EQ(42, rawData->core.flag);
+    EXPECT_EQ(0, rawData->core.n_cigar);
+    EXPECT_EQ(0, rawData->core.l_qseq);
+    EXPECT_EQ(42, rawData->core.mtid);
+    EXPECT_EQ(42, rawData->core.mpos);
+    EXPECT_EQ(42, rawData->core.isize);
+
+    // variable length data
+    EXPECT_TRUE(rawData->data != nullptr);
+    EXPECT_EQ(29, rawData->l_data);          // NULL-term qname + tags
+    EXPECT_EQ(int{0x800}, rawData->m_data);  // check this if we change or tune later
+
+    // -------------------------------
+    // check data via API calls
+    // -------------------------------
+
+    EXPECT_EQ(42, bam.Impl().Bin());
+    EXPECT_EQ(42, bam.Impl().Flag());
+    EXPECT_EQ(42, bam.Impl().InsertSize());
+    EXPECT_EQ(42, bam.Impl().MapQuality());
+    EXPECT_EQ(42, bam.Impl().MateReferenceId());
+    EXPECT_EQ(42, bam.Impl().MatePosition());
+    EXPECT_EQ(42, bam.Impl().Position());
+    EXPECT_EQ(42, bam.Impl().ReferenceId());
+    // tags must come back with the same values and the HEX_STRING modifier intact
+    const TagCollection fetchedTags = bam.Impl().Tags();
+
+    EXPECT_TRUE(fetchedTags.at("HX").HasModifier(TagModifier::HEX_STRING));
+    EXPECT_EQ(std::string("1abc75"), fetchedTags.at("HX").ToString());
+    EXPECT_EQ(int32_t{-42}, fetchedTags.at("XY").ToInt32());
+    EXPECT_EQ(std::vector<uint8_t>({34, 5, 125}), fetchedTags.at("CA").ToUInt8Array());
+}
diff --git a/tests/src/test_BamRecordClipping.cpp b/tests/src/test_BamRecordClipping.cpp

new file mode 100644 (file)

index 0000000..9478a62
--- /dev/null
+++ b/tests/src/test_BamRecordClipping.cpp
@@ -0,0 +1,2401 @@
+// Author: Derek Barnett
+
+#include <chrono>
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include "PbbamTestData.h"
+
+#include <pbbam/BamReader.h>
+#include <pbbam/BamRecord.h>
+#include <pbbam/BamRecordView.h>
+#include <pbbam/BamTagCodec.h>
+#include <pbbam/EntireFileQuery.h>
+
+// clang-format off
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+typedef std::vector<uint16_t> f_data;
+
+namespace BamRecordClippingTests {
+
+static
+ReadGroupInfo MakeReadGroup(const FrameCodec codec,
+                            const std::string& movieName,
+                            const std::string& readType)
+{   // Builds a read group for movieName/readType that uses 'codec' for both IPD and pulse width.
+    ReadGroupInfo rg{movieName, readType};
+    rg.IpdCodec(codec);
+    rg.PulseWidthCodec(codec);
+    return rg;
+}
+
+static
+BamRecord MakeRecord(const Position qStart,
+                     const Position qEnd,
+                     const std::string& seq,
+                     const std::string& quals,
+                     const std::string& tagBases,
+                     const std::string& tagQuals,
+                     const f_data& frames,
+                     const std::string& pulseCall = "",
+                     const std::string& pulseBases = "",
+                     const std::string& pulseQuals = "",
+                     const f_data& pulseFrames = f_data(),
+                     const FrameCodec codec = FrameCodec::RAW)
+{   // Builds a test record with query bounds, sequence/qualities and the tag set listed below.
+    BamRecordImpl impl;
+    impl.SetSequenceAndQualities(seq, quals);
+    // per-base tags share tagBases/tagQuals/frames; per-pulse tags share the pulse* inputs
+    TagCollection tags;
+    tags["qs"] = qStart;        // qStart
+    tags["qe"] = qEnd;          // qEnd
+    tags["dt"] = tagBases;      // deletionTag
+    tags["st"] = tagBases;      // substitutionTag
+    tags["dq"] = tagQuals;      // deletionQV
+    tags["iq"] = tagQuals;      // insertionQV
+    tags["mq"] = tagQuals;      // mergeQV
+    tags["sq"] = tagQuals;      // substitutionQV
+    tags["ip"] = frames;        // IPD
+    tags["pw"] = frames;        // pulseWidth
+    tags["pc"] = pulseCall;     // pulseCall
+    tags["pt"] = pulseBases;    // altLabelTag
+    tags["pq"] = pulseQuals;    // labelQV
+    tags["pv"] = pulseQuals;    // altLabelQV
+    tags["pg"] = pulseQuals;    // pulseMergeQV
+    tags["pa"] = pulseFrames;   // pkmean
+    tags["pm"] = pulseFrames;   // pkmid
+    impl.Tags(tags);
+    // read group is always "movie"/"SUBREAD"; only the frame codec is caller-controlled
+    const auto rg = MakeReadGroup(codec, "movie", "SUBREAD");
+    // add the read group to the record's own header, then assign it to the record
+    BamRecord bam(std::move(impl));
+    bam.header_.AddReadGroup(rg);
+    bam.ReadGroup(rg);
+    return bam;
+}
+
+static
+BamRecord MakeCCSRecord(const std::string& seq,
+                        const std::string& quals,
+                        const std::string& tagBases,
+                        const std::string& tagQuals,
+                        const f_data& frames,
+                        const std::string& pulseCall = "",
+                        const std::string& pulseBases = "",
+                        const std::string& pulseQuals = "",
+                        const f_data& pulseFrames = f_data(),
+                        const FrameCodec codec = FrameCodec::RAW)
+{   // Builds a test record named "movie/42/ccs"; unlike MakeRecord, no "qs"/"qe" tags are set.
+    BamRecordImpl impl;
+    impl.Name("movie/42/ccs");
+    impl.SetSequenceAndQualities(seq, quals);
+    // per-base tags share tagBases/tagQuals/frames; per-pulse tags share the pulse* inputs
+    TagCollection tags;
+    tags["dt"] = tagBases;      // deletionTag
+    tags["st"] = tagBases;      // substitutionTag
+    tags["dq"] = tagQuals;      // deletionQV
+    tags["iq"] = tagQuals;      // insertionQV
+    tags["mq"] = tagQuals;      // mergeQV
+    tags["sq"] = tagQuals;      // substitutionQV
+    tags["ip"] = frames;        // IPD
+    tags["pw"] = frames;        // pulseWidth
+    tags["pc"] = pulseCall;     // pulseCall
+    tags["pt"] = pulseBases;    // altLabelTag
+    tags["pq"] = pulseQuals;    // labelQV
+    tags["pv"] = pulseQuals;    // altLabelQV
+    tags["pg"] = pulseQuals;    // pulseMergeQV
+    tags["pa"] = pulseFrames;   // pkmean
+    tags["pm"] = pulseFrames;   // pkmid
+    impl.Tags(tags);
+    // read group is always "movie"/"CCS"; only the frame codec is caller-controlled
+    const auto rg = MakeReadGroup(codec, "movie", "CCS");
+    // add the read group to the record's own header, then assign it to the record
+    BamRecord bam(std::move(impl));
+    bam.header_.AddReadGroup(rg);
+    bam.ReadGroup(rg);
+    return bam;
+}
+
+} // namespace BamRecordClippingTests
+
+TEST(BamRecordClippingTest, ClipToQuery_Basic)
+{   // Clips one prototype record to a query window, unmapped and mapped (3 CIGARs x 2 strands).
+    const Position qStart  = 500;
+    const Position qEnd    = 510;
+    const std::string seq       = "AACCGTTAGC";
+    const std::string quals     = "?]?]?]?]?*";
+    const std::string tagBases  = "AACCGTTAGC";
+    const std::string tagQuals  = "?]?]?]?]?*";
+    const f_data frames    = { 10, 10, 20, 20, 30, 40, 40, 10, 30, 20 };
+    // pulse-level fixtures: the uppercase letters of pulseCall spell out seq
+    const std::string pulseCall   = "ttAaAtaCCGggatTTAcatGCt";
+    const std::string pulseBases  = pulseCall;
+    const std::string pulseQuals  = "==?=]==?]?====]?]===?*=";
+    const f_data pulseFrames = { 0,0,10,0,10,0,0,20,20,30,0,0,0,0,40,40,10,0,0,0,30,20,0 };
+    // alignment parameters used by the mapped variants below
+    const int32_t  tId     = 0;
+    const Position tPos    = 100;
+    const uint8_t  mapQual = 80;
+    // clip window in query coordinates: 2 bases off the query start, 1 off the query end
+    const Position clipStart = 502;
+    const Position clipEnd   = 509;
+    // expected data after clipping (forward-strand and unmapped cases)
+    const std::string seq_clipped      = "CCGTTAG";
+    const std::string quals_clipped    = "?]?]?]?";
+    const std::string tagBases_clipped = "CCGTTAG";
+    const std::string tagQuals_clipped = "?]?]?]?";
+    const f_data frames_clipped   = { 20, 20, 30, 40, 40, 10, 30 };
+
+    const std::string pulseCall_clipped = "CCGggatTTAcatG";
+    const std::string pulseQuals_clipped = "?]?====]?]===?";
+    const f_data pulseFrames_clipped = { 20,20,30,0,0,0,0,40,40,10,0,0,0,30 };
+    // reverse-strand fixtures: seq_rev is the reverse complement of seq
+    const std::string seq_rev       = "GCTAACGGTT";
+    const std::string pulseCall_rev = "aGCatgTAAatccCGGtaTtTaa";
+    const std::string quals_rev     = "*?]?]?]?]?";
+    const std::string tagQuals_rev  = quals_rev;
+    const f_data frames_rev    = { 20, 30, 10, 40, 40, 30, 20, 20, 10, 10 };
+
+    const std::string seq_rev_clipped   = "CTAACGG";
+    const std::string quals_rev_clipped = "?]?]?]?";
+    const std::string tagBases_rev_clipped = seq_rev_clipped;
+    const std::string tagQuals_rev_clipped = quals_rev_clipped;
+    const f_data frames_rev_clipped = { 30, 10, 40, 40, 30, 20, 20 };
+
+    const std::string pulseCall_rev_clipped = "CatgTAAatccCGG";
+    const std::string pulseQuals_rev_clipped    = "?===]?]====?]?";
+    const f_data pulseFrames_rev_clipped = { 30,0,0,0,10,40,40,0,0,0,0,30,20,20 };
+    // CIGARs per scenario: s1 = matches only, s2 = with a deletion, s3 = deletions and an insertion
+    const std::string s1_cigar = "10=";
+    const std::string s2_cigar = "5=3D5=";
+    const std::string s3_cigar = "4=1D2I2D4=";
+
+    const std::string s1_cigar_clipped = "7=";
+    const std::string s2_cigar_clipped = "3=3D4=";
+    const std::string s3_cigar_clipped = "2=1D2I2D3=";
+
+    const std::string s1_rev_cigar_clipped = "7=";
+    const std::string s2_rev_cigar_clipped = "4=3D3=";
+    const std::string s3_rev_cigar_clipped = "3=1D2I2D2=";
+
+    const BamRecord prototype = BamRecordClippingTests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames,
+                                                  pulseCall, pulseBases, pulseQuals, pulseFrames);
+
+    {
+        SCOPED_TRACE("s0");
+
+        BamRecord s0 = prototype; // unmapped record
+        s0.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd);
+
+        EXPECT_FALSE(s0.IsMapped());
+        EXPECT_EQ(clipStart, s0.QueryStart());
+        EXPECT_EQ(clipEnd,   s0.QueryEnd());
+        EXPECT_EQ(PacBio::BAM::UnmappedPosition, s0.AlignedStart());
+        EXPECT_EQ(PacBio::BAM::UnmappedPosition, s0.AlignedEnd());
+        EXPECT_EQ(PacBio::BAM::UnmappedPosition, s0.ReferenceStart());
+        EXPECT_EQ(PacBio::BAM::UnmappedPosition, s0.ReferenceEnd());
+
+        const BamRecordView view
+        {
+            s0,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(seq_clipped,       view.Sequence());
+        EXPECT_EQ(quals_clipped,     view.Qualities().Fastq());
+        EXPECT_EQ(tagBases_clipped,  view.DeletionTags());
+        EXPECT_EQ(tagQuals_clipped,  view.DeletionQVs().Fastq());
+        EXPECT_EQ(pulseQuals_clipped, view.LabelQVs().Fastq());
+        EXPECT_EQ(pulseQuals_clipped, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames_clipped,    view.IPD().Data());
+        EXPECT_EQ(pulseCall_clipped, view.PulseCalls());
+    }
+    {
+        SCOPED_TRACE("s1 - FORWARD");
+
+        BamRecord s1 = prototype.Mapped(tId, tPos, Strand::FORWARD, s1_cigar, mapQual);
+        s1.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd);
+
+        EXPECT_TRUE(s1.IsMapped());
+        EXPECT_EQ(Strand::FORWARD, s1.AlignedStrand());
+        EXPECT_EQ(clipStart, s1.QueryStart());
+        EXPECT_EQ(clipEnd,   s1.QueryEnd());
+        EXPECT_EQ(clipStart, s1.AlignedStart());   // queryStart (no soft clips)
+        EXPECT_EQ(clipEnd,   s1.AlignedEnd());     // alignStart + seqLength
+        EXPECT_EQ(102, s1.ReferenceStart());       // 100 + startOffset
+        EXPECT_EQ(109, s1.ReferenceEnd());         // RefStart + 7=
+
+        EXPECT_EQ(s1_cigar_clipped, s1.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s1,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(seq_clipped,       view.Sequence());
+        EXPECT_EQ(quals_clipped,     view.Qualities().Fastq());
+        EXPECT_EQ(tagBases_clipped,  view.DeletionTags());
+        EXPECT_EQ(tagQuals_clipped,  view.DeletionQVs().Fastq());
+        EXPECT_EQ(pulseQuals_clipped, view.LabelQVs().Fastq());
+        EXPECT_EQ(pulseQuals_clipped, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames_clipped,    view.IPD().Data());
+        EXPECT_EQ(pulseCall_clipped, view.PulseCalls());
+    }
+    {
+        SCOPED_TRACE("s1 - REVERSE");
+
+        BamRecord s1_rev = prototype.Mapped(tId, tPos, Strand::REVERSE, s1_cigar, mapQual);
+
+        EXPECT_TRUE(s1_rev.IsMapped());
+        EXPECT_EQ(Strand::REVERSE, s1_rev.AlignedStrand());
+        EXPECT_EQ(500,  s1_rev.QueryStart());
+        EXPECT_EQ(510,  s1_rev.QueryEnd());
+        EXPECT_EQ(500,  s1_rev.AlignedStart());    // queryStart (no soft clips)
+        EXPECT_EQ(510,  s1_rev.AlignedEnd());      // alignStart + seqLength
+        EXPECT_EQ(tPos, s1_rev.ReferenceStart());        // tPos (not clipped yet)
+        EXPECT_EQ(110,  s1_rev.ReferenceEnd());          // RefStart + 10= (not clipped yet)
+        EXPECT_EQ(s1_cigar, s1_rev.CigarData().ToStdString());
+
+        s1_rev.Clip(ClipType::CLIP_TO_QUERY, 502, 509);
+        EXPECT_TRUE(s1_rev.IsMapped());
+        EXPECT_EQ(Strand::REVERSE, s1_rev.AlignedStrand());
+        EXPECT_EQ(502, s1_rev.QueryStart());
+        EXPECT_EQ(509, s1_rev.QueryEnd());
+        EXPECT_EQ(502, s1_rev.AlignedStart());    // queryStart (no soft clips)
+        EXPECT_EQ(509, s1_rev.AlignedEnd());      // alignStart + seqLength
+        EXPECT_EQ(102, s1_rev.ReferenceStart());        // 100 + startOffset
+        EXPECT_EQ(109, s1_rev.ReferenceEnd());          // RefStart + 7=
+        EXPECT_EQ(s1_rev_cigar_clipped, s1_rev.CigarData().ToStdString());
+        // NOTE(review): seq_rev_clipped drops 1 leading base of seq_rev, yet ReferenceStart is expected at tPos + 2 - confirm
+        const BamRecordView view
+        {
+            s1_rev,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+        EXPECT_EQ(seq_rev_clipped,       view.Sequence());
+        EXPECT_EQ(quals_rev_clipped,     view.Qualities().Fastq());
+        EXPECT_EQ(tagBases_rev_clipped,  view.DeletionTags());
+        EXPECT_EQ(tagQuals_rev_clipped,  view.DeletionQVs().Fastq());
+        EXPECT_EQ(pulseQuals_rev_clipped,  view.LabelQVs().Fastq());
+        EXPECT_EQ(pulseQuals_rev_clipped,  view.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames_rev_clipped,    view.IPD().Data());
+        EXPECT_EQ(pulseCall_rev_clipped, view.PulseCalls());
+    }
+    {
+        SCOPED_TRACE("s2 - FORWARD");
+
+        BamRecord s2 = prototype.Mapped(tId, tPos, Strand::FORWARD, s2_cigar, mapQual);
+        s2.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd);
+
+        EXPECT_TRUE(s2.IsMapped());
+        EXPECT_EQ(Strand::FORWARD, s2.AlignedStrand());
+        EXPECT_EQ(clipStart, s2.QueryStart());
+        EXPECT_EQ(clipEnd,   s2.QueryEnd());
+        EXPECT_EQ(clipStart, s2.AlignedStart());   // queryStart (no soft clips)
+        EXPECT_EQ(clipEnd,   s2.AlignedEnd());     // alignStart + seqLength
+        EXPECT_EQ(102, s2.ReferenceStart());       // 100 + startOffset
+        EXPECT_EQ(112, s2.ReferenceEnd());         // RefStart + 7= + 3D
+
+        EXPECT_EQ(s2_cigar_clipped, s2.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s2,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(seq_clipped,      view.Sequence());
+        EXPECT_EQ(quals_clipped,    view.Qualities().Fastq());
+        EXPECT_EQ(tagBases_clipped, view.DeletionTags());
+        EXPECT_EQ(tagQuals_clipped, view.DeletionQVs().Fastq());
+        EXPECT_EQ(pulseQuals_clipped, view.LabelQVs().Fastq());
+        EXPECT_EQ(pulseQuals_clipped, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames_clipped,   view.IPD().Data());
+    }
+    {
+        SCOPED_TRACE("s2 - REVERSE");
+
+        BamRecord s2_rev = prototype.Mapped(tId, tPos, Strand::REVERSE, s2_cigar, mapQual);
+        s2_rev.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd);
+
+        EXPECT_TRUE(s2_rev.IsMapped());
+        EXPECT_EQ(Strand::REVERSE, s2_rev.AlignedStrand());
+        EXPECT_EQ(clipStart, s2_rev.QueryStart());
+        EXPECT_EQ(clipEnd,   s2_rev.QueryEnd());
+        EXPECT_EQ(clipStart, s2_rev.AlignedStart());    // queryStart (no soft clips)
+        EXPECT_EQ(clipEnd,   s2_rev.AlignedEnd());      // alignStart + seqLength
+        EXPECT_EQ(102, s2_rev.ReferenceStart());        // 100 + startOffset
+        EXPECT_EQ(112, s2_rev.ReferenceEnd());          // RefStart + 7= + 3D
+
+        EXPECT_EQ(s2_rev_cigar_clipped, s2_rev.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s2_rev,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(seq_rev_clipped,       view.Sequence());
+        EXPECT_EQ(quals_rev_clipped,     view.Qualities().Fastq());
+        EXPECT_EQ(tagBases_rev_clipped,  view.DeletionTags());
+        EXPECT_EQ(tagQuals_rev_clipped,  view.DeletionQVs().Fastq());
+        EXPECT_EQ(pulseQuals_rev_clipped,  view.LabelQVs().Fastq());
+        EXPECT_EQ(pulseQuals_rev_clipped,  view.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames_rev_clipped,    view.IPD().Data());
+        EXPECT_EQ(pulseCall_rev_clipped, view.PulseCalls());
+    }
+    {
+        SCOPED_TRACE("s3 - FORWARD");
+
+        BamRecord s3 = prototype.Mapped(tId, tPos, Strand::FORWARD, s3_cigar, mapQual);
+        s3.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd);
+
+        EXPECT_TRUE(s3.IsMapped());
+        EXPECT_EQ(Strand::FORWARD, s3.AlignedStrand());
+        EXPECT_EQ(clipStart, s3.QueryStart());
+        EXPECT_EQ(clipEnd,   s3.QueryEnd());
+        EXPECT_EQ(clipStart, s3.AlignedStart());     // queryStart (no soft clips)
+        EXPECT_EQ(clipEnd,   s3.AlignedEnd());       // alignStart + seqLength
+        EXPECT_EQ(102, s3.ReferenceStart());         // 100 + startOffset
+        EXPECT_EQ(110, s3.ReferenceEnd());           // RefStart + 5= + 3D
+
+        EXPECT_EQ(s3_cigar_clipped, s3.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s3,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(seq_clipped,       view.Sequence());
+        EXPECT_EQ(quals_clipped,     view.Qualities().Fastq());
+        EXPECT_EQ(tagBases_clipped,  view.DeletionTags());
+        EXPECT_EQ(tagQuals_clipped,  view.DeletionQVs().Fastq());
+        EXPECT_EQ(pulseQuals_clipped,  view.LabelQVs().Fastq());
+        EXPECT_EQ(pulseQuals_clipped,  view.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames_clipped,    view.IPD().Data());
+        EXPECT_EQ(pulseCall_clipped, view.PulseCalls());
+    }
+    {
+        SCOPED_TRACE("s3 - REVERSE");
+
+        BamRecord s3_rev = prototype.Mapped(tId, tPos, Strand::REVERSE, s3_cigar, mapQual);
+        s3_rev.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd);
+
+        EXPECT_TRUE(s3_rev.IsMapped());
+        EXPECT_EQ(Strand::REVERSE, s3_rev.AlignedStrand());
+        EXPECT_EQ(clipStart, s3_rev.QueryStart());
+        EXPECT_EQ(clipEnd,   s3_rev.QueryEnd());
+        EXPECT_EQ(clipStart, s3_rev.AlignedStart());     // queryStart (no soft clips)
+        EXPECT_EQ(clipEnd,   s3_rev.AlignedEnd());       // alignStart + seqLength
+        EXPECT_EQ(102, s3_rev.ReferenceStart());         // 100 + startOffset
+        EXPECT_EQ(110, s3_rev.ReferenceEnd());           // RefStart + 5= + 3D
+
+        EXPECT_EQ(s3_rev_cigar_clipped, s3_rev.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s3_rev,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(seq_rev_clipped,       view.Sequence());
+        EXPECT_EQ(quals_rev_clipped,     view.Qualities().Fastq());
+        EXPECT_EQ(tagBases_rev_clipped,  view.DeletionTags());
+        EXPECT_EQ(tagQuals_rev_clipped,  view.DeletionQVs().Fastq());
+        EXPECT_EQ(pulseQuals_rev_clipped,  view.LabelQVs().Fastq());
+        EXPECT_EQ(pulseQuals_rev_clipped,  view.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames_rev_clipped,    view.IPD().Data());
+        EXPECT_EQ(pulseCall_rev_clipped, view.PulseCalls());
+    }
+}
+
+TEST(BamRecordClippingTest, ClipToQuery_WithSoftClips)
+{
+    const Position qStart = 500;
+    const Position qEnd   = 515;
+    const std::string seq      = "TTAACCGTTAGCAAA";
+    const std::string seq_rev  = "TTTGCTAACGGTTAA";
+    const std::string quals    = "--?]?]?]?]?*+++";
+    const std::string tagBases = "TTAACCGTTAGCAAA";
+    const std::string tagQuals = "--?]?]?]?]?*+++";
+    const std::string tagQuals_rev = "+++*?]?]?]?]?--";
+    const f_data frames   = { 40, 40, 10, 10, 20, 20, 30, 40, 40, 10, 30, 20, 10, 10, 10 };
+    const f_data frames_rev = { 10, 10, 10, 20, 30, 10, 40, 40, 30, 20, 20, 10, 10, 40, 40 };
+
+    const int32_t  tId     = 0;
+    const Position tPos    = 100;
+    const uint8_t  mapQual = 80;
+
+    const Position clipStart = 502;
+    const Position clipEnd   = 509;
+
+    const std::string s1_cigar = "2S10=3S";
+    const std::string s1_cigar_clipped = "7=";
+    const std::string s1_seq_clipped      = "AACCGTT";
+    const std::string s1_quals_clipped    = "?]?]?]?";
+    const std::string s1_tagBases_clipped = s1_seq_clipped;
+    const std::string s1_tagQuals_clipped = s1_quals_clipped;
+    const f_data s1_frames_clipped   = { 10, 10, 20, 20, 30, 40, 40 };
+    const std::string s1_cigar_rev_clipped = "6=1S";
+    const std::string s1_seq_rev_clipped   = "AACGGTT";
+    const std::string s1_quals_rev_clipped = "?]?]?]?";
+    const std::string s1_tagBases_rev_clipped = s1_seq_rev_clipped;
+    const std::string s1_tagQuals_rev_clipped = s1_quals_rev_clipped;
+    const f_data s1_frames_rev_clipped = { 40, 40, 30, 20, 20, 10, 10 };
+
+    const std::string s2_cigar = "2S5=3D5=3S";
+    const std::string s2_cigar_clipped = "5=3D2=";
+    const std::string s2_seq_clipped      = "AACCGTT";
+    const std::string s2_quals_clipped    = "?]?]?]?";
+    const std::string s2_tagBases_clipped = s2_seq_clipped;
+    const std::string s2_tagQuals_clipped = s2_quals_clipped;
+    const f_data s2_frames_clipped   = { 10, 10, 20, 20, 30, 40, 40 };
+    const std::string s2_cigar_rev_clipped = "1=3D5=1S";
+    const std::string s2_seq_rev_clipped   = "AACGGTT";
+    const std::string s2_quals_rev_clipped = "?]?]?]?";
+    const std::string s2_tagBases_rev_clipped = s2_seq_rev_clipped;
+    const std::string s2_tagQuals_rev_clipped = s2_quals_rev_clipped;
+    const f_data s2_frames_rev_clipped = { 40, 40, 30, 20, 20, 10, 10 };
+
+    const std::string s3_cigar = "2S4=1D2I2D4=3S";
+    const std::string s3_cigar_clipped = "4=1D2I2D1=";
+    const std::string s3_seq_clipped      = "AACCGTT";
+    const std::string s3_quals_clipped    = "?]?]?]?";
+    const std::string s3_tagBases_clipped = s3_seq_clipped;
+    const std::string s3_tagQuals_clipped = s3_quals_clipped;
+    const f_data s3_frames_clipped   = { 10, 10, 20, 20, 30, 40, 40 };
+    const std::string s3_cigar_rev_clipped = "1D2I2D4=1S";
+    const std::string s3_seq_rev_clipped   = "AACGGTT";
+    const std::string s3_quals_rev_clipped = "?]?]?]?";
+    const std::string s3_tagBases_rev_clipped = s3_seq_rev_clipped;
+    const std::string s3_tagQuals_rev_clipped = s3_quals_rev_clipped;
+    const f_data s3_frames_rev_clipped = { 40, 40, 30, 20, 20, 10, 10 };
+
+    const BamRecord prototype = BamRecordClippingTests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames,
+                                                  seq, tagBases, tagQuals, frames);
+
+    {
+        SCOPED_TRACE("s1 - FORWARD");
+
+        BamRecord s1 = prototype.Mapped(tId, tPos, Strand::FORWARD, s1_cigar, mapQual);
+        EXPECT_TRUE(s1.IsMapped());
+        EXPECT_EQ(100, s1.ReferenceStart());
+        EXPECT_EQ(110, s1.ReferenceEnd()); // 10=
+
+        s1.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd);
+
+        EXPECT_TRUE(s1.IsMapped());
+        EXPECT_EQ(Strand::FORWARD, s1.AlignedStrand());
+        EXPECT_EQ(clipStart, s1.QueryStart());
+        EXPECT_EQ(clipEnd,   s1.QueryEnd());
+        EXPECT_EQ(clipStart, s1.AlignedStart());    // queryStart (no soft clips left)
+        EXPECT_EQ(clipEnd,   s1.AlignedEnd());      // alignStart + seqLength
+        EXPECT_EQ(tPos,      s1.ReferenceStart());  // tPos
+        EXPECT_EQ(tPos + 7,  s1.ReferenceEnd());    // RefStart + 7=
+
+        EXPECT_EQ(s1_cigar_clipped, s1.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s1,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(s1_seq_clipped,      view.Sequence());
+        EXPECT_EQ(s1_quals_clipped,    view.Qualities().Fastq());
+        EXPECT_EQ(s1_tagBases_clipped, view.DeletionTags());
+        EXPECT_EQ(s1_tagQuals_clipped, view.DeletionQVs().Fastq());
+        EXPECT_EQ(s1_frames_clipped,   view.IPD().Data());
+    }
+    {
+        SCOPED_TRACE("s1 - REVERSE");
+
+        BamRecord s1_rev = prototype.Mapped(tId, tPos, Strand::REVERSE, s1_cigar, mapQual);
+        EXPECT_TRUE(s1_rev.IsMapped());
+        EXPECT_EQ(100, s1_rev.ReferenceStart());
+        EXPECT_EQ(110, s1_rev.ReferenceEnd()); // 10=
+
+        s1_rev.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd);
+
+        EXPECT_TRUE(s1_rev.IsMapped());
+        EXPECT_EQ(Strand::REVERSE, s1_rev.AlignedStrand());
+        EXPECT_EQ(clipStart, s1_rev.QueryStart());
+        EXPECT_EQ(clipEnd,   s1_rev.QueryEnd());
+        EXPECT_EQ(503, s1_rev.AlignedStart());    // queryStart (no soft clips)
+        EXPECT_EQ(509,   s1_rev.AlignedEnd());      // alignStart + seqLength
+        EXPECT_EQ(tPos,      s1_rev.ReferenceStart());  // 100 + startOffset
+        EXPECT_EQ(tPos + 6,  s1_rev.ReferenceEnd());    // RefStart + 7=
+
+        EXPECT_EQ(s1_cigar_rev_clipped, s1_rev.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s1_rev,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(s1_seq_rev_clipped,      view.Sequence());
+        EXPECT_EQ(s1_quals_rev_clipped,    view.Qualities().Fastq());
+        EXPECT_EQ(s1_tagBases_rev_clipped, view.DeletionTags());
+        EXPECT_EQ(s1_tagQuals_rev_clipped, view.DeletionQVs().Fastq());
+        EXPECT_EQ(s1_frames_rev_clipped,   view.IPD().Data());
+    }
+    {
+        SCOPED_TRACE("s2 - FORWARD");
+
+        BamRecord s2 = prototype.Mapped(tId, tPos, Strand::FORWARD, s2_cigar, mapQual);
+        EXPECT_TRUE(s2.IsMapped());
+        EXPECT_EQ(100, s2.ReferenceStart());
+        EXPECT_EQ(113, s2.ReferenceEnd());   // 5= + 3D + 5=
+
+        s2.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd);
+
+        EXPECT_TRUE(s2.IsMapped());
+        EXPECT_EQ(Strand::FORWARD, s2.AlignedStrand());
+        EXPECT_EQ(clipStart, s2.QueryStart());
+        EXPECT_EQ(clipEnd,   s2.QueryEnd());
+        EXPECT_EQ(clipStart, s2.AlignedStart());    // queryStart (no soft clips left)
+        EXPECT_EQ(clipEnd,   s2.AlignedEnd());      // alignStart + seqLength
+        EXPECT_EQ(tPos,      s2.ReferenceStart());  // 100 + startOffset
+        EXPECT_EQ(tPos + 10, s2.ReferenceEnd());    // RefStart + 5=3D2=
+
+        EXPECT_EQ(s2_cigar_clipped, s2.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s2,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(s2_seq_clipped,      view.Sequence());
+        EXPECT_EQ(s2_quals_clipped,    view.Qualities().Fastq());
+        EXPECT_EQ(s2_tagBases_clipped, view.DeletionTags());
+        EXPECT_EQ(s2_tagQuals_clipped, view.DeletionQVs().Fastq());
+        EXPECT_EQ(s2_frames_clipped,   view.IPD().Data());
+    }
+    {
+        SCOPED_TRACE("s2 - REVERSE");
+
+        BamRecord s2_rev = prototype.Mapped(tId, tPos, Strand::REVERSE, s2_cigar, mapQual);
+        EXPECT_TRUE(s2_rev.IsMapped());
+        EXPECT_EQ(100, s2_rev.ReferenceStart());
+        EXPECT_EQ(113, s2_rev.ReferenceEnd());   // 5= + 3D + 5=
+
+        s2_rev.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd);
+
+        EXPECT_TRUE(s2_rev.IsMapped());
+        EXPECT_EQ(Strand::REVERSE, s2_rev.AlignedStrand());
+        EXPECT_EQ(clipStart, s2_rev.QueryStart());
+        EXPECT_EQ(clipEnd,   s2_rev.QueryEnd());
+        EXPECT_EQ(503, s2_rev.AlignedStart());    // queryStart (no soft clips left)
+        EXPECT_EQ(509,   s2_rev.AlignedEnd());      // alignStart + seqLength
+        EXPECT_EQ(tPos,      s2_rev.ReferenceStart());  // 100 + startOffset
+        EXPECT_EQ(tPos + 9, s2_rev.ReferenceEnd());    // RefStart + 5=3D2=
+
+        EXPECT_EQ(s2_cigar_rev_clipped, s2_rev.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s2_rev,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(s2_seq_rev_clipped,      view.Sequence());
+        EXPECT_EQ(s2_quals_rev_clipped,    view.Qualities().Fastq());
+        EXPECT_EQ(s2_tagBases_rev_clipped, view.DeletionTags());
+        EXPECT_EQ(s2_tagQuals_rev_clipped, view.DeletionQVs().Fastq());
+        EXPECT_EQ(s2_tagQuals_rev_clipped, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(s2_frames_rev_clipped,   view.IPD().Data());
+    }
+    {
+        SCOPED_TRACE("s3 - FORWARD");
+
+        BamRecord s3 = prototype.Mapped(tId, tPos, Strand::FORWARD, s3_cigar, mapQual);
+        EXPECT_TRUE(s3.IsMapped());
+        EXPECT_EQ(100, s3.ReferenceStart());
+        EXPECT_EQ(111, s3.ReferenceEnd());   // 4= + 1D + 2D + 4=
+
+        s3.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd);
+
+        EXPECT_TRUE(s3.IsMapped());
+        EXPECT_EQ(Strand::FORWARD, s3.AlignedStrand());
+        EXPECT_EQ(clipStart, s3.QueryStart());
+        EXPECT_EQ(clipEnd,   s3.QueryEnd());
+        EXPECT_EQ(clipStart, s3.AlignedStart());    // queryStart (no soft clips left)
+        EXPECT_EQ(clipEnd,   s3.AlignedEnd());      // alignStart + seqLength
+        EXPECT_EQ(tPos,      s3.ReferenceStart());  // 100 + startOffset
+        EXPECT_EQ(tPos + 8,  s3.ReferenceEnd());    // RefStart + 4=1D2D1=
+
+        EXPECT_EQ(s3_cigar_clipped, s3.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s3,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(s3_seq_clipped,      view.Sequence());
+        EXPECT_EQ(s3_quals_clipped,    view.Qualities().Fastq());
+        EXPECT_EQ(s3_tagBases_clipped, view.DeletionTags());
+        EXPECT_EQ(s3_tagQuals_clipped, view.DeletionQVs().Fastq());
+        EXPECT_EQ(s3_tagQuals_clipped, view.LabelQVs().Fastq());
+        EXPECT_EQ(s3_tagQuals_clipped, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(s3_frames_clipped,   view.IPD().Data());
+    }
+    {
+        SCOPED_TRACE("s3 - REVERSE");
+
+        BamRecord s3_rev = prototype.Mapped(tId, tPos, Strand::REVERSE, s3_cigar, mapQual);
+        EXPECT_TRUE(s3_rev.IsMapped());
+        EXPECT_EQ(100, s3_rev.ReferenceStart());
+        EXPECT_EQ(111, s3_rev.ReferenceEnd());   // 4= + 1D + 2D + 4=
+
+        s3_rev.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd);
+
+        EXPECT_TRUE(s3_rev.IsMapped());
+        EXPECT_EQ(Strand::REVERSE, s3_rev.AlignedStrand());
+        EXPECT_EQ(clipStart, s3_rev.QueryStart());
+        EXPECT_EQ(clipEnd,   s3_rev.QueryEnd());
+        EXPECT_EQ(503, s3_rev.AlignedStart());    // queryStart + 1S
+        EXPECT_EQ(509,   s3_rev.AlignedEnd());      // alignStart + seqLength
+        EXPECT_EQ(tPos,      s3_rev.ReferenceStart());  // 100 + startOffset
+        EXPECT_EQ(tPos + 7,  s3_rev.ReferenceEnd());    // RefStart + 4=1D2D1=
+
+        EXPECT_EQ(s3_cigar_rev_clipped, s3_rev.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s3_rev,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(s3_seq_rev_clipped,      view.Sequence());
+        EXPECT_EQ(s3_quals_rev_clipped,    view.Qualities().Fastq());
+        EXPECT_EQ(s3_tagBases_rev_clipped, view.DeletionTags());
+        EXPECT_EQ(s3_tagQuals_rev_clipped, view.DeletionQVs().Fastq());
+        EXPECT_EQ(s3_tagQuals_rev_clipped, view.LabelQVs().Fastq());
+        EXPECT_EQ(s3_tagQuals_rev_clipped, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(s3_frames_rev_clipped,   view.IPD().Data());
+    }
+}
+
+TEST(BamRecordClippingTest, ClipToReference_Basic)
+{   // CLIP_TO_REFERENCE on CIGARs without soft clips: unmapped s0 plus FORWARD/REVERSE s1-s3
+    const Position qStart = 500;
+    const Position qEnd   = 510;
+    const std::string seq      = "AACCGTTAGC";
+    const std::string quals    = "?]?]?]?]?*";
+    const std::string tagBases = "AACCGTTAGC";
+    const std::string tagQuals = "?]?]?]?]?*";
+    const std::string tagQuals_rev = "*?]?]?]?]?";   // tagQuals reversed; not referenced below
+    const f_data frames   = { 10, 10, 20, 20, 30, 40, 40, 10, 30, 20 };
+
+    const int32_t  tId     = 0;
+    const Position tPos    = 100;
+    const uint8_t  mapQual = 80;
+
+    const Position clipStart = 102;   // reference coordinates passed to every Clip() call below
+    const Position clipEnd   = 107;
+
+    const std::string s1_cigar = "10=";   // s1: matches only
+    const std::string s1_cigar_clipped = "5=";   // expected for both FORWARD and REVERSE
+    const std::string s1_seq_clipped      = "CCGTT";
+    const std::string s1_quals_clipped    = "?]?]?";
+    const std::string s1_tagBases_clipped = s1_seq_clipped;
+    const std::string s1_tagQuals_clipped = s1_quals_clipped;
+    const f_data s1_frames_clipped   = { 20, 20, 30, 40, 40 };
+    const std::string s1_seq_rev_clipped   = "TAACG";   // *_rev_*: expected for the REVERSE record
+    const std::string s1_quals_rev_clipped = "]?]?]";
+    const std::string s1_tagBases_rev_clipped = s1_seq_rev_clipped;
+    const std::string s1_tagQuals_rev_clipped = s1_quals_rev_clipped;
+    const f_data s1_frames_rev_clipped = { 10, 40, 40, 30, 20 };
+
+    const std::string s2_cigar = "5=3D5=";   // s2: deletion between two match runs
+    const std::string s2_cigar_clipped = "3=2D";   // expected for both FORWARD and REVERSE
+    const std::string s2_seq_clipped      = "CCG";
+    const std::string s2_quals_clipped    = "?]?";
+    const std::string s2_tagBases_clipped = s2_seq_clipped;
+    const std::string s2_tagQuals_clipped = s2_quals_clipped;
+    const f_data s2_frames_clipped   = { 20, 20, 30 };
+    const std::string s2_seq_rev_clipped   = "TAA";
+    const std::string s2_quals_rev_clipped = "]?]";
+    const std::string s2_tagBases_rev_clipped = s2_seq_rev_clipped;
+    const std::string s2_tagQuals_rev_clipped = s2_quals_rev_clipped;
+    const f_data s2_frames_rev_clipped = { 10, 40, 40 };
+
+    const std::string s3_cigar = "4=1D2I2D4=";   // s3: insertion flanked by deletions
+    const std::string s3_cigar_clipped = "2=1D2I2D";   // expected for both FORWARD and REVERSE
+    const std::string s3_seq_clipped      = "CCGT";
+    const std::string s3_quals_clipped    = "?]?]";
+    const std::string s3_tagBases_clipped = s3_seq_clipped;
+    const std::string s3_tagQuals_clipped = s3_quals_clipped;
+    const f_data s3_frames_clipped   = { 20, 20, 30, 40 };
+    const std::string s3_seq_rev_clipped   = "TAAC";
+    const std::string s3_quals_rev_clipped = "]?]?";
+    const std::string s3_tagBases_rev_clipped = s3_seq_rev_clipped;
+    const std::string s3_tagQuals_rev_clipped = s3_quals_rev_clipped;
+    const f_data s3_frames_rev_clipped = { 10, 40, 40, 30};
+
+    const BamRecord prototype = BamRecordClippingTests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames,
+                                                  seq, tagBases, tagQuals, frames);
+    BamRecord s0 = prototype;   // plain copy; asserted unmapped below
+    BamRecord s1 = prototype.Mapped(tId, tPos, Strand::FORWARD, s1_cigar, mapQual);
+    BamRecord s2 = prototype.Mapped(tId, tPos, Strand::FORWARD, s2_cigar, mapQual);
+    BamRecord s3 = prototype.Mapped(tId, tPos, Strand::FORWARD, s3_cigar, mapQual);
+    BamRecord s1_rev = prototype.Mapped(tId, tPos, Strand::REVERSE, s1_cigar, mapQual);
+    BamRecord s2_rev = prototype.Mapped(tId, tPos, Strand::REVERSE, s2_cigar, mapQual);
+    BamRecord s3_rev = prototype.Mapped(tId, tPos, Strand::REVERSE, s3_cigar, mapQual);
+
+    s0.Clip(ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd);   // same window for all seven records
+    s1.Clip(ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd);
+    s2.Clip(ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd);
+    s3.Clip(ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd);
+    s1_rev.Clip(ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd);
+    s2_rev.Clip(ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd);
+    s3_rev.Clip(ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd);
+
+    {   // s0 - no clipping should have been done to unmapped record
+
+        EXPECT_FALSE(s0.IsMapped());
+        EXPECT_EQ(prototype.QueryStart(),     s0.QueryStart());
+        EXPECT_EQ(prototype.QueryEnd(),       s0.QueryEnd());
+        EXPECT_EQ(prototype.AlignedStart(),   s0.AlignedStart());
+        EXPECT_EQ(prototype.AlignedEnd(),     s0.AlignedEnd());
+        EXPECT_EQ(prototype.ReferenceStart(), s0.ReferenceStart());
+        EXPECT_EQ(prototype.ReferenceEnd(),   s0.ReferenceEnd());
+
+        const BamRecordView protoView
+        {
+            prototype,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        const BamRecordView view
+        {
+            s0,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(protoView.Sequence(),       view.Sequence());   // per-base data must equal the prototype's
+        EXPECT_EQ(protoView.Qualities(),      view.Qualities());
+        EXPECT_EQ(protoView.DeletionTags(),    view.DeletionTags());
+        EXPECT_EQ(protoView.DeletionQVs(),     view.DeletionQVs());
+        EXPECT_EQ(protoView.LabelQVs(),        view.LabelQVs());
+        EXPECT_EQ(protoView.AltLabelQVs(),     view.AltLabelQVs());
+        EXPECT_EQ(protoView.IPD(),            view.IPD());
+    }
+
+    {   // s1 - FORWARD
+
+        EXPECT_TRUE(s1.IsMapped());
+        EXPECT_EQ(Strand::FORWARD, s1.AlignedStrand());
+        EXPECT_EQ(502,   s1.QueryStart());
+        EXPECT_EQ(507,   s1.QueryEnd());
+        EXPECT_EQ(502,   s1.AlignedStart());       // queryStart (no soft clips)
+        EXPECT_EQ(507,   s1.AlignedEnd());         // alignStart + seqLength
+        EXPECT_EQ(clipStart, s1.ReferenceStart()); // clipStart
+        EXPECT_EQ(clipEnd,   s1.ReferenceEnd());   // clipEnd
+
+        EXPECT_EQ(s1_cigar_clipped, s1.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s1,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(s1_seq_clipped,      view.Sequence());
+        EXPECT_EQ(s1_quals_clipped,    view.Qualities().Fastq());
+        EXPECT_EQ(s1_tagBases_clipped, view.DeletionTags());
+        EXPECT_EQ(s1_tagQuals_clipped, view.DeletionQVs().Fastq());
+        EXPECT_EQ(s1_tagQuals_clipped, view.LabelQVs().Fastq());
+        EXPECT_EQ(s1_tagQuals_clipped, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(s1_frames_clipped,   view.IPD().Data());
+    }
+
+    {   // s1 - REVERSE
+
+        EXPECT_TRUE(s1_rev.IsMapped());
+        EXPECT_EQ(Strand::REVERSE, s1_rev.AlignedStrand());
+        EXPECT_EQ(503, s1_rev.QueryStart());
+        EXPECT_EQ(508, s1_rev.QueryEnd());
+        EXPECT_EQ(503, s1_rev.AlignedStart());          // queryStart (no soft clips)
+        EXPECT_EQ(508, s1_rev.AlignedEnd());            // alignStart + seqLength
+        EXPECT_EQ(clipStart, s1_rev.ReferenceStart());  // clipStart
+        EXPECT_EQ(clipEnd,   s1_rev.ReferenceEnd());    // clipEnd
+
+        EXPECT_EQ(s1_cigar_clipped, s1_rev.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s1_rev,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(s1_seq_rev_clipped,      view.Sequence());
+        EXPECT_EQ(s1_quals_rev_clipped,    view.Qualities().Fastq());
+        EXPECT_EQ(s1_tagBases_rev_clipped, view.DeletionTags());
+        EXPECT_EQ(s1_tagQuals_rev_clipped, view.DeletionQVs().Fastq());
+        EXPECT_EQ(s1_tagQuals_rev_clipped, view.LabelQVs().Fastq());
+        EXPECT_EQ(s1_tagQuals_rev_clipped, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(s1_frames_rev_clipped,   view.IPD().Data());
+    }
+
+    {   // s2 - FORWARD
+
+        EXPECT_TRUE(s2.IsMapped());
+        EXPECT_EQ(Strand::FORWARD, s2.AlignedStrand());
+        EXPECT_EQ(502, s2.QueryStart());
+        EXPECT_EQ(505, s2.QueryEnd());
+        EXPECT_EQ(502, s2.AlignedStart());     // queryStart (no soft clips)
+        EXPECT_EQ(505, s2.AlignedEnd());       // alignStart + seqLength
+        EXPECT_EQ(clipStart, s2.ReferenceStart());   // clipStart
+        EXPECT_EQ(clipEnd,   s2.ReferenceEnd());     // clipEnd
+
+        EXPECT_EQ(s2_cigar_clipped, s2.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s2,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(s2_seq_clipped,      view.Sequence());
+        EXPECT_EQ(s2_quals_clipped,    view.Qualities().Fastq());
+        EXPECT_EQ(s2_tagBases_clipped, view.DeletionTags());
+        EXPECT_EQ(s2_tagQuals_clipped, view.DeletionQVs().Fastq());
+        EXPECT_EQ(s2_tagQuals_clipped, view.LabelQVs().Fastq());
+        EXPECT_EQ(s2_tagQuals_clipped, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(s2_frames_clipped,   view.IPD().Data());
+    }
+
+    {   // s2 - REVERSE
+
+        EXPECT_TRUE(s2_rev.IsMapped());
+        EXPECT_EQ(Strand::REVERSE, s2_rev.AlignedStrand());
+        EXPECT_EQ(505, s2_rev.QueryStart());
+        EXPECT_EQ(508, s2_rev.QueryEnd());
+        EXPECT_EQ(505, s2_rev.AlignedStart());    // queryStart (no soft clips)
+        EXPECT_EQ(508, s2_rev.AlignedEnd());      // alignStart + seqLength
+        EXPECT_EQ(clipStart, s2_rev.ReferenceStart());  // clipStart
+        EXPECT_EQ(clipEnd,   s2_rev.ReferenceEnd());    // clipEnd
+
+        EXPECT_EQ(s2_cigar_clipped, s2_rev.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s2_rev,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(s2_seq_rev_clipped,      view.Sequence());
+        EXPECT_EQ(s2_quals_rev_clipped,    view.Qualities().Fastq());
+        EXPECT_EQ(s2_tagBases_rev_clipped, view.DeletionTags());
+        EXPECT_EQ(s2_tagQuals_rev_clipped, view.DeletionQVs().Fastq());
+        EXPECT_EQ(s2_tagQuals_rev_clipped, view.LabelQVs().Fastq());
+        EXPECT_EQ(s2_tagQuals_rev_clipped, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(s2_frames_rev_clipped,   view.IPD().Data());
+    }
+
+    {   // s3 - FORWARD
+
+        EXPECT_TRUE(s3.IsMapped());
+        EXPECT_EQ(Strand::FORWARD, s3.AlignedStrand());
+        EXPECT_EQ(502, s3.QueryStart());
+        EXPECT_EQ(506, s3.QueryEnd());
+        EXPECT_EQ(502, s3.AlignedStart());     // queryStart (no soft clips)
+        EXPECT_EQ(506, s3.AlignedEnd());       // alignStart + seqLength
+        EXPECT_EQ(clipStart, s3.ReferenceStart());   // clipStart
+        EXPECT_EQ(clipEnd,   s3.ReferenceEnd());     // clipEnd
+
+        EXPECT_EQ(s3_cigar_clipped, s3.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s3,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(s3_seq_clipped,      view.Sequence());
+        EXPECT_EQ(s3_quals_clipped,    view.Qualities().Fastq());
+        EXPECT_EQ(s3_tagBases_clipped, view.DeletionTags());
+        EXPECT_EQ(s3_tagQuals_clipped, view.DeletionQVs().Fastq());
+        EXPECT_EQ(s3_tagQuals_clipped, view.LabelQVs().Fastq());
+        EXPECT_EQ(s3_tagQuals_clipped, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(s3_frames_clipped,   view.IPD().Data());
+    }
+
+    {   // s3 - REVERSE
+
+        EXPECT_TRUE(s3_rev.IsMapped());
+        EXPECT_EQ(Strand::REVERSE, s3_rev.AlignedStrand());
+        EXPECT_EQ(504, s3_rev.QueryStart());
+        EXPECT_EQ(508, s3_rev.QueryEnd());
+        EXPECT_EQ(504, s3_rev.AlignedStart());     // queryStart (no soft clips)
+        EXPECT_EQ(508, s3_rev.AlignedEnd());       // alignStart + seqLength
+        EXPECT_EQ(clipStart, s3_rev.ReferenceStart());  // clipStart
+        EXPECT_EQ(clipEnd,   s3_rev.ReferenceEnd());    // clipEnd
+
+        EXPECT_EQ(s3_cigar_clipped, s3_rev.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s3_rev,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(s3_seq_rev_clipped,      view.Sequence());
+        EXPECT_EQ(s3_quals_rev_clipped,    view.Qualities().Fastq());
+        EXPECT_EQ(s3_tagBases_rev_clipped, view.DeletionTags());
+        EXPECT_EQ(s3_tagQuals_rev_clipped, view.DeletionQVs().Fastq());
+        EXPECT_EQ(s3_tagQuals_rev_clipped, view.LabelQVs().Fastq());
+        EXPECT_EQ(s3_tagQuals_rev_clipped, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(s3_frames_rev_clipped,   view.IPD().Data());
+    }
+}
+
+TEST(BamRecordClippingTest, ClipToReference_WithSoftClips)
+{   // CLIP_TO_REFERENCE on CIGARs carrying a leading 2S and a trailing 3S soft clip
+    const Position qStart = 500;
+    const Position qEnd   = 515;
+    const std::string seq      = "TTAACCGTTAGCAAA";
+    const std::string quals    = "--?]?]?]?]?*+++";
+    const std::string tagBases = "TTAACCGTTAGCAAA";
+    const std::string tagQuals = "--?]?]?]?]?*+++";
+    const std::string tagQuals_rev = "+++*?]?]?]?]?--";   // tagQuals reversed; not referenced below
+    const f_data frames   = { 40, 40, 10, 10, 20, 20, 30, 40, 40, 10, 30, 20, 10, 10, 10 };
+
+    const int32_t  tId     = 0;
+    const Position tPos    = 100;
+    const uint8_t  mapQual = 80;
+
+    const Position clipStart = 102;   // reference coordinates passed to every Clip() call below
+    const Position clipEnd   = 107;
+
+    const std::string seq_rev      = "TTTGCTAACGGTTAA";   // reverse complement of seq; not referenced below
+    const std::string quals_rev    = "+++*?]?]?]?]?--";   // quals reversed; not referenced below
+    const f_data frames_rev   = { 10, 10, 10, 20, 30, 10, 40, 40, 30, 20, 20, 10, 10, 40, 40 };   // frames reversed; not referenced below
+
+    const std::string s1_cigar = "2S10=3S";   // s1: matches only, soft-clipped at both ends
+    const std::string s1_cigar_clipped = "5=";   // expected for both FORWARD and REVERSE
+    const std::string s1_seq_clipped      = "CCGTT";
+    const std::string s1_quals_clipped    = "?]?]?";
+    const std::string s1_tagBases_clipped = s1_seq_clipped;
+    const std::string s1_tagQuals_clipped = s1_quals_clipped;
+    const f_data s1_frames_clipped   = { 20, 20, 30, 40, 40 };
+    const std::string s1_seq_rev_clipped   = "CTAAC";   // *_rev_*: expected for the REVERSE record
+    const std::string s1_quals_rev_clipped = "?]?]?";
+    const std::string s1_tagBases_rev_clipped = s1_seq_rev_clipped;
+    const std::string s1_tagQuals_rev_clipped = s1_quals_rev_clipped;
+    const f_data s1_frames_rev_clipped = { 30, 10, 40, 40, 30 };
+
+    const std::string s2_cigar = "2S5=3D5=3S";   // s2: deletion between two match runs
+    const std::string s2_cigar_clipped = "3=2D";   // expected for both FORWARD and REVERSE
+    const std::string s2_seq_clipped      = "CCG";
+    const std::string s2_quals_clipped    = "?]?";
+    const std::string s2_tagBases_clipped = s2_seq_clipped;
+    const std::string s2_tagQuals_clipped = s2_quals_clipped;
+    const f_data s2_frames_clipped   = { 20, 20, 30 };
+    const std::string s2_seq_rev_clipped   = "CTA";
+    const std::string s2_quals_rev_clipped = "?]?";
+    const std::string s2_tagBases_rev_clipped = s2_seq_rev_clipped;
+    const std::string s2_tagQuals_rev_clipped = s2_quals_rev_clipped;
+    const f_data s2_frames_rev_clipped = { 30, 10, 40 };
+
+    const std::string s3_cigar = "2S4=1D2I2D4=3S";   // s3: insertion flanked by deletions
+    const std::string s3_cigar_clipped = "2=1D2I2D";   // expected for both FORWARD and REVERSE
+    const std::string s3_seq_clipped      = "CCGT";
+    const std::string s3_quals_clipped    = "?]?]";
+    const std::string s3_tagBases_clipped = s3_seq_clipped;
+    const std::string s3_tagQuals_clipped = s3_quals_clipped;
+    const f_data s3_frames_clipped   = { 20, 20, 30, 40 };
+    const std::string s3_seq_rev_clipped   = "CTAA";
+    const std::string s3_quals_rev_clipped = "?]?]";
+    const std::string s3_tagBases_rev_clipped = s3_seq_rev_clipped;
+    const std::string s3_tagQuals_rev_clipped = s3_quals_rev_clipped;
+    const f_data s3_frames_rev_clipped = { 30, 10, 40, 40 };
+
+    const BamRecord prototype = BamRecordClippingTests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames,
+                                                  seq, tagBases, tagQuals, frames);
+    BamRecord s0 = prototype;   // plain copy; asserted unmapped below
+    BamRecord s1 = prototype.Mapped(tId, tPos, Strand::FORWARD, s1_cigar, mapQual);
+    BamRecord s2 = prototype.Mapped(tId, tPos, Strand::FORWARD, s2_cigar, mapQual);
+    BamRecord s3 = prototype.Mapped(tId, tPos, Strand::FORWARD, s3_cigar, mapQual);
+    BamRecord s1_rev = prototype.Mapped(tId, tPos, Strand::REVERSE, s1_cigar, mapQual);
+    BamRecord s2_rev = prototype.Mapped(tId, tPos, Strand::REVERSE, s2_cigar, mapQual);
+    BamRecord s3_rev = prototype.Mapped(tId, tPos, Strand::REVERSE, s3_cigar, mapQual);
+
+    // sanity checks before clipping
+    EXPECT_FALSE(s0.IsMapped());
+
+    EXPECT_TRUE(s1.IsMapped());
+    EXPECT_EQ(500,       s1.QueryStart());      // queryStart
+    EXPECT_EQ(515,       s1.QueryEnd());        // queryStart + seqLength
+    EXPECT_EQ(502,       s1.AlignedStart());    // queryStart + 2S
+    EXPECT_EQ(512,       s1.AlignedEnd());      // alignedStart + 10=
+    EXPECT_EQ(tPos,      s1.ReferenceStart());  // tPos
+    EXPECT_EQ(tPos + 10, s1.ReferenceEnd());    // tPos + 10=
+
+    EXPECT_TRUE(s1_rev.IsMapped());
+    EXPECT_EQ(500,       s1_rev.QueryStart());      // queryStart
+    EXPECT_EQ(515,       s1_rev.QueryEnd());        // queryStart + seqLength
+    EXPECT_EQ(503,       s1_rev.AlignedStart());    // queryStart + 3S
+    EXPECT_EQ(513,       s1_rev.AlignedEnd());      // alignedStart + 10=
+    EXPECT_EQ(tPos,      s1_rev.ReferenceStart());  // tPos
+    EXPECT_EQ(tPos + 10, s1_rev.ReferenceEnd());    // tPos + 10=
+
+    EXPECT_TRUE(s2.IsMapped());
+    EXPECT_EQ(500,       s2.QueryStart());      // queryStart
+    EXPECT_EQ(515,       s2.QueryEnd());        // queryStart + seqLength
+    EXPECT_EQ(502,       s2.AlignedStart());    // queryStart + 2S
+    EXPECT_EQ(512,       s2.AlignedEnd());      // alignedStart + 5=5=
+    EXPECT_EQ(tPos,      s2.ReferenceStart());  // tPos
+    EXPECT_EQ(tPos + 13, s2.ReferenceEnd());    // tPos + 5=3D5=
+
+    EXPECT_TRUE(s2_rev.IsMapped());
+    EXPECT_EQ(500,       s2_rev.QueryStart());      // queryStart
+    EXPECT_EQ(515,       s2_rev.QueryEnd());        // queryStart + seqLength
+    EXPECT_EQ(503,       s2_rev.AlignedStart());    // queryStart + 3S
+    EXPECT_EQ(513,       s2_rev.AlignedEnd());      // alignedStart + 5=5=
+    EXPECT_EQ(tPos,      s2_rev.ReferenceStart());  // tPos
+    EXPECT_EQ(tPos + 13, s2_rev.ReferenceEnd());    // tPos + 5=3D5=
+
+    EXPECT_TRUE(s3.IsMapped());
+    EXPECT_EQ(500,       s3.QueryStart());      // queryStart
+    EXPECT_EQ(515,       s3.QueryEnd());        // queryStart + seqLength
+    EXPECT_EQ(502,       s3.AlignedStart());    // queryStart + 2S
+    EXPECT_EQ(512,       s3.AlignedEnd());      // alignedStart + 4=2I4=
+    EXPECT_EQ(tPos,      s3.ReferenceStart());  // tPos
+    EXPECT_EQ(tPos + 11, s3.ReferenceEnd());    // tPos + 4=1D2D4=
+
+    EXPECT_TRUE(s3_rev.IsMapped());
+    EXPECT_EQ(500,       s3_rev.QueryStart());      // queryStart
+    EXPECT_EQ(515,       s3_rev.QueryEnd());        // queryStart + seqLength
+    EXPECT_EQ(503,       s3_rev.AlignedStart());    // queryStart + 3S
+    EXPECT_EQ(513,       s3_rev.AlignedEnd());      // alignedStart + 4=2I4=
+    EXPECT_EQ(tPos,      s3_rev.ReferenceStart());  // tPos
+    EXPECT_EQ(tPos + 11, s3_rev.ReferenceEnd());    // tPos + 4=1D2D4=
+
+    s0.Clip(ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd);   // same window for all seven records
+    s1.Clip(ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd);
+    s2.Clip(ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd);
+    s3.Clip(ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd);
+    s1_rev.Clip(ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd);
+    s2_rev.Clip(ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd);
+    s3_rev.Clip(ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd);
+
+    {   // s0 - no clipping should have been done to unmapped record
+
+        EXPECT_FALSE(s0.IsMapped());
+        EXPECT_EQ(prototype.QueryStart(),     s0.QueryStart());
+        EXPECT_EQ(prototype.QueryEnd(),       s0.QueryEnd());
+        EXPECT_EQ(prototype.AlignedStart(),   s0.AlignedStart());
+        EXPECT_EQ(prototype.AlignedEnd(),     s0.AlignedEnd());
+        EXPECT_EQ(prototype.ReferenceStart(), s0.ReferenceStart());
+        EXPECT_EQ(prototype.ReferenceEnd(),   s0.ReferenceEnd());
+
+        const BamRecordView protoView
+        {
+            prototype,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        const BamRecordView view
+        {
+            s0,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(protoView.Sequence(),      view.Sequence());   // per-base data must equal the prototype's
+        EXPECT_EQ(protoView.Qualities(),     view.Qualities());
+        EXPECT_EQ(protoView.DeletionTags(),  view.DeletionTags());
+        EXPECT_EQ(protoView.DeletionQVs(),   view.DeletionQVs());
+        EXPECT_EQ(protoView.LabelQVs(),      view.LabelQVs());
+        EXPECT_EQ(protoView.AltLabelQVs(),   view.AltLabelQVs());
+        EXPECT_EQ(protoView.IPD(),           view.IPD());
+    }
+
+    {   // s1 - FORWARD
+
+        EXPECT_TRUE(s1.IsMapped());
+        EXPECT_EQ(Strand::FORWARD, s1.AlignedStrand());
+        EXPECT_EQ(504,   s1.QueryStart());         // new queryStart
+        EXPECT_EQ(509,   s1.QueryEnd());           // queryStart + new seqLength
+        EXPECT_EQ(504,   s1.AlignedStart());       // queryStart (no soft clips remaining)
+        EXPECT_EQ(509,   s1.AlignedEnd());         // alignStart + new seqLength
+        EXPECT_EQ(clipStart, s1.ReferenceStart()); // clipStart
+        EXPECT_EQ(clipEnd,   s1.ReferenceEnd());   // clipEnd
+
+        EXPECT_EQ(s1_cigar_clipped, s1.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s1,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(s1_seq_clipped,      view.Sequence());
+        EXPECT_EQ(s1_quals_clipped,    view.Qualities().Fastq());
+        EXPECT_EQ(s1_tagBases_clipped, view.DeletionTags());
+        EXPECT_EQ(s1_tagQuals_clipped, view.DeletionQVs().Fastq());
+        EXPECT_EQ(s1_tagQuals_clipped, view.LabelQVs().Fastq());
+        EXPECT_EQ(s1_tagQuals_clipped, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(s1_frames_clipped,   view.IPD().Data());
+    }
+
+    {   // s1 - REVERSE
+
+        EXPECT_TRUE(s1_rev.IsMapped());
+        EXPECT_EQ(Strand::REVERSE, s1_rev.AlignedStrand());
+        EXPECT_EQ(506,   s1_rev.QueryStart());         // new queryStart
+        EXPECT_EQ(511,   s1_rev.QueryEnd());           // queryStart + new seqLength
+        EXPECT_EQ(506,   s1_rev.AlignedStart());       // queryStart (no soft clips remaining)
+        EXPECT_EQ(511,   s1_rev.AlignedEnd());         // alignStart + new seqLength
+        EXPECT_EQ(clipStart, s1_rev.ReferenceStart()); // clipStart
+        EXPECT_EQ(clipEnd,   s1_rev.ReferenceEnd());   // clipEnd
+
+        EXPECT_EQ(s1_cigar_clipped, s1_rev.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s1_rev,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(s1_seq_rev_clipped,      view.Sequence());
+        EXPECT_EQ(s1_quals_rev_clipped,    view.Qualities().Fastq());
+        EXPECT_EQ(s1_tagBases_rev_clipped, view.DeletionTags());
+        EXPECT_EQ(s1_tagQuals_rev_clipped, view.DeletionQVs().Fastq());
+        EXPECT_EQ(s1_tagQuals_rev_clipped, view.LabelQVs().Fastq());
+        EXPECT_EQ(s1_tagQuals_rev_clipped, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(s1_frames_rev_clipped,   view.IPD().Data());
+    }
+
+    {   // s2 - FORWARD
+
+        EXPECT_TRUE(s2.IsMapped());
+        EXPECT_EQ(Strand::FORWARD, s2.AlignedStrand());
+        EXPECT_EQ(504, s2.QueryStart());
+        EXPECT_EQ(507, s2.QueryEnd());
+        EXPECT_EQ(504, s2.AlignedStart());     // queryStart (no soft clips)
+        EXPECT_EQ(507, s2.AlignedEnd());       // alignStart + seqLength
+        EXPECT_EQ(clipStart, s2.ReferenceStart());   // clipStart
+        EXPECT_EQ(clipEnd,   s2.ReferenceEnd());     // clipEnd
+
+        EXPECT_EQ(s2_cigar_clipped, s2.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s2,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(s2_seq_clipped,      view.Sequence());
+        EXPECT_EQ(s2_quals_clipped,    view.Qualities().Fastq());
+        EXPECT_EQ(s2_tagBases_clipped, view.DeletionTags());
+        EXPECT_EQ(s2_tagQuals_clipped, view.DeletionQVs().Fastq());
+        EXPECT_EQ(s2_tagQuals_clipped, view.LabelQVs().Fastq());
+        EXPECT_EQ(s2_tagQuals_clipped, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(s2_frames_clipped,   view.IPD().Data());
+    }
+
+    {   // s2 - REVERSE
+
+        EXPECT_TRUE(s2_rev.IsMapped());
+        EXPECT_EQ(Strand::REVERSE, s2_rev.AlignedStrand());
+        EXPECT_EQ(508,   s2_rev.QueryStart());         // new queryStart
+        EXPECT_EQ(511,   s2_rev.QueryEnd());           // queryStart + new seqLength
+        EXPECT_EQ(508,   s2_rev.AlignedStart());       // queryStart (no soft clips remaining)
+        EXPECT_EQ(511,   s2_rev.AlignedEnd());         // alignStart + new seqLength
+        EXPECT_EQ(clipStart, s2_rev.ReferenceStart()); // clipStart
+        EXPECT_EQ(clipEnd,   s2_rev.ReferenceEnd());   // clipEnd
+
+        EXPECT_EQ(s2_cigar_clipped, s2_rev.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s2_rev,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(s2_seq_rev_clipped,      view.Sequence());
+        EXPECT_EQ(s2_quals_rev_clipped,    view.Qualities().Fastq());
+        EXPECT_EQ(s2_tagBases_rev_clipped, view.DeletionTags());
+        EXPECT_EQ(s2_tagQuals_rev_clipped, view.DeletionQVs().Fastq());
+        EXPECT_EQ(s2_tagQuals_rev_clipped, view.LabelQVs().Fastq());
+        EXPECT_EQ(s2_tagQuals_rev_clipped, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(s2_frames_rev_clipped,   view.IPD().Data());
+    }
+
+    {   // s3 - FORWARD
+        EXPECT_TRUE(s3.IsMapped());
+        EXPECT_EQ(Strand::FORWARD, s3.AlignedStrand());
+        EXPECT_EQ(504, s3.QueryStart());
+        EXPECT_EQ(508, s3.QueryEnd());
+        EXPECT_EQ(504, s3.AlignedStart());     // queryStart (no soft clips)
+        EXPECT_EQ(508, s3.AlignedEnd());       // alignStart + seqLength
+        EXPECT_EQ(clipStart, s3.ReferenceStart());   // clipStart
+        EXPECT_EQ(clipEnd,   s3.ReferenceEnd());     // clipEnd
+
+        EXPECT_EQ(s3_cigar_clipped, s3.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s3,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(s3_seq_clipped,      view.Sequence());
+        EXPECT_EQ(s3_quals_clipped,    view.Qualities().Fastq());
+        EXPECT_EQ(s3_tagBases_clipped, view.DeletionTags());
+        EXPECT_EQ(s3_tagQuals_clipped, view.DeletionQVs().Fastq());
+        EXPECT_EQ(s3_tagQuals_clipped, view.LabelQVs().Fastq());
+        EXPECT_EQ(s3_tagQuals_clipped, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(s3_frames_clipped,   view.IPD().Data());
+    }
+
+    {   // s3 - REVERSE
+        EXPECT_TRUE(s3_rev.IsMapped());
+        EXPECT_EQ(Strand::REVERSE, s3_rev.AlignedStrand());
+        EXPECT_EQ(507,   s3_rev.QueryStart());         // new queryStart
+        EXPECT_EQ(511,   s3_rev.QueryEnd());           // queryStart + new seqLength
+        EXPECT_EQ(507,   s3_rev.AlignedStart());       // queryStart (no soft clips remaining)
+        EXPECT_EQ(511,   s3_rev.AlignedEnd());         // alignStart + new seqLength
+        EXPECT_EQ(clipStart, s3_rev.ReferenceStart()); // clipStart
+        EXPECT_EQ(clipEnd,   s3_rev.ReferenceEnd());   // clipEnd
+
+        EXPECT_EQ(s3_cigar_clipped, s3_rev.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s3_rev,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(s3_seq_rev_clipped,      view.Sequence());
+        EXPECT_EQ(s3_quals_rev_clipped,    view.Qualities().Fastq());
+        EXPECT_EQ(s3_tagBases_rev_clipped, view.DeletionTags());
+        EXPECT_EQ(s3_tagQuals_rev_clipped, view.DeletionQVs().Fastq());
+        EXPECT_EQ(s3_tagQuals_rev_clipped, view.LabelQVs().Fastq());
+        EXPECT_EQ(s3_tagQuals_rev_clipped, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(s3_frames_rev_clipped,   view.IPD().Data());
+    }
+}
+
+TEST(BamRecordClippingTest, ClippedToQueryCopy)
+{
+    const Position qStart = 500;
+    const Position qEnd   = 510;
+    const int32_t  tId     = 0;
+    const Position tPos    = 100;
+    const uint8_t  mapQual = 80;
+    const Position clipStart = 502;
+    const Position clipEnd   = 509;
+
+    const std::string seq      = "AACCGTTAGC";
+    const std::string quals    = "?]?]?]?]?*";
+    const std::string tagBases = "AACCGTTAGC";
+    const std::string tagQuals = "?]?]?]?]?*";
+    const f_data frames   = { 10, 10, 20, 20, 30, 40, 40, 10, 30, 20 };
+
+    const std::string seq_clipped      = "CCGTTAG";
+    const std::string quals_clipped    = "?]?]?]?";
+    const std::string tagBases_clipped = "CCGTTAG";
+    const std::string tagQuals_clipped = "?]?]?]?";
+    const f_data frames_clipped   = { 20, 20, 30, 40, 40, 10, 30 };
+
+    const std::string s3_cigar = "4=1D2I2D4=";
+    const std::string s3_cigar_clipped = "2=1D2I2D3=";
+
+    BamRecord prototype = BamRecordClippingTests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames,
+                                            seq, tagBases, tagQuals, frames);
+    prototype.Map(tId, tPos, Strand::FORWARD, s3_cigar, mapQual);
+
+    BamRecord s3 = prototype.Clipped(ClipType::CLIP_TO_QUERY, clipStart, clipEnd);
+
+    EXPECT_TRUE(s3.IsMapped());
+    EXPECT_EQ(Strand::FORWARD, s3.AlignedStrand());
+    EXPECT_EQ(clipStart, s3.QueryStart());
+    EXPECT_EQ(clipEnd,   s3.QueryEnd());
+    EXPECT_EQ(clipStart, s3.AlignedStart());     // queryStart (no soft clips)
+    EXPECT_EQ(clipEnd,   s3.AlignedEnd());       // alignStart + seqLength
+    EXPECT_EQ(102, s3.ReferenceStart());         // 100 + startOffset
+    EXPECT_EQ(110, s3.ReferenceEnd());           // RefStart + 5= + 3D
+
+    EXPECT_EQ(s3_cigar_clipped, s3.CigarData().ToStdString());
+
+    const BamRecordView view
+    {
+        s3,
+        Orientation::GENOMIC,
+        false,
+        false,
+        PulseBehavior::ALL
+    };
+
+    EXPECT_EQ(seq_clipped,      view.Sequence());
+    EXPECT_EQ(quals_clipped,    view.Qualities().Fastq());
+    EXPECT_EQ(tagBases_clipped, view.DeletionTags());
+    EXPECT_EQ(tagQuals_clipped, view.DeletionQVs().Fastq());
+    EXPECT_EQ(tagQuals_clipped, view.LabelQVs().Fastq());
+    EXPECT_EQ(tagQuals_clipped, view.AltLabelQVs().Fastq());
+    EXPECT_EQ(frames_clipped,   view.IPD().Data());
+}
+
+TEST(BamRecordClippingTest, ClippedToReferenceCopy)
+{
+    const Position qStart = 500;
+    const Position qEnd   = 510;
+    const std::string seq      = "AACCGTTAGC";
+    const std::string quals    = "?]?]?]?]?*";
+    const std::string tagBases = "AACCGTTAGC";
+    const std::string tagQuals = "?]?]?]?]?*";
+    const f_data frames   = { 10, 10, 20, 20, 30, 40, 40, 10, 30, 20 };
+    const int32_t  tId     = 0;
+    const Position tPos    = 100;
+    const uint8_t  mapQual = 80;
+    const Position clipStart = 102;
+    const Position clipEnd   = 107;
+
+    const std::string s3_cigar = "4=1D2I2D4=";
+    const std::string s3_cigar_clipped = "2=1D2I2D";
+    const std::string s3_seq_clipped      = "CCGT";
+    const std::string s3_quals_clipped    = "?]?]";
+    const std::string s3_tagBases_clipped = s3_seq_clipped;
+    const std::string s3_tagQuals_clipped = s3_quals_clipped;
+    const f_data s3_frames_clipped   = { 20, 20, 30, 40 };
+
+    BamRecord prototype = BamRecordClippingTests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames,
+                                            seq, tagBases, tagQuals, frames);
+    prototype.Map(tId, tPos, Strand::FORWARD, s3_cigar, mapQual);
+
+    const BamRecord s3 = prototype.Clipped(ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd);
+
+    // s3 - FORWARD
+    EXPECT_TRUE(s3.IsMapped());
+    EXPECT_EQ(Strand::FORWARD, s3.AlignedStrand());
+    EXPECT_EQ(502, s3.QueryStart());
+    EXPECT_EQ(506, s3.QueryEnd());
+    EXPECT_EQ(502, s3.AlignedStart());     // queryStart (no soft clips)
+    EXPECT_EQ(506, s3.AlignedEnd());       // alignStart + seqLength
+    EXPECT_EQ(clipStart, s3.ReferenceStart());   // clipStart
+    EXPECT_EQ(clipEnd,   s3.ReferenceEnd());     // clipEnd
+
+    EXPECT_EQ(s3_cigar_clipped, s3.CigarData().ToStdString());
+
+    const BamRecordView view
+    {
+        s3,
+        Orientation::GENOMIC,
+        false,
+        false,
+        PulseBehavior::ALL
+    };
+
+    EXPECT_EQ(s3_seq_clipped,      view.Sequence());
+    EXPECT_EQ(s3_quals_clipped,    view.Qualities().Fastq());
+    EXPECT_EQ(s3_tagBases_clipped, view.DeletionTags());
+    EXPECT_EQ(s3_tagQuals_clipped, view.DeletionQVs().Fastq());
+    EXPECT_EQ(s3_tagQuals_clipped, view.LabelQVs().Fastq());
+    EXPECT_EQ(s3_tagQuals_clipped, view.AltLabelQVs().Fastq());
+    EXPECT_EQ(s3_frames_clipped,   view.IPD().Data());
+}
+
+TEST(BamRecordClippingTest, StaticClippedToQuery)
+{
+    const Position qStart = 500;
+    const Position qEnd   = 510;
+    const int32_t  tId     = 0;
+    const Position tPos    = 100;
+    const uint8_t  mapQual = 80;
+    const Position clipStart = 502;
+    const Position clipEnd   = 509;
+
+    const std::string seq      = "AACCGTTAGC";
+    const std::string quals    = "?]?]?]?]?*";
+    const std::string tagBases = "AACCGTTAGC";
+    const std::string tagQuals = "?]?]?]?]?*";
+    const f_data frames   = { 10, 10, 20, 20, 30, 40, 40, 10, 30, 20 };
+
+    const std::string seq_clipped      = "CCGTTAG";
+    const std::string quals_clipped    = "?]?]?]?";
+    const std::string tagBases_clipped = "CCGTTAG";
+    const std::string tagQuals_clipped = "?]?]?]?";
+    const f_data frames_clipped   = { 20, 20, 30, 40, 40, 10, 30 };
+
+    const std::string s3_cigar = "4=1D2I2D4=";
+    const std::string s3_cigar_clipped = "2=1D2I2D3=";
+
+    BamRecord prototype = BamRecordClippingTests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames,
+                                            seq, tagBases, tagQuals, frames);
+    prototype.Map(tId, tPos, Strand::FORWARD, s3_cigar, mapQual);
+
+    const BamRecord s3 = BamRecord::Clipped(prototype, ClipType::CLIP_TO_QUERY, clipStart, clipEnd);
+
+    EXPECT_TRUE(s3.IsMapped());
+    EXPECT_EQ(Strand::FORWARD, s3.AlignedStrand());
+    EXPECT_EQ(clipStart, s3.QueryStart());
+    EXPECT_EQ(clipEnd,   s3.QueryEnd());
+    EXPECT_EQ(clipStart, s3.AlignedStart());     // queryStart (no soft clips)
+    EXPECT_EQ(clipEnd,   s3.AlignedEnd());       // alignStart + seqLength
+    EXPECT_EQ(102, s3.ReferenceStart());         // 100 + startOffset
+    EXPECT_EQ(110, s3.ReferenceEnd());           // RefStart + 5= + 3D
+
+    EXPECT_EQ(s3_cigar_clipped, s3.CigarData().ToStdString());
+
+    const BamRecordView view
+    {
+        s3,
+        Orientation::GENOMIC,
+        false,
+        false,
+        PulseBehavior::ALL
+    };
+
+    EXPECT_EQ(seq_clipped,      view.Sequence());
+    EXPECT_EQ(quals_clipped,    view.Qualities().Fastq());
+    EXPECT_EQ(tagBases_clipped, view.DeletionTags());
+    EXPECT_EQ(tagQuals_clipped, view.DeletionQVs().Fastq());
+    EXPECT_EQ(tagQuals_clipped, view.LabelQVs().Fastq());
+    EXPECT_EQ(tagQuals_clipped, view.AltLabelQVs().Fastq());
+    EXPECT_EQ(frames_clipped,   view.IPD().Data());
+}
+
+TEST(BamRecordClippingTest, StaticClippedToReference)
+{
+    const Position qStart = 500;
+    const Position qEnd   = 510;
+    const std::string seq      = "AACCGTTAGC";
+    const std::string quals    = "?]?]?]?]?*";
+    const std::string tagBases = "AACCGTTAGC";
+    const std::string tagQuals = "?]?]?]?]?*";
+    const f_data frames   = { 10, 10, 20, 20, 30, 40, 40, 10, 30, 20 };
+    const int32_t  tId     = 0;
+    const Position tPos    = 100;
+    const uint8_t  mapQual = 80;
+    const Position clipStart = 102;
+    const Position clipEnd   = 107;
+
+    const std::string s3_cigar = "4=1D2I2D4=";
+    const std::string s3_cigar_clipped = "2=1D2I2D";
+    const std::string s3_seq_clipped      = "CCGT";
+    const std::string s3_quals_clipped    = "?]?]";
+    const std::string s3_tagBases_clipped = s3_seq_clipped;
+    const std::string s3_tagQuals_clipped = s3_quals_clipped;
+    const f_data s3_frames_clipped   = { 20, 20, 30, 40 };
+
+    BamRecord prototype = BamRecordClippingTests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames,
+                                            seq, tagBases, tagQuals, frames);
+    prototype.Map(tId, tPos, Strand::FORWARD, s3_cigar, mapQual);
+
+    const BamRecord s3 = BamRecord::Clipped(prototype, ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd);
+
+    // s3 - FORWARD
+    EXPECT_TRUE(s3.IsMapped());
+    EXPECT_EQ(Strand::FORWARD, s3.AlignedStrand());
+    EXPECT_EQ(502, s3.QueryStart());
+    EXPECT_EQ(506, s3.QueryEnd());
+    EXPECT_EQ(502, s3.AlignedStart());     // queryStart (no soft clips)
+    EXPECT_EQ(506, s3.AlignedEnd());       // alignStart + seqLength
+    EXPECT_EQ(clipStart, s3.ReferenceStart());   // clipStart
+    EXPECT_EQ(clipEnd,   s3.ReferenceEnd());     // clipEnd
+
+    EXPECT_EQ(s3_cigar_clipped, s3.CigarData().ToStdString());
+
+    const BamRecordView view
+    {
+        s3,
+        Orientation::GENOMIC,
+        false,
+        false,
+        PulseBehavior::ALL
+    };
+
+    EXPECT_EQ(s3_seq_clipped,      view.Sequence());
+    EXPECT_EQ(s3_quals_clipped,    view.Qualities().Fastq());
+    EXPECT_EQ(s3_tagBases_clipped, view.DeletionTags());
+    EXPECT_EQ(s3_tagQuals_clipped, view.DeletionQVs().Fastq());
+    EXPECT_EQ(s3_tagQuals_clipped, view.LabelQVs().Fastq());
+    EXPECT_EQ(s3_tagQuals_clipped, view.AltLabelQVs().Fastq());
+    EXPECT_EQ(s3_frames_clipped,   view.IPD().Data());
+}
+
+TEST(BamRecordTest, ClipCigarData)
+{
+    const Position qStart = 500;
+    const Position qEnd   = 515;
+    const std::string seq      = "TTAACCGTTAGCAAA";
+    const std::string quals    = "--?]?]?]?]?*+++";
+    const std::string tagBases = "TTAACCGTTAGCAAA";
+    const std::string tagQuals = "--?]?]?]?]?*+++";
+    const f_data frames   = { 40, 40, 10, 10, 20, 20, 30, 40, 40, 10, 30, 20, 10, 10, 10 };
+    const uint8_t mapQual = 80;
+    BamRecord s3 = BamRecordClippingTests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames,
+                                     seq, tagBases, tagQuals, frames);
+    BamRecord s3_rev = BamRecordClippingTests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames,
+                                         seq, tagBases, tagQuals, frames);
+
+    const std::string s3_cigar = "5H2S4=1D2I2D4=3S7H";
+    s3.Map(0, 100, Strand::FORWARD, s3_cigar, mapQual);
+    s3_rev.Map(0, 100, Strand::REVERSE, s3_cigar, mapQual);
+
+    const Cigar s3_cigar_raw     = s3.CigarData();
+    const Cigar s3_cigar_clipped = s3.CigarData(true);
+
+    EXPECT_EQ(s3_cigar, s3_cigar_raw.ToStdString());
+    EXPECT_EQ(std::string("4=1D2I2D4="), s3_cigar_clipped.ToStdString());
+}
+
+TEST(BamRecordTest, CCS_ClipToQuery)
+{
+    const int32_t  tId     = 0;
+    const Position tPos    = 100;
+    const uint8_t  mapQual = 80;
+    const Position clipStart = 2;
+    const Position clipEnd   = 9;
+
+    const std::string seq      = "AACCGTTAGC";
+    const std::string quals    = "?]?]?]?]?*";
+    const std::string tagBases = "AACCGTTAGC";
+    const std::string tagQuals = "?]?]?]?]?*";
+    const f_data frames   = { 10, 10, 20, 20, 30, 40, 40, 10, 30, 20 };
+
+    const std::string seq_clipped      = "CCGTTAG";
+    const std::string quals_clipped    = "?]?]?]?";
+    const std::string tagBases_clipped = "CCGTTAG";
+    const std::string tagQuals_clipped = "?]?]?]?";
+    const f_data frames_clipped   = { 20, 20, 30, 40, 40, 10, 30 };
+
+    const std::string s3_cigar = "4=1D2I2D4=";
+    const std::string s3_cigar_clipped = "2=1D2I2D3=";
+
+    BamRecord prototype = BamRecordClippingTests::MakeCCSRecord(seq, quals, tagBases, tagQuals, frames,
+                                               seq, tagBases, tagQuals, frames);
+    prototype.Map(tId, tPos, Strand::FORWARD, s3_cigar, mapQual);
+
+    BamRecord s3 = prototype.Clipped(ClipType::CLIP_TO_QUERY, clipStart, clipEnd);
+
+    EXPECT_TRUE(s3.IsMapped());
+    EXPECT_EQ(Strand::FORWARD, s3.AlignedStrand());
+    EXPECT_EQ(0,   s3.AlignedStart());     // record start (no soft clips)
+    EXPECT_EQ(7,   s3.AlignedEnd());       // alignStart + clipped seqLength
+    EXPECT_EQ(102, s3.ReferenceStart());         // 100 + startOffset
+    EXPECT_EQ(110, s3.ReferenceEnd());           // RefStart + 5= + 3D
+
+    EXPECT_EQ(s3_cigar_clipped, s3.CigarData().ToStdString());
+
+    const BamRecordView view
+    {
+        s3,
+        Orientation::GENOMIC,
+        false,
+        false,
+        PulseBehavior::ALL
+    };
+
+    EXPECT_EQ(seq_clipped,      view.Sequence());
+    EXPECT_EQ(quals_clipped,    view.Qualities().Fastq());
+    EXPECT_EQ(tagBases_clipped, view.DeletionTags());
+    EXPECT_EQ(tagQuals_clipped, view.DeletionQVs().Fastq());
+    EXPECT_EQ(tagQuals_clipped, view.LabelQVs().Fastq());
+    EXPECT_EQ(tagQuals_clipped, view.AltLabelQVs().Fastq());
+    EXPECT_EQ(frames_clipped,   view.IPD().Data());
+}
+
+TEST(BamRecordTest, CCS_ClipToReference)
+{
+    const std::string seq      = "AACCGTTAGC";
+    const std::string quals    = "?]?]?]?]?*";
+    const std::string tagBases = "AACCGTTAGC";
+    const std::string tagQuals = "?]?]?]?]?*";
+    const f_data frames   = { 10, 10, 20, 20, 30, 40, 40, 10, 30, 20 };
+    const int32_t  tId     = 0;
+    const Position tPos    = 100;
+    const uint8_t  mapQual = 80;
+    const Position clipStart = 102;
+    const Position clipEnd   = 107;
+
+    const std::string s3_cigar = "4=1D2I2D4=";
+    const std::string s3_cigar_clipped = "2=1D2I2D";
+    const std::string s3_seq_clipped      = "CCGT";
+    const std::string s3_quals_clipped    = "?]?]";
+    const std::string s3_tagBases_clipped = s3_seq_clipped;
+    const std::string s3_tagQuals_clipped = s3_quals_clipped;
+    const f_data s3_frames_clipped   = { 20, 20, 30, 40 };
+
+    BamRecord prototype = BamRecordClippingTests::MakeCCSRecord(seq, quals, tagBases, tagQuals, frames,
+                                               seq, tagBases, tagQuals, frames);
+    prototype.Map(tId, tPos, Strand::FORWARD, s3_cigar, mapQual);
+
+    const BamRecord s3 = BamRecord::Clipped(prototype, ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd);
+
+    EXPECT_TRUE(s3.IsMapped());
+    EXPECT_EQ(Strand::FORWARD, s3.AlignedStrand());
+    EXPECT_EQ(0, s3.AlignedStart());     // record tart (no soft clips)
+    EXPECT_EQ(4, s3.AlignedEnd());       // alignStart + clipped seqLength (4)
+    EXPECT_EQ(clipStart, s3.ReferenceStart());   // clipStart
+    EXPECT_EQ(clipEnd,   s3.ReferenceEnd());     // clipEnd
+
+    EXPECT_EQ(s3_cigar_clipped, s3.CigarData().ToStdString());
+
+    const BamRecordView view
+    {
+        s3,
+        Orientation::GENOMIC,
+        false,
+        false,
+        PulseBehavior::ALL
+    };
+
+    EXPECT_EQ(s3_seq_clipped,      view.Sequence());
+    EXPECT_EQ(s3_quals_clipped,    view.Qualities().Fastq());
+    EXPECT_EQ(s3_tagBases_clipped, view.DeletionTags());
+    EXPECT_EQ(s3_tagQuals_clipped, view.DeletionQVs().Fastq());
+    EXPECT_EQ(s3_tagQuals_clipped, view.LabelQVs().Fastq());
+    EXPECT_EQ(s3_tagQuals_clipped, view.AltLabelQVs().Fastq());
+    EXPECT_EQ(s3_frames_clipped,   view.IPD().Data());
+}
+
+TEST(BamRecordTest, ClipEncodedFrames)
+{
+    const Position qStart  = 500;
+    const Position qEnd    = 510;
+    const std::string seq       = "AACCGTTAGC";
+    const std::string quals     = "?]?]?]?]?*";
+    const std::string tagBases  = "AACCGTTAGC";
+    const std::string tagQuals  = "?]?]?]?]?*";
+    const f_data frames    = { 10, 10, 20, 20, 30, 40, 40, 10, 30, 20 };
+
+    const std::string pulseCall   = "ttAaAtaCCGggatTTAcatGCt";
+    const std::string pulseBases  = pulseCall;
+    const std::string pulseQuals  = "==?=]==?]?====]?]===?*=";
+    const f_data pulseFrames = { 0,0,10,0,10,0,0,20,20,30,0,0,0,0,40,40,10,0,0,0,30,20,0 };
+
+    const int32_t  tId     = 0;
+    const Position tPos    = 100;
+    const uint8_t  mapQual = 80;
+
+    const Position clipStart = 502;
+    const Position clipEnd   = 509;
+
+    const std::string seq_clipped      = "CCGTTAG";
+    const std::string quals_clipped    = "?]?]?]?";
+    const std::string tagBases_clipped = "CCGTTAG";
+    const std::string tagQuals_clipped = "?]?]?]?";
+    const f_data frames_clipped   = { 20, 20, 30, 40, 40, 10, 30 };
+
+    const std::string pulseCall_clipped = "CCGggatTTAcatG";
+    const std::string pulseQuals_clipped = "?]?====]?]===?";
+    const f_data pulseFrames_clipped = { 20,20,30,0,0,0,0,40,40,10,0,0,0,30 };
+
+    const std::string seq_rev       = "GCTAACGGTT";
+    const std::string pulseCall_rev = "aGCatgTAAatccCGGtaTtTaa";
+    const std::string quals_rev     = "*?]?]?]?]?";
+    const std::string tagQuals_rev  = quals_rev;
+    const f_data frames_rev    = { 20, 30, 10, 40, 40, 30, 20, 20, 10, 10 };
+
+    const std::string seq_rev_clipped   = "CTAACGG";
+    const std::string quals_rev_clipped = "?]?]?]?";
+    const std::string tagBases_rev_clipped = seq_rev_clipped;
+    const std::string tagQuals_rev_clipped = quals_rev_clipped;
+    const f_data frames_rev_clipped = { 30, 10, 40, 40, 30, 20, 20 };
+
+    const std::string pulseCall_rev_clipped = "CatgTAAatccCGG";
+    const std::string pulseQuals_rev_clipped    = "?===]?]====?]?";
+    const f_data pulseFrames_rev_clipped = { 30,0,0,0,10,40,40,0,0,0,0,30,20,20 };
+
+    const std::string s1_cigar = "10=";
+    const std::string s2_cigar = "5=3D5=";
+    const std::string s3_cigar = "4=1D2I2D4=";
+
+    const std::string s1_cigar_clipped = "7=";
+    const std::string s2_cigar_clipped = "3=3D4=";
+    const std::string s3_cigar_clipped = "2=1D2I2D3=";
+
+    const std::string s1_cigar_rev_clipped = "7=";
+    const std::string s2_cigar_rev_clipped = "4=3D3=";
+    const std::string s3_cigar_rev_clipped = "3=1D2I2D2=";
+
+    const BamRecord prototype = BamRecordClippingTests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames,
+                                                  pulseCall, pulseBases, pulseQuals, pulseFrames, FrameCodec::V1);
+
+    BamRecord s0 = prototype; // unmapped record
+    BamRecord s1 = prototype.Mapped(tId, tPos, Strand::FORWARD, s1_cigar, mapQual);
+    BamRecord s2 = prototype.Mapped(tId, tPos, Strand::FORWARD, s2_cigar, mapQual);
+    BamRecord s3 = prototype.Mapped(tId, tPos, Strand::FORWARD, s3_cigar, mapQual);
+    BamRecord s1_rev = prototype.Mapped(tId, tPos, Strand::REVERSE, s1_cigar, mapQual);
+    BamRecord s2_rev = prototype.Mapped(tId, tPos, Strand::REVERSE, s2_cigar, mapQual);
+    BamRecord s3_rev = prototype.Mapped(tId, tPos, Strand::REVERSE, s3_cigar, mapQual);
+
+    s0.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd);
+    s1.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd);
+    s2.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd);
+    s3.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd);
+    s1_rev.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd);
+    s2_rev.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd);
+    s3_rev.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd);
+
+    {   // s0
+
+        EXPECT_FALSE(s0.IsMapped());
+        EXPECT_EQ(clipStart, s0.QueryStart());
+        EXPECT_EQ(clipEnd,   s0.QueryEnd());
+        EXPECT_EQ(PacBio::BAM::UnmappedPosition, s0.AlignedStart());
+        EXPECT_EQ(PacBio::BAM::UnmappedPosition, s0.AlignedEnd());
+        EXPECT_EQ(PacBio::BAM::UnmappedPosition, s0.ReferenceStart());
+        EXPECT_EQ(PacBio::BAM::UnmappedPosition, s0.ReferenceEnd());
+
+        const BamRecordView view
+        {
+            s0,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(seq_clipped,       view.Sequence());
+        EXPECT_EQ(quals_clipped,     view.Qualities().Fastq());
+        EXPECT_EQ(tagBases_clipped,  view.DeletionTags());
+        EXPECT_EQ(tagQuals_clipped,  view.DeletionQVs().Fastq());
+        EXPECT_EQ(pulseQuals_clipped, view.LabelQVs().Fastq());
+        EXPECT_EQ(pulseQuals_clipped, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames_clipped,    view.IPD().Data());
+        EXPECT_EQ(pulseCall_clipped, view.PulseCalls());
+    }
+
+    {   // s1 - FORWARD
+
+        EXPECT_TRUE(s1.IsMapped());
+        EXPECT_EQ(Strand::FORWARD, s1.AlignedStrand());
+        EXPECT_EQ(clipStart, s1.QueryStart());
+        EXPECT_EQ(clipEnd,   s1.QueryEnd());
+        EXPECT_EQ(clipStart, s1.AlignedStart());   // queryStart (no soft clips)
+        EXPECT_EQ(clipEnd,   s1.AlignedEnd());     // alignStart + seqLength
+        EXPECT_EQ(102, s1.ReferenceStart());       // 100 + startOffset
+        EXPECT_EQ(109, s1.ReferenceEnd());         // RefStart + 7=
+
+        EXPECT_EQ(s1_cigar_clipped, s1.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s1,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(seq_clipped,       view.Sequence());
+        EXPECT_EQ(quals_clipped,     view.Qualities().Fastq());
+        EXPECT_EQ(tagBases_clipped,  view.DeletionTags());
+        EXPECT_EQ(tagQuals_clipped,  view.DeletionQVs().Fastq());
+        EXPECT_EQ(pulseQuals_clipped, view.LabelQVs().Fastq());
+        EXPECT_EQ(pulseQuals_clipped, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames_clipped,    view.IPD().Data());
+        EXPECT_EQ(pulseCall_clipped, view.PulseCalls());
+    }
+
+    {   // s1 - REVERSE
+
+        EXPECT_TRUE(s1_rev.IsMapped());
+        EXPECT_EQ(Strand::REVERSE, s1_rev.AlignedStrand());
+        EXPECT_EQ(clipStart, s1_rev.QueryStart());
+        EXPECT_EQ(clipEnd,   s1_rev.QueryEnd());
+        EXPECT_EQ(clipStart, s1_rev.AlignedStart());    // queryStart (no soft clips)
+        EXPECT_EQ(clipEnd,   s1_rev.AlignedEnd());      // alignStart + seqLength
+        EXPECT_EQ(102, s1_rev.ReferenceStart());        // 100 + startOffset
+        EXPECT_EQ(109, s1_rev.ReferenceEnd());          // RefStart + 7=
+
+        EXPECT_EQ(s1_cigar_rev_clipped, s1_rev.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s1_rev,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(seq_rev_clipped,       view.Sequence());
+        EXPECT_EQ(quals_rev_clipped,     view.Qualities().Fastq());
+        EXPECT_EQ(tagBases_rev_clipped,  view.DeletionTags());
+        EXPECT_EQ(tagQuals_rev_clipped,  view.DeletionQVs().Fastq());
+        EXPECT_EQ(pulseQuals_rev_clipped,  view.LabelQVs().Fastq());
+        EXPECT_EQ(pulseQuals_rev_clipped,  view.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames_rev_clipped,    view.IPD().Data());
+        EXPECT_EQ(pulseCall_rev_clipped, view.PulseCalls());
+    }
+
+    {   // s2 - FORWARD
+
+        EXPECT_TRUE(s2.IsMapped());
+        EXPECT_EQ(Strand::FORWARD, s2.AlignedStrand());
+        EXPECT_EQ(clipStart, s2.QueryStart());
+        EXPECT_EQ(clipEnd,   s2.QueryEnd());
+        EXPECT_EQ(clipStart, s2.AlignedStart());   // queryStart (no soft clips)
+        EXPECT_EQ(clipEnd,   s2.AlignedEnd());     // alignStart + seqLength
+        EXPECT_EQ(102, s2.ReferenceStart());       // 100 + startOffset
+        EXPECT_EQ(112, s2.ReferenceEnd());         // RefStart + 7= + 3D
+
+        EXPECT_EQ(s2_cigar_clipped, s2.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s2,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(seq_clipped,      view.Sequence());
+        EXPECT_EQ(quals_clipped,    view.Qualities().Fastq());
+        EXPECT_EQ(tagBases_clipped, view.DeletionTags());
+        EXPECT_EQ(tagQuals_clipped, view.DeletionQVs().Fastq());
+        EXPECT_EQ(pulseQuals_clipped, view.LabelQVs().Fastq());
+        EXPECT_EQ(pulseQuals_clipped, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames_clipped,   view.IPD().Data());
+    }
+
+    {   // s2 - REVERSE
+
+        EXPECT_TRUE(s2_rev.IsMapped());
+        EXPECT_EQ(Strand::REVERSE, s2_rev.AlignedStrand());
+        EXPECT_EQ(clipStart, s2_rev.QueryStart());
+        EXPECT_EQ(clipEnd,   s2_rev.QueryEnd());
+        EXPECT_EQ(clipStart, s2_rev.AlignedStart());    // queryStart (no soft clips)
+        EXPECT_EQ(clipEnd,   s2_rev.AlignedEnd());      // alignStart + seqLength
+        EXPECT_EQ(102, s2_rev.ReferenceStart());        // 100 + startOffset
+        EXPECT_EQ(112, s2_rev.ReferenceEnd());          // RefStart + 7= + 3D
+
+        EXPECT_EQ(s2_cigar_rev_clipped, s2_rev.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s2_rev,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(seq_rev_clipped,       view.Sequence());
+        EXPECT_EQ(quals_rev_clipped,     view.Qualities().Fastq());
+        EXPECT_EQ(tagBases_rev_clipped,  view.DeletionTags());
+        EXPECT_EQ(tagQuals_rev_clipped,  view.DeletionQVs().Fastq());
+        EXPECT_EQ(pulseQuals_rev_clipped,  view.LabelQVs().Fastq());
+        EXPECT_EQ(pulseQuals_rev_clipped,  view.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames_rev_clipped,    view.IPD().Data());
+        EXPECT_EQ(pulseCall_rev_clipped, view.PulseCalls());
+    }
+
+    {   // s3 - FORWARD
+
+        EXPECT_TRUE(s3.IsMapped());
+        EXPECT_EQ(Strand::FORWARD, s3.AlignedStrand());
+        EXPECT_EQ(clipStart, s3.QueryStart());
+        EXPECT_EQ(clipEnd,   s3.QueryEnd());
+        EXPECT_EQ(clipStart, s3.AlignedStart());     // queryStart (no soft clips)
+        EXPECT_EQ(clipEnd,   s3.AlignedEnd());       // alignStart + seqLength
+        EXPECT_EQ(102, s3.ReferenceStart());         // 100 + startOffset
+        EXPECT_EQ(110, s3.ReferenceEnd());           // RefStart + 5= + 3D
+
+        EXPECT_EQ(s3_cigar_clipped, s3.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s3,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(seq_clipped,       view.Sequence());
+        EXPECT_EQ(quals_clipped,     view.Qualities().Fastq());
+        EXPECT_EQ(tagBases_clipped,  view.DeletionTags());
+        EXPECT_EQ(tagQuals_clipped,  view.DeletionQVs().Fastq());
+        EXPECT_EQ(pulseQuals_clipped,  view.LabelQVs().Fastq());
+        EXPECT_EQ(pulseQuals_clipped,  view.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames_clipped,    view.IPD().Data());
+        EXPECT_EQ(pulseCall_clipped, view.PulseCalls());
+    }
+
+    {   // s3 - REVERSE
+
+        EXPECT_TRUE(s3_rev.IsMapped());
+        EXPECT_EQ(Strand::REVERSE, s3_rev.AlignedStrand());
+        EXPECT_EQ(clipStart, s3_rev.QueryStart());
+        EXPECT_EQ(clipEnd,   s3_rev.QueryEnd());
+        EXPECT_EQ(clipStart, s3_rev.AlignedStart());     // queryStart (no soft clips)
+        EXPECT_EQ(clipEnd,   s3_rev.AlignedEnd());       // alignStart + seqLength
+        EXPECT_EQ(102, s3_rev.ReferenceStart());         // 100 + startOffset
+        EXPECT_EQ(110, s3_rev.ReferenceEnd());           // RefStart + 5= + 3D
+
+        EXPECT_EQ(s3_cigar_rev_clipped, s3_rev.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s3_rev,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(seq_rev_clipped,       view.Sequence());
+        EXPECT_EQ(quals_rev_clipped,     view.Qualities().Fastq());
+        EXPECT_EQ(tagBases_rev_clipped,  view.DeletionTags());
+        EXPECT_EQ(tagQuals_rev_clipped,  view.DeletionQVs().Fastq());
+        EXPECT_EQ(pulseQuals_rev_clipped,  view.LabelQVs().Fastq());
+        EXPECT_EQ(pulseQuals_rev_clipped,  view.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames_rev_clipped,    view.IPD().Data());
+        EXPECT_EQ(pulseCall_rev_clipped, view.PulseCalls());
+    }
+}
+
+TEST(BamRecordClippingTest, ExciseSoftClipsFromFramesWithDeletions)
+{   // Reads one record from a fixture BAM and compares raw vs. soft-clip-excised sequence and IPDs.
+    const std::string expectedName{"m141008_060349_42194_c100704972550000001823137703241586_s1_p0/14/2409_2745"};
+    const PacBio::BAM::Strand expectedStrand = PacBio::BAM::Strand::FORWARD;
+    const std::string expectedCigar{   // starts with 20S, ends with 28S, contains 1D and 2D deletions
+        "20S11=1I47=1I2=1I6=1I22=1I2=1I9=1I29=1D6=1I16=1I6=1I7=1I8=2I5=1I5=1I11=1I5=5I2=3I1=1I1=1I1=3I5=2D19=1I14=1I17=28S"};
+    const std::string expectedRawSeq{   // full read, 336 bases
+        "CCCCGGGATTCCTCTAGATGCATCAGGTAAGAAAAGTACGATGCTACAGCTTGTGACTGGTGCGGCACTT"
+        "TTGGCTGAGTTTATCCTGTGCCACCTCATGTATTCTGCCCTAGACAGTCGGTCTTGCACGCCATTACTAG"
+        "ACCGACAAAATGGAACCGGGGCCCTTAAACCCCGTTCGAAGGCGTAAGCAAGGAAGATAGGGTTTTATGA"
+        "AACTCTTCCCAGTCAATAATACCAAAAAAACCCCAACCAAGATCGTGACGGATTGCAGAGCGAATCCTAT"
+        "CCGCGCTCGCAATAATTTAGTGTTGATCCAAGCTTGCTGAGGACTAGTAAAGCTTC"};
+    const std::string expectedClippedSeq{   // 288 bases: the raw read minus 20 leading and 28 trailing bases
+        "CATCAGGTAAGAAAAGTACGATGCTACAGCTTGTGACTGGTGCGGCACTTTTGGCTGAGTTTATCCTGTG"
+        "CCACCTCATGTATTCTGCCCTAGACAGTCGGTCTTGCACGCCATTACTAGACCGACAAAATGGAACCGGG"
+        "GCCCTTAAACCCCGTTCGAAGGCGTAAGCAAGGAAGATAGGGTTTTATGAAACTCTTCCCAGTCAATAAT"
+        "ACCAAAAAAACCCCAACCAAGATCGTGACGGATTGCAGAGCGAATCCTATCCGCGCTCGCAATAATTTAG"
+        "TGTTGATC"};
+    const std::vector<uint8_t> expectedRawIpds{   // encoded IPD values expected for the full read
+        17,3,8,3,4,1,14,8,2,1,21,3,1,17,22,13,10,9,89,7,4,5,3,17,8,8,18,58,14,
+        25,8,5,9,1,5,0,20,16,15,9,78,19,2,20,23,12,2,5,7,3,5,61,19,12,13,6,65,
+        18,105,2,34,94,3,38,69,16,5,76,1,21,5,3,2,0,32,23,26,9,3,4,18,2,2,12,19,
+        33,63,11,4,25,3,7,7,3,26,48,28,34,1,2,6,31,17,29,68,5,20,79,6,12,10,3,
+        43,72,21,65,8,45,17,14,13,20,7,3,5,8,0,17,11,65,6,7,8,3,6,11,4,1,80,4,
+        16,21,12,4,2,8,1,25,22,36,18,34,11,5,4,33,3,12,1,14,8,22,4,8,76,8,5,18,
+        32,5,33,47,255,36,9,26,2,6,47,0,35,8,8,0,5,37,40,1,11,8,39,60,8,42,0,3,
+        6,11,12,20,24,15,1,10,10,38,25,63,21,28,0,4,17,0,31,23,13,41,23,42,0,7,
+        33,7,23,11,50,30,2,44,21,182,44,105,231,33,255,59,189,253,17,13,7,28,40,
+        84,8,13,34,70,214,174,103,5,8,1,8,9,8,1,12,7,4,17,7,45,2,2,7,10,7,19,28,
+        31,3,18,0,42,0,8,2,9,2,1,11,25,1,35,36,1,7,5,17,12,39,8,31,1,40,41,4,18,
+        2,51,14,1,16,255,2,5,83,2,6,2,1,6,9,10,3,31,19,35,6,16,21,12,28,4,10,10,
+        12,1,105,17,2,11};
+    const std::vector<uint8_t> expectedClippedIpds{   // encoded IPD values expected after excising soft clips
+        4,5,3,17,8,8,18,58,14,25,8,5,9,1,5,0,20,16,15,9,78,19,2,20,23,12,2,5,7,
+        3,5,61,19,12,13,6,65,18,105,2,34,94,3,38,69,16,5,76,1,21,5,3,2,0,32,23,
+        26,9,3,4,18,2,2,12,19,33,63,11,4,25,3,7,7,3,26,48,28,34,1,2,6,31,17,29,
+        68,5,20,79,6,12,10,3,43,72,21,65,8,45,17,14,13,20,7,3,5,8,0,17,11,65,6,
+        7,8,3,6,11,4,1,80,4,16,21,12,4,2,8,1,25,22,36,18,34,11,5,4,33,3,12,1,14,
+        8,22,4,8,76,8,5,18,32,5,33,47,255,36,9,26,2,6,47,0,35,8,8,0,5,37,40,1,
+        11,8,39,60,8,42,0,3,6,11,12,20,24,15,1,10,10,38,25,63,21,28,0,4,17,0,31,
+        23,13,41,23,42,0,7,33,7,23,11,50,30,2,44,21,182,44,105,231,33,255,59,
+        189,253,17,13,7,28,40,84,8,13,34,70,214,174,103,5,8,1,8,9,8,1,12,7,4,17,
+        7,45,2,2,7,10,7,19,28,31,3,18,0,42,0,8,2,9,2,1,11,25,1,35,36,1,7,5,17,
+        12,39,8,31,1,40,41,4,18,2,51,14,1,16,255};
+    // Load the first record of the fixture; ASSERT aborts the test if none can be read.
+    const std::string fn{PbbamTestsConfig::Data_Dir + "/softclip_deletions.bam"};
+    BamRecord record;
+    BamReader reader{fn};
+    ASSERT_TRUE(reader.GetNext(record));
+    // Confirm the record is the one the expected values above describe.
+    EXPECT_EQ(expectedName, record.FullName());
+    EXPECT_EQ(expectedStrand, record.AlignedStrand());
+    EXPECT_EQ(expectedCigar, record.CigarData().ToStdString());
+    // Genomic-orientation sequence: default call vs. third argument set to true (presumably "excise soft clips").
+    const auto rawSeq = record.Sequence(PacBio::BAM::Orientation::GENOMIC);
+    const auto clippedSeq = record.Sequence(PacBio::BAM::Orientation::GENOMIC, false, true);
+    EXPECT_EQ(expectedRawSeq, rawSeq);
+    EXPECT_EQ(expectedClippedSeq, clippedSeq);
+    // Same raw vs. excised comparison for the IPD frames, compared in their encoded form.
+    ASSERT_TRUE(record.HasIPD());
+    const auto rawIpds = record.IPD(PacBio::BAM::Orientation::GENOMIC).Encode();
+    const auto clippedIpds = record.IPD(PacBio::BAM::Orientation::GENOMIC, false, true).Encode();
+    EXPECT_EQ(expectedRawIpds, rawIpds);
+    EXPECT_EQ(expectedClippedIpds, clippedIpds);
+}
+
+TEST(BamRecordTest, ClipToQuery_Stranded)
+{   // clip each record of clip_to_query.bam to reference, then to query; check coords & CIGAR
+    using namespace PacBio::BAM;
+
+    const std::string inputFn{PbbamTestsConfig::Data_Dir + "/clip_to_query.bam"};
+
+    bool onFirstRecord = true;
+    EntireFileQuery records{inputFn};
+    for (auto& rec : records)
+    {
+        Strand strand;
+        std::string traceLabel;
+        if (onFirstRecord) {
+            strand = Strand::FORWARD;
+            traceLabel = "First record (FORWARD strand)";
+        } else {
+            strand = Strand::REVERSE;
+            traceLabel = "Second record (REVERSE strand)";
+        }
+
+        SCOPED_TRACE(traceLabel);
+
+        // as loaded, before any clipping
+        EXPECT_EQ(2, rec.ReferenceStart());
+        EXPECT_EQ(7, rec.ReferenceEnd());
+        EXPECT_EQ(0, rec.QueryStart());
+        EXPECT_EQ(8, rec.QueryEnd());
+        EXPECT_EQ(strand, rec.AlignedStrand());
+        EXPECT_EQ("1S4=1I1=1S", rec.CigarData().ToStdString());
+
+        // clip 1 of 2: by reference coordinates
+        rec.Clip(ClipType::CLIP_TO_REFERENCE, 3, 6);
+        EXPECT_EQ(3, rec.ReferenceStart());
+        EXPECT_EQ(6, rec.ReferenceEnd());
+        EXPECT_EQ(2, rec.QueryStart());
+        EXPECT_EQ(6, rec.QueryEnd());
+        EXPECT_EQ(strand, rec.AlignedStrand());
+        EXPECT_EQ("3=1I", rec.CigarData().ToStdString());
+
+        // clip 2 of 2: by query coordinates, dropping one base (end for 1st record, start for 2nd)
+        Position clipQStart;
+        Position clipQEnd;
+        if (onFirstRecord) {
+            clipQStart = rec.QueryStart();
+            clipQEnd = rec.QueryEnd() - 1;
+        } else {
+            clipQStart = rec.QueryStart() + 1;
+            clipQEnd = rec.QueryEnd();
+        }
+        rec.Clip(ClipType::CLIP_TO_QUERY, clipQStart, clipQEnd);
+        EXPECT_EQ(3, rec.ReferenceStart());
+        EXPECT_EQ(6, rec.ReferenceEnd());
+        EXPECT_EQ(clipQStart, rec.QueryStart());
+        EXPECT_EQ(clipQEnd, rec.QueryEnd());
+        EXPECT_EQ(strand, rec.AlignedStrand());
+        EXPECT_EQ("3=", rec.CigarData().ToStdString());
+
+        onFirstRecord = false;
+    }
+}
+
+TEST(BamRecordTest, ClippingFlankingInserts_IgnoredOnClipToQuery)
+{   // CLIP_TO_QUERY is expected to keep flanking insertions even with exciseFlankingInserts set
+    const Position qStart = 500;
+    const Position qEnd   = 515;
+    const std::string seq      = "TTAACCGTTAGCAAA";
+    const std::string quals    = "--?]?]?]?]?*+++";
+    const std::string tagBases = "TTAACCGTTAGCAAA";
+    const std::string tagQuals = "--?]?]?]?]?*+++";
+    const f_data frames = { 40, 40, 10, 10, 20, 20, 30, 40, 40, 10, 30, 20, 10, 10, 10 };
+
+    const BamRecord prototype =
+        BamRecordClippingTests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals,
+                                           frames, seq, tagBases, tagQuals, frames);
+
+    {   // aligned forward
+
+        const int32_t  tId     = 0;
+        const Position tPos    = 100;
+        const uint8_t  mapQual = 80;
+        const Cigar cigar{"4I5=6I"};
+
+        BamRecord s = prototype.Mapped(tId, tPos, Strand::FORWARD, cigar, mapQual);
+        EXPECT_TRUE(s.IsMapped());
+        EXPECT_EQ(100, s.ReferenceStart());
+        EXPECT_EQ(105, s.ReferenceEnd());
+
+        const Position clipStart = 502;  // Position (signed), as with qStart/qEnd, not size_t
+        const Position clipEnd = 512;
+        const bool exciseFlankingInserts = true;
+
+        s.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd, exciseFlankingInserts);
+
+        EXPECT_TRUE(s.IsMapped());
+        EXPECT_EQ(Strand::FORWARD, s.AlignedStrand());
+        EXPECT_EQ("2I5=3I", s.CigarData().ToStdString());  // inserts shortened, not removed
+
+        EXPECT_EQ(clipStart, s.QueryStart());
+        EXPECT_EQ(clipEnd,   s.QueryEnd());
+        EXPECT_EQ(clipStart, s.AlignedStart());
+        EXPECT_EQ(clipEnd,   s.AlignedEnd());
+        EXPECT_EQ(100,       s.ReferenceStart());
+        EXPECT_EQ(105,       s.ReferenceEnd());
+    }
+    {   // aligned reverse
+
+        const int32_t  tId     = 0;
+        const Position tPos    = 100;
+        const uint8_t  mapQual = 80;
+        const Cigar cigar{"4I5=6I"};
+
+        BamRecord s = prototype.Mapped(tId, tPos, Strand::REVERSE, cigar, mapQual);
+        EXPECT_TRUE(s.IsMapped());
+        EXPECT_EQ(100, s.ReferenceStart());
+        EXPECT_EQ(105, s.ReferenceEnd());
+
+        const Position clipStart = 502;  // Position (signed), as with qStart/qEnd, not size_t
+        const Position clipEnd = 512;
+        const bool exciseFlankingInserts = true;
+
+        s.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd, exciseFlankingInserts);
+
+        EXPECT_TRUE(s.IsMapped());
+        EXPECT_EQ(Strand::REVERSE, s.AlignedStrand());
+        EXPECT_EQ("1I5=4I", s.CigarData().ToStdString());  // inserts shortened, not removed
+
+        EXPECT_EQ(clipStart, s.QueryStart());
+        EXPECT_EQ(clipEnd,   s.QueryEnd());
+        EXPECT_EQ(clipStart, s.AlignedStart());
+        EXPECT_EQ(clipEnd,   s.AlignedEnd());
+        EXPECT_EQ(100,       s.ReferenceStart());
+        EXPECT_EQ(105,       s.ReferenceEnd());
+    }
+}
+
+TEST(BamRecordTest, ClipToReference_Forward_ExciseFlankingInserts)
+{   // reference-clip a forward-strand record, with and without excising flanking insertions
+    const Position qStart = 500;
+    const Position qEnd   = 526;
+    const std::string seq      = "TTAACCGTTAGCAAATTAACCGTTAG";
+    const std::string quals    = "--?]?]?]?]?*+++--?]?]?]?]?";
+    const std::string tagBases = "TTAACCGTTAGCAAATTAACCGTTAG";
+    const std::string tagQuals = "--?]?]?]?]?*+++--?]?]?]?]?";
+    const f_data frames = {
+        40, 40, 10, 10, 20, 20, 30, 40, 40, 10,
+        30, 20, 10, 10, 10, 40, 40, 10, 10, 20,
+        20, 30, 40, 40, 10, 30 };
+
+    const BamRecord prototype =
+        BamRecordClippingTests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals,
+                                           frames, seq, tagBases, tagQuals, frames);
+
+    const int32_t  tId     = 0;
+    const Position tPos    = 100;
+    const uint8_t  mapQual = 80;
+    const Cigar cigar{"3=6I10=6I1="};
+
+    const Position clipStart = 103;  // reference coordinate, typed like tPos (was size_t)
+    const Position clipEnd = 113;
+
+    // ----------------
+    // keep inserts
+
+    bool exciseFlankingInserts = false;
+
+    BamRecord withInserts = prototype.Mapped(tId, tPos, Strand::FORWARD, cigar, mapQual);
+    EXPECT_TRUE(withInserts.IsMapped());
+    EXPECT_EQ(100, withInserts.ReferenceStart());
+    EXPECT_EQ(114, withInserts.ReferenceEnd());
+
+    withInserts.Clip(ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd, exciseFlankingInserts);
+
+    EXPECT_TRUE(withInserts.IsMapped());
+    EXPECT_EQ(Strand::FORWARD, withInserts.AlignedStrand());
+    EXPECT_EQ("6I10=6I", withInserts.CigarData().ToStdString());  // both 6I flanks retained
+
+    EXPECT_EQ(503, withInserts.QueryStart());
+    EXPECT_EQ(525, withInserts.QueryEnd());
+    EXPECT_EQ(503, withInserts.AlignedStart());
+    EXPECT_EQ(525, withInserts.AlignedEnd());
+    EXPECT_EQ(103, withInserts.ReferenceStart());
+    EXPECT_EQ(113, withInserts.ReferenceEnd());
+
+    // -----------------
+    // excise inserts
+
+    exciseFlankingInserts = true;
+
+    BamRecord withoutInserts = prototype.Mapped(tId, tPos, Strand::FORWARD, cigar, mapQual);
+    EXPECT_TRUE(withoutInserts.IsMapped());
+    EXPECT_EQ(100, withoutInserts.ReferenceStart());
+    EXPECT_EQ(114, withoutInserts.ReferenceEnd());
+
+    withoutInserts.Clip(ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd, exciseFlankingInserts);
+
+    EXPECT_TRUE(withoutInserts.IsMapped());
+    EXPECT_EQ(Strand::FORWARD, withoutInserts.AlignedStrand());
+    EXPECT_EQ("10=", withoutInserts.CigarData().ToStdString());  // both 6I flanks dropped
+
+    EXPECT_EQ(509, withoutInserts.QueryStart());
+    EXPECT_EQ(519, withoutInserts.QueryEnd());
+    EXPECT_EQ(509, withoutInserts.AlignedStart());
+    EXPECT_EQ(519, withoutInserts.AlignedEnd());
+    EXPECT_EQ(103, withoutInserts.ReferenceStart());
+    EXPECT_EQ(113, withoutInserts.ReferenceEnd());
+}
+
+TEST(BamRecordTest, ClipToReference_Reverse_ExciseFlankingInserts)
+{   // reference-clip a reverse-strand record, with and without excising flanking insertions
+    const Position qStart = 500;
+    const Position qEnd   = 526;
+    const std::string seq      = "TTAACCGTTAGCAAATTAACCGTTAG";
+    const std::string quals    = "--?]?]?]?]?*+++--?]?]?]?]?";
+    const std::string tagBases = "TTAACCGTTAGCAAATTAACCGTTAG";
+    const std::string tagQuals = "--?]?]?]?]?*+++--?]?]?]?]?";
+    const f_data frames = {
+        40, 40, 10, 10, 20, 20, 30, 40, 40, 10,
+        30, 20, 10, 10, 10, 40, 40, 10, 10, 20,
+        20, 30, 40, 40, 10, 30 };
+
+    const BamRecord prototype =
+        BamRecordClippingTests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals,
+                                           frames, seq, tagBases, tagQuals, frames);
+
+    const int32_t  tId     = 0;
+    const Position tPos    = 100;
+    const uint8_t  mapQual = 80;
+    const Cigar cigar{"3=6I10=6I1="};
+
+    const Position clipStart = 103;  // reference coordinate, typed like tPos (was size_t)
+    const Position clipEnd = 113;
+
+    // ----------------
+    // keep inserts
+
+    bool exciseFlankingInserts = false;
+
+    BamRecord withInserts = prototype.Mapped(tId, tPos, Strand::REVERSE, cigar, mapQual);
+
+    EXPECT_TRUE(withInserts.IsMapped());
+    EXPECT_EQ(100, withInserts.ReferenceStart());
+    EXPECT_EQ(114, withInserts.ReferenceEnd());
+
+    withInserts.Clip(ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd, exciseFlankingInserts);
+
+    EXPECT_TRUE(withInserts.IsMapped());
+    EXPECT_EQ(Strand::REVERSE, withInserts.AlignedStrand());
+    EXPECT_EQ("6I10=6I", withInserts.CigarData().ToStdString());  // both 6I flanks retained
+
+    EXPECT_EQ(501, withInserts.QueryStart());
+    EXPECT_EQ(523, withInserts.QueryEnd());
+    EXPECT_EQ(501, withInserts.AlignedStart());
+    EXPECT_EQ(523, withInserts.AlignedEnd());
+    EXPECT_EQ(103, withInserts.ReferenceStart());
+    EXPECT_EQ(113, withInserts.ReferenceEnd());
+
+    // -----------------
+    // excise inserts
+
+    exciseFlankingInserts = true;
+
+    BamRecord withoutInserts = prototype.Mapped(tId, tPos, Strand::REVERSE, cigar, mapQual);
+    EXPECT_TRUE(withoutInserts.IsMapped());
+    EXPECT_EQ(100, withoutInserts.ReferenceStart());
+    EXPECT_EQ(114, withoutInserts.ReferenceEnd());
+
+    withoutInserts.Clip(ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd, exciseFlankingInserts);
+
+    EXPECT_TRUE(withoutInserts.IsMapped());
+    EXPECT_EQ(Strand::REVERSE, withoutInserts.AlignedStrand());
+    EXPECT_EQ("10=", withoutInserts.CigarData().ToStdString());  // both 6I flanks dropped
+
+    EXPECT_EQ(507, withoutInserts.QueryStart());
+    EXPECT_EQ(517, withoutInserts.QueryEnd());
+    EXPECT_EQ(507, withoutInserts.AlignedStart());
+    EXPECT_EQ(517, withoutInserts.AlignedEnd());
+    EXPECT_EQ(103, withoutInserts.ReferenceStart());
+    EXPECT_EQ(113, withoutInserts.ReferenceEnd());
+
+}
+
+// clang-format on
diff --git a/tests/src/test_BamRecordImplCore.cpp b/tests/src/test_BamRecordImplCore.cpp

new file mode 100644 (file)

index 0000000..fcd65cf
--- /dev/null
+++ b/tests/src/test_BamRecordImplCore.cpp
@@ -0,0 +1,597 @@
+// Author: Derek Barnett
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <iostream>
+#include <iterator>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include <pbbam/BamRecordImpl.h>
+#include <pbbam/BamTagCodec.h>
+#include <pbbam/Tag.h>
+#include <pbbam/TagCollection.h>
+#include "../src/MemoryUtils.h"
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+namespace BamRecordImplCoreTests {
+
+struct Bam1Deleter
+{  // deleter functor: releases a bam1_t through bam_destroy1 (used with std::shared_ptr<bam1_t>)
+    void operator()(bam1_t* b) const
+    {
+        if (b) bam_destroy1(b);
+        // 'b' is a by-value copy of the pointer, so resetting it here would have no effect
+    }
+};
+
+static BamRecordImpl CreateBamImpl()
+{  // shared fixture: every core field set to 42, plus the HX / CA / XY tags
+    BamRecordImpl record;
+    record.Bin(42);
+    record.Flag(42);
+    record.InsertSize(42);
+    record.MapQuality(42);
+    record.MatePosition(42);
+    record.MateReferenceId(42);
+    record.Position(42);
+    record.ReferenceId(42);
+
+    TagCollection fixtureTags;
+    fixtureTags["HX"] = std::string("1abc75");
+    fixtureTags["HX"].Modifier(TagModifier::HEX_STRING);
+    fixtureTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    fixtureTags["XY"] = int32_t{-42};
+    record.Tags(fixtureTags);
+
+    return record;
+}
+
+static void CheckRawData(const BamRecordImpl& bam)
+{
+    // cross-check the raw record's length fields against sizes derived from the public API
+    const uint32_t nameBytes = bam.Name().size() + 1;  // +1: NULL terminator
+    const uint32_t namePadding = 4 - (nameBytes % 4);  // 1..4 extra NULs
+    const uint32_t paddedNameLen = nameBytes + namePadding;
+    const uint32_t numCigarOps = bam.CigarData().size();
+    const int32_t seqLen = bam.Sequence().length();
+    const size_t tagsLen = BamTagCodec::Encode(bam.Tags()).size();
+
+    //  Name        CIGAR         Sequence       Quals      Tags
+    // l_qname + (n_cigar * 4) + (l_qseq+1)/2 + l_qseq + << TAGS >>
+    const int totalDataLen =
+        paddedNameLen + (numCigarOps * 4) + (seqLen + 1) / 2 + seqLen + tagsLen;
+    // (expected byte count of the record's variable-length data, per the layout above)
+
+    const auto raw = PacBio::BAM::BamRecordMemory::GetRawData(bam);
+    ASSERT_TRUE(static_cast<bool>(raw));
+
+    EXPECT_EQ(namePadding, raw->core.l_extranul);
+    EXPECT_EQ(paddedNameLen, raw->core.l_qname);
+    EXPECT_EQ(numCigarOps, raw->core.n_cigar);
+    EXPECT_EQ(seqLen, raw->core.l_qseq);
+    EXPECT_EQ(totalDataLen, raw->l_data);
+}
+
+}  // namespace BamRecordImplCoreTests
+
+TEST(BamRecordImplCoreTestsTest, RawDataDefaultValues)
+{  // a record straight from bam_init1() is expected to be zeroed, with no data buffer
+    std::shared_ptr<bam1_t> rawData(bam_init1(), BamRecordImplCoreTests::Bam1Deleter());
+    ASSERT_TRUE(static_cast<bool>(rawData));
+
+    // fixed-length (core) data
+    EXPECT_EQ(0, rawData->core.tid);
+    EXPECT_EQ(0, rawData->core.pos);
+    EXPECT_EQ(0, rawData->core.bin);
+    EXPECT_EQ(0, rawData->core.qual);
+    EXPECT_EQ(0, rawData->core.l_qname);
+    EXPECT_EQ(0, rawData->core.flag);
+    EXPECT_EQ(0, rawData->core.n_cigar);
+    EXPECT_EQ(0, rawData->core.l_qseq);
+    EXPECT_EQ(0, rawData->core.mtid);
+    EXPECT_EQ(0, rawData->core.mpos);
+    EXPECT_EQ(0, rawData->core.isize);
+
+    // variable length data
+    EXPECT_TRUE(rawData->data == nullptr);  // no buffer yet (nullptr, not literal 0)
+    EXPECT_EQ(0, rawData->l_data);          // nothing stored, not even a QNAME
+    EXPECT_EQ(0, rawData->m_data);          // no capacity reserved
+}
+
+TEST(BamRecordImplCoreTestsTest, DefaultValues)
+{  // a default-constructed BamRecordImpl: checked via raw fields first, then via the API
+    BamRecordImpl record;
+
+    // -------------------------------
+    // raw data
+    // -------------------------------
+
+    const auto raw = PacBio::BAM::BamRecordMemory::GetRawData(record);
+    ASSERT_TRUE(static_cast<bool>(raw));
+
+    // fixed-length (core) data
+    // (record starts out unmapped; QNAME is just a NULL terminator)
+    EXPECT_EQ(-1, raw->core.tid);
+    EXPECT_EQ(-1, raw->core.pos);
+    EXPECT_EQ(0, raw->core.bin);
+    EXPECT_EQ(255, raw->core.qual);
+    EXPECT_EQ(3, raw->core.l_extranul);  // alignment nulls
+    EXPECT_EQ(4, raw->core.l_qname);     // normal null term + alignment nulls
+    EXPECT_EQ(BamRecordImpl::UNMAPPED, raw->core.flag);
+    EXPECT_EQ(0, raw->core.n_cigar);
+    EXPECT_EQ(0, raw->core.l_qseq);
+    EXPECT_EQ(-1, raw->core.mtid);
+    EXPECT_EQ(-1, raw->core.mpos);
+    EXPECT_EQ(0, raw->core.isize);
+
+    // variable length data
+    EXPECT_TRUE(raw->data != nullptr);
+    EXPECT_EQ(4, raw->l_data);           // initial aligned QNAME
+    EXPECT_EQ(int{0x800}, raw->m_data);  // check this if we change or tune later
+
+    // -------------------------------
+    // same state, seen through API calls
+    // -------------------------------
+
+    EXPECT_EQ(0, record.Bin());
+    EXPECT_EQ(BamRecordImpl::UNMAPPED, record.Flag());
+    EXPECT_EQ(0, record.InsertSize());
+    EXPECT_EQ(255, record.MapQuality());
+    EXPECT_EQ(-1, record.MateReferenceId());
+    EXPECT_EQ(-1, record.MatePosition());
+    EXPECT_EQ(-1, record.Position());
+    EXPECT_EQ(-1, record.ReferenceId());
+    EXPECT_EQ(0, record.Tags().size());
+
+    EXPECT_FALSE(record.IsDuplicate());
+    EXPECT_FALSE(record.IsFailedQC());
+    EXPECT_FALSE(record.IsFirstMate());
+    EXPECT_FALSE(record.IsMapped());
+    EXPECT_TRUE(record.IsMateMapped());
+    EXPECT_FALSE(record.IsMateReverseStrand());
+    EXPECT_FALSE(record.IsPaired());
+    EXPECT_TRUE(record.IsPrimaryAlignment());
+    EXPECT_FALSE(record.IsProperPair());
+    EXPECT_FALSE(record.IsReverseStrand());
+    EXPECT_FALSE(record.IsSecondMate());
+    EXPECT_FALSE(record.IsSupplementaryAlignment());
+
+    const std::string empty = "";
+    EXPECT_EQ(empty, record.Name());
+    EXPECT_EQ(empty, record.CigarData().ToStdString());
+    EXPECT_EQ(empty, record.Sequence());
+    EXPECT_EQ(empty, record.Qualities().Fastq());
+    BamRecordImplCoreTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplCoreTestsTest, CoreSetters)
+{  // values written through the setters must appear in the raw record and read back via the API
+    BamRecordImpl record;
+    record.Bin(42);
+    record.Flag(42);
+    record.InsertSize(42);
+    record.MapQuality(42);
+    record.MatePosition(42);
+    record.MateReferenceId(42);
+    record.Position(42);
+    record.ReferenceId(42);
+
+    TagCollection inputTags;
+    inputTags["HX"] = std::string("1abc75");
+    inputTags["HX"].Modifier(TagModifier::HEX_STRING);
+    inputTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    inputTags["XY"] = int32_t{-42};
+    record.Tags(inputTags);  // (28 bytes encoded)
+
+    // -------------------------------
+    // raw data
+    // -------------------------------
+
+    const auto raw = PacBio::BAM::BamRecordMemory::GetRawData(record);
+    ASSERT_TRUE(static_cast<bool>(raw));
+
+    // fixed-length (core) data
+    EXPECT_EQ(42, raw->core.tid);
+    EXPECT_EQ(42, raw->core.pos);
+    EXPECT_EQ(42, raw->core.bin);
+    EXPECT_EQ(42, raw->core.qual);
+    EXPECT_EQ(3, raw->core.l_extranul);  // alignment nulls
+    EXPECT_EQ(4, raw->core.l_qname);     // normal null term + alignment nulls
+    EXPECT_EQ(42, raw->core.flag);
+    EXPECT_EQ(0, raw->core.n_cigar);
+    EXPECT_EQ(0, raw->core.l_qseq);
+    EXPECT_EQ(42, raw->core.mtid);
+    EXPECT_EQ(42, raw->core.mpos);
+    EXPECT_EQ(42, raw->core.isize);
+
+    // variable length data
+    EXPECT_TRUE(raw->data != nullptr);
+    EXPECT_EQ(32, raw->l_data);          // aligned qname (4) + tags (28)
+    EXPECT_EQ(int{0x800}, raw->m_data);  // check this if we change or tune later
+
+    // -------------------------------
+    // same state, seen through API calls
+    // -------------------------------
+
+    EXPECT_EQ(42, record.Bin());
+    EXPECT_EQ(42, record.Flag());
+    EXPECT_EQ(42, record.InsertSize());
+    EXPECT_EQ(42, record.MapQuality());
+    EXPECT_EQ(42, record.MateReferenceId());
+    EXPECT_EQ(42, record.MatePosition());
+    EXPECT_EQ(42, record.Position());
+    EXPECT_EQ(42, record.ReferenceId());
+
+    const TagCollection readBack = record.Tags();
+
+    EXPECT_TRUE(readBack.at("HX").HasModifier(TagModifier::HEX_STRING));
+    EXPECT_EQ(std::string("1abc75"), readBack.at("HX").ToString());
+    EXPECT_EQ(int32_t{-42}, readBack.at("XY").ToInt32());
+    EXPECT_EQ(std::vector<uint8_t>({34, 5, 125}), readBack.at("CA").ToUInt8Array());
+}
+
+TEST(BamRecordImplCoreTestsTest, DeepCopyFromRawData)
+{  // bam_copy1 from a hand-built htslib record must leave the BamRecordImpl with its own copy
+    // init raw data
+    std::shared_ptr<bam1_t> rawData(bam_init1(), BamRecordImplCoreTests::Bam1Deleter());
+    ASSERT_TRUE(static_cast<bool>(rawData));
+
+    rawData->core.tid = 42;
+    rawData->core.pos = 42;
+    rawData->core.bin = 42;
+    rawData->core.qual = 42;
+    rawData->core.flag = 42;
+    rawData->core.mtid = 42;
+    rawData->core.mpos = 42;
+    rawData->core.isize = 42;
+
+    const int32_t x = 42;  // value stored under the "XY" aux tag below
+    char valueBytes[sizeof x];  // byte-wise copy of x, handed to bam_aux_append
+    std::copy(static_cast<const char*>(static_cast<const void*>(&x)),
+              static_cast<const char*>(static_cast<const void*>(&x)) + sizeof x, valueBytes);
+    bam_aux_append(rawData.get(), "XY", 'i', sizeof(x), reinterpret_cast<uint8_t*>(&valueBytes[0]));
+
+    EXPECT_EQ(42, rawData->core.tid);
+    EXPECT_EQ(42, rawData->core.pos);
+    EXPECT_EQ(42, rawData->core.bin);
+    EXPECT_EQ(42, rawData->core.qual);
+    EXPECT_EQ(0, rawData->core.l_qname);  // no name was set on the raw record
+    EXPECT_EQ(42, rawData->core.flag);
+    EXPECT_EQ(0, rawData->core.n_cigar);
+    EXPECT_EQ(0, rawData->core.l_qseq);
+    EXPECT_EQ(42, rawData->core.mtid);
+    EXPECT_EQ(42, rawData->core.mpos);
+    EXPECT_EQ(42, rawData->core.isize);
+    const int32_t fetchedX = bam_aux2i(bam_aux_get(rawData.get(), "XY"));
+    EXPECT_EQ(42, fetchedX);
+
+    // create from raw data: bam_copy1 into a fresh BamRecordImpl's underlying record
+    BamRecordImpl bam = [&rawData]() {
+        BamRecordImpl result;
+        bam_copy1(PacBio::BAM::BamRecordMemory::GetRawData(result).get(), rawData.get());
+        return result;
+    }();
+
+    // make sure raw data is still valid (the copy must not have disturbed its source)
+    EXPECT_EQ(42, rawData->core.tid);
+    EXPECT_EQ(42, rawData->core.pos);
+    EXPECT_EQ(42, rawData->core.bin);
+    EXPECT_EQ(42, rawData->core.qual);
+    EXPECT_EQ(0, rawData->core.l_qname);
+    EXPECT_EQ(42, rawData->core.flag);
+    EXPECT_EQ(0, rawData->core.n_cigar);
+    EXPECT_EQ(0, rawData->core.l_qseq);
+    EXPECT_EQ(42, rawData->core.mtid);
+    EXPECT_EQ(42, rawData->core.mpos);
+    EXPECT_EQ(42, rawData->core.isize);
+    EXPECT_TRUE(rawData->data != nullptr);
+    EXPECT_TRUE(0 != rawData->l_data);
+    EXPECT_TRUE(0 != rawData->m_data);
+
+    // check new record: the copied values must be visible through the BamRecordImpl API
+    EXPECT_EQ(42, bam.Bin());
+    EXPECT_EQ(42, bam.Flag());
+    EXPECT_EQ(42, bam.InsertSize());
+    EXPECT_EQ(42, bam.MapQuality());
+    EXPECT_EQ(42, bam.MateReferenceId());
+    EXPECT_EQ(42, bam.MatePosition());
+    EXPECT_EQ(42, bam.Position());
+    EXPECT_EQ(42, bam.ReferenceId());
+    EXPECT_EQ(x, bam.Tags()["XY"].ToInt32());
+
+    const auto newBamRawData = PacBio::BAM::BamRecordMemory::GetRawData(bam);
+    ASSERT_TRUE(static_cast<bool>(newBamRawData));
+
+    EXPECT_TRUE(newBamRawData->data != nullptr);
+    EXPECT_TRUE(newBamRawData->m_data >= int{0x800});  // check this if we change or tune later
+
+    // tweak raw data, make sure we've done a deep copy (so BamRecordImpl isn't changed)
+    rawData->core.pos = 37;
+    EXPECT_EQ(37, rawData->core.pos);
+    EXPECT_EQ(42, bam.Position());
+    EXPECT_EQ(42, newBamRawData->core.pos);
+}
+
+TEST(BamRecordImplCoreTestsTest, CopyAssignment)
+{  // after copy-assignment both records must report the same core fields and tags
+    BamRecordImpl source;
+    source.Bin(42);
+    source.Flag(42);
+    source.InsertSize(42);
+    source.MapQuality(42);
+    source.MatePosition(42);
+    source.MateReferenceId(42);
+    source.Position(42);
+    source.ReferenceId(42);
+
+    TagCollection inputTags;
+    inputTags["HX"] = std::string("1abc75");
+    inputTags["HX"].Modifier(TagModifier::HEX_STRING);
+    inputTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    inputTags["XY"] = int32_t{-42};
+    source.Tags(inputTags);
+
+    BamRecordImpl target;
+    target = source;
+
+    EXPECT_EQ(42, source.Bin());
+    EXPECT_EQ(42, source.Flag());
+    EXPECT_EQ(42, source.InsertSize());
+    EXPECT_EQ(42, source.MapQuality());
+    EXPECT_EQ(42, source.MateReferenceId());
+    EXPECT_EQ(42, source.MatePosition());
+    EXPECT_EQ(42, source.Position());
+    EXPECT_EQ(42, source.ReferenceId());
+
+    const TagCollection sourceTags = source.Tags();
+    EXPECT_TRUE(sourceTags.at("HX").HasModifier(TagModifier::HEX_STRING));
+    EXPECT_EQ(std::string("1abc75"), sourceTags.at("HX").ToString());
+    EXPECT_EQ(int32_t{-42}, sourceTags.at("XY").ToInt32());
+    EXPECT_EQ(std::vector<uint8_t>({34, 5, 125}), sourceTags.at("CA").ToUInt8Array());
+
+    EXPECT_EQ(42, target.Bin());
+    EXPECT_EQ(42, target.Flag());
+    EXPECT_EQ(42, target.InsertSize());
+    EXPECT_EQ(42, target.MapQuality());
+    EXPECT_EQ(42, target.MateReferenceId());
+    EXPECT_EQ(42, target.MatePosition());
+    EXPECT_EQ(42, target.Position());
+    EXPECT_EQ(42, target.ReferenceId());
+
+    const TagCollection targetTags = target.Tags();
+    EXPECT_TRUE(targetTags.at("HX").HasModifier(TagModifier::HEX_STRING));
+    EXPECT_EQ(std::string("1abc75"), targetTags.at("HX").ToString());
+    EXPECT_EQ(int32_t{-42}, targetTags.at("XY").ToInt32());
+    EXPECT_EQ(std::vector<uint8_t>({34, 5, 125}), targetTags.at("CA").ToUInt8Array());
+
+    BamRecordImplCoreTests::CheckRawData(source);
+    BamRecordImplCoreTests::CheckRawData(target);
+}
+
+TEST(BamRecordImplCoreTestsTest, SelfAssignmentTolerated)
+{  // assigning a record to itself must leave its core fields, tags and raw data intact
+    BamRecordImpl record;
+    record.Bin(42);
+    record.Flag(42);
+    record.InsertSize(42);
+    record.MapQuality(42);
+    record.MatePosition(42);
+    record.MateReferenceId(42);
+    record.Position(42);
+    record.ReferenceId(42);
+
+    TagCollection inputTags;
+    inputTags["HX"] = std::string("1abc75");
+    inputTags["HX"].Modifier(TagModifier::HEX_STRING);
+    inputTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    inputTags["XY"] = int32_t{-42};
+    record.Tags(inputTags);
+
+    record = record;  // deliberate self-assignment: the operation under test
+
+    EXPECT_EQ(42, record.Bin());
+    EXPECT_EQ(42, record.Flag());
+    EXPECT_EQ(42, record.InsertSize());
+    EXPECT_EQ(42, record.MapQuality());
+    EXPECT_EQ(42, record.MateReferenceId());
+    EXPECT_EQ(42, record.MatePosition());
+    EXPECT_EQ(42, record.Position());
+    EXPECT_EQ(42, record.ReferenceId());
+
+    const TagCollection readBack = record.Tags();
+    EXPECT_TRUE(readBack.at("HX").HasModifier(TagModifier::HEX_STRING));
+    EXPECT_EQ(std::string("1abc75"), readBack.at("HX").ToString());
+    EXPECT_EQ(int32_t{-42}, readBack.at("XY").ToInt32());
+    EXPECT_EQ(std::vector<uint8_t>({34, 5, 125}), readBack.at("CA").ToUInt8Array());
+
+    BamRecordImplCoreTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplCoreTestsTest, CopyConstructor)
+{  // a copy-constructed record must match its source, and the source must be unchanged
+    BamRecordImpl source;
+    source.Bin(42);
+    source.Flag(42);
+    source.InsertSize(42);
+    source.MapQuality(42);
+    source.MatePosition(42);
+    source.MateReferenceId(42);
+    source.Position(42);
+    source.ReferenceId(42);
+
+    TagCollection inputTags;
+    inputTags["HX"] = std::string("1abc75");
+    inputTags["HX"].Modifier(TagModifier::HEX_STRING);
+    inputTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    inputTags["XY"] = int32_t{-42};
+    source.Tags(inputTags);
+
+    BamRecordImpl clone(source);
+
+    EXPECT_EQ(42, source.Bin());
+    EXPECT_EQ(42, source.Flag());
+    EXPECT_EQ(42, source.InsertSize());
+    EXPECT_EQ(42, source.MapQuality());
+    EXPECT_EQ(42, source.MateReferenceId());
+    EXPECT_EQ(42, source.MatePosition());
+    EXPECT_EQ(42, source.Position());
+    EXPECT_EQ(42, source.ReferenceId());
+
+    const TagCollection sourceTags = source.Tags();
+    EXPECT_TRUE(sourceTags.at("HX").HasModifier(TagModifier::HEX_STRING));
+    EXPECT_EQ(std::string("1abc75"), sourceTags.at("HX").ToString());
+    EXPECT_EQ(int32_t{-42}, sourceTags.at("XY").ToInt32());
+    EXPECT_EQ(std::vector<uint8_t>({34, 5, 125}), sourceTags.at("CA").ToUInt8Array());
+
+    EXPECT_EQ(42, clone.Bin());
+    EXPECT_EQ(42, clone.Flag());
+    EXPECT_EQ(42, clone.InsertSize());
+    EXPECT_EQ(42, clone.MapQuality());
+    EXPECT_EQ(42, clone.MateReferenceId());
+    EXPECT_EQ(42, clone.MatePosition());
+    EXPECT_EQ(42, clone.Position());
+    EXPECT_EQ(42, clone.ReferenceId());
+
+    const TagCollection cloneTags = clone.Tags();
+    EXPECT_TRUE(cloneTags.at("HX").HasModifier(TagModifier::HEX_STRING));
+    EXPECT_EQ(std::string("1abc75"), cloneTags.at("HX").ToString());
+    EXPECT_EQ(int32_t{-42}, cloneTags.at("XY").ToInt32());
+    EXPECT_EQ(std::vector<uint8_t>({34, 5, 125}), cloneTags.at("CA").ToUInt8Array());
+
+    BamRecordImplCoreTests::CheckRawData(source);
+    BamRecordImplCoreTests::CheckRawData(clone);
+}
+
+TEST(BamRecordImplCoreTestsTest, CreateRecord_InternalTest)
+{  // sanity-checks the CreateBamImpl() fixture that the move tests rely on
+    BamRecordImpl record = BamRecordImplCoreTests::CreateBamImpl();
+
+    EXPECT_EQ(42, record.Bin());
+    EXPECT_EQ(42, record.Flag());
+    EXPECT_EQ(42, record.InsertSize());
+    EXPECT_EQ(42, record.MapQuality());
+    EXPECT_EQ(42, record.MateReferenceId());
+    EXPECT_EQ(42, record.MatePosition());
+    EXPECT_EQ(42, record.Position());
+    EXPECT_EQ(42, record.ReferenceId());
+
+    TagCollection sameTags;  // identical to what the fixture already stored
+    sameTags["HX"] = std::string("1abc75");
+    sameTags["HX"].Modifier(TagModifier::HEX_STRING);
+    sameTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    sameTags["XY"] = int32_t{-42};
+    record.Tags(sameTags);
+
+    BamRecordImplCoreTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplCoreTestsTest, MoveAssignment)
+{  // move-assigning the fixture into an existing record must carry over fields and tags
+    BamRecordImpl record;
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wpessimizing-move"
+#endif
+    record = std::move(BamRecordImplCoreTests::CreateBamImpl());  // explicit move is intentional
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+
+    EXPECT_EQ(42, record.Bin());
+    EXPECT_EQ(42, record.Flag());
+    EXPECT_EQ(42, record.InsertSize());
+    EXPECT_EQ(42, record.MapQuality());
+    EXPECT_EQ(42, record.MateReferenceId());
+    EXPECT_EQ(42, record.MatePosition());
+    EXPECT_EQ(42, record.Position());
+    EXPECT_EQ(42, record.ReferenceId());
+
+    const TagCollection readBack = record.Tags();
+    EXPECT_TRUE(readBack.at("HX").HasModifier(TagModifier::HEX_STRING));
+    EXPECT_EQ(std::string("1abc75"), readBack.at("HX").ToString());
+    EXPECT_EQ(int32_t{-42}, readBack.at("XY").ToInt32());
+    EXPECT_EQ(std::vector<uint8_t>({34, 5, 125}), readBack.at("CA").ToUInt8Array());
+
+    BamRecordImplCoreTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplCoreTestsTest, MoveConstructor)
+{  // move-constructing from the fixture must carry over fields and tags
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wpessimizing-move"
+#endif
+    BamRecordImpl record(std::move(BamRecordImplCoreTests::CreateBamImpl()));  // move on purpose
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+
+    EXPECT_EQ(42, record.Bin());
+    EXPECT_EQ(42, record.Flag());
+    EXPECT_EQ(42, record.InsertSize());
+    EXPECT_EQ(42, record.MapQuality());
+    EXPECT_EQ(42, record.MateReferenceId());
+    EXPECT_EQ(42, record.MatePosition());
+    EXPECT_EQ(42, record.Position());
+    EXPECT_EQ(42, record.ReferenceId());
+
+    const TagCollection readBack = record.Tags();
+    EXPECT_TRUE(readBack.at("HX").HasModifier(TagModifier::HEX_STRING));
+    EXPECT_EQ(std::string("1abc75"), readBack.at("HX").ToString());
+    EXPECT_EQ(int32_t{-42}, readBack.at("XY").ToInt32());
+    EXPECT_EQ(std::vector<uint8_t>({34, 5, 125}), readBack.at("CA").ToUInt8Array());
+
+    BamRecordImplCoreTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplCoreTestsTest, AlignmentFlags)
+{  // the flag word 1107 built three different ways must read back identically
+    // one target flag set, three construction routes
+
+    // route 1: literal number
+    BamRecordImpl byNumber;
+    byNumber.Flag(1107);
+
+    // route 2: OR-ed enum values
+    BamRecordImpl byEnum;
+    byEnum.Flag(BamRecordImpl::DUPLICATE | BamRecordImpl::MATE_1 | BamRecordImpl::REVERSE_STRAND |
+                BamRecordImpl::PROPER_PAIR | BamRecordImpl::PAIRED);
+
+    // route 3: individual convenience setters
+    BamRecordImpl bySetter;
+    bySetter.SetDuplicate(true);
+    bySetter.SetFirstMate(true);
+    bySetter.SetReverseStrand(true);
+    bySetter.SetMapped(true);
+    bySetter.SetMateMapped(true);
+    bySetter.SetPaired(true);
+    bySetter.SetProperPair(true);
+    bySetter.SetPrimaryAlignment(true);
+
+    // all three must agree
+    EXPECT_EQ(1107, byNumber.Flag());
+    EXPECT_EQ(1107, byEnum.Flag());
+    EXPECT_EQ(1107, bySetter.Flag());
+
+    // per-bit queries (on the number-built record)
+    EXPECT_TRUE(byNumber.IsPaired());
+    EXPECT_TRUE(byNumber.IsProperPair());
+    EXPECT_TRUE(byNumber.IsMapped());
+    EXPECT_TRUE(byNumber.IsMateMapped());
+    EXPECT_TRUE(byNumber.IsReverseStrand());
+    EXPECT_FALSE(byNumber.IsMateReverseStrand());
+    EXPECT_TRUE(byNumber.IsFirstMate());
+    EXPECT_FALSE(byNumber.IsSecondMate());
+    EXPECT_TRUE(byNumber.IsPrimaryAlignment());
+    EXPECT_FALSE(byNumber.IsFailedQC());
+    EXPECT_TRUE(byNumber.IsDuplicate());
+    EXPECT_FALSE(byNumber.IsSupplementaryAlignment());
+}
diff --git a/tests/src/test_BamRecordImplTags.cpp b/tests/src/test_BamRecordImplTags.cpp

new file mode 100644 (file)

index 0000000..2ab169a
--- /dev/null
+++ b/tests/src/test_BamRecordImplTags.cpp
@@ -0,0 +1,179 @@
+// Author: Derek Barnett
+
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include <pbbam/BamRecordImpl.h>
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+// NOTE: these tests check "high-level" tag query/manipulation via BamRecordImpl.
+//       For raw Tag/TagCollection tests, see test_Tags.cpp
+//       For encoding tests, see test_BamRecordImplVariableData.cpp
+
+TEST(BamRecordImplTagsTest, HasTagTest)
+{
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.Tags(tags);
+
+    EXPECT_TRUE(bam.HasTag("HX"));
+    EXPECT_TRUE(bam.HasTag("CA"));
+    EXPECT_TRUE(bam.HasTag("XY"));
+
+    EXPECT_FALSE(bam.HasTag("zz"));
+    EXPECT_FALSE(bam.HasTag(""));
+    EXPECT_FALSE(bam.HasTag("some_too_long_name"));
+
+    const TagCollection fetchedTags = bam.Tags();
+    EXPECT_TRUE(fetchedTags.Contains("HX"));
+    EXPECT_TRUE(fetchedTags.Contains("CA"));
+    EXPECT_TRUE(fetchedTags.Contains("XY"));
+    EXPECT_FALSE(fetchedTags.Contains("zz"));
+    EXPECT_FALSE(fetchedTags.Contains(""));
+    EXPECT_FALSE(fetchedTags.Contains("some_too_long_name"));
+}
+
+TEST(BamRecordImplTagsTest, SimpleAddTag)
+{
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+
+    BamRecordImpl bam;
+    bam.Tags(tags);
+
+    EXPECT_TRUE(bam.HasTag("HX"));
+    EXPECT_TRUE(bam.HasTag("CA"));
+    EXPECT_FALSE(bam.HasTag("XY"));
+
+    bam.AddTag("XY", int32_t{-42});
+
+    EXPECT_TRUE(bam.HasTag("HX"));
+    EXPECT_TRUE(bam.HasTag("CA"));
+    EXPECT_TRUE(bam.HasTag("XY"));
+
+    const TagCollection fetchedTags = bam.Tags();
+    EXPECT_TRUE(fetchedTags.Contains("HX"));
+    EXPECT_TRUE(fetchedTags.Contains("CA"));
+    EXPECT_TRUE(fetchedTags.Contains("XY"));
+    EXPECT_FALSE(fetchedTags.Contains("zz"));
+    EXPECT_FALSE(fetchedTags.Contains(""));
+    EXPECT_FALSE(fetchedTags.Contains("some_too_long_name"));
+
+    EXPECT_EQ(-42, fetchedTags.at("XY").ToInt32());
+
+    // fail on invalid adds
+    EXPECT_FALSE(bam.AddTag("", int32_t{-42}));
+    EXPECT_FALSE(bam.AddTag("some_too_long_name", int32_t{-42}));
+    EXPECT_FALSE(bam.AddTag("XY", int32_t{-42}));  // reject duplicate
+}
+
+TEST(BamRecordImplTagsTest, SimpleRemoveTag)
+{
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.Tags(tags);
+
+    EXPECT_TRUE(bam.HasTag("HX"));
+    EXPECT_TRUE(bam.HasTag("CA"));
+    EXPECT_TRUE(bam.HasTag("XY"));
+
+    const bool removedOk = bam.RemoveTag("XY");
+    EXPECT_TRUE(removedOk);
+
+    EXPECT_TRUE(bam.HasTag("HX"));
+    EXPECT_TRUE(bam.HasTag("CA"));
+    EXPECT_FALSE(bam.HasTag("XY"));
+
+    const TagCollection fetchedTags = bam.Tags();
+    EXPECT_TRUE(fetchedTags.Contains("HX"));
+    EXPECT_TRUE(fetchedTags.Contains("CA"));
+    EXPECT_FALSE(fetchedTags.Contains("XY"));
+    EXPECT_FALSE(fetchedTags.Contains("zz"));
+    EXPECT_FALSE(fetchedTags.Contains(""));
+    EXPECT_FALSE(fetchedTags.Contains("some_too_long_name"));
+
+    // fail on invalid removes
+    EXPECT_FALSE(bam.RemoveTag(""));
+    EXPECT_FALSE(bam.RemoveTag("some_too_long_name"));
+    EXPECT_FALSE(bam.RemoveTag("zz"));  // reject remove unknown
+}
+
+TEST(BamRecordImplTagsTest, SimpleEditTag)
+{
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.Tags(tags);
+
+    EXPECT_TRUE(bam.HasTag("XY"));
+
+    const TagCollection fetchedTags = bam.Tags();
+    EXPECT_TRUE(fetchedTags.Contains("HX"));
+    EXPECT_TRUE(fetchedTags.Contains("CA"));
+    EXPECT_TRUE(fetchedTags.Contains("XY"));
+    EXPECT_EQ(-42, fetchedTags.at("XY").ToInt32());
+
+    const bool editedOk = bam.EditTag("XY", int32_t{500});
+    EXPECT_TRUE(editedOk);
+    EXPECT_TRUE(bam.HasTag("XY"));
+
+    const TagCollection fetchedTags2 = bam.Tags();
+    EXPECT_TRUE(fetchedTags2.Contains("HX"));
+    EXPECT_TRUE(fetchedTags2.Contains("CA"));
+    EXPECT_TRUE(fetchedTags2.Contains("XY"));
+    EXPECT_EQ(500, fetchedTags2.at("XY").ToInt32());
+
+    // fail on invalid edits
+    EXPECT_FALSE(bam.EditTag("", 500));
+    EXPECT_FALSE(bam.EditTag("some_too_long_name", 500));
+    EXPECT_FALSE(bam.EditTag("zz", 500));  // reject edit unknown
+}
+
+TEST(BamRecordImplTagsTest, SimpleQueryTag)
+{
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.Tags(tags);
+
+    EXPECT_TRUE(bam.HasTag("XY"));
+    EXPECT_TRUE(bam.HasTag("CA"));
+    EXPECT_TRUE(bam.HasTag("XY"));
+
+    EXPECT_EQ(std::string("1abc75"), bam.TagValue("HX").ToString());
+    EXPECT_EQ(std::vector<uint8_t>({34, 5, 125}), bam.TagValue("CA").ToUInt8Array());
+    EXPECT_EQ(int32_t{-42}, bam.TagValue("XY").ToInt32());
+
+    EXPECT_FALSE(bam.HasTag("zz"));
+    EXPECT_FALSE(bam.HasTag(""));
+    EXPECT_FALSE(bam.HasTag("some_too_long_name"));
+
+    EXPECT_EQ(Tag(), bam.TagValue("zz"));
+    EXPECT_EQ(Tag(), bam.TagValue(""));
+    EXPECT_EQ(Tag(), bam.TagValue("some_too_long_name"));
+}
diff --git a/tests/src/test_BamRecordImplVariableData.cpp b/tests/src/test_BamRecordImplVariableData.cpp

new file mode 100644 (file)

index 0000000..4fdfad1
--- /dev/null
+++ b/tests/src/test_BamRecordImplVariableData.cpp
@@ -0,0 +1,4526 @@
+// Author: Derek Barnett
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <iostream>
+#include <iterator>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include <pbbam/BamRecordImpl.h>
+#include <pbbam/BamTagCodec.h>
+#include <pbbam/SamTagCodec.h>
+#include <pbbam/Tag.h>
+#include <pbbam/TagCollection.h>
+#include "../src/MemoryUtils.h"
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+// NOTE: this file has a *TON* of tests. Probably overkill, but I wanted to check
+//       every possible combination of variable data, and then manipulate each
+//       element within each combo to shrink & expand.
+
+namespace BamRecordImplVariableDataTests {
+// Shared check: raw-record length fields must match lengths derived via the public accessors.
+static void CheckRawData(const BamRecordImpl& bam)
+{
+    // ensure raw data (lengths at least) matches API-facing data
+    const uint32_t expectedNameBytes = bam.Name().size() + 1;  // include NULL term
+    const uint32_t expectedNameNulls = 4 - (expectedNameBytes % 4);  // yields 1..4, never 0
+    const uint32_t expectedNameLength = expectedNameBytes + expectedNameNulls;
+    const uint32_t expectedNumCigarOps = bam.CigarData().size();
+    const int32_t expectedSeqLength = bam.Sequence().length();
+    const size_t expectedTagsLength = BamTagCodec::Encode(bam.Tags()).size();
+
+    //  Name        CIGAR         Sequence       Quals      Tags
+    // l_qname + (n_cigar * 4) + (l_qseq+1)/2 + l_qseq + <encoded length>
+    const int expectedTotalDataLength = expectedNameLength + (expectedNumCigarOps * 4) +
+                                        (expectedSeqLength + 1) / 2 + expectedSeqLength +
+                                        expectedTagsLength;
+    // stop here if no raw record is available; the checks below dereference it
+    const auto rawData = PacBio::BAM::BamRecordMemory::GetRawData(bam);
+    ASSERT_TRUE(static_cast<bool>(rawData));
+    // compare each derived length with the corresponding raw field
+    EXPECT_EQ(expectedNameNulls, rawData->core.l_extranul);
+    EXPECT_EQ(expectedNameLength, rawData->core.l_qname);
+    EXPECT_EQ(expectedNumCigarOps, rawData->core.n_cigar);
+    EXPECT_EQ(expectedSeqLength, rawData->core.l_qseq);
+    EXPECT_EQ(expectedTotalDataLength, rawData->l_data);
+}
+
+}  // namespace BamRecordImplVariableDataTests
+
+TEST(BamRecordImplVariableDataTest, InitEmpty)
+{
+    BamRecordImpl untouched;  // default-constructed; no variable-length data assigned
+    BamRecordImplVariableDataTests::CheckRawData(untouched);
+}
+
+TEST(BamRecordImplVariableDataTest, TagOnly_InitEmpty)
+{
+    BamRecordImpl record;
+    record.Tags(TagCollection{});  // explicitly assign an empty tag set
+    EXPECT_EQ(0, record.Tags().size());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, TagOnly_InitNormal)
+{
+    TagCollection input;
+    input["XY"] = int32_t{-42};
+    input["CA"] = std::vector<uint8_t>({34, 5, 125});
+    input["HX"] = std::string("1abc75");
+    input["HX"].Modifier(TagModifier::HEX_STRING);
+
+    BamRecordImpl record;
+    record.Tags(input);
+
+    // Expected SAM text: the three tags tab-separated,
+    // CA first, then HX, then XY.
+    const std::string expectedSam =
+        "CA:B:C,34,5,125\t"
+        "HX:H:1abc75\t"
+        "XY:i:-42";
+
+    const std::string encodedTags = SamTagCodec::Encode(record.Tags());
+    EXPECT_EQ(expectedSam, encodedTags);
+}
+
+TEST(BamRecordImplVariableDataTest, TagOnly_ThenOverwriteWithLongerTags)
+{
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    TagCollection longerTags;
+    longerTags["HX"] = std::string("1abc75");
+    longerTags["HX"].Modifier(TagModifier::HEX_STRING);
+    longerTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    longerTags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.Tags(tags);
+    bam.Tags(longerTags);
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+}
+
+TEST(BamRecordImplVariableDataTest, TagOnly_ThenOverwriteWithShorterTags)
+{
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+
+    TagCollection longerTags;
+    longerTags["HX"] = std::string("1abc75");
+    longerTags["HX"].Modifier(TagModifier::HEX_STRING);
+    longerTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    longerTags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.Tags(longerTags);
+    bam.Tags(tags);
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+}
+
+TEST(BamRecordImplVariableDataTest, TagOnly_ThenOverwriteWithEmptyTags)
+{
+    TagCollection input;
+    input["XY"] = int32_t{-42};
+    input["CA"] = std::vector<uint8_t>({34, 5, 125});
+    input["HX"] = std::string("1abc75");
+    input["HX"].Modifier(TagModifier::HEX_STRING);
+
+    BamRecordImpl record;
+    record.Tags(input);
+    record.Tags(TagCollection{});  // replace everything with an empty set
+
+    EXPECT_EQ(0, record.Tags().size());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, CigarOnly_InitEmpty)
+{
+    BamRecordImpl record;
+    record.CigarData(std::string{});  // empty CIGAR, supplied as text
+    EXPECT_EQ(0, record.CigarData().size());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, CigarOnly_InitNormal_CigarObject)
+{
+    Cigar singleMatch;
+    singleMatch.push_back(CigarOperation('=', 100));
+
+    BamRecordImpl record;
+    record.CigarData(singleMatch);
+    // the stored CIGAR must equal the input object and print as "100="
+    EXPECT_EQ(singleMatch, record.CigarData());
+    EXPECT_TRUE(record.CigarData().ToStdString() == "100=");
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, CigarOnly_InitNormal_StdString)
+{
+    const std::string cigarText{"100="};
+
+    BamRecordImpl record;
+    record.CigarData(cigarText);
+    // the textual CIGAR must round-trip unchanged
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, CigarOnly_ThenOverwriteWithLongerCigar)
+{
+    const std::string oneOp{"100="};
+    const std::string fiveOps{"100=10D100=10I100X"};
+
+    BamRecordImpl record;
+    record.CigarData(oneOp);
+    record.CigarData(fiveOps);
+    // only the second, longer CIGAR must remain
+    EXPECT_EQ(fiveOps, record.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, CigarOnly_ThenOverwriteWithShorterCigar)
+{
+    const std::string oneOp{"100="};
+    const std::string fiveOps{"100=10D100=10I100X"};
+
+    BamRecordImpl record;
+    record.CigarData(fiveOps);
+    record.CigarData(oneOp);
+    // only the second, shorter CIGAR must remain
+    EXPECT_EQ(oneOp, record.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, CigarOnly_ThenOverwriteWithEmptyCigar)
+{
+    const std::string oneOp{"100="};
+    const std::string noCigar{};
+
+    BamRecordImpl record;
+    record.CigarData(oneOp);
+    record.CigarData(noCigar);
+    // the CIGAR must read back empty after the overwrite
+    EXPECT_EQ(noCigar, record.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, CigarTag_Init_Normal)
+{
+    const std::string cigar = "100=";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, CigarTag_Init_EmptyCigar)
+{
+    const std::string cigar = "100=";
+    const std::string empty = "";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.CigarData(empty);
+
+    EXPECT_EQ(empty, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, CigarTag_Init_EmptyTag)
+{
+    const std::string cigarText{"100="};
+
+    BamRecordImpl record;
+    record.CigarData(cigarText);
+    record.Tags(TagCollection{});
+    // CIGAR intact, and no tags present
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+    EXPECT_EQ(0, record.Tags().size());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, CigarTag_ThenOverwriteWithLongerCigar)
+{
+    const std::string cigar = "100=";
+    const std::string longerCigar = "100=10D100=10I100X";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.CigarData(longerCigar);
+
+    EXPECT_EQ(longerCigar, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, CigarTag_ThenOverwriteWithShorterCigar)
+{
+    const std::string cigar = "100=";
+    const std::string longerCigar = "100=10D100=10I100X";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.CigarData(longerCigar);
+    bam.Tags(tags);
+    bam.CigarData(cigar);
+
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, CigarTag_ThenOverwriteWithEmptyCigar)
+{
+    const std::string cigar = "100=";
+    const std::string empty = "";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.CigarData(empty);
+
+    EXPECT_EQ(empty, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, CigarTag_ThenOverwriteWithLongerTags)
+{
+    const std::string cigar = "100=";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+
+    TagCollection longerTags;
+    longerTags["HX"] = std::string("1abc75");
+    longerTags["HX"].Modifier(TagModifier::HEX_STRING);
+    longerTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    longerTags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.Tags(longerTags);
+
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, CigarTag_ThenOverwriteWithShorterTags)
+{
+    const std::string cigar = "100=";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+
+    TagCollection longerTags;
+    longerTags["HX"] = std::string("1abc75");
+    longerTags["HX"].Modifier(TagModifier::HEX_STRING);
+    longerTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    longerTags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.CigarData(cigar);
+    bam.Tags(longerTags);
+    bam.Tags(tags);
+
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, CigarTag_ThenOverwriteWithEmptyTags)
+{
+    const std::string cigarText{"100="};
+
+    TagCollection input;
+    input["CA"] = std::vector<uint8_t>({34, 5, 125});
+    input["HX"] = std::string("1abc75");
+    input["HX"].Modifier(TagModifier::HEX_STRING);
+
+    BamRecordImpl record;
+    record.CigarData(cigarText);
+    record.Tags(input);
+    record.Tags(TagCollection{});
+    // CIGAR intact, tags gone
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+    EXPECT_EQ(0, record.Tags().size());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualOnly_Init_Empty)
+{
+    BamRecordImpl record;
+    record.SetSequenceAndQualities(std::string{}, std::string{});  // both empty
+    EXPECT_EQ(0, record.Sequence().size());
+    EXPECT_EQ(0, record.Qualities().Fastq().size());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualOnly_Init_NormalQual)
+{
+    const std::string bases{"ACGTACGTACGT"};
+    const std::string quals{"?]?]?]?]?]?]"};
+
+    BamRecordImpl record;
+    record.SetSequenceAndQualities(bases, quals);
+    // both strings must round-trip unchanged
+    EXPECT_EQ(bases, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualOnly_Init_EmptyQual)
+{
+    const std::string bases{"ACGTACGTACGT"};
+    const std::string noQuals{};
+
+    BamRecordImpl record;
+    record.SetSequenceAndQualities(bases, noQuals);
+    // the sequence round-trips; the qualities read back as an empty string
+    EXPECT_EQ(bases, record.Sequence());
+    EXPECT_EQ(noQuals, record.Qualities().Fastq());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualOnly_Init_Preencoded)
+{
+
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+
+    const size_t encodedLength = (sequence.size() + 1) / 2;
+    char* encoded = static_cast<char*>(std::calloc(encodedLength, sizeof(char)));
+    char* e = encoded;
+
+    uint8_t nucleotideCode{};
+    bool useHighWord = true;
+    for (size_t i = 0; i < sequence.size(); ++i) {
+        switch (sequence.at(i)) {
+            case 'A':
+                nucleotideCode = 1;
+                break;
+            case 'C':
+                nucleotideCode = 2;
+                break;
+            case 'G':
+                nucleotideCode = 4;
+                break;
+            case 'T':
+                nucleotideCode = 8;
+                break;
+            default:
+                EXPECT_FALSE(true);
+                break;
+        }
+
+        // pack the nucleotide code
+        if (useHighWord) {
+            *e = nucleotideCode << 4;
+            useHighWord = false;
+        } else {
+            *e |= nucleotideCode;
+            ++e;
+            useHighWord = true;
+        }
+    }
+
+    BamRecordImpl bam;
+    bam.SetPreencodedSequenceAndQualities(encoded, sequence.size(), qualities.c_str());
+
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+
+    if (encoded) free(encoded);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualOnly_Init_Preencoded_EmptyQual)
+{
+
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "";
+
+    const auto encodedLength = (sequence.size() + 1) / 2;
+    auto* encoded = static_cast<char*>(std::calloc(encodedLength, sizeof(char)));
+    auto* e = encoded;
+
+    uint8_t nucleotideCode{};
+    bool useHighWord = true;
+    for (size_t i = 0; i < sequence.size(); ++i) {
+        switch (sequence.at(i)) {
+            case 'A':
+                nucleotideCode = 1;
+                break;
+            case 'C':
+                nucleotideCode = 2;
+                break;
+            case 'G':
+                nucleotideCode = 4;
+                break;
+            case 'T':
+                nucleotideCode = 8;
+                break;
+            default:
+                EXPECT_FALSE(true);
+                break;
+        }
+
+        // pack the nucleotide code
+        if (useHighWord) {
+            *e = nucleotideCode << 4;
+            useHighWord = false;
+        } else {
+            *e |= nucleotideCode;
+            ++e;
+            useHighWord = true;
+        }
+    }
+
+    BamRecordImpl bam;
+    bam.SetPreencodedSequenceAndQualities(encoded, sequence.size(), qualities.c_str());
+
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+
+    if (encoded) free(encoded);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualOnly_ThenOverwriteWithLongerSeq_NormalQual)
+{
+    const std::string longSeq{"ACGTACGTACGT"};
+    const std::string longQual{"?]?]?]?]?]?]"};
+    const std::string shortSeq{"ACGT"};
+    const std::string shortQual{"?]?]"};
+
+    BamRecordImpl record;
+    record.SetSequenceAndQualities(shortSeq, shortQual);
+    record.SetSequenceAndQualities(longSeq, longQual);
+    // only the second, longer pair must remain
+    EXPECT_EQ(longSeq, record.Sequence());
+    EXPECT_EQ(longQual, record.Qualities().Fastq());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualOnly_ThenOverwriteWithLongerSeq_EmptyQual)
+{
+    const std::string longSeq{"ACGTACGTACGT"};
+    const std::string noQual{};
+    const std::string shortSeq{"ACGT"};
+    const std::string shortQual{"?]?]"};
+
+    BamRecordImpl record;
+    record.SetSequenceAndQualities(shortSeq, shortQual);
+    record.SetSequenceAndQualities(longSeq, noQual);
+    // the longer sequence replaces the short one; qualities read back empty
+    EXPECT_EQ(longSeq, record.Sequence());
+    EXPECT_EQ(noQual, record.Qualities().Fastq());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualOnly_ThenOverwriteWithShorterSeq_NormalQual)
+{
+    const std::string longSeq{"ACGTACGTACGT"};
+    const std::string longQual{"?]?]?]?]?]?]"};
+    const std::string shortSeq{"ACGT"};
+    const std::string shortQual{"?]?]"};
+
+    BamRecordImpl record;
+    record.SetSequenceAndQualities(longSeq, longQual);
+    record.SetSequenceAndQualities(shortSeq, shortQual);
+    // only the second, shorter pair must remain
+    EXPECT_EQ(shortSeq, record.Sequence());
+    EXPECT_EQ(shortQual, record.Qualities().Fastq());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualOnly_ThenOverwriteWithShorterSeq_EmptyQual)
+{
+    const std::string longSeq{"ACGTACGTACGT"};
+    const std::string longQual{"?]?]?]?]?]?]"};
+    const std::string shortSeq{"ACGT"};
+    const std::string noQual{};
+
+    BamRecordImpl record;
+    record.SetSequenceAndQualities(longSeq, longQual);
+    record.SetSequenceAndQualities(shortSeq, noQual);
+    // the shorter sequence replaces the long one; qualities read back empty
+    EXPECT_EQ(shortSeq, record.Sequence());
+    EXPECT_EQ(noQual, record.Qualities().Fastq());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualOnly_ThenOverwriteWithEmptySeq)
+{
+    const std::string bases{"ACGTACGTACGT"};
+    const std::string quals{"?]?]?]?]?]?]"};
+    const std::string nothing{};
+
+    BamRecordImpl record;
+    record.SetSequenceAndQualities(bases, quals);
+    record.SetSequenceAndQualities(nothing, nothing);
+    // both sequence and qualities must read back empty after the overwrite
+    EXPECT_EQ(nothing, record.Sequence());
+    EXPECT_EQ(nothing, record.Qualities().Fastq());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualTag_Init_Normal)
+{
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(tags);
+
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualTag_Init_EmptySeqQual)
+{
+    const std::string sequence = "";
+    const std::string qualities = "";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(tags);
+
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualTag_Init_EmptyQual)
+{
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(tags);
+
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualTag_Init_EmptyTag)
+{
+    const std::string bases{"ACGTACGTACGT"};
+    const std::string quals{"?]?]?]?]?]?]"};
+
+    BamRecordImpl record;
+    record.SetSequenceAndQualities(bases, quals);
+    record.Tags(TagCollection{});
+    // sequence and qualities intact, and no tags present
+    EXPECT_EQ(bases, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+    EXPECT_EQ(0, record.Tags().size());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualTag_ThenOverwriteWithLongerSeq_NormalQual)
+{
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string shortSeq = "ACGT";
+    const std::string shortQual = "?]?]";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+    bam.Tags(tags);
+    bam.SetSequenceAndQualities(sequence, qualities);
+
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualTag_ThenOverwriteWithLongerSeq_EmptyQual)
+{  // Short seq/qual + tags, then a longer sequence with empty qualities replaces them.
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "";
+    const std::string shortSeq = "ACGT";
+    const std::string shortQual = "?]?]";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+    // Call order is the scenario: the longer sequence is written after the tags are in place.
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+    bam.Tags(tags);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    // The new sequence must be returned; Fastq() is expected to be the empty string.
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    // Expected SAM text lists the tags as CA, HX, XY (not insertion order), tab-separated.
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);  // helper defined outside this test
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualTag_ThenOverwriteWithShorterSeq_NormalQual)
+{  // Long seq/qual + tags are stored, then seq/qual are replaced by shorter ones.
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string shortSeq = "ACGT";
+    const std::string shortQual = "?]?]";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+    // Call order is the scenario: the shorter seq/qual are written after the tags are in place.
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(tags);
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+    // The replacement (shorter) seq/qual must be returned.
+    EXPECT_EQ(shortSeq, bam.Sequence());
+    EXPECT_EQ(shortQual, bam.Qualities().Fastq());
+    // Expected SAM text lists the tags as CA, HX, XY (not insertion order), tab-separated.
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);  // helper defined outside this test
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualTag_ThenOverwriteWithShorterSeq_EmptyQual)
+{  // Long seq/qual + tags, then a shorter sequence with empty qualities replaces them.
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string shortSeq = "ACGT";
+    const std::string shortQual = "";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+    // Call order is the scenario: the shorter sequence is written after the tags are in place.
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(tags);
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+    // The shorter sequence must be returned; Fastq() is expected to be the empty string.
+    EXPECT_EQ(shortSeq, bam.Sequence());
+    EXPECT_EQ(shortQual, bam.Qualities().Fastq());
+    // Expected SAM text lists the tags as CA, HX, XY (not insertion order), tab-separated.
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);  // helper defined outside this test
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualTag_ThenOverwriteWithEmptySeq)
+{  // Seq/qual + tags are stored, then sequence and qualities are both overwritten with "".
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string empty = "";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+    // Call order is the scenario: seq/qual are cleared after the tags are in place.
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(tags);
+    bam.SetSequenceAndQualities(empty, empty);
+    // Both the sequence and the FASTQ quality string must now be empty.
+    EXPECT_EQ(empty, bam.Sequence());
+    EXPECT_EQ(empty, bam.Qualities().Fastq());
+    // Expected SAM text lists the tags as CA, HX, XY (not insertion order), tab-separated.
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);  // helper defined outside this test
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualTag_ThenOverwriteWithLongerTags)
+{  // Two tags are stored after seq/qual, then replaced by a three-tag collection.
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    // Initial collection: HX and CA only.
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    // Replacement collection: same HX and CA plus an extra XY entry.
+    TagCollection longerTags;
+    longerTags["HX"] = std::string("1abc75");
+    longerTags["HX"].Modifier(TagModifier::HEX_STRING);
+    longerTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    longerTags["XY"] = int32_t{-42};
+    // Call order is the scenario: the larger collection overwrites the smaller one.
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(tags);
+    bam.Tags(longerTags);
+    // Seq/qual must be unchanged by the tag replacement.
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    // Expected SAM text is that of longerTags: CA, HX, XY (not insertion order), tab-separated.
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);  // helper defined outside this test
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualTag_ThenOverwriteWithShorterTags)
+{  // Three tags are stored after seq/qual, then replaced by a two-tag collection.
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    // Replacement collection: HX and CA only.
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    // Initial collection: same HX and CA plus an extra XY entry.
+    TagCollection longerTags;
+    longerTags["HX"] = std::string("1abc75");
+    longerTags["HX"].Modifier(TagModifier::HEX_STRING);
+    longerTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    longerTags["XY"] = int32_t{-42};
+    // Call order is the scenario: the smaller collection overwrites the larger one.
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(longerTags);
+    bam.Tags(tags);
+    // Seq/qual must be unchanged by the tag replacement.
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    // Expected SAM text holds only CA and HX (XY must be gone), tab-separated.
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);  // helper defined outside this test
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualTag_ThenOverwriteWithEmptyTags)
+{
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(tags);
+    bam.Tags(TagCollection());
+
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(0, bam.Tags().size());
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigar_Init_Normal)
+{  // Sequence/qualities are set first, then a CIGAR; all three must read back as given.
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    // Getters must round-trip the inputs (CIGAR compared via its string form).
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(bam);  // helper defined outside this test
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigar_Init_EmptySeqQual)
+{  // Empty sequence and qualities are set, then a non-empty CIGAR.
+    const std::string sequence = "";
+    const std::string qualities = "";
+    const std::string cigar = "100=";
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    // Seq/qual must read back empty while the CIGAR round-trips via its string form.
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(bam);  // helper defined outside this test
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigar_Init_EmptyQual)
+{  // A sequence with empty qualities is set, then a CIGAR.
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "";
+    const std::string cigar = "100=";
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    // Sequence and CIGAR must round-trip; Fastq() is expected to be the empty string.
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(bam);  // helper defined outside this test
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigar_Init_EmptyCigar)
+{  // Sequence/qualities are set, then an empty CIGAR string.
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "";
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    // Seq/qual must round-trip; the CIGAR's string form is expected to be empty.
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(bam);  // helper defined outside this test
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigar_ThenOverwriteWithLongerSeq_NormalQual)
+{  // Short seq/qual + CIGAR are stored, then seq/qual are replaced by longer ones.
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+    const std::string shortSeq = "ACGT";
+    const std::string shortQual = "?]?]";
+    // Call order is the scenario: the longer seq/qual are written after the CIGAR is in place.
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+    bam.CigarData(cigar);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    // The CIGAR must be unchanged and the replacement seq/qual must be returned.
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    BamRecordImplVariableDataTests::CheckRawData(bam);  // helper defined outside this test
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigar_ThenOverwriteWithLongerSeq_EmptyQual)
+{  // Short seq/qual + CIGAR, then a longer sequence with empty qualities replaces them.
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "";
+    const std::string cigar = "100=";
+    const std::string shortSeq = "ACGT";
+    const std::string shortQual = "?]?]";
+    // Call order is the scenario: the longer sequence is written after the CIGAR is in place.
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+    bam.CigarData(cigar);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    // The CIGAR must be unchanged; Fastq() is expected to be the empty string.
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    BamRecordImplVariableDataTests::CheckRawData(bam);  // helper defined outside this test
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigar_ThenOverwriteWithShorterSeq_NormalQual)
+{  // Long seq/qual + CIGAR are stored, then seq/qual are replaced by shorter ones.
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+    const std::string shortSeq = "ACGT";
+    const std::string shortQual = "?]?]";
+    // Call order is the scenario: the shorter seq/qual are written after the CIGAR is in place.
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+    // The CIGAR must be unchanged and the shorter seq/qual must be returned.
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+    EXPECT_EQ(shortSeq, bam.Sequence());
+    EXPECT_EQ(shortQual, bam.Qualities().Fastq());
+    BamRecordImplVariableDataTests::CheckRawData(bam);  // helper defined outside this test
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigar_ThenOverwriteWithShorterSeq_EmptyQual)
+{  // Long seq/qual + CIGAR, then a shorter sequence with empty qualities replaces them.
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+    const std::string shortSeq = "ACGT";
+    const std::string shortQual = "";
+    // Call order is the scenario: the shorter sequence is written after the CIGAR is in place.
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+    // The CIGAR must be unchanged; Fastq() is expected to be the empty string.
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+    EXPECT_EQ(shortSeq, bam.Sequence());
+    EXPECT_EQ(shortQual, bam.Qualities().Fastq());
+    BamRecordImplVariableDataTests::CheckRawData(bam);  // helper defined outside this test
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigar_ThenOverwriteWithEmptySeq)
+{  // Seq/qual + CIGAR are stored, then sequence and qualities are both overwritten with "".
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+    const std::string empty = "";
+    // Call order is the scenario: seq/qual are cleared after the CIGAR is in place.
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.SetSequenceAndQualities(empty, empty);
+    // Seq/qual must read back empty while the CIGAR is unchanged.
+    EXPECT_EQ(empty, bam.Sequence());
+    EXPECT_EQ(empty, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(bam);  // helper defined outside this test
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigar_ThenOverwriteWithLongerCigar)
+{  // A one-op CIGAR is stored after seq/qual, then replaced by a five-op CIGAR.
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+    const std::string longerCigar = "100=10D100=10I100X";
+    // Call order is the scenario: the longer CIGAR overwrites the shorter one.
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.CigarData(longerCigar);
+    // Seq/qual must be unchanged and the longer CIGAR must be returned.
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(longerCigar, bam.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(bam);  // helper defined outside this test
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigar_ThenOverwriteWithShorterCigar)
+{  // A five-op CIGAR is stored after seq/qual, then replaced by a one-op CIGAR.
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+    const std::string longerCigar = "100=10D100=10I100X";
+    // Call order is the scenario: the shorter CIGAR overwrites the longer one.
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(longerCigar);
+    bam.CigarData(cigar);
+    // Seq/qual must be unchanged and the shorter CIGAR must be returned.
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(bam);  // helper defined outside this test
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigar_ThenOverwriteWithEmptyCigar)
+{  // A CIGAR is stored after seq/qual, then overwritten with an empty CIGAR string.
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+    const std::string empty = "";
+    // Call order is the scenario: the empty CIGAR overwrites the populated one.
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.CigarData(empty);
+    // Seq/qual must be unchanged; the CIGAR's string form is expected to be empty.
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(empty, bam.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(bam);  // helper defined outside this test
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigarTag_Init_Normal)
+{  // Seq/qual, CIGAR and tags are set once, in that order; each must read back as given.
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    // Getters must round-trip the inputs (CIGAR compared via its string form).
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+    // Expected SAM text lists the tags as CA, HX, XY (not insertion order), tab-separated.
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);  // helper defined outside this test
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigarTag_Init_EmptySeqQual)
+{  // Empty seq/qual are set, followed by a non-empty CIGAR and three tags.
+    const std::string sequence = "";
+    const std::string qualities = "";
+    const std::string cigar = "100=";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    // Seq/qual must read back empty while the CIGAR round-trips via its string form.
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+    // Expected SAM text lists the tags as CA, HX, XY (not insertion order), tab-separated.
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);  // helper defined outside this test
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigarTag_Init_EmptyQual)
+{  // A sequence with empty qualities is set, followed by a CIGAR and three tags.
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "";
+    const std::string cigar = "100=";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    // Sequence and CIGAR must round-trip; Fastq() is expected to be the empty string.
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+    // Expected SAM text lists the tags as CA, HX, XY (not insertion order), tab-separated.
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);  // helper defined outside this test
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigarTag_Init_EmptyCigar)
+{  // Seq/qual are set, followed by an empty CIGAR string and three tags.
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    // Seq/qual must round-trip; the CIGAR's string form is expected to be empty.
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+    // Expected SAM text lists the tags as CA, HX, XY (not insertion order), tab-separated.
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);  // helper defined outside this test
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigarTag_Init_EmptyTag)
+{  // Seq/qual and a CIGAR are set, followed by an empty TagCollection.
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(TagCollection());
+    // Seq/qual and CIGAR must round-trip; the tag collection must come back empty.
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+    EXPECT_EQ(0u, bam.Tags().size());  // unsigned literal: no signed/unsigned comparison
+    BamRecordImplVariableDataTests::CheckRawData(bam);  // helper defined outside this test
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigarTag_ThenOverwriteWithLongerSeq_NormalQual)
+{  // Short seq/qual, CIGAR and tags are stored, then seq/qual are replaced by longer ones.
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+    const std::string shortSeq = "ACGT";
+    const std::string shortQual = "?]?]";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+    // Call order is the scenario: seq/qual are rewritten after CIGAR and tags are in place.
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    // The replacement seq/qual must be returned and the CIGAR must be unchanged.
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+    // Expected SAM text lists the tags as CA, HX, XY (not insertion order), tab-separated.
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);  // helper defined outside this test
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigarTag_ThenOverwriteWithLongerSeq_EmptyQual)
+{  // Short seq/qual, CIGAR and tags, then a longer sequence with empty qualities is written.
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "";
+    const std::string cigar = "100=";
+    const std::string shortSeq = "ACGT";
+    const std::string shortQual = "?]?]";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+    // Call order is the scenario: seq/qual are rewritten after CIGAR and tags are in place.
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    // New sequence returned, Fastq() expected empty, CIGAR unchanged.
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+    // Expected SAM text lists the tags as CA, HX, XY (not insertion order), tab-separated.
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);  // helper defined outside this test
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigarTag_ThenOverwriteWithShorterSeq_NormalQual)
+{  // Long seq/qual, CIGAR and tags are stored, then seq/qual are replaced by shorter ones.
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+    const std::string shortSeq = "ACGT";
+    const std::string shortQual = "?]?]";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+    // Call order is the scenario: seq/qual are rewritten after CIGAR and tags are in place.
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+    // The shorter seq/qual must be returned and the CIGAR must be unchanged.
+    EXPECT_EQ(shortSeq, bam.Sequence());
+    EXPECT_EQ(shortQual, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+    // Expected SAM text lists the tags as CA, HX, XY (not insertion order), tab-separated.
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);  // helper defined outside this test
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigarTag_ThenOverwriteWithShorterSeq_EmptyQual)
+{  // Long seq/qual, CIGAR and tags, then a shorter sequence with empty qualities is written.
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+    const std::string shortSeq = "ACGT";
+    const std::string shortQual = "";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+    // Call order is the scenario: seq/qual are rewritten after CIGAR and tags are in place.
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+    // Shorter sequence returned, Fastq() expected empty, CIGAR unchanged.
+    EXPECT_EQ(shortSeq, bam.Sequence());
+    EXPECT_EQ(shortQual, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+    // Expected SAM text lists the tags as CA, HX, XY (not insertion order), tab-separated.
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);  // helper defined outside this test
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigarTag_ThenOverwriteWithEmptySeq)
+{  // Seq/qual, CIGAR and tags are stored, then sequence and qualities are overwritten with "".
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+    const std::string empty = "";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+    // Call order is the scenario: seq/qual are cleared after CIGAR and tags are in place.
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.SetSequenceAndQualities(empty, empty);
+    // Seq/qual must read back empty while the CIGAR is unchanged.
+    EXPECT_EQ(empty, bam.Sequence());
+    EXPECT_EQ(empty, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+    // Expected SAM text lists the tags as CA, HX, XY (not insertion order), tab-separated.
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);  // helper defined outside this test
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigarTag_ThenOverwriteWithLongerCigar)
+{  // Seq/qual, a one-op CIGAR and tags are stored, then the CIGAR is replaced by a five-op one.
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+    const std::string longerCigar = "100=10D100=10I100X";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+    // Call order is the scenario: the CIGAR is rewritten after the tags are in place.
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.CigarData(longerCigar);
+    // Seq/qual must be unchanged and the longer CIGAR must be returned.
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(longerCigar, bam.CigarData().ToStdString());
+    // Expected SAM text lists the tags as CA, HX, XY (not insertion order), tab-separated.
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);  // helper defined outside this test
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigarTag_ThenOverwriteWithShorterCigar)
+{  // Seq/qual, a five-op CIGAR and tags are stored, then the CIGAR is replaced by a one-op one.
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+    const std::string longerCigar = "100=10D100=10I100X";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+    // Call order is the scenario: the CIGAR is rewritten after the tags are in place.
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(longerCigar);
+    bam.Tags(tags);
+    bam.CigarData(cigar);
+    // Seq/qual must be unchanged and the shorter CIGAR must be returned.
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+    // Expected SAM text lists the tags as CA, HX, XY (not insertion order), tab-separated.
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);  // helper defined outside this test
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigarTag_ThenOverwriteWithEmptyCigar)
+{  // Seq/qual, a CIGAR and tags are stored, then the CIGAR is overwritten with "".
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+    const std::string empty = "";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+    // Call order is the scenario: the CIGAR is cleared after the tags are in place.
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.CigarData(empty);
+    // Seq/qual must be unchanged; the CIGAR's string form is expected to be empty.
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(empty, bam.CigarData().ToStdString());
+    // Expected SAM text lists the tags as CA, HX, XY (not insertion order), tab-separated.
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);  // helper defined outside this test
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigarTag_ThenOverwriteWithLongerTags)
+{  // Seq/qual, CIGAR and two tags are stored, then the tags are replaced by a three-tag set.
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+    // Initial collection: HX and CA only.
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    // Replacement collection: same HX and CA plus an extra XY entry.
+    TagCollection longerTags;
+    longerTags["HX"] = std::string("1abc75");
+    longerTags["HX"].Modifier(TagModifier::HEX_STRING);
+    longerTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    longerTags["XY"] = int32_t{-42};
+    // Call order is the scenario: the larger collection overwrites the smaller one.
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.Tags(longerTags);
+    // Seq/qual and CIGAR must be unchanged by the tag replacement.
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+    // Expected SAM text is that of longerTags: CA, HX, XY (not insertion order), tab-separated.
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);  // helper defined outside this test
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigarTag_ThenOverwriteWithShorterTags)
+{  // Seq/qual, CIGAR and three tags are stored, then the tags are replaced by a two-tag set.
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+    // Replacement collection: HX and CA only.
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    // Initial collection: same HX and CA plus an extra XY entry.
+    TagCollection longerTags;
+    longerTags["HX"] = std::string("1abc75");
+    longerTags["HX"].Modifier(TagModifier::HEX_STRING);
+    longerTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    longerTags["XY"] = int32_t{-42};
+    // Call order is the scenario: the smaller collection overwrites the larger one.
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(longerTags);
+    bam.Tags(tags);
+    // Seq/qual and CIGAR must be unchanged by the tag replacement.
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+    // Expected SAM text holds only CA and HX (XY must be gone), tab-separated.
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);  // helper defined outside this test
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigarTag_ThenOverwriteWithEmptyTags)
+{
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.Tags(TagCollection());
+
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+    EXPECT_EQ(0, bam.Tags().size());
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameOnly_InitEmpty)
+{  // Setting an empty name on a fresh record must yield a zero-length Name().
+    BamRecordImpl bam;
+    bam.Name(std::string());
+    EXPECT_EQ(0u, bam.Name().size());  // unsigned literal: no signed/unsigned comparison
+    BamRecordImplVariableDataTests::CheckRawData(bam);  // helper defined outside this test
+}
+
+TEST(BamRecordImplVariableDataTest, NameOnly_InitNormal)
+{  // A name set on a fresh record must be returned unchanged by Name().
+    const std::string readName = "foo";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+
+    EXPECT_EQ(readName, bam.Name());
+    BamRecordImplVariableDataTests::CheckRawData(bam);  // helper defined outside this test
+}
+
+TEST(BamRecordImplVariableDataTest, NameOnly_ThenOverwriteWithLongerName)
+{
+    // Store a short name, then replace it with a longer one.
+    const std::string shortName("foo");
+    const std::string longName("this is a long read name");
+    BamRecordImpl record;
+    record.Name(shortName);
+    record.Name(longName);
+
+    EXPECT_EQ(longName, record.Name());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameOnly_ThenOverwriteWithShorterName)
+{
+    // Store a long name, then replace it with a shorter one.
+    const std::string shortName("foo");
+    const std::string longName("this is a long read name");
+    BamRecordImpl record;
+    record.Name(longName);
+    record.Name(shortName);
+
+    EXPECT_EQ(shortName, record.Name());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameOnly_ThenOverwriteWithEmptyName)
+{
+    // Store a name, then replace it with the empty string.
+    const std::string name("foo");
+    const std::string noName("");
+    BamRecordImpl record;
+    record.Name(name);
+    record.Name(noName);
+
+    EXPECT_EQ(noName, record.Name());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameTag_Init_Normal)
+{
+    const std::string readName = "foo";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.Tags(tags);
+
+    EXPECT_EQ(readName, bam.Name());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameTag_Init_EmptyName)
+{
+    const std::string readName = "";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.Tags(tags);
+
+    EXPECT_EQ(readName, bam.Name());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameTag_Init_EmptyTag)
+{
+    // Name plus an explicitly empty tag collection.
+    const std::string name("foo");
+    BamRecordImpl record;
+    record.Name(name);
+    record.Tags(TagCollection());
+
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(0u, record.Tags().size());  // 0u avoids a signed/unsigned comparison
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameTag_ThenOverwriteWithLongerName)
+{
+    const std::string readName = "foo";
+    const std::string longerName = "this is a long read name";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.Tags(tags);
+    bam.Name(longerName);
+
+    EXPECT_EQ(longerName, bam.Name());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameTag_ThenOverwriteWithShorterName)
+{
+    const std::string readName = "foo";
+    const std::string longerName = "this is a long read name";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.Name(longerName);
+    bam.Tags(tags);
+    bam.Name(readName);
+
+    EXPECT_EQ(readName, bam.Name());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameTag_ThenOverwriteWithEmptyName)
+{
+    const std::string readName = "foo";
+    const std::string empty = "";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.Tags(tags);
+    bam.Name(empty);
+
+    EXPECT_EQ(empty, bam.Name());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameTag_ThenOverwriteWithLongerTags)
+{
+    const std::string readName = "foo";
+
+    TagCollection longerTags;
+    longerTags["HX"] = std::string("1abc75");
+    longerTags["HX"].Modifier(TagModifier::HEX_STRING);
+    longerTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    longerTags["XY"] = int32_t{-42};
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.Tags(tags);
+    bam.Tags(longerTags);
+
+    EXPECT_EQ(readName, bam.Name());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameTag_ThenOverwriteWithShorterTags)
+{
+    const std::string readName = "foo";
+
+    TagCollection longerTags;
+    longerTags["HX"] = std::string("1abc75");
+    longerTags["HX"].Modifier(TagModifier::HEX_STRING);
+    longerTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    longerTags["XY"] = int32_t{-42};
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.Tags(longerTags);
+    bam.Tags(tags);
+
+    EXPECT_EQ(readName, bam.Name());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameTag_ThenOverwriteWithEmptyTags)
+{
+    // Replace populated tags with an empty collection; the name must survive.
+    const std::string name("foo");
+    TagCollection tagData;
+    tagData["HX"] = std::string{"1abc75"};
+    tagData["HX"].Modifier(TagModifier::HEX_STRING);
+    tagData["CA"] = std::vector<uint8_t>{34, 5, 125};
+    tagData["XY"] = static_cast<int32_t>(-42);
+
+    BamRecordImpl record;
+    record.Name(name);
+    record.Tags(tagData);
+    record.Tags(TagCollection());
+
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(0u, record.Tags().size());  // 0u avoids a signed/unsigned comparison
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigar_Init_Normal)
+{
+    // Name followed by CIGAR; both must read back intact.
+    const std::string name("foo");
+    const std::string cigarText("100=");
+    BamRecordImpl record;
+    record.Name(name);
+    record.CigarData(cigarText);
+
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigar_Init_EmptyName)
+{
+    // Empty name followed by CIGAR; both must read back intact.
+    const std::string name("");
+    const std::string cigarText("100=");
+    BamRecordImpl record;
+    record.Name(name);
+    record.CigarData(cigarText);
+
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigar_Init_EmptyCigar)
+{
+    // Name followed by an empty CIGAR string; both must read back intact.
+    const std::string name("foo");
+    const std::string cigarText("");
+    BamRecordImpl record;
+    record.Name(name);
+    record.CigarData(cigarText);
+
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigar_ThenOverwriteWithLongerName)
+{
+    // Grow the name after the CIGAR was stored; the CIGAR must be unaffected.
+    const std::string shortName("foo");
+    const std::string cigarText("100=");
+    const std::string longName("this is a long read name");
+    BamRecordImpl record;
+    record.Name(shortName);
+    record.CigarData(cigarText);
+    record.Name(longName);
+
+    EXPECT_EQ(longName, record.Name());
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigar_ThenOverwriteWithShorterName)
+{
+    // Shrink the name after the CIGAR was stored; the CIGAR must be unaffected.
+    const std::string shortName("foo");
+    const std::string cigarText("100=");
+    const std::string longName("this is a long read name");
+    BamRecordImpl record;
+    record.Name(longName);
+    record.CigarData(cigarText);
+    record.Name(shortName);
+
+    EXPECT_EQ(shortName, record.Name());
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigar_ThenOverwriteWithEmptyName)
+{
+    // Clear the name after the CIGAR was stored; the CIGAR must be unaffected.
+    const std::string name("foo");
+    const std::string cigarText("100=");
+    const std::string noName("");
+    BamRecordImpl record;
+    record.Name(name);
+    record.CigarData(cigarText);
+    record.Name(noName);
+
+    EXPECT_EQ(noName, record.Name());
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigar_ThenOverwriteWithLongerCigar)
+{
+    // Replace a short CIGAR with a longer one; the name must survive.
+    const std::string name("foo");
+    const std::string shortCigar("100=");
+    const std::string longCigar("100=10D100=10I100X");
+    BamRecordImpl record;
+    record.Name(name);
+    record.CigarData(shortCigar);
+    record.CigarData(longCigar);
+
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(longCigar, record.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigar_ThenOverwriteWithShorterCigar)
+{
+    // Replace a long CIGAR with a shorter one; the name must survive.
+    const std::string name("foo");
+    const std::string shortCigar("100=");
+    const std::string longCigar("100=10D100=10I100X");
+    BamRecordImpl record;
+    record.Name(name);
+    record.CigarData(longCigar);
+    record.CigarData(shortCigar);
+
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(shortCigar, record.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigar_ThenOverwriteWithEmptyCigar)
+{
+    // Replace a CIGAR with the empty string; the name must survive.
+    const std::string name("foo");
+    const std::string cigarText("100=");
+    const std::string noCigar("");
+    BamRecordImpl record;
+    record.Name(name);
+    record.CigarData(cigarText);
+    record.CigarData(noCigar);
+
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(noCigar, record.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigarTag_Init_Normal)
+{
+    const std::string readName = "foo";
+    const std::string cigar = "100=";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigarTag_Init_EmptyName)
+{
+    const std::string readName = "";
+    const std::string cigar = "100=";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigarTag_Init_EmptyCigar)
+{
+    const std::string readName = "foo";
+    const std::string cigar = "";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigarTag_Init_EmptyTag)
+{
+    // Name and CIGAR plus an explicitly empty tag collection.
+    const std::string name("foo");
+    const std::string cigarText("100=");
+    BamRecordImpl record;
+    record.Name(name);
+    record.CigarData(cigarText);
+    record.Tags(TagCollection());
+
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+    EXPECT_EQ(0u, record.Tags().size());  // 0u avoids a signed/unsigned comparison
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigarTag_ThenOverwriteWithLongerName)
+{
+    const std::string readName = "foo";
+    const std::string cigar = "100=";
+    const std::string longerName = "this is a long read name";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.Name(longerName);
+
+    EXPECT_EQ(longerName, bam.Name());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigarTag_ThenOverwriteWithShorterName)
+{
+    const std::string readName = "foo";
+    const std::string cigar = "100=";
+    const std::string longerName = "this is a long read name";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.Name(longerName);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.Name(readName);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigarTag_ThenOverwriteWithEmptyName)
+{
+    const std::string readName = "foo";
+    const std::string cigar = "100=";
+    const std::string empty = "";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.Name(empty);
+
+    EXPECT_EQ(empty, bam.Name());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigarTag_ThenOverwriteWithLongerCigar)
+{
+    const std::string readName = "foo";
+    const std::string cigar = "100=";
+    const std::string longerCigar = "100=10D100=10I100X";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.CigarData(longerCigar);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(longerCigar, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigarTag_ThenOverwriteWithShorterCigar)
+{
+    const std::string readName = "foo";
+    const std::string cigar = "100=";
+    const std::string longerCigar = "100=10D100=10I100X";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.CigarData(longerCigar);
+    bam.Tags(tags);
+    bam.CigarData(cigar);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigarTag_ThenOverwriteWithEmptyCigar)
+{
+    const std::string readName = "foo";
+    const std::string cigar = "100=";
+    const std::string empty = "";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.CigarData(empty);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(empty, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigarTag_ThenOverwriteWithLongerTags)
+{
+    const std::string readName = "foo";
+    const std::string cigar = "100=";
+
+    TagCollection longerTags;
+    longerTags["HX"] = std::string("1abc75");
+    longerTags["HX"].Modifier(TagModifier::HEX_STRING);
+    longerTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    longerTags["XY"] = int32_t{-42};
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.Tags(longerTags);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigarTag_ThenOverwriteWithShorterTags)
+{
+    const std::string readName = "foo";
+    const std::string cigar = "100=";
+
+    TagCollection longerTags;
+    longerTags["HX"] = std::string("1abc75");
+    longerTags["HX"].Modifier(TagModifier::HEX_STRING);
+    longerTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    longerTags["XY"] = int32_t{-42};
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.CigarData(cigar);
+    bam.Tags(longerTags);
+    bam.Tags(tags);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigarTag_ThenOverwriteWithEmptyTags)
+{
+    const std::string readName = "foo";
+    const std::string cigar = "100=";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.Tags(TagCollection());
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+    EXPECT_EQ(0, bam.Tags().size());
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQual_Init_Normal)
+{
+    // Name followed by sequence and qualities; all must read back intact.
+    const std::string name("foo");
+    const std::string bases("ACGTACGTACGT");
+    const std::string quals("?]?]?]?]?]?]");
+    BamRecordImpl record;
+    record.Name(name);
+    record.SetSequenceAndQualities(bases, quals);
+
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(bases, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQual_Init_EmptySeqQual)
+{
+    // Name followed by empty sequence and empty qualities.
+    const std::string name("foo");
+    const std::string bases("");
+    const std::string quals("");
+    BamRecordImpl record;
+    record.Name(name);
+    record.SetSequenceAndQualities(bases, quals);
+
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(bases, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQual_Init_EmptyQual)
+{
+    // Name followed by a sequence that has no qualities.
+    const std::string name("foo");
+    const std::string bases("ACGTACGTACGT");
+    const std::string quals("");
+    BamRecordImpl record;
+    record.Name(name);
+    record.SetSequenceAndQualities(bases, quals);
+
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(bases, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQual_ThenOverwriteWithLongerName)
+{
+    // Grow the name after seq/quals were stored; seq/quals must be unaffected.
+    const std::string shortName("foo");
+    const std::string bases("ACGTACGTACGT");
+    const std::string quals("?]?]?]?]?]?]");
+    const std::string longName("this is a long read name");
+    BamRecordImpl record;
+    record.Name(shortName);
+    record.SetSequenceAndQualities(bases, quals);
+    record.Name(longName);
+
+    EXPECT_EQ(longName, record.Name());
+    EXPECT_EQ(bases, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQual_ThenOverwriteWithShorterName)
+{
+    // Shrink the name after seq/quals were stored; seq/quals must be unaffected.
+    const std::string shortName("foo");
+    const std::string bases("ACGTACGTACGT");
+    const std::string quals("?]?]?]?]?]?]");
+    const std::string longName("this is a long read name");
+    BamRecordImpl record;
+    record.Name(longName);
+    record.SetSequenceAndQualities(bases, quals);
+    record.Name(shortName);
+
+    EXPECT_EQ(shortName, record.Name());
+    EXPECT_EQ(bases, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQual_ThenOverwriteWithEmptyName)
+{
+    // Clear the name after seq/quals were stored; seq/quals must be unaffected.
+    const std::string name("foo");
+    const std::string bases("ACGTACGTACGT");
+    const std::string quals("?]?]?]?]?]?]");
+    const std::string noName("");
+    BamRecordImpl record;
+    record.Name(name);
+    record.SetSequenceAndQualities(bases, quals);
+    record.Name(noName);
+
+    EXPECT_EQ(noName, record.Name());
+    EXPECT_EQ(bases, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQual_ThenOverwriteWithLongerSeq_NormalQual)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string shortSeq = "ACGT";
+    const std::string shortQual = "?]?]";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+    bam.SetSequenceAndQualities(sequence, qualities);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQual_ThenOverwriteWithLongerSeq_EmptyQual)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "";
+    const std::string shortSeq = "ACGT";
+    const std::string shortQual = "?]?]";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+    bam.SetSequenceAndQualities(sequence, qualities);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQual_ThenOverwriteWithShorterSeq_NormalQual)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string shortSeq = "ACGT";
+    const std::string shortQual = "?]?]";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(shortSeq, bam.Sequence());
+    EXPECT_EQ(shortQual, bam.Qualities().Fastq());
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQual_ThenOverwriteWithShorterSeq_EmptyQual)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string shortSeq = "ACGT";
+    const std::string shortQual = "";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(shortSeq, bam.Sequence());
+    EXPECT_EQ(shortQual, bam.Qualities().Fastq());
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQual_ThenOverwriteWithEmptySeq)
+{
+    // Replace a seq/qual pair with empty strings; the name must survive.
+    const std::string name("foo");
+    const std::string bases("ACGTACGTACGT");
+    const std::string quals("?]?]?]?]?]?]");
+    const std::string nothing("");
+    BamRecordImpl record;
+    record.Name(name);
+    record.SetSequenceAndQualities(bases, quals);
+    record.SetSequenceAndQualities(nothing, nothing);
+
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(nothing, record.Sequence());
+    EXPECT_EQ(nothing, record.Qualities().Fastq());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualTag_Init_Normal)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(tags);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualTag_Init_EmptyName)
+{
+    const std::string readName = "";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(tags);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualTag_Init_EmptySeqQual)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "";
+    const std::string qualities = "";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(tags);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualTag_Init_EmptyQual)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(tags);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualTag_Init_EmptyTag)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    // Populate name and sequence/qualities, then store an empty tag collection.
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(TagCollection());
+    // Fields must read back as written; 0u keeps the size() comparison unsigned on both sides.
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(0u, bam.Tags().size());
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualTag_ThenOverwriteWithLongerName)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string longerName = "this is a long read name";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(tags);
+    bam.Name(longerName);
+
+    EXPECT_EQ(longerName, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualTag_ThenOverwriteWithShorterName)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string longerName = "this is a long read name";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.Name(longerName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(tags);
+    bam.Name(readName);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualTag_ThenOverwriteWithEmptyName)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string empty = "";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(tags);
+    bam.Name(empty);
+
+    EXPECT_EQ(empty, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualTag_ThenOverwriteWithLongerSeq_NormalQual)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string shortSeq = "ACGT";
+    const std::string shortQual = "?]?]";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+    bam.Tags(tags);
+    bam.SetSequenceAndQualities(sequence, qualities);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualTag_ThenOverwriteWithLongerSeq_EmptyQual)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "";
+    const std::string shortSeq = "ACGT";
+    const std::string shortQual = "?]?]";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+    bam.Tags(tags);
+    bam.SetSequenceAndQualities(sequence, qualities);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualTag_ThenOverwriteWithShorterSeq_NormalQual)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string shortSeq = "ACGT";
+    const std::string shortQual = "?]?]";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(tags);
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(shortSeq, bam.Sequence());
+    EXPECT_EQ(shortQual, bam.Qualities().Fastq());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualTag_ThenOverwriteWithShorterSeq_EmptyQual)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string shortSeq = "ACGT";
+    const std::string shortQual = "";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(tags);
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(shortSeq, bam.Sequence());
+    EXPECT_EQ(shortQual, bam.Qualities().Fastq());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualTag_ThenOverwriteWithEmptySeq)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string empty = "";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(tags);
+    bam.SetSequenceAndQualities(empty, empty);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(empty, bam.Sequence());
+    EXPECT_EQ(empty, bam.Qualities().Fastq());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualTag_ThenOverwriteWithLongerTags)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+
+    TagCollection longerTags;
+    longerTags["HX"] = std::string("1abc75");
+    longerTags["HX"].Modifier(TagModifier::HEX_STRING);
+    longerTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    longerTags["XY"] = int32_t{-42};
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(tags);
+    bam.Tags(longerTags);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualTag_ThenOverwriteWithShorterTags)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+
+    TagCollection longerTags;
+    longerTags["HX"] = std::string("1abc75");
+    longerTags["HX"].Modifier(TagModifier::HEX_STRING);
+    longerTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    longerTags["XY"] = int32_t{-42};
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(longerTags);
+    bam.Tags(tags);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualTag_ThenOverwriteWithEmptyTags)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(tags);
+    bam.Tags(TagCollection());
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(0, bam.Tags().size());
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigar_Init_Normal)
+{
+    const std::string name{"foo"};
+    const std::string bases{"ACGTACGTACGT"};
+    const std::string quals{"?]?]?]?]?]?]"};
+    const std::string cigarText{"100="};
+    // Populate name, sequence/qualities, and CIGAR, each exactly once.
+    BamRecordImpl record;
+    record.Name(name);
+    record.SetSequenceAndQualities(bases, quals);
+    record.CigarData(cigarText);
+    // Each field is expected to read back as written.
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(bases, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigar_Init_EmptyName)
+{
+    const std::string name{""};
+    const std::string bases{"ACGTACGTACGT"};
+    const std::string quals{"?]?]?]?]?]?]"};
+    const std::string cigarText{"100="};
+    // Populate an empty name, followed by sequence/qualities and CIGAR.
+    BamRecordImpl record;
+    record.Name(name);
+    record.SetSequenceAndQualities(bases, quals);
+    record.CigarData(cigarText);
+    // Each field, including the empty name, is expected to read back as written.
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(bases, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigar_Init_EmptySeqQual)
+{
+    const std::string name{"foo"};
+    const std::string bases{""};
+    const std::string quals{""};
+    const std::string cigarText{"100="};
+    // Populate the name, an empty sequence/qualities pair, and CIGAR.
+    BamRecordImpl record;
+    record.Name(name);
+    record.SetSequenceAndQualities(bases, quals);
+    record.CigarData(cigarText);
+    // Each field, including the empty sequence/qualities, is expected to read back as written.
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(bases, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigar_Init_EmptyQual)
+{
+    const std::string name{"foo"};
+    const std::string bases{"ACGTACGTACGT"};
+    const std::string quals{""};
+    const std::string cigarText{"100="};
+    // Populate the name, a sequence paired with empty qualities, and CIGAR.
+    BamRecordImpl record;
+    record.Name(name);
+    record.SetSequenceAndQualities(bases, quals);
+    record.CigarData(cigarText);
+    // Each field, including the empty qualities, is expected to read back as written.
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(bases, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigar_Init_EmptyCigar)
+{
+    const std::string name{"foo"};
+    const std::string bases{"ACGTACGTACGT"};
+    const std::string quals{"?]?]?]?]?]?]"};
+    const std::string cigarText{""};
+    // Populate name and sequence/qualities, then set an empty CIGAR string.
+    BamRecordImpl record;
+    record.Name(name);
+    record.SetSequenceAndQualities(bases, quals);
+    record.CigarData(cigarText);
+    // Each field, including the empty CIGAR, is expected to read back as written.
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(bases, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigar_ThenOverwriteWithLongerName)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+    const std::string longerName = "this is a long read name";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Name(longerName);
+
+    EXPECT_EQ(longerName, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigar_ThenOverwriteWithShorterName)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+    const std::string longerName = "this is a long read name";
+
+    BamRecordImpl bam;
+    bam.Name(longerName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Name(readName);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigar_ThenOverwriteWithEmptyName)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+    const std::string empty = "";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Name(empty);
+
+    EXPECT_EQ(empty, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigar_ThenOverwriteWithLongerSeq_NormalQual)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+    const std::string shortSeq = "ACGT";
+    const std::string shortQual = "?]?]";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+    bam.CigarData(cigar);
+    bam.SetSequenceAndQualities(sequence, qualities);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigar_ThenOverwriteWithLongerSeq_EmptyQual)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "";
+    const std::string cigar = "100=";
+    const std::string shortSeq = "ACGT";
+    const std::string shortQual = "?]?]";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+    bam.CigarData(cigar);
+    bam.SetSequenceAndQualities(sequence, qualities);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigar_ThenOverwriteWithShorterSeq_NormalQual)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+    const std::string shortSeq = "ACGT";
+    const std::string shortQual = "?]?]";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(shortSeq, bam.Sequence());
+    EXPECT_EQ(shortQual, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigar_ThenOverwriteWithShorterSeq_EmptyQual)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+    const std::string shortSeq = "ACGT";
+    const std::string shortQual = "";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(shortSeq, bam.Sequence());
+    EXPECT_EQ(shortQual, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigar_ThenOverwriteWithEmptySeq)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+    const std::string empty = "";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.SetSequenceAndQualities(empty, empty);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(empty, bam.Sequence());
+    EXPECT_EQ(empty, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigar_ThenOverwriteWithLongerCigar)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+    const std::string longerCigar = "100=10D100=10I100X";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.CigarData(longerCigar);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(longerCigar, bam.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigar_ThenOverwriteWithShorterCigar)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+    const std::string longerCigar = "100=10D100=10I100X";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(longerCigar);
+    bam.CigarData(cigar);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigar_ThenOverwriteWithEmptyCigar)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+    const std::string empty = "";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.CigarData(empty);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(empty, bam.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+// @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_Init_Normal)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_Init_EmptyName)
+{
+    const std::string readName = "";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_Init_EmptySeqQual)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "";
+    const std::string qualities = "";
+    const std::string cigar = "100=";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_Init_EmptyQual)
+{
+    // Scenario: every variable-length field is set once; the quality string is empty.
+    const std::string name = "foo";
+    const std::string bases = "ACGTACGTACGT";
+    const std::string quals = "";
+    const std::string cigarText = "100=";
+    const std::string expectedSam = "CA:B:C,34,5,125\tHX:H:1abc75\tXY:i:-42";
+
+    TagCollection inputTags;
+    inputTags["HX"] = std::string("1abc75");
+    inputTags["HX"].Modifier(TagModifier::HEX_STRING);
+    inputTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    inputTags["XY"] = int32_t{-42};
+
+    BamRecordImpl record;
+    record.Name(name);
+    record.SetSequenceAndQualities(bases, quals);
+    record.CigarData(cigarText);
+    record.Tags(inputTags);
+
+    // Each accessor must hand back exactly what was stored.
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(bases, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+    EXPECT_EQ(expectedSam, SamTagCodec::Encode(record.Tags()));
+
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_Init_EmptyCigar)
+{
+    // Scenario: every variable-length field is set once; the CIGAR string is empty.
+    const std::string name = "foo";
+    const std::string bases = "ACGTACGTACGT";
+    const std::string quals = "?]?]?]?]?]?]";
+    const std::string cigarText = "";
+    const std::string expectedSam = "CA:B:C,34,5,125\tHX:H:1abc75\tXY:i:-42";
+
+    TagCollection inputTags;
+    inputTags["HX"] = std::string("1abc75");
+    inputTags["HX"].Modifier(TagModifier::HEX_STRING);
+    inputTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    inputTags["XY"] = int32_t{-42};
+
+    BamRecordImpl record;
+    record.Name(name);
+    record.SetSequenceAndQualities(bases, quals);
+    record.CigarData(cigarText);
+    record.Tags(inputTags);
+
+    // Each accessor must hand back exactly what was stored.
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(bases, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+    EXPECT_EQ(expectedSam, SamTagCodec::Encode(record.Tags()));
+
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_Init_EmptyTag)
+{
+    // Scenario: name, sequence/qualities and CIGAR are populated, but the
+    // tag collection handed to the record is empty.
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(TagCollection());
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+
+    // Unsigned literal: avoids a signed/unsigned comparison against size().
+    EXPECT_EQ(0u, bam.Tags().size());
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_ThenOverwriteWithLongerName)
+{
+    // Scenario: all fields are populated, then the name is replaced by a longer one.
+    const std::string shortName = "foo";
+    const std::string longName = "this is a long read name";
+    const std::string bases = "ACGTACGTACGT";
+    const std::string quals = "?]?]?]?]?]?]";
+    const std::string cigarText = "100=";
+    const std::string expectedSam = "CA:B:C,34,5,125\tHX:H:1abc75\tXY:i:-42";
+
+    TagCollection inputTags;
+    inputTags["HX"] = std::string("1abc75");
+    inputTags["HX"].Modifier(TagModifier::HEX_STRING);
+    inputTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    inputTags["XY"] = int32_t{-42};
+
+    BamRecordImpl record;
+    record.Name(shortName);
+    record.SetSequenceAndQualities(bases, quals);
+    record.CigarData(cigarText);
+    record.Tags(inputTags);
+    record.Name(longName);  // the overwrite under test
+
+    // The new name is visible and the other fields are unchanged.
+    EXPECT_EQ(longName, record.Name());
+    EXPECT_EQ(bases, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+    EXPECT_EQ(expectedSam, SamTagCodec::Encode(record.Tags()));
+
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_ThenOverwriteWithShorterName)
+{
+    // Scenario: all fields are populated, then the name is replaced by a shorter one.
+    const std::string shortName = "foo";
+    const std::string longName = "this is a long read name";
+    const std::string bases = "ACGTACGTACGT";
+    const std::string quals = "?]?]?]?]?]?]";
+    const std::string cigarText = "100=";
+    const std::string expectedSam = "CA:B:C,34,5,125\tHX:H:1abc75\tXY:i:-42";
+
+    TagCollection inputTags;
+    inputTags["HX"] = std::string("1abc75");
+    inputTags["HX"].Modifier(TagModifier::HEX_STRING);
+    inputTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    inputTags["XY"] = int32_t{-42};
+
+    BamRecordImpl record;
+    record.Name(longName);
+    record.SetSequenceAndQualities(bases, quals);
+    record.CigarData(cigarText);
+    record.Tags(inputTags);
+    record.Name(shortName);  // the overwrite under test
+
+    // The new name is visible and the other fields are unchanged.
+    EXPECT_EQ(shortName, record.Name());
+    EXPECT_EQ(bases, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+    EXPECT_EQ(expectedSam, SamTagCodec::Encode(record.Tags()));
+
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_ThenOverwriteWithEmptyName)
+{
+    // Scenario: all fields are populated, then the name is replaced by an empty string.
+    const std::string initialName = "foo";
+    const std::string noName = "";
+    const std::string bases = "ACGTACGTACGT";
+    const std::string quals = "?]?]?]?]?]?]";
+    const std::string cigarText = "100=";
+    const std::string expectedSam = "CA:B:C,34,5,125\tHX:H:1abc75\tXY:i:-42";
+
+    TagCollection inputTags;
+    inputTags["HX"] = std::string("1abc75");
+    inputTags["HX"].Modifier(TagModifier::HEX_STRING);
+    inputTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    inputTags["XY"] = int32_t{-42};
+
+    BamRecordImpl record;
+    record.Name(initialName);
+    record.SetSequenceAndQualities(bases, quals);
+    record.CigarData(cigarText);
+    record.Tags(inputTags);
+    record.Name(noName);  // the overwrite under test
+
+    // The name reads back empty and the other fields are unchanged.
+    EXPECT_EQ(noName, record.Name());
+    EXPECT_EQ(bases, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+    EXPECT_EQ(expectedSam, SamTagCodec::Encode(record.Tags()));
+
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_ThenOverwriteWithLongerSeq_NormalQual)
+{
+    // Scenario: a short sequence/quality pair is stored first and later
+    // replaced by a longer pair of matching length.
+    const std::string name = "foo";
+    const std::string shortBases = "ACGT";
+    const std::string shortQuals = "?]?]";
+    const std::string longBases = "ACGTACGTACGT";
+    const std::string longQuals = "?]?]?]?]?]?]";
+    const std::string cigarText = "100=";
+    const std::string expectedSam = "CA:B:C,34,5,125\tHX:H:1abc75\tXY:i:-42";
+
+    TagCollection inputTags;
+    inputTags["HX"] = std::string("1abc75");
+    inputTags["HX"].Modifier(TagModifier::HEX_STRING);
+    inputTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    inputTags["XY"] = int32_t{-42};
+
+    BamRecordImpl record;
+    record.Name(name);
+    record.SetSequenceAndQualities(shortBases, shortQuals);
+    record.CigarData(cigarText);
+    record.Tags(inputTags);
+    record.SetSequenceAndQualities(longBases, longQuals);  // the overwrite under test
+
+    // The longer data is visible and the other fields are unchanged.
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(longBases, record.Sequence());
+    EXPECT_EQ(longQuals, record.Qualities().Fastq());
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+    EXPECT_EQ(expectedSam, SamTagCodec::Encode(record.Tags()));
+
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_ThenOverwriteWithLongerSeq_EmptyQual)
+{
+    // Scenario: a short sequence/quality pair is stored first and later
+    // replaced by a longer sequence that comes with an empty quality string.
+    const std::string name = "foo";
+    const std::string shortBases = "ACGT";
+    const std::string shortQuals = "?]?]";
+    const std::string longBases = "ACGTACGTACGT";
+    const std::string noQuals = "";
+    const std::string cigarText = "100=";
+    const std::string expectedSam = "CA:B:C,34,5,125\tHX:H:1abc75\tXY:i:-42";
+
+    TagCollection inputTags;
+    inputTags["HX"] = std::string("1abc75");
+    inputTags["HX"].Modifier(TagModifier::HEX_STRING);
+    inputTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    inputTags["XY"] = int32_t{-42};
+
+    BamRecordImpl record;
+    record.Name(name);
+    record.SetSequenceAndQualities(shortBases, shortQuals);
+    record.CigarData(cigarText);
+    record.Tags(inputTags);
+    record.SetSequenceAndQualities(longBases, noQuals);  // the overwrite under test
+
+    // The longer sequence is visible, qualities read back empty, the rest is unchanged.
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(longBases, record.Sequence());
+    EXPECT_EQ(noQuals, record.Qualities().Fastq());
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+    EXPECT_EQ(expectedSam, SamTagCodec::Encode(record.Tags()));
+
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_ThenOverwriteWithShorterSeq_NormalQual)
+{
+    // Scenario: a long sequence/quality pair is stored first and later
+    // replaced by a shorter pair of matching length.
+    const std::string name = "foo";
+    const std::string longBases = "ACGTACGTACGT";
+    const std::string longQuals = "?]?]?]?]?]?]";
+    const std::string shortBases = "ACGT";
+    const std::string shortQuals = "?]?]";
+    const std::string cigarText = "100=";
+    const std::string expectedSam = "CA:B:C,34,5,125\tHX:H:1abc75\tXY:i:-42";
+
+    TagCollection inputTags;
+    inputTags["HX"] = std::string("1abc75");
+    inputTags["HX"].Modifier(TagModifier::HEX_STRING);
+    inputTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    inputTags["XY"] = int32_t{-42};
+
+    BamRecordImpl record;
+    record.Name(name);
+    record.SetSequenceAndQualities(longBases, longQuals);
+    record.CigarData(cigarText);
+    record.Tags(inputTags);
+    record.SetSequenceAndQualities(shortBases, shortQuals);  // the overwrite under test
+
+    // The shorter data is visible and the other fields are unchanged.
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(shortBases, record.Sequence());
+    EXPECT_EQ(shortQuals, record.Qualities().Fastq());
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+    EXPECT_EQ(expectedSam, SamTagCodec::Encode(record.Tags()));
+
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_ThenOverwriteWithShorterSeq_EmptyQual)
+{
+    // Scenario: a long sequence/quality pair is stored first and later
+    // replaced by a shorter sequence that comes with an empty quality string.
+    const std::string name = "foo";
+    const std::string longBases = "ACGTACGTACGT";
+    const std::string longQuals = "?]?]?]?]?]?]";
+    const std::string shortBases = "ACGT";
+    const std::string noQuals = "";
+    const std::string cigarText = "100=";
+    const std::string expectedSam = "CA:B:C,34,5,125\tHX:H:1abc75\tXY:i:-42";
+
+    TagCollection inputTags;
+    inputTags["HX"] = std::string("1abc75");
+    inputTags["HX"].Modifier(TagModifier::HEX_STRING);
+    inputTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    inputTags["XY"] = int32_t{-42};
+
+    BamRecordImpl record;
+    record.Name(name);
+    record.SetSequenceAndQualities(longBases, longQuals);
+    record.CigarData(cigarText);
+    record.Tags(inputTags);
+    record.SetSequenceAndQualities(shortBases, noQuals);  // the overwrite under test
+
+    // The shorter sequence is visible, qualities read back empty, the rest is unchanged.
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(shortBases, record.Sequence());
+    EXPECT_EQ(noQuals, record.Qualities().Fastq());
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+    EXPECT_EQ(expectedSam, SamTagCodec::Encode(record.Tags()));
+
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_ThenOverwriteWithEmptySeq)
+{
+    // Scenario: all fields are populated, then sequence and qualities are both
+    // replaced by empty strings.
+    const std::string name = "foo";
+    const std::string bases = "ACGTACGTACGT";
+    const std::string quals = "?]?]?]?]?]?]";
+    const std::string cigarText = "100=";
+    const std::string nothing = "";
+    const std::string expectedSam = "CA:B:C,34,5,125\tHX:H:1abc75\tXY:i:-42";
+
+    TagCollection inputTags;
+    inputTags["HX"] = std::string("1abc75");
+    inputTags["HX"].Modifier(TagModifier::HEX_STRING);
+    inputTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    inputTags["XY"] = int32_t{-42};
+
+    BamRecordImpl record;
+    record.Name(name);
+    record.SetSequenceAndQualities(bases, quals);
+    record.CigarData(cigarText);
+    record.Tags(inputTags);
+    record.SetSequenceAndQualities(nothing, nothing);  // the overwrite under test
+
+    // Sequence and qualities read back empty; the other fields are unchanged.
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(nothing, record.Sequence());
+    EXPECT_EQ(nothing, record.Qualities().Fastq());
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+    EXPECT_EQ(expectedSam, SamTagCodec::Encode(record.Tags()));
+
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_ThenOverwriteWithLongerCigar)
+{
+    // Scenario: all fields are populated, then the CIGAR is replaced by a longer one.
+    const std::string name = "foo";
+    const std::string bases = "ACGTACGTACGT";
+    const std::string quals = "?]?]?]?]?]?]";
+    const std::string shortCigar = "100=";
+    const std::string longCigar = "100=10D100=10I100X";
+    const std::string expectedSam = "CA:B:C,34,5,125\tHX:H:1abc75\tXY:i:-42";
+
+    TagCollection inputTags;
+    inputTags["HX"] = std::string("1abc75");
+    inputTags["HX"].Modifier(TagModifier::HEX_STRING);
+    inputTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    inputTags["XY"] = int32_t{-42};
+
+    BamRecordImpl record;
+    record.Name(name);
+    record.SetSequenceAndQualities(bases, quals);
+    record.CigarData(shortCigar);
+    record.Tags(inputTags);
+    record.CigarData(longCigar);  // the overwrite under test
+
+    // The new CIGAR is visible and the other fields are unchanged.
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(bases, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+    EXPECT_EQ(longCigar, record.CigarData().ToStdString());
+    EXPECT_EQ(expectedSam, SamTagCodec::Encode(record.Tags()));
+
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_ThenOverwriteWithShorterCigar)
+{
+    // Scenario: all fields are populated, then the CIGAR is replaced by a shorter one.
+    const std::string name = "foo";
+    const std::string bases = "ACGTACGTACGT";
+    const std::string quals = "?]?]?]?]?]?]";
+    const std::string shortCigar = "100=";
+    const std::string longCigar = "100=10D100=10I100X";
+    const std::string expectedSam = "CA:B:C,34,5,125\tHX:H:1abc75\tXY:i:-42";
+
+    TagCollection inputTags;
+    inputTags["HX"] = std::string("1abc75");
+    inputTags["HX"].Modifier(TagModifier::HEX_STRING);
+    inputTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    inputTags["XY"] = int32_t{-42};
+
+    BamRecordImpl record;
+    record.Name(name);
+    record.SetSequenceAndQualities(bases, quals);
+    record.CigarData(longCigar);
+    record.Tags(inputTags);
+    record.CigarData(shortCigar);  // the overwrite under test
+
+    // The new CIGAR is visible and the other fields are unchanged.
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(bases, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+    EXPECT_EQ(shortCigar, record.CigarData().ToStdString());
+    EXPECT_EQ(expectedSam, SamTagCodec::Encode(record.Tags()));
+
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_ThenOverwriteWithEmptyCigar)
+{
+    // Scenario: all fields are populated, then the CIGAR is replaced by an empty string.
+    const std::string name = "foo";
+    const std::string bases = "ACGTACGTACGT";
+    const std::string quals = "?]?]?]?]?]?]";
+    const std::string initialCigar = "100=";
+    const std::string noCigar = "";
+    const std::string expectedSam = "CA:B:C,34,5,125\tHX:H:1abc75\tXY:i:-42";
+
+    TagCollection inputTags;
+    inputTags["HX"] = std::string("1abc75");
+    inputTags["HX"].Modifier(TagModifier::HEX_STRING);
+    inputTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    inputTags["XY"] = int32_t{-42};
+
+    BamRecordImpl record;
+    record.Name(name);
+    record.SetSequenceAndQualities(bases, quals);
+    record.CigarData(initialCigar);
+    record.Tags(inputTags);
+    record.CigarData(noCigar);  // the overwrite under test
+
+    // The CIGAR reads back empty and the other fields are unchanged.
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(bases, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+    EXPECT_EQ(noCigar, record.CigarData().ToStdString());
+    EXPECT_EQ(expectedSam, SamTagCodec::Encode(record.Tags()));
+
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_ThenOverwriteWithLongerTags)
+{
+    // Scenario: a two-entry tag collection is stored first and later replaced
+    // by a three-entry collection.
+    const std::string name = "foo";
+    const std::string bases = "ACGTACGTACGT";
+    const std::string quals = "?]?]?]?]?]?]";
+    const std::string cigarText = "100=";
+    const std::string expectedSam = "CA:B:C,34,5,125\tHX:H:1abc75\tXY:i:-42";
+
+    TagCollection threeTags;
+    threeTags["HX"] = std::string("1abc75");
+    threeTags["HX"].Modifier(TagModifier::HEX_STRING);
+    threeTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    threeTags["XY"] = int32_t{-42};
+
+    TagCollection twoTags;
+    twoTags["HX"] = std::string("1abc75");
+    twoTags["HX"].Modifier(TagModifier::HEX_STRING);
+    twoTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+
+    BamRecordImpl record;
+    record.Name(name);
+    record.SetSequenceAndQualities(bases, quals);
+    record.CigarData(cigarText);
+    record.Tags(twoTags);
+    record.Tags(threeTags);  // the overwrite under test
+
+    // All three tags are encoded and the other fields are unchanged.
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(bases, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+    EXPECT_EQ(expectedSam, SamTagCodec::Encode(record.Tags()));
+
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_ThenOverwriteWithShorterTags)
+{
+    // Scenario: a three-entry tag collection is stored first and later replaced
+    // by a two-entry collection.
+    const std::string name = "foo";
+    const std::string bases = "ACGTACGTACGT";
+    const std::string quals = "?]?]?]?]?]?]";
+    const std::string cigarText = "100=";
+    const std::string expectedSam = "CA:B:C,34,5,125\tHX:H:1abc75";
+
+    TagCollection threeTags;
+    threeTags["HX"] = std::string("1abc75");
+    threeTags["HX"].Modifier(TagModifier::HEX_STRING);
+    threeTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    threeTags["XY"] = int32_t{-42};
+
+    TagCollection twoTags;
+    twoTags["HX"] = std::string("1abc75");
+    twoTags["HX"].Modifier(TagModifier::HEX_STRING);
+    twoTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+
+    BamRecordImpl record;
+    record.Name(name);
+    record.SetSequenceAndQualities(bases, quals);
+    record.CigarData(cigarText);
+    record.Tags(threeTags);
+    record.Tags(twoTags);  // the overwrite under test
+
+    // Only the two remaining tags are encoded; the other fields are unchanged.
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(bases, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+    EXPECT_EQ(expectedSam, SamTagCodec::Encode(record.Tags()));
+
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_ThenOverwriteWithEmptyTags)
+{
+    // Scenario: all fields are populated, then the tags are replaced by an
+    // empty collection.
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.Tags(TagCollection());  // the overwrite under test
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+
+    // Unsigned literal: avoids a signed/unsigned comparison against size().
+    EXPECT_EQ(0u, bam.Tags().size());
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
diff --git a/tests/src/test_BamRecordMapping.cpp b/tests/src/test_BamRecordMapping.cpp

new file mode 100644 (file)

index 0000000..5a7b842
--- /dev/null
+++ b/tests/src/test_BamRecordMapping.cpp
@@ -0,0 +1,702 @@
+// Author: Derek Barnett
+
+#include <chrono>
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include <pbbam/BamRecord.h>
+#include <pbbam/BamRecordView.h>
+#include <pbbam/BamTagCodec.h>
+
+// clang-format off
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+using f_data = std::vector<uint16_t>;
+
+namespace BamRecordMappingTests {
+
+// Builds a BamRecord holding the given sequence/qualities plus a fixed set of tags:
+//   qs / qe                 <- qStart / qEnd
+//   ip, pw                  <- frames
+//   dt, st                  <- tagBases
+//   dq, iq, mq, sq, pq, pv  <- tagQuals
+static BamRecord MakeRecord(const Position qStart,
+                            const Position qEnd,
+                            const std::string& seq,
+                            const std::string& quals,
+                            const std::string& tagBases,
+                            const std::string& tagQuals,
+                            const f_data& frames)
+{
+    TagCollection recordTags;
+    recordTags["qs"] = qStart;
+    recordTags["qe"] = qEnd;
+    for (const char* frameTag : {"ip", "pw"})
+        recordTags[frameTag] = frames;
+    for (const char* baseTag : {"dt", "st"})
+        recordTags[baseTag] = tagBases;
+    for (const char* qualTag : {"dq", "iq", "mq", "sq", "pq", "pv"})
+        recordTags[qualTag] = tagQuals;
+
+    BamRecordImpl rawRecord;
+    rawRecord.SetSequenceAndQualities(seq, quals);
+    rawRecord.Tags(recordTags);
+
+    return BamRecord(std::move(rawRecord));
+}
+
+} // namespace BamRecordMappingTests
+
+TEST(BamRecordMappingTest, BasicMap)
+{
+    // Maps identical records with three different CIGARs onto both strands and
+    // verifies, for each combination, the mapping coordinates and the per-base
+    // data exposed through BamRecordView. Forward records are checked in NATIVE
+    // orientation only; reverse records are checked in NATIVE and GENOMIC.
+    const Position qStart = 500;
+    const Position qEnd   = 510;
+    const uint8_t mapQual = 80;
+
+    // Per-base data expected from a view in one particular orientation.
+    struct PerBaseData
+    {
+        std::string seq;
+        std::string quals;
+        std::string tagBases;
+        std::string tagQuals;
+        f_data frames;
+    };
+
+    // Data exactly as stored in the record.
+    const PerBaseData native{
+        "AACCGTTAGC", "?]?]?]?]?*", "AACCGTTAGC", "?]?]?]?]?*",
+        { 10, 10, 20, 20, 30, 40, 40, 10, 30, 20 }
+    };
+
+    // Data expected from a GENOMIC view of a reverse-strand record.
+    const PerBaseData genomic{
+        "GCTAACGGTT", "*?]?]?]?]?", "GCTAACGGTT", "*?]?]?]?]?",
+        { 20, 30, 10, 40, 40, 30, 20, 20, 10, 10 }
+    };
+
+    // One alignment scenario: the CIGAR to map with and the reference end it yields.
+    struct MappingCase
+    {
+        std::string cigar;
+        Position refEnd;
+    };
+    const std::vector<MappingCase> cases = {
+        { "10=",        110 },   // 100 + 10=
+        { "5=3D5=",     113 },   // 100 + 10= + 3D
+        { "4=1D2I2D4=", 111 },   // 100 + 8= + 3D
+    };
+
+    // Compares every per-base accessor of 'view' against 'expected'.
+    const auto expectView = [](const BamRecordView& view, const PerBaseData& expected)
+    {
+        EXPECT_EQ(expected.seq,      view.Sequence());
+        EXPECT_EQ(expected.quals,    view.Qualities().Fastq());
+        EXPECT_EQ(expected.tagBases, view.DeletionTags());
+        EXPECT_EQ(expected.tagQuals, view.DeletionQVs().Fastq());
+        EXPECT_EQ(expected.tagQuals, view.LabelQVs().Fastq());
+        EXPECT_EQ(expected.tagQuals, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(expected.frames,   view.IPD().Data());
+    };
+
+    for (const auto& testCase : cases)
+    {
+        for (const Strand strand : { Strand::FORWARD, Strand::REVERSE })
+        {
+            const bool isReverse = (strand == Strand::REVERSE);
+
+            // Label any failure below with the CIGAR/strand combination it belongs to.
+            SCOPED_TRACE(testCase.cigar + (isReverse ? " - REVERSE" : " - FORWARD"));
+
+            BamRecord record = BamRecordMappingTests::MakeRecord(qStart, qEnd, native.seq, native.quals,
+                                                                 native.tagBases, native.tagQuals, native.frames);
+            record.Map(0, 100, strand, testCase.cigar, mapQual);
+
+            EXPECT_TRUE(record.IsMapped());
+            EXPECT_EQ(0, record.ReferenceId());
+            EXPECT_EQ(strand, record.AlignedStrand());
+            EXPECT_EQ(mapQual, record.MapQuality());
+
+            EXPECT_EQ(qStart, record.QueryStart());
+            EXPECT_EQ(qEnd,   record.QueryEnd());
+            EXPECT_EQ(500, record.AlignedStart());
+            EXPECT_EQ(510, record.AlignedEnd());      // every CIGAR above consumes 10 query bases
+            EXPECT_EQ(100, record.ReferenceStart());
+            EXPECT_EQ(testCase.refEnd, record.ReferenceEnd());
+
+            // - native
+            const BamRecordView nativeView
+            {
+                record,
+                Orientation::NATIVE,
+                false,
+                false,
+                PulseBehavior::ALL
+            };
+            expectView(nativeView, native);
+
+            // - genomic (differs from native only for reverse-strand records)
+            if (isReverse)
+            {
+                const BamRecordView genomicView
+                {
+                    record,
+                    Orientation::GENOMIC,
+                    false,
+                    false,
+                    PulseBehavior::ALL
+                };
+                expectView(genomicView, genomic);
+            }
+        }
+    }
+}
+
+TEST(BamRecordMappingTest, SoftClipMapping)
+{
+    const Position qStart = 500;
+    const Position qEnd   = 515;
+    const std::string seq      = "TTAACCGTTAGCAAA";
+    const std::string quals    = "--?]?]?]?]?*+++";
+    const std::string tagBases = "TTAACCGTTAGCAAA";
+    const std::string tagQuals = "--?]?]?]?]?*+++";
+    const f_data frames   = { 40, 40, 10, 10, 20, 20, 30, 40, 40, 10, 30, 20, 10, 10, 10 };
+    const uint8_t mapQual = 80;
+
+    const std::string seq_rev   = "TTTGCTAACGGTTAA";
+    const std::string quals_rev = "+++*?]?]?]?]?--";
+    const std::string tagBases_rev = seq_rev;
+    const std::string tagQuals_rev = quals_rev;
+    const f_data frames_rev = { 10, 10, 10, 20, 30, 10, 40, 40, 30, 20, 20, 10, 10, 40, 40 };
+
+    const std::string s1_cigar = "2S10=3S";
+    const std::string s2_cigar = "2S5=3D5=3S";
+    const std::string s3_cigar = "2S4=1D2I2D4=3S";
+
+    BamRecord s1 = BamRecordMappingTests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames);
+    BamRecord s2 = BamRecordMappingTests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames);
+    BamRecord s3 = BamRecordMappingTests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames);
+    BamRecord s1_rev = BamRecordMappingTests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames);
+    BamRecord s2_rev = BamRecordMappingTests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames);
+    BamRecord s3_rev = BamRecordMappingTests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames);
+
+    s1.Map(0, 100, Strand::FORWARD, s1_cigar, mapQual);
+    s2.Map(0, 100, Strand::FORWARD, s2_cigar, mapQual);
+    s3.Map(0, 100, Strand::FORWARD, s3_cigar, mapQual);
+    s1_rev.Map(0, 100, Strand::REVERSE, s1_cigar, mapQual);
+    s2_rev.Map(0, 100, Strand::REVERSE, s2_cigar, mapQual);
+    s3_rev.Map(0, 100, Strand::REVERSE, s3_cigar, mapQual);
+
+    {   // s1 - FORWARD
+
+        EXPECT_TRUE(s1.IsMapped());
+        EXPECT_EQ(0, s1.ReferenceId());
+        EXPECT_EQ(Strand::FORWARD, s1.AlignedStrand());
+        EXPECT_EQ(mapQual, s1.MapQuality());
+
+        EXPECT_EQ(qStart, s1.QueryStart());      // 500
+        EXPECT_EQ(qEnd,   s1.QueryEnd());        // QStart + seqLength
+        EXPECT_EQ(502, s1.AlignedStart());       // QStart + 2S
+        EXPECT_EQ(512, s1.AlignedEnd());         // AStart + 10=
+        EXPECT_EQ(100, s1.ReferenceStart());     // 100
+        EXPECT_EQ(110, s1.ReferenceEnd());       // RefStart + 10=
+
+        const BamRecordView view
+        {
+            s1,
+            Orientation::NATIVE,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(seq,      view.Sequence());
+        EXPECT_EQ(quals,    view.Qualities().Fastq());
+        EXPECT_EQ(tagBases, view.DeletionTags());
+        EXPECT_EQ(tagQuals, view.DeletionQVs().Fastq());
+        EXPECT_EQ(tagQuals, view.LabelQVs().Fastq());
+        EXPECT_EQ(tagQuals, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames,   view.IPD().Data());
+    }
+
+    {   // s1 - REVERSE
+
+        EXPECT_TRUE(s1_rev.IsMapped());
+        EXPECT_EQ(0, s1_rev.ReferenceId());
+        EXPECT_EQ(Strand::REVERSE, s1_rev.AlignedStrand());
+        EXPECT_EQ(mapQual, s1_rev.MapQuality());
+
+        EXPECT_EQ(qStart, s1_rev.QueryStart());      // 500
+        EXPECT_EQ(qEnd,   s1_rev.QueryEnd());        // QStart + seqLength
+        EXPECT_EQ(503, s1_rev.AlignedStart());       // QStart + 3S
+        EXPECT_EQ(513, s1_rev.AlignedEnd());         // AStart + 10=
+        EXPECT_EQ(100, s1_rev.ReferenceStart());     // 100
+        EXPECT_EQ(110, s1_rev.ReferenceEnd());       // RefStart + 10=
+
+        // - native
+        const BamRecordView nativeView
+        {
+            s1_rev,
+            Orientation::NATIVE,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+        EXPECT_EQ(seq,      nativeView.Sequence());
+        EXPECT_EQ(quals,    nativeView.Qualities().Fastq());
+        EXPECT_EQ(tagBases, nativeView.DeletionTags());
+        EXPECT_EQ(tagQuals, nativeView.DeletionQVs().Fastq());
+        EXPECT_EQ(tagQuals, nativeView.LabelQVs().Fastq());
+        EXPECT_EQ(tagQuals, nativeView.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames,   nativeView.IPD().Data());
+
+        // - genomic
+        const BamRecordView genomicView
+        {
+            s1_rev,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+        EXPECT_EQ(seq_rev,      genomicView.Sequence());
+        EXPECT_EQ(quals_rev,    genomicView.Qualities().Fastq());
+        EXPECT_EQ(tagBases_rev, genomicView.DeletionTags());
+        EXPECT_EQ(tagQuals_rev, genomicView.DeletionQVs().Fastq());
+        EXPECT_EQ(tagQuals_rev, genomicView.LabelQVs().Fastq());
+        EXPECT_EQ(tagQuals_rev, genomicView.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames_rev,   genomicView.IPD().Data());
+    }
+
+    {   // s2 - FORWARD
+
+        EXPECT_TRUE(s2.IsMapped());
+        EXPECT_EQ(0, s2.ReferenceId());
+        EXPECT_EQ(Strand::FORWARD, s2.AlignedStrand());
+        EXPECT_EQ(mapQual, s2.MapQuality());
+
+        EXPECT_EQ(qStart, s2.QueryStart());      // 500
+        EXPECT_EQ(qEnd,   s2.QueryEnd());        // QStart + seqLength
+        EXPECT_EQ(502, s2.AlignedStart());       // QStart + 2S
+        EXPECT_EQ(512, s2.AlignedEnd());         // AStart + 10=
+        EXPECT_EQ(100, s2.ReferenceStart());     // 100
+        EXPECT_EQ(113, s2.ReferenceEnd());       // RefStart + 10= + 3D
+
+        const BamRecordView view
+        {
+            s2,
+            Orientation::NATIVE,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(seq,      view.Sequence());
+        EXPECT_EQ(quals,    view.Qualities().Fastq());
+        EXPECT_EQ(tagBases, view.DeletionTags());
+        EXPECT_EQ(tagQuals, view.DeletionQVs().Fastq());
+        EXPECT_EQ(tagQuals, view.LabelQVs().Fastq());
+        EXPECT_EQ(tagQuals, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames,   view.IPD().Data());
+    }
+
+    {   // s2 - REVERSE
+
+        EXPECT_TRUE(s2_rev.IsMapped());
+        EXPECT_EQ(0, s2_rev.ReferenceId());
+        EXPECT_EQ(Strand::REVERSE, s2_rev.AlignedStrand());
+        EXPECT_EQ(mapQual, s2_rev.MapQuality());
+
+        EXPECT_EQ(qStart, s2_rev.QueryStart());      // 500
+        EXPECT_EQ(qEnd,   s2_rev.QueryEnd());        // QStart + seqLength
+        EXPECT_EQ(503, s2_rev.AlignedStart());       // QStart + 3S
+        EXPECT_EQ(513, s2_rev.AlignedEnd());         // AStart + 10=
+        EXPECT_EQ(100, s2_rev.ReferenceStart());     // 100
+        EXPECT_EQ(113, s2_rev.ReferenceEnd());       // RefStart + 10= + 3D
+
+        // - native
+        const BamRecordView nativeView
+        {
+            s2_rev,
+            Orientation::NATIVE,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+        EXPECT_EQ(seq,      nativeView.Sequence());
+        EXPECT_EQ(quals,    nativeView.Qualities().Fastq());
+        EXPECT_EQ(tagBases, nativeView.DeletionTags());
+        EXPECT_EQ(tagQuals, nativeView.DeletionQVs().Fastq());
+        EXPECT_EQ(tagQuals, nativeView.LabelQVs().Fastq());
+        EXPECT_EQ(tagQuals, nativeView.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames,   nativeView.IPD().Data());
+
+        // - genomic
+        const BamRecordView genomicView
+        {
+            s2_rev,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+        EXPECT_EQ(seq_rev,      genomicView.Sequence());
+        EXPECT_EQ(quals_rev,    genomicView.Qualities().Fastq());
+        EXPECT_EQ(tagBases_rev, genomicView.DeletionTags());
+        EXPECT_EQ(tagQuals_rev, genomicView.DeletionQVs().Fastq());
+        EXPECT_EQ(tagQuals_rev, genomicView.LabelQVs().Fastq());
+        EXPECT_EQ(tagQuals_rev, genomicView.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames_rev,   genomicView.IPD().Data());
+    }
+
+    {   // s3 - FORWARD
+        // s3 was mapped above with CIGAR "2S4=1D2I2D4=3S" on the forward strand
+        EXPECT_TRUE(s3.IsMapped());
+        EXPECT_EQ(0, s3.ReferenceId());
+        EXPECT_EQ(Strand::FORWARD, s3.AlignedStrand());
+        EXPECT_EQ(mapQual, s3.MapQuality());
+
+        EXPECT_EQ(qStart, s3.QueryStart());      // 500
+        EXPECT_EQ(qEnd,   s3.QueryEnd());        // QStart + seqLength
+        EXPECT_EQ(502, s3.AlignedStart());       // QStart + 2S
+        EXPECT_EQ(512, s3.AlignedEnd());         // AStart + 8= + 2I
+        EXPECT_EQ(100, s3.ReferenceStart());     // 100
+        EXPECT_EQ(111, s3.ReferenceEnd());       // RefStart + 8= + 3D
+
+        const BamRecordView view
+        {
+            s3,                                  // view the record under test (s3, not s2)
+            Orientation::NATIVE,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+        // native orientation: per-base data is expected to equal the original inputs
+        EXPECT_EQ(seq,      view.Sequence());
+        EXPECT_EQ(quals,    view.Qualities().Fastq());
+        EXPECT_EQ(tagBases, view.DeletionTags());
+        EXPECT_EQ(tagQuals, view.DeletionQVs().Fastq());
+        EXPECT_EQ(tagQuals, view.LabelQVs().Fastq());
+        EXPECT_EQ(tagQuals, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames,   view.IPD().Data());
+    }
+
+    {   // s3 - REVERSE
+
+        EXPECT_TRUE(s3_rev.IsMapped());
+        EXPECT_EQ(0, s3_rev.ReferenceId());
+        EXPECT_EQ(Strand::REVERSE, s3_rev.AlignedStrand());
+        EXPECT_EQ(mapQual, s3_rev.MapQuality());
+
+        EXPECT_EQ(qStart, s3_rev.QueryStart());      // 500
+        EXPECT_EQ(qEnd,   s3_rev.QueryEnd());        // QStart + seqLength
+        EXPECT_EQ(503, s3_rev.AlignedStart());       // QStart + 3S
+        EXPECT_EQ(513, s3_rev.AlignedEnd());         // AStart + 8= + 2I
+        EXPECT_EQ(100, s3_rev.ReferenceStart());     // 100
+        EXPECT_EQ(111, s3_rev.ReferenceEnd());       // RefStart + 8= + 3D
+
+        // - native
+        const BamRecordView nativeView
+        {
+            s3_rev,
+            Orientation::NATIVE,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+        EXPECT_EQ(seq,      nativeView.Sequence());
+        EXPECT_EQ(quals,    nativeView.Qualities().Fastq());
+        EXPECT_EQ(tagBases, nativeView.DeletionTags());
+        EXPECT_EQ(tagQuals, nativeView.DeletionQVs().Fastq());
+        EXPECT_EQ(tagQuals, nativeView.LabelQVs().Fastq());
+        EXPECT_EQ(tagQuals, nativeView.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames,   nativeView.IPD().Data());
+
+        // - genomic
+        const BamRecordView genomicView
+        {
+            s3_rev,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+        EXPECT_EQ(seq_rev,      genomicView.Sequence());
+        EXPECT_EQ(quals_rev,    genomicView.Qualities().Fastq());
+        EXPECT_EQ(tagBases_rev, genomicView.DeletionTags());
+        EXPECT_EQ(tagQuals_rev, genomicView.DeletionQVs().Fastq());
+        EXPECT_EQ(tagQuals_rev, genomicView.LabelQVs().Fastq());
+        EXPECT_EQ(tagQuals_rev, genomicView.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames_rev,   genomicView.IPD().Data());
+    }
+}
+
+TEST(BamRecordMappingTest, MappedCopy)
+{
+    const Position qStart = 500;
+    const Position qEnd   = 510;
+    const std::string seq      = "AACCGTTAGC";
+    const std::string quals    = "?]?]?]?]?*";
+    const std::string tagBases = "AACCGTTAGC";
+    const std::string tagQuals = "?]?]?]?]?*";
+    const f_data frames   = { 10, 10, 20, 20, 30, 40, 40, 10, 30, 20 };
+    const uint8_t mapQual = 80;
+    const std::string cigar    = "4=1D2I2D4=";
+
+    const BamRecord orig = BamRecordMappingTests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames);
+    const BamRecord mapped = orig.Mapped(0, 100, Strand::FORWARD, cigar, mapQual);
+
+    EXPECT_TRUE(mapped.IsMapped());
+    EXPECT_EQ(0, mapped.ReferenceId());
+    EXPECT_EQ(Strand::FORWARD, mapped.AlignedStrand());
+    EXPECT_EQ(mapQual, mapped.MapQuality());
+
+    EXPECT_EQ(500, mapped.QueryStart());      // 500
+    EXPECT_EQ(510, mapped.QueryEnd());        // QStart + seqLength
+    EXPECT_EQ(500, mapped.AlignedStart());    // QStart
+    EXPECT_EQ(510, mapped.AlignedEnd());      // QStart + 8= + 2I
+    EXPECT_EQ(100, mapped.ReferenceStart());  // 100
+    EXPECT_EQ(111, mapped.ReferenceEnd());    // RefStart + 8= + 3D
+
+    const BamRecordView view
+    {
+        mapped,
+        Orientation::NATIVE,
+        false,
+        false,
+        PulseBehavior::ALL
+    };
+
+    EXPECT_EQ(seq,      view.Sequence());
+    EXPECT_EQ(quals,    view.Qualities().Fastq());
+    EXPECT_EQ(tagBases, view.DeletionTags());
+    EXPECT_EQ(tagQuals, view.DeletionQVs().Fastq());
+    EXPECT_EQ(tagQuals, view.LabelQVs().Fastq());
+    EXPECT_EQ(tagQuals, view.AltLabelQVs().Fastq());
+    EXPECT_EQ(frames,   view.IPD().Data());
+}
+
+TEST(BamRecordMappingTest, StaticMapped)
+{
+    const Position qStart = 500;
+    const Position qEnd   = 510;
+    const std::string seq      = "AACCGTTAGC";
+    const std::string quals    = "?]?]?]?]?*";
+    const std::string tagBases = "AACCGTTAGC";
+    const std::string tagQuals = "?]?]?]?]?*";
+    const f_data frames   = { 10, 10, 20, 20, 30, 40, 40, 10, 30, 20 };
+    const uint8_t mapQual = 80;
+    const std::string cigar    = "4=1D2I2D4=";
+
+    const BamRecord orig = BamRecordMappingTests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames);
+    const BamRecord mapped = BamRecord::Mapped(orig, 0, 100, Strand::FORWARD, cigar, mapQual);
+
+    EXPECT_TRUE(mapped.IsMapped());
+    EXPECT_EQ(0, mapped.ReferenceId());
+    EXPECT_EQ(Strand::FORWARD, mapped.AlignedStrand());
+    EXPECT_EQ(mapQual, mapped.MapQuality());
+
+    EXPECT_EQ(500, mapped.QueryStart());      // 500
+    EXPECT_EQ(510, mapped.QueryEnd());        // QStart + seqLength
+    EXPECT_EQ(500, mapped.AlignedStart());    // QStart
+    EXPECT_EQ(510, mapped.AlignedEnd());      // QStart + 8= + 2I
+    EXPECT_EQ(100, mapped.ReferenceStart());  // 100
+    EXPECT_EQ(111, mapped.ReferenceEnd());    // RefStart + 8= + 3D
+
+    const BamRecordView view
+    {
+        mapped,
+        Orientation::NATIVE,
+        false,
+        false,
+        PulseBehavior::ALL
+    };
+
+    EXPECT_EQ(seq,      view.Sequence());
+    EXPECT_EQ(quals,    view.Qualities().Fastq());
+    EXPECT_EQ(tagBases, view.DeletionTags());
+    EXPECT_EQ(tagQuals, view.DeletionQVs().Fastq());
+    EXPECT_EQ(tagQuals, view.LabelQVs().Fastq());
+    EXPECT_EQ(tagQuals, view.AltLabelQVs().Fastq());
+    EXPECT_EQ(frames,   view.IPD().Data());
+}
+
+// clang-format on
diff --git a/tests/src/test_BamWriter.cpp b/tests/src/test_BamWriter.cpp

new file mode 100644 (file)

index 0000000..a5ac803
--- /dev/null
+++ b/tests/src/test_BamWriter.cpp
@@ -0,0 +1,113 @@
+// Author: Derek Barnett
+
+#include <gtest/gtest.h>
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include "PbbamTestData.h"
+
+#include <pbbam/BamHeader.h>
+#include <pbbam/BamRecord.h>
+#include <pbbam/BamWriter.h>
+#include <pbbam/EntireFileQuery.h>
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+// clang-format off
+
+namespace BamWriterTests {
+
+void checkSingleRecord(bool useTempFile)
+{
+    // Round-trips one unmapped record through BamWriter, then verifies the header and record.
+    const std::string fullName = "test/100/0_5";
+    const std::string rgId = "6002b307";
+    const std::vector<float> expectedSnr = {0.2, 0.2, 0.2, 0.2};
+
+    // setup header
+    const std::string hdrText = {
+        "@HD\tVN:1.1\tSO:unknown\tpb:3.0.1\n"
+        "@RG\tID:6002b307\tPL:PACBIO\tDS:READTYPE=SUBREAD;BINDINGKIT=100-619-300;"
+        "SEQUENCINGKIT=100-619-400;BASECALLERVERSION=3.0;FRAMERATEHZ=100\t"
+        "PU:test\tPM:SEQUEL\n"};
+    BamHeader inputHeader(hdrText);
+
+    // setup record
+    BamRecord bamRecord(inputHeader);
+    bamRecord.Impl().Name(fullName);
+    bamRecord.Impl().SetSequenceAndQualities("ACGTC", 5);
+    bamRecord.Impl().CigarData("");
+    bamRecord.Impl().Bin(0);
+    bamRecord.Impl().Flag(0);
+    bamRecord.Impl().InsertSize(0);
+    bamRecord.Impl().MapQuality(0);
+    bamRecord.Impl().MatePosition(-1);
+    bamRecord.Impl().MateReferenceId(-1);
+    bamRecord.Impl().Position(-1);
+    bamRecord.Impl().ReferenceId(-1);
+    bamRecord.Impl().SetMapped(false);
+    TagCollection tags;
+    tags["zm"] = int32_t{100};
+    tags["qs"] = int32_t{0};
+    tags["qe"] = int32_t{5};
+    tags["np"] = int32_t{1};
+    tags["rq"] = static_cast<float>(0.6);
+    tags["RG"] = rgId;
+    tags["sn"] = expectedSnr;
+    bamRecord.Impl().Tags(tags);
+
+    // write record to file
+    const std::string generatedBamFn =
+        PbbamTestsConfig::GeneratedData_Dir + "/bamwriter_generated.bam";
+    {   // writer is destroyed at the end of this scope, before the file is read back
+        BamWriter::Config config;
+        config.useTempFile = useTempFile;
+        BamWriter writer(generatedBamFn, inputHeader, config);
+        writer.Write(bamRecord);
+    }
+
+    // check written header
+    BamFile file(generatedBamFn);
+    const auto header = file.Header();
+    EXPECT_EQ(std::string("1.1"), header.Version());
+    EXPECT_EQ(std::string("unknown"), header.SortOrder());
+    EXPECT_EQ(std::string("3.0.1"), header.PacBioBamVersion());
+
+    // check written record (fail fatally instead of dereferencing an end iterator)
+    EntireFileQuery entireFile(file);
+    auto firstIter = entireFile.begin();
+    ASSERT_TRUE(firstIter != entireFile.end()) << "no record found in " << generatedBamFn;
+    auto record = *firstIter;
+    EXPECT_EQ(std::string("ACGTC"), record.Sequence());
+    EXPECT_EQ(std::string("test/100/0_5"), record.FullName());
+    EXPECT_TRUE(record.HasHoleNumber());
+    EXPECT_TRUE(record.HasNumPasses());
+    EXPECT_TRUE(record.HasQueryEnd());
+    EXPECT_TRUE(record.HasQueryStart());
+    EXPECT_TRUE(record.HasReadAccuracy());
+    EXPECT_TRUE(record.HasSignalToNoise());
+    EXPECT_EQ(100, record.HoleNumber());
+    EXPECT_EQ(1, record.NumPasses());
+    EXPECT_EQ(0, record.QueryStart());
+    EXPECT_EQ(5, record.QueryEnd());
+    EXPECT_EQ(expectedSnr, record.SignalToNoise());
+    EXPECT_EQ(rgId, record.ReadGroupId());
+    // clean up (skipped only when the fatal assertion above fires)
+    remove(generatedBamFn.c_str());
+}
+
+} // namespace BamWriterTests
+
+TEST(BamWriterTest, SingleWrite_UserRecord_WithTempFile)
+{
+    BamWriterTests::checkSingleRecord(true);
+}
+
+TEST(BamWriterTest, SingleWrite_UserRecord_NoTempFile)
+{
+    BamWriterTests::checkSingleRecord(false);
+}
+
+// clang-format on
diff --git a/tests/src/test_BarcodeQuery.cpp b/tests/src/test_BarcodeQuery.cpp

new file mode 100644 (file)

index 0000000..1827ccf
--- /dev/null
+++ b/tests/src/test_BarcodeQuery.cpp
@@ -0,0 +1,17 @@
+// Author: Derek Barnett
+
+#include <string>
+
+#include <gtest/gtest.h>
+
+#include "PbbamTestData.h"
+
+#include <pbbam/BarcodeQuery.h>
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+TEST(BarcodeQueryTest, QueryOk)
+{
+    // come back with barcoded data
+}
diff --git a/tests/src/test_BedReader.cpp b/tests/src/test_BedReader.cpp

new file mode 100644 (file)

index 0000000..3495c81
--- /dev/null
+++ b/tests/src/test_BedReader.cpp
@@ -0,0 +1,125 @@
+// Author: Derek Barnett
+
+#include <string>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include <pbbam/GenomicInterval.h>
+#include <pbbam/bed/BedReader.h>
+
+#include "PbbamTestData.h"
+
+using BedReader = PacBio::BAM::BedReader;
+using GenomicInterval = PacBio::BAM::GenomicInterval;
+
+// clang-format off
+
+namespace BedReaderTests {
+
+const std::string BedFn = PacBio::BAM::PbbamTestsConfig::Data_Dir + "/bed/test.bed";
+const std::string GzipBedFn = PacBio::BAM::PbbamTestsConfig::Data_Dir + "/bed/test.bed.gz";
+
+const std::vector<GenomicInterval> ExpectedIntervals {
+    {"chr1", 213941196, 213942363},
+    {"chr1", 213942363, 213943530},
+    {"chr1", 213943530, 213944697},
+    {"chr2", 158364697, 158365864},
+    {"chr2", 158365864, 158367031},
+    {"chr3", 127477031, 127478198},
+    {"chr3", 127478198, 127479365},
+    {"chr3", 127479365, 127480532},
+    {"chr3", 127480532, 127481699}
+};
+
+}  // namespace BedReaderTests
+
+TEST(BedReaderTest, throws_on_empty_filename)
+{
+    EXPECT_THROW(BedReader reader{""}, std::runtime_error);
+}
+
+TEST(BedReaderTest, throws_on_invalid_extension)
+{
+    EXPECT_THROW(BedReader reader{"wrong.ext"}, std::runtime_error);
+}
+
+TEST(BedReaderTest, can_iterate_manually_on_text_bed)
+{
+    const auto& fn = BedReaderTests::BedFn;
+
+    size_t count = 0;
+    BedReader reader{fn};
+    GenomicInterval interval;
+    while (reader.GetNext(interval)) {
+        EXPECT_EQ(BedReaderTests::ExpectedIntervals.at(count), interval);
+        ++count;
+    }
+    EXPECT_EQ(BedReaderTests::ExpectedIntervals.size(), count);
+}
+
+TEST(BedReaderTest, can_iterate_manually_on_gzip_bed)
+{
+    const auto& fn = BedReaderTests::GzipBedFn;
+
+    size_t count = 0;
+    BedReader reader{fn};
+    GenomicInterval interval;
+    while (reader.GetNext(interval)) {
+        EXPECT_EQ(BedReaderTests::ExpectedIntervals.at(count), interval);
+        ++count;
+    }
+    EXPECT_EQ(BedReaderTests::ExpectedIntervals.size(), count);
+}
+
+TEST(BedReaderTest, can_iterate_using_range_for_on_text_bed)
+{
+    const auto& fn = BedReaderTests::BedFn;
+
+    size_t count = 0;
+    BedReader reader{fn};
+    for (const auto& interval : reader) {
+        EXPECT_EQ(BedReaderTests::ExpectedIntervals.at(count), interval);
+        ++count;
+    }
+    EXPECT_EQ(BedReaderTests::ExpectedIntervals.size(), count);
+}
+
+TEST(BedReaderTest, can_iterate_using_range_for_on_gzip_bed)
+{
+    const auto& fn = BedReaderTests::GzipBedFn;
+
+    size_t count = 0;
+    BedReader reader{fn};
+    for (const auto& interval : reader) {
+        EXPECT_EQ(BedReaderTests::ExpectedIntervals.at(count), interval);
+        ++count;
+    }
+    EXPECT_EQ(BedReaderTests::ExpectedIntervals.size(), count);
+}
+
+TEST(BedReaderTest, BedReaderTest_can_read_all_from_text_bed)
+{
+    const auto& fn = BedReaderTests::BedFn;
+
+    size_t count = 0;
+    for (const auto& interval : BedReader::ReadAll(fn)) {
+        EXPECT_EQ(BedReaderTests::ExpectedIntervals.at(count), interval);
+        ++count;
+    }
+    EXPECT_EQ(BedReaderTests::ExpectedIntervals.size(), count);
+}
+
+TEST(BedReaderTest, BedReaderTest_can_read_all_from_gzip_bed)
+{
+    const auto& fn = BedReaderTests::GzipBedFn;
+
+    size_t count = 0;
+    for (const auto& interval : BedReader::ReadAll(fn)) {
+        EXPECT_EQ(BedReaderTests::ExpectedIntervals.at(count), interval);
+        ++count;
+    }
+    EXPECT_EQ(BedReaderTests::ExpectedIntervals.size(), count);
+}
+
+// clang-format on
diff --git a/tests/src/test_BedWriter.cpp b/tests/src/test_BedWriter.cpp

new file mode 100644 (file)

index 0000000..3553b5e
--- /dev/null
+++ b/tests/src/test_BedWriter.cpp
@@ -0,0 +1,72 @@
+// Author: Derek Barnett
+
+#include <cstdio>
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include <pbbam/FormatUtils.h>
+#include <pbbam/GenomicInterval.h>
+#include <pbbam/bed/BedReader.h>
+#include <pbbam/bed/BedWriter.h>
+
+#include "PbbamTestData.h"
+
+using BedReader = PacBio::BAM::BedReader;
+using BedWriter = PacBio::BAM::BedWriter;
+using GenomicInterval = PacBio::BAM::GenomicInterval;
+
+namespace BedWriterTests {
+
+const std::vector<GenomicInterval> Intervals{
+    {"chr1", 213941196, 213942363}, {"chr1", 213942363, 213943530}, {"chr1", 213943530, 213944697},
+    {"chr2", 158364697, 158365864}, {"chr2", 158365864, 158367031}, {"chr3", 127477031, 127478198},
+    {"chr3", 127478198, 127479365}, {"chr3", 127479365, 127480532}, {"chr3", 127480532, 127481699}};
+
+}  // namespace BedWriterTests
+
+TEST(BedWriterTest, throws_on_empty_filename)
+{
+    EXPECT_THROW(BedWriter writer{""}, std::runtime_error);
+}
+
+TEST(BedWriterTest, can_write_plain_text)
+{
+    const std::string outFn = PacBio::BAM::PbbamTestsConfig::GeneratedData_Dir + "/out.bed";
+    // Write every interval; the nested scope destroys the writer before the file is inspected.
+    {
+        BedWriter writer{outFn};
+        for (const auto& interval : BedWriterTests::Intervals)
+            writer.Write(interval);
+    }
+    EXPECT_EQ(PacBio::BAM::HtslibCompression::NONE,
+              PacBio::BAM::FormatUtils::CompressionType(outFn));
+
+    const auto contents = BedReader::ReadAll(outFn);
+    remove(outFn.c_str());  // file has been read back; clean up before the fatal assertion
+    ASSERT_EQ(BedWriterTests::Intervals.size(), contents.size());  // std::equal needs equal sizes
+    EXPECT_TRUE(std::equal(BedWriterTests::Intervals.cbegin(), BedWriterTests::Intervals.cend(),
+                           contents.cbegin()));
+}
+
+TEST(BedWriterTest, can_write_gzipped_text)
+{
+    const std::string outFn = PacBio::BAM::PbbamTestsConfig::GeneratedData_Dir + "/out.bed.gz";
+    // Write every interval; the nested scope destroys the writer before the file is inspected.
+    {
+        BedWriter writer{outFn};
+        for (const auto& interval : BedWriterTests::Intervals)
+            writer.Write(interval);
+    }
+    EXPECT_EQ(PacBio::BAM::HtslibCompression::GZIP,
+              PacBio::BAM::FormatUtils::CompressionType(outFn));
+
+    const auto contents = BedReader::ReadAll(outFn);
+    remove(outFn.c_str());  // file has been read back; clean up before the fatal assertion
+    ASSERT_EQ(BedWriterTests::Intervals.size(), contents.size());  // std::equal needs equal sizes
+    EXPECT_TRUE(std::equal(BedWriterTests::Intervals.cbegin(), BedWriterTests::Intervals.cend(),
+                           contents.cbegin()));
+}
+\ No newline at end of file
diff --git a/tests/src/test_BgzipFastaWriter.cpp b/tests/src/test_BgzipFastaWriter.cpp

new file mode 100644 (file)

index 0000000..5ff3c90
--- /dev/null
+++ b/tests/src/test_BgzipFastaWriter.cpp
@@ -0,0 +1,39 @@
+// Author: Derek Barnett
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include <pbbam/BgzipFastaWriter.h>
+#include <pbbam/FastaReader.h>
+#include <pbbam/FastaSequence.h>
+#include <pbbam/FormatUtils.h>
+#include "PbbamTestData.h"
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+TEST(BgzipFastaWriterTest, writes_bgzf_fasta_data_to_file)
+{
+    const std::string fn{PbbamTestsConfig::GeneratedData_Dir + "/bgzf_fasta_out.fa.gz"};
+    // Round trip: write the sequences, check the file's compression type, read them back.
+    const std::vector<FastaSequence> sequences{FastaSequence{"seq1", "ACGT"},
+                                               FastaSequence{"seq2", "GATTACA"},
+                                               FastaSequence{"seq3", "CCCC"}};
+
+    {
+        BgzipFastaWriter writer{fn};
+        for (const auto& seq : sequences)
+            writer.Write(seq);
+    }
+    EXPECT_EQ(HtslibCompression::BGZIP, FormatUtils::CompressionType(fn));
+
+    std::vector<FastaSequence> observed;
+    FastaReader reader{fn};
+    for (const auto& seq : reader)
+        observed.push_back(seq);
+    ASSERT_EQ(sequences.size(), observed.size());  // std::equal below needs equal sizes
+    EXPECT_TRUE(std::equal(sequences.cbegin(), sequences.cend(), observed.cbegin()));
+}
diff --git a/tests/src/test_BgzipFastqWriter.cpp b/tests/src/test_BgzipFastqWriter.cpp

new file mode 100644 (file)

index 0000000..c357977
--- /dev/null
+++ b/tests/src/test_BgzipFastqWriter.cpp
@@ -0,0 +1,38 @@
+// Author: Derek Barnett
+
+#include <string>
+
+#include <gtest/gtest.h>
+
+#include <pbbam/BgzipFastqWriter.h>
+#include <pbbam/FastqReader.h>
+#include <pbbam/FastqSequence.h>
+#include <pbbam/FormatUtils.h>
+#include "PbbamTestData.h"
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+TEST(BgzipFastqWriterTest, writes_bgzf_fastq_data_to_file)
+{
+    const std::string fn{PbbamTestsConfig::GeneratedData_Dir + "/bgzf_fastq_out.fq.gz"};
+    // Round trip: write the sequences, check the file's compression type, read them back.
+    const std::vector<FastqSequence> sequences{
+        FastqSequence{"seq1", "ACGT", QualityValues{"zzzz"}},
+        FastqSequence{"seq2", "GATTACA", QualityValues{"~~~~~~~"}},
+        FastqSequence{"seq3", "CCCC", QualityValues{"$$$$"}}};
+
+    {
+        BgzipFastqWriter writer{fn};
+        for (const auto& seq : sequences)
+            writer.Write(seq);
+    }
+    EXPECT_EQ(HtslibCompression::BGZIP, FormatUtils::CompressionType(fn));
+
+    std::vector<FastqSequence> observed;
+    FastqReader reader{fn};
+    for (const auto& seq : reader)
+        observed.push_back(seq);
+    ASSERT_EQ(sequences.size(), observed.size());  // std::equal below needs equal sizes
+    EXPECT_TRUE(std::equal(sequences.cbegin(), sequences.cend(), observed.cbegin()));
+}
+\ No newline at end of file
diff --git a/tests/src/test_BgzipWriter.cpp b/tests/src/test_BgzipWriter.cpp

new file mode 100644 (file)

index 0000000..914d143
--- /dev/null
+++ b/tests/src/test_BgzipWriter.cpp
@@ -0,0 +1,26 @@
+// Author: Derek Barnett
+
+#include <string>
+
+#include <gtest/gtest.h>
+
+#include <pbbam/BgzipWriter.h>
+#include <pbbam/FormatUtils.h>
+#include "PbbamTestData.h"
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+TEST(BgzipWriterTest, writes_bgzf_format_to_file)
+{
+    const std::string fn{PbbamTestsConfig::GeneratedData_Dir + "/bgzf_writer_out.gz"};
+
+    {
+        const std::string data{"Simple output data"};
+        BgzipWriter writer{fn};
+        writer.Write(data.c_str(), data.size());
+        writer.Write(data);
+    }
+
+    EXPECT_EQ(HtslibCompression::BGZIP, FormatUtils::CompressionType(fn));
+}
diff --git a/tests/src/test_CCSPbiBuilder.cpp b/tests/src/test_CCSPbiBuilder.cpp

new file mode 100644 (file)

index 0000000..58f5efa
--- /dev/null
+++ b/tests/src/test_CCSPbiBuilder.cpp
@@ -0,0 +1,77 @@
+// Author: Derek Barnett
+
+#include <cstdio>
+
+#include <gtest/gtest.h>
+
+#include <pbbam/PbiRawData.h>
+#include <pbbam/ccs/CCSPbiBuilder.h>
+
+using namespace PacBio;
+using namespace PacBio::CCS;
+
+using Frames = PacBio::BAM::Frames;
+using LocalContextFlags = PacBio::BAM::LocalContextFlags;
+
+// clang-format off
+
+namespace CCSPbiBuilderTests {
+
+const CCSRecord& ValidRecord()
+{
+    static const CCSRecord record = [](){
+        CCSRecord r;
+        r.HoleNumber = 4391137;
+        r.QueryStart = 0;
+        r.QueryEnd = 459;
+        r.LocalContextFlags = LocalContextFlags::ADAPTER_AFTER;
+        r.Accuracy = 0.8f;
+        r.SignalToNoise = {7.6, 13.9, 7.0, 12.2};
+        r.Sequence = "GATTACA";
+        r.PulseWidths = Frames{std::vector<uint16_t>{13, 8, 3, 14, 18, 3}};
+        return r;
+    }();
+    return record;
+}
+
+}  // namespace CCSPbiBuilderTests
+
+TEST(CCSPbiBuilderTest, can_create_pbi_file_from_ccs_records)
+{
+    const std::string pbiFilename{"test.pbi"};
+
+    {
+        const auto& record = CCSPbiBuilderTests::ValidRecord();
+        CCSPbiBuilder builder{pbiFilename, "test"};
+        EXPECT_EQ("test", builder.MovieName());
+        builder.AddRecord(record);
+        builder.AddRecord(record);
+        builder.AddRecord(record);
+        builder.Close();
+    }
+    {
+        PacBio::BAM::PbiRawData index{pbiFilename};
+        ASSERT_EQ(3, index.NumReads());
+
+        const auto& basicData = index.BasicData();
+        EXPECT_EQ(1610789639, basicData.rgId_[0]);
+        EXPECT_EQ(1610789639, basicData.rgId_[1]);
+        EXPECT_EQ(1610789639, basicData.rgId_[2]);
+
+        EXPECT_EQ(4391137, basicData.holeNumber_[0]);
+        EXPECT_EQ(4391137, basicData.holeNumber_[1]);
+        EXPECT_EQ(4391137, basicData.holeNumber_[2]);
+
+        EXPECT_EQ(0, basicData.qStart_[0]);
+        EXPECT_EQ(0, basicData.qStart_[1]);
+        EXPECT_EQ(0, basicData.qStart_[2]);
+
+        EXPECT_EQ(459, basicData.qEnd_[0]);
+        EXPECT_EQ(459, basicData.qEnd_[1]);
+        EXPECT_EQ(459, basicData.qEnd_[2]);
+    }
+
+    remove(pbiFilename.c_str());
+}
+
+// clang-format on
+\ No newline at end of file
diff --git a/tests/src/test_CCSRecordIO.cpp b/tests/src/test_CCSRecordIO.cpp

new file mode 100644 (file)

index 0000000..cc25e92
--- /dev/null
+++ b/tests/src/test_CCSRecordIO.cpp
@@ -0,0 +1,236 @@
+// Author: Derek Barnett
+
+#include <algorithm>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include <pbbam/ccs/CCSRecordFormat.h>
+#include <pbbam/ccs/CCSRecordReader.h>
+#include <pbbam/ccs/CCSRecordWriter.h>
+
+#include "PbbamTestData.h"
+
+using namespace PacBio;
+using namespace PacBio::CCS;
+
+using Frames = PacBio::BAM::Frames;
+using LocalContextFlags = PacBio::BAM::LocalContextFlags;
+
+// clang-format off
+
+namespace CCSRecordIOTests {
+
+// Text form of the valid header fixture: one "name=value" line per header field.
+// The values match ValidHeader(), so the two can be compared after (de)serialization.
+const std::vector<std::string> ValidHeaderText{
+    "movie_name=m54238_180925_225123",
+    "binding_kit=101-789-500",
+    "sequencing_kit=101-789-300",
+    "basecaller_version=5.0",
+    "framerate=100"
+};
+
+// In-memory header fixture; its values mirror the lines of ValidHeaderText, in the same order.
+// Built once on first call; every caller shares the same instance.
+const CCSHeader& ValidHeader()
+{
+    static const CCSHeader fixture{"m54238_180925_225123",
+                                   "101-789-500",
+                                   "101-789-300",
+                                   "5.0",
+                                   "100"};
+    return fixture;
+}
+
+// Tab-separated text form of the valid record fixture. The values line up with ValidRecord();
+// the field order appears to be: hole number, query start, query end, local context flags,
+// accuracy, comma-separated SNR values, sequence, comma-separated pulse widths.
+const std::string ValidRecordText{
+    "4391137\t0\t459\t2\t0.8\t7.6,13.9,7,12.2\tGATTACA\t13,8,3,14,18,3"
+};
+
+// In-memory record fixture; its values mirror the fields of ValidRecordText.
+// Populated once on first call and returned by const reference afterwards.
+const CCSRecord& ValidRecord()
+{
+    static const CCSRecord fixture = []() {
+        CCSRecord rec;
+
+        // hole number and query coordinates
+        rec.HoleNumber = 4391137;
+        rec.QueryStart = 0;
+        rec.QueryEnd = 459;
+
+        // read-level annotations
+        rec.LocalContextFlags = LocalContextFlags::ADAPTER_AFTER;
+        rec.Accuracy = 0.8f;
+        rec.SignalToNoise = {7.6, 13.9, 7.0, 12.2};
+
+        // sequence and pulse widths
+        rec.Sequence = "GATTACA";
+        rec.PulseWidths = Frames{std::vector<uint16_t>{13, 8, 3, 14, 18, 3}};
+
+        return rec;
+    }();
+    return fixture;
+}
+
+// Compares two headers field by field.
+// Uses non-fatal EXPECT_EQ checks, so every mismatching field is reported in a single run.
+void CheckHeader(const CCSHeader& expected, const CCSHeader& observed)
+{
+    EXPECT_EQ(expected.MovieName, observed.MovieName);
+    EXPECT_EQ(expected.BindingKit, observed.BindingKit);
+    EXPECT_EQ(expected.SequencingKit, observed.SequencingKit);
+    EXPECT_EQ(expected.BasecallerVersion, observed.BasecallerVersion);
+    EXPECT_EQ(expected.FrameRate, observed.FrameRate);
+}
+
+// Compares two records field by field.
+// Scalar and string fields use non-fatal EXPECT_EQ checks. Pulse widths must be non-empty on
+// both sides (fatal check, returns from this helper only) and are then compared as whole
+// sequences.
+void CheckRecord(const CCSRecord& expected, const CCSRecord& observed)
+{
+    EXPECT_EQ(expected.HoleNumber, observed.HoleNumber);
+    EXPECT_EQ(expected.QueryStart, observed.QueryStart);
+    EXPECT_EQ(expected.QueryEnd, observed.QueryEnd);
+    EXPECT_EQ(expected.LocalContextFlags, observed.LocalContextFlags);
+    EXPECT_EQ(expected.Accuracy, observed.Accuracy);
+    EXPECT_EQ(expected.SignalToNoise, observed.SignalToNoise);
+    EXPECT_EQ(expected.Sequence, observed.Sequence);
+
+    ASSERT_FALSE(expected.PulseWidths.empty());
+    ASSERT_FALSE(observed.PulseWidths.empty());
+
+    // Compare copies of the full ranges rather than using three-iterator std::equal: that
+    // overload never looks at observed's length, so a shorter observed range would be read past
+    // its end and a longer one would pass unnoticed. Vector comparison also prints both
+    // sequences on failure.
+    EXPECT_EQ(std::vector<uint16_t>(expected.PulseWidths.cbegin(), expected.PulseWidths.cend()),
+              std::vector<uint16_t>(observed.PulseWidths.cbegin(), observed.PulseWidths.cend()));
+}
+
+}  // namespace CCSRecordIOTests
+
+// clang-format on
+
+TEST(CCSRecordIOTest, can_deserialize_valid_header_text)
+{
+    // Parse the text fixture, then compare field by field against the in-memory fixture.
+    const CCSHeader parsed = CCSRecordFormat::DeserializeHeader(CCSRecordIOTests::ValidHeaderText);
+    const CCSHeader& wanted = CCSRecordIOTests::ValidHeader();
+    CCSRecordIOTests::CheckHeader(wanted, parsed);
+}
+
+TEST(CCSRecordIOTest, deserialization_throws_on_invalid_header_text)
+{
+    // Each fixture below contains exactly one kind of defect, so every EXPECT_THROW exercises
+    // the error path that the fixture's name describes.
+
+    // clang-format off
+
+    // no header lines at all
+    const std::vector<std::string> InvalidHeaderText_Empty{};
+
+    // otherwise valid header with a blank line in the middle
+    const std::vector<std::string> InvalidHeaderText_EmptyLine{
+        "movie_name=m54238_180925_225123",
+        "",
+        "binding_kit=101-789-500",
+        "sequencing_kit=101-789-300",
+        "basecaller_version=5.0",
+        "framerate=100"
+    };
+
+    // first line has a second '=' after the value
+    const std::vector<std::string> InvalidHeaderText_ExtraEquals{
+        "movie_name=m54238_180925_225123=error",
+        "binding_kit=101-789-500",
+        "sequencing_kit=101-789-300",
+        "basecaller_version=5.0",
+        "framerate=100"
+    };
+
+    // second line has no '=' between name and value
+    const std::vector<std::string> InvalidHeaderText_MissingEquals{
+        "movie_name=m54238_180925_225123",
+        "binding_kit101-789-500",
+        "sequencing_kit=101-789-300",
+        "basecaller_version=5.0",
+        "framerate=100"
+    };
+
+    // all lines well-formed, but the last one uses a field name that is not part of the header
+    const std::vector<std::string> InvalidHeaderText_UnknownFieldName{
+        "movie_name=m54238_180925_225123",
+        "binding_kit=101-789-500",
+        "sequencing_kit=101-789-300",
+        "basecaller_version=5.0",
+        "framerate=100",
+        "this=does_not_exist"
+    };
+
+
+    EXPECT_THROW(CCSRecordFormat::DeserializeHeader(InvalidHeaderText_Empty),
+                 std::runtime_error);
+    EXPECT_THROW(CCSRecordFormat::DeserializeHeader(InvalidHeaderText_EmptyLine),
+                 std::runtime_error);
+    EXPECT_THROW(CCSRecordFormat::DeserializeHeader(InvalidHeaderText_ExtraEquals),
+                 std::runtime_error);
+    EXPECT_THROW(CCSRecordFormat::DeserializeHeader(InvalidHeaderText_MissingEquals),
+                 std::runtime_error);
+    EXPECT_THROW(CCSRecordFormat::DeserializeHeader(InvalidHeaderText_UnknownFieldName),
+                 std::runtime_error);
+
+    // clang-format on
+}
+
+TEST(CCSRecordIOTest, can_serialize_header)
+{
+    // Serializing the header fixture must reproduce the text fixture line for line.
+    const auto& expected = CCSRecordIOTests::ValidHeaderText;
+    const auto lines = CCSRecordFormat::SerializeHeader(CCSRecordIOTests::ValidHeader());
+
+    // Three-iterator std::equal does not compare lengths: an empty result would pass vacuously
+    // and an over-long result would read past the end of 'expected'. Check the size first.
+    ASSERT_EQ(expected.size(), lines.size());
+    EXPECT_TRUE(std::equal(lines.cbegin(), lines.cend(), expected.cbegin()));
+}
+
+TEST(CCSRecordIOTest, can_deserialize_valid_record)
+{
+    // Parse the text fixture, then compare field by field against the in-memory fixture.
+    const auto parsed = CCSRecordFormat::DeserializeRecord(CCSRecordIOTests::ValidRecordText);
+    CCSRecordIOTests::CheckRecord(CCSRecordIOTests::ValidRecord(), parsed);
+}
+
+TEST(CCSRecordIOTest, deserialization_throws_on_invalid_record)
+{
+    // Each fixture is a record line with one kind of defect; all must be rejected.
+
+    // clang-format off
+
+    const std::string InvalidRecordText_Empty;
+
+    // only the first four tab-separated fields
+    const std::string InvalidRecordText_TooFewFields{"4391137\t0\t459\t2"};
+
+    // valid record followed by three extra tab-separated fields
+    // (the separator before "fields" is a tab, not the form feed that "\f" would produce)
+    const std::string InvalidRecordText_TooManyFields{
+        "4391137\t0\t459\t2\t0.8\t7.6,13.9,7,12.2\tGATTACA\t13,8,3,14,18,3\ttoo\tmany\tfields"};
+
+    // fields separated by spaces instead of tabs
+    const std::string InvalidRecordText_WrongFieldDelimiter{
+        "4391137 0 459 2 0.8 7.6,13.9,7,12.2 GATTACA 13,8,3,14,18,3"};
+
+    // SNR values separated by '-' instead of ','
+    const std::string InvalidRecordText_WrongSnrDelimiter{
+        "4391137\t0\t459\t2\t0.8\t7.6-13.9-7-12.2\tGATTACA\t13,8,3,14,18,3"};
+
+    EXPECT_THROW(CCSRecordFormat::DeserializeRecord(InvalidRecordText_Empty),
+                 std::runtime_error);
+    EXPECT_THROW(CCSRecordFormat::DeserializeRecord(InvalidRecordText_TooFewFields),
+                 std::runtime_error);
+    EXPECT_THROW(CCSRecordFormat::DeserializeRecord(InvalidRecordText_TooManyFields),
+                 std::runtime_error);
+    EXPECT_THROW(CCSRecordFormat::DeserializeRecord(InvalidRecordText_WrongFieldDelimiter),
+                 std::runtime_error);
+    EXPECT_THROW(CCSRecordFormat::DeserializeRecord(InvalidRecordText_WrongSnrDelimiter),
+                 std::runtime_error);
+
+    // clang-format on
+}
+
+TEST(CCSRecordIOTest, can_serialize_record)
+{
+    // Serializing the record fixture must reproduce the text fixture exactly.
+    const auto serialized = CCSRecordFormat::SerializeRecord(CCSRecordIOTests::ValidRecord());
+    EXPECT_EQ(CCSRecordIOTests::ValidRecordText, serialized);
+}
+
+TEST(CCSRecordIOTest, can_do_round_trip_read_and_write_to_iostreams)
+{
+    const size_t expectedCount = 3;
+    const auto& header = CCSRecordIOTests::ValidHeader();
+    const auto& record = CCSRecordIOTests::ValidRecord();
+
+    // Write the header and the records into an in-memory stream. The writer lives in its own
+    // scope so that it is destroyed before the buffer is read back.
+    std::ostringstream sink;
+    {
+        CCSRecordWriter writer{header, sink};
+        size_t remaining = expectedCount;
+        while (remaining-- > 0) {
+            writer.Write(record);
+        }
+    }
+
+    // Feed the written text back in as the reader's input.
+    std::istringstream source{sink.str()};
+    CCSRecordReader reader{source};
+    CCSRecordIOTests::CheckHeader(header, reader.Header());
+
+    // Every record read back must match the one written, and none may be lost or added.
+    size_t seen = 0;
+    for (const auto& observed : reader) {
+        CCSRecordIOTests::CheckRecord(record, observed);
+        ++seen;
+    }
+    EXPECT_EQ(expectedCount, seen);
+}
diff --git a/tests/src/test_Cigar.cpp b/tests/src/test_Cigar.cpp

new file mode 100644 (file)

index 0000000..fff3666
--- /dev/null
+++ b/tests/src/test_Cigar.cpp
@@ -0,0 +1,165 @@
+// Author: Derek Barnett
+
+#include <string>
+
+#include <gtest/gtest.h>
+
+#include <pbbam/Cigar.h>
+
+// clang-format off
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+TEST(CigarTest, TypeToCar)
+{
+    // Each operation type paired with the character TypeToChar() must return for it.
+    struct Expectation
+    {
+        char symbol;
+        CigarOperationType type;
+    };
+    const Expectation expectations[] = {
+        {'M', CigarOperationType::ALIGNMENT_MATCH},
+        {'I', CigarOperationType::INSERTION},
+        {'D', CigarOperationType::DELETION},
+        {'N', CigarOperationType::REFERENCE_SKIP},
+        {'S', CigarOperationType::SOFT_CLIP},
+        {'H', CigarOperationType::HARD_CLIP},
+        {'P', CigarOperationType::PADDING},
+        {'=', CigarOperationType::SEQUENCE_MATCH},
+        {'X', CigarOperationType::SEQUENCE_MISMATCH},
+    };
+
+    for (const auto& e : expectations) {
+        EXPECT_EQ(e.symbol, CigarOperation::TypeToChar(e.type));
+    }
+}
+
+TEST(CigarTest, CharToType)
+{
+    // Each operation character paired with the type CharToType() must return for it.
+    struct Expectation
+    {
+        CigarOperationType type;
+        char symbol;
+    };
+    const Expectation expectations[] = {
+        {CigarOperationType::ALIGNMENT_MATCH,   'M'},
+        {CigarOperationType::INSERTION,         'I'},
+        {CigarOperationType::DELETION,          'D'},
+        {CigarOperationType::REFERENCE_SKIP,    'N'},
+        {CigarOperationType::SOFT_CLIP,         'S'},
+        {CigarOperationType::HARD_CLIP,         'H'},
+        {CigarOperationType::PADDING,           'P'},
+        {CigarOperationType::SEQUENCE_MATCH,    '='},
+        {CigarOperationType::SEQUENCE_MISMATCH, 'X'},
+    };
+
+    for (const auto& e : expectations) {
+        EXPECT_EQ(e.type, CigarOperation::CharToType(e.symbol));
+    }
+}
+
+TEST(CigarTest, SetOperationYieldsCorrectType)
+{
+    // Setting the type on a default-constructed operation must be reflected by Char().
+    struct Expectation
+    {
+        CigarOperationType type;
+        char symbol;
+    };
+    const Expectation expectations[] = {
+        {CigarOperationType::ALIGNMENT_MATCH,   'M'},
+        {CigarOperationType::INSERTION,         'I'},
+        {CigarOperationType::DELETION,          'D'},
+        {CigarOperationType::REFERENCE_SKIP,    'N'},
+        {CigarOperationType::SOFT_CLIP,         'S'},
+        {CigarOperationType::HARD_CLIP,         'H'},
+        {CigarOperationType::PADDING,           'P'},
+        {CigarOperationType::SEQUENCE_MATCH,    '='},
+        {CigarOperationType::SEQUENCE_MISMATCH, 'X'},
+    };
+
+    for (const auto& e : expectations) {
+        CigarOperation op;
+        op.Type(e.type);
+        EXPECT_EQ(e.symbol, op.Char());
+    }
+}
+
+TEST(CigarTest, SetTypeYieldsCorrectOperation)
+{
+    // Setting the character on a default-constructed operation must be reflected by Type().
+    struct Expectation
+    {
+        char symbol;
+        CigarOperationType type;
+    };
+    const Expectation expectations[] = {
+        {'M', CigarOperationType::ALIGNMENT_MATCH},
+        {'I', CigarOperationType::INSERTION},
+        {'D', CigarOperationType::DELETION},
+        {'N', CigarOperationType::REFERENCE_SKIP},
+        {'S', CigarOperationType::SOFT_CLIP},
+        {'H', CigarOperationType::HARD_CLIP},
+        {'P', CigarOperationType::PADDING},
+        {'=', CigarOperationType::SEQUENCE_MATCH},
+        {'X', CigarOperationType::SEQUENCE_MISMATCH},
+    };
+
+    for (const auto& e : expectations) {
+        CigarOperation op;
+        op.Char(e.symbol);
+        EXPECT_EQ(e.type, op.Type());
+    }
+}
+
+TEST(CigarStringTest, FromStdString_Empty)
+{
+    // An empty CIGAR string yields a CIGAR without operations.
+    Cigar parsed = Cigar::FromStdString(std::string{});
+    EXPECT_TRUE(parsed.empty());
+}
+
+TEST(CigarStringTest, FromStdString_SingleOp)
+{
+    // A single "<length><op>" token must parse into exactly one operation.
+    const std::string singleCigar = "100=";
+
+    Cigar cigar = Cigar::FromStdString(singleCigar);
+
+    // Fatal check: front() below requires at least one element.
+    // EXPECT_EQ/ASSERT_EQ are used instead of EXPECT_TRUE(a == b) so that a failure reports the
+    // actual values rather than just "false".
+    ASSERT_EQ(1u, cigar.size());
+
+    const CigarOperation& op = cigar.front();
+    EXPECT_EQ('=',  op.Char());
+    EXPECT_EQ(100u, op.Length());
+}
+
+TEST(CigarStringTest, FromStdString_MultipleOps)
+{
+    // A multi-token CIGAR string must parse into one operation per token, in order.
+    const std::string multiCigar = "100=2D34I6=6X6=";
+
+    // expected (character, length) of each parsed operation
+    struct Expectation
+    {
+        char symbol;
+        unsigned length;
+    };
+    const Expectation expectations[] = {
+        {'=', 100u},
+        {'D',   2u},
+        {'I',  34u},
+        {'=',   6u},
+        {'X',   6u},
+        {'=',   6u},
+    };
+
+    Cigar cigar = Cigar::FromStdString(multiCigar);
+
+    // Fatal check: the loop below indexes all six operations.
+    ASSERT_EQ(6u, cigar.size());
+
+    // EXPECT_EQ (rather than EXPECT_TRUE(a == b)) reports the mismatching values on failure;
+    // the streamed index identifies which operation was wrong.
+    for (size_t i = 0; i < 6; ++i) {
+        CigarOperation op = cigar.at(i);
+        EXPECT_EQ(expectations[i].symbol, op.Char())   << "operation index " << i;
+        EXPECT_EQ(expectations[i].length, op.Length()) << "operation index " << i;
+    }
+}
+
+TEST(CigarStringTest, ToStdString_Empty)
+{
+    // A default-constructed CIGAR serializes to the empty string.
+    Cigar noOps;
+    EXPECT_EQ(std::string{}, noOps.ToStdString());
+}
+
+TEST(CigarStringTest, ToStdString_SingleOp)
+{
+    // One operation serializes to a single "<length><op>" token.
+    const CigarOperation match(CigarOperationType::SEQUENCE_MATCH, 100);
+
+    Cigar built;
+    built.push_back(match);
+
+    const std::string wanted = "100=";
+    EXPECT_EQ(wanted, built.ToStdString());
+}
+
+TEST(CigarStringTest, ToStdString_MultipleOps)
+{
+    // Operations are serialized in insertion order, one "<length><op>" token each.
+    struct OpSpec
+    {
+        CigarOperationType type;
+        unsigned length;
+    };
+    const OpSpec specs[] = {
+        {CigarOperationType::SEQUENCE_MATCH,    100u},
+        {CigarOperationType::DELETION,            2u},
+        {CigarOperationType::INSERTION,          34u},
+        {CigarOperationType::SEQUENCE_MATCH,      6u},
+        {CigarOperationType::SEQUENCE_MISMATCH,   6u},
+        {CigarOperationType::SEQUENCE_MATCH,      6u},
+    };
+
+    Cigar built;
+    for (const auto& spec : specs) {
+        built.push_back(CigarOperation(spec.type, spec.length));
+    }
+
+    const std::string wanted = "100=2D34I6=6X6=";
+    EXPECT_EQ(wanted, built.ToStdString());
+}
+
+// clang-format on
diff --git a/tests/src/test_Compare.cpp b/tests/src/test_Compare.cpp

new file mode 100644 (file)

index 0000000..01c97bf
--- /dev/null
+++ b/tests/src/test_Compare.cpp
@@ -0,0 +1,715 @@
+// Author: Derek Barnett
+
+#include <algorithm>
+#include <cstdint>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include <pbbam/Compare.h>
+
+// clang-format off
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+namespace CompareTests {
+
+// Creates a default-constructed record that carries the given tag under 'tagName'.
+static inline
+BamRecord makeRecordWithTag(const std::string& tagName,
+                            const Tag& tag)
+{
+    BamRecord record;
+    record.Impl().AddTag(tagName, tag);
+    return record;
+}
+
+// Creates an unmapped record from the given sequence/qualities and fills a fixed set of tags:
+// "qs"/"qe" from the query interval, "ip"/"pw" from 'frames', "dt"/"st" from 'tagBases' and
+// "dq"/"iq"/"mq"/"sq"/"pq"/"pv" from 'tagQuals'.
+static
+BamRecord makeRecord(const Position qStart,
+                     const Position qEnd,
+                     const std::string& seq,
+                     const std::string& quals,
+                     const std::string& tagBases,
+                     const std::string& tagQuals,
+                     const std::vector<uint16_t>& frames)
+{
+    TagCollection tags;
+
+    // query interval
+    tags["qs"] = qStart;
+    tags["qe"] = qEnd;
+
+    // frame-valued tags
+    for (const char* name : {"ip", "pw"})
+        tags[name] = frames;
+
+    // base-valued tags
+    for (const char* name : {"dt", "st"})
+        tags[name] = tagBases;
+
+    // quality-valued tags
+    for (const char* name : {"dq", "iq", "mq", "sq", "pq", "pv"})
+        tags[name] = tagQuals;
+
+    BamRecordImpl impl;
+    impl.SetSequenceAndQualities(seq, quals);
+    impl.Tags(tags);
+
+    return BamRecord(std::move(impl));
+}
+
+// Builds six mapped records that share sequence, qualities and tags and differ only in CIGAR
+// and strand. Order: the three CIGARs on the forward strand, then the same three CIGARs on
+// the reverse strand.
+static
+std::vector<BamRecord> makeMappedRecords()
+{
+    // data shared by every record
+    const Position qStart = 500;
+    const Position qEnd   = 510;
+    const std::string seq      = "AACCGTTAGC";
+    const std::string quals    = "?]?]?]?]?*";
+    const std::string tagBases = "AACCGTTAGC";
+    const std::string tagQuals = "?]?]?]?]?*";
+    const std::vector<uint16_t> frames  = { 10, 10, 20, 20, 30, 40, 40, 10, 30, 20 };
+    const uint8_t mapQual = 80;
+
+    // per-record alignments
+    const std::string s1_cigar = "10=";
+    const std::string s2_cigar = "5=3D5=";
+    const std::string s3_cigar = "4=1D2I2D2X2=";
+
+    // creates one record and maps it to reference 0, position 100
+    const auto mapped = [&](const std::string& cigar, const Strand strand) -> BamRecord {
+        BamRecord rec = CompareTests::makeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames);
+        rec.Map(0, 100, strand, cigar, mapQual);
+        return rec;
+    };
+
+    return std::vector<BamRecord> {
+        mapped(s1_cigar, Strand::FORWARD),
+        mapped(s2_cigar, Strand::FORWARD),
+        mapped(s3_cigar, Strand::FORWARD),
+        mapped(s1_cigar, Strand::REVERSE),
+        mapped(s2_cigar, Strand::REVERSE),
+        mapped(s3_cigar, Strand::REVERSE)
+    };
+}
+
+} // namespace CompareTests
+
+// Checks the name string that Compare::TypeToName() returns for each comparison type.
+TEST(CompareTest, TypeToNameOk)
+{
+    EXPECT_EQ(std::string{"Compare::EQUAL"},              Compare::TypeToName(Compare::EQUAL));
+    EXPECT_EQ(std::string{"Compare::NOT_EQUAL"},          Compare::TypeToName(Compare::NOT_EQUAL));
+    EXPECT_EQ(std::string{"Compare::LESS_THAN"},          Compare::TypeToName(Compare::LESS_THAN));
+    EXPECT_EQ(std::string{"Compare::LESS_THAN_EQUAL"},    Compare::TypeToName(Compare::LESS_THAN_EQUAL));
+    EXPECT_EQ(std::string{"Compare::GREATER_THAN"},       Compare::TypeToName(Compare::GREATER_THAN));
+    EXPECT_EQ(std::string{"Compare::GREATER_THAN_EQUAL"}, Compare::TypeToName(Compare::GREATER_THAN_EQUAL));
+    EXPECT_EQ(std::string{"Compare::CONTAINS"},           Compare::TypeToName(Compare::CONTAINS));
+    EXPECT_EQ(std::string{"Compare::NOT_CONTAINS"},       Compare::TypeToName(Compare::NOT_CONTAINS));
+}
+
+// Checks the operator string that Compare::TypeToOperator() returns for each comparison type,
+// both for the single-argument call and for the call with the second argument set to true.
+TEST(CompareTest, TypeToOperatorOk)
+{
+    { // normal
+        EXPECT_EQ(Compare::TypeToOperator(Compare::EQUAL),              std::string{"=="});
+        EXPECT_EQ(Compare::TypeToOperator(Compare::NOT_EQUAL),          std::string{"!="});
+        EXPECT_EQ(Compare::TypeToOperator(Compare::LESS_THAN),          std::string{"<"});
+        EXPECT_EQ(Compare::TypeToOperator(Compare::LESS_THAN_EQUAL),    std::string{"<="});
+        EXPECT_EQ(Compare::TypeToOperator(Compare::GREATER_THAN),       std::string{">"});
+        EXPECT_EQ(Compare::TypeToOperator(Compare::GREATER_THAN_EQUAL), std::string{">="});
+        EXPECT_EQ(Compare::TypeToOperator(Compare::CONTAINS),           std::string{"&"});
+        EXPECT_EQ(Compare::TypeToOperator(Compare::NOT_CONTAINS),       std::string{"~"});
+    }
+
+    { // alpha
+        EXPECT_EQ(Compare::TypeToOperator(Compare::EQUAL, true),              std::string{"eq"});
+        EXPECT_EQ(Compare::TypeToOperator(Compare::NOT_EQUAL, true),          std::string{"ne"});
+        EXPECT_EQ(Compare::TypeToOperator(Compare::LESS_THAN, true),          std::string{"lt"});
+        EXPECT_EQ(Compare::TypeToOperator(Compare::LESS_THAN_EQUAL, true),    std::string{"lte"});
+        EXPECT_EQ(Compare::TypeToOperator(Compare::GREATER_THAN, true),       std::string{"gt"});
+        EXPECT_EQ(Compare::TypeToOperator(Compare::GREATER_THAN_EQUAL, true), std::string{"gte"});
+        EXPECT_EQ(Compare::TypeToOperator(Compare::CONTAINS, true),           std::string{"and"});
+        EXPECT_EQ(Compare::TypeToOperator(Compare::NOT_CONTAINS, true),       std::string{"not"});
+    }
+}
+
+// Checks that Compare::TypeFromOperator() maps every accepted operator spelling to its
+// comparison type: symbolic ("==", "<", ...), alphabetic ("eq", "lt", ...) and the
+// entity-escaped forms ("&lt;", "&gt;", ...). Unrecognized or empty strings must throw.
+TEST(CompareTest, FromOperatorOk)
+{
+    EXPECT_EQ(Compare::EQUAL,              Compare::TypeFromOperator("=="));
+    EXPECT_EQ(Compare::EQUAL,              Compare::TypeFromOperator("="));
+    EXPECT_EQ(Compare::EQUAL,              Compare::TypeFromOperator("eq"));
+    EXPECT_EQ(Compare::NOT_EQUAL,          Compare::TypeFromOperator("!="));
+    EXPECT_EQ(Compare::NOT_EQUAL,          Compare::TypeFromOperator("ne"));
+    EXPECT_EQ(Compare::LESS_THAN,          Compare::TypeFromOperator("<"));
+    EXPECT_EQ(Compare::LESS_THAN,          Compare::TypeFromOperator("lt"));
+    EXPECT_EQ(Compare::LESS_THAN,          Compare::TypeFromOperator("&lt;"));
+    EXPECT_EQ(Compare::LESS_THAN_EQUAL,    Compare::TypeFromOperator("<="));
+    EXPECT_EQ(Compare::LESS_THAN_EQUAL,    Compare::TypeFromOperator("lte"));
+    EXPECT_EQ(Compare::LESS_THAN_EQUAL,    Compare::TypeFromOperator("&lt;="));
+    EXPECT_EQ(Compare::GREATER_THAN,       Compare::TypeFromOperator(">"));
+    EXPECT_EQ(Compare::GREATER_THAN,       Compare::TypeFromOperator("gt"));
+    EXPECT_EQ(Compare::GREATER_THAN,       Compare::TypeFromOperator("&gt;"));
+    EXPECT_EQ(Compare::GREATER_THAN_EQUAL, Compare::TypeFromOperator(">="));
+    EXPECT_EQ(Compare::GREATER_THAN_EQUAL, Compare::TypeFromOperator("gte"));
+    EXPECT_EQ(Compare::GREATER_THAN_EQUAL, Compare::TypeFromOperator("&gt;="));
+    EXPECT_EQ(Compare::CONTAINS,           Compare::TypeFromOperator("&"));
+    EXPECT_EQ(Compare::NOT_CONTAINS,       Compare::TypeFromOperator("~"));
+
+    // invalid operator strings throw
+    EXPECT_THROW(Compare::TypeFromOperator(""),        std::runtime_error);
+    EXPECT_THROW(Compare::TypeFromOperator("invalid"), std::runtime_error);
+}
+
+TEST(CompareTest, AlignedEndOk)
+{
+    // The records differ only in the position passed to Map(); two of them share a position.
+    const auto mappedAt = [](const Position refStart) -> BamRecord {
+        BamRecord rec;
+        rec.Map(0, refStart, Strand::FORWARD, Cigar{"10="}, 255);
+        return rec;
+    };
+    BamRecord at290a = mappedAt(290);
+    BamRecord at190  = mappedAt(190);
+    BamRecord at290b = mappedAt(290);
+    BamRecord at90   = mappedAt(90);
+
+    std::vector<BamRecord> records{ at290a, at190, at290b, at90 };
+    std::sort(records.begin(), records.end(), Compare::AlignedEnd());
+
+    // ascending by aligned end
+    EXPECT_EQ(at90.AlignedEnd(),   records.at(0).AlignedEnd());
+    EXPECT_EQ(at190.AlignedEnd(),  records.at(1).AlignedEnd());
+    EXPECT_EQ(at290a.AlignedEnd(), records.at(2).AlignedEnd());
+    EXPECT_EQ(at290b.AlignedEnd(), records.at(3).AlignedEnd());
+}
+
+TEST(CompareTest, AlignedStartOk)
+{
+    // The records differ only in the position passed to Map().
+    const auto mappedAt = [](const Position refStart) -> BamRecord {
+        BamRecord rec;
+        rec.Map(0, refStart, Strand::FORWARD, Cigar{"10="}, 255);
+        return rec;
+    };
+    BamRecord at300 = mappedAt(300);
+    BamRecord at200 = mappedAt(200);
+    BamRecord at400 = mappedAt(400);
+    BamRecord at100 = mappedAt(100);
+
+    std::vector<BamRecord> records{ at300, at200, at400, at100 };
+    std::sort(records.begin(), records.end(), Compare::AlignedStart());
+
+    // ascending by aligned start
+    EXPECT_EQ(at100.AlignedStart(), records.at(0).AlignedStart());
+    EXPECT_EQ(at200.AlignedStart(), records.at(1).AlignedStart());
+    EXPECT_EQ(at300.AlignedStart(), records.at(2).AlignedStart());
+    EXPECT_EQ(at400.AlignedStart(), records.at(3).AlignedStart());
+}
+
+TEST(CompareTest, AlignedStrandOk)
+{
+    // two reverse-strand and two forward-strand records, interleaved
+    BamRecord reverseA;
+    reverseA.Impl().SetReverseStrand(true);
+    BamRecord forwardA;
+    forwardA.Impl().SetReverseStrand(false);
+    BamRecord reverseB;
+    reverseB.Impl().SetReverseStrand(true);
+    BamRecord forwardB;
+    forwardB.Impl().SetReverseStrand(false);
+
+    std::vector<BamRecord> records{ reverseA, forwardA, reverseB, forwardB };
+    std::sort(records.begin(), records.end(), Compare::AlignedStrand());
+
+    // forward-strand records sort ahead of reverse-strand records
+    for (size_t i = 0; i < 2; ++i)
+        EXPECT_EQ(Strand::FORWARD, records.at(i).AlignedStrand());
+    for (size_t i = 2; i < 4; ++i)
+        EXPECT_EQ(Strand::REVERSE, records.at(i).AlignedStrand());
+}
+
+TEST(CompareTest, BarcodeForwardOk)
+{
+    // records named after their (forward, reverse) barcode pair
+    BamRecord bc30_20;
+    bc30_20.Barcodes(std::make_pair<int16_t,int16_t>(30,20));
+    BamRecord bc20_30;
+    bc20_30.Barcodes(std::make_pair<int16_t,int16_t>(20,30));
+    BamRecord bc40_10;
+    bc40_10.Barcodes(std::make_pair<int16_t,int16_t>(40,10));
+    BamRecord bc10_40;
+    bc10_40.Barcodes(std::make_pair<int16_t,int16_t>(10,40));
+
+    std::vector<BamRecord> records{ bc30_20, bc20_30, bc40_10, bc10_40 };
+    std::sort(records.begin(), records.end(), Compare::BarcodeForward());
+
+    // ascending by forward barcode
+    EXPECT_EQ(bc10_40.BarcodeForward(), records.at(0).BarcodeForward());
+    EXPECT_EQ(bc20_30.BarcodeForward(), records.at(1).BarcodeForward());
+    EXPECT_EQ(bc30_20.BarcodeForward(), records.at(2).BarcodeForward());
+    EXPECT_EQ(bc40_10.BarcodeForward(), records.at(3).BarcodeForward());
+}
+
+TEST(CompareTest, BarcodeReverseOk)
+{
+    // records named after their (forward, reverse) barcode pair
+    BamRecord bc30_20;
+    bc30_20.Barcodes(std::make_pair<int16_t,int16_t>(30,20));
+    BamRecord bc20_30;
+    bc20_30.Barcodes(std::make_pair<int16_t,int16_t>(20,30));
+    BamRecord bc40_10;
+    bc40_10.Barcodes(std::make_pair<int16_t,int16_t>(40,10));
+    BamRecord bc10_40;
+    bc10_40.Barcodes(std::make_pair<int16_t,int16_t>(10,40));
+
+    std::vector<BamRecord> records{ bc30_20, bc20_30, bc40_10, bc10_40 };
+    std::sort(records.begin(), records.end(), Compare::BarcodeReverse());
+
+    // ascending by reverse barcode
+    EXPECT_EQ(bc40_10.BarcodeReverse(), records.at(0).BarcodeReverse());
+    EXPECT_EQ(bc30_20.BarcodeReverse(), records.at(1).BarcodeReverse());
+    EXPECT_EQ(bc20_30.BarcodeReverse(), records.at(2).BarcodeReverse());
+    EXPECT_EQ(bc10_40.BarcodeReverse(), records.at(3).BarcodeReverse());
+}
+
+TEST(CompareTest, BarcodeQualityOk)
+{
+    // barcode qualities, in the order the records are created
+    uint8_t quality30 = 30;
+    uint8_t quality20 = 20;
+    uint8_t quality40 = 40;
+    uint8_t quality10 = 10;
+
+    std::vector<BamRecord> records{
+        CompareTests::makeRecordWithTag("bq", Tag(quality30)),
+        CompareTests::makeRecordWithTag("bq", Tag(quality20)),
+        CompareTests::makeRecordWithTag("bq", Tag(quality40)),
+        CompareTests::makeRecordWithTag("bq", Tag(quality10))
+    };
+    std::sort(records.begin(), records.end(), Compare::BarcodeQuality());
+
+    // ascending by barcode quality
+    EXPECT_EQ(quality10, records.at(0).BarcodeQuality());
+    EXPECT_EQ(quality20, records.at(1).BarcodeQuality());
+    EXPECT_EQ(quality30, records.at(2).BarcodeQuality());
+    EXPECT_EQ(quality40, records.at(3).BarcodeQuality());
+}
+
+TEST(CompareTest, CustomCompareOk)
+{
+    // user-defined comparator built on a bool-returning BamRecord member function
+    struct CustomCompare : public Compare::MemberFunctionBase<bool, &BamRecord::HasDeletionTag> { };
+
+    // four records with a "dt" tag, followed by four records without any tag
+    std::vector<BamRecord> records;
+    for (int i = 0; i < 4; ++i)
+        records.push_back(CompareTests::makeRecordWithTag("dt", Tag(std::string("foo"))));
+    for (int i = 0; i < 4; ++i)
+        records.push_back(BamRecord());
+    EXPECT_EQ(8, records.size());
+
+    std::sort(records.begin(), records.end(), CustomCompare());
+
+    // untagged records (false) come first, tagged records (true) last
+    for (size_t i = 0; i < 4; ++i)
+        EXPECT_FALSE(records.at(i).HasDeletionTag());
+    for (size_t i = 4; i < 8; ++i)
+        EXPECT_TRUE(records.at(i).HasDeletionTag());
+}
+
+TEST(CompareTest, FullNameOk)
+{
+    // records named after the name they carry
+    BamRecord namedC;
+    namedC.Impl().Name("c");
+    BamRecord namedB;
+    namedB.Impl().Name("b");
+    BamRecord namedD;
+    namedD.Impl().Name("d");
+    BamRecord namedA;
+    namedA.Impl().Name("a");
+
+    std::vector<BamRecord> records{ namedC, namedB, namedD, namedA };
+    std::sort(records.begin(), records.end(), Compare::FullName());
+
+    // ascending by name
+    EXPECT_EQ(namedA.FullName(), records.at(0).FullName());
+    EXPECT_EQ(namedB.FullName(), records.at(1).FullName());
+    EXPECT_EQ(namedC.FullName(), records.at(2).FullName());
+    EXPECT_EQ(namedD.FullName(), records.at(3).FullName());
+}
+
+TEST(CompareTest, LocalContextFlagOk)
+{
+    // records named after the flag they carry
+    BamRecord barcodeAfter;
+    barcodeAfter.LocalContextFlags(LocalContextFlags::BARCODE_AFTER);
+    BamRecord adapterAfter;
+    adapterAfter.LocalContextFlags(LocalContextFlags::ADAPTER_AFTER);
+    BamRecord reversePass;
+    reversePass.LocalContextFlags(LocalContextFlags::REVERSE_PASS);
+    BamRecord noContext;
+    noContext.LocalContextFlags(LocalContextFlags::NO_LOCAL_CONTEXT);
+
+    std::vector<BamRecord> records{ barcodeAfter, adapterAfter, reversePass, noContext };
+    std::sort(records.begin(), records.end(), Compare::LocalContextFlag());
+
+    // expected order after sorting by flag value
+    EXPECT_EQ(noContext.LocalContextFlags(),    records.at(0).LocalContextFlags());
+    EXPECT_EQ(adapterAfter.LocalContextFlags(), records.at(1).LocalContextFlags());
+    EXPECT_EQ(barcodeAfter.LocalContextFlags(), records.at(2).LocalContextFlags());
+    EXPECT_EQ(reversePass.LocalContextFlags(),  records.at(3).LocalContextFlags());
+}
+
+TEST(CompareTest, MapQualityOk)
+{
+    // records named after the mapping quality they carry
+    BamRecord mapQ30;
+    mapQ30.Impl().MapQuality(30);
+    BamRecord mapQ20;
+    mapQ20.Impl().MapQuality(20);
+    BamRecord mapQ40;
+    mapQ40.Impl().MapQuality(40);
+    BamRecord mapQ10;
+    mapQ10.Impl().MapQuality(10);
+
+    std::vector<BamRecord> records{ mapQ30, mapQ20, mapQ40, mapQ10 };
+    std::sort(records.begin(), records.end(), Compare::MapQuality());
+
+    // ascending by mapping quality
+    EXPECT_EQ(mapQ10.MapQuality(), records.at(0).MapQuality());
+    EXPECT_EQ(mapQ20.MapQuality(), records.at(1).MapQuality());
+    EXPECT_EQ(mapQ30.MapQuality(), records.at(2).MapQuality());
+    EXPECT_EQ(mapQ40.MapQuality(), records.at(3).MapQuality());
+}
+
+TEST(CompareTest, MovieNameOk)
+{
+    // one read group per movie name "a".."d", all registered in a shared header
+    ReadGroupInfo groupA{ "a", "SUBREAD" };
+    ReadGroupInfo groupB{ "b", "SUBREAD" };
+    ReadGroupInfo groupC{ "c", "SUBREAD" };
+    ReadGroupInfo groupD{ "d", "SUBREAD" };
+
+    BamHeader header;
+    header.AddReadGroup(groupA)
+          .AddReadGroup(groupB)
+          .AddReadGroup(groupC)
+          .AddReadGroup(groupD);
+
+    // records named after the read group they are assigned to
+    BamRecord fromC(header);
+    fromC.ReadGroup(groupC);
+    BamRecord fromB(header);
+    fromB.ReadGroup(groupB);
+    BamRecord fromD(header);
+    fromD.ReadGroup(groupD);
+    BamRecord fromA(header);
+    fromA.ReadGroup(groupA);
+
+    std::vector<BamRecord> records{ fromC, fromB, fromD, fromA };
+    std::sort(records.begin(), records.end(), Compare::MovieName());
+
+    // ascending by movie name
+    EXPECT_EQ(fromA.MovieName(), records.at(0).MovieName());
+    EXPECT_EQ(fromB.MovieName(), records.at(1).MovieName());
+    EXPECT_EQ(fromC.MovieName(), records.at(2).MovieName());
+    EXPECT_EQ(fromD.MovieName(), records.at(3).MovieName());
+}
+
+TEST(CompareTest, NoneOk)
+{
+    // Sorting with Compare::None is expected to leave the input order untouched.
+    BamRecord r1; r1.Impl().Name("c");
+    BamRecord r2; r2.Impl().Name("b");
+    BamRecord r3; r3.Impl().Name("d");
+    BamRecord r4; r4.Impl().Name("a");
+
+    auto records = std::vector<BamRecord>{ r1, r2, r3, r4 };
+
+    // std::stable_sort, not std::sort: the assertions below require that elements the
+    // comparator treats as equivalent keep their relative order, and only the stable
+    // algorithm guarantees that. With std::sort the outcome would be implementation-defined.
+    std::stable_sort(records.begin(), records.end(), Compare::None());
+
+    EXPECT_EQ(r1.FullName(), records.at(0).FullName());
+    EXPECT_EQ(r2.FullName(), records.at(1).FullName());
+    EXPECT_EQ(r3.FullName(), records.at(2).FullName());
+    EXPECT_EQ(r4.FullName(), records.at(3).FullName());
+}
+
+TEST(CompareTest, NumDeletedBasesOk)
+{
+    // deleted-base counts per record, before and after sorting
+    const int initialCounts[] = { 0, 3, 3, 0, 3, 3 };
+    const int sortedCounts[]  = { 0, 0, 3, 3, 3, 3 };
+
+    auto records = CompareTests::makeMappedRecords();
+
+    // sanity checks on initial conditions
+    EXPECT_EQ(6, records.size());
+    for (size_t i = 0; i < 6; ++i)
+        EXPECT_EQ(initialCounts[i], records.at(i).NumDeletedBases());
+
+    // sort & check
+    std::sort(records.begin(), records.end(), Compare::NumDeletedBases());
+    for (size_t i = 0; i < 6; ++i)
+        EXPECT_EQ(sortedCounts[i], records.at(i).NumDeletedBases());
+}
+
+TEST(CompareTest, NumInsertedBasesOk)
+{
+    // inserted-base counts per record, before and after sorting
+    const int initialCounts[] = { 0, 0, 2, 0, 0, 2 };
+    const int sortedCounts[]  = { 0, 0, 0, 0, 2, 2 };
+
+    auto records = CompareTests::makeMappedRecords();
+
+    // sanity checks on initial conditions
+    EXPECT_EQ(6, records.size());
+    for (size_t i = 0; i < 6; ++i)
+        EXPECT_EQ(initialCounts[i], records.at(i).NumInsertedBases());
+
+    // sort & check
+    std::sort(records.begin(), records.end(), Compare::NumInsertedBases());
+    for (size_t i = 0; i < 6; ++i)
+        EXPECT_EQ(sortedCounts[i], records.at(i).NumInsertedBases());
+}
+
+TEST(CompareTest, NumMatchesOk)
+{
+    // create test data
+    auto records = CompareTests::makeMappedRecords();
+
+    // sanity checks on initial conditions
+    EXPECT_EQ(6, records.size());
+    EXPECT_EQ(10, records.at(0).NumMatches());
+    EXPECT_EQ(10, records.at(1).NumMatches());
+    EXPECT_EQ(6,  records.at(2).NumMatches());
+    EXPECT_EQ(10, records.at(3).NumMatches());
+    EXPECT_EQ(10, records.at(4).NumMatches());
+    EXPECT_EQ(6,  records.at(5).NumMatches());
+
+    // sort & check
+    std::sort(records.begin(), records.end(), Compare::NumMatches());
+    EXPECT_EQ(6,  records.at(0).NumMatches());
+    EXPECT_EQ(6,  records.at(1).NumMatches());
+    EXPECT_EQ(10, records.at(2).NumMatches());
+    EXPECT_EQ(10, records.at(3).NumMatches());
+    EXPECT_EQ(10, records.at(4).NumMatches());
+    EXPECT_EQ(10, records.at(5).NumMatches());
+}
+
+TEST(CompareTest, NumMismatchesOk)
+{
+    // create test data
+    auto records = CompareTests::makeMappedRecords();
+
+    // sanity checks on initial conditions
+    EXPECT_EQ(6, records.size());
+    EXPECT_EQ(0, records.at(0).NumMismatches());
+    EXPECT_EQ(0, records.at(1).NumMismatches());
+    EXPECT_EQ(2, records.at(2).NumMismatches());
+    EXPECT_EQ(0, records.at(3).NumMismatches());
+    EXPECT_EQ(0, records.at(4).NumMismatches());
+    EXPECT_EQ(2, records.at(5).NumMismatches());
+
+    // sort & check
+    std::sort(records.begin(), records.end(), Compare::NumMismatches());
+    EXPECT_EQ(0, records.at(0).NumMismatches());
+    EXPECT_EQ(0, records.at(1).NumMismatches());
+    EXPECT_EQ(0, records.at(2).NumMismatches());
+    EXPECT_EQ(0, records.at(3).NumMismatches());
+    EXPECT_EQ(2, records.at(4).NumMismatches());
+    EXPECT_EQ(2, records.at(5).NumMismatches());
+}
+
+TEST(CompareTest, QueryEndOk)
+{
+    Position q1 = 30;
+    Position q2 = 20;
+    Position q3 = 40;
+    Position q4 = 10;
+
+    auto records = std::vector<BamRecord>
+    {
+        CompareTests::makeRecordWithTag("qe", Tag(q1)),
+        CompareTests::makeRecordWithTag("qe", Tag(q2)),
+        CompareTests::makeRecordWithTag("qe", Tag(q3)),
+        CompareTests::makeRecordWithTag("qe", Tag(q4))
+    };
+    std::sort(records.begin(), records.end(), Compare::QueryEnd());
+
+    EXPECT_EQ(q4, records.at(0).QueryEnd());
+    EXPECT_EQ(q2, records.at(1).QueryEnd());
+    EXPECT_EQ(q1, records.at(2).QueryEnd());
+    EXPECT_EQ(q3, records.at(3).QueryEnd());
+}
+
+TEST(CompareTest, QueryStartOk)
+{
+    Position q1 = 30;
+    Position q2 = 20;
+    Position q3 = 40;
+    Position q4 = 10;
+
+    auto records = std::vector<BamRecord>
+    {
+        CompareTests::makeRecordWithTag("qs", Tag(q1)),
+        CompareTests::makeRecordWithTag("qs", Tag(q2)),
+        CompareTests::makeRecordWithTag("qs", Tag(q3)),
+        CompareTests::makeRecordWithTag("qs", Tag(q4))
+    };
+    std::sort(records.begin(), records.end(), Compare::QueryStart());
+
+    EXPECT_EQ(q4, records.at(0).QueryStart());
+    EXPECT_EQ(q2, records.at(1).QueryStart());
+    EXPECT_EQ(q1, records.at(2).QueryStart());
+    EXPECT_EQ(q3, records.at(3).QueryStart());
+}
+
+TEST(CompareTest, ReadGroupIdOk)
+{
+    auto rg1 = ReadGroupInfo { "foo", "SUBREAD" };
+    auto rg2 = ReadGroupInfo { "bar", "SUBREAD" };
+    auto rg3 = ReadGroupInfo { "c",   "SUBREAD" };
+    auto rg4 = ReadGroupInfo { "d",   "SUBREAD" };
+
+    BamHeader header;
+    header.AddReadGroup(rg1)
+          .AddReadGroup(rg2)
+          .AddReadGroup(rg3)
+          .AddReadGroup(rg4);
+
+    BamRecord r1(header); r1.ReadGroup(rg3); // -> 99365356
+    BamRecord r2(header); r2.ReadGroup(rg2); // -> d9f305e4
+    BamRecord r3(header); r3.ReadGroup(rg4); // -> 54397cd6
+    BamRecord r4(header); r4.ReadGroup(rg1); // -> a60ddc69
+
+    auto records = std::vector<BamRecord>{ r1, r2, r3, r4 };
+    std::sort(records.begin(), records.end(), Compare::ReadGroupId()); // lexical, NOT numeric ordering
+
+    EXPECT_EQ(r3.ReadGroupId(), records.at(0).ReadGroupId());
+    EXPECT_EQ(r1.ReadGroupId(), records.at(1).ReadGroupId());
+    EXPECT_EQ(r4.ReadGroupId(), records.at(2).ReadGroupId());
+    EXPECT_EQ(r2.ReadGroupId(), records.at(3).ReadGroupId());
+}
+
+TEST(CompareTest, ReadGroupNumericIdOk)
+{
+    auto rg1 = ReadGroupInfo { "a", "SUBREAD" };
+    auto rg2 = ReadGroupInfo { "b", "SUBREAD" };
+    auto rg3 = ReadGroupInfo { "c", "SUBREAD" };
+    auto rg4 = ReadGroupInfo { "d", "SUBREAD" };
+
+    BamHeader header;
+    header.AddReadGroup(rg1)
+          .AddReadGroup(rg2)
+          .AddReadGroup(rg3)
+          .AddReadGroup(rg4);
+
+    BamRecord r1(header); r1.ReadGroup(rg3); // -> -1724492970
+    BamRecord r2(header); r2.ReadGroup(rg2); // ->   235381373
+    BamRecord r3(header); r3.ReadGroup(rg4); // ->  1413053654
+    BamRecord r4(header); r4.ReadGroup(rg1); // ->  1153643386
+
+    auto records = std::vector<BamRecord>{ r1, r2, r3, r4 };
+    std::sort(records.begin(), records.end(), Compare::ReadGroupNumericId()); // numeric ordering
+
+    EXPECT_EQ(r1.ReadGroupNumericId(), records.at(0).ReadGroupNumericId());
+    EXPECT_EQ(r2.ReadGroupNumericId(), records.at(1).ReadGroupNumericId());
+    EXPECT_EQ(r4.ReadGroupNumericId(), records.at(2).ReadGroupNumericId());
+    EXPECT_EQ(r3.ReadGroupNumericId(), records.at(3).ReadGroupNumericId());
+}
+
+TEST(CompareTest, ReadAccuracyOk)
+{
+    Accuracy a1 = 30;
+    Accuracy a2 = 20;
+    Accuracy a3 = 40;
+    Accuracy a4 = 10;
+
+    auto records = std::vector<BamRecord>
+    {
+        CompareTests::makeRecordWithTag("rq", Tag(a1)),
+        CompareTests::makeRecordWithTag("rq", Tag(a2)),
+        CompareTests::makeRecordWithTag("rq", Tag(a3)),
+        CompareTests::makeRecordWithTag("rq", Tag(a4))
+    };
+    std::sort(records.begin(), records.end(), Compare::ReadAccuracy());
+
+    EXPECT_EQ(a4, records.at(0).ReadAccuracy());
+    EXPECT_EQ(a2, records.at(1).ReadAccuracy());
+    EXPECT_EQ(a1, records.at(2).ReadAccuracy());
+    EXPECT_EQ(a3, records.at(3).ReadAccuracy());
+}
+
+TEST(CompareTest, ReferenceEndOk)
+{
+    // create test data
+    auto records = CompareTests::makeMappedRecords();
+
+    // sanity checks on initial conditions
+    EXPECT_EQ(6, records.size());
+    EXPECT_EQ(110, records.at(0).ReferenceEnd());
+    EXPECT_EQ(113, records.at(1).ReferenceEnd());
+    EXPECT_EQ(111, records.at(2).ReferenceEnd());
+    EXPECT_EQ(110, records.at(3).ReferenceEnd());
+    EXPECT_EQ(113, records.at(4).ReferenceEnd());
+    EXPECT_EQ(111, records.at(5).ReferenceEnd());
+
+    // sort & check
+    std::sort(records.begin(), records.end(), Compare::ReferenceEnd());
+    EXPECT_EQ(110, records.at(0).ReferenceEnd());
+    EXPECT_EQ(110, records.at(1).ReferenceEnd());
+    EXPECT_EQ(111, records.at(2).ReferenceEnd());
+    EXPECT_EQ(111, records.at(3).ReferenceEnd());
+    EXPECT_EQ(113, records.at(4).ReferenceEnd());
+    EXPECT_EQ(113, records.at(5).ReferenceEnd());
+}
+
+// Compare::ReferenceId must sort records by ascending reference ID.
+TEST(CompareTest, ReferenceIdOk)
+{
+    BamRecord ref30;
+    ref30.Impl().ReferenceId(30);
+    BamRecord ref20;
+    ref20.Impl().ReferenceId(20);
+    BamRecord ref40;
+    ref40.Impl().ReferenceId(40);
+    BamRecord ref10;
+    ref10.Impl().ReferenceId(10);
+
+    // deliberately unsorted input
+    std::vector<BamRecord> records{ ref30, ref20, ref40, ref10 };
+    std::sort(records.begin(), records.end(), Compare::ReferenceId());
+
+    EXPECT_EQ(ref10.ReferenceId(), records.at(0).ReferenceId());
+    EXPECT_EQ(ref20.ReferenceId(), records.at(1).ReferenceId());
+    EXPECT_EQ(ref30.ReferenceId(), records.at(2).ReferenceId());
+    EXPECT_EQ(ref40.ReferenceId(), records.at(3).ReferenceId());
+}
+
+TEST(CompareTest, ReferenceNameOk)
+{
+    auto seq1 = SequenceInfo { "seq1" };
+    auto seq2 = SequenceInfo { "seq2" };
+    auto seq3 = SequenceInfo { "seq3" };
+    auto seq4 = SequenceInfo { "seq4" };
+
+    BamHeader header;
+    header.AddSequence(seq1)  // -> 0
+          .AddSequence(seq2)  // -> 1
+          .AddSequence(seq3)  // -> 2
+          .AddSequence(seq4); // -> 3
+
+    BamRecord r1(header); r1.Impl().SetMapped(true); r1.Impl().ReferenceId(2);
+    BamRecord r2(header); r2.Impl().SetMapped(true); r2.Impl().ReferenceId(1);
+    BamRecord r3(header); r3.Impl().SetMapped(true); r3.Impl().ReferenceId(3);
+    BamRecord r4(header); r4.Impl().SetMapped(true); r4.Impl().ReferenceId(0);
+
+    auto records = std::vector<BamRecord>{ r1, r2, r3, r4 };
+    std::sort(records.begin(), records.end(), Compare::ReferenceName());
+
+    EXPECT_EQ(seq1.Name(), records.at(0).ReferenceName());
+    EXPECT_EQ(seq2.Name(), records.at(1).ReferenceName());
+    EXPECT_EQ(seq3.Name(), records.at(2).ReferenceName());
+    EXPECT_EQ(seq4.Name(), records.at(3).ReferenceName());
+}
+
+// Compare::ReferenceStart must sort records by ascending reference start.
+TEST(CompareTest, ReferenceStartOk)
+{
+    BamRecord start30;
+    start30.Impl().Position(30);
+    BamRecord start20;
+    start20.Impl().Position(20);
+    BamRecord start40;
+    start40.Impl().Position(40);
+    BamRecord start10;
+    start10.Impl().Position(10);
+
+    // deliberately unsorted input
+    std::vector<BamRecord> records{ start30, start20, start40, start10 };
+    std::sort(records.begin(), records.end(), Compare::ReferenceStart());
+
+    EXPECT_EQ(start10.ReferenceStart(), records.at(0).ReferenceStart());
+    EXPECT_EQ(start20.ReferenceStart(), records.at(1).ReferenceStart());
+    EXPECT_EQ(start30.ReferenceStart(), records.at(2).ReferenceStart());
+    EXPECT_EQ(start40.ReferenceStart(), records.at(3).ReferenceStart());
+}
+
+TEST(CompareTest, ZmwOk)
+{
+    int32_t z1 = 30;
+    int32_t z2 = 20;
+    int32_t z3 = 40;
+    int32_t z4 = 10;
+
+    auto records = std::vector<BamRecord>
+    {
+        CompareTests::makeRecordWithTag("zm", Tag(z1)),
+        CompareTests::makeRecordWithTag("zm", Tag(z2)),
+        CompareTests::makeRecordWithTag("zm", Tag(z3)),
+        CompareTests::makeRecordWithTag("zm", Tag(z4))
+    };
+    std::sort(records.begin(), records.end(), Compare::Zmw());
+
+    EXPECT_EQ(z4, records.at(0).HoleNumber());
+    EXPECT_EQ(z2, records.at(1).HoleNumber());
+    EXPECT_EQ(z1, records.at(2).HoleNumber());
+    EXPECT_EQ(z3, records.at(3).HoleNumber());
+}
+
+// clang-format on
diff --git a/tests/src/test_DataSetCore.cpp b/tests/src/test_DataSetCore.cpp

new file mode 100644 (file)

index 0000000..edc444e
--- /dev/null
+++ b/tests/src/test_DataSetCore.cpp
@@ -0,0 +1,569 @@
+// Author: Derek Barnett
+
+#include <cstddef>
+#include <string>
+
+#include <gtest/gtest.h>
+
+#include "../src/FileUtils.h"
+#include "PbbamTestData.h"
+
+#include <pbbam/DataSet.h>
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+namespace DataSetCoreTests {
+
+// Path of the SubreadSet XML fixture from which the BiosamplesFromXML test
+// reads its BioSample metadata.
+const std::string subreadsetBioSample{PbbamTestsConfig::Data_Dir +
+                                      "/dataset/biosample.subreadset.xml"};
+
+// Returns, by value, a freshly constructed DataSet whose name is "foo".
+static inline DataSet CreateDataSet()
+{
+    DataSet named;
+    named.Name("foo");
+    return named;
+}
+
+}  // namespace DataSetCoreTests
+
+// internal::XmlName must split a qualified XML name into prefix and local
+// name at the colon, and treat names without a usable prefix as bare names.
+TEST(DataSetCoreTest, XmlNameParts)
+{
+    using boost::string_ref;
+
+    // "prefix:local" form
+    {
+        internal::XmlName qualified("ns:node_name");
+        EXPECT_EQ(string_ref("ns"), qualified.Prefix());
+        EXPECT_EQ(string_ref("node_name"), qualified.LocalName());
+        EXPECT_EQ(string_ref("ns:node_name"), qualified.QualifiedName());
+    }
+
+    // no colon at all: empty prefix, whole string is the local name
+    {
+        internal::XmlName bare("node_name");
+        EXPECT_EQ(string_ref(""), bare.Prefix());
+        EXPECT_EQ(string_ref("node_name"), bare.LocalName());
+        EXPECT_EQ(string_ref("node_name"), bare.QualifiedName());
+    }
+
+    // leading colon: empty prefix, the colon stays part of the local name
+    {
+        internal::XmlName colonFirst(":node_name");
+        EXPECT_EQ(string_ref(""), colonFirst.Prefix());
+        EXPECT_EQ(string_ref(":node_name"), colonFirst.LocalName());
+        EXPECT_EQ(string_ref(":node_name"), colonFirst.QualifiedName());
+    }
+}
+
+// A default-constructed DataSet must be GENERIC, carry non-empty generated
+// attributes, and leave all user-supplied attributes and child lists empty.
+TEST(DataSetCoreTest, DefaultsOk)
+{
+    DataSet dataset;
+
+    // type & version
+    EXPECT_EQ(DataSet::GENERIC, dataset.Type());
+    EXPECT_FALSE(dataset.Version().empty());
+    EXPECT_EQ(std::string{"3.0.1"}, dataset.Version());
+
+    // attributes that are already populated after construction
+    EXPECT_FALSE(dataset.CreatedAt().empty());
+    EXPECT_FALSE(dataset.MetaType().empty());
+    EXPECT_FALSE(dataset.UniqueId().empty());
+    EXPECT_FALSE(dataset.TimeStampedName().empty());
+    EXPECT_EQ(0, dataset.TimeStampedName().find("pacbio_dataset_"));
+
+    // attributes that start out empty
+    EXPECT_TRUE(dataset.Format().empty());
+    EXPECT_TRUE(dataset.ModifiedAt().empty());
+    EXPECT_TRUE(dataset.Name().empty());
+    EXPECT_TRUE(dataset.ResourceId().empty());
+    EXPECT_TRUE(dataset.Tags().empty());
+
+    // child lists that start out empty
+    EXPECT_EQ(0, dataset.ExternalResources().Size());
+    EXPECT_EQ(0, dataset.Filters().Size());
+    EXPECT_EQ(0, dataset.SubDataSets().Size());
+}
+
+TEST(DataSetCoreTest, TimeStampedNamesOk)
+{
+    DataSet dataset;
+    AlignmentSet alignmentSet;
+    BarcodeSet barcodeSet;
+    ContigSet contigSet;
+    ConsensusAlignmentSet consensusAlignmentSet;
+    ConsensusReadSet consensusReadSet;
+    HdfSubreadSet hdfSubreadSet;
+    ReferenceSet referenceSet;
+    SubreadSet subreadSet;
+    TranscriptSet transcriptSet;
+
+    EXPECT_EQ(0, dataset.TimeStampedName().find("pacbio_dataset_dataset-"));
+    EXPECT_EQ(0, alignmentSet.TimeStampedName().find("pacbio_dataset_alignmentset-"));
+    EXPECT_EQ(0, barcodeSet.TimeStampedName().find("pacbio_dataset_barcodeset-"));
+    EXPECT_EQ(0, contigSet.TimeStampedName().find("pacbio_dataset_contigset-"));
+    EXPECT_EQ(
+        0, consensusAlignmentSet.TimeStampedName().find("pacbio_dataset_consensusalignmentset-"));
+    EXPECT_EQ(0, consensusReadSet.TimeStampedName().find("pacbio_dataset_consensusreadset-"));
+    EXPECT_EQ(0, hdfSubreadSet.TimeStampedName().find("pacbio_dataset_hdfsubreadset-"));
+    EXPECT_EQ(0, referenceSet.TimeStampedName().find("pacbio_dataset_referenceset-"));
+    EXPECT_EQ(0, subreadSet.TimeStampedName().find("pacbio_dataset_subreadset-"));
+    EXPECT_EQ(0, transcriptSet.TimeStampedName().find("pacbio_dataset_transcriptset-"));
+}
+
+TEST(DataSetCoreTest, BasicGettersSettersOk)
+{
+    DataSet dataset;
+    dataset.CreatedAt("now");
+    dataset.Format("format");
+    dataset.MetaType("meta");
+    dataset.ModifiedAt("later");
+    dataset.Name("foo");
+    dataset.ResourceId("path/to/file");
+    dataset.Tags("tag tag");
+    dataset.TimeStampedName("now:30");
+    dataset.UniqueId("uuid");
+    dataset.Version("0.0.0");
+
+    EXPECT_EQ(std::string("now"), dataset.CreatedAt());
+    EXPECT_EQ(std::string("format"), dataset.Format());
+    EXPECT_EQ(std::string("meta"), dataset.MetaType());
+    EXPECT_EQ(std::string("later"), dataset.ModifiedAt());
+    EXPECT_EQ(std::string("foo"), dataset.Name());
+    EXPECT_EQ(std::string("path/to/file"), dataset.ResourceId());
+    EXPECT_EQ(std::string("tag tag"), dataset.Tags());
+    EXPECT_EQ(std::string("now:30"), dataset.TimeStampedName());
+    EXPECT_EQ(std::string("uuid"), dataset.UniqueId());
+    EXPECT_EQ(std::string("0.0.0"), dataset.Version());
+}
+
+// Copy construction and copy assignment must both carry the source's name
+// over to the destination.
+TEST(DataSetCoreTest, CopyOk)
+{
+    const std::string expectedName("foo");
+
+    DataSet source;
+    source.Name("foo");
+
+    // copy ctor
+    DataSet viaCtor(source);
+    EXPECT_EQ(expectedName, viaCtor.Name());
+
+    // copy assignment
+    DataSet viaAssignment;
+    viaAssignment = source;
+    EXPECT_EQ(expectedName, viaAssignment.Name());
+}
+
+// Move construction and move assignment from a temporary DataSet must both
+// carry the source's name ("foo", set by DataSetCoreTests::CreateDataSet) over
+// to the destination.
+TEST(DataSetCoreTest, MoveOk)
+{
+// move ctor
+// (std::move on a temporary is deliberate here, so clang's
+// -Wpessimizing-move diagnostic is silenced around it)
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wpessimizing-move"
+#endif
+    DataSet d2(std::move(DataSetCoreTests::CreateDataSet()));
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+    EXPECT_EQ(std::string("foo"), d2.Name());
+
+    // move assignment
+    DataSet d3;
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wpessimizing-move"
+#endif
+    d3 = std::move(DataSetCoreTests::CreateDataSet());
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+    EXPECT_EQ(std::string("foo"), d3.Name());
+}
+
+TEST(DataSetCoreTest, AddExternalResources)
+{
+    DataSet dataset;
+    EXPECT_EQ(0, dataset.ExternalResources().Size());
+
+    ExternalResource resource1("metatype", "id");
+    resource1.Name("file1");
+
+    ExternalResource resource2("metatype", "id2");
+    resource2.Name("file2");
+
+    dataset.ExternalResources().Add(resource1);
+    dataset.ExternalResources().Add(resource2);
+    EXPECT_EQ(2, dataset.ExternalResources().Size());
+
+    // disallow duplicates (checking on ResourceId)
+    ExternalResource duplicateResource("metatype", "id");
+    dataset.ExternalResources().Add(duplicateResource);
+    EXPECT_EQ(2, dataset.ExternalResources().Size());
+
+    // direct access
+    const ExternalResources& resources = dataset.ExternalResources();
+    EXPECT_EQ(std::string("file1"), resources[0].Name());
+    EXPECT_EQ(std::string("file2"), resources[1].Name());
+
+    // iterable
+    size_t i = 0;
+    for (auto r : resources) {
+        if (i == 0)
+            EXPECT_EQ(std::string("file1"), r.Name());
+        else
+            EXPECT_EQ(std::string("file2"), r.Name());
+        ++i;
+    }
+}
+
+// A resource stored in a dataset must be editable in place through indexed
+// access, without affecting its neighbour.
+TEST(DataSetCoreTest, EditExternalResources)
+{
+    DataSet dataset;
+
+    // add the same local object twice, changing its name and ID in between
+    ExternalResource resource("metatype", "id");
+    resource.Name("file1");
+    dataset.ExternalResources().Add(resource);
+
+    resource.Name("file2").ResourceId("id2");
+    dataset.ExternalResources().Add(resource);
+    EXPECT_EQ(2, dataset.ExternalResources().Size());
+
+    // edit
+    ExternalResources& stored = dataset.ExternalResources();
+    stored[0].Name("some new name");
+    EXPECT_EQ(std::string("some new name"), stored[0].Name());
+    EXPECT_EQ(std::string("file2"), stored[1].Name());
+}
+
+// An ExternalResource must be able to hold child external resources and
+// return them in insertion order with their meta type and resource ID intact.
+TEST(DataSetCoreTest, NestedExternalResources)
+{
+    const ExternalResource firstChild("metatype.child", "filename.child");
+    const ExternalResource secondChild("metatype.child2", "filename.child2");
+
+    ExternalResource parent("metatype", "filename");
+    parent.ExternalResources().Add(firstChild);
+    parent.ExternalResources().Add(secondChild);
+
+    const ExternalResources& children = parent.ExternalResources();
+    EXPECT_EQ(2, children.Size());
+
+    // first child
+    EXPECT_EQ(std::string("metatype.child"), children[0].MetaType());
+    EXPECT_EQ(std::string("filename.child"), children[0].ResourceId());
+
+    // second child
+    EXPECT_EQ(std::string("metatype.child2"), children[1].MetaType());
+    EXPECT_EQ(std::string("filename.child2"), children[1].ResourceId());
+}
+
+TEST(DataSetCoreTest, AddFilters)
+{
+    DataSet dataset;
+    EXPECT_EQ(0, dataset.Filters().Size());
+
+    Filter filter;
+    filter.Properties().Add(Property("rq", "0.85", ">"));
+    filter.Properties().Add(Property("RNAME", "chr1", "=="));
+    EXPECT_EQ(2, filter.Properties().Size());
+
+    Filter filter2;
+    filter2.Properties().Add(Property("rq", "0.50", ">="));
+    filter2.Properties().Add(Property("RNAME", "chr2", "!="));
+    EXPECT_EQ(2, filter2.Properties().Size());
+
+    dataset.Filters().Add(filter);
+    dataset.Filters().Add(filter2);
+
+    const Filters& filters = dataset.Filters();
+    EXPECT_EQ(2, filters.Size());
+    EXPECT_EQ(2, filters[0].Properties().Size());
+    EXPECT_EQ(2, filters[1].Properties().Size());
+
+    // direct access
+    const Property& p0 = filters[0].Properties()[0];
+    EXPECT_EQ(std::string("rq"), p0.Name());
+    EXPECT_EQ(std::string("0.85"), p0.Value());
+    EXPECT_EQ(std::string(">"), p0.Operator());
+
+    const Property& p1 = filters[0].Properties()[1];
+    EXPECT_EQ(std::string("RNAME"), p1.Name());
+    EXPECT_EQ(std::string("chr1"), p1.Value());
+    EXPECT_EQ(std::string("=="), p1.Operator());
+
+    const Property& p2 = filters[1].Properties()[0];
+    EXPECT_EQ(std::string("rq"), p2.Name());
+    EXPECT_EQ(std::string("0.50"), p2.Value());
+    EXPECT_EQ(std::string(">="), p2.Operator());
+
+    const Property& p3 = filters[1].Properties()[1];
+    EXPECT_EQ(std::string("RNAME"), p3.Name());
+    EXPECT_EQ(std::string("chr2"), p3.Value());
+    EXPECT_EQ(std::string("!="), p3.Operator());
+
+    // iteratable
+    size_t i = 0;
+    size_t j = 0;
+    for (const Filter& f : filters) {
+        if (i == 0) {
+            const Properties& properties = f.Properties();
+            for (const Property& p : properties) {
+                if (j == 0) {
+                    EXPECT_EQ(std::string("rq"), p.Name());
+                    EXPECT_EQ(std::string("0.85"), p.Value());
+                    EXPECT_EQ(std::string(">"), p.Operator());
+                } else {
+                    EXPECT_EQ(std::string("RNAME"), p.Name());
+                    EXPECT_EQ(std::string("chr1"), p.Value());
+                    EXPECT_EQ(std::string("=="), p.Operator());
+                }
+                ++j;
+            }
+        } else {
+            const Properties& properties = f.Properties();
+            for (const Property& p : properties) {
+                if (j == 0) {
+                    EXPECT_EQ(std::string("rq"), p.Name());
+                    EXPECT_EQ(std::string("0.50"), p.Value());
+                    EXPECT_EQ(std::string(">="), p.Operator());
+                } else {
+                    EXPECT_EQ(std::string("RNAME"), p.Name());
+                    EXPECT_EQ(std::string("chr2"), p.Value());
+                    EXPECT_EQ(std::string("!="), p.Operator());
+                }
+                ++j;
+            }
+        }
+        ++i;
+        j = 0;
+    }
+}
+
+TEST(DataSetCoreTest, EditFilters)
+{
+    DataSet dataset;
+    EXPECT_EQ(0, dataset.Filters().Size());
+
+    Filter filter;
+    filter.Properties().Add(Property("rq", "0.85", ">"));
+    filter.Properties().Add(Property("RNAME", "chr1", "=="));
+    EXPECT_EQ(2, filter.Properties().Size());
+
+    Filter filter2;
+    filter2.Properties().Add(Property("rq", "0.50", ">="));
+    filter2.Properties().Add(Property("RNAME", "chr2", "!="));
+    EXPECT_EQ(2, filter2.Properties().Size());
+
+    dataset.Filters().Add(filter);
+    dataset.Filters().Add(filter2);
+    EXPECT_EQ(2, dataset.Filters().Size());
+    EXPECT_EQ(2, dataset.Filters()[0].Properties().Size());
+    EXPECT_EQ(2, dataset.Filters()[1].Properties().Size());
+
+    // edit property in-place
+    Property& p = dataset.Filters()[0].Properties()[0];
+    p.Name("someNewName");
+    p.Value("someNewValue");
+    p.Operator("==");
+
+    const Property& p0 = dataset.Filters()[0].Properties()[0];
+    EXPECT_EQ(std::string("someNewName"), p0.Name());
+    EXPECT_EQ(std::string("someNewValue"), p0.Value());
+    EXPECT_EQ(std::string("=="), p0.Operator());
+
+    const Property& p1 = dataset.Filters()[0].Properties()[1];
+    EXPECT_EQ(std::string("RNAME"), p1.Name());
+    EXPECT_EQ(std::string("chr1"), p1.Value());
+    EXPECT_EQ(std::string("=="), p1.Operator());
+
+    const Property& p2 = dataset.Filters()[1].Properties()[0];
+    EXPECT_EQ(std::string("rq"), p2.Name());
+    EXPECT_EQ(std::string("0.50"), p2.Value());
+    EXPECT_EQ(std::string(">="), p2.Operator());
+
+    const Property& p3 = dataset.Filters()[1].Properties()[1];
+    EXPECT_EQ(std::string("RNAME"), p3.Name());
+    EXPECT_EQ(std::string("chr2"), p3.Value());
+    EXPECT_EQ(std::string("!="), p3.Operator());
+}
+
+TEST(DataSetCoreTest, AddSubDataSets)
+{
+    DataSet dataset;
+    EXPECT_EQ(0, dataset.SubDataSets().Size());
+
+    DataSetBase sub1;
+    sub1.Name("subset_1");
+
+    DataSetBase sub2;
+    sub2.Name("subset_2");
+
+    dataset.SubDataSets().Add(sub1);
+    dataset.SubDataSets().Add(sub2);
+    EXPECT_EQ(2, dataset.SubDataSets().Size());
+
+    // direct access
+    const SubDataSets& subdatasets = dataset.SubDataSets();
+    EXPECT_EQ(std::string("subset_1"), subdatasets[0].Name());
+    EXPECT_EQ(std::string("subset_2"), subdatasets[1].Name());
+
+    // iterable
+    size_t i = 0;
+    for (const DataSetBase& ds : subdatasets) {
+        if (i == 0)
+            EXPECT_EQ(std::string("subset_1"), ds.Name());
+        else
+            EXPECT_EQ(std::string("subset_2"), ds.Name());
+        ++i;
+    }
+}
+
+TEST(DataSetCoreTest, EditSubDataSets)
+{
+    DataSet dataset;
+    EXPECT_EQ(0, dataset.SubDataSets().Size());
+
+    DataSetBase sub1;
+    sub1.Name("subset_1");
+
+    DataSetBase sub2;
+    sub2.Name("subset_2");
+
+    dataset.SubDataSets().Add(sub1);
+    dataset.SubDataSets().Add(sub2);
+    EXPECT_EQ(2, dataset.SubDataSets().Size());
+
+    // edit
+    dataset.SubDataSets()[0].Name("subset_1_edited");
+
+    // direct access
+    const SubDataSets& subdatasets = dataset.SubDataSets();
+    EXPECT_EQ(std::string("subset_1_edited"), subdatasets[0].Name());
+    EXPECT_EQ(std::string("subset_2"), subdatasets[1].Name());
+
+    // iterable
+    size_t i = 0;
+    for (const DataSetBase& ds : subdatasets) {
+        if (i == 0)
+            EXPECT_EQ(std::string("subset_1_edited"), ds.Name());
+        else
+            EXPECT_EQ(std::string("subset_2"), ds.Name());
+        ++i;
+    }
+}
+
+TEST(DataSetCoreTest, RemoveExternalResources)
+{
+    DataSet dataset;
+    EXPECT_EQ(0, dataset.ExternalResources().Size());
+
+    ExternalResource resource1("metatype", "id");
+    resource1.Name("file1");
+
+    ExternalResource resource2("metatype", "id2");
+    resource2.Name("file2");
+
+    dataset.ExternalResources().Add(resource1);
+    dataset.ExternalResources().Add(resource2);
+    EXPECT_EQ(2, dataset.ExternalResources().Size());
+
+    // remove
+    dataset.ExternalResources().Remove(resource1);
+    EXPECT_EQ(1, dataset.ExternalResources().Size());
+
+    // direct access
+    const ExternalResources& resources = dataset.ExternalResources();
+    EXPECT_EQ(std::string("file2"), resources[0].Name());
+
+    // iterable
+    size_t i = 0;
+    for (auto r : resources) {
+        if (i == 0) {
+            EXPECT_EQ(std::string("file2"), r.Name());
+        }
+        ++i;
+    }
+}
+
+TEST(DataSetCoreTest, RemoveFilters)
+{
+    DataSet dataset;
+    EXPECT_EQ(0, dataset.Filters().Size());
+
+    Filter filter;
+    filter.Properties().Add(Property("rq", "0.85", ">"));
+    filter.Properties().Add(Property("RNAME", "chr1", "=="));
+    EXPECT_EQ(2, filter.Properties().Size());
+
+    Filter filter2;
+    filter2.Properties().Add(Property("rq", "0.50", ">="));
+    filter2.Properties().Add(Property("RNAME", "chr2", "!="));
+    EXPECT_EQ(2, filter2.Properties().Size());
+
+    dataset.Filters().Add(filter);
+    dataset.Filters().Add(filter2);
+    EXPECT_EQ(2, dataset.Filters().Size());
+
+    // remove
+    dataset.Filters().Remove(filter);
+    EXPECT_EQ(1, dataset.Filters().Size());
+
+    const Filters& filters = dataset.Filters();
+    EXPECT_EQ(2, filters[0].Properties().Size());
+}
+
+// Removing a sub-dataset must shrink the dataset's sub-dataset list by one.
+TEST(DataSetCoreTest, RemoveSubDataSets)
+{
+    DataSet dataset;
+    EXPECT_EQ(0, dataset.SubDataSets().Size());
+
+    DataSetBase kept;
+    kept.Name("subset_1");
+
+    DataSetBase removed;
+    removed.Name("subset_2");
+
+    dataset.SubDataSets().Add(kept);
+    dataset.SubDataSets().Add(removed);
+    EXPECT_EQ(2, dataset.SubDataSets().Size());
+
+    // remove
+    dataset.SubDataSets().Remove(removed);
+    EXPECT_EQ(1, dataset.SubDataSets().Size());
+}
+
+// Both a generic DataSet and a ReferenceSet must have a non-empty CreatedAt
+// attribute right after default construction.
+TEST(DataSetCoreTest, EnsureCreatedAtAttribute)
+{
+    DataSet genericSet;
+    EXPECT_FALSE(genericSet.CreatedAt().empty());
+
+    ReferenceSet referenceSet;
+    EXPECT_FALSE(referenceSet.CreatedAt().empty());
+}
+
+TEST(DataSetCoreTest, BiosamplesOk)
+{
+    const std::string barcode_1_1{"lbc1--lbc1"};
+    const std::string barcode_1_2{"lbc1--lbc2"};
+    const std::string barcode_2_1{"lbc2--lbc1"};
+    const std::string barcode_2_2{"lbc2--lbc2"};
+
+    BioSample alice{"Alice"};
+    alice.DNABarcodes().Add(barcode_1_1);
+    alice.DNABarcodes().Add(barcode_1_2);
+
+    EXPECT_EQ("Alice", alice.Name());
+    ASSERT_EQ(2, alice.DNABarcodes().Size());
+    EXPECT_EQ(barcode_1_1, alice.DNABarcodes()[0].Name());
+    EXPECT_EQ(barcode_1_2, alice.DNABarcodes()[1].Name());
+    EXPECT_FALSE(alice.DNABarcodes()[0].UniqueId().empty());
+    EXPECT_FALSE(alice.DNABarcodes()[1].UniqueId().empty());
+
+    BioSample bob{"Bob"};
+    bob.DNABarcodes().Add(barcode_2_1);
+    bob.DNABarcodes().Add(DNABarcode{barcode_2_2, "explicit_uuid"});
+
+    EXPECT_EQ("Bob", bob.Name());
+    ASSERT_EQ(2, bob.DNABarcodes().Size());
+    EXPECT_EQ(barcode_2_1, bob.DNABarcodes()[0].Name());
+    EXPECT_EQ(barcode_2_2, bob.DNABarcodes()[1].Name());
+    EXPECT_FALSE(bob.DNABarcodes()[0].UniqueId().empty());
+    EXPECT_EQ("explicit_uuid", bob.DNABarcodes()[1].UniqueId());
+
+    DataSet dataset;
+    DataSetMetadata& metadata = dataset.Metadata();
+    EXPECT_EQ(0, metadata.BioSamples().Size());
+
+    metadata.BioSamples().Add(alice);
+    metadata.BioSamples().Add(bob);
+
+    ASSERT_EQ(2, metadata.BioSamples().Size());
+    EXPECT_EQ("Alice", metadata.BioSamples()[0].Name());
+    EXPECT_EQ("Bob", metadata.BioSamples()[1].Name());
+}
+
+// Loading the biosample SubreadSet XML fixture must produce exactly one
+// BioSample, named "test test".
+TEST(DataSetCoreTest, BiosamplesFromXML)
+{
+    BAM::DataSet loaded{DataSetCoreTests::subreadsetBioSample};
+
+    const auto& loadedMetadata = loaded.Metadata();
+    const auto& loadedSamples = loadedMetadata.BioSamples();
+
+    // fatal: the name check below indexes the first sample
+    ASSERT_EQ(1, loadedSamples.Size());
+    EXPECT_EQ("test test", loadedSamples[0].Name());
+}
diff --git a/tests/src/test_DataSetIO.cpp b/tests/src/test_DataSetIO.cpp

new file mode 100644 (file)

index 0000000..3d805d8
--- /dev/null
+++ b/tests/src/test_DataSetIO.cpp
@@ -0,0 +1,1754 @@
+// Author: Derek Barnett
+
+#include <unistd.h>
+#include <cstddef>
+#include <fstream>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include "../src/FileUtils.h"
+#include "PbbamTestData.h"
+
+#include <pbbam/DataSet.h>
+#include <pbbam/internal/DataSetElement.h>
+
+// clang-format off
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+namespace DataSetIOTests {
+
+const std::string alignedBamFn  = PbbamTestsConfig::Data_Dir + "/aligned.bam";
+const std::string bamGroupFofn  = PbbamTestsConfig::Generated_Dir + "/group.fofn";
+
+const std::string ali1XmlFn = PbbamTestsConfig::Data_Dir + "/dataset/ali1.xml";
+const std::string ali2XmlFn = PbbamTestsConfig::Data_Dir + "/dataset/ali2.xml";
+const std::string ali3XmlFn = PbbamTestsConfig::Data_Dir + "/dataset/ali3.xml";
+const std::string ali4XmlFn = PbbamTestsConfig::Data_Dir + "/dataset/ali4.xml";
+const std::string mappingStaggeredXmlFn = PbbamTestsConfig::Data_Dir + "/dataset/bam_mapping_staggered.xml";
+const std::string barcodeXmlFn = PbbamTestsConfig::Data_Dir + "/dataset/barcode.dataset.xml";
+const std::string ccsReadXmlFn = PbbamTestsConfig::Data_Dir + "/dataset/ccsread.dataset.xml";
+const std::string lambdaContigsXmlFn = PbbamTestsConfig::Data_Dir + "/dataset/lambda_contigs.xml";
+const std::string pbalchemyXmlFn   = PbbamTestsConfig::Data_Dir + "/dataset/pbalchemy10kbp.xml";
+const std::string referenceXmlFn   = PbbamTestsConfig::Data_Dir + "/dataset/reference.dataset.xml";
+const std::string subread1XmlFn    = PbbamTestsConfig::Data_Dir + "/dataset/subread_dataset1.xml";
+const std::string subread2XmlFn    = PbbamTestsConfig::Data_Dir + "/dataset/subread_dataset2.xml";
+const std::string subread3XmlFn    = PbbamTestsConfig::Data_Dir + "/dataset/subread_dataset3.xml";
+const std::string transformedXmlFn = PbbamTestsConfig::Data_Dir + "/dataset/transformed_rs_subread_dataset.xml";
+
+static void TestFromXmlString();
+static void TestAli1Xml();
+static void TestAli2Xml();
+static void TestAli3Xml();
+static void TestAli4Xml();
+static void TestMappingStaggeredXml();
+static void TestBarcodeXml();
+static void TestCcsReadXml();
+static void TestLambdaContigsXml();
+static void TestPbalchemyXml();
+static void TestReferenceXml();
+static void TestSubread1Xml();
+static void TestSubread2Xml();
+static void TestSubread3Xml();
+static void TestTransformedXml();
+
+static inline
+void changeCurrentDirectory(const std::string& dir)
+{ ASSERT_EQ(0, chdir(dir.c_str())); }
+
+} // namespace DataSetIOTests
+
+TEST(DataSetIOTest, FromBamFilename)
+{
+    // A DataSet built from one BAM path must report that path as its sole external resource.
+    DataSet dataset(DataSetIOTests::alignedBamFn);
+    // Fatal on mismatch: the [0] access below must never be reached with an empty collection.
+    ASSERT_EQ(1, dataset.ExternalResources().Size());
+    const ExternalResource& bamRef = dataset.ExternalResources()[0];
+    EXPECT_EQ(DataSetIOTests::alignedBamFn, bamRef.ResourceId());
+}
+
+TEST(DataSetIOTest, FromBamFilenames)
+{
+    std::ifstream fofn(DataSetIOTests::bamGroupFofn);
+    ASSERT_TRUE(fofn.is_open());  // stop with a clear failure if the FOFN cannot be opened
+    std::vector<std::string> files;  // non-empty FOFN lines, handed to DataSet as a filename list
+    for (std::string file; std::getline(fofn, file);) if (!file.empty()) files.emplace_back(file);
+    DataSet dataset(files);
+    EXPECT_EQ(3, dataset.ExternalResources().Size());
+}
+
+TEST(DataSetIOTest, FromBamFileObject)
+{
+    // The path handed to DataSet is read back from a BamFile constructed on the test BAM.
+    BamFile bamFile(DataSetIOTests::alignedBamFn);
+    DataSet dataset(bamFile.Filename());
+    // Fatal on mismatch: the [0] access below must never be reached with an empty collection.
+    ASSERT_EQ(1, dataset.ExternalResources().Size());
+    const ExternalResource& bamRef = dataset.ExternalResources()[0];
+    EXPECT_EQ(DataSetIOTests::alignedBamFn, bamRef.ResourceId());
+}
+
+TEST(DataSetIOTest, FromFofn)
+{
+    DataSet fofnDataSet{DataSetIOTests::bamGroupFofn};  // constructed from the FOFN path itself
+    EXPECT_EQ(3, fofnDataSet.ExternalResources().Size());
+}
+
+TEST(DataSetIOTest, FromXml)  // the field checks themselves live in TestFromXmlString()
+{
+    EXPECT_NO_THROW(DataSetIOTests::TestFromXmlString());
+}
+
+TEST(DataSetIOTest, FromXmlFile)  // one helper per XML fixture; each must finish without throwing
+{
+    EXPECT_NO_THROW(DataSetIOTests::TestAli1Xml());  // EXPECT_ is non-fatal: later helpers still run
+    EXPECT_NO_THROW(DataSetIOTests::TestAli2Xml());
+    EXPECT_NO_THROW(DataSetIOTests::TestAli3Xml());
+    EXPECT_NO_THROW(DataSetIOTests::TestAli4Xml());
+    EXPECT_NO_THROW(DataSetIOTests::TestMappingStaggeredXml());
+    EXPECT_NO_THROW(DataSetIOTests::TestBarcodeXml());
+    EXPECT_NO_THROW(DataSetIOTests::TestCcsReadXml());
+    EXPECT_NO_THROW(DataSetIOTests::TestLambdaContigsXml());
+    EXPECT_NO_THROW(DataSetIOTests::TestPbalchemyXml());
+    EXPECT_NO_THROW(DataSetIOTests::TestReferenceXml());
+    EXPECT_NO_THROW(DataSetIOTests::TestSubread1Xml());
+    EXPECT_NO_THROW(DataSetIOTests::TestSubread2Xml());
+    EXPECT_NO_THROW(DataSetIOTests::TestSubread3Xml());
+    EXPECT_NO_THROW(DataSetIOTests::TestTransformedXml());
+}
+
+TEST(DataSetIOTest, ThrowsOnNonexistentFofnFile)
+{
+    // Constructing from a missing .fofn path must throw std::runtime_error with this exact text.
+    bool checkedException = false;
+    try
+    {
+        auto ds = DataSet{"does/not/exist.fofn"};
+    }
+    catch (const std::runtime_error& e)  // read-only handler, so catch by const reference
+    {
+        EXPECT_STREQ("DataSet: could not open FOFN for reading: does/not/exist.fofn", e.what());
+        checkedException = true;
+    }
+    EXPECT_TRUE(checkedException);  // fails when no std::runtime_error was thrown at all
+}
+
+TEST(DataSetIOTest, ThrowsOnNonexistentXmlFile)
+{
+    // Constructing from a missing .xml path must throw std::runtime_error with this exact text.
+    bool checkedException = false;
+    try
+    {
+        auto ds = DataSet{"does/not/exist.xml"};
+    }
+    catch (const std::runtime_error& e)  // read-only handler, so catch by const reference
+    {
+        EXPECT_STREQ("DataSet: could not open XML file for reading: does/not/exist.xml", e.what());
+        checkedException = true;
+    }
+    EXPECT_TRUE(checkedException);  // fails when no std::runtime_error was thrown at all
+}
+
+TEST(DataSetIOTest, ThrowsOnUnsupportedExtension)
+{
+    // A path with an extension DataSet rejects must throw std::runtime_error with this exact text.
+    bool checkedException = false;
+    try
+    {
+        auto ds = DataSet{"bad/extension.foo"};
+    }
+    catch (const std::runtime_error& e)  // read-only handler, so catch by const reference
+    {
+        EXPECT_STREQ("DataSet: unsupported extension on input file: bad/extension.foo", e.what());
+        checkedException = true;
+    }
+    EXPECT_TRUE(checkedException);  // fails when no std::runtime_error was thrown at all
+}
+
+TEST(DataSetIOTest, ThrowsIfCannotOpenSaveFile)
+{
+    bool checkedException = false;
+    try
+    {
+        auto ds = DataSet{};
+        std::string fn = "fake_directory_that_should_not_exist/out.xml";
+        ds.Save(fn);
+    }
+    catch(std::runtime_error& e)
+    {
+        const std::string msg = "DataSet: could not open XML file for writing: fake_directory_that_should_not_exist/out.xml";
+        EXPECT_EQ(msg, e.what()) ;
+        checkedException = true;
+    }
+    EXPECT_TRUE(checkedException);
+}
+
+TEST(DataSetIOTest, ToXml)
+{
+    // Builds an AlignmentSet in code and checks its exact XML output. Top-level data first:
+    DataSet dataset(DataSet::ALIGNMENT);
+    dataset.CreatedAt("2015-01-27T09:00:01");
+    dataset.MetaType("PacBio.DataSet.AlignmentSet");
+    dataset.Name("DataSet_AlignmentSet");
+    dataset.Tags("barcode moreTags mapping mytags");
+    dataset.TimeStampedName("my_tsn");
+    dataset.UniqueId("b095d0a3-94b8-4918-b3af-a3f81bbe519c");
+    dataset.Attribute("xmlns",              "http://pacificbiosciences.com/PacBioDatasets.xsd")
+           .Attribute("xmlns:xsi",          "http://www.w3.org/2001/XMLSchema-instance")
+           .Attribute("xsi:schemaLocation", "http://pacificbiosciences.com/PacBioDatasets.xsd");
+
+    // external resources: the first uses absolute paths and carries one .pbi file index
+    ExternalResource resource1("AlignmentFile.AlignmentBamFile", "/mnt/path/to/alignments2.bam");
+    resource1.Name("Third Alignments BAM");
+    resource1.CreatedAt("2015-01-27T09:00:01");
+    resource1.Description("Points to an example Alignments BAM file.");
+    resource1.Tags("Example");
+    resource1.TimeStampedName("my_tsn");
+    resource1.UniqueId("my_uuid");
+    FileIndex pbi1("PacBio.Index.PacBioIndex", "/mnt/path/to/alignments2.pbi");
+    pbi1.CreatedAt("2015-01-27T09:00:01");
+    pbi1.TimeStampedName("my_tsn");
+    pbi1.UniqueId("my_uuid");
+    resource1.FileIndices().Add(pbi1);
+    dataset.ExternalResources().Add(resource1);
+
+    ExternalResource resource2("AlignmentFile.AlignmentBamFile", "./alignments3.bam");
+    resource2.CreatedAt("2015-01-27T09:00:01");
+    resource2.Name("Fourth Alignments BAM");
+    resource2.Description("Points to another example Alignments BAM file, by relative path.");
+    resource2.Tags("Example");
+    resource2.TimeStampedName("my_tsn");
+    resource2.UniqueId("my_uuid");
+    FileIndex pbi2("PacBio.Index.PacBioIndex", "./alignments3.pbi");
+    pbi2.CreatedAt("2015-01-27T09:00:01");
+    pbi2.TimeStampedName("my_tsn");
+    pbi2.UniqueId("my_uuid");
+    // The relative "./" paths above are expected back as dataset.Path() + "/..." in the XML below.
+    resource2.FileIndices().Add(pbi2);
+    dataset.ExternalResources().Add(resource2);
+
+    // sub-datasets: two of them, each given one filter holding a single property
+    DataSetBase subDataSet1;
+    subDataSet1.CreatedAt("2015-01-27T09:00:01");
+    subDataSet1.Name("HighQuality Read Alignments");
+    subDataSet1.TimeStampedName("my_tsn");
+    subDataSet1.UniqueId("ab95d0a3-94b8-4918-b3af-a3f81bbe519c");
+    Filter filter1;
+    filter1.Properties().Add(Property("rq", "0.85", ">"));
+    subDataSet1.Filters().Add(filter1);
+    dataset.SubDataSets().Add(subDataSet1);
+
+    DataSetBase subDataSet2;
+    subDataSet2.CreatedAt("2015-01-27T09:00:01");
+    subDataSet2.Name("Alignments to chromosome 1");
+    subDataSet2.TimeStampedName("my_tsn");
+    subDataSet2.UniqueId("ac95d0a3-94b8-4918-b3af-a3f81bbe519c");
+    Filter filter2;
+    filter2.Properties().Add(Property("RNAME", "chr1", "=="));
+    subDataSet2.Filters().Add(filter2);
+    dataset.SubDataSets().Add(subDataSet2);
+
+    // expected output: Version="3.0.1" is never set above, and attribute order is not call order
+    const std::string expectedXml{
+        "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n"
+        "<pbds:AlignmentSet "
+                "CreatedAt=\"2015-01-27T09:00:01\" "
+                "MetaType=\"PacBio.DataSet.AlignmentSet\" "
+                "Name=\"DataSet_AlignmentSet\" "
+                "Tags=\"barcode moreTags mapping mytags\" "
+                "TimeStampedName=\"my_tsn\" "
+                "UniqueId=\"b095d0a3-94b8-4918-b3af-a3f81bbe519c\" Version=\"3.0.1\" "
+                "xmlns=\"http://pacificbiosciences.com/PacBioDatasets.xsd\" "
+                "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" "
+                "xsi:schemaLocation=\"http://pacificbiosciences.com/PacBioDatasets.xsd\" "
+                "xmlns:pbbase=\"http://pacificbiosciences.com/PacBioBaseDataModel.xsd\" "
+                "xmlns:pbds=\"http://pacificbiosciences.com/PacBioDatasets.xsd\">\n"
+        "\t<pbbase:ExternalResources>\n"
+        "\t\t<pbbase:ExternalResource "
+                "CreatedAt=\"2015-01-27T09:00:01\" "
+                "Description=\"Points to an example Alignments BAM file.\" "
+                "MetaType=\"AlignmentFile.AlignmentBamFile\" "
+                "Name=\"Third Alignments BAM\" "
+                "ResourceId=\"/mnt/path/to/alignments2.bam\" "
+                "Tags=\"Example\" "
+                "TimeStampedName=\"my_tsn\" "
+                "UniqueId=\"my_uuid\" Version=\"3.0.1\">\n"
+        "\t\t\t<pbbase:FileIndices>\n"
+        "\t\t\t\t<pbbase:FileIndex "
+                "CreatedAt=\"2015-01-27T09:00:01\" "
+                "MetaType=\"PacBio.Index.PacBioIndex\" "
+                "ResourceId=\"/mnt/path/to/alignments2.pbi\" "
+                "TimeStampedName=\"my_tsn\" "
+                "UniqueId=\"my_uuid\" Version=\"3.0.1\" />\n"
+        "\t\t\t</pbbase:FileIndices>\n"
+        "\t\t</pbbase:ExternalResource>\n"
+        "\t\t<pbbase:ExternalResource "
+                "CreatedAt=\"2015-01-27T09:00:01\" "
+                "Description=\"Points to another example Alignments BAM file, by relative path.\" "
+                "MetaType=\"AlignmentFile.AlignmentBamFile\" "
+                "Name=\"Fourth Alignments BAM\" "
+                "ResourceId=\"" + dataset.Path() + "/alignments3.bam\" "
+                "Tags=\"Example\" "
+                "TimeStampedName=\"my_tsn\" "
+                "UniqueId=\"my_uuid\" Version=\"3.0.1\">\n"
+        "\t\t\t<pbbase:FileIndices>\n"
+        "\t\t\t\t<pbbase:FileIndex "
+                "CreatedAt=\"2015-01-27T09:00:01\" "
+                "MetaType=\"PacBio.Index.PacBioIndex\" "
+                "ResourceId=\"" + dataset.Path() + "/alignments3.pbi\" "
+                "TimeStampedName=\"my_tsn\" "
+                "UniqueId=\"my_uuid\" Version=\"3.0.1\" />\n"
+        "\t\t\t</pbbase:FileIndices>\n"
+        "\t\t</pbbase:ExternalResource>\n"
+        "\t</pbbase:ExternalResources>\n"
+        "\t<pbds:DataSets>\n"
+        "\t\t<pbds:DataSet "
+                "CreatedAt=\"2015-01-27T09:00:01\" "
+                "MetaType=\"PacBio.DataSet.DataSet\" "
+                "Name=\"HighQuality Read Alignments\" "
+                "TimeStampedName=\"my_tsn\" "
+                "UniqueId=\"ab95d0a3-94b8-4918-b3af-a3f81bbe519c\" "
+                "Version=\"3.0.1\">\n"
+        "\t\t\t<pbds:Filters>\n"
+        "\t\t\t\t<pbds:Filter>\n"
+        "\t\t\t\t\t<pbbase:Properties>\n"
+        "\t\t\t\t\t\t<pbbase:Property Name=\"rq\" Operator=\">\" Value=\"0.85\" />\n"
+        "\t\t\t\t\t</pbbase:Properties>\n"
+        "\t\t\t\t</pbds:Filter>\n"
+        "\t\t\t</pbds:Filters>\n"
+        "\t\t</pbds:DataSet>\n"
+        "\t\t<pbds:DataSet "
+                "CreatedAt=\"2015-01-27T09:00:01\" "
+                "MetaType=\"PacBio.DataSet.DataSet\" "
+                "Name=\"Alignments to chromosome 1\" "
+                "TimeStampedName=\"my_tsn\" "
+                "UniqueId=\"ac95d0a3-94b8-4918-b3af-a3f81bbe519c\" "
+                "Version=\"3.0.1\">\n"
+        "\t\t\t<pbds:Filters>\n"
+        "\t\t\t\t<pbds:Filter>\n"
+        "\t\t\t\t\t<pbbase:Properties>\n"
+        "\t\t\t\t\t\t<pbbase:Property Name=\"RNAME\" Operator=\"==\" Value=\"chr1\" />\n"
+        "\t\t\t\t\t</pbbase:Properties>\n"
+        "\t\t\t\t</pbds:Filter>\n"
+        "\t\t\t</pbds:Filters>\n"
+        "\t\t</pbds:DataSet>\n"
+        "\t</pbds:DataSets>\n"
+        "</pbds:AlignmentSet>\n"};
+    // serialize to an in-memory stream; EXPECT_EQ on std::string demands an exact match
+    std::ostringstream s;
+    dataset.SaveToStream(s);
+    EXPECT_EQ(expectedXml, s.str());
+}
+
+TEST(DataSetIOTest, DataSetBaseTypeToXml)
+{
+    // top-level data
+    ContigSet dataset;
+    dataset.CreatedAt("2015-01-27T09:00:01");
+    dataset.Name("DataSet_ContigSet");
+    dataset.Tags("barcode moreTags mapping mytags");
+    dataset.TimeStampedName("my_tsn");
+    dataset.UniqueId("b095d0a3-94b8-4918-b3af-a3f81bbe519c");
+
+    // write dataset
+    const std::string expectedXml{
+        "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n"
+        "<pbds:ContigSet "
+                "CreatedAt=\"2015-01-27T09:00:01\" "
+                "MetaType=\"PacBio.DataSet.ContigSet\" "
+                "Name=\"DataSet_ContigSet\" "
+                "Tags=\"barcode moreTags mapping mytags\" "
+                "TimeStampedName=\"my_tsn\" "
+                "UniqueId=\"b095d0a3-94b8-4918-b3af-a3f81bbe519c\" Version=\"3.0.1\" "
+                "xmlns=\"http://pacificbiosciences.com/PacBioDatasets.xsd\" "
+                "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" "
+                "xsi:schemaLocation=\"http://pacificbiosciences.com/PacBioDatasets.xsd\" "
+                "xmlns:pbds=\"http://pacificbiosciences.com/PacBioDatasets.xsd\" />\n"};
+
+        std::ostringstream s;
+        dataset.SaveToStream(s);
+        EXPECT_EQ(expectedXml, s.str());
+}
+
+
+namespace DataSetIOTests {
+
+static void TestFromXmlString()
+{
+    const std::string inputXml{
+        "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n"
+        "<pbds:AlignmentSet "
+            "CreatedAt=\"2015-01-27T09:00:01\" "
+            "MetaType=\"PacBio.DataSet.AlignmentSet\" "
+            "Name=\"DataSet_AlignmentSet\" "
+            "Tags=\"barcode moreTags mapping mytags\" "
+            "UniqueId=\"b095d0a3-94b8-4918-b3af-a3f81bbe519c\" "
+            "Version=\"2.3.0\" "
+            "xmlns=\"http://pacificbiosciences.com/PacBioDataModel.xsd\" "
+            "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" "
+            "xsi:schemaLocation=\"http://pacificbiosciences.com/PacBioDataModel.xsd\">\n"
+        "\t<pbbase:ExternalResources>\n"
+        "\t\t<pbbase:ExternalResource "
+                "Description=\"Points to an example Alignments BAM file.\" "
+                "MetaType=\"AlignmentFile.AlignmentBamFile\" "
+                "Name=\"Third Alignments BAM\" "
+                "ResourceId=\"file:/mnt/path/to/alignments2.bam\" "
+                "Tags=\"Example\">\n"
+        "\t\t\t<pbbase:FileIndices>\n"
+        "\t\t\t\t<pbbase:FileIndex "
+                    "MetaType=\"PacBio.Index.PacBioIndex\" "
+                    "ResourceId=\"file:/mnt/path/to/alignments2.pbi\" />\n"
+        "\t\t\t</pbbase:FileIndices>\n"
+        "\t\t</pbbase:ExternalResource>\n"
+        "\t\t<pbbase:ExternalResource "
+                "Description=\"Points to another example Alignments BAM file, by relative path.\" "
+                "MetaType=\"AlignmentFile.AlignmentBamFile\" "
+                "Name=\"Fourth Alignments BAM\" "
+                "ResourceId=\"file:./alignments3.bam\" "
+                "Tags=\"Example\">\n"
+        "\t\t\t<pbbase:FileIndices>\n"
+        "\t\t\t\t<pbbase:FileIndex "
+                    "MetaType=\"PacBio.Index.PacBioIndex\" "
+                    "ResourceId=\"file:/mnt/path/to/alignments3.pbi\" />\n"
+        "\t\t\t</pbbase:FileIndices>\n"
+        "\t\t</pbbase:ExternalResource>\n"
+        "\t</pbbase:ExternalResources>\n"
+        "\t<pbds:DataSets>\n"
+        "\t\t<pbds:DataSet "
+                "Name=\"HighQuality Read Alignments\" "
+                "UniqueId=\"ab95d0a3-94b8-4918-b3af-a3f81bbe519c\" "
+                "Version=\"2.3.0\">\n"
+        "\t\t\t<pbds:Filters>\n"
+        "\t\t\t\t<pbds:Filter>\n"
+        "\t\t\t\t\t<pbbase:Properties>\n"
+        "\t\t\t\t\t\t<pbbase:Property Name=\"rq\" Operator=\">\" Value=\"0.85\" />\n"
+        "\t\t\t\t\t</pbbase:Properties>\n"
+        "\t\t\t\t</pbds:Filter>\n"
+        "\t\t\t</pbds:Filters>\n"
+        "\t\t</pbds:DataSet>\n"
+        "\t\t<pbds:DataSet "
+                "Name=\"Alignments to chromosome 1\" "
+                "UniqueId=\"ac95d0a3-94b8-4918-b3af-a3f81bbe519c\" "
+                "Version=\"2.3.0\">\n"
+        "\t\t\t<pbds:Filters>\n"
+        "\t\t\t\t<pbds:Filter>\n"
+        "\t\t\t\t\t<pbbase:Properties>\n"
+        "\t\t\t\t\t\t<pbbase:Property Name=\"RNAME\" Operator=\"==\" Value=\"chr1\" />\n"
+        "\t\t\t\t\t</pbbase:Properties>\n"
+        "\t\t\t\t</pbds:Filter>\n"
+        "\t\t\t</pbds:Filters>\n"
+        "\t\t</pbds:DataSet>\n"
+        "\t</pbds:DataSets>\n"
+        "</pbds:AlignmentSet>\n"};
+
+    const DataSet dataset = DataSet::FromXml(inputXml);
+
+    EXPECT_EQ(DataSet::ALIGNMENT,                     dataset.Type());
+    EXPECT_EQ("2015-01-27T09:00:01",                  dataset.CreatedAt());
+    EXPECT_EQ("PacBio.DataSet.AlignmentSet",          dataset.MetaType());
+    EXPECT_EQ("DataSet_AlignmentSet",                 dataset.Name());
+    EXPECT_EQ("barcode moreTags mapping mytags",      dataset.Tags());
+    EXPECT_EQ("b095d0a3-94b8-4918-b3af-a3f81bbe519c", dataset.UniqueId());
+    EXPECT_EQ("2.3.0",                                dataset.Version());
+    EXPECT_EQ("http://pacificbiosciences.com/PacBioDataModel.xsd", dataset.Attribute("xmlns"));
+    EXPECT_EQ("http://www.w3.org/2001/XMLSchema-instance",         dataset.Attribute("xmlns:xsi"));
+
+    const ExternalResources& resources = dataset.ExternalResources();
+    EXPECT_EQ(2, resources.NumChildren());
+
+    const ExternalResource& resource1 = resources[0];
+    EXPECT_EQ("Third Alignments BAM",                      resource1.Name());
+    EXPECT_EQ("Points to an example Alignments BAM file.", resource1.Description());
+    EXPECT_EQ("AlignmentFile.AlignmentBamFile",            resource1.MetaType());
+    EXPECT_EQ("file:/mnt/path/to/alignments2.bam",         resource1.ResourceId());
+    EXPECT_EQ("Example",                                   resource1.Tags());
+    const FileIndices& fileIndices1 = resource1.FileIndices();
+    EXPECT_EQ(1, fileIndices1.Size());
+    const FileIndex& pbi1 = fileIndices1[0];
+    EXPECT_EQ("PacBio.Index.PacBioIndex",          pbi1.MetaType());
+    EXPECT_EQ("file:/mnt/path/to/alignments2.pbi", pbi1.ResourceId());
+
+    const ExternalResource& resource2 = resources[1];
+    EXPECT_EQ("Fourth Alignments BAM",                     resource2.Name());
+    EXPECT_EQ("Points to another example Alignments BAM file, by relative path.", resource2.Description());
+    EXPECT_EQ("AlignmentFile.AlignmentBamFile",            resource2.MetaType());
+    EXPECT_EQ("file:./alignments3.bam",                    resource2.ResourceId());
+    EXPECT_EQ("Example",                                   resource2.Tags());
+    const FileIndices& fileIndices2 = resource2.FileIndices();
+    EXPECT_EQ(1, fileIndices2.Size());
+    const FileIndex& pbi2 = fileIndices2[0];
+    EXPECT_EQ("PacBio.Index.PacBioIndex",          pbi2.MetaType());
+    EXPECT_EQ("file:/mnt/path/to/alignments3.pbi", pbi2.ResourceId());
+
+    const SubDataSets& subDatasets = dataset.SubDataSets();
+    EXPECT_EQ(2, subDatasets.Size());
+
+    const DataSetBase& sub1 = subDatasets[0];
+    EXPECT_EQ("HighQuality Read Alignments",          sub1.Name());
+    EXPECT_EQ("ab95d0a3-94b8-4918-b3af-a3f81bbe519c", sub1.UniqueId());
+    EXPECT_EQ("2.3.0",                                sub1.Version());
+    const Filters& sub1Filters = sub1.Filters();
+    EXPECT_EQ(1, sub1Filters.Size());
+    const Filter& sub1Filter = sub1Filters[0];
+    EXPECT_EQ(1, sub1Filter.Properties().Size());
+    const Property& property1 = sub1Filter.Properties()[0];
+    EXPECT_EQ("rq",   property1.Name());
+    EXPECT_EQ(">",    property1.Operator());
+    EXPECT_EQ("0.85", property1.Value());
+
+    const DataSetBase& sub2 = subDatasets[1];
+    EXPECT_EQ("Alignments to chromosome 1",          sub2.Name());
+    EXPECT_EQ("ac95d0a3-94b8-4918-b3af-a3f81bbe519c", sub2.UniqueId());
+    EXPECT_EQ("2.3.0",                                sub2.Version());
+    const Filters& sub2Filters = sub2.Filters();
+    EXPECT_EQ(1, sub2Filters.Size());
+    const Filter& sub2Filter = sub2Filters[0];
+    EXPECT_EQ(1, sub2Filter.Properties().Size());
+    const Property& property2 = sub2Filter.Properties()[0];
+    EXPECT_EQ("RNAME",   property2.Name());
+    EXPECT_EQ("==",    property2.Operator());
+    EXPECT_EQ("chr1", property2.Value());
+}
+
+static void TestAli1Xml()
+{
+    const DataSet dataset(DataSetIOTests::ali1XmlFn);
+    EXPECT_EQ(DataSet::ALIGNMENT, dataset.Type());
+    EXPECT_EQ(std::string("2015-01-27T09:00:01"),                  dataset.CreatedAt());
+    EXPECT_EQ(std::string("PacBio.DataSet.AlignmentSet"),          dataset.MetaType());
+    EXPECT_EQ(std::string("DataSet_AlignmentSet"),                 dataset.Name());
+    EXPECT_EQ(std::string("barcode moreTags mapping mytags"),      dataset.Tags());
+    EXPECT_EQ(std::string("b095d0a3-94b8-4918-b3af-a3f81bbe519c"), dataset.UniqueId());
+    EXPECT_EQ(std::string("2.3.0"),                                dataset.Version());
+    EXPECT_EQ(std::string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xmlns"));
+    EXPECT_EQ(std::string("http://www.w3.org/2001/XMLSchema-instance"),        dataset.Attribute("xmlns:xsi"));
+    EXPECT_EQ(std::string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xsi:schemaLocation"));
+
+    EXPECT_EQ(0, dataset.Filters().Size());
+
+    const ExternalResources& resources = dataset.ExternalResources();
+    ASSERT_EQ(2, resources.Size());
+    for (size_t i = 0; i < resources.Size(); ++i) {
+        const ExternalResource& resource = resources[i];
+        if (i == 0) {
+            EXPECT_EQ(std::string("First Alignments BAM"), resource.Name());
+            EXPECT_EQ(std::string("Points to an example Alignments BAM file."), resource.Description());
+            EXPECT_EQ(std::string("AlignmentFile.AlignmentBamFile"), resource.MetaType());
+            EXPECT_EQ(std::string("file:///mnt/path/to/alignments0.bam"), resource.ResourceId());
+            EXPECT_EQ(std::string("Example"), resource.Tags());
+
+            const FileIndices& fileIndices = resource.FileIndices();
+            ASSERT_EQ(1, fileIndices.Size());
+            const FileIndex& index = fileIndices[0];
+            EXPECT_EQ(std::string("file:///mnt/path/to/alignments0.pbi"), index.ResourceId());
+        }
+        else {
+            EXPECT_EQ(std::string("Second Alignments BAM"), resource.Name());
+            EXPECT_EQ(std::string("Points to another example Alignments BAM file, by relative path."), resource.Description());
+            EXPECT_EQ(std::string("AlignmentFile.AlignmentBamFile"), resource.MetaType());
+            EXPECT_EQ(std::string("file:./alignments1.bam"),  resource.ResourceId());
+            EXPECT_EQ(std::string("Example"), resource.Tags());
+
+            const FileIndices& fileIndices = resource.FileIndices();
+            ASSERT_EQ(1, fileIndices.Size());
+            const FileIndex& index = fileIndices[0];
+            EXPECT_EQ(std::string("file:///mnt/path/to/alignments1.pbi"), index.ResourceId());
+        }
+    }
+
+    const SubDataSets& subdatasets = dataset.SubDataSets();
+    ASSERT_EQ(2, subdatasets.Size());
+    for (size_t i = 0; i < subdatasets.Size(); ++i) {
+        const DataSetBase& subdataset = subdatasets[i];
+        if (i == 0) {
+            EXPECT_EQ(std::string(""), subdataset.CreatedAt());
+            EXPECT_EQ(std::string(""), subdataset.MetaType());
+            EXPECT_EQ(std::string("HighQuality Read Alignments"), subdataset.Name());
+            EXPECT_EQ(std::string(""), subdataset.Tags());
+            EXPECT_EQ(std::string("ab95d0a3-94b8-4918-b3af-a3f81bbe519c"), subdataset.UniqueId());
+            EXPECT_EQ(std::string("2.3.0"), subdataset.Version());
+
+            const Filters& filters = subdataset.Filters();
+            ASSERT_EQ(1, filters.Size());
+            const Filter& filter = filters[0];
+            const Properties& properties = filter.Properties();
+            ASSERT_EQ(1, properties.Size());
+            const Property& property = properties[0];
+            EXPECT_EQ(std::string("rq"), property.Name());
+            EXPECT_EQ(std::string("0.85"), property.Value());
+            EXPECT_EQ(std::string(">"), property.Operator());
+        }
+        else {
+            EXPECT_EQ(std::string(""), subdataset.CreatedAt());
+            EXPECT_EQ(std::string(""), subdataset.MetaType());
+            EXPECT_EQ(std::string("Alignments to chromosome 1"), subdataset.Name());
+            EXPECT_EQ(std::string(""), subdataset.Tags());
+            EXPECT_EQ(std::string("ac95d0a3-94b8-4918-b3af-a3f81bbe519c"), subdataset.UniqueId());
+            EXPECT_EQ(std::string("2.3.0"), subdataset.Version());
+
+            const Filters& filters = subdataset.Filters();
+            ASSERT_EQ(1, filters.Size());
+            const Filter& filter = filters[0];
+            const Properties& properties = filter.Properties();
+            ASSERT_EQ(1, properties.Size());
+            const Property& property = properties[0];
+            EXPECT_EQ(std::string("RNAME"), property.Name());
+            EXPECT_EQ(std::string("chr1"), property.Value());
+            EXPECT_EQ(std::string("=="), property.Operator());
+        }
+    }
+}
+
+static void TestAli2Xml()
+{
+    const DataSet dataset(DataSetIOTests::ali2XmlFn);
+    EXPECT_EQ(DataSet::ALIGNMENT, dataset.Type());
+    EXPECT_EQ(std::string("2015-01-27T09:00:01"),                  dataset.CreatedAt());
+    EXPECT_EQ(std::string("PacBio.DataSet.AlignmentSet"),          dataset.MetaType());
+    EXPECT_EQ(std::string("DataSet_AlignmentSet"),                 dataset.Name());
+    EXPECT_EQ(std::string("barcode moreTags mapping mytags"),      dataset.Tags());
+    EXPECT_EQ(std::string("b095d0a3-94b8-4918-b3af-a3f81bbe519c"), dataset.UniqueId());
+    EXPECT_EQ(std::string("2.3.0"),                                dataset.Version());
+    EXPECT_EQ(std::string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xmlns"));
+    EXPECT_EQ(std::string("http://www.w3.org/2001/XMLSchema-instance"),        dataset.Attribute("xmlns:xsi"));
+    EXPECT_EQ(std::string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xsi:schemaLocation"));
+
+    EXPECT_EQ(0, dataset.Filters().Size());
+
+    const ExternalResources& resources = dataset.ExternalResources();
+    ASSERT_EQ(2, resources.Size());
+    for (size_t i = 0; i < resources.Size(); ++i) {
+        const ExternalResource& resource = resources[i];
+        if (i == 0) {
+            EXPECT_EQ(std::string("First Alignments BAM"), resource.Name());
+            EXPECT_EQ(std::string("Points to an example Alignments BAM file."), resource.Description());
+            EXPECT_EQ(std::string("AlignmentFile.AlignmentBamFile"), resource.MetaType());
+            EXPECT_EQ(std::string("file:///mnt/path/to/alignments2.bam"), resource.ResourceId());
+            EXPECT_EQ(std::string("Example"), resource.Tags());
+
+            const FileIndices& fileIndices = resource.FileIndices();
+            ASSERT_EQ(1, fileIndices.Size());
+            const FileIndex& index = fileIndices[0];
+            EXPECT_EQ(std::string("file:///mnt/path/to/alignments2.pbi"), index.ResourceId());
+        }
+        else {
+            EXPECT_EQ(std::string("Second Alignments BAM"), resource.Name());
+            EXPECT_EQ(std::string("Points to another example Alignments BAM file, by relative path."), resource.Description());
+            EXPECT_EQ(std::string("AlignmentFile.AlignmentBamFile"), resource.MetaType());
+            EXPECT_EQ(std::string("file:./alignments3.bam"),  resource.ResourceId());
+            EXPECT_EQ(std::string("Example"), resource.Tags());
+
+            const FileIndices& fileIndices = resource.FileIndices();
+            ASSERT_EQ(1, fileIndices.Size());
+            const FileIndex& index = fileIndices[0];
+            EXPECT_EQ(std::string("file:///mnt/path/to/alignments3.pbi"), index.ResourceId());
+        }
+    }
+
+    const SubDataSets& subdatasets = dataset.SubDataSets();
+    ASSERT_EQ(2, subdatasets.Size());
+    for (size_t i = 0; i < subdatasets.Size(); ++i) {
+        const DataSetBase& subdataset = subdatasets[i];
+        if (i == 0) {
+            EXPECT_EQ(std::string(""), subdataset.CreatedAt());
+            EXPECT_EQ(std::string(""), subdataset.MetaType());
+            EXPECT_EQ(std::string("HighQuality Read Alignments"), subdataset.Name());
+            EXPECT_EQ(std::string(""), subdataset.Tags());
+            EXPECT_EQ(std::string("ab95d0a3-94b8-4918-b3af-a3f81bbe519c"), subdataset.UniqueId());
+            EXPECT_EQ(std::string("2.3.0"), subdataset.Version());
+
+            const Filters& filters = subdataset.Filters();
+            ASSERT_EQ(1, filters.Size());
+            const Filter& filter = filters[0];
+            const Properties& properties = filter.Properties();
+            ASSERT_EQ(1, properties.Size());
+            const Property& property = properties[0];
+            EXPECT_EQ(std::string("rq"), property.Name());
+            EXPECT_EQ(std::string("0.85"), property.Value());
+            EXPECT_EQ(std::string(">"), property.Operator());
+        }
+        else {
+            EXPECT_EQ(std::string(""), subdataset.CreatedAt());
+            EXPECT_EQ(std::string(""), subdataset.MetaType());
+            EXPECT_EQ(std::string("Alignments to chromosome 1"), subdataset.Name());
+            EXPECT_EQ(std::string(""), subdataset.Tags());
+            EXPECT_EQ(std::string("ac95d0a3-94b8-4918-b3af-a3f81bbe519c"), subdataset.UniqueId());
+            EXPECT_EQ(std::string("2.3.0"), subdataset.Version());
+
+            const Filters& filters = subdataset.Filters();
+            ASSERT_EQ(1, filters.Size());
+            const Filter& filter = filters[0];
+            const Properties& properties = filter.Properties();
+            ASSERT_EQ(1, properties.Size());
+            const Property& property = properties[0];
+            EXPECT_EQ(std::string("RNAME"), property.Name());
+            EXPECT_EQ(std::string("chr1"), property.Value());
+            EXPECT_EQ(std::string("=="), property.Operator());
+        }
+    }
+}
+// Checks the dataset loaded from ali3XmlFn: attributes, 2 external resources, 2 sub-datasets.
+static void TestAli3Xml()
+{
+    const DataSet dataset(DataSetIOTests::ali3XmlFn);
+    EXPECT_EQ(DataSet::ALIGNMENT, dataset.Type());
+    EXPECT_EQ(std::string("2015-01-27T09:00:01"),                  dataset.CreatedAt());
+    EXPECT_EQ(std::string("PacBio.DataSet.AlignmentSet"),          dataset.MetaType());
+    EXPECT_EQ(std::string("DataSet_AlignmentSet"),                 dataset.Name());
+    EXPECT_EQ(std::string("barcode moreTags mapping mytags"),      dataset.Tags());
+    EXPECT_EQ(std::string("b095d0a3-94b8-4918-b3af-a3f81bbe519c"), dataset.UniqueId());
+    EXPECT_EQ(std::string("2.3.0"),                                dataset.Version());
+    EXPECT_EQ(std::string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xmlns"));
+    EXPECT_EQ(std::string("http://www.w3.org/2001/XMLSchema-instance"),        dataset.Attribute("xmlns:xsi"));
+    EXPECT_EQ(std::string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xsi:schemaLocation"));
+    // No filters are expected on the top-level dataset; filters are checked on the sub-datasets.
+    EXPECT_EQ(0, dataset.Filters().Size());
+
+    const ExternalResources& resources = dataset.ExternalResources();
+    ASSERT_EQ(2, resources.Size());  // count is asserted before elements are indexed
+    for (size_t i = 0; i < resources.Size(); ++i) {
+        const ExternalResource& resource = resources[i];
+        if (i == 0) {
+            EXPECT_EQ(std::string("First Alignments BAM"), resource.Name());
+            EXPECT_EQ(std::string("Points to an example Alignments BAM file."), resource.Description());
+            EXPECT_EQ(std::string("AlignmentFile.AlignmentBamFile"), resource.MetaType());
+            EXPECT_EQ(std::string("file:///mnt/path/to/alignments2.bam"), resource.ResourceId());
+            EXPECT_EQ(std::string("Example"), resource.Tags());
+            // Each resource is expected to have exactly one file index.
+            const FileIndices& fileIndices = resource.FileIndices();
+            ASSERT_EQ(1, fileIndices.Size());
+            const FileIndex& index = fileIndices[0];
+            EXPECT_EQ(std::string("file:///mnt/path/to/alignments2.pbi"), index.ResourceId());
+        }
+        else {
+            EXPECT_EQ(std::string("Second Alignments BAM"), resource.Name());
+            EXPECT_EQ(std::string("Points to another example Alignments BAM file, by relative path."), resource.Description());
+            EXPECT_EQ(std::string("AlignmentFile.AlignmentBamFile"), resource.MetaType());
+            EXPECT_EQ(std::string("file:./alignments3.bam"),  resource.ResourceId());
+            EXPECT_EQ(std::string("Example"), resource.Tags());
+
+            const FileIndices& fileIndices = resource.FileIndices();
+            ASSERT_EQ(1, fileIndices.Size());
+            const FileIndex& index = fileIndices[0];
+            EXPECT_EQ(std::string("file:///mnt/path/to/alignments3.pbi"), index.ResourceId());
+        }
+    }
+
+    const SubDataSets& subdatasets = dataset.SubDataSets();
+    ASSERT_EQ(2, subdatasets.Size());
+    for (size_t i = 0; i < subdatasets.Size(); ++i) {
+        const DataSetBase& subdataset = subdatasets[i];
+        if (i == 0) {
+            EXPECT_EQ(std::string(""), subdataset.CreatedAt());
+            EXPECT_EQ(std::string(""), subdataset.MetaType());
+            EXPECT_EQ(std::string("HighQuality Read Alignments"), subdataset.Name());
+            EXPECT_EQ(std::string(""), subdataset.Tags());
+            EXPECT_EQ(std::string("ab95d0a3-94b8-4918-b3af-a3f81bbe519c"), subdataset.UniqueId());
+            EXPECT_EQ(std::string("2.3.0"), subdataset.Version());
+            // Drill down: one filter -> one property -> name / value / operator.
+            const Filters& filters = subdataset.Filters();
+            ASSERT_EQ(1, filters.Size());
+            const Filter& filter = filters[0];
+            const Properties& properties = filter.Properties();
+            ASSERT_EQ(1, properties.Size());
+            const Property& property = properties[0];
+            EXPECT_EQ(std::string("rq"), property.Name());
+            EXPECT_EQ(std::string("0.75"), property.Value());
+            EXPECT_EQ(std::string(">"), property.Operator());
+        }
+        else {
+            EXPECT_EQ(std::string(""), subdataset.CreatedAt());
+            EXPECT_EQ(std::string(""), subdataset.MetaType());
+            EXPECT_EQ(std::string("Alignments to chromosome 1"), subdataset.Name());
+            EXPECT_EQ(std::string(""), subdataset.Tags());
+            EXPECT_EQ(std::string("ac95d0a3-94b8-4918-b3af-a3f81bbe519c"), subdataset.UniqueId());
+            EXPECT_EQ(std::string("2.3.0"), subdataset.Version());
+
+            const Filters& filters = subdataset.Filters();
+            ASSERT_EQ(1, filters.Size());
+            const Filter& filter = filters[0];
+            const Properties& properties = filter.Properties();
+            ASSERT_EQ(1, properties.Size());
+            const Property& property = properties[0];
+            EXPECT_EQ(std::string("RNAME"), property.Name());
+            EXPECT_EQ(std::string("chr1"), property.Value());
+            EXPECT_EQ(std::string("=="), property.Operator());
+        }
+    }
+}
+// Checks the dataset loaded from ali4XmlFn: attributes, 2 external resources, 2 sub-datasets.
+static void TestAli4Xml()
+{
+    const DataSet dataset(DataSetIOTests::ali4XmlFn);
+    EXPECT_EQ(DataSet::ALIGNMENT, dataset.Type());
+    EXPECT_EQ(std::string("2015-01-27T09:00:01"),                  dataset.CreatedAt());
+    EXPECT_EQ(std::string("PacBio.DataSet.AlignmentSet"),          dataset.MetaType());
+    EXPECT_EQ(std::string("DataSet_AlignmentSet"),                 dataset.Name());
+    EXPECT_EQ(std::string("barcode moreTags mapping mytags"),      dataset.Tags());
+    EXPECT_EQ(std::string("b095d0a3-94b8-4918-b3af-a3f81bbe519c"), dataset.UniqueId());
+    EXPECT_EQ(std::string("2.3.0"),                                dataset.Version());
+    EXPECT_EQ(std::string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xmlns"));
+    EXPECT_EQ(std::string("http://www.w3.org/2001/XMLSchema-instance"),        dataset.Attribute("xmlns:xsi"));
+    EXPECT_EQ(std::string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xsi:schemaLocation"));
+    // No filters are expected on the top-level dataset; filters are checked on the sub-datasets.
+    EXPECT_EQ(0, dataset.Filters().Size());
+
+    const ExternalResources& resources = dataset.ExternalResources();
+    ASSERT_EQ(2, resources.Size());  // count is asserted before elements are indexed
+    for (size_t i = 0; i < resources.Size(); ++i) {
+        const ExternalResource& resource = resources[i];
+        if (i == 0) {
+            EXPECT_EQ(std::string("First Alignments BAM"), resource.Name());
+            EXPECT_EQ(std::string("Points to an example Alignments BAM file."), resource.Description());
+            EXPECT_EQ(std::string("AlignmentFile.AlignmentBamFile"), resource.MetaType());
+            EXPECT_EQ(std::string("file:///mnt/path/to/alignments0.bam"), resource.ResourceId());
+            EXPECT_EQ(std::string("Example"), resource.Tags());
+            // Each resource is expected to have exactly one file index.
+            const FileIndices& fileIndices = resource.FileIndices();
+            ASSERT_EQ(1, fileIndices.Size());
+            const FileIndex& index = fileIndices[0];
+            EXPECT_EQ(std::string("file:///mnt/path/to/alignments0.pbi"), index.ResourceId());
+        }
+        else {
+            EXPECT_EQ(std::string("Second Alignments BAM"), resource.Name());
+            EXPECT_EQ(std::string("Points to another example Alignments BAM file, by relative path."), resource.Description());
+            EXPECT_EQ(std::string("AlignmentFile.AlignmentBamFile"), resource.MetaType());
+            EXPECT_EQ(std::string("file:./alignments1.bam"),  resource.ResourceId());
+            EXPECT_EQ(std::string("Example"), resource.Tags());
+
+            const FileIndices& fileIndices = resource.FileIndices();
+            ASSERT_EQ(1, fileIndices.Size());
+            const FileIndex& index = fileIndices[0];
+            EXPECT_EQ(std::string("file:///mnt/path/to/alignments1.pbi"), index.ResourceId());
+        }
+    }
+
+    const SubDataSets& subdatasets = dataset.SubDataSets();
+    ASSERT_EQ(2, subdatasets.Size());
+    for (size_t i = 0; i < subdatasets.Size(); ++i) {
+        const DataSetBase& subdataset = subdatasets[i];
+        if (i == 0) {
+            EXPECT_EQ(std::string(""), subdataset.CreatedAt());
+            EXPECT_EQ(std::string(""), subdataset.MetaType());
+            EXPECT_EQ(std::string("HighQuality Read Alignments"), subdataset.Name());
+            EXPECT_EQ(std::string(""), subdataset.Tags());
+            EXPECT_EQ(std::string("ab95d0a3-94b8-4918-b3af-a3f81bbe519c"), subdataset.UniqueId());
+            EXPECT_EQ(std::string("2.3.0"), subdataset.Version());
+            // Drill down: one filter -> one property -> name / value / operator.
+            const Filters& filters = subdataset.Filters();
+            ASSERT_EQ(1, filters.Size());
+            const Filter& filter = filters[0];
+            const Properties& properties = filter.Properties();
+            ASSERT_EQ(1, properties.Size());
+            const Property& property = properties[0];
+            EXPECT_EQ(std::string("rq"), property.Name());
+            EXPECT_EQ(std::string("0.85"), property.Value());
+            EXPECT_EQ(std::string(">"), property.Operator());
+        }
+        else {
+            EXPECT_EQ(std::string(""), subdataset.CreatedAt());
+            EXPECT_EQ(std::string(""), subdataset.MetaType());
+            EXPECT_EQ(std::string("Alignments to chromosome 1"), subdataset.Name());
+            EXPECT_EQ(std::string(""), subdataset.Tags());
+            EXPECT_EQ(std::string("ac95d0a3-94b8-4918-b3af-a3f81bbe519c"), subdataset.UniqueId());
+            EXPECT_EQ(std::string("2.3.0"), subdataset.Version());
+
+            const Filters& filters = subdataset.Filters();
+            ASSERT_EQ(1, filters.Size());
+            const Filter& filter = filters[0];
+            const Properties& properties = filter.Properties();
+            ASSERT_EQ(1, properties.Size());
+            const Property& property = properties[0];
+            EXPECT_EQ(std::string("RNAME"), property.Name());
+            EXPECT_EQ(std::string("chr1"), property.Value());
+            EXPECT_EQ(std::string("=="), property.Operator());
+        }
+    }
+}
+// Checks the dataset loaded from mappingStaggeredXmlFn, including each sub-dataset's own resource.
+static void TestMappingStaggeredXml()
+{
+    const DataSet dataset(DataSetIOTests::mappingStaggeredXmlFn);
+    EXPECT_EQ(DataSet::GENERIC, dataset.Type());
+    EXPECT_EQ(std::string("2015-05-13T10:58:26"),    dataset.CreatedAt());
+    EXPECT_EQ(std::string("PacBio.DataSet.DataSet"), dataset.MetaType());
+    EXPECT_EQ(std::string(""), dataset.Name());
+    EXPECT_EQ(std::string(""), dataset.Tags());
+    EXPECT_EQ(std::string("30f72098-bc5b-e06b-566c-8b28dda909a8"), dataset.UniqueId());
+    EXPECT_EQ(std::string("2.3.0"), dataset.Version());
+    EXPECT_EQ(std::string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xmlns"));
+    EXPECT_EQ(std::string("http://www.w3.org/2001/XMLSchema-instance"),        dataset.Attribute("xmlns:xsi"));
+    EXPECT_EQ(std::string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xsi:schemaLocation"));
+    // No filters are expected on the top-level dataset.
+    EXPECT_EQ(0, dataset.Filters().Size());
+
+    const ExternalResources& resources = dataset.ExternalResources();
+    ASSERT_EQ(2, resources.Size());  // count is asserted before elements are indexed
+    for (size_t i = 0; i < resources.Size(); ++i) {
+        const ExternalResource& resource = resources[i];
+        if (i == 0) {
+            EXPECT_EQ(std::string(""), resource.Name());
+            EXPECT_EQ(std::string(""), resource.Description());
+            EXPECT_EQ(std::string(""), resource.MetaType());
+            EXPECT_EQ(std::string("file:tests/data/bam_mapping_1.bam"), resource.ResourceId());
+            EXPECT_EQ(std::string(""), resource.Tags());
+            // Each resource is expected to have exactly one file index (a .bam.bai path here).
+            const FileIndices& fileIndices = resource.FileIndices();
+            ASSERT_EQ(1, fileIndices.Size());
+            const FileIndex& index = fileIndices[0];
+            EXPECT_EQ(std::string("file:tests/data/bam_mapping_1.bam.bai"), index.ResourceId());
+        }
+        else {
+            EXPECT_EQ(std::string(""), resource.Name());
+            EXPECT_EQ(std::string(""), resource.Description());
+            EXPECT_EQ(std::string(""), resource.MetaType());
+            EXPECT_EQ(std::string("file:tests/data/bam_mapping_2.bam"), resource.ResourceId());
+            EXPECT_EQ(std::string(""), resource.Tags());
+
+            const FileIndices& fileIndices = resource.FileIndices();
+            ASSERT_EQ(1, fileIndices.Size());
+            const FileIndex& index = fileIndices[0];
+            EXPECT_EQ(std::string("file:tests/data/bam_mapping_2.bam.bai"), index.ResourceId());
+        }
+    }
+
+    const SubDataSets& subdatasets = dataset.SubDataSets();
+    ASSERT_EQ(2, subdatasets.Size());
+    for (size_t i = 0; i < subdatasets.Size(); ++i) {
+        const DataSetBase& subdataset = subdatasets[i];
+        if (i == 0) {
+            EXPECT_EQ(std::string("2015-05-13T10:58:26"),    subdataset.CreatedAt());
+            EXPECT_EQ(std::string(""), subdataset.MetaType());
+            EXPECT_EQ(std::string(""), subdataset.Name());
+            EXPECT_EQ(std::string(""), subdataset.Tags());
+            EXPECT_EQ(std::string("c5402d06-4643-057c-e300-fe229b4e8909"), subdataset.UniqueId());
+            EXPECT_EQ(std::string("2.3.0"), subdataset.Version());
+            // Sub-dataset 0 is expected to reference bam_mapping_2 (top-level resource 0 is bam_mapping_1).
+            const ExternalResources& subResources = subdataset.ExternalResources();
+            ASSERT_EQ(1, subResources.Size());
+            const ExternalResource& resource = subResources[0];
+            EXPECT_EQ(std::string("file:tests/data/bam_mapping_2.bam"), resource.ResourceId());
+            const FileIndices& fileIndices = resource.FileIndices();
+            ASSERT_EQ(1, fileIndices.Size());
+            const FileIndex& index = fileIndices[0];
+            EXPECT_EQ(std::string("file:tests/data/bam_mapping_2.bam.bai"), index.ResourceId());
+        }
+        else {
+            EXPECT_EQ(std::string("2015-05-13T10:58:26"),    subdataset.CreatedAt());
+            EXPECT_EQ(std::string(""), subdataset.MetaType());
+            EXPECT_EQ(std::string(""), subdataset.Name());
+            EXPECT_EQ(std::string(""), subdataset.Tags());
+            EXPECT_EQ(std::string("f8b54a55-5fb7-706f-ab35-39afc9c86924"), subdataset.UniqueId());
+            EXPECT_EQ(std::string("2.3.0"), subdataset.Version());
+            // Sub-dataset 1 is expected to reference bam_mapping_1 (top-level resource 1 is bam_mapping_2).
+            const ExternalResources& subResources = subdataset.ExternalResources();
+            ASSERT_EQ(1, subResources.Size());
+            const ExternalResource& resource = subResources[0];
+            EXPECT_EQ(std::string("file:tests/data/bam_mapping_1.bam"), resource.ResourceId());
+            const FileIndices& fileIndices = resource.FileIndices();
+            ASSERT_EQ(1, fileIndices.Size());
+            const FileIndex& index = fileIndices[0];
+            EXPECT_EQ(std::string("file:tests/data/bam_mapping_1.bam.bai"), index.ResourceId());
+        }
+    }
+}
+// Checks the dataset loaded from barcodeXmlFn: attributes, its single resource, and metadata.
+static void TestBarcodeXml()
+{
+    const DataSet dataset(DataSetIOTests::barcodeXmlFn);
+    EXPECT_EQ(DataSet::BARCODE, dataset.Type());
+    EXPECT_EQ(std::string("2015-01-27T09:00:01"),    dataset.CreatedAt());
+    EXPECT_EQ(std::string("PacBio.DataSet.BarcodeSet"), dataset.MetaType());
+    EXPECT_EQ(std::string("DataSet_BarcodeSet"), dataset.Name());
+    EXPECT_EQ(std::string("barcode moreTags mapping mytags"), dataset.Tags());
+    EXPECT_EQ(std::string("b095d0a3-94b8-4918-b3af-a3f81bbe519c"), dataset.UniqueId());
+    EXPECT_EQ(std::string("2.3.0"), dataset.Version());
+    EXPECT_EQ(std::string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xmlns"));
+    EXPECT_EQ(std::string("http://www.w3.org/2001/XMLSchema-instance"),        dataset.Attribute("xmlns:xsi"));
+    EXPECT_EQ(std::string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xsi:schemaLocation"));
+    // This dataset is expected to have neither filters nor sub-datasets.
+    EXPECT_EQ(0, dataset.Filters().Size());
+    EXPECT_EQ(0, dataset.SubDataSets().Size());
+
+    const ExternalResources& resources = dataset.ExternalResources();
+    ASSERT_EQ(1, resources.Size());  // count is asserted before resources[0] is accessed
+    const ExternalResource& resource = resources[0];
+    EXPECT_EQ(std::string("First Barcodes FASTA"), resource.Name());
+    EXPECT_EQ(std::string("Points to an example Barcodes FASTA file."), resource.Description());
+    EXPECT_EQ(std::string("BarcodeFile.BarcodeFastaFile"), resource.MetaType());
+    EXPECT_EQ(std::string("file:///mnt/path/to/barcode.fasta"), resource.ResourceId());
+    EXPECT_EQ(std::string("Example"), resource.Tags());
+    // NumRecords and TotalLength are compared as strings here.
+    const DataSetMetadata& metadata = dataset.Metadata();
+    EXPECT_EQ(std::string("30"),     metadata.NumRecords());
+    EXPECT_EQ(std::string("400"),    metadata.TotalLength());
+
+    // access metadata extensions directly for now
+    EXPECT_EQ(std::string("paired"), metadata.ChildText("BarcodeConstruction"));
+}
+// Checks the dataset loaded from ccsReadXmlFn: attributes and 2 resources with typed indices.
+static void TestCcsReadXml()
+{
+    const DataSet dataset(DataSetIOTests::ccsReadXmlFn);
+    EXPECT_EQ(DataSet::CONSENSUS_READ, dataset.Type());
+    EXPECT_EQ(std::string("2015-01-27T09:00:01"),    dataset.CreatedAt());
+    EXPECT_EQ(std::string("PacBio.DataSet.ConsensusReadSet"), dataset.MetaType());
+    EXPECT_EQ(std::string("DataSet_ConsensusReadSet"), dataset.Name());
+    EXPECT_EQ(std::string("barcode moreTags mapping mytags"), dataset.Tags());
+    EXPECT_EQ(std::string("b095d0a3-94b8-4918-b3af-a3f81bbe519c"), dataset.UniqueId());
+    EXPECT_EQ(std::string("2.3.0"), dataset.Version());
+    EXPECT_EQ(std::string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xmlns"));
+    EXPECT_EQ(std::string("http://www.w3.org/2001/XMLSchema-instance"),        dataset.Attribute("xmlns:xsi"));
+    EXPECT_EQ(std::string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xsi:schemaLocation"));
+    // This dataset is expected to have neither filters nor sub-datasets.
+    EXPECT_EQ(0, dataset.Filters().Size());
+    EXPECT_EQ(0, dataset.SubDataSets().Size());
+
+    const ExternalResources& resources = dataset.ExternalResources();
+    ASSERT_EQ(2, resources.Size());  // count is asserted before elements are indexed
+    for (size_t i = 0; i < resources.Size(); ++i) {
+        const ExternalResource& resource = resources[i];
+        if (i == 0) {
+            EXPECT_EQ(std::string("First ConsensusRead BAM"), resource.Name());
+            EXPECT_EQ(std::string("Points to an example ConsensusRead BAM file."), resource.Description());
+            EXPECT_EQ(std::string("PacBio.ConsensusReadFile.ConsensusReadBamFile"), resource.MetaType());
+            EXPECT_EQ(std::string("file:///mnt/path/to/ccsreads0.bam"), resource.ResourceId());
+            EXPECT_EQ(std::string("Example"), resource.Tags());
+            // Each resource is expected to have one file index; its MetaType is checked as well.
+            const FileIndices& fileIndices = resource.FileIndices();
+            ASSERT_EQ(1, fileIndices.Size());
+            const FileIndex& index = fileIndices[0];
+            EXPECT_EQ(std::string("PacBio.Index.PacBioIndex"), index.MetaType());
+            EXPECT_EQ(std::string("file:///mnt/path/to/ccsreads0.pbi"), index.ResourceId());
+        }
+        else {
+            EXPECT_EQ(std::string("Second ConsensusRead BAM"), resource.Name());
+            EXPECT_EQ(std::string("Points to another example ConsensusRead BAM file."), resource.Description());
+            EXPECT_EQ(std::string("PacBio.ConsensusReadFile.ConsensusReadBamFile"), resource.MetaType());
+            EXPECT_EQ(std::string("file:///mnt/path/to/ccsreads1.bam"), resource.ResourceId());
+            EXPECT_EQ(std::string("Example"), resource.Tags());
+            // NOTE(review): index below is expected at ccsreads0.pbi, not ccsreads1.pbi - confirm vs. fixture.
+            const FileIndices& fileIndices = resource.FileIndices();
+            ASSERT_EQ(1, fileIndices.Size());
+            const FileIndex& index = fileIndices[0];
+            EXPECT_EQ(std::string("PacBio.Index.PacBioIndex"), index.MetaType());
+            EXPECT_EQ(std::string("file:///mnt/path/to/ccsreads0.pbi"), index.ResourceId());
+        }
+    }
+}
+// Checks the dataset loaded from lambdaContigsXmlFn: attributes and its single resource ID.
+static void TestLambdaContigsXml()
+{
+    const DataSet dataset(DataSetIOTests::lambdaContigsXmlFn);
+    EXPECT_EQ(DataSet::REFERENCE, dataset.Type());
+    EXPECT_EQ(std::string("2015-05-28T10:56:36"),    dataset.CreatedAt());
+    EXPECT_EQ(std::string("PacBio.DataSet.ReferenceSet"), dataset.MetaType());
+    EXPECT_EQ(std::string(""), dataset.Name());
+    EXPECT_EQ(std::string(""), dataset.Tags());
+    EXPECT_EQ(std::string("596e87db-34f9-d2fd-c905-b017543170e1"), dataset.UniqueId());
+    EXPECT_EQ(std::string("2.3.0"), dataset.Version());
+    EXPECT_EQ(std::string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xmlns"));
+    EXPECT_EQ(std::string("http://www.w3.org/2001/XMLSchema-instance"),        dataset.Attribute("xmlns:xsi"));
+    EXPECT_EQ(std::string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xsi:schemaLocation"));
+    // This dataset is expected to have neither filters nor sub-datasets.
+    EXPECT_EQ(0, dataset.Filters().Size());
+    EXPECT_EQ(0, dataset.SubDataSets().Size());
+
+    const ExternalResources& resources = dataset.ExternalResources();
+    ASSERT_EQ(1, resources.Size());  // count is asserted before resources[0] is accessed
+    const ExternalResource& resource = resources[0];
+    EXPECT_EQ(std::string("file:tests/data/lambda_contigs.fasta"), resource.ResourceId());
+}
+// Checks the dataset loaded from pbalchemyXmlFn: attributes, single resource, and its index.
+static void TestPbalchemyXml()
+{
+    const DataSet dataset(DataSetIOTests::pbalchemyXmlFn);
+    EXPECT_EQ(DataSet::GENERIC, dataset.Type());
+    EXPECT_EQ(std::string("2015-05-22T16:56:16"),    dataset.CreatedAt());
+    EXPECT_EQ(std::string("PacBio.DataSet.DataSet"), dataset.MetaType());
+    EXPECT_EQ(std::string(""), dataset.Name());
+    EXPECT_EQ(std::string(""), dataset.Tags());
+    EXPECT_EQ(std::string("58e3f7c5-24c1-b58b-fbd5-37de268cc2f0"), dataset.UniqueId());
+    EXPECT_EQ(std::string("2.3.0"), dataset.Version());
+    EXPECT_EQ(std::string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xmlns"));
+    EXPECT_EQ(std::string("http://www.w3.org/2001/XMLSchema-instance"),        dataset.Attribute("xmlns:xsi"));
+    EXPECT_EQ(std::string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xsi:schemaLocation"));
+    // Only the sub-dataset count is checked here; Filters are not inspected in this function.
+    EXPECT_EQ(0, dataset.SubDataSets().Size());
+
+    const ExternalResources& resources = dataset.ExternalResources();
+    ASSERT_EQ(1, resources.Size());  // count is asserted before resources[0] is accessed
+    const ExternalResource& resource = resources[0];
+    EXPECT_EQ(std::string("file:tests/data/pbalchemy10kbp.pbalign.sorted.pbver1.bam"), resource.ResourceId());
+    const FileIndices& fileIndices = resource.FileIndices();
+    ASSERT_EQ(1, fileIndices.Size());
+    const FileIndex& index = fileIndices[0];
+    EXPECT_EQ(std::string("file:tests/data/pbalchemy10kbp.pbalign.sorted.pbver1.bam.bai"), index.ResourceId());
+
+    // TYPOs: Should be Filter Properties/Property not Parameter(s)
+    // NOTE(review): presumably the reason no Filters checks are made in this function - confirm.
+}
+// Checks the dataset loaded from referenceXmlFn: attributes, resource, indices, and metadata.
+static void TestReferenceXml()
+{
+    const DataSet dataset(DataSetIOTests::referenceXmlFn);
+    EXPECT_EQ(DataSet::REFERENCE, dataset.Type());
+    EXPECT_EQ(std::string("2015-01-27T09:00:01"),    dataset.CreatedAt());
+    EXPECT_EQ(std::string("PacBio.DataSet.ReferenceSet"), dataset.MetaType());
+    EXPECT_EQ(std::string("DataSet_ReferenceSet"), dataset.Name());
+    EXPECT_EQ(std::string("barcode moreTags mapping mytags"), dataset.Tags());
+    EXPECT_EQ(std::string("b095d0a3-94b8-4918-b3af-a3f81bbe519c"), dataset.UniqueId());
+    EXPECT_EQ(std::string("2.3.0"), dataset.Version());
+    EXPECT_EQ(std::string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xmlns"));
+    EXPECT_EQ(std::string("http://www.w3.org/2001/XMLSchema-instance"),        dataset.Attribute("xmlns:xsi"));
+    EXPECT_EQ(std::string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xsi:schemaLocation"));
+    // This dataset is expected to have neither filters nor sub-datasets.
+    EXPECT_EQ(0, dataset.Filters().Size());
+    EXPECT_EQ(0, dataset.SubDataSets().Size());
+
+    const ExternalResources& resources = dataset.ExternalResources();
+    ASSERT_EQ(1, resources.Size());  // count is asserted before resources[0] is accessed
+    const ExternalResource& resource = resources[0];
+    EXPECT_EQ(std::string("First References FASTA"), resource.Name());
+    EXPECT_EQ(std::string("Points to an example references FASTA file."), resource.Description());
+    EXPECT_EQ(std::string("PacBio.ReferenceFile.ReferenceFastaFile"), resource.MetaType());
+    EXPECT_EQ(std::string("file:///mnt/path/to/reference.fasta"), resource.ResourceId());
+    EXPECT_EQ(std::string("Example"), resource.Tags());
+    const FileIndices& fileIndices = resource.FileIndices();
+    ASSERT_EQ(2, fileIndices.Size());  // two indices expected: .sa first, then .fai
+    for (size_t i = 0; i < fileIndices.Size(); ++i) {
+        const FileIndex& index = fileIndices[i];
+        if (i == 0) {
+            EXPECT_EQ(std::string("PacBio.Index.SaWriterIndex"), index.MetaType());
+            EXPECT_EQ(std::string("file:///mnt/path/to/reference.fasta.sa"), index.ResourceId());
+        }
+        else {
+            EXPECT_EQ(std::string("PacBio.Index.SamIndex"), index.MetaType());
+            EXPECT_EQ(std::string("file:///mnt/path/to/reference.fasta.fai"), index.ResourceId());
+        }
+    }
+
+    const DataSetMetadata& metadata = dataset.Metadata();
+    EXPECT_EQ(std::string("500"),     metadata.NumRecords());
+    EXPECT_EQ(std::string("5000000"), metadata.TotalLength());
+
+    // access metadata extensions directly for now
+    EXPECT_EQ(std::string("Tribble"), metadata.ChildText("Organism"));
+    EXPECT_EQ(std::string("Diploid"), metadata.ChildText("Ploidy"));
+    // Contigs is likewise read as a generic internal::DataSetElement child of the metadata.
+    const internal::DataSetElement& contigs = metadata.Child<internal::DataSetElement>("Contigs");
+    ASSERT_EQ(1, contigs.NumChildren());
+    // The single contig is fetched by position and its values are read via Attribute().
+    const internal::DataSetElement& contig = contigs.Child<internal::DataSetElement>(0);
+    EXPECT_EQ(std::string("gi|229359445|emb|AM181176.4|"), contig.Attribute("Name"));
+    EXPECT_EQ(std::string("Pseudomonas fluorescens SBW25 complete genome|quiver"), contig.Attribute("Description"));
+    EXPECT_EQ(std::string("6722109"), contig.Attribute("Length"));
+    EXPECT_EQ(std::string("f627c795efad7ce0050ed42b942d408e"), contig.Attribute("Digest"));
+}
+// Checks the dataset loaded from subread1XmlFn: attributes, resources, filters, and metadata.
+static void TestSubread1Xml()
+{
+    const DataSet dataset(DataSetIOTests::subread1XmlFn);
+    EXPECT_EQ(DataSet::SUBREAD, dataset.Type());
+    EXPECT_EQ(std::string("2015-01-27T09:00:01"),    dataset.CreatedAt());
+    EXPECT_EQ(std::string("PacBio.DataSet.SubreadSet"), dataset.MetaType());
+    EXPECT_EQ(std::string("DataSet_SubreadSet"), dataset.Name());
+    EXPECT_EQ(std::string("barcode moreTags mapping mytags"), dataset.Tags());
+    EXPECT_EQ(std::string("b095d0a3-94b8-4918-b3af-a3f81bbe519c"), dataset.UniqueId());
+    EXPECT_EQ(std::string("2.3.0"), dataset.Version());
+    EXPECT_EQ(std::string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xmlns"));
+    EXPECT_EQ(std::string("http://www.w3.org/2001/XMLSchema-instance"),        dataset.Attribute("xmlns:xsi"));
+    EXPECT_EQ(std::string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xsi:schemaLocation"));
+    // Two resources are expected, each with exactly one file index.
+    const ExternalResources& resources = dataset.ExternalResources();
+    ASSERT_EQ(2, resources.Size());  // count is asserted before elements are indexed
+    for (size_t i = 0; i < resources.Size(); ++i) {
+        const ExternalResource& resource = resources[i];
+        if (i == 0) {
+            EXPECT_EQ(std::string("First Subreads BAM"), resource.Name());
+            EXPECT_EQ(std::string("Points to an example Subreads BAM file."), resource.Description());
+            EXPECT_EQ(std::string("SubreadFile.SubreadBamFile"), resource.MetaType());
+            EXPECT_EQ(std::string("file:///mnt/path/to/subreads0.bam"), resource.ResourceId());
+            EXPECT_EQ(std::string("Example"), resource.Tags());
+
+            const FileIndices& fileIndices = resource.FileIndices();
+            ASSERT_EQ(1, fileIndices.Size());
+            const FileIndex& index = fileIndices[0];
+            EXPECT_EQ(std::string("file:///mnt/path/to/subreads0.pbi"), index.ResourceId());
+        }
+        else {
+            EXPECT_EQ(std::string("Second Subreads BAM"), resource.Name());
+            EXPECT_EQ(std::string("Points to another example Subreads BAM file."), resource.Description());
+            EXPECT_EQ(std::string("SubreadFile.SubreadBamFile"), resource.MetaType());
+            EXPECT_EQ(std::string("file:///mnt/path/to/subreads1.bam"), resource.ResourceId());
+            EXPECT_EQ(std::string("Example"), resource.Tags());
+            // NOTE(review): index below is expected at subreads0.pbi, not subreads1.pbi - confirm vs. fixture.
+            const FileIndices& fileIndices = resource.FileIndices();
+            ASSERT_EQ(1, fileIndices.Size());
+            const FileIndex& index = fileIndices[0];
+            EXPECT_EQ(std::string("file:///mnt/path/to/subreads0.pbi"), index.ResourceId());
+        }
+    }
+    // Two top-level filters are expected, each with a single property (name, value, operator).
+    const Filters& filters = dataset.Filters();
+    ASSERT_EQ(2, filters.Size());
+    for (size_t i = 0; i < filters.Size(); ++i) {
+        const Filter& filter = filters[i];
+        if (i == 0) {
+            const Properties& properties = filter.Properties();
+            ASSERT_EQ(1, properties.Size());
+            const Property& property = properties[0];
+            EXPECT_EQ(std::string("rq"), property.Name());
+            EXPECT_EQ(std::string("0.75"), property.Value());
+            EXPECT_EQ(std::string(">"), property.Operator());
+        } else {
+            const Properties& properties = filter.Properties();
+            ASSERT_EQ(1, properties.Size());
+            const Property& property = properties[0];
+            EXPECT_EQ(std::string("QNAME"), property.Name());
+            EXPECT_EQ(std::string("100/0/0_100"), property.Value());
+            EXPECT_EQ(std::string("=="), property.Operator());
+        }
+    }
+    // NumRecords and TotalLength are compared as strings here.
+    const DataSetMetadata& metadata = dataset.Metadata();
+    EXPECT_EQ(std::string("500"),    metadata.NumRecords());
+    EXPECT_EQ(std::string("500000"), metadata.TotalLength());
+}
+// Checks the dataset loaded from subread2XmlFn: attributes, resources, filters, and metadata.
+static void TestSubread2Xml()
+{
+    const DataSet dataset(DataSetIOTests::subread2XmlFn);
+    EXPECT_EQ(DataSet::SUBREAD, dataset.Type());
+    EXPECT_EQ(std::string("2015-01-27T09:00:01"),    dataset.CreatedAt());
+    EXPECT_EQ(std::string("PacBio.DataSet.SubreadSet"), dataset.MetaType());
+    EXPECT_EQ(std::string("DataSet_SubreadSet"), dataset.Name());
+    EXPECT_EQ(std::string("barcode moreTags mapping mytags"), dataset.Tags());
+    EXPECT_EQ(std::string("b095d0a3-94b8-4918-b3af-a3f81bbe519c"), dataset.UniqueId());
+    EXPECT_EQ(std::string("2.3.0"), dataset.Version());
+    EXPECT_EQ(std::string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xmlns"));
+    EXPECT_EQ(std::string("http://www.w3.org/2001/XMLSchema-instance"),        dataset.Attribute("xmlns:xsi"));
+    EXPECT_EQ(std::string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xsi:schemaLocation"));
+    // Two resources are expected, each with exactly one file index.
+    const ExternalResources& resources = dataset.ExternalResources();
+    ASSERT_EQ(2, resources.Size());  // count is asserted before elements are indexed
+    for (size_t i = 0; i < resources.Size(); ++i) {
+        const ExternalResource& resource = resources[i];
+        if (i == 0) {
+            EXPECT_EQ(std::string("First Subreads BAM"), resource.Name());
+            EXPECT_EQ(std::string("Points to an example Subreads BAM file."), resource.Description());
+            EXPECT_EQ(std::string("SubreadFile.SubreadBamFile"), resource.MetaType());
+            EXPECT_EQ(std::string("file:///mnt/path/to/subreads2.bam"), resource.ResourceId());
+            EXPECT_EQ(std::string("Example"), resource.Tags());
+
+            const FileIndices& fileIndices = resource.FileIndices();
+            ASSERT_EQ(1, fileIndices.Size());
+            const FileIndex& index = fileIndices[0];
+            EXPECT_EQ(std::string("file:///mnt/path/to/subreads2.pbi"), index.ResourceId());
+        }
+        else {
+            EXPECT_EQ(std::string("Second Subreads BAM"), resource.Name());
+            EXPECT_EQ(std::string("Points to another example Subreads BAM file."), resource.Description());
+            EXPECT_EQ(std::string("SubreadFile.SubreadBamFile"), resource.MetaType());
+            EXPECT_EQ(std::string("file:///mnt/path/to/subreads3.bam"), resource.ResourceId());
+            EXPECT_EQ(std::string("Example"), resource.Tags());
+
+            const FileIndices& fileIndices = resource.FileIndices();
+            ASSERT_EQ(1, fileIndices.Size());
+            const FileIndex& index = fileIndices[0];
+            EXPECT_EQ(std::string("file:///mnt/path/to/subreads3.pbi"), index.ResourceId());
+        }
+    }
+    // Two top-level filters are expected, each with a single property (name, value, operator).
+    const Filters& filters = dataset.Filters();
+    ASSERT_EQ(2, filters.Size());
+    for (size_t i = 0; i < filters.Size(); ++i) {
+        const Filter& filter = filters[i];
+        if (i == 0) {
+            const Properties& properties = filter.Properties();
+            ASSERT_EQ(1, properties.Size());
+            const Property& property = properties[0];
+            EXPECT_EQ(std::string("rq"), property.Name());
+            EXPECT_EQ(std::string("0.75"), property.Value());
+            EXPECT_EQ(std::string(">"), property.Operator());
+        } else {
+            const Properties& properties = filter.Properties();
+            ASSERT_EQ(1, properties.Size());
+            const Property& property = properties[0];
+            EXPECT_EQ(std::string("QNAME"), property.Name());
+            EXPECT_EQ(std::string("100/0/0_100"), property.Value());
+            EXPECT_EQ(std::string("=="), property.Operator());
+        }
+    }
+    // NumRecords and TotalLength are compared as strings here.
+    const DataSetMetadata& metadata = dataset.Metadata();
+    EXPECT_EQ(std::string("500"),    metadata.NumRecords());
+    EXPECT_EQ(std::string("500000"), metadata.TotalLength());
+}
+
+static void TestSubread3Xml()
+{
+    const DataSet dataset(DataSetIOTests::subread3XmlFn);
+    EXPECT_EQ(DataSet::SUBREAD, dataset.Type());
+    EXPECT_EQ(std::string("2015-01-27T09:00:01"), dataset.CreatedAt());
+    EXPECT_EQ(std::string("PacBio.DataSet.SubreadSet"), dataset.MetaType());
+    EXPECT_EQ(std::string("DataSet_SubreadSet"), dataset.Name());
+    EXPECT_EQ(std::string("barcode moreTags mapping mytags"), dataset.Tags());
+    EXPECT_EQ(std::string("b095d0a3-94b8-4918-b3af-a3f81bbe519c"), dataset.UniqueId());
+    EXPECT_EQ(std::string("2.3.0"), dataset.Version());
+    EXPECT_EQ(std::string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xmlns"));
+    EXPECT_EQ(std::string("http://www.w3.org/2001/XMLSchema-instance"),        dataset.Attribute("xmlns:xsi"));
+    EXPECT_EQ(std::string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xsi:schemaLocation"));
+
+    const ExternalResources& resources = dataset.ExternalResources();
+    ASSERT_EQ(2, resources.Size());
+    for (size_t i = 0; i < resources.Size(); ++i) {
+        const ExternalResource& resource = resources[i];
+        if (i == 0) {
+            EXPECT_EQ(std::string("First Subreads BAM"), resource.Name());
+            EXPECT_EQ(std::string("Points to an example Subreads BAM file."), resource.Description());
+            EXPECT_EQ(std::string("SubreadFile.SubreadBamFile"), resource.MetaType());
+            EXPECT_EQ(std::string("file:///mnt/path/to/subreads2.bam"), resource.ResourceId());
+            EXPECT_EQ(std::string("Example"), resource.Tags());
+
+            const FileIndices& fileIndices = resource.FileIndices();
+            ASSERT_EQ(1, fileIndices.Size());
+            const FileIndex& index = fileIndices[0];
+            EXPECT_EQ(std::string("file:///mnt/path/to/subreads2.pbi"), index.ResourceId());
+        }
+        else {
+            EXPECT_EQ(std::string("Second Subreads BAM"), resource.Name());
+            EXPECT_EQ(std::string("Points to another example Subreads BAM file."), resource.Description());
+            EXPECT_EQ(std::string("SubreadFile.SubreadBamFile"), resource.MetaType());
+            EXPECT_EQ(std::string("file:///mnt/path/to/subreads3.bam"), resource.ResourceId());
+            EXPECT_EQ(std::string("Example"), resource.Tags());
+
+            const FileIndices& fileIndices = resource.FileIndices();
+            ASSERT_EQ(1, fileIndices.Size());
+            const FileIndex& index = fileIndices[0];
+            EXPECT_EQ(std::string("file:///mnt/path/to/subreads3.pbi"), index.ResourceId());
+        }
+    }
+
+    const Filters& filters = dataset.Filters();
+    ASSERT_EQ(2, filters.Size());
+    for (size_t i = 0; i < filters.Size(); ++i) {
+        const Filter& filter = filters[i];
+        if (i == 0) {
+            const Properties& properties = filter.Properties();
+            ASSERT_EQ(1, properties.Size());
+            const Property& property = properties[0];
+            EXPECT_EQ(std::string("rq"), property.Name());
+            EXPECT_EQ(std::string("0.85"), property.Value());
+            EXPECT_EQ(std::string(">"), property.Operator());
+        } else {
+            const Properties& properties = filter.Properties();
+            ASSERT_EQ(1, properties.Size());
+            const Property& property = properties[0];
+            EXPECT_EQ(std::string("QNAME"), property.Name());
+            EXPECT_EQ(std::string("100/0/0_100"), property.Value());
+            EXPECT_EQ(std::string("=="), property.Operator());
+        }
+    }
+
+    const DataSetMetadata& metadata = dataset.Metadata();
+    EXPECT_EQ(std::string("500"),    metadata.NumRecords());
+    EXPECT_EQ(std::string("500000"), metadata.TotalLength());
+}
+
+static void TestTransformedXml()
+{
+    const DataSet dataset(DataSetIOTests::transformedXmlFn);
+    EXPECT_EQ(DataSet::HDF_SUBREAD, dataset.Type());
+    EXPECT_EQ(std::string("PacBio.DataSet.SubreadSet"), dataset.MetaType());
+    EXPECT_EQ(std::string("Subreads from run r001173_42129_130607"), dataset.Name());
+    EXPECT_EQ(std::string("pacbio.secondary.instrument=RS"), dataset.Tags());
+    EXPECT_EQ(std::string("abbc9183-b01e-4671-8c12-19efee534647"), dataset.UniqueId());
+    EXPECT_EQ(std::string("0.5"), dataset.Version());
+    EXPECT_EQ(std::string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xmlns"));
+    EXPECT_EQ(std::string("http://www.w3.org/2001/XMLSchema"),         dataset.Attribute("xmlns:xs"));
+    EXPECT_EQ(std::string("http://www.w3.org/2005/xpath-functions"), dataset.Attribute("xmlns:fn"));
+    EXPECT_EQ(std::string("java:java.util.UUID"), dataset.Attribute("xmlns:uuid"));
+    EXPECT_EQ(std::string("http://whatever"), dataset.Attribute("xmlns:bax"));
+
+    EXPECT_EQ(0, dataset.Filters().Size());
+    EXPECT_EQ(0, dataset.SubDataSets().Size());
+
+    const ExternalResources& resources = dataset.ExternalResources();
+    ASSERT_EQ(3, resources.Size());
+    for (size_t i = 0; i < resources.Size(); ++i) {
+        const ExternalResource& resource = resources[i];
+        if (i == 0) {
+            EXPECT_EQ(std::string("PacBio.SubreadFile.BaxFile"), resource.MetaType());
+            EXPECT_EQ(std::string("file:///pbi/dept/secondary/siv/testdata/LIMS/2590727/0001/Analysis_Results/m130608_033634_42129_c100515232550000001823076608221351_s1_p0.0.bax.h5"),
+                      resource.ResourceId());
+        }
+        else if (i == 1) {
+            EXPECT_EQ(std::string("PacBio.SubreadFile.BaxFile"), resource.MetaType());
+            EXPECT_EQ(std::string("file:///pbi/dept/secondary/siv/testdata/LIMS/2590727/0001/Analysis_Results/m130608_033634_42129_c100515232550000001823076608221351_s1_p0.1.bax.h5"),
+                      resource.ResourceId());
+        }
+        else {
+            EXPECT_EQ(std::string("PacBio.SubreadFile.BaxFile"), resource.MetaType());
+            EXPECT_EQ(std::string("file:///pbi/dept/secondary/siv/testdata/LIMS/2590727/0001/Analysis_Results/m130608_033634_42129_c100515232550000001823076608221351_s1_p0.2.bax.h5"),
+                      resource.ResourceId());
+        }
+    }
+
+    const DataSetMetadata& metadata = dataset.Metadata();
+    EXPECT_EQ(std::string("150000"),   metadata.NumRecords());
+    EXPECT_EQ(std::string("50000000"), metadata.TotalLength());
+}
+
+} // namespace DataSetIOTests
+
+// Loads dataset/malformed.xml from the test data directory, writes the parsed
+// dataset back out to a string stream, and requires the output to match the
+// XML text below exactly (attribute order, tab indentation and namespace
+// prefixes included).
+TEST(DataSetIOTest, InspectMalformedXml)
+{
+    const std::string xmlFn = PbbamTestsConfig::Data_Dir + "/dataset/malformed.xml";
+
+    // parse, then serialize straight back to memory
+    DataSet ds(xmlFn);
+    std::ostringstream s;
+    ds.SaveToStream(s);
+
+    // Exact text expected from SaveToStream(). Adjacent literals are concatenated
+    // by the compiler; the extra source indentation is not part of the string.
+    // NOTE(review): the ExternalResource ResourceId below contains the job path
+    // twice ("...subread-0//mnt/...") - presumably that is what makes the input
+    // "malformed"; confirm against the fixture.
+    const std::string expected{
+        "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n"
+        "<SubreadSet CreatedAt=\"2015-08-19T15:39:36.331\" Description=\"Merged dataset from 1 files using DatasetMerger 0.1.2\" "
+                    "MetaType=\"PacBio.DataSet.HdfSubreadSet\" Name=\"Subreads from runr000013_42267_150403\" "
+                    "Tags=\"pacbio.secondary.instrument=RS\" TimeStampedName=\"hdfsubreadset_2015-08-19T15:39:36.331-07:00\" "
+                    "UniqueId=\"b4741521-2a4c-42df-8a13-0a755ca9ed1e\" Version=\"0.5\" "
+                    "xmlns=\"http://pacificbiosciences.com/PacBioDatasets.xsd\" "
+                    "xmlns:ns0=\"http://pacificbiosciences.com/PacBioBaseDataModel.xsd\" "
+                    "xmlns:ns1=\"http://pacificbiosciences.com/PacBioSampleInfo.xsd\" "
+                    "xmlns:ns2=\"http://pacificbiosciences.com/PacBioCollectionMetadata.xsd\" "
+                    "xmlns:ns3=\"http://pacificbiosciences.com/PacBioReagentKit.xsd\" "
+                    "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" "
+                    "xsi:schemaLocation=\"http://pacificbiosciences.com/PacBioDatasets.xsd\">\n"
+        "\t<ns0:ExternalResources>\n"
+        "\t\t<ns0:ExternalResource MetaType=\"SubreadFile.SubreadBamFile\" "
+                                  "ResourceId=\"file:///mnt/secondary-siv/jenkins/jenkins-bot01/workspace/Ubuntu1404_Mainline_SA3_Tiny_tests/software/smrtanalysis/siv/testkit-jobs/sa3_pipelines/mapping/tiny/job_output-ubuntu1404/tasks/pbsmrtpipe.tasks.h5_subreads_to_subread-0//mnt/secondary-siv/jenkins/jenkins-bot01/workspace/Ubuntu1404_Mainline_SA3_Tiny_tests/software/smrtanalysis/siv/testkit-jobs/sa3_pipelines/mapping/tiny/job_output-ubuntu1404/tasks/pbsmrtpipe.tasks.h5_subreads_to_subread-0/file.subreads.subreads.bam\" "
+                                  "TimeStampedName=\"SubreadFile.SubreadBamFile_00000000000000\" "
+                                  "UniqueId=\"251acf71-9eb0-489e-9dd1-cdbd11432753\" />\n"
+        "\t</ns0:ExternalResources>\n"
+        "\t<DataSetMetadata>\n"
+        "\t\t<TotalLength>50000000</TotalLength>\n"
+        "\t\t<NumRecords>150000</NumRecords>\n"
+        "\t\t<ns2:Collections>\n"
+        "\t\t\t<ns2:CollectionMetadata Context=\"m150404_101626_42267_c100807920800000001823174110291514_s1_p0\" "
+                                      "InstrumentId=\"1\" InstrumentName=\"42267\" MetaType=\"PacBio.Collection\" "
+                                      "TimeStampedName=\"m150404_101626_42267_c100807920800000001823174110291514_s1_p0\" "
+                                      "UniqueId=\"d66c8372-2b70-4dcf-b64f-9f8b5cc351fd\">\n"
+        "\t\t\t\t<ns2:InstCtrlVer>2.3.0.1.142990</ns2:InstCtrlVer>\n"
+        "\t\t\t\t<ns2:SigProcVer>NRT@172.31.128.10:8082, SwVer=2301.142990, HwVer=1.0</ns2:SigProcVer>\n"
+        "\t\t\t\t<ns2:RunDetails>\n"
+        "\t\t\t\t\t<ns2:RunId>r000013_42267_150403</ns2:RunId>\n"
+        "\t\t\t\t\t<ns2:Name>Inst42267-040315-SAT-100pM-2kb-P6C4</ns2:Name>\n"
+        "\t\t\t\t</ns2:RunDetails>\n"
+        "\t\t\t\t<ns2:WellSample Name=\"Inst42267-040315-SAT-100pM-2kb-P6C4\">\n"
+        "\t\t\t\t\t<ns2:PlateId>Inst42267-040315-SAT-100pM-2kb-P6C4</ns2:PlateId>\n"
+        "\t\t\t\t\t<ns2:WellName>Inst42267-040315-SAT-100pM-2kb-P6C4</ns2:WellName>\n"
+        "\t\t\t\t\t<ns2:Concentration>0.0</ns2:Concentration>\n"
+        "\t\t\t\t\t<ns2:SampleReuseEnabled>false</ns2:SampleReuseEnabled>\n"
+        "\t\t\t\t\t<ns2:StageHotstartEnabled>false</ns2:StageHotstartEnabled>\n"
+        "\t\t\t\t\t<ns2:SizeSelectionEnabled>false</ns2:SizeSelectionEnabled>\n"
+        "\t\t\t\t\t<ns2:UseCount>1</ns2:UseCount>\n"
+        "\t\t\t\t\t<ns1:BioSamplePointers>\n"
+        "\t\t\t\t\t\t<ns1:BioSamplePointer>251acf71-9eb0-489e-9dd1-cdbd11432752</ns1:BioSamplePointer>\n"
+        "\t\t\t\t\t</ns1:BioSamplePointers>\n"
+        "\t\t\t\t</ns2:WellSample>\n"
+        "\t\t\t\t<ns2:Automation>\n"
+        "\t\t\t\t\t<ns0:AutomationParameters>\n"
+        "\t\t\t\t\t\t<ns0:AutomationParameter />\n"
+        "\t\t\t\t\t</ns0:AutomationParameters>\n"
+        "\t\t\t\t</ns2:Automation>\n"
+        "\t\t\t\t<ns2:CollectionNumber>7</ns2:CollectionNumber>\n"
+        "\t\t\t\t<ns2:CellIndex>4</ns2:CellIndex>\n"
+        "\t\t\t\t<ns2:CellPac Barcode=\"10080792080000000182317411029151\" />\n"
+        "\t\t\t\t<ns2:Primary>\n"
+        "\t\t\t\t\t<ns2:AutomationName>BasecallerV1</ns2:AutomationName>\n"
+        "\t\t\t\t\t<ns2:ConfigFileName>2-3-0_P6-C4.xml</ns2:ConfigFileName>\n"
+        "\t\t\t\t\t<ns2:SequencingCondition />\n"
+        "\t\t\t\t\t<ns2:OutputOptions>\n"
+        "\t\t\t\t\t\t<ns2:ResultsFolder>Analysis_Results</ns2:ResultsFolder>\n"
+        "\t\t\t\t\t\t<ns2:CollectionPathUri>rsy://mp-rsync/vol55//RS_DATA_STAGING/42267/Inst42267-040315-SAT-100pM-2kb-P6C4_13/A04_7/</ns2:CollectionPathUri>\n"
+        "\t\t\t\t\t\t<ns2:CopyFiles>\n"
+        "\t\t\t\t\t\t\t<ns2:CollectionFileCopy>Fasta</ns2:CollectionFileCopy>\n"
+        "\t\t\t\t\t\t</ns2:CopyFiles>\n"
+        "\t\t\t\t\t\t<ns2:Readout>Bases</ns2:Readout>\n"
+        "\t\t\t\t\t\t<ns2:MetricsVerbosity>Minimal</ns2:MetricsVerbosity>\n"
+        "\t\t\t\t\t</ns2:OutputOptions>\n"
+        "\t\t\t\t</ns2:Primary>\n"
+        "\t\t\t</ns2:CollectionMetadata>\n"
+        "\t\t</ns2:Collections>\n"
+        "\t\t<ns1:BioSamples>\n"
+        "\t\t\t<ns1:BioSample Description=\"Inst42267-SAT-100pM-2kbLambda-P6C4-Std120_CPS_040315\" "
+                            "MetaType=\"PacBio.Sample\" Name=\"Inst42267-040315-SAT-100pM-2kb-P6C4\" "
+                            "TimeStampedName=\"biosample_2015-08-19T15:39:36.331-07:00\" UniqueId=\"251acf71-9eb0-489e-9dd1-cdbd11432752\" />\n"
+        "\t\t</ns1:BioSamples>\n"
+        "\t</DataSetMetadata>\n"
+        "</SubreadSet>\n"};
+
+    EXPECT_EQ(expected, s.str());
+}
+
+TEST(DataSetIOTest, RelativePathCarriedThroughOk_FromString)
+{
+    const std::string inputXml{
+        "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n"
+        "<pbds:AlignmentSet "
+            "CreatedAt=\"2015-01-27T09:00:01\" "
+            "MetaType=\"PacBio.DataSet.AlignmentSet\" "
+            "Name=\"DataSet_AlignmentSet\" "
+            "Tags=\"barcode moreTags mapping mytags\" "
+            "TimeStampedName=\"biosample_2015-08-19T15:39:36.331-07:00\" "
+            "UniqueId=\"b095d0a3-94b8-4918-b3af-a3f81bbe519c\" "
+            "Version=\"2.3.0\" "
+            "xmlns=\"http://pacificbiosciences.com/PacBioDataModel.xsd\" "
+            "xmlns:pbbase=\"http://pacificbiosciences.com/PacBioBaseDataModel.xsd\" "
+            "xmlns:pbds=\"http://pacificbiosciences.com/PacBioDatasets.xsd\" "
+            "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" "
+            "xsi:schemaLocation=\"http://pacificbiosciences.com/PacBioDataModel.xsd\">\n"
+        "\t<pbbase:ExternalResources>\n"
+        "\t\t<pbbase:ExternalResource "
+                "Description=\"Points to an example Alignments BAM file.\" "
+                "MetaType=\"AlignmentFile.AlignmentBamFile\" "
+                "Name=\"Third Alignments BAM\" "
+                "ResourceId=\"../path/to/resource1.bam\" "
+                "Tags=\"Example\">\n"
+        "\t\t\t<pbbase:FileIndices>\n"
+        "\t\t\t\t<pbbase:FileIndex "
+                    "MetaType=\"PacBio.Index.PacBioIndex\" "
+                    "ResourceId=\"../path/to/resource1.bam.pbi\" />\n"
+        "\t\t\t</pbbase:FileIndices>\n"
+        "\t\t</pbbase:ExternalResource>\n"
+        "\t\t<pbbase:ExternalResource "
+                "Description=\"Points to another example Alignments BAM file, by relative path.\" "
+                "MetaType=\"AlignmentFile.AlignmentBamFile\" "
+                "Name=\"Fourth Alignments BAM\" "
+                "ResourceId=\"../path/to/resource2.bam\" "
+                "Tags=\"Example\">\n"
+        "\t\t\t<pbbase:FileIndices>\n"
+        "\t\t\t\t<pbbase:FileIndex "
+                    "MetaType=\"PacBio.Index.PacBioIndex\" "
+                    "ResourceId=\"../path/to/resource2.bam.pbi\" />\n"
+        "\t\t\t</pbbase:FileIndices>\n"
+        "\t\t</pbbase:ExternalResource>\n"
+        "\t</pbbase:ExternalResources>\n"
+        "</pbds:AlignmentSet>\n"};
+
+    auto dataset = DataSet::FromXml(inputXml);
+
+    std::ostringstream stream;
+    dataset.SaveToStream(stream);
+    auto outputXml = stream.str();
+
+    EXPECT_EQ(inputXml, outputXml);
+}
+
+// Loads relative/relative.xml from the test data directory and checks that the
+// relative resource paths are reported unchanged, both right after loading and
+// after a SaveToStream() / FromXml() round trip.
+TEST(DataSetIOTest, RelativePathCarriedThroughOk_FromFile)
+{
+    DataSet dataset(PbbamTestsConfig::Data_Dir + "/relative/relative.xml");
+    auto resources = dataset.ExternalResources();
+
+    // stop here, with a clear failure, rather than index past the end below
+    ASSERT_GE(resources.Size(), 3u);
+    EXPECT_EQ("./a/test.bam",  resources[0].ResourceId());
+    EXPECT_EQ("./b/test1.bam", resources[1].ResourceId());
+    EXPECT_EQ("./b/test2.bam", resources[2].ResourceId());
+
+    std::ostringstream out;
+    dataset.SaveToStream(out);
+
+    auto newDataset = DataSet::FromXml(out.str());
+    auto newResources = newDataset.ExternalResources();
+
+    // same guard for the round-tripped dataset
+    ASSERT_GE(newResources.Size(), 3u);
+    EXPECT_EQ("./a/test.bam",  newResources[0].ResourceId());
+    EXPECT_EQ("./b/test1.bam", newResources[1].ResourceId());
+    EXPECT_EQ("./b/test2.bam", newResources[2].ResourceId());
+}
+
+TEST(DataSetIOTest, DataSetFromRelativeBamFilename)
+{
+    // cache initial directory and move to location so we can test relatvie filename ok
+    const std::string startingDirectory = FileUtils::CurrentWorkingDirectory();
+
+    const std::string targetDirectory = PbbamTestsConfig::Data_Dir + "/dataset";
+    DataSetIOTests::changeCurrentDirectory(targetDirectory);
+    ASSERT_EQ(targetDirectory, FileUtils::CurrentWorkingDirectory());
+
+    EXPECT_NO_THROW(
+    {
+        const std::string relativeBamFn = "../phi29.bam";
+        const DataSet ds(relativeBamFn);
+        const auto files = ds.BamFiles();
+        EXPECT_EQ(1, files.size());
+    });
+
+    // restore working directory
+    DataSetIOTests::changeCurrentDirectory(startingDirectory);
+}
+
+TEST(DataaSetIOTest, AllFiles)
+{
+    // check  BamFiles only
+    EXPECT_NO_THROW(
+    {
+        const DataSet dataset(PbbamTestsConfig::Data_Dir + "/chunking/chunking.subreadset.xml");
+        const auto bamFiles = dataset.BamFiles();
+        EXPECT_EQ(3, bamFiles.size());
+    });
+
+    // now fetch all files (original BAMs plus PBI files)
+    EXPECT_NO_THROW(
+    {
+        const DataSet dataset(PbbamTestsConfig::Data_Dir + "/chunking/chunking.subreadset.xml");
+        const auto allFiles = dataset.AllFiles();
+        EXPECT_EQ(6, allFiles.size());
+    });
+}
+
+TEST(DataSetIOTest, MetadataDefaultChildrenProperlyOrderedPerXsd)
+{
+    DataSet dataset(DataSet::ALIGNMENT);
+    dataset.CreatedAt("2015-01-27T09:00:01");
+    dataset.MetaType("PacBio.DataSet.AlignmentSet");
+    dataset.Name("DataSet_AlignmentSet");
+    dataset.Tags("barcode moreTags mapping mytags");
+    dataset.TimeStampedName("my_time_stamped_name");
+    dataset.UniqueId("b095d0a3-94b8-4918-b3af-a3f81bbe519c");
+    dataset.Attribute("xmlns",              "http://pacificbiosciences.com/PacBioDatasets.xsd")
+           .Attribute("xmlns:xsi",          "http://www.w3.org/2001/XMLSchema-instance")
+           .Attribute("xsi:schemaLocation", "http://pacificbiosciences.com/PacBioDatasets.xsd");
+
+    ExternalResource ext("Fake.MetaType", "filename");
+    ext.CreatedAt("2015-01-27T09:00:01");
+    ext.TimeStampedName("custom_tsn")
+       .UniqueId("my_uuid");
+    dataset.ExternalResources().Add(ext);
+
+    const auto numRecords = std::to_string(42);
+    const auto totalLength = std::to_string(1000);
+    DataSetMetadata metadata(numRecords, totalLength);
+    dataset.Metadata(metadata);
+
+    std::ostringstream s;
+    dataset.SaveToStream(s);
+
+    const std::string result = s.str();
+    const size_t xmlnsFound = result.find("xmlns=");
+    const size_t xmlnsXsiFound = result.find("xmlns:xsi=");
+    const size_t xsiSchemaLocationFound = result.find("xsi:schemaLocation=");
+    const size_t xmlnsPbbaseFound = result.find("xmlns:pbbase=");
+    const size_t xmlnsPbdsFound = result.find("xmlns:pbds=");
+
+    EXPECT_TRUE(xmlnsFound < xmlnsXsiFound);
+    EXPECT_TRUE(xmlnsXsiFound < xsiSchemaLocationFound);
+    EXPECT_TRUE(xsiSchemaLocationFound < xmlnsPbbaseFound);
+    EXPECT_TRUE(xmlnsPbbaseFound < xmlnsPbdsFound);
+}
+
+// Parses two XML documents that each nest a <pbds:ReferenceSet> element inside
+// <pbds:DataSets> - once under a ReferenceSet root and once under an
+// AlignmentSet root - and only requires that DataSet::FromXml() does not throw.
+// NOTE(review): in both fixtures the nested "<pbds:ReferenceSet>" start tag is
+// closed before its attribute list, so the text from CreatedAt=... up to the
+// next ">" is element content rather than attributes. Confirm whether that is
+// intentional before relying on the nested element's attributes.
+TEST(DataSetIOTest, MakeReferenceSetFromSubdataset)
+{
+    // ReferenceSet with ReferenceSet subdataset
+    const std::string referenceSetXml{
+        "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n"
+        "<pbds:ReferenceSet CreatedAt=\"2015-01-27T09:00:01\" MetaType=\"PacBio.DataSet.ReferenceSet\" "
+                "Name=\"DataSet_ReferenceSet\" Tags=\"barcode moreTags mapping mytags\" "
+                "TimeStampedName=\"my_time_stamped_name\" "
+                "UniqueId=\"b095d0a3-94b8-4918-b3af-a3f81bbe519c\" Version=\"3.0.1\" "
+                "xmlns=\"http://pacificbiosciences.com/PacBioDatasets.xsd\" "
+                "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" "
+                "xsi:schemaLocation=\"http://pacificbiosciences.com/PacBioDatasets.xsd\" "
+                "xmlns:pbbase=\"http://pacificbiosciences.com/PacBioBaseDataModel.xsd\" "
+                "xmlns:pbds=\"http://pacificbiosciences.com/PacBioDatasets.xsd\">\n"
+        "\t<pbbase:ExternalResources>\n"
+        "\t\t<pbbase:ExternalResource MetaType=\"Fake.MetaType\" ResourceId=\"filename\" TimeStampedName=\"custom_tsn\" UniqueId=\"my_uuid\" Version=\"3.0.1\" />\n"
+        "\t</pbbase:ExternalResources>\n"
+        "\t<pbds:DataSets>\n"
+        "\t\t<pbds:ReferenceSet> CreatedAt=\"2015-01-27T09:00:01\" MetaType=\"PacBio.DataSet.ReferenceSet\" "
+                "Name=\"DataSet_ReferenceSet\" Tags=\"barcode moreTags mapping mytags\" "
+                "TimeStampedName=\"my_time_stamped_name\" "
+                "UniqueId=\"b095d0a3-94b8-4918-b3af-a3f81bbe519c\" Version=\"3.0.1\" "
+                "xmlns=\"http://pacificbiosciences.com/PacBioDatasets.xsd\" "
+                "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" "
+                "xsi:schemaLocation=\"http://pacificbiosciences.com/PacBioDatasets.xsd\" "
+                "xmlns:pbbase=\"http://pacificbiosciences.com/PacBioBaseDataModel.xsd\" "
+                "xmlns:pbds=\"http://pacificbiosciences.com/PacBioDatasets.xsd\">\n"
+        "\t\t\t<pbds:DataSetMetadata>\n"
+        "\t\t\t\t<pbds:TotalLength>1000</pbds:TotalLength>\n"
+        "\t\t\t\t<pbds:NumRecords>42</pbds:NumRecords>\n"
+        "\t\t\t</pbds:DataSetMetadata>\n"
+        "\t\t</pbds:ReferenceSet>\n"
+        "\t</pbds:DataSets>\n"
+        "\t<pbds:DataSetMetadata>\n"
+        "\t\t<pbds:TotalLength>1000</pbds:TotalLength>\n"
+        "\t\t<pbds:NumRecords>42</pbds:NumRecords>\n"
+        "\t</pbds:DataSetMetadata>\n"
+        "</pbds:ReferenceSet>\n"};
+
+    // the parsed result is discarded; only "does not throw" is checked
+    EXPECT_NO_THROW(DataSet::FromXml(referenceSetXml));
+
+    // AlignmentSet with ReferenceSet subdataset
+    const std::string alignmentSetXml{
+        "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n"
+        "<pbds:AlignmentSet CreatedAt=\"2015-01-27T09:00:01\" MetaType=\"PacBio.DataSet.AlignmentSet\" "
+                "Name=\"DataSet_AlignmentSet\" Tags=\"barcode moreTags mapping mytags\" "
+                "TimeStampedName=\"my_time_stamped_name\" "
+                "UniqueId=\"b095d0a3-94b8-4918-b3af-a3f81bbe519c\" Version=\"3.0.1\" "
+                "xmlns=\"http://pacificbiosciences.com/PacBioDatasets.xsd\" "
+                "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" "
+                "xsi:schemaLocation=\"http://pacificbiosciences.com/PacBioDatasets.xsd\" "
+                "xmlns:pbbase=\"http://pacificbiosciences.com/PacBioBaseDataModel.xsd\" "
+                "xmlns:pbds=\"http://pacificbiosciences.com/PacBioDatasets.xsd\">\n"
+        "\t<pbbase:ExternalResources>\n"
+        "\t\t<pbbase:ExternalResource MetaType=\"Fake.MetaType\" ResourceId=\"filename\" TimeStampedName=\"custom_tsn\" UniqueId=\"my_uuid\" Version=\"3.0.1\" />\n"
+        "\t</pbbase:ExternalResources>\n"
+        "\t<pbds:DataSets>\n"
+        "\t\t<pbds:ReferenceSet> CreatedAt=\"2015-01-27T09:00:01\" MetaType=\"PacBio.DataSet.ReferenceSet\" "
+                "Name=\"DataSet_ReferenceSet\" Tags=\"barcode moreTags mapping mytags\" "
+                "TimeStampedName=\"my_time_stamped_name\" "
+                "UniqueId=\"b095d0a3-94b8-4918-b3af-a3f81bbe519c\" Version=\"3.0.1\" "
+                "xmlns=\"http://pacificbiosciences.com/PacBioDatasets.xsd\" "
+                "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" "
+                "xsi:schemaLocation=\"http://pacificbiosciences.com/PacBioDatasets.xsd\" "
+                "xmlns:pbbase=\"http://pacificbiosciences.com/PacBioBaseDataModel.xsd\" "
+                "xmlns:pbds=\"http://pacificbiosciences.com/PacBioDatasets.xsd\">\n"
+        "\t\t\t<pbds:DataSetMetadata>\n"
+        "\t\t\t\t<pbds:TotalLength>1000</pbds:TotalLength>\n"
+        "\t\t\t\t<pbds:NumRecords>42</pbds:NumRecords>\n"
+        "\t\t\t</pbds:DataSetMetadata>\n"
+        "\t\t</pbds:ReferenceSet>\n"
+        "\t</pbds:DataSets>\n"
+        "\t<pbds:DataSetMetadata>\n"
+        "\t\t<pbds:TotalLength>1000</pbds:TotalLength>\n"
+        "\t\t<pbds:NumRecords>42</pbds:NumRecords>\n"
+        "\t</pbds:DataSetMetadata>\n"
+        "</pbds:AlignmentSet>\n"};
+
+    EXPECT_NO_THROW(DataSet::FromXml(alignmentSetXml));
+}
+
+TEST(DataSetIOTest, AbsolutePathsForGenericAndDerivedDatasets)
+{
+    DataSet dataset;
+    ReferenceSet referenceDataset;
+
+    dataset.ExternalResources().Add(ExternalResource{"PacBio.SubreadFile.SubreadBamFile", "test.fa"});
+    referenceDataset.ExternalResources().Add(ExternalResource{"PacBio.SubreadFile.SubreadBamFile", "test.fa"});
+
+    const std::string expectedGenericFn{dataset.Path() + "/test.fa"};
+    const std::string expectedReferenceFn{referenceDataset.Path() + "/test.fa"};
+
+    std::ostringstream out;
+    dataset.SaveToStream(out);
+    const std::string genericDatasetXml{out.str()};
+    EXPECT_NE(genericDatasetXml.find(expectedGenericFn), std::string::npos);
+
+    out.str("");
+    referenceDataset.SaveToStream(out);
+    const std::string referenceDatasetXml{out.str()};
+    EXPECT_NE(referenceDatasetXml.find(expectedReferenceFn), std::string::npos);
+}
+
+// clang-format on
diff --git a/tests/src/test_DataSetQuery.cpp b/tests/src/test_DataSetQuery.cpp

new file mode 100644 (file)

index 0000000..e0ada2d
--- /dev/null
+++ b/tests/src/test_DataSetQuery.cpp
@@ -0,0 +1,463 @@
+// Author: Derek Barnett
+
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include <gtest/gtest.h>
+#include <boost/any.hpp>
+
+#include <pbbam/DataSet.h>
+#include <pbbam/EntireFileQuery.h>
+#include <pbbam/GenomicIntervalQuery.h>
+#include <pbbam/Unused.h>
+#include <pbbam/ZmwGroupQuery.h>
+#include <pbbam/ZmwQuery.h>
+
+#include "PbbamTestData.h"
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+namespace DataSetQueryTests {
+// aligned BAM fixtures under Data_Dir, and same-named files under GeneratedData_Dir
+const std::string alignedBamFn{PbbamTestsConfig::Data_Dir + "/aligned.bam"};
+const std::string aligned2BamFn{PbbamTestsConfig::Data_Dir + "/aligned2.bam"};
+const std::string alignedCopyBamFn{PbbamTestsConfig::GeneratedData_Dir + "/aligned.bam"};
+const std::string aligned2CopyBamFn{PbbamTestsConfig::GeneratedData_Dir + "/aligned2.bam"};
+// FOFN under Generated_Dir, plus the three BAM files of the "group" fixture
+const std::string group_fofn{PbbamTestsConfig::Generated_Dir + "/group.fofn"};
+const std::string group_file1{PbbamTestsConfig::Data_Dir + "/group/test1.bam"};
+const std::string group_file2{PbbamTestsConfig::Data_Dir + "/group/test2.bam"};
+const std::string group_file3{PbbamTestsConfig::Data_Dir + "/group/test3.bam"};
+// record names the tests accept for group_file1, group_file2 and group_file3
+const std::vector<std::string> group_file1_names = {
+    "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/24962/0_427"};
+
+const std::vector<std::string> group_file2_names = {
+    "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/14743/2114_2531",
+    "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/14743/2579_4055",
+    "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/14743/4101_5571",
+    "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/14743/5615_6237"};
+
+const std::vector<std::string> group_file3_names = {
+    "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/45203/0_893",
+    "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/45203/0_893",
+    "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/46835/3759_4005",
+    "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/46835/4052_4686",
+    "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/46835/4732_4869",
+    "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/47698/9482_9628",
+    "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/47698/9675_10333",
+    "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/47698/10378_10609",
+    "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/49050/48_1132",
+    "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/49050/48_1132",
+    "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/49194/0_798",
+    "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/49194/845_1541",
+    "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/49521/0_134"};
+
+// Returns true when `name` equals at least one entry of `group`.
+static inline bool InGroup(const std::string& name, const std::vector<std::string>& group)
+{
+    for (auto it = group.cbegin(); it != group.cend(); ++it) {
+        if (*it == name) return true;
+    }
+    return false;
+}
+}  // namespace DataSetQueryTests
+
+TEST(DataSetQueryTest, EntireFileQueryTest)
+{
+    namespace tests = DataSetQueryTests;
+    // single file: querying via DataSet, filename, or BamFile must each yield 4 records
+    EXPECT_NO_THROW({
+        BamFile bam(tests::alignedBamFn);
+        DataSet ds;
+        ds.ExternalResources().Add(bam);
+
+        int fromDataSet = 0;
+        EntireFileQuery dataSetQuery(ds);  // from DataSet object
+        for (const BamRecord& rec : dataSetQuery) {
+            UNUSED(rec);
+            ++fromDataSet;
+        }
+        EXPECT_EQ(4, fromDataSet);
+
+        int fromFilename = 0;
+        EntireFileQuery filenameQuery(tests::alignedBamFn);  // from BAM filename
+        for (const BamRecord& rec : filenameQuery) {
+            UNUSED(rec);
+            ++fromFilename;
+        }
+        EXPECT_EQ(4, fromFilename);
+
+        int fromBamFile = 0;
+        EntireFileQuery bamFileQuery(bam);  // from BamFile object
+        for (const BamRecord& rec : bamFileQuery) {
+            UNUSED(rec);
+            ++fromBamFile;
+        }
+        EXPECT_EQ(4, fromBamFile);
+    });
+
+    // duplicate file attempt: adding the same BAM twice must not double the records
+    EXPECT_NO_THROW({
+        BamFile bam(tests::alignedBamFn);
+
+        DataSet ds;
+        ds.ExternalResources().Add(bam);
+        ds.ExternalResources().Add(bam);
+
+        int numRecords = 0;
+        EntireFileQuery query(ds);
+        for (const BamRecord& rec : query) {
+            UNUSED(rec);
+            ++numRecords;
+        }
+        EXPECT_EQ(4, numRecords);  // same as single
+    });
+
+    // true multi-file dataset
+    EXPECT_NO_THROW({
+        BamFile first(tests::group_file1);   // 1 read
+        BamFile second(tests::group_file2);  // 4 reads
+        BamFile third(tests::group_file3);   // 13 reads
+
+        DataSet ds;
+        ds.ExternalResources().Add(first);
+        ds.ExternalResources().Add(second);
+        ds.ExternalResources().Add(third);
+
+        int seen = 0;
+        EntireFileQuery query(ds);
+        for (const BamRecord& rec : query) {
+
+            // ensure sequential merge of files: record 0 must carry a file1 name,
+            // records 1-4 a file2 name, and everything after that a file3 name
+            const std::vector<std::string>* expectedNames = nullptr;
+            if (seen == 0)
+                expectedNames = &tests::group_file1_names;
+            else if (seen < 5)
+                expectedNames = &tests::group_file2_names;
+            else
+                expectedNames = &tests::group_file3_names;
+            EXPECT_TRUE(tests::InGroup(rec.FullName(), *expectedNames));
+
+            ++seen;
+        }
+        EXPECT_EQ(18, seen);
+    });
+
+    // same as above, from FOFN
+    EXPECT_NO_THROW({
+        int seen = 0;
+
+        DataSet ds(tests::group_fofn);
+        EntireFileQuery query(ds);
+        for (const BamRecord& rec : query) {
+
+            // ensure sequential merge of files: record 0 must carry a file1 name,
+            // records 1-4 a file2 name, and everything after that a file3 name
+            const std::vector<std::string>* expectedNames = nullptr;
+            if (seen == 0)
+                expectedNames = &tests::group_file1_names;
+            else if (seen < 5)
+                expectedNames = &tests::group_file2_names;
+            else
+                expectedNames = &tests::group_file3_names;
+            EXPECT_TRUE(tests::InGroup(rec.FullName(), *expectedNames));
+
+            ++seen;
+        }
+        EXPECT_EQ(18, seen);
+    });
+}
+
+TEST(DataSetQueryTest, GenomicIntervalQueryTest)
+{
+    const std::string refName{"lambda_NEB3011"};
+    namespace tests = DataSetQueryTests;
+    // single file
+    EXPECT_NO_THROW({
+        DataSet ds(tests::alignedBamFn);  // from BAM filename
+
+        // count records
+        int numHits = 0;
+        GenomicInterval region(refName, 5000, 6000);
+        GenomicIntervalQuery query(region, ds);
+        for (const BamRecord& rec : query) {
+            UNUSED(rec);
+            ++numHits;
+        }
+        EXPECT_EQ(2, numHits);
+
+        // adjust interval and pass back in
+        region.Start(9000);
+        region.Stop(9500);
+        query.Interval(region);
+        numHits = 0;
+        for (const BamRecord& rec : query) {
+            UNUSED(rec);
+            ++numHits;
+        }
+        EXPECT_EQ(2, numHits);
+
+        // unknown ref
+        region.Name("does not exist");
+        region.Start(0);
+        region.Stop(100);
+        EXPECT_THROW(query.Interval(region), std::exception);
+        numHits = 0;
+        for (const BamRecord& rec : query) {  // iteration is still safe, just returns no data
+            UNUSED(rec);
+            ++numHits;
+        }
+        EXPECT_EQ(0, numHits);
+
+        // adjust again - make sure we can read a real region after an invalid one
+        region.Name(refName);
+        region.Start(5000);
+        region.Stop(6000);
+        query.Interval(region);
+        numHits = 0;
+        for (const BamRecord& rec : query) {
+            UNUSED(rec);
+            ++numHits;
+        }
+        EXPECT_EQ(2, numHits);
+    });
+
+    // duplicate file
+    EXPECT_NO_THROW({
+        BamFile bam(tests::alignedBamFn);
+
+        DataSet ds;
+        ds.ExternalResources().Add(bam);
+        ds.ExternalResources().Add(bam);
+
+        // count records & also ensure sorted merge
+        int numHits = 0;
+        int lastRefId = 0;
+        int lastStart = 0;
+
+        GenomicInterval region(refName, 5000, 6000);
+        GenomicIntervalQuery query(region, ds);
+        for (const BamRecord& rec : query) {
+
+            EXPECT_GE(rec.ReferenceId(), lastRefId);
+            EXPECT_GE(rec.ReferenceStart(), lastStart);
+
+            lastRefId = rec.ReferenceId();
+            lastStart = rec.ReferenceStart();
+            ++numHits;
+        }
+        EXPECT_EQ(2, numHits);  // same as single file
+
+        // adjust interval and pass back in
+        region.Start(9000);
+        region.Stop(10000);
+        query.Interval(region);
+        numHits = 0;
+        for (const BamRecord& rec : query) {
+            UNUSED(rec);
+            ++numHits;
+        }
+        EXPECT_EQ(2, numHits);  // same as single file
+
+        // unknown ref
+        region.Name("does not exist");
+        region.Start(0);
+        region.Stop(100);
+        EXPECT_THROW(query.Interval(region), std::exception);
+        numHits = 0;
+        for (const BamRecord& rec : query) {  // iteration is still safe, just returns no data
+            UNUSED(rec);
+            ++numHits;
+        }
+        EXPECT_EQ(0, numHits);  // same as single file
+
+        // adjust again - make sure we can read a real region after an invalid one
+        region.Name(refName);
+        region.Start(5000);
+        region.Stop(5300);
+        query.Interval(region);
+        numHits = 0;
+        for (const BamRecord& rec : query) {
+            UNUSED(rec);
+            ++numHits;
+        }
+        EXPECT_EQ(2, numHits);  // same as single file
+    });
+
+    // multi-file BAM (same record content for easy testing, but a different filename / ResourceId)
+    EXPECT_NO_THROW({
+        BamFile bam(tests::alignedBamFn);
+        BamFile bamCopy(tests::alignedCopyBamFn);
+
+        DataSet ds;
+        ds.ExternalResources().Add(bam);
+        ds.ExternalResources().Add(bamCopy);
+
+        // count records & also ensure sorted merge
+        int numHits = 0;
+        int lastRefId = 0;
+        int lastStart = 0;
+
+        GenomicInterval region(refName, 5000, 6000);
+        GenomicIntervalQuery query(region, ds);
+        for (const BamRecord& rec : query) {
+
+            EXPECT_GE(rec.ReferenceId(), lastRefId);
+            EXPECT_GE(rec.ReferenceStart(), lastStart);
+
+            lastRefId = rec.ReferenceId();
+            lastStart = rec.ReferenceStart();
+            ++numHits;
+        }
+        EXPECT_EQ(4, numHits);  // single file * 2
+
+        // adjust interval and pass back in
+        region.Start(9000);
+        region.Stop(10000);
+        query.Interval(region);
+        numHits = 0;
+        for (const BamRecord& rec : query) {
+            UNUSED(rec);
+            ++numHits;
+        }
+        EXPECT_EQ(4, numHits);  // single file * 2
+
+        // unknown ref
+        region.Name("does not exist");
+        region.Start(0);
+        region.Stop(100);
+        EXPECT_THROW(query.Interval(region), std::exception);
+        numHits = 0;
+        for (const BamRecord& rec : query) {  // iteration is still safe, just returns no data
+            UNUSED(rec);
+            ++numHits;
+        }
+        EXPECT_EQ(0, numHits);  // single file * 2
+
+        // adjust again - make sure we can read a real region after an invalid one
+        region.Name(refName);
+        region.Start(5000);
+        region.Stop(5300);
+        query.Interval(region);
+        numHits = 0;
+        for (const BamRecord& rec : query) {
+            UNUSED(rec);
+            ++numHits;
+        }
+        EXPECT_EQ(4, numHits);  // single file * 2
+    });
+}
+
+// TODO: implement me - placeholder that always passes until a real QName query test exists
+TEST(DataSetQueryTest, QNameQueryTest) { SUCCEED(); }
+
+TEST(DataSetQueryTest, ZmwQueryTest)
+{
+    const std::vector<int32_t> zmws{13473, 30983};
+    namespace tests = DataSetQueryTests;
+    // single file
+    EXPECT_NO_THROW({
+        BamFile bam(tests::aligned2BamFn);
+        ASSERT_TRUE(bam.PacBioIndexExists());
+        DataSet ds(bam);
+
+        int numRecords = 0;
+        ZmwQuery query(zmws, ds);
+        for (const BamRecord& rec : query) {
+            const int32_t zmw = rec.HoleNumber();
+            EXPECT_TRUE(zmw == zmws.front() || zmw == zmws.back());
+            ++numRecords;
+        }
+        EXPECT_EQ(4, numRecords);
+    });
+
+    // multi-file
+    EXPECT_NO_THROW({
+        BamFile bam(tests::aligned2BamFn);
+        BamFile bamCopy(tests::aligned2CopyBamFn);
+        ASSERT_TRUE(bam.PacBioIndexExists());
+        ASSERT_TRUE(bamCopy.PacBioIndexExists());
+
+        DataSet ds;
+        ds.ExternalResources().Add(ExternalResource(bam));
+        ds.ExternalResources().Add(ExternalResource(bamCopy));
+
+        int numRecords = 0;
+        ZmwQuery query(zmws, ds);
+        for (const BamRecord& rec : query) {
+            const int32_t zmw = rec.HoleNumber();
+            EXPECT_TRUE(zmw == zmws.front() || zmw == zmws.back());
+            ++numRecords;
+        }
+        EXPECT_EQ(8, numRecords);
+    });
+}
+
+TEST(DataSetQueryTest, ZmwGroupQueryTest)
+{
+    const std::vector<int32_t> zmws{13473, 30983};
+    namespace tests = DataSetQueryTests;
+    // single-file
+    EXPECT_NO_THROW({
+        BamFile bam(tests::aligned2BamFn);
+        ASSERT_TRUE(bam.PacBioIndexExists());
+        DataSet ds(bam);
+
+        int numRecords = 0;
+        ZmwGroupQuery query(zmws, ds);
+        for (const std::vector<BamRecord>& group : query) {
+            // -1 means "no record of this group seen yet"; set from the group's first record
+            int32_t groupZmw = -1;
+            for (const BamRecord& rec : group) {
+                const int32_t zmw = rec.HoleNumber();
+                if (groupZmw == -1) groupZmw = zmw;
+                EXPECT_TRUE(zmw == zmws.front() || zmw == zmws.back());
+                EXPECT_EQ(groupZmw, zmw);
+                ++numRecords;
+            }
+        }
+        EXPECT_EQ(4, numRecords);
+    });
+
+    // multi-file
+    EXPECT_NO_THROW({
+        BamFile bam(tests::aligned2BamFn);
+        BamFile bamCopy(tests::aligned2CopyBamFn);
+        ASSERT_TRUE(bam.PacBioIndexExists());
+        ASSERT_TRUE(bamCopy.PacBioIndexExists());
+
+        DataSet ds;
+        ds.ExternalResources().Add(ExternalResource(bam));
+        ds.ExternalResources().Add(ExternalResource(bamCopy));
+
+        int totalRecords = 0;
+        int groupIndex = 0;
+        ZmwGroupQuery query(zmws, ds);
+        for (const std::vector<BamRecord>& group : query) {
+            // per-group state: record counter and the hole number shared by the group
+            // (-1 until the first record of the group has been seen)
+            int recordsInGroup = 0;
+            int32_t groupZmw = -1;
+            for (const BamRecord& rec : group) {
+                const int32_t zmw = rec.HoleNumber();
+                if (groupZmw == -1) groupZmw = zmw;
+                EXPECT_TRUE(zmw == zmws.front() || zmw == zmws.back());
+                EXPECT_EQ(groupZmw, zmw);
+                ++recordsInGroup;
+                ++totalRecords;
+            }
+
+            // only two groups are expected (a third one fails the test), 4 records each
+            if (groupIndex < 2)
+                EXPECT_EQ(4, recordsInGroup);
+            else
+                EXPECT_TRUE(false);  // should not get here
+            ++groupIndex;
+        }
+        EXPECT_EQ(8, totalRecords);
+    });
+}
diff --git a/tests/src/test_DataSetXsd.cpp b/tests/src/test_DataSetXsd.cpp

new file mode 100644 (file)

index 0000000..2a7e0a7
--- /dev/null
+++ b/tests/src/test_DataSetXsd.cpp
@@ -0,0 +1,140 @@
+// Author: Derek Barnett
+
+#include <sstream>
+#include <string>
+
+#include <gtest/gtest.h>
+
+#include <pbbam/DataSet.h>
+#include <pbbam/DataSetXsd.h>
+
+#include "PbbamTestData.h"
+
+// clang-format off
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+TEST(DataSetXsdTest, DefaultsOk)
+{
+    const std::string baseUri{"http://pacificbiosciences.com/PacBioBaseDataModel.xsd"};
+    const std::string datasetsUri{"http://pacificbiosciences.com/PacBioDatasets.xsd"};
+    NamespaceRegistry registry;
+    const NamespaceInfo& base = registry.Namespace(XsdType::BASE_DATA_MODEL);
+    const NamespaceInfo& datasets = registry.Namespace(XsdType::DATASETS);
+    const NamespaceInfo& fallback = registry.DefaultNamespace();
+    // a freshly constructed registry uses the datasets XSD as its default
+    EXPECT_EQ(XsdType::DATASETS, registry.DefaultXsd());
+    EXPECT_EQ(std::string{"pbds"}, fallback.Name());
+    EXPECT_EQ(datasetsUri, fallback.Uri());
+    // prefix and URI registered for the individual XSD types
+    EXPECT_EQ(std::string{"pbds"}, datasets.Name());
+    EXPECT_EQ(datasetsUri, datasets.Uri());
+    EXPECT_EQ(std::string{"pbbase"}, base.Name());
+    EXPECT_EQ(baseUri, base.Uri());
+}
+
+TEST(DataSetXsdTest, EditDefaultOk)
+{
+    // NOTE(review): DefaultsOk expects DATASETS to be the default of a fresh registry, so this
+    // may not exercise an actual change of the default - confirm whether that is intended.
+    NamespaceRegistry registry;
+    registry.SetDefaultXsd(XsdType::DATASETS);
+    const NamespaceInfo& current = registry.DefaultNamespace();
+    EXPECT_EQ(XsdType::DATASETS, registry.DefaultXsd());
+    EXPECT_EQ(std::string{"pbds"}, current.Name());
+    EXPECT_EQ(std::string{"http://pacificbiosciences.com/PacBioDatasets.xsd"}, current.Uri());
+}
+
+TEST(DataSetXsdTest, EditRegistryOk)
+{
+    // re-registering an XSD type must replace the prefix and URI reported for it
+    NamespaceRegistry registry;
+    registry.Register(XsdType::DATASETS, NamespaceInfo("custom", "http://custom/uri.xsd"));
+
+    const NamespaceInfo& edited = registry.Namespace(XsdType::DATASETS);
+    EXPECT_EQ(std::string{"custom"}, edited.Name());
+    EXPECT_EQ(std::string{"http://custom/uri.xsd"}, edited.Uri());
+}
+
+TEST(DataSetXsdTest, EditDatasetRegistry)
+{
+    // an alignment dataset carrying a single external resource
+    DataSet ds(DataSet::ALIGNMENT);
+    ds.CreatedAt("2015-01-27T09:00:01");
+    ds.MetaType("PacBio.DataSet.AlignmentSet");
+    ds.Name("DataSet_AlignmentSet");
+    ds.Tags("barcode moreTags mapping mytags");
+    ds.TimeStampedName("my_time_stamped_name");
+    ds.UniqueId("b095d0a3-94b8-4918-b3af-a3f81bbe519c");
+    ds.Attribute("xmlns",              "http://pacificbiosciences.com/PacBioDatasets.xsd")
+      .Attribute("xmlns:xsi",          "http://www.w3.org/2001/XMLSchema-instance")
+      .Attribute("xsi:schemaLocation", "http://pacificbiosciences.com/PacBioDatasets.xsd");
+    ExternalResource resource("Fake.MetaType", "filename");
+    resource.CreatedAt("2015-01-27T09:00:01");
+    resource.TimeStampedName("custom_tsn")
+            .UniqueId("my_uuid");
+    ds.ExternalResources().Add(resource);
+
+    // re-register the base-data-model namespace; the saved XML must then contain
+    // "custom:ExternalResource"
+    ds.Namespaces().Register(XsdType::BASE_DATA_MODEL, NamespaceInfo("custom", "http://custom/uri.xsd"));
+    std::ostringstream xml;
+    ds.SaveToStream(xml);
+    EXPECT_NE(std::string::npos, xml.str().find("custom:ExternalResource"));
+}
+
+TEST(DataSetXsdTest, ElementRegistryOk)
+{
+    { // default namespaces
+        DataSet ds;
+        // append child elements by bare label, i.e. without any namespace prefix
+        DataSetMetadata& meta = ds.Metadata();
+        meta.AddChild(internal::DataSetElement("SummaryStats"));
+        meta.AddChild(internal::DataSetElement("CopyFiles"));
+        meta.AddChild(internal::DataSetElement("BioSamples"));
+        meta.AddChild(internal::DataSetElement("AutomationParameters"));
+
+        std::ostringstream xml;
+        ds.SaveToStream(xml);
+        const std::string saved = xml.str();
+        const auto hasElement = [&saved](const char* qualifiedName) {
+            return saved.find(qualifiedName) != std::string::npos;
+        };
+        // check that default namespace is propagated properly
+        EXPECT_TRUE(hasElement("pbds:SummaryStats"));
+        EXPECT_TRUE(hasElement("pbmeta:CopyFiles"));
+        EXPECT_TRUE(hasElement("pbsample:BioSamples"));
+        EXPECT_TRUE(hasElement("pbbase:AutomationParameters"));
+    }
+
+    { // custom namespaces
+        DataSet ds;
+        // setup custom namespaces
+        ds.Namespaces().Register(XsdType::BASE_DATA_MODEL,     NamespaceInfo("custom_base",   "http://custom/base.xsd"));
+        ds.Namespaces().Register(XsdType::COLLECTION_METADATA, NamespaceInfo("custom_meta",   "http://custom/meta.xsd"));
+        ds.Namespaces().Register(XsdType::DATASETS,            NamespaceInfo("custom_ds",     "http://custom/datasets.xsd"));
+        ds.Namespaces().Register(XsdType::SAMPLE_INFO,         NamespaceInfo("custom_sample", "http://custom/base.xsd"));
+
+        // append child elements by bare label, i.e. without any namespace prefix
+        DataSetMetadata& meta = ds.Metadata();
+        meta.AddChild(internal::DataSetElement("SummaryStats"));
+        meta.AddChild(internal::DataSetElement("CopyFiles"));
+        meta.AddChild(internal::DataSetElement("BioSamples"));
+        meta.AddChild(internal::DataSetElement("AutomationParameters"));
+
+        std::ostringstream xml;
+        ds.SaveToStream(xml);
+        const std::string saved = xml.str();
+        const auto hasElement = [&saved](const char* qualifiedName) {
+            return saved.find(qualifiedName) != std::string::npos;
+        };
+        // check that custom namespace is propagated properly
+        EXPECT_TRUE(hasElement("custom_ds:SummaryStats"));
+        EXPECT_TRUE(hasElement("custom_meta:CopyFiles"));
+        EXPECT_TRUE(hasElement("custom_sample:BioSamples"));
+        EXPECT_TRUE(hasElement("custom_base:AutomationParameters"));
+    }
+}
+
+// clang-format on
diff --git a/tests/src/test_EndToEnd.cpp b/tests/src/test_EndToEnd.cpp

new file mode 100644 (file)

index 0000000..f47dc5a
--- /dev/null
+++ b/tests/src/test_EndToEnd.cpp
@@ -0,0 +1,206 @@
+// Author: Derek Barnett
+
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include <gtest/gtest.h>
+#include <htslib/sam.h>
+
+#include "PbbamTestData.h"
+
+#include <pbbam/BamFile.h>
+#include <pbbam/BamWriter.h>
+#include <pbbam/EntireFileQuery.h>
+#include <pbbam/Unused.h>
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+namespace EndToEndTests {
+// Deleter for std::unique_ptr<bam1_t>: destroys the htslib record.
+struct Bam1Deleter
+{
+    void operator()(bam1_t* b) const
+    {
+        if (b) bam_destroy1(b);
+    }
+};
+
+// Deleter for std::unique_ptr<samFile>: closes the htslib file handle.
+struct SamFileDeleter
+{
+    void operator()(samFile* file) const
+    {
+        if (file) sam_close(file);
+    }
+};
+
+// Deleter for std::unique_ptr<bam_hdr_t>: destroys the htslib header.
+struct BamHdrDeleter
+{
+    void operator()(bam_hdr_t* hdr) const
+    {
+        if (hdr) bam_hdr_destroy(hdr);
+    }
+};
+
+const std::string inputBamFn = PbbamTestsConfig::Data_Dir + "/aligned.bam";
+const std::string goldStandardSamFn = PbbamTestsConfig::Data_Dir + "/aligned.sam";
+const std::string generatedBamFn = PbbamTestsConfig::GeneratedData_Dir + "/generated.bam";
+const std::string generatedSamFn = PbbamTestsConfig::GeneratedData_Dir + "/generated.sam";
+const std::vector<std::string> generatedFiles = {generatedBamFn, generatedSamFn};
+
+// Runs "<Bam2Sam> <args> <bamFn> > <samFn>" via std::system() and returns its result.
+static inline int RunBam2Sam(const std::string& bamFn, const std::string& samFn,
+                             const std::string& args = std::string())
+{
+    std::ostringstream s;
+    s << PbbamTestsConfig::Bam2Sam << " " << args << " " << bamFn << " > " << samFn;
+    return std::system(s.str().c_str());
+}
+
+// Runs "diff <fn1> <fn2>" via std::system() and returns its result.
+static inline int RunDiff(const std::string& fn1, const std::string& fn2)
+{
+    const std::string cmd = "diff " + fn1 + " " + fn2;
+    return std::system(cmd.c_str());
+}
+
+// Deletes every listed file; the result of each std::remove() call is ignored.
+static inline void Remove(const std::vector<std::string>& files)
+{
+    for (const auto& fn : files) std::remove(fn.c_str());
+}
+
+// Compares the generated BAM (converted to SAM) with the gold standard, then deletes both files.
+static inline void CheckGeneratedOutput()
+{
+    // convert to sam & diff against gold standard
+    const int convertRet = RunBam2Sam(generatedBamFn, generatedSamFn);
+    const int diffRet = RunDiff(goldStandardSamFn, generatedSamFn);
+    EXPECT_EQ(0, convertRet);
+    EXPECT_EQ(0, diffRet);
+    // clean up
+    Remove(generatedFiles);
+}
+
+}  // namespace EndToEndTests
+
+// sanity check for rest of tests below
+TEST(EndToEndTest, ReadAndWrite_PureHtslib)
+{
+    {  // scoped to force flush & close before conversion/diff
+
+        // open files
+        std::unique_ptr<samFile, EndToEndTests::SamFileDeleter> inWrapper(
+            sam_open(EndToEndTests::inputBamFn.c_str(), "r"));
+        samFile* in = inWrapper.get();
+        ASSERT_TRUE(in);
+
+        std::unique_ptr<samFile, EndToEndTests::SamFileDeleter> outWrapper(
+            sam_open(EndToEndTests::generatedBamFn.c_str(), "wb"));
+        samFile* out = outWrapper.get();
+        ASSERT_TRUE(out);
+
+        // fetch & write header
+        std::unique_ptr<bam_hdr_t, EndToEndTests::BamHdrDeleter> headerWrapper(sam_hdr_read(in));
+        bam_hdr_t* hdr = headerWrapper.get();
+        ASSERT_TRUE(hdr);
+        ASSERT_EQ(0, sam_hdr_write(out, hdr));
+
+        // fetch & write records
+        std::unique_ptr<bam1_t, EndToEndTests::Bam1Deleter> record(bam_init1());
+        bam1_t* b = record.get();
+        ASSERT_TRUE(b);
+
+        // sam_read1() returns >= 0 per record, -1 at end of stream and < -1 on error;
+        // sam_write1() returns a negative value when the record could not be written
+        int readRet = 0;
+        while ((readRet = sam_read1(in, hdr, b)) >= 0) {
+            ASSERT_GE(sam_write1(out, hdr, b), 0);
+        }
+        EXPECT_EQ(-1, readRet);  // the loop must end on a clean EOF, not on a read error
+    }
+
+    EndToEndTests::CheckGeneratedOutput();
+}
+
+TEST(EndToEndTest, ReadAndWrite_SingleThread)
+{
+    // the writer is scoped to the block below, so it is destroyed before the output is checked
+    EXPECT_NO_THROW({
+        BamFile input(EndToEndTests::inputBamFn);
+
+        // last argument is 1 here - presumably the thread count, going by the test name
+        BamWriter output(EndToEndTests::generatedBamFn, input.Header(),
+                         BamWriter::DefaultCompression, 1);
+
+        // copy every record of the input
+        EntireFileQuery allRecords(input);
+        for (const BamRecord& rec : allRecords) {
+            output.Write(rec);
+        }
+    });
+    EndToEndTests::CheckGeneratedOutput();
+}
+
+TEST(EndToEndTest, ReadAndWrite_APIDefaultThreadCount)
+{
+    // the writer is scoped to the block below, so it is destroyed before the output is checked
+    EXPECT_NO_THROW({
+        BamFile input(EndToEndTests::inputBamFn);
+
+        // only filename and header are given; the remaining arguments keep their defaults
+        BamWriter output(EndToEndTests::generatedBamFn, input.Header());
+
+        // copy every record of the input
+        EntireFileQuery allRecords(input);
+        for (const BamRecord& rec : allRecords) {
+            output.Write(rec);
+        }
+    });
+    EndToEndTests::CheckGeneratedOutput();
+}
+
+TEST(EndToEndTest, ReadAndWrite_SystemDefaultThreadCount)
+{
+    // the writer is scoped to the block below, so it is destroyed before the output is checked
+    EXPECT_NO_THROW({
+        BamFile input(EndToEndTests::inputBamFn);
+
+        // last argument is 0 here - presumably "use the system default", going by the test name
+        BamWriter output(EndToEndTests::generatedBamFn, input.Header(),
+                         BamWriter::DefaultCompression, 0);
+
+        // copy every record of the input
+        EntireFileQuery allRecords(input);
+        for (const BamRecord& rec : allRecords) {
+            output.Write(rec);
+        }
+    });
+    EndToEndTests::CheckGeneratedOutput();
+}
+
+TEST(EndToEndTest, ReadAndWrite_UserThreadCount)
+{
+    // the writer is scoped to the block below, so it is destroyed before the output is checked
+    EXPECT_NO_THROW({
+        BamFile input(EndToEndTests::inputBamFn);
+
+        // last argument is 3 here - presumably the thread count, going by the test name
+        BamWriter output(EndToEndTests::generatedBamFn, input.Header(),
+                         BamWriter::DefaultCompression, 3);
+
+        // copy every record of the input
+        EntireFileQuery allRecords(input);
+        for (const BamRecord& rec : allRecords) {
+            output.Write(rec);
+        }
+    });
+    EndToEndTests::CheckGeneratedOutput();
+}
diff --git a/tests/src/test_EntireFileQuery.cpp b/tests/src/test_EntireFileQuery.cpp

new file mode 100644 (file)

index 0000000..f2f014a
--- /dev/null
+++ b/tests/src/test_EntireFileQuery.cpp
@@ -0,0 +1,106 @@
+// Author: Derek Barnett
+
+#include <string>
+
+#include <gtest/gtest.h>
+
+#include "PbbamTestData.h"
+
+#include <pbbam/BamWriter.h>
+#include <pbbam/EntireFileQuery.h>
+#include <pbbam/Unused.h>
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+namespace EntireFileQueryTests {
+
+// BAM file opened by the EntireFileQuery tests
+const std::string inputBamFn{PbbamTestsConfig::Data_Dir + "/aligned.bam"};
+}  // namespace EntireFileQueryTests
+
+TEST(EntireFileQueryTest, CountRecords)
+{
+    // iterating with a const record reference must visit 4 records without throwing
+    EXPECT_NO_THROW({
+        BamFile input(EntireFileQueryTests::inputBamFn);
+        EntireFileQuery allRecords(input);
+        int numRecords = 0;
+        for (const BamRecord& rec : allRecords) {
+            UNUSED(rec);
+            ++numRecords;
+        }
+        EXPECT_EQ(4, numRecords);
+    });
+}
+
+TEST(EntireFileQueryTest, NonConstBamRecord)
+{
+    // iterating with a non-const record reference must visit 4 records without throwing
+    EXPECT_NO_THROW({
+        BamFile input(EntireFileQueryTests::inputBamFn);
+        EntireFileQuery allRecords(input);
+        int numRecords = 0;
+        for (BamRecord& rec : allRecords) {
+            UNUSED(rec);
+            ++numRecords;
+        }
+        EXPECT_EQ(4, numRecords);
+    });
+}
+
+TEST(BamRecordTest, HandlesDeletionOK)
+{
+    // regression input: this file raised no error in Debug mode, but segfaulted
+    // in Release mode when the aligned qualities were accessed
+
+    const std::string segfaultBamFn = PbbamTestsConfig::Data_Dir + "/segfault.bam";
+    BamFile input(segfaultBamFn);
+    EntireFileQuery allRecords(input);
+    int numRecords = 0;
+    for (const BamRecord& rec : allRecords) {
+
+        const auto raw = rec.Qualities(Orientation::GENOMIC, false);
+        const auto aligned = rec.Qualities(Orientation::GENOMIC, true);
+
+        const std::string expectedRaw{
+            "IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII"
+            "IIIIIIIIIIIII"};
+
+        // 1=1D98=
+        const std::string expectedAligned{
+            "I!"
+            "IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII"
+            "IIIIIIIIIIII"};
+
+        EXPECT_EQ(expectedRaw, raw.Fastq());
+        EXPECT_EQ(expectedAligned, aligned.Fastq());
+
+        ++numRecords;
+    }
+
+    EXPECT_EQ(1, numRecords);
+}
+
+TEST(BamRecordTest, ReferenceName)
+{
+    {  // first record of aligned.bam is mapped: its reference name must be available
+        const std::string mappedBamFn = PbbamTestsConfig::Data_Dir + "/aligned.bam";
+        BamFile input(mappedBamFn);
+        EntireFileQuery query(input);
+        auto it = query.begin();
+        auto& first = *it;
+        ASSERT_TRUE(first.IsMapped());
+        EXPECT_EQ("lambda_NEB3011", first.ReferenceName());
+    }
+
+    {  // unmapped records have no reference name, should throw
+        const std::string unmappedBamFn = PbbamTestsConfig::Data_Dir + "/unmap1.bam";
+        BamFile input(unmappedBamFn);
+        EntireFileQuery query(input);
+        auto it = query.begin();
+        auto& first = *it;
+        ASSERT_FALSE(first.IsMapped());
+        EXPECT_THROW(first.ReferenceName(), std::runtime_error);
+    }
+}
diff --git a/tests/src/test_FaiIndex.cpp b/tests/src/test_FaiIndex.cpp

new file mode 100644 (file)

index 0000000..2d32eb0
--- /dev/null
+++ b/tests/src/test_FaiIndex.cpp
@@ -0,0 +1,111 @@
+// Author: Derek Barnett
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include <pbbam/FaiIndex.h>
+
+#include "PbbamTestData.h"
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+namespace FaiIndexTests {
+// Paths of the FASTA/FASTQ fixtures and their .fai companions under the test data directory.
+const std::string simpleFastaFn = PbbamTestsConfig::Data_Dir + "/fastx/simple.fa";
+const std::string simpleFastaFaiFn = PbbamTestsConfig::Data_Dir + "/fastx/simple.fa.fai";
+const std::string simpleFastqFn = PbbamTestsConfig::Data_Dir + "/fastx/simple.fq";
+const std::string simpleFastqFaiFn = PbbamTestsConfig::Data_Dir + "/fastx/simple.fq.fai";
+
+}  // namespace FaiIndexTests
+
+TEST(FaiIndexTest, LoadsFromFastaFaiFile)
+{
+    const std::vector<std::string> expectedNames{"seq1", "seq2", "seq3", "seq4",
+                                                 "seq5", "seq6", "seq7", "seq8"};
+    const FaiEntry expectedEntry{63, 146, 63, 64};
+
+    const FaiIndex index{FaiIndexTests::simpleFastaFaiFn};
+    const auto& names = index.Names();
+    ASSERT_EQ(8, names.size());
+    EXPECT_TRUE(std::equal(expectedNames.cbegin(), expectedNames.cend(), names.cbegin()));
+    EXPECT_EQ(expectedEntry, index.Entry("seq3"));
+}
+
+TEST(FaiIndexTest, LoadsFromFastqFaiFile)
+{
+    const std::vector<std::string> expectedNames{"seq1", "seq2", "seq3", "seq4",
+                                                 "seq5", "seq6", "seq7", "seq8"};
+    const FaiEntry expectedEntry{63, 278, 63, 64, 344};
+
+    const FaiIndex index{FaiIndexTests::simpleFastqFaiFn};
+    const auto& names = index.Names();
+    ASSERT_EQ(8, names.size());
+    EXPECT_TRUE(std::equal(expectedNames.cbegin(), expectedNames.cend(), names.cbegin()));
+    EXPECT_EQ(expectedEntry, index.Entry("seq3"));
+}
+
+TEST(FaiIndexTest, SaveFastaIndexToStream)
+{
+    // clang-format off
+    const std::string expected
+    {
+        "seq1\t63\t6\t63\t64\n"
+        "seq2\t63\t76\t63\t64\n"
+        "seq3\t63\t146\t63\t64\n"
+        "seq4\t63\t216\t63\t64\n"
+        "seq5\t63\t286\t63\t64\n"
+        "seq6\t63\t356\t63\t64\n"
+        "seq7\t63\t426\t63\t64\n"
+        "seq8\t63\t496\t63\t64\n"
+    };
+    // clang-format on
+
+    FaiIndex index;
+    index.Add("seq1", {63, 6, 63, 64});
+    index.Add("seq2", {63, 76, 63, 64});
+    index.Add("seq3", {63, 146, 63, 64});
+    index.Add("seq4", {63, 216, 63, 64});
+    index.Add("seq5", {63, 286, 63, 64});
+    index.Add("seq6", {63, 356, 63, 64});
+    index.Add("seq7", {63, 426, 63, 64});
+    index.Add("seq8", {63, 496, 63, 64});
+
+    std::ostringstream out;
+    index.Save(out);
+    EXPECT_EQ(expected, out.str());
+}
+
+TEST(FaiIndexTest, SaveFastqIndexToStream)
+{
+    // clang-format off
+    const std::string expected
+    {
+        "seq1\t63\t6\t63\t64\t72\n"
+        "seq2\t63\t142\t63\t64\t208\n"
+        "seq3\t63\t278\t63\t64\t344\n"
+        "seq4\t63\t414\t63\t64\t480\n"
+        "seq5\t63\t550\t63\t64\t616\n"
+        "seq6\t63\t686\t63\t64\t752\n"
+        "seq7\t63\t822\t63\t64\t888\n"
+        "seq8\t63\t958\t63\t64\t1024\n"
+    };
+    // clang-format on
+
+    FaiIndex index;
+    index.Add("seq1", {63, 6, 63, 64, 72});
+    index.Add("seq2", {63, 142, 63, 64, 208});
+    index.Add("seq3", {63, 278, 63, 64, 344});
+    index.Add("seq4", {63, 414, 63, 64, 480});
+    index.Add("seq5", {63, 550, 63, 64, 616});
+    index.Add("seq6", {63, 686, 63, 64, 752});
+    index.Add("seq7", {63, 822, 63, 64, 888});
+    index.Add("seq8", {63, 958, 63, 64, 1024});
+
+    std::ostringstream out;
+    index.Save(out);
+    EXPECT_EQ(expected, out.str());
+}
diff --git a/tests/src/test_FaiZmwChunker.cpp b/tests/src/test_FaiZmwChunker.cpp

new file mode 100644 (file)

index 0000000..234a410
--- /dev/null
+++ b/tests/src/test_FaiZmwChunker.cpp
@@ -0,0 +1,101 @@
+// Author: Derek Barnett
+
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include "../../src/FaiZmwChunker.h"
+
+#include "FastxTests.h"
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+TEST(FaiZmwChunkerTest, empty_input_zmws_yields_no_chunks)
+{
+    FaiIndex emptyFai;  // default-constructed, nothing added
+    FaiZmwChunker splitter{emptyFai, 5};
+    EXPECT_EQ(0, splitter.NumChunks());
+}
+
+TEST(FaiZmwChunkerTest, throws_if_requested_num_chunks_is_zero)
+{
+    FaiIndex emptyFai;  // default-constructed, nothing added
+    EXPECT_THROW(FaiZmwChunker splitter(emptyFai, 0), std::runtime_error);
+}
+
+TEST(FaiZmwChunkerTest, standard_chunking)
+{
+    {
+        FaiZmwChunker chunker{FastxTests::chunkingFastaFaiFn, 5};
+        ASSERT_EQ(5, chunker.NumChunks());
+
+        // 7-7-6-6-6
+
+        EXPECT_EQ(7, chunker.Chunk(0).NumZmws);
+        EXPECT_EQ(7, chunker.Chunk(1).NumZmws);
+        EXPECT_EQ(6, chunker.Chunk(2).NumZmws);
+        EXPECT_EQ(6, chunker.Chunk(3).NumZmws);
+        EXPECT_EQ(6, chunker.Chunk(4).NumZmws);
+
+        EXPECT_EQ(7, chunker.Chunk(0).NumRecords);
+        EXPECT_EQ(7, chunker.Chunk(1).NumRecords);
+        EXPECT_EQ(6, chunker.Chunk(2).NumRecords);
+        EXPECT_EQ(6, chunker.Chunk(3).NumRecords);
+        EXPECT_EQ(9, chunker.Chunk(4).NumRecords);  // 3 records share zmw
+
+        EXPECT_EQ("seq/0", chunker.Chunk(0).FirstSeqName);
+        EXPECT_EQ("seq/7", chunker.Chunk(1).FirstSeqName);
+        EXPECT_EQ("seq/14", chunker.Chunk(2).FirstSeqName);
+        EXPECT_EQ("seq/20", chunker.Chunk(3).FirstSeqName);
+        EXPECT_EQ("seq/50", chunker.Chunk(4).FirstSeqName);
+
+        EXPECT_EQ(7, chunker.Chunk(0).FirstSeqOffset);
+        EXPECT_EQ(91, chunker.Chunk(1).FirstSeqOffset);
+        EXPECT_EQ(180, chunker.Chunk(2).FirstSeqOffset);
+        EXPECT_EQ(258, chunker.Chunk(3).FirstSeqOffset);
+        EXPECT_EQ(336, chunker.Chunk(4).FirstSeqOffset);
+    }
+    {
+        FaiZmwChunker chunker{FastxTests::chunkingFastaFaiFn, 3};
+        ASSERT_EQ(3, chunker.NumChunks());
+
+        // 11-11-10
+
+        EXPECT_EQ(11, chunker.Chunk(0).NumZmws);
+        EXPECT_EQ(11, chunker.Chunk(1).NumZmws);
+        EXPECT_EQ(10, chunker.Chunk(2).NumZmws);
+
+        EXPECT_EQ(11, chunker.Chunk(0).NumRecords);
+        EXPECT_EQ(11, chunker.Chunk(1).NumRecords);
+        EXPECT_EQ(13, chunker.Chunk(2).NumRecords);  // 3 records share zmw
+
+        EXPECT_EQ("seq/0", chunker.Chunk(0).FirstSeqName);
+        EXPECT_EQ("seq/11", chunker.Chunk(1).FirstSeqName);
+        EXPECT_EQ("seq/30", chunker.Chunk(2).FirstSeqName);
+
+        EXPECT_EQ(7, chunker.Chunk(0).FirstSeqOffset);
+        EXPECT_EQ(141, chunker.Chunk(1).FirstSeqOffset);
+        EXPECT_EQ(284, chunker.Chunk(2).FirstSeqOffset);
+    }
+}
+
+TEST(FaiZmwChunkerTest, one_chunk_contains_all_records)
+{
+    // With a single requested chunk, chunk 0 must hold everything: 32 ZMWs, 35 records.
+    FaiZmwChunker splitter{FastxTests::chunkingFastaFaiFn, 1};
+    ASSERT_EQ(1, splitter.NumChunks());
+
+    const auto& whole = splitter.Chunk(0);
+    EXPECT_EQ(32, whole.NumZmws);
+    EXPECT_EQ(35, whole.NumRecords);
+    EXPECT_EQ("seq/0", whole.FirstSeqName);
+    EXPECT_EQ(7, whole.FirstSeqOffset);
+}
+
+TEST(FaiZmwChunkerTest, one_zmw_per_chunk_if_requested_count_is_larger_than_input)
+{
+    // 50 chunks are requested, but only 32 are expected: one per unique ZMW.
+    FaiZmwChunker splitter{FastxTests::chunkingFastaFaiFn, 50};
+    ASSERT_EQ(32, splitter.NumChunks());
+}
+\ No newline at end of file
diff --git a/tests/src/test_FastaReader.cpp b/tests/src/test_FastaReader.cpp

new file mode 100644 (file)

index 0000000..7fb1ab3
--- /dev/null
+++ b/tests/src/test_FastaReader.cpp
@@ -0,0 +1,195 @@
+// Author: Derek Barnett
+
+#include <cstddef>
+
+#include <gtest/gtest.h>
+
+#include "PbbamTestData.h"
+
+#include <pbbam/EntireFileQuery.h>
+#include <pbbam/FastaReader.h>
+#include <pbbam/FastaSequence.h>
+#include <pbbam/FastaSequenceQuery.h>
+#include <pbbam/FastaWriter.h>
+#include <pbbam/Unused.h>
+#include <boost/algorithm/string.hpp>
+
+#include "FastxTests.h"
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+namespace FastaReaderTests {
+
+// Compares name and bases of 'seq' with the index-th entry of FastxTests::ExpectedFasta.
+void CheckFastaSequence(const size_t index, const FastaSequence& seq)
+{
+    SCOPED_TRACE("checking FASTA seq:" + std::to_string(index));
+    EXPECT_EQ(FastxTests::ExpectedFasta.at(index).Name(), seq.Name());
+    EXPECT_EQ(FastxTests::ExpectedFasta.at(index).Bases(), seq.Bases());
+}
+
+}  // namespace FastaReaderTests
+
+TEST(FastaReaderTest, throws_on_empty_filename)
+{
+    EXPECT_THROW(FastaReader{""}, std::runtime_error);  // construction alone must fail
+}
+
+TEST(FastaReaderTest, throws_on_invalid_extension)
+{
+    EXPECT_THROW(FastaReader{"wrong.ext"}, std::runtime_error);  // construction alone must fail
+}
+
+TEST(FastaReaderTest, can_open_text_fasta)
+{
+    // Constructing a reader on the text fixture must not throw.
+    EXPECT_NO_THROW(FastaReader{FastxTests::simpleFastaFn});
+}
+
+TEST(FastaReaderTest, can_open_gzip_fasta)
+{
+    // Constructing a reader on the gzip fixture must not throw.
+    EXPECT_NO_THROW(FastaReader{FastxTests::simpleFastaGzipFn});
+}
+
+TEST(FastaReaderTest, can_open_bgzf_fasta)
+{
+    // Constructing a reader on the BGZF fixture must not throw.
+    EXPECT_NO_THROW(FastaReader{FastxTests::simpleFastaBgzfFn});
+}
+
+TEST(FastaReaderTest, can_iterate_manually_on_text_fasta)
+{
+    // Drain the reader record by record, validating each one against the expected table.
+    FastaReader reader{FastxTests::simpleFastaFn};
+    FastaSequence current;
+    size_t numSeen = 0;
+    for (; reader.GetNext(current); ++numSeen) {
+        FastaReaderTests::CheckFastaSequence(numSeen, current);
+    }
+
+    // the number of records read must match the size of the table
+    EXPECT_EQ(FastxTests::ExpectedFasta.size(), numSeen);
+}
+
+TEST(FastaReaderTest, can_iterate_manually_on_gzip_fasta)
+{
+    // Drain the reader record by record, validating each one against the expected table.
+    FastaReader reader{FastxTests::simpleFastaGzipFn};
+    FastaSequence current;
+    size_t numSeen = 0;
+    for (; reader.GetNext(current); ++numSeen) {
+        FastaReaderTests::CheckFastaSequence(numSeen, current);
+    }
+
+    // the number of records read must match the size of the table
+    EXPECT_EQ(FastxTests::ExpectedFasta.size(), numSeen);
+}
+
+TEST(FastaReaderTest, can_iterate_manually_on_bgzf_fasta)
+{
+    // Drain the reader record by record, validating each one against the expected table.
+    FastaReader reader{FastxTests::simpleFastaBgzfFn};
+    FastaSequence current;
+    size_t numSeen = 0;
+    for (; reader.GetNext(current); ++numSeen) {
+        FastaReaderTests::CheckFastaSequence(numSeen, current);
+    }
+
+    // the number of records read must match the size of the table
+    EXPECT_EQ(FastxTests::ExpectedFasta.size(), numSeen);
+}
+
+TEST(FastaReaderTest, can_iterate_using_range_for_on_text_fasta)
+{
+    // The reader itself is the range of a range-based for loop.
+    FastaReader reader{FastxTests::simpleFastaFn};
+
+    size_t position = 0;
+    for (const auto& record : reader) {
+        FastaReaderTests::CheckFastaSequence(position, record);
+        ++position;
+    }
+    EXPECT_EQ(FastxTests::ExpectedFasta.size(), position);
+}
+
+TEST(FastaReaderTest, can_iterate_using_range_for_on_gzip_fasta)
+{
+    // The reader itself is the range of a range-based for loop.
+    FastaReader reader{FastxTests::simpleFastaGzipFn};
+
+    size_t position = 0;
+    for (const auto& record : reader) {
+        FastaReaderTests::CheckFastaSequence(position, record);
+        ++position;
+    }
+    EXPECT_EQ(FastxTests::ExpectedFasta.size(), position);
+}
+
+TEST(FastaReaderTest, can_iterate_using_range_for_on_bgzf_fasta)
+{
+    // The reader itself is the range of a range-based for loop.
+    FastaReader reader{FastxTests::simpleFastaBgzfFn};
+
+    size_t position = 0;
+    for (const auto& record : reader) {
+        FastaReaderTests::CheckFastaSequence(position, record);
+        ++position;
+    }
+    EXPECT_EQ(FastxTests::ExpectedFasta.size(), position);
+}
+
+TEST(FastaReaderTest, can_read_all_from_text_fasta)
+{
+    // ReadAll() hands back every sequence at once; compare element by element.
+    const auto loaded = FastaReader::ReadAll(FastxTests::simpleFastaFn);
+
+    for (size_t i = 0; i < loaded.size(); ++i) {
+        FastaReaderTests::CheckFastaSequence(i, loaded[i]);
+    }
+
+    EXPECT_EQ(FastxTests::ExpectedFasta.size(), loaded.size());
+}
+
+TEST(FastaReaderTest, can_read_all_from_gzip_fasta)
+{
+    // ReadAll() hands back every sequence at once; compare element by element.
+    const auto loaded = FastaReader::ReadAll(FastxTests::simpleFastaGzipFn);
+
+    for (size_t i = 0; i < loaded.size(); ++i) {
+        FastaReaderTests::CheckFastaSequence(i, loaded[i]);
+    }
+
+    EXPECT_EQ(FastxTests::ExpectedFasta.size(), loaded.size());
+}
+
+TEST(FastaReaderTest, can_read_all_from_bgzf_fasta)
+{
+    // ReadAll() hands back every sequence at once; compare element by element.
+    const auto loaded = FastaReader::ReadAll(FastxTests::simpleFastaBgzfFn);
+
+    for (size_t i = 0; i < loaded.size(); ++i) {
+        FastaReaderTests::CheckFastaSequence(i, loaded[i]);
+    }
+
+    EXPECT_EQ(FastxTests::ExpectedFasta.size(), loaded.size());
+}
+
+TEST(FastaReaderTest, can_handle_windows_style_newlines)
+{
+    const std::string fn = FastxTests::fastxDataDir + "/windows_formatted.fasta";
+
+    {
+        size_t count = 0;
+        FastaReader reader{fn};
+        FastaSequence seq;
+        while (reader.GetNext(seq)) {
+            ++count;
+            bool endOK = (boost::algorithm::ends_with(seq.Name(), "5p") ||
+                          boost::algorithm::ends_with(seq.Name(), "3p"));
+            EXPECT_TRUE(endOK);
+        }
+        EXPECT_EQ(7, count);  // 7 primers in total
+    }
+}
diff --git a/tests/src/test_FastaSequence.cpp b/tests/src/test_FastaSequence.cpp

new file mode 100644 (file)

index 0000000..ef7207b
--- /dev/null
+++ b/tests/src/test_FastaSequence.cpp
@@ -0,0 +1,31 @@
+// Author: Derek Barnett
+
+#include <cstddef>
+
+#include <gtest/gtest.h>
+
+#include "PbbamTestData.h"
+
+#include <pbbam/FastaSequence.h>
+
+#include "FastxTests.h"
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+namespace FastaSequenceTests {
+}  // namespace FastaSequenceTests
+
+TEST(FastaSequenceTest, can_construct_from_seq_name_and_bases)
+{
+    const FastaSequence built{"1", "GATTACA"};  // (name, bases)
+    EXPECT_EQ("1", built.Name());
+    EXPECT_EQ("GATTACA", built.Bases());
+}
+
+TEST(FastaSequenceTest, can_construct_from_seq_name_and_bases_whitespaces)
+{
+    const FastaSequence built{"1", "GATTACA\n"};  // trailing newline must not reach Bases()
+    EXPECT_EQ("1", built.Name());
+    EXPECT_EQ("GATTACA", built.Bases());
+}
+\ No newline at end of file
diff --git a/tests/src/test_FastaSequenceQuery.cpp b/tests/src/test_FastaSequenceQuery.cpp

new file mode 100644 (file)

index 0000000..5f79c93
--- /dev/null
+++ b/tests/src/test_FastaSequenceQuery.cpp
@@ -0,0 +1,66 @@
+// Author: Derek Barnett
+
+#include <cstddef>
+
+#include <gtest/gtest.h>
+
+#include "PbbamTestData.h"
+
+#include <pbbam/EntireFileQuery.h>
+#include <pbbam/FastaReader.h>
+#include <pbbam/FastaSequence.h>
+#include <pbbam/FastaSequenceQuery.h>
+#include <pbbam/FastaWriter.h>
+#include <pbbam/Unused.h>
+#include <boost/algorithm/string.hpp>
+
+#include "FastxTests.h"
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+namespace FastaSequenceQueryTests {
+}  // namespace FastaSequenceQueryTests
+
+TEST(FastaSequenceQueryTest, can_read_from_fasta_file)
+{
+    const std::string fn = PbbamTestsConfig::Data_Dir + "/lambdaNEB.fa";
+
+    {
+        size_t count = 0;
+        FastaSequenceQuery query{fn};
+        for (const auto& seq : query) {
+            UNUSED(seq);
+            ++count;
+        }
+        EXPECT_EQ(1, count);
+    }
+
+    {
+        FastaSequenceQuery query{fn};
+        const auto first = query.cbegin();
+        const auto& seq = *first;
+        EXPECT_EQ("lambda_NEB3011", seq.Name());
+    }
+}
+
+TEST(FastaSequenceQueryTest, can_read_from_dataset)
+{
+    const std::string fn = PbbamTestsConfig::Data_Dir + "/referenceset.xml";
+
+    {
+        size_t count = 0;
+        FastaSequenceQuery query{fn};
+        for (const auto& seq : query) {
+            UNUSED(seq);
+            ++count;
+        }
+        EXPECT_EQ(5, count);  // 1 from lambda, 4 from chimera
+    }
+    {
+        FastaSequenceQuery query{fn};
+        const auto first = query.cbegin();
+        const auto& seq = *first;
+        EXPECT_EQ("lambda_NEB3011", seq.Name());
+    }
+}
diff --git a/tests/src/test_FastaWriter.cpp b/tests/src/test_FastaWriter.cpp

new file mode 100644 (file)

index 0000000..0a9d47d
--- /dev/null
+++ b/tests/src/test_FastaWriter.cpp
@@ -0,0 +1,106 @@
+// Author: Derek Barnett
+
+#include <cstddef>
+
+#include <gtest/gtest.h>
+
+#include "PbbamTestData.h"
+
+#include <pbbam/EntireFileQuery.h>
+#include <pbbam/FastaReader.h>
+#include <pbbam/FastaSequence.h>
+#include <pbbam/FastaSequenceQuery.h>
+#include <pbbam/FastaWriter.h>
+#include <pbbam/Unused.h>
+#include <boost/algorithm/string.hpp>
+
+#include "FastxTests.h"
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+namespace FastaWriterTests {
+}  // namespace FastaWriterTests
+
+TEST(FastaWriterTest, throws_on_empty_filename)
+{
+    EXPECT_THROW(FastaWriter{""}, std::runtime_error);  // construction alone must fail
+}
+
+TEST(FastaWriterTest, throws_on_invalid_extension)
+{
+    EXPECT_THROW(FastaWriter{"wrong.ext"}, std::runtime_error);  // construction alone must fail
+}
+
+TEST(FastaWriterTest, can_write_fasta_sequence)
+{
+    // Round trip: write one FastaSequence, read the file back, compare name and bases.
+    const FastaSequence original{"name", "GATTACA"};
+    const std::string roundTripFn = PbbamTestsConfig::GeneratedData_Dir + "/out.fa";
+    {  // scoped: the writer is destroyed before the file is read back
+        FastaWriter out{roundTripFn};
+        out.Write(original);
+    }
+
+    const auto readBack = FastaReader::ReadAll(roundTripFn);
+    ASSERT_EQ(1, readBack.size());
+    EXPECT_EQ(original.Name(), readBack[0].Name());
+    EXPECT_EQ(original.Bases(), readBack[0].Bases());
+
+    remove(roundTripFn.c_str());
+}
+
+TEST(FastaWriterTest, can_write_fasta_from_bam)
+{
+    const std::string fn = PbbamTestsConfig::Data_Dir + "/aligned.bam";
+    const std::string outFasta = PbbamTestsConfig::GeneratedData_Dir + "/out.fa";
+
+    {
+        FastaWriter writer{outFasta};
+        EntireFileQuery query{fn};
+        for (const auto& bam : query)
+            writer.Write(bam);
+    }
+    const auto seqs = FastaReader::ReadAll(outFasta);
+    ASSERT_EQ(4, seqs.size());
+
+    const std::string name1{"singleInsertion/100/0_49"};
+    const std::string name2{"singleInsertion/200/0_49"};
+    const std::string name3{"singleInsertion/100/0_111"};
+    const std::string name4{"singleInsertion/100/0_111"};
+
+    EXPECT_EQ(name1, seqs[0].Name());
+    EXPECT_EQ(name2, seqs[1].Name());
+    EXPECT_EQ(name3, seqs[2].Name());
+    EXPECT_EQ(name4, seqs[3].Name());
+
+    const std::string bases1{"GGCTGCAGGTACAGCGGTCAGGAGGCCAATTGATGCCGGACTGGCTGAT"};
+    const std::string bases2{"GGCTGCAGGTACAGCGGTCAGGAGGCCAATTGATGCCGGACTGGCTGAT"};
+    const std::string bases3{
+        "TTTGGCTGCAGGTACAGCGGTCAGGAGGCCAATTGATGCCGGACTGGCTGATAAGTCACCAATGTGGGACGTCCGTCGATGGCAGAAGAG"
+        "CAGCACGGTAAACAGCGGCAA"};
+    const std::string bases4{
+        "AAGTCACCAATGTGGGACGTCCGTCGATGGCAGAAGAGCAGCACGGTAAACAGCGGCAAATCAGCCAGTCCGGCATCAATTGGCCTCCTG"
+        "ACCGCTGTACCTGCAGCCAAA"};
+
+    remove(outFasta.c_str());
+}
+
+TEST(FastaWriterTest, can_write_fasta_from_strings)
+{
+    const std::string outFasta = PbbamTestsConfig::GeneratedData_Dir + "/out.fa";
+    const std::string name = "name";
+    const std::string bases = "GATTACA";
+
+    {
+        FastaWriter writer{outFasta};
+        writer.Write(name, bases);
+    }
+
+    const auto seqs = FastaReader::ReadAll(outFasta);
+    ASSERT_EQ(1, seqs.size());
+    EXPECT_EQ(name, seqs[0].Name());
+    EXPECT_EQ(bases, seqs[0].Bases());
+
+    remove(outFasta.c_str());
+}
diff --git a/tests/src/test_FastqReader.cpp b/tests/src/test_FastqReader.cpp

new file mode 100644 (file)

index 0000000..5a51343
--- /dev/null
+++ b/tests/src/test_FastqReader.cpp
@@ -0,0 +1,232 @@
+// Author: Derek Barnett
+
+#include <gtest/gtest.h>
+#include <cstddef>
+#include <cstdint>
+
+#include "PbbamTestData.h"
+
+#include <pbbam/BamFileMerger.h>
+#include <pbbam/EntireFileQuery.h>
+#include <pbbam/FastqReader.h>
+#include <pbbam/FastqSequence.h>
+#include <pbbam/FastqWriter.h>
+
+#include "FastxTests.h"
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+namespace FastqReaderTests {
+
+// Compares name, bases and qualities of 'seq' with the index-th FastxTests::ExpectedFastq entry.
+void CheckFastqSequence(const size_t index, const FastqSequence& seq)
+{
+    SCOPED_TRACE("checking FASTQ seq:" + std::to_string(index));
+    const auto& expected = FastxTests::ExpectedFastq.at(index);
+    EXPECT_EQ(expected.Name(), seq.Name());
+    EXPECT_EQ(expected.Bases(), seq.Bases());
+    EXPECT_EQ(expected.Qualities().Fastq(), seq.Qualities().Fastq());
+}
+}  // namespace FastqReaderTests
+
+TEST(FastqReaderTest, throws_on_empty_filename)
+{
+    EXPECT_THROW(FastqReader{""}, std::runtime_error);  // construction alone must fail
+}
+
+TEST(FastqReaderTest, throws_on_invalid_extension)
+{
+    EXPECT_THROW(FastqReader{"wrong.ext"}, std::runtime_error);  // construction alone must fail
+}
+
+TEST(FastqReaderTest, can_open_text_fastq)
+{
+    // Constructing a reader on the text fixture must not throw.
+    EXPECT_NO_THROW(FastqReader{FastxTests::simpleFastqFn});
+}
+
+TEST(FastqReaderTest, can_open_gzip_fastq)
+{
+    // Constructing a reader on the gzip fixture must not throw.
+    EXPECT_NO_THROW(FastqReader{FastxTests::simpleFastqGzipFn});
+}
+
+TEST(FastqReaderTest, can_open_bgzf_fastq)
+{
+    // Constructing a reader on the BGZF fixture must not throw.
+    EXPECT_NO_THROW(FastqReader{FastxTests::simpleFastqBgzfFn});
+}
+
+TEST(FastqReaderTest, can_iterate_manually_on_text_fastq)
+{
+    // Drain the reader record by record, validating each one against the expected table.
+    FastqReader reader{FastxTests::simpleFastqFn};
+    FastqSequence current;
+    size_t numSeen = 0;
+    for (; reader.GetNext(current); ++numSeen) {
+        FastqReaderTests::CheckFastqSequence(numSeen, current);
+    }
+
+    // the number of records read must match the size of the table
+    EXPECT_EQ(FastxTests::ExpectedFastq.size(), numSeen);
+}
+
+TEST(FastqReaderTest, can_iterate_manually_on_gzip_fastq)
+{
+    // Drain the reader record by record, validating each one against the expected table.
+    FastqReader reader{FastxTests::simpleFastqGzipFn};
+    FastqSequence current;
+    size_t numSeen = 0;
+    for (; reader.GetNext(current); ++numSeen) {
+        FastqReaderTests::CheckFastqSequence(numSeen, current);
+    }
+
+    // the number of records read must match the size of the table
+    EXPECT_EQ(FastxTests::ExpectedFastq.size(), numSeen);
+}
+
+TEST(FastqReaderTest, can_iterate_manually_on_bgzf_fastq)
+{
+    // Drain the reader record by record, validating each one against the expected table.
+    FastqReader reader{FastxTests::simpleFastqBgzfFn};
+    FastqSequence current;
+    size_t numSeen = 0;
+    for (; reader.GetNext(current); ++numSeen) {
+        FastqReaderTests::CheckFastqSequence(numSeen, current);
+    }
+
+    // the number of records read must match the size of the table
+    EXPECT_EQ(FastxTests::ExpectedFastq.size(), numSeen);
+}
+
+TEST(FastqReaderTest, can_iterate_using_range_for_on_text_fastq)
+{
+    // The reader itself is the range of a range-based for loop.
+    FastqReader reader{FastxTests::simpleFastqFn};
+
+    size_t position = 0;
+    for (const auto& record : reader) {
+        FastqReaderTests::CheckFastqSequence(position, record);
+        ++position;
+    }
+    EXPECT_EQ(FastxTests::ExpectedFastq.size(), position);
+}
+
+TEST(FastqReaderTest, can_iterate_using_range_for_on_gzip_fastq)
+{
+    // The reader itself is the range of a range-based for loop.
+    FastqReader reader{FastxTests::simpleFastqGzipFn};
+
+    size_t position = 0;
+    for (const auto& record : reader) {
+        FastqReaderTests::CheckFastqSequence(position, record);
+        ++position;
+    }
+    EXPECT_EQ(FastxTests::ExpectedFastq.size(), position);
+}
+
+TEST(FastqReaderTest, can_iterate_using_range_for_on_bgzf_fastq)
+{
+    // The reader itself is the range of a range-based for loop.
+    FastqReader reader{FastxTests::simpleFastqBgzfFn};
+
+    size_t position = 0;
+    for (const auto& record : reader) {
+        FastqReaderTests::CheckFastqSequence(position, record);
+        ++position;
+    }
+    EXPECT_EQ(FastxTests::ExpectedFastq.size(), position);
+}
+
+TEST(FastqReaderTest, can_read_all_from_text_fastq)
+{
+    // ReadAll() hands back every sequence at once; compare element by element.
+    const auto loaded = FastqReader::ReadAll(FastxTests::simpleFastqFn);
+
+    for (size_t i = 0; i < loaded.size(); ++i) {
+        FastqReaderTests::CheckFastqSequence(i, loaded[i]);
+    }
+
+    EXPECT_EQ(FastxTests::ExpectedFastq.size(), loaded.size());
+}
+
+TEST(FastqReaderTest, can_read_all_from_gzip_fastq)
+{
+    // ReadAll() hands back every sequence at once; compare element by element.
+    const auto loaded = FastqReader::ReadAll(FastxTests::simpleFastqGzipFn);
+
+    for (size_t i = 0; i < loaded.size(); ++i) {
+        FastqReaderTests::CheckFastqSequence(i, loaded[i]);
+    }
+
+    EXPECT_EQ(FastxTests::ExpectedFastq.size(), loaded.size());
+}
+
+TEST(FastqReaderTest, can_read_all_from_bgzf_fastq)
+{
+    // ReadAll() hands back every sequence at once; compare element by element.
+    const auto loaded = FastqReader::ReadAll(FastxTests::simpleFastqBgzfFn);
+
+    for (size_t i = 0; i < loaded.size(); ++i) {
+        FastqReaderTests::CheckFastqSequence(i, loaded[i]);
+    }
+
+    EXPECT_EQ(FastxTests::ExpectedFastq.size(), loaded.size());
+}
+
+TEST(FastqReaderTest, can_handle_windows_style_newlines)
+{
+    // The read itself is asserted first, so a failed read is reported as such instead of
+    // surfacing only through the field comparisons below.
+    const std::string fn = FastxTests::fastxDataDir + "/windows_formatted.fastq";
+    FastqReader reader{fn};
+    FastqSequence seq;
+    ASSERT_TRUE(reader.GetNext(seq));  // 1 sequence in total
+
+    EXPECT_EQ("C5", seq.Name());
+    EXPECT_EQ("AAGCA", seq.Bases());
+    EXPECT_EQ("~~~~~", seq.Qualities().Fastq());
+}
+
+TEST(FastqMerging, can_merge_bams_to_fastq_output)
+{
+    const std::vector<std::string> bamFiles{PbbamTestsConfig::Data_Dir + "/group/test1.bam",
+                                            PbbamTestsConfig::Data_Dir + "/group/test2.bam",
+                                            PbbamTestsConfig::Data_Dir + "/group/test3.bam"};
+
+    const std::string outFastq = PbbamTestsConfig::GeneratedData_Dir + "/out.fq";
+
+    {
+        FastqWriter fastq{outFastq};
+        BamFileMerger::Merge(bamFiles, fastq);
+    }
+
+    const std::vector<std::string> mergedFastqNames{
+        "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/14743/2114_2531",
+        "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/14743/2579_4055",
+        "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/14743/4101_5571",
+        "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/14743/5615_6237",
+        "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/24962/0_427",
+        "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/45203/0_893",
+        "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/45203/0_893",
+        "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/46835/3759_4005",
+        "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/46835/4052_4686",
+        "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/46835/4732_4869",
+        "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/47698/9482_9628",
+        "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/47698/9675_10333",
+        "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/47698/10378_10609",
+        "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/49050/48_1132",
+        "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/49050/48_1132",
+        "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/49194/0_798",
+        "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/49194/845_1541",
+        "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/49521/0_134"};
+
+    const auto seqs = FastqReader::ReadAll(outFastq);
+    ASSERT_EQ(mergedFastqNames.size(), seqs.size());
+
+    for (size_t i = 0; i < seqs.size(); ++i)
+        EXPECT_EQ(mergedFastqNames[i], seqs[i].Name());
+
+    remove(outFastq.c_str());
+}
+\ No newline at end of file
diff --git a/tests/src/test_FastqSequence.cpp b/tests/src/test_FastqSequence.cpp

new file mode 100644 (file)

index 0000000..5107b80
--- /dev/null
+++ b/tests/src/test_FastqSequence.cpp
@@ -0,0 +1,35 @@
+// Author: Derek Barnett
+
+#include <gtest/gtest.h>
+#include <cstddef>
+#include <cstdint>
+
+#include "PbbamTestData.h"
+
+#include <pbbam/BamFileMerger.h>
+#include <pbbam/EntireFileQuery.h>
+#include <pbbam/FastqReader.h>
+#include <pbbam/FastqSequence.h>
+#include <pbbam/FastqWriter.h>
+
+#include "FastxTests.h"
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+namespace FastqSequenceTests {
+}  // namespace FastqSequenceTests
+
+TEST(FastqSequenceTest, BasicConstructorsOk)
+{
+    const FastqSequence fromText{"1", "GATTACA", "[[[[[[["};  // qualities as FASTQ text
+    EXPECT_EQ("1", fromText.Name());
+    EXPECT_EQ("GATTACA", fromText.Bases());
+    EXPECT_EQ("[[[[[[[", fromText.Qualities().Fastq());
+    // Qualities given numerically: the value 58 is expected to print as '['.
+    const std::vector<uint8_t> rawQuals{58, 58, 58, 58, 58, 58, 58};
+    const FastqSequence fromValues{"1", "GATTACA", QualityValues{rawQuals}};
+    EXPECT_EQ("1", fromValues.Name());
+    EXPECT_EQ("GATTACA", fromValues.Bases());
+    EXPECT_EQ("[[[[[[[", fromValues.Qualities().Fastq());
+}
diff --git a/tests/src/test_FastqWriter.cpp b/tests/src/test_FastqWriter.cpp

new file mode 100644 (file)

index 0000000..f29efa3
--- /dev/null
+++ b/tests/src/test_FastqWriter.cpp
@@ -0,0 +1,100 @@
+// Author: Derek Barnett
+
+#include <gtest/gtest.h>
+#include <cstddef>
+#include <cstdint>
+
+#include "FastxTests.h"
+#include "PbbamTestData.h"
+
+#include <pbbam/BamFileMerger.h>
+#include <pbbam/EntireFileQuery.h>
+#include <pbbam/FastqReader.h>
+#include <pbbam/FastqSequence.h>
+#include <pbbam/FastqWriter.h>
+
+#include "FastxTests.h"
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+namespace FastqWriterrTests {
+}  // namespace FastqWriterrTests
+
+TEST(FastqWriterTest, throws_on_empty_filename)
+{
+    EXPECT_THROW(FastqWriter{""}, std::runtime_error);  // construction alone must fail
+}
+
+TEST(FastqWriterTest, throws_on_invalid_extension)
+{
+    EXPECT_THROW(FastqWriter{"wrong.ext"}, std::runtime_error);  // construction alone must fail
+}
+
+TEST(FastqWriterTest, can_write_fastq_sequence)
+{
+    // Round trip: write one FastqSequence, read the file back, compare all three fields.
+    const FastqSequence original{"name", "GATTACA", "!!!!!!!"};
+    const std::string roundTripFn = PbbamTestsConfig::GeneratedData_Dir + "/out.fq";
+    {  // scoped: the writer is destroyed before the file is read back
+        FastqWriter out{roundTripFn};
+        out.Write(original);
+    }
+
+    const auto readBack = FastqReader::ReadAll(roundTripFn);
+    ASSERT_EQ(1, readBack.size());
+    EXPECT_EQ(original.Name(), readBack[0].Name());
+    EXPECT_EQ(original.Bases(), readBack[0].Bases());
+    EXPECT_EQ(original.Qualities(), readBack[0].Qualities());
+
+    remove(roundTripFn.c_str());
+}
+
+TEST(FastqWriterTest, can_write_fastq_from_bam)
+{
+    const std::string fn = PbbamTestsConfig::Data_Dir + "/unmap1.bam";
+    const std::string outFastq = PbbamTestsConfig::GeneratedData_Dir + "/out.fq";
+
+    {
+        FastqWriter writer{outFastq};
+        EntireFileQuery query{fn};
+        for (const auto& bam : query)
+            writer.Write(bam);
+    }
+    const auto seqs = FastqReader::ReadAll(outFastq);
+    ASSERT_EQ(1, seqs.size());
+
+    const std::string name{"test/1/0_100"};
+    const std::string bases{
+        "GATCGCACTGAAAATCTGGATATAGAACGTGTGCAAATGATTGTCTCTACCGTTCCGTAAAAATTATTGCTAATTAGCAATGATTTTAAG"
+        "CTAATTAGTT"};
+    const std::string quals{
+        "CCCCCCCCCCCCCCCCCCCACCCCCACCCCCCCCCCCCB;CCCAACCCCCCCCCCCCCD=B9BCABCBCB>BBBC@B<<@BA;BCC?B>"
+        "A<<@(?:4==4"};
+
+    EXPECT_EQ(name, seqs[0].Name());
+    EXPECT_EQ(bases, seqs[0].Bases());
+    EXPECT_EQ(quals, seqs[0].Qualities().Fastq());
+
+    remove(outFastq.c_str());
+}
+
+TEST(FastqWriterTest, can_write_fastq_from_strings)
+{
+    const std::string outFastq = PbbamTestsConfig::GeneratedData_Dir + "/out.fq";
+    const std::string name = "name";
+    const std::string bases = "GATTACA";
+    const std::string quals = "!!!!!!!";
+
+    {
+        FastqWriter writer{outFastq};
+        writer.Write(name, bases, quals);
+    }
+
+    const auto seqs = FastqReader::ReadAll(outFastq);
+    ASSERT_EQ(1, seqs.size());
+    EXPECT_EQ(name, seqs[0].Name());
+    EXPECT_EQ(bases, seqs[0].Bases());
+
+    remove(outFastq.c_str());
+}
diff --git a/tests/src/test_FileUtils.cpp b/tests/src/test_FileUtils.cpp

new file mode 100644 (file)

index 0000000..d3a402a
--- /dev/null
+++ b/tests/src/test_FileUtils.cpp
@@ -0,0 +1,287 @@
+// Author: Derek Barnett
+
+#include <cctype>
+#include <chrono>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include <gtest/gtest.h>
+#include <boost/algorithm/string.hpp>
+
+#include "PbbamTestData.h"
+
+#include <pbbam/../../src/FileUtils.h>
+#include <pbbam/../../src/TimeUtils.h>
+#include <pbbam/Unused.h>
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+TEST(FileUtilsTest, ExistsOk)
+{
+    // a name that is not expected to be present in the working directory
+    EXPECT_FALSE(FileUtils::Exists("does_not_exist.txt"));
+
+    // create a scratch file through the shell, then expect it to be reported as present
+    const std::string scratchFn = PbbamTestsConfig::GeneratedData_Dir + "/pbbam_exists_check.tmp";
+    const std::string touchCmd = "touch " + scratchFn;
+    const int touchStatus = system(touchCmd.c_str());
+    ASSERT_EQ(0, touchStatus);
+    EXPECT_TRUE(FileUtils::Exists(scratchFn));
+}
+
+TEST(FileUtilsTest, LastModifiedOk)
+{
+    // Without a mocked filesystem this can only be approximated: take a timestamp,
+    // (re)create a file, and expect the file's modification time to be no more than
+    // the allowed skew earlier than that timestamp.
+    //
+    // Only whole seconds are compared; finer resolution was not reliably available
+    // (on OSX 10.9/clang at least, st_mtimespec.tv_nsec is always zero).
+
+    using std::chrono::duration_cast;
+    using std::chrono::seconds;
+
+    const auto secondsBefore =
+        duration_cast<seconds>(TimeUtils::CurrentTime().time_since_epoch()).count();
+
+    const std::string tmpFn = PbbamTestsConfig::GeneratedData_Dir + "/pbbam_lastmod_check.tmp";
+
+    // remove any stale copy first; the exit status is deliberately ignored
+    const std::string removeCmd = "rm " + tmpFn;
+    const auto removeStatus = system(removeCmd.c_str());
+    UNUSED(removeStatus);
+
+    const std::string createCmd = "touch  " + tmpFn;
+    ASSERT_EQ(0, system(createCmd.c_str()));
+
+    const auto secondsModified =
+        duration_cast<seconds>(FileUtils::LastModified(tmpFn).time_since_epoch()).count();
+
+    const int allowedSkew = 3600;  // allow 1 hour of clock-skew
+    EXPECT_LE(secondsBefore, secondsModified + allowedSkew);
+}
+
+TEST(FileUtilsTest, ResolvedFilePathOk)
+{
+    // Every input is resolved twice: once against an explicit 'from' directory and once
+    // with the default 'from' argument (the expectations below show it behaving as ".").
+    const std::string fromDir = "/path/to/myDir";
+
+    const auto resolveFrom = [&fromDir](const std::string& fn) -> std::string {
+        return FileUtils::ResolvedFilePath(fn, fromDir);
+    };
+    const auto resolveDefault = [](const std::string& fn) -> std::string {
+        return FileUtils::ResolvedFilePath(fn);
+    };
+
+    {  // "raw" filenames - no URI scheme
+        const std::string absoluteFn = "/absolute/path/to/file.txt";
+        const std::string relativeFn = "../relative/path/to/file.txt";
+        const std::string bareFn = "file.txt";
+
+        EXPECT_EQ("/absolute/path/to/file.txt", resolveFrom(absoluteFn));
+        EXPECT_EQ("/path/to/myDir/../relative/path/to/file.txt", resolveFrom(relativeFn));
+        EXPECT_EQ("/path/to/myDir/file.txt", resolveFrom(bareFn));
+
+        EXPECT_EQ("/absolute/path/to/file.txt", resolveDefault(absoluteFn));
+        EXPECT_EQ("./../relative/path/to/file.txt", resolveDefault(relativeFn));
+        EXPECT_EQ("./file.txt", resolveDefault(bareFn));
+    }
+
+    {  // filenames with URI scheme ("file://") - same expectations as the raw forms
+        const std::string absoluteFn = "file:///absolute/path/to/file.txt";
+        const std::string relativeFn = "file://../relative/path/to/file.txt";
+        const std::string bareFn = "file://file.txt";
+
+        EXPECT_EQ("/absolute/path/to/file.txt", resolveFrom(absoluteFn));
+        EXPECT_EQ("/path/to/myDir/../relative/path/to/file.txt", resolveFrom(relativeFn));
+        EXPECT_EQ("/path/to/myDir/file.txt", resolveFrom(bareFn));
+
+        EXPECT_EQ("/absolute/path/to/file.txt", resolveDefault(absoluteFn));
+        EXPECT_EQ("./../relative/path/to/file.txt", resolveDefault(relativeFn));
+        EXPECT_EQ("./file.txt", resolveDefault(bareFn));
+    }
+}
+
+TEST(FileUtilsTest, SizeOk)
+{
+    // 'touch' makes sure the file is there; it is expected to report a size of zero
+    const std::string emptyFn = PbbamTestsConfig::GeneratedData_Dir + "/pbbam_empty_file.tmp";
+    const std::string touchCmd = "touch " + emptyFn;
+    const int touchStatus = system(touchCmd.c_str());
+    ASSERT_EQ(0, touchStatus);
+    EXPECT_EQ(0, FileUtils::Size(emptyFn));
+
+    // asking for the size of a file that is not there must throw
+    EXPECT_THROW(FileUtils::Size("does_not_exist.txt"), std::runtime_error);
+}
+
+// ####################################################################################################
+// The code below is part of a simple check whether or not a (Windows-only) file path is absolute.
+//
+// NOTE: (and this is admittedly brittle for maintenance, but) the internal methods used are literally
+// copied here for direct driving. There's likely a better way going forward, than the manual copy/paste.
+// But in the absence of a similar runtime environment to build in & test against, while
+// the motivating behavior is blocking other work, this lets me get the fix in their hands ASAP and still
+// have some test code poking it beforehand. -DB
+//
+namespace test_windows {
+
+// Returns 'uri' with a leading "file://" scheme removed; input that does not contain
+// the scheme is returned unchanged.
+//
+// Throws std::runtime_error if "file://" occurs anywhere other than at the very
+// beginning. 'uri' must not be empty (checked via assert).
+static std::string removeFileUriScheme(const std::string& uri)
+{
+    assert(!uri.empty());
+
+    auto schemeLess = uri;
+    const auto fileScheme = std::string{"file://"};
+    const auto schemeFound = schemeLess.find(fileScheme);
+    if (schemeFound != std::string::npos) {
+        if (schemeFound != 0) throw std::runtime_error("Malformed URI: scheme not at beginning");
+        schemeLess = schemeLess.substr(fileScheme.size());
+    }
+    return schemeLess;
+}
+
+// Returns 'filePath' without its leading disk name (a single letter followed by ':'),
+// or 'filePath' unchanged if it does not start with one.
+static std::string removeDiskName(const std::string& filePath)
+{
+    if (filePath.size() >= 2) {
+        const char firstChar = filePath.at(0);
+        // cast first: passing a negative char value to isalpha is undefined behavior
+        if ((std::isalpha(static_cast<unsigned char>(firstChar)) != 0) && (filePath.at(1) == ':'))
+            return filePath.substr(2);
+    }
+    return filePath;
+}
+
+static const char native_pathSeparator = '\\';
+
+// Returns true if the Windows-style 'filePath' is absolute: it starts with a backslash,
+// optionally preceded by a disk name. Paths starting with '.' and everything else are
+// reported as not absolute. 'filePath' must not be empty (checked via assert).
+static bool native_pathIsAbsolute(const std::string& filePath)
+{
+    assert(!filePath.empty());
+
+    // if starts with single slash or double slash [cases 1,3]
+    if (boost::algorithm::starts_with(filePath, "\\")) return true;
+
+    // if starts with single or double-dots -> not absolute [case 4 + ".\file.txt"]
+    if (boost::algorithm::starts_with(filePath, ".")) return false;
+
+    // if starts with drive name and colon ("C:\foo\bar.txt"), decide on what follows it
+    if (filePath.size() >= 2) {
+        const char firstChar = filePath.at(0);
+        // cast first: passing a negative char value to isalpha is undefined behavior
+        if ((std::isalpha(static_cast<unsigned char>(firstChar)) != 0) && (filePath.at(1) == ':')) {
+            // a bare disk name ("C:") leaves nothing behind; report it as not absolute
+            // instead of recursing with an empty string, which would trip the assert above
+            const auto remainder = removeDiskName(filePath);
+            return !remainder.empty() && native_pathIsAbsolute(remainder);
+        }
+    }
+
+    // otherwise, likely relative
+    return false;
+}
+
+// Resolves 'filePath' against the directory 'from'.
+//
+// A "file://" scheme is stripped first. If the remainder is empty or already absolute
+// it is returned as is; otherwise its disk name and a leading ".\" are dropped and the
+// result is 'from' + '\' + remainder. Throws std::runtime_error for a misplaced scheme
+// (see removeFileUriScheme). 'filePath' must not be empty.
+static std::string native_resolvedFilePath(const std::string& filePath, const std::string& from)
+{
+    // strip file:// scheme if present
+    auto schemeLess = removeFileUriScheme(filePath);
+
+    // if empty or already absolute path, just return it
+    // upfront empty check simplifies further parsing logic
+    if (schemeLess.empty() || native_pathIsAbsolute(schemeLess)) return schemeLess;
+
+    // else make relative from the provided 'from' directory
+    //
+    // first pop disk name, then any leading single-dot '.'
+    //
+    // since we're prepending the 'from' directory, we can remove
+    // any leading './' from our file path. this may just mean that
+    // we pop it off to add it right back (when from == '.'), but this
+    // keeps it consistent with other 'from' parent directories
+    //
+    schemeLess = removeDiskName(schemeLess);
+
+    // only a '.' directly followed by the separator is removed; a leading ".." is kept
+    const bool thisDirAtStart = (schemeLess.find(".") == 0);
+    if (thisDirAtStart) {
+        if (schemeLess.find(native_pathSeparator) == 1) schemeLess = schemeLess.substr(2);
+    }
+    return from + native_pathSeparator + schemeLess;
+}
+
+}  // namespace test_windows
+
+TEST(FileUtilsTest, WindowsPathsOk)
+{
+    // Drives the local copies of the Windows path helpers in namespace test_windows.
+    // Each case is preceded by a comment showing the path as it reads once the C++
+    // escape sequences of the literal are resolved.
+
+    {  // remove disk name
+
+        // "C:\tmp.txt"
+        std::string f1 = "C:\\tmp.txt";
+        EXPECT_EQ(std::string("\\tmp.txt"), test_windows::removeDiskName(f1));
+
+        // "C:tmp.txt"
+        std::string f2 = "C:tmp.txt";
+        EXPECT_EQ(std::string("tmp.txt"), test_windows::removeDiskName(f2));
+
+        // "\tmp.txt"
+        std::string f3 = "\\tmp.txt";
+        EXPECT_EQ(f3, test_windows::removeDiskName(f3));
+
+        // "tmp.txt"
+        std::string f4 = "tmp.txt";
+        EXPECT_EQ(f4, test_windows::removeDiskName(f4));
+    }
+
+    {  // isAbsolute ?
+
+        // "\\server\path\to\tmp.txt"
+        // (every backslash is escaped; a lone "\t" would put a TAB into the path)
+        EXPECT_TRUE(test_windows::native_pathIsAbsolute("\\\\server\\path\\to\\tmp.txt"));
+
+        // "..\tmp.txt"
+        EXPECT_FALSE(test_windows::native_pathIsAbsolute("..\\tmp.txt"));
+
+        // ".\tmp.txt"
+        EXPECT_FALSE(test_windows::native_pathIsAbsolute(".\\tmp.txt"));
+
+        // "C:\path\to\tmp.txt"
+        EXPECT_TRUE(test_windows::native_pathIsAbsolute("C:\\path\\to\\tmp.txt"));
+
+        // "C:..\path\to\tmp.txt"
+        EXPECT_FALSE(test_windows::native_pathIsAbsolute("C:..\\path\\to\\tmp.txt"));
+    }
+
+    {  // resolve file path
+
+        const std::string myRootDir = "C:\\path\\to\\myRootDir";
+
+        // "\\server\path\to\tmp.txt"
+        // (every backslash is escaped; a lone "\t" would put a TAB into the path)
+        const std::string fn1 = "\\\\server\\path\\to\\tmp.txt";
+        const std::string fn1_expected = fn1;
+        EXPECT_EQ(fn1_expected, test_windows::native_resolvedFilePath(fn1, myRootDir));
+
+        // "..\tmp.txt"
+        const std::string fn2 = "..\\tmp.txt";
+        const std::string fn2_expected = "C:\\path\\to\\myRootDir\\..\\tmp.txt";
+        EXPECT_EQ(fn2_expected, test_windows::native_resolvedFilePath(fn2, myRootDir));
+
+        // ".\tmp.txt"
+        const std::string fn3 = ".\\tmp.txt";
+        const std::string fn3_expected = "C:\\path\\to\\myRootDir\\tmp.txt";
+        EXPECT_EQ(fn3_expected, test_windows::native_resolvedFilePath(fn3, myRootDir));
+
+        // "C:\path\to\tmp.txt"
+        const std::string fn4 = "C:\\path\\to\\tmp.txt";
+        const std::string fn4_expected = fn4;
+        EXPECT_EQ(fn4_expected, test_windows::native_resolvedFilePath(fn4, myRootDir));
+
+        // "C:..\path\to\tmp.txt"
+        const std::string fn5 = "C:..\\path\\to\\tmp.txt";
+        const std::string fn5_expected = "C:\\path\\to\\myRootDir\\..\\path\\to\\tmp.txt";
+        EXPECT_EQ(fn5_expected, test_windows::native_resolvedFilePath(fn5, myRootDir));
+
+        // "C:tmp.txt"
+        const std::string fn6 = "C:tmp.txt";
+        const std::string fn6_expected = "C:\\path\\to\\myRootDir\\tmp.txt";
+        EXPECT_EQ(fn6_expected, test_windows::native_resolvedFilePath(fn6, myRootDir));
+        EXPECT_EQ(fn3_expected,
+                  test_windows::native_resolvedFilePath(
+                      fn6, myRootDir));  // our path is equivalent to fn3's ".\tmp.txt"
+    }
+}
+//
+// ####################################################################################################
diff --git a/tests/src/test_Frames.cpp b/tests/src/test_Frames.cpp

new file mode 100644 (file)

index 0000000..82cb3a6
--- /dev/null
+++ b/tests/src/test_Frames.cpp
@@ -0,0 +1,46 @@
+// Author: Derek Barnett
+
+#include <cstdint>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include <pbbam/Frames.h>
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+namespace FramesTests {
+
+// Raw 16-bit frame values used as input by the FramesTest cases in this file.
+// The last three values (1000, 947, 948) do not fit into a single byte.
+static const std::vector<uint16_t> testFrames{
+    0,  8,  140, 0,  0,   7,  4,  0,  85, 2,  1,  3,  2,   10, 1,  20, 47,   10,  9,  60, 20,
+    3,  12, 5,   13, 165, 6,  14, 22, 12, 2,  4,  9,  218, 27, 3,  15, 2,    17,  2,  45, 24,
+    89, 10, 7,   1,  11,  15, 0,  7,  0,  28, 17, 12, 6,   10, 37, 0,  12,   52,  0,  7,  1,
+    14, 3,  26,  12, 0,   20, 17, 2,  13, 2,  9,  13, 7,   15, 29, 3,  6,    2,   1,  28, 10,
+    3,  14, 7,   1,  22,  1,  6,  6,  0,  19, 31, 6,  2,   14, 0,  0,  1000, 947, 948};
+
+// Expected 8-bit result of Frames::Encode() for 'testFrames' (checked by
+// FramesTest.Encoded). Position by position, the small values equal the input
+// while the larger ones differ (e.g. 140 -> 102, 1000 -> 255).
+static const std::vector<uint8_t> encodedFrames{
+    0,  8,  102, 0,  0,   7,  4,  0,  75, 2,  1,  3,  2,   10, 1,  20, 47,  10,  9,  60, 20,
+    3,  12, 5,   13, 115, 6,  14, 22, 12, 2,  4,  9,  135, 27, 3,  15, 2,   17,  2,  45, 24,
+    77, 10, 7,   1,  11,  15, 0,  7,  0,  28, 17, 12, 6,   10, 37, 0,  12,  52,  0,  7,  1,
+    14, 3,  26,  12, 0,   20, 17, 2,  13, 2,  9,  13, 7,   15, 29, 3,  6,   2,   1,  28, 10,
+    3,  14, 7,   1,  22,  1,  6,  6,  0,  19, 31, 6,  2,   14, 0,  0,  255, 254, 255};
+
+}  // namespace FramesTests
+
+TEST(FramesTest, Constructors)
+{
+    // a default-constructed object holds no data
+    const Frames emptyFrames;
+    ASSERT_TRUE(emptyFrames.Data().empty());
+
+    // an object built from a vector reports exactly the values it was given
+    const Frames fromVector(FramesTests::testFrames);
+    const auto storedData = fromVector.Data();
+    ASSERT_EQ(FramesTests::testFrames, storedData);
+}
+
+TEST(FramesTest, Encoded)
+{
+    // encoding the 16-bit test input must reproduce the expected 8-bit table
+    const Frames frames(FramesTests::testFrames);
+    const auto encoded = frames.Encode();
+    ASSERT_EQ(FramesTests::encodedFrames, encoded);
+}
diff --git a/tests/src/test_GenomicIntervalQuery.cpp b/tests/src/test_GenomicIntervalQuery.cpp

new file mode 100644 (file)

index 0000000..d094d1b
--- /dev/null
+++ b/tests/src/test_GenomicIntervalQuery.cpp
@@ -0,0 +1,205 @@
+// Author: Derek Barnett
+
+#include <algorithm>
+#include <functional>
+#include <iostream>
+#include <string>
+
+#include <gtest/gtest.h>
+
+#include "PbbamTestData.h"
+
+#include <pbbam/BaiIndexCache.h>
+#include <pbbam/GenomicIntervalQuery.h>
+#include <pbbam/Unused.h>
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+namespace GenomicIntervalQueryTests {
+
+// BAM inputs (from the test data directory) shared by the tests in this file
+const std::string inputBamFn{PbbamTestsConfig::Data_Dir + "/aligned.bam"};
+const std::string inputBamFn_2{PbbamTestsConfig::Data_Dir + "/aligned2.bam"};
+
+}  // namespace GenomicIntervalQueryTests
+
+TEST(GenomicIntervalQueryTest, ReuseQueryAndCountRecords)
+{
+    // One query object is pointed at a series of intervals; after every update the
+    // number of records it yields is compared against the expected count.
+    const std::string rname = "lambda_NEB3011";
+
+    // counts the records yielded by one full pass over the query
+    const auto countRecords = [](GenomicIntervalQuery& q) {
+        int numRecords = 0;
+        for (const BamRecord& record : q) {
+            UNUSED(record);
+            ++numRecords;
+        }
+        return numRecords;
+    };
+
+    BamFile bamFile(GenomicIntervalQueryTests::inputBamFn);
+
+    // setup with normal interval
+    GenomicInterval interval(rname, 5000, 6000);
+    GenomicIntervalQuery query(interval, bamFile);
+    EXPECT_EQ(2, countRecords(query));
+
+    // adjust interval and pass back in
+    interval.Start(9300);
+    interval.Stop(9400);
+    query.Interval(interval);
+    EXPECT_EQ(2, countRecords(query));
+
+    // adjust again (empty region)
+    interval.Name(rname);
+    interval.Start(1000);
+    interval.Stop(2000);
+    query.Interval(interval);
+    EXPECT_EQ(0, countRecords(query));
+
+    // unknown ref
+    interval.Name("does not exist");
+    interval.Start(0);
+    interval.Stop(100);
+    EXPECT_THROW(query.Interval(interval), std::runtime_error);
+    // iteration is still safe, just returns no data
+    EXPECT_EQ(0, countRecords(query));
+
+    // adjust again - make sure we can read a real region after an invalid one
+    interval.Name(rname);
+    interval.Start(5000);
+    interval.Stop(6000);
+    query.Interval(interval);
+    EXPECT_EQ(2, countRecords(query));
+}
+
+TEST(GenomicIntervalQueryTest, NonConstBamRecord)
+{
+    // iterating the query through a non-const BamRecord reference must not throw
+    EXPECT_NO_THROW({
+        BamFile bamFile(GenomicIntervalQueryTests::inputBamFn);
+        GenomicInterval interval("lambda_NEB3011", 8000, 10000);
+        GenomicIntervalQuery query(interval, bamFile);
+
+        int numRecords = 0;
+        for (BamRecord& record : query) {
+            UNUSED(record);
+            ++numRecords;
+        }
+        EXPECT_EQ(2, numRecords);
+    });
+}
+
+TEST(GenomicIntervalQueryTest, MissingBaiShouldThrow)
+{
+    // phi29.bam serves as the input without a BAI, aligned.bam as the one that has it
+    const std::string noBaiBam = PbbamTestsConfig::Data_Dir + "/phi29.bam";
+    const std::string withBaiBam = PbbamTestsConfig::Data_Dir + "/aligned.bam";
+    const std::string subreadType = "PacBio.SubreadFile.SubreadBamFile";
+    const std::string alignmentType = "PacBio.AlignmentFile.AlignmentBamFile";
+
+    GenomicInterval interval("lambda_NEB3011", 0, 100);
+
+    {  // single file, missing BAI
+        EXPECT_THROW(GenomicIntervalQuery query(interval, noBaiBam), std::runtime_error);
+    }
+
+    {  // from dataset, all missing BAI
+        DataSet dataset;
+        dataset.ExternalResources().Add(ExternalResource(subreadType, noBaiBam));
+        dataset.ExternalResources().Add(ExternalResource(subreadType, noBaiBam));
+        EXPECT_THROW(GenomicIntervalQuery query(interval, dataset), std::runtime_error);
+    }
+
+    {  // from dataset, mixed BAI presence
+        DataSet dataset;
+        dataset.ExternalResources().Add(ExternalResource(subreadType, noBaiBam));
+        dataset.ExternalResources().Add(ExternalResource(alignmentType, withBaiBam));
+        EXPECT_THROW(GenomicIntervalQuery query(interval, dataset), std::runtime_error);
+    }
+}
+
+TEST(GenomicIntervalQueryTest, InitializeWithoutInterval)
+{
+    // A query constructed without an interval yields no records; once an interval
+    // is supplied, the same query object returns the records for it.
+    const std::string rname = "lambda_NEB3011";
+
+    BamFile bamFile(GenomicIntervalQueryTests::inputBamFn);
+
+    // setup without an interval
+    int count = 0;
+    GenomicIntervalQuery query(bamFile);
+    for (const BamRecord& record : query) {
+        UNUSED(record);
+        ++count;
+    }
+    EXPECT_EQ(0, count);
+
+    // pass in actual interval (reusing 'rname' rather than repeating the literal)
+    count = 0;
+    GenomicInterval interval{rname, 9300, 9400};
+    query.Interval(interval);
+    for (const BamRecord& record : query) {
+        UNUSED(record);
+        ++count;
+    }
+    EXPECT_EQ(2, count);
+}
+
+TEST(GenomicIntervalQueryTest, CanReuseBaiIndexCache)
+{
+    // One BAI index cache is built up front for a two-file dataset and then shared:
+    // first across several interval updates of one query, then with a second query.
+    const std::string refName{"lambda_NEB3011"};
+    const std::vector<std::string> filenames{GenomicIntervalQueryTests::inputBamFn,
+                                             GenomicIntervalQueryTests::inputBamFn_2};
+
+    const DataSet ds{filenames};
+    const auto indexCache = MakeBaiIndexCache(ds);
+
+    // Points 'query' at 'interval', then checks every returned record against the
+    // interval, the total against 'expectedCount', and the ordering of start positions.
+    const auto verifyInterval = [](GenomicIntervalQuery& query, const GenomicInterval& interval,
+                                   const size_t expectedCount) {
+        query.Interval(interval);
+
+        std::vector<Position> starts;
+        for (const BamRecord& record : query) {
+            EXPECT_EQ(interval.Name(), record.ReferenceName());
+            EXPECT_TRUE(record.ReferenceStart() < interval.Stop());
+            EXPECT_TRUE(record.ReferenceEnd() >= interval.Start());
+            starts.push_back(record.ReferenceStart());
+        }
+        EXPECT_EQ(expectedCount, starts.size());
+        EXPECT_TRUE(std::is_sorted(starts.cbegin(), starts.cend()));
+    };
+
+    // reuse cache between interval updates
+    GenomicIntervalQuery query{ds, indexCache};
+    verifyInterval(query, GenomicInterval{refName, 5000, 8000}, 7);
+    verifyInterval(query, GenomicInterval{refName, 0, 100}, 1);
+    verifyInterval(query, GenomicInterval{refName, 9300, 9400}, 2);
+
+    // reuse cache in independent query
+    GenomicIntervalQuery query2{ds, indexCache};
+    verifyInterval(query2, GenomicInterval{refName, 5000, 8000}, 7);
+}
diff --git a/tests/src/test_GenomicIntervals.cpp b/tests/src/test_GenomicIntervals.cpp

new file mode 100644 (file)

index 0000000..6ae0b86
--- /dev/null
+++ b/tests/src/test_GenomicIntervals.cpp
@@ -0,0 +1,165 @@
+// Author: David Seifert
+
+#include <gtest/gtest.h>
+
+#include <pbbam/DataSet.h>
+
+#include "PbbamTestData.h"
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+namespace {
+
+// directory (inside the test data directory) holding the *.alignmentset.xml inputs
+const std::string inputDir = PacBio::BAM::PbbamTestsConfig::Data_Dir + "/test_GenomicIntervals/";
+
+}  // namespace
+
+TEST(DataSetGenomicIntervalsTest, NoFilter)
+{
+    // vanilla AlignmentSet, no filters: one interval per contig is expected
+    const std::vector<GenomicInterval> expected{{"contig1", 0, 20}, {"contig2", 0, 10}};
+
+    DataSet dataset{inputDir + "no_filter.alignmentset.xml"};
+    dataset.Type(DataSet::ALIGNMENT);
+    const std::vector<GenomicInterval> actual = dataset.GenomicIntervals();
+
+    EXPECT_EQ(expected, actual);
+}
+
+TEST(DataSetGenomicIntervalsTest, Empty)
+{
+    // contig1:[5, 5) - both offsets lie within range, yet the interval itself is empty,
+    // so no intervals are expected
+    const std::vector<GenomicInterval> expected;
+
+    DataSet dataset{inputDir + "empty.alignmentset.xml"};
+    dataset.Type(DataSet::ALIGNMENT);
+    const std::vector<GenomicInterval> actual = dataset.GenomicIntervals();
+
+    EXPECT_EQ(expected, actual);
+}
+
+TEST(DataSetGenomicIntervalsTest, OutOfRange)
+{
+    // contig1:[1000, 10000) - the selected range lies above the contig1 size of 20,
+    // so no intervals are expected
+    const std::vector<GenomicInterval> expected;
+
+    DataSet dataset{inputDir + "out_of_range.alignmentset.xml"};
+    dataset.Type(DataSet::ALIGNMENT);
+    const std::vector<GenomicInterval> actual = dataset.GenomicIntervals();
+
+    EXPECT_EQ(expected, actual);
+}
+
+TEST(DataSetGenomicIntervalsTest, SingleInterval)
+{
+    // a single filtered interval, contig1:[3, 10)
+    const std::vector<GenomicInterval> expected{{"contig1", 3, 10}};
+
+    DataSet dataset{inputDir + "single_interval.alignmentset.xml"};
+    dataset.Type(DataSet::ALIGNMENT);
+    const std::vector<GenomicInterval> actual = dataset.GenomicIntervals();
+
+    EXPECT_EQ(expected, actual);
+}
+
+TEST(DataSetGenomicIntervalsTest, WholeContig)
+{
+    // contig1:[0, 20) - the range filter selects the whole contig
+    const std::vector<GenomicInterval> expected{{"contig1", 0, 20}};
+
+    DataSet dataset{inputDir + "whole_contig.alignmentset.xml"};
+    dataset.Type(DataSet::ALIGNMENT);
+    const std::vector<GenomicInterval> actual = dataset.GenomicIntervals();
+
+    EXPECT_EQ(expected, actual);
+}
+
+TEST(DataSetGenomicIntervalsTest, ContigNameOnly)
+{
+    // contig1 selected by name only, without a range filter: the whole contig is expected
+    const std::vector<GenomicInterval> expected{{"contig1", 0, 20}};
+
+    DataSet dataset{inputDir + "contig_name_only.alignmentset.xml"};
+    dataset.Type(DataSet::ALIGNMENT);
+    const std::vector<GenomicInterval> actual = dataset.GenomicIntervals();
+
+    EXPECT_EQ(expected, actual);
+}
+
+TEST(DataSetGenomicIntervalsTest, SingleIntervalLessOrEqual)
+{
+    // exercises the "tstart <=" relation; expected interval is contig1:[3, 11)
+    const std::vector<GenomicInterval> expected{{"contig1", 3, 11}};
+
+    DataSet dataset{inputDir + "single_interval_start_lte.alignmentset.xml"};
+    dataset.Type(DataSet::ALIGNMENT);
+    const std::vector<GenomicInterval> actual = dataset.GenomicIntervals();
+
+    EXPECT_EQ(expected, actual);
+}
+
+TEST(DataSetGenomicIntervalsTest, SingleIntervalGreaterOrEqual)
+{
+    // exercises the "tend >=" relation; expected interval is contig1:[2, 10)
+    const std::vector<GenomicInterval> expected{{"contig1", 2, 10}};
+
+    DataSet dataset{inputDir + "single_interval_end_gte.alignmentset.xml"};
+    dataset.Type(DataSet::ALIGNMENT);
+    const std::vector<GenomicInterval> actual = dataset.GenomicIntervals();
+
+    EXPECT_EQ(expected, actual);
+}
+
+TEST(DataSetGenomicIntervalsTest, DisjointIntervals)
+{
+    // contig1:[3, 7) and [13, 17) - disjoint intervals must remain disjoint
+    const std::vector<GenomicInterval> expected{{"contig1", 3, 7}, {"contig1", 13, 17}};
+
+    DataSet dataset{inputDir + "disjoint_intervals.alignmentset.xml"};
+    dataset.Type(DataSet::ALIGNMENT);
+    const std::vector<GenomicInterval> actual = dataset.GenomicIntervals();
+
+    EXPECT_EQ(expected, actual);
+}
+
+TEST(DataSetGenomicIntervalsTest, AdjacentIntervals)
+{
+    // the adjacent intervals [3, 10) and [10, 17) on contig1 must be merged
+    // into the single overall interval contig1:[3, 17)
+    const std::vector<GenomicInterval> expected{{"contig1", 3, 17}};
+
+    DataSet dataset{inputDir + "adjacent_intervals.alignmentset.xml"};
+    dataset.Type(DataSet::ALIGNMENT);
+    const std::vector<GenomicInterval> actual = dataset.GenomicIntervals();
+
+    EXPECT_EQ(expected, actual);
+}
+
+TEST(DataSetGenomicIntervalsTest, TwoContigs)
+{
+    // intervals on different contigs, contig1:[3, 11) and contig2:[2, 7);
+    // also exercises the "tstart <=" and "tend >=" relations
+    const std::vector<GenomicInterval> expected{{"contig1", 3, 11}, {"contig2", 2, 7}};
+
+    DataSet dataset{inputDir + "two_contigs.alignmentset.xml"};
+    dataset.Type(DataSet::ALIGNMENT);
+    const std::vector<GenomicInterval> actual = dataset.GenomicIntervals();
+
+    EXPECT_EQ(expected, actual);
+}
+
+// Test various invalid AlignmentSets
+TEST(DataSetGenomicIntervalsTest, InvalidMissingRname)
+{
+    // an AlignmentSet whose filter lacks "rname" must be rejected
+    DataSet dataset{inputDir + "invalid_missing_rname.alignmentset.xml"};
+    dataset.Type(DataSet::ALIGNMENT);
+
+    EXPECT_THROW(dataset.GenomicIntervals(), std::runtime_error);
+}
+
+TEST(DataSetGenomicIntervalsTest, InvalidRnameOperator)
+{
+    // the nonsensical operator ">" on "rname" must be rejected
+    DataSet dataset{inputDir + "invalid_rname_operator.alignmentset.xml"};
+    dataset.Type(DataSet::ALIGNMENT);
+
+    EXPECT_THROW(dataset.GenomicIntervals(), std::runtime_error);
+}
+
+TEST(DataSetGenomicIntervalsTest, InvalidTstartOperator)
+{
+    // the nonsensical operator "=" on "tstart" must be rejected
+    DataSet dataset{inputDir + "invalid_tstart_operator.alignmentset.xml"};
+    dataset.Type(DataSet::ALIGNMENT);
+
+    EXPECT_THROW(dataset.GenomicIntervals(), std::runtime_error);
+}
diff --git a/tests/src/test_IndexedBamWriter.cpp b/tests/src/test_IndexedBamWriter.cpp

new file mode 100644 (file)

index 0000000..59d8c81
--- /dev/null
+++ b/tests/src/test_IndexedBamWriter.cpp
@@ -0,0 +1,125 @@
+// Author: Derek Barnett
+
+#include <gtest/gtest.h>
+
+#include "PbbamTestData.h"
+
+#include <pbbam/BamFile.h>
+#include <pbbam/BamReader.h>
+#include <pbbam/BamRecord.h>
+#include <pbbam/EntireFileQuery.h>
+#include <pbbam/IndexedBamWriter.h>
+#include <pbbam/PbiBuilder.h>
+#include <pbbam/PbiRawData.h>
+
+// clang-format off
+
+TEST(IndexedBamWriter, WritesValidIndex)
+{
+    // Copies a BAM file through IndexedBamWriter, then checks the copy twice: by
+    // reading it sequentially, and by seeking to the file offsets stored in the
+    // companion PBI in reverse order. Both passes compare record names against
+    // 'expectedQNames'.
+    using namespace PacBio::BAM;
+
+    const std::string inBam = PbbamTestsConfig::Data_Dir + "/polymerase/internal.subreads.bam";
+    const std::string outBam = PbbamTestsConfig::GeneratedData_Dir + "/ibw.bam";
+    const std::string outPbi = PbbamTestsConfig::GeneratedData_Dir + "/ibw.bam.pbi";
+
+    const BamFile file{inBam};
+    const auto& header = file.Header();
+
+    const std::vector<std::string> expectedQNames{
+        "ArminsFakeMovie/100000/2659_3025",
+        "ArminsFakeMovie/100000/3116_3628",
+        "ArminsFakeMovie/100000/3722_4267",
+        "ArminsFakeMovie/100000/4356_4864",
+        "ArminsFakeMovie/100000/4960_5477",
+        "ArminsFakeMovie/100000/5571_6087",
+        "ArminsFakeMovie/100000/6199_6719",
+        "ArminsFakeMovie/100000/6812_7034",
+        "ArminsFakeMovie/200000/2659_3025",
+        "ArminsFakeMovie/200000/3116_3628",
+        "ArminsFakeMovie/200000/3722_4267",
+        "ArminsFakeMovie/200000/4356_4864",
+        "ArminsFakeMovie/200000/4960_5477",
+        "ArminsFakeMovie/200000/5571_6087",
+        "ArminsFakeMovie/200000/6199_6719",
+        "ArminsFakeMovie/200000/6812_7034",
+        "ArminsFakeMovie/300000/2659_3025",
+        "ArminsFakeMovie/300000/3116_3628",
+        "ArminsFakeMovie/300000/3722_4267",
+        "ArminsFakeMovie/300000/4356_4864",
+        "ArminsFakeMovie/300000/4960_5477",
+        "ArminsFakeMovie/300000/5571_6087",
+        "ArminsFakeMovie/300000/6199_6719",
+        "ArminsFakeMovie/300000/6812_7034"
+    };
+
+    {   // copy file & generate index
+
+        BamReader reader{file};
+        IndexedBamWriter writer{outBam, header};
+        BamRecord b;
+        while (reader.GetNext(b))
+            writer.Write(b);
+    }
+
+    {   // sequential read of new BAM
+
+        BamReader reader{outBam};
+        BamRecord b;
+        for (size_t i = 0; i < expectedQNames.size(); ++i)
+        {
+            // stop right away if a record cannot be read; the name check below
+            // would otherwise run against whatever 'b' held before
+            ASSERT_TRUE(reader.GetNext(b));
+            EXPECT_EQ(expectedQNames.at(i), b.FullName());
+        }
+    }
+
+    {   // check random access in new BAM, using companion PBI
+
+        const PbiRawData idx{outPbi};
+        const auto& offsets = idx.BasicData().fileOffset_;
+
+        BamReader reader{outBam};
+        BamRecord b;
+        // walk the records last-to-first, so every read depends on the seek before it
+        for (size_t i = expectedQNames.size(); i-- > 0;)
+        {
+            reader.VirtualSeek(offsets.at(i));
+            ASSERT_TRUE(reader.GetNext(b));
+            EXPECT_EQ(expectedQNames.at(i), b.FullName());
+        }
+    }
+}
+
+TEST(IndexedBamWriter, HandlesVeryLongReads)
+{
+    // Copies long_reads.bam through IndexedBamWriter, then uses the file offsets from
+    // the generated PBI to seek to, and read, the first 100 records of the copy.
+    using namespace PacBio::BAM;
+
+    const std::string inBamFn = PbbamTestsConfig::Data_Dir + "/long_reads.bam";
+    const std::string outBamFn = PbbamTestsConfig::GeneratedData_Dir + "/long_reads.copy.bam";
+    const std::string outPbiFn = PbbamTestsConfig::GeneratedData_Dir + "/long_reads.copy.bam.pbi";
+
+    // copy file, writing inline PBI index
+    {
+        BamFile file{inBamFn};
+        IndexedBamWriter writer{outBamFn, file.Header()};
+        EntireFileQuery query{file};
+        for (const auto& b : query)
+            writer.Write(b);
+    }
+
+    {
+        const PbiRawData idx{outPbiFn};
+        const auto& offsets = idx.BasicData().fileOffset_;
+
+        BamReader reader{outBamFn};
+        BamRecord b;
+        for (int i = 0; i < 100; ++i)
+        {
+            reader.VirtualSeek(offsets.at(i));
+            // non-fatal check, so the cleanup below still runs if a read fails
+            EXPECT_TRUE(reader.GetNext(b));
+        }
+    }
+
+    remove(outBamFn.c_str());
+    remove(outPbiFn.c_str());
+}
+
+// clang-format on
diff --git a/tests/src/test_IndexedFastaReader.cpp b/tests/src/test_IndexedFastaReader.cpp

new file mode 100644 (file)

index 0000000..7afe0c7
--- /dev/null
+++ b/tests/src/test_IndexedFastaReader.cpp
@@ -0,0 +1,169 @@
+// Author: Derek Barnett
+
+#include <iostream>
+#include <sstream>
+#include <string>
+
+#include <gtest/gtest.h>
+
+#include "PbbamTestData.h"
+
+#include "pbbam/BamFile.h"
+#include "pbbam/BamRecord.h"
+#include "pbbam/EntireFileQuery.h"
+#include "pbbam/IndexedFastaReader.h"
+
+#include "FastxTests.h"
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+namespace IndexedFastaReaderTests {
+// input files, located under the test data directory
+const std::string lambdaFasta{PbbamTestsConfig::Data_Dir + "/lambdaNEB.fa"};
+const std::string singleInsertionBam{PbbamTestsConfig::Data_Dir + "/aligned.bam"};
+
+}  // namespace IndexedFastaReaderTests
+
+TEST(IndexedFastaReaderTest, throws_on_empty_filename)
+{   // constructing a reader from an empty filename is expected to throw
+    EXPECT_THROW(IndexedFastaReader fasta{""}, std::runtime_error);
+}
+
+TEST(IndexedFastaReaderTest, throws_on_invalid_extension)
+{   // a filename with an unrecognized extension is expected to throw
+    EXPECT_THROW(IndexedFastaReader fasta{"wrong.ext"}, std::runtime_error);
+}
+
+TEST(IndexedFastaReaderTest, can_open_text_fasta_for_reading)
+{
+    // FastxTests::simpleFastaFn is expected to open without throwing
+    EXPECT_NO_THROW(IndexedFastaReader fasta{FastxTests::simpleFastaFn});
+}
+
+TEST(IndexedFastaReaderTest, throws_on_gzip_fasta)
+{
+    // FastxTests::simpleFastaGzipFn is expected to be rejected at construction
+    EXPECT_THROW(IndexedFastaReader fasta{FastxTests::simpleFastaGzipFn}, std::runtime_error);
+}
+
+TEST(IndexedFastaReaderTest, can_open_bgzf_fasta_for_reading)
+{
+    // FastxTests::simpleFastaBgzfFn is expected to open without throwing
+    EXPECT_NO_THROW(IndexedFastaReader fasta{FastxTests::simpleFastaBgzfFn});
+}
+
+TEST(IndexedFastaReaderTest, can_fetch_subsequence_from_lambda)
+{
+    IndexedFastaReader r(IndexedFastaReaderTests::lambdaFasta);
+
+    EXPECT_TRUE(r.HasSequence("lambda_NEB3011"));
+    EXPECT_FALSE(r.HasSequence("dog"));
+    EXPECT_EQ(1, r.NumSequences());
+    EXPECT_EQ(48502, r.SequenceLength("lambda_NEB3011"));
+
+    std::string seq = r.Subsequence("lambda_NEB3011:0-10");
+    EXPECT_EQ("GGGCGGCGAC", seq);
+
+    std::string seq2 = r.Subsequence("lambda_NEB3011", 0, 10);
+    EXPECT_EQ("GGGCGGCGAC", seq2);
+
+    // subsequence extending beyond bounds returns clipped
+    std::string seq3 = r.Subsequence("lambda_NEB3011", 48400, 48600);
+    EXPECT_EQ(102, seq3.length());
+
+    // empty subsequence
+    std::string emptySeq = r.Subsequence("lambda_NEB3011", 10, 10);
+    EXPECT_EQ("", emptySeq);
+}
+
+TEST(IndexedFastaReaderTest, prints_clipped_and_gapped_subsequences_from_lambda)
+{   // NOTE(review): the bool arguments look like (gapped, exciseSoftClips) - confirm in IndexedFastaReader.h
+    IndexedFastaReader r(IndexedFastaReaderTests::lambdaFasta);
+
+    // Open BAM file
+    BamFile bamFile(IndexedFastaReaderTests::singleInsertionBam);
+    EntireFileQuery bamQuery(bamFile);
+
+    auto it = bamQuery.begin();
+    auto record = *it++;  // 1st record: one expected string for all four argument combinations
+    EXPECT_EQ("GGCTGCAGTGTACAGCGGTCAGGAGGCC-ATTGATGCCGGACTGGCTGAT",
+              r.ReferenceSubsequence(record, Orientation::NATIVE, true));
+    EXPECT_EQ("GGCTGCAGTGTACAGCGGTCAGGAGGCC-ATTGATGCCGGACTGGCTGAT",
+              r.ReferenceSubsequence(record, Orientation::NATIVE, true, true));
+    EXPECT_EQ("GGCTGCAGTGTACAGCGGTCAGGAGGCC-ATTGATGCCGGACTGGCTGAT",
+              r.ReferenceSubsequence(record, Orientation::GENOMIC, true));
+    EXPECT_EQ("GGCTGCAGTGTACAGCGGTCAGGAGGCC-ATTGATGCCGGACTGGCTGAT",
+              r.ReferenceSubsequence(record, Orientation::GENOMIC, true, true));
+    record = *it++;  // 2nd record: same expectations as the 1st
+    EXPECT_EQ("GGCTGCAGTGTACAGCGGTCAGGAGGCC-ATTGATGCCGGACTGGCTGAT",
+              r.ReferenceSubsequence(record, Orientation::NATIVE, true));
+    EXPECT_EQ("GGCTGCAGTGTACAGCGGTCAGGAGGCC-ATTGATGCCGGACTGGCTGAT",
+              r.ReferenceSubsequence(record, Orientation::NATIVE, true, true));
+    EXPECT_EQ("GGCTGCAGTGTACAGCGGTCAGGAGGCC-ATTGATGCCGGACTGGCTGAT",
+              r.ReferenceSubsequence(record, Orientation::GENOMIC, true));
+    EXPECT_EQ("GGCTGCAGTGTACAGCGGTCAGGAGGCC-ATTGATGCCGGACTGGCTGAT",
+              r.ReferenceSubsequence(record, Orientation::GENOMIC, true, true));
+    record = *it++;  // 3rd record: the leading '-' run is expected only without the final `true`
+    EXPECT_EQ(
+        "----------------------------------------------------"
+        "AAGTCACCAATGTGGGACGTCCGTCGATGGCAGAAGATCGCAGCACGGT-AACAGCGGCAA",
+        r.ReferenceSubsequence(record, Orientation::NATIVE, true));
+    EXPECT_EQ("AAGTCACCAATGTGGGACGTCCGTCGATGGCAGAAGATCGCAGCACGGT-AACAGCGGCAA",
+              r.ReferenceSubsequence(record, Orientation::NATIVE, true, true));
+    EXPECT_EQ(
+        "----------------------------------------------------"
+        "AAGTCACCAATGTGGGACGTCCGTCGATGGCAGAAGATCGCAGCACGGT-AACAGCGGCAA",
+        r.ReferenceSubsequence(record, Orientation::GENOMIC, true));
+    EXPECT_EQ("AAGTCACCAATGTGGGACGTCCGTCGATGGCAGAAGATCGCAGCACGGT-AACAGCGGCAA",
+              r.ReferenceSubsequence(record, Orientation::GENOMIC, true, true));
+    record = *it++;  // 4th record: NATIVE expectations are the reverse complement of GENOMIC ones
+    EXPECT_EQ(
+        "AAGTCACCAATGTGGGACGTCCGTCGATGGCAGAAGATCGCAGCACGGT-AACAGCGGCAA-----------------------------"
+        "-----------------------",
+        r.ReferenceSubsequence(record, Orientation::GENOMIC, true));
+    EXPECT_EQ(
+        "----------------------------------------------------TTGCCGCTGTT-"
+        "ACCGTGCTGCGATCTTCTGCCATCGACGGACGTCCCACATTGGTGACTT",
+        r.ReferenceSubsequence(record, Orientation::NATIVE, true));
+    EXPECT_EQ("AAGTCACCAATGTGGGACGTCCGTCGATGGCAGAAGATCGCAGCACGGT-AACAGCGGCAA",
+              r.ReferenceSubsequence(record, Orientation::GENOMIC, true, true));
+    EXPECT_EQ("TTGCCGCTGTT-ACCGTGCTGCGATCTTCTGCCATCGACGGACGTCCCACATTGGTGACTT",
+              r.ReferenceSubsequence(record, Orientation::NATIVE, true, true));
+}
+
+// TODO: come back to this test; its "access without opening" checks are commented out
+TEST(IndexedFastaReaderTest, throws_on_invalid_subsequence_requests)
+{
+    IndexedFastaReader fasta{IndexedFastaReaderTests::lambdaFasta};
+
+    //
+    // disabled: access attempts without "opening"
+    //
+    // EXPECT_THROW(fasta.NumSequences(), std::exception);
+    // EXPECT_THROW(fasta.HasSequence("lambda_NEB3011"), std::exception);
+    // EXPECT_THROW(fasta.SequenceLength("lambda_NEB3011"), std::exception);
+    // EXPECT_THROW(fasta.Subsequence("lambda_NEB3011:0-10"), std::exception);
+
+    //
+    // after opening: requests naming an unknown sequence are expected to throw
+    //
+    EXPECT_THROW(fasta.SequenceLength("dog"), std::exception);
+    EXPECT_THROW(fasta.Subsequence("dog:0-10"), std::exception);
+}
+
+//
+TEST(IndexedFastaReaderTest, can_fetch_name_info_from_lambda)
+{
+    IndexedFastaReader fasta{IndexedFastaReaderTests::lambdaFasta};
+    const std::vector<std::string> expectedNames{"lambda_NEB3011"};
+
+    // request for the full list of names
+    EXPECT_EQ(expectedNames, fasta.Names());
+
+    // lookup of a single name by index
+    EXPECT_EQ(expectedNames.front(), fasta.Name(0));
+
+    // invalid name access (index out of range)
+    EXPECT_THROW(fasta.Name(1), std::exception);
+}
diff --git a/tests/src/test_IndexedFastqReader.cpp b/tests/src/test_IndexedFastqReader.cpp

new file mode 100644 (file)

index 0000000..89485a7
--- /dev/null
+++ b/tests/src/test_IndexedFastqReader.cpp
@@ -0,0 +1,137 @@
+// Author: Derek Barnett
+
+#include <gtest/gtest.h>
+
+#include "PbbamTestData.h"
+
+#include <htslib/hts.h>
+
+#include "pbbam/IndexedFastqReader.h"
+
+#include "FastxTests.h"
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+namespace IndexedFastqReaderTests {  // currently holds no test-local helpers
+}  // namespace IndexedFastqReaderTests
+
+TEST(IndexedFastqReaderTest, throws_on_empty_filename)
+{   // constructing a reader from an empty filename is expected to throw
+    EXPECT_THROW(IndexedFastqReader fastq{""}, std::runtime_error);
+}
+
+TEST(IndexedFastqReaderTest, throws_on_invalid_extension)
+{   // a filename with an unrecognized extension is expected to throw
+    EXPECT_THROW(IndexedFastqReader fastq{"wrong.ext"}, std::runtime_error);
+}
+
+TEST(IndexedFastqReaderTest, can_open_text_fastq)
+{
+    // FastxTests::simpleFastqFn is expected to open without throwing
+    EXPECT_NO_THROW(IndexedFastqReader fastq{FastxTests::simpleFastqFn});
+}
+
+TEST(IndexedFastqReaderTest, throws_on_gzip_fastq)
+{
+    // FastxTests::simpleFastqGzipFn is expected to be rejected at construction
+    EXPECT_THROW(IndexedFastqReader fastq{FastxTests::simpleFastqGzipFn}, std::runtime_error);
+}
+
+TEST(IndexedFastqReaderTest, can_open_bgzf_fastq_for_reading)
+{
+    // FastxTests::simpleFastqBgzfFn is expected to open without throwing
+    EXPECT_NO_THROW(IndexedFastqReader fastq{FastxTests::simpleFastqBgzfFn});
+}
+
+TEST(IndexedFastqReaderTest, can_query_index_for_metadata)
+{
+    const IndexedFastqReader fastq{FastxTests::simpleFastqFn};
+    // presence, count, and length lookups
+    EXPECT_TRUE(fastq.HasSequence("seq1"));
+    EXPECT_FALSE(fastq.HasSequence("nope"));
+    EXPECT_EQ(8, fastq.NumSequences());
+    EXPECT_EQ(63, fastq.SequenceLength("seq5"));
+
+    // names must match FastxTests::ExpectedFastq, entry for entry
+    const auto& observed = fastq.Names();
+    ASSERT_EQ(FastxTests::ExpectedFastq.size(), observed.size());
+    for (size_t idx = 0; idx != observed.size(); ++idx)
+        EXPECT_EQ(FastxTests::ExpectedFastq.at(idx).Name(), observed.at(idx));
+}
+
+TEST(IndexedFastqReaderTest, subsequence_from_text_fastq)
+{
+    IndexedFastqReader r{FastxTests::simpleFastqFn};
+    {
+        const std::string expectedSeq{"GCATGCATGC"};
+        const QualityValues expectedQuals{"~}|{zyxwvu"};
+
+        const auto subsequence = r.Subsequence("seq2", 0, 10);
+
+        EXPECT_EQ(expectedSeq, subsequence.first);
+        EXPECT_EQ(expectedQuals, subsequence.second.Fastq());
+    }
+    {
+        const std::string expectedSeq{"ATGCATGCAT"};
+        const QualityValues expectedQuals{R"(`_^]\[ZYXW)"};
+
+        const auto subsequence = r.Subsequence("seq6", 30, 40);
+
+        EXPECT_EQ(expectedSeq, subsequence.first);
+        EXPECT_EQ(expectedQuals, subsequence.second.Fastq());
+    }
+}
+
+TEST(IndexedFastqReaderTest, subsequence_from_bgzf_fastq)
+{
+    IndexedFastqReader r{FastxTests::simpleFastqBgzfFn};
+    {
+        const std::string expectedSeq{"GCATGCATGC"};
+        const QualityValues expectedQuals{"~}|{zyxwvu"};
+
+        const auto subsequence = r.Subsequence("seq2", 0, 10);
+
+        EXPECT_EQ(expectedSeq, subsequence.first);
+        EXPECT_EQ(expectedQuals, subsequence.second.Fastq());
+    }
+    {
+        const std::string expectedSeq{"ATGCATGCAT"};
+        const QualityValues expectedQuals{R"(`_^]\[ZYXW)"};
+
+        const auto subsequence = r.Subsequence("seq6", 30, 40);
+
+        EXPECT_EQ(expectedSeq, subsequence.first);
+        EXPECT_EQ(expectedQuals, subsequence.second.Fastq());
+    }
+}
+
+TEST(IndexedFastqReaderTest, returns_empty_result_from_empty_region)
+{   // begin == end: both bases and qualities are expected to come back empty
+    IndexedFastqReader fastq{FastxTests::simpleFastqFn};
+    const auto fetched = fastq.Subsequence("seq2", 0, 0);
+    EXPECT_TRUE(fetched.first.empty());
+    EXPECT_TRUE(fetched.second.empty());
+}
+
+TEST(IndexedFastqReaderTest, throws_if_region_is_malformated)
+{
+    IndexedFastqReader fastq{FastxTests::simpleFastqFn};
+
+    // begin past end
+    EXPECT_THROW(fastq.Subsequence("seq2", 10, 5), std::runtime_error);
+
+    // negative begin, negative end, or both
+    EXPECT_THROW(fastq.Subsequence("seq2", -1, 5), std::runtime_error);
+    EXPECT_THROW(fastq.Subsequence("seq2", 5, -1), std::runtime_error);
+    EXPECT_THROW(fastq.Subsequence("seq2", -2, -1), std::runtime_error);
+}
+
+TEST(IndexedFastqReaderTest, returns_available_length_if_region_is_longer)
+{
+    // an over-long region yields only what is available, like std::string::substr()
+
+    IndexedFastqReader fastq{FastxTests::simpleFastqFn};
+    const auto fetched = fastq.Subsequence("seq2", 0, 1000);
+    EXPECT_EQ(63, fetched.first.size());
+}
diff --git a/tests/src/test_LongCigar.cpp b/tests/src/test_LongCigar.cpp

new file mode 100644 (file)

index 0000000..ad39c3b
--- /dev/null
+++ b/tests/src/test_LongCigar.cpp
@@ -0,0 +1,125 @@
+// Author: Derek Barnett
+
+#include <iostream>
+#include <string>
+#include <tuple>
+
+#include <gtest/gtest.h>
+
+#include "PbbamTestData.h"
+
+#include <pbbam/BamReader.h>
+#include <pbbam/BamWriter.h>
+#include <pbbam/StringUtilities.h>
+
+#include "../../src/MemoryUtils.h"
+
+using BamReader = PacBio::BAM::BamReader;
+using BamRecord = PacBio::BAM::BamRecord;
+using BamWriter = PacBio::BAM::BamWriter;
+using Cigar = PacBio::BAM::Cigar;
+using CigarOp = PacBio::BAM::CigarOperation;
+using PacBio::BAM::CigarOperationType;
+using Tag = PacBio::BAM::Tag;
+
+// clang-format off
+
+namespace LongCigarTests {
+
+static bool DoesHtslibSupportLongCigar()
+{
+    const std::string htsVersion = hts_version();
+
+    // remove any "-<blah>" for non-release versions
+    const auto versionBase = PacBio::BAM::Split(htsVersion, '-');
+    if (versionBase.empty())
+        throw std::runtime_error{"invalid htslib version format: " + htsVersion};
+
+    // grab major/minor version numbers
+    const auto versionParts = PacBio::BAM::Split(versionBase[0], '.');
+    if (versionParts.size() < 2)
+         throw std::runtime_error{"invalid htslib version format: " + htsVersion};
+
+    // check against v1.7
+    const int versionMajor = std::stoi(versionParts[0]);
+    const int versionMinor = std::stoi(versionParts[1]);
+    static constexpr const int v17_major = 1;
+    static constexpr const int v17_minor = 7;
+    return std::tie(versionMajor, versionMinor) >=
+           std::tie(v17_major, v17_minor);
+}
+
+static const bool has_native_long_cigar_support = DoesHtslibSupportLongCigar();
+
+// BAM record in this file has its CIGAR data in the new "CG" tag
+static const std::string LongCigarBam = PacBio::BAM::PbbamTestsConfig::Data_Dir + "/long-cigar-1.7.bam";
+
+static const std::string LongCigarOut =
+    PacBio::BAM::PbbamTestsConfig::GeneratedData_Dir + "/long-cigar-generated.bam";
+
+static const size_t numOps = 72091;
+
+static BamRecord ReadLongCigarRecord(const std::string& fn)
+{
+    BamRecord b;
+    BamReader reader{fn};
+    const bool success = reader.GetNext(b);
+    EXPECT_TRUE(success);
+    return b;
+}
+
+}  // namespace LongCigarTests
+
+TEST(LongCigarTest, ReadAndFetchLongCigar)
+{
+    const auto record = LongCigarTests::ReadLongCigarRecord(LongCigarTests::LongCigarBam);
+    EXPECT_EQ(LongCigarTests::numOps, record.CigarData().size());
+
+    // the "CG" tag is expected on the record only without native long-CIGAR support
+    const bool cgTagPresent = record.Impl().HasTag("CG");
+    const bool cgTagExpected = !LongCigarTests::has_native_long_cigar_support;
+    EXPECT_EQ(cgTagExpected, cgTagPresent);
+}
+
+TEST(LongCigarTest, EditLongCigar)
+{
+    auto record = LongCigarTests::ReadLongCigarRecord(LongCigarTests::LongCigarBam);
+    record.Impl().CigarData(record.CigarData());  // re-assign the record's own CIGAR
+    EXPECT_EQ(LongCigarTests::numOps, record.CigarData().size());
+
+    // the "CG" tag is expected on the record only without native long-CIGAR support
+    const bool cgTagPresent = record.Impl().HasTag("CG");
+    const bool cgTagExpected = !LongCigarTests::has_native_long_cigar_support;
+    EXPECT_EQ(cgTagExpected, cgTagPresent);
+}
+
+TEST(LongCigarTest, WriteLongCigar)
+{
+    SCOPED_TRACE("WriteLongCigar");
+
+    {   // edit & write
+        auto b = LongCigarTests::ReadLongCigarRecord(LongCigarTests::LongCigarBam);
+        b.Impl().CigarData(b.CigarData());
+
+        EXPECT_EQ(LongCigarTests::numOps, b.CigarData().size());
+        if (LongCigarTests::has_native_long_cigar_support)
+            EXPECT_FALSE(b.Impl().HasTag("CG"));
+        else
+            EXPECT_TRUE(b.Impl().HasTag("CG"));
+
+        BamWriter writer{LongCigarTests::LongCigarOut, b.header_};
+        writer.Write(b);
+    }
+
+    {   // read back in
+        auto b = LongCigarTests::ReadLongCigarRecord(LongCigarTests::LongCigarOut);
+
+        EXPECT_EQ(LongCigarTests::numOps, b.CigarData().size());
+        if (LongCigarTests::has_native_long_cigar_support)
+            EXPECT_FALSE(b.Impl().HasTag("CG"));
+        else
+            EXPECT_TRUE(b.Impl().HasTag("CG"));
+    }
+}
+
+// clang-format on
diff --git a/tests/src/test_PacBioIndex.cpp b/tests/src/test_PacBioIndex.cpp

new file mode 100644 (file)

index 0000000..2809d2d
--- /dev/null
+++ b/tests/src/test_PacBioIndex.cpp
@@ -0,0 +1,444 @@
+// Author: Derek Barnett
+
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+
+#include <gtest/gtest.h>
+
+#include "PbbamTestData.h"
+
+#include <pbbam/BamFile.h>
+#include <pbbam/BamReader.h>
+#include <pbbam/BamWriter.h>
+#include <pbbam/EntireFileQuery.h>
+#include <pbbam/PbiBuilder.h>
+#include <pbbam/PbiRawData.h>
+#include <pbbam/Unused.h>
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+namespace PacBioIndexTests {
+
+const std::string test2BamFn = PbbamTestsConfig::Data_Dir + "/aligned2.bam";
+const std::string phi29BamFn = PbbamTestsConfig::Data_Dir + "/phi29.bam";
+
+static PbiRawData Test2Bam_CoreIndexData()
+{   // hand-built index contents (10 reads; BASIC, MAPPED and REFERENCE sections) used as expected data
+    PbiRawData rawData;
+    rawData.Version(PbiFile::CurrentVersion);
+    rawData.FileSections(PbiFile::BASIC | PbiFile::MAPPED | PbiFile::REFERENCE);
+    rawData.NumReads(10);
+
+    PbiRawBasicData& basicData = rawData.BasicData();
+    basicData.rgId_ = {-1197849594, -1197849594, -1197849594, -1197849594, -1197849594,
+                       -1197849594, -1197849594, -1197849594, -1197849594, -1197849594};
+    basicData.qStart_ = {48, 387, 0, 9936, 10232, 7468, 5557, 7285, 426, 7064};
+    basicData.qEnd_ = {1132, 1134, 344, 10187, 10394, 8906, 7235, 8657, 1045, 7421};
+    basicData.holeNumber_ = {49050, 32328, 32328, 6469, 6469, 30983, 13473, 13473, 19915, 30983};
+    basicData.readQual_ = {0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6};
+    basicData.ctxtFlag_ = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+    basicData.fileOffset_ = {33816576, 33825163, 33831333, 33834264, 33836542,
+                             33838065, 33849818, 33863499, 33874621, 1392836608};
+
+    PbiRawMappedData& mappedData = rawData.MappedData();
+    mappedData.tId_ = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+    mappedData.tStart_ = {0, 302, 675, 2170, 2203, 3572, 4506, 4507, 4592, 4669};
+    mappedData.tEnd_ = {471, 1019, 1026, 2397, 2326, 5015, 6125, 5850, 5203, 5011};
+    mappedData.aStart_ = {653, 395, 1, 9960, 10271, 7468, 5574, 7285, 441, 7075};
+    mappedData.aEnd_ = {1129, 1134, 344, 10185, 10394, 8906, 7235, 8647, 1040, 7418};
+    mappedData.revStrand_ = {0, 1, 0, 1, 0, 1, 1, 0, 1, 0};
+    mappedData.nM_ = {460, 704, 339, 216, 118, 1394, 1581, 1313, 583, 333};
+    mappedData.nMM_ = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+    mappedData.mapQV_ = {254, 254, 254, 254, 254, 254, 254, 254, 254, 254};
+
+    PbiRawReferenceData& referenceData = rawData.ReferenceData();
+    referenceData.entries_ = {PbiReferenceEntry{0, 0, 10},
+                              PbiReferenceEntry{4294967295, 4294967295, 4294967295}};
+
+    return rawData;
+}
+
+// NOTE: We have 2 different sets of offsets because the copied, new file differs in size from the existing one.
+//
+//       Unsure which combination of write parameters were used on the original. Things like thread count,
+//       compression level, etc. can affect compression ratio, BGZF block sizes, etc. even though the BAM record
+//       content itself is equal. So we'll just track these index values separately, for now at least.
+//
+static PbiRawData Test2Bam_ExistingIndex()
+{   // core data, with the file offsets expected for the existing aligned2.bam
+    PbiRawData index = Test2Bam_CoreIndexData();
+    index.BasicData().fileOffset_ = {33816576, 33825163, 33831333, 33834264, 33836542,
+                                     33838065, 33849818, 33863499, 33874621, 1392836608};
+    return index;
+}
+
+static PbiRawData Test2Bam_NewIndex()
+{   // core data, with the file offsets expected for a newly written copy
+    PbiRawData index = Test2Bam_CoreIndexData();
+    index.BasicData().fileOffset_ = {33816576,  236126208, 391315456,  469106688,  537067520,
+                                     587792384, 867303424, 1182793728, 1449787392, 1582628864};
+    return index;
+}
+
+static void ExpectRawIndicesEqual(const PbiRawData& expected, const PbiRawData& actual)
+{   // field-by-field EXPECT_EQ comparison; note that Version() is not compared here
+    // header data
+    EXPECT_EQ(expected.FileSections(), actual.FileSections());
+    EXPECT_EQ(expected.NumReads(), actual.NumReads());
+
+    // subread data
+    const PbiRawBasicData& e = expected.BasicData();
+    const PbiRawBasicData& a = actual.BasicData();
+    EXPECT_EQ(e.rgId_, a.rgId_);
+    EXPECT_EQ(e.qStart_, a.qStart_);
+    EXPECT_EQ(e.qEnd_, a.qEnd_);
+    EXPECT_EQ(e.holeNumber_, a.holeNumber_);
+    EXPECT_EQ(e.readQual_, a.readQual_);
+    EXPECT_EQ(e.ctxtFlag_, a.ctxtFlag_);
+    EXPECT_EQ(e.fileOffset_, a.fileOffset_);
+
+    // mapped data
+    EXPECT_EQ(expected.HasMappedData(), actual.HasMappedData());
+    if (expected.HasMappedData() && actual.HasMappedData()) {  // compare contents only if both have it
+        const PbiRawMappedData& e2 = expected.MappedData();
+        const PbiRawMappedData& a2 = actual.MappedData();
+        EXPECT_EQ(e2.tId_, a2.tId_);
+        EXPECT_EQ(e2.tStart_, a2.tStart_);
+        EXPECT_EQ(e2.tEnd_, a2.tEnd_);
+        EXPECT_EQ(e2.aStart_, a2.aStart_);
+        EXPECT_EQ(e2.aEnd_, a2.aEnd_);
+        EXPECT_EQ(e2.revStrand_, a2.revStrand_);
+        EXPECT_EQ(e2.nM_, a2.nM_);
+        EXPECT_EQ(e2.nMM_, a2.nMM_);
+        EXPECT_EQ(e2.mapQV_, a2.mapQV_);
+    }
+
+    // reference data
+    EXPECT_EQ(expected.HasReferenceData(), actual.HasReferenceData());
+    if (expected.HasReferenceData() && actual.HasReferenceData()) {
+        const PbiRawReferenceData& e2 = expected.ReferenceData();
+        const PbiRawReferenceData& a2 = actual.ReferenceData();
+        EXPECT_EQ(e2.entries_, a2.entries_);
+    }
+
+    // barcode data
+    EXPECT_EQ(expected.HasBarcodeData(), actual.HasBarcodeData());
+    if (expected.HasBarcodeData() && actual.HasBarcodeData()) {
+        const PbiRawBarcodeData& e2 = expected.BarcodeData();
+        const PbiRawBarcodeData& a2 = actual.BarcodeData();
+        EXPECT_EQ(e2.bcForward_, a2.bcForward_);
+        EXPECT_EQ(e2.bcReverse_, a2.bcReverse_);
+        EXPECT_EQ(e2.bcQual_, a2.bcQual_);
+    }
+}
+
+}  // namespace PacBioIndexTests
+
+TEST(PacBioIndexTest, CreateFromExistingBam)
+{
+    // do this in temp directory, so we can ensure write access
+    const std::string tempDir = PbbamTestsConfig::GeneratedData_Dir + "/";
+    const std::string tempBamFn = tempDir + "aligned_copy.bam";
+    const std::string tempPbiFn = tempBamFn + ".pbi";
+    std::string cmd("cp ");
+    cmd += PacBioIndexTests::test2BamFn;
+    cmd += " ";
+    cmd += tempBamFn;
+    const auto cmdResult = system(cmd.c_str());
+    UNUSED(cmdResult);
+
+    BamFile bamFile(tempBamFn);
+    PbiFile::CreateFrom(bamFile);
+    EXPECT_EQ(tempPbiFn, bamFile.PacBioIndexFilename());
+
+    PbiRawData index(bamFile.PacBioIndexFilename());
+    EXPECT_EQ(PbiFile::CurrentVersion, index.Version());
+    EXPECT_EQ(10, index.NumReads());
+    EXPECT_TRUE(index.HasMappedData());
+
+    const PbiRawData expectedIndex = PacBioIndexTests::Test2Bam_ExistingIndex();
+    PacBioIndexTests::ExpectRawIndicesEqual(expectedIndex, index);
+
+    // clean up temp file(s)
+    remove(tempBamFn.c_str());
+    remove(tempPbiFn.c_str());
+}
+
+::testing::AssertionResult CanRead(BamReader& reader, BamRecord& record, int i)
+{   // success if one more record could be read into `record`; the result message carries i
+    const bool gotRecord = reader.GetNext(record);
+    auto outcome = gotRecord ? ::testing::AssertionSuccess() : ::testing::AssertionFailure();
+    outcome << "i: " << i;
+    return outcome;
+}
+
+TEST(PacBioIndexTest, CreateOnTheFly)
+{   // writes a copy of aligned2.bam while building its PBI, then checks offsets and index contents
+    // do this in temp directory, so we can ensure write access
+    const std::string tempDir = PbbamTestsConfig::GeneratedData_Dir + "/";
+    const std::string tempBamFn = tempDir + "temp.bam";
+    const std::string tempPbiFn = tempBamFn + ".pbi";
+
+    // NOTE: new file differs in size than existing (different write parameters may yield different file sizes, even though content is same)
+    const std::vector<int64_t> expectedNewOffsets = {33816576,   236126208, 391315456, 469106688,
+                                                     537067520,  587792384, 867303424, 1182793728,
+                                                     1449787392, 1582628864};
+    std::vector<int64_t> observedOffsets;
+
+    // create PBI on the fly from input BAM while we write to new file
+    {
+        BamFile bamFile(PacBioIndexTests::test2BamFn);
+        BamHeader header = bamFile.Header();
+
+        BamWriter writer(tempBamFn, header);  // default compression, default thread count
+        PbiBuilder builder(tempPbiFn, header.Sequences().size());
+
+        int64_t vOffset = 0;
+        EntireFileQuery entireFile(bamFile);
+        for (const BamRecord& record : entireFile) {
+            writer.Write(record, &vOffset);  // presumably stores the record's offset in the new file
+            builder.AddRecord(record, vOffset);
+            observedOffsets.push_back(vOffset);
+        }
+    }
+
+    EXPECT_EQ(expectedNewOffsets, observedOffsets);
+
+    // sanity check on original file
+    {
+        const std::vector<int64_t> originalFileOffsets = {33816576, 33825163,  33831333, 33834264,
+                                                          33836542, 33838065,  33849818, 33863499,
+                                                          33874621, 1392836608};
+        BamRecord r;
+        BamReader reader(PacBioIndexTests::test2BamFn);
+        for (size_t i = 0; i < originalFileOffsets.size(); ++i) {
+            reader.VirtualSeek(originalFileOffsets.at(i));
+            EXPECT_TRUE(CanRead(reader, r, i));
+        }
+    }
+
+    // attempt to seek in our new file using both expected & observed offsets
+    {
+        BamRecord r;
+        BamReader reader(tempBamFn);
+        for (size_t i = 0; i < expectedNewOffsets.size(); ++i) {
+            reader.VirtualSeek(expectedNewOffsets.at(i));
+            EXPECT_TRUE(CanRead(reader, r, i));
+        }
+        for (size_t i = 0; i < observedOffsets.size(); ++i) {
+            reader.VirtualSeek(observedOffsets.at(i));
+            EXPECT_TRUE(CanRead(reader, r, i));
+        }
+    }
+
+    // compare data in new PBI file, to expected data
+    const PbiRawData expectedIndex = PacBioIndexTests::Test2Bam_NewIndex();
+    const PbiRawData fromBuilt = PbiRawData(tempPbiFn);
+    EXPECT_EQ(PbiFile::CurrentVersion, fromBuilt.Version());
+    PacBioIndexTests::ExpectRawIndicesEqual(expectedIndex, fromBuilt);
+
+    // straight diff of newly-generated PBI file to existing PBI
+    // TODO: Come back to this once pbindexdump is in place.
+    //       We can't exactly do this since file offsets may differ between 2 BAMs of differing compression levels.
+    //       Should add some sort of BAM checksum based on contents, not just size, for this reason.
+    //    const string pbiDiffCmd = string("diff -q ") + PacBioIndexTests::test2BamFn + ".pbi " + tempPbiFn;
+    //    EXPECT_EQ(0, system(pbiDiffCmd.c_str()));
+
+    // clean up temp file(s)
+    remove(tempBamFn.c_str());
+    remove(tempPbiFn.c_str());
+}
+
+TEST(PacBioIndexTest, RawLoadFromPbiFile)
+{   // loads the PBI named by BamFile::PacBioIndexFilename() and compares it to the expected data
+    const BamFile bam{PacBioIndexTests::test2BamFn};
+    const std::string pbiFn = bam.PacBioIndexFilename();
+    const PbiRawData loaded{pbiFn};
+    EXPECT_EQ(PbiFile::Version_3_0_1, loaded.Version());
+
+    const PbiRawData expected = PacBioIndexTests::Test2Bam_ExistingIndex();
+    PacBioIndexTests::ExpectRawIndicesEqual(expected, loaded);
+}
+
+TEST(PacBioIndexTest, BasicAndBarodeSectionsOnly)
+{
+    // do this in temp directory, so we can ensure write access
+    const std::string tempDir = PbbamTestsConfig::GeneratedData_Dir + "/";
+    const std::string tempBamFn = tempDir + "phi29.bam";
+    const std::string tempPbiFn = tempBamFn + ".pbi";
+    std::string cmd("cp ");
+    cmd += PacBioIndexTests::phi29BamFn;
+    cmd += " ";
+    cmd += tempDir;
+    const auto cmdResult = system(cmd.c_str());
+    UNUSED(cmdResult);
+
+    BamFile bamFile(tempBamFn);
+    PbiFile::CreateFrom(bamFile);
+    EXPECT_EQ(tempPbiFn, bamFile.PacBioIndexFilename());
+
+    PbiRawData index(bamFile.PacBioIndexFilename());
+    EXPECT_EQ(PbiFile::CurrentVersion, index.Version());
+    EXPECT_EQ(120, index.NumReads());
+    EXPECT_FALSE(index.HasMappedData());
+    EXPECT_TRUE(index.HasBarcodeData());
+
+    const std::vector<int16_t> expectedBcForward{
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
+    const std::vector<int16_t> expectedBcReverse{
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
+    const std::vector<int8_t> expectedBcQuality{
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+
+    const PbiRawBarcodeData& barcodeData = index.BarcodeData();
+    EXPECT_EQ(expectedBcForward, barcodeData.bcForward_);
+    EXPECT_EQ(expectedBcReverse, barcodeData.bcReverse_);
+    EXPECT_EQ(expectedBcQuality, barcodeData.bcQual_);
+
+    // clean up temp file(s)
+    remove(tempBamFn.c_str());
+    remove(tempPbiFn.c_str());
+}
+
+TEST(PacBioIndexTest, ReferenceDataNotLoadedOnUnsortedBam)
+{   // NOTE(review): the test name says "not loaded", yet reference data is asserted present - confirm
+    const BamFile bam{PacBioIndexTests::test2BamFn};
+    const PbiRawData pbi{bam.PacBioIndexFilename()};
+    EXPECT_TRUE(pbi.HasReferenceData());
+}
+
+TEST(PacBioIndexTest, LookupLoadFromFileOk)
+{
+    const uint32_t expectedNumReads = 10;
+    const std::vector<int64_t> expectedOffsets{33816576, 33825163, 33831333, 33834264, 33836542,
+                                               33838065, 33849818, 33863499, 33874621, 1392836608};
+
+    EXPECT_NO_THROW({
+        BamFile bamFile(PacBioIndexTests::test2BamFn);
+        PbiRawData index(bamFile.PacBioIndexFilename());
+        EXPECT_EQ(expectedNumReads, index.NumReads());
+        EXPECT_EQ(expectedOffsets, index.BasicData().fileOffset_);
+    });
+}
+
+TEST(PacBioIndexTest, ThrowOnNonExistentPbiFile)
+{   // loading from a (presumably absent) PBI path is expected to throw
+    EXPECT_THROW(PbiRawData missing("does_not_exist.pbi"), std::exception);
+}
+
+TEST(PacBioIndexTest, ThrowOnNonPbiFile)
+{
+    // completely wrong format
+    EXPECT_THROW(
+        {
+            const auto fastaFn = PbbamTestsConfig::Data_Dir + "/lambdaNEB.fa";
+            PbiRawData idx{fastaFn};
+        },
+        std::runtime_error);
+
+    // BGZF file, but not PBI
+    EXPECT_THROW(
+        {
+            const auto bamFn = PbbamTestsConfig::Data_Dir + "/ex2.bam";
+            PbiRawData idx{bamFn};
+        },
+        std::runtime_error);
+}
+
+TEST(PacBioIndexTest, AggregatePBI)
+{
+
+    DataSet ds;
+    ExternalResources& resources = ds.ExternalResources();
+    resources.Add(BamFile{PbbamTestsConfig::Data_Dir +
+                          "/aligned.bam"});  // 4 reads, BASIC | MAPPED | REFERENCE
+    resources.Add(BamFile{PbbamTestsConfig::Data_Dir +
+                          "/polymerase/production.subreads.bam"});  // 8 reads, BASIC | BARCODE
+    resources.Add(BamFile{PbbamTestsConfig::Data_Dir +
+                          "/polymerase/production_hq.hqregion.bam"});  // 1 read,  BASIC only
+
+    const PbiRawData index{ds};
+    const PbiRawBasicData& mergedBasicData = index.BasicData();
+    const PbiRawBarcodeData& mergedBarcodeData = index.BarcodeData();
+    const PbiRawMappedData& mergedMappedData = index.MappedData();
+
+    const uint32_t expectedTotal = 13;  // 4 + 8 + 1
+
+    // 'meta' info
+    EXPECT_EQ(expectedTotal, index.NumReads());
+    EXPECT_EQ(PbiFile::BASIC | PbiFile::MAPPED | PbiFile::BARCODE, index.FileSections());
+    EXPECT_TRUE(index.HasBarcodeData());
+    EXPECT_TRUE(index.HasMappedData());
+    EXPECT_FALSE(index.HasReferenceData());
+
+    // file numbers
+    EXPECT_EQ(0, mergedBasicData.fileNumber_.at(0));
+    EXPECT_EQ(0, mergedBasicData.fileNumber_.at(1));
+    EXPECT_EQ(0, mergedBasicData.fileNumber_.at(2));
+    EXPECT_EQ(0, mergedBasicData.fileNumber_.at(3));
+    EXPECT_EQ(1, mergedBasicData.fileNumber_.at(4));
+    EXPECT_EQ(1, mergedBasicData.fileNumber_.at(5));
+    EXPECT_EQ(1, mergedBasicData.fileNumber_.at(6));
+    EXPECT_EQ(1, mergedBasicData.fileNumber_.at(7));
+    EXPECT_EQ(1, mergedBasicData.fileNumber_.at(8));
+    EXPECT_EQ(1, mergedBasicData.fileNumber_.at(9));
+    EXPECT_EQ(1, mergedBasicData.fileNumber_.at(10));
+    EXPECT_EQ(1, mergedBasicData.fileNumber_.at(11));
+    EXPECT_EQ(2, mergedBasicData.fileNumber_.at(12));
+
+    // basic data
+    EXPECT_EQ(0, mergedBasicData.qStart_.at(0));  // file 1
+    EXPECT_EQ(0, mergedBasicData.qStart_.at(1));
+    EXPECT_EQ(2659, mergedBasicData.qStart_.at(4));  // file 2
+    EXPECT_EQ(3116, mergedBasicData.qStart_.at(5));
+    EXPECT_EQ(2659, mergedBasicData.qStart_.at(12));  // file 3
+
+    EXPECT_EQ(21102592, mergedBasicData.fileOffset_.at(0));  // file 1
+    EXPECT_EQ(21102883, mergedBasicData.fileOffset_.at(1));
+    EXPECT_EQ(19857408, mergedBasicData.fileOffset_.at(4));  // file 2
+    EXPECT_EQ(19860696, mergedBasicData.fileOffset_.at(5));
+    EXPECT_EQ(20054016, mergedBasicData.fileOffset_.at(12));  // file 3
+
+    // mapped data
+    EXPECT_EQ(60, mergedMappedData.mapQV_.at(0));  // file 1
+    EXPECT_EQ(60, mergedMappedData.mapQV_.at(1));
+    EXPECT_EQ(255, mergedMappedData.mapQV_.at(4));  // file 2
+    EXPECT_EQ(255, mergedMappedData.mapQV_.at(5));
+    EXPECT_EQ(255, mergedMappedData.mapQV_.at(12));  // file 3
+
+    // barcode data
+    EXPECT_EQ(-1, mergedBarcodeData.bcForward_.at(0));  // file 1
+    EXPECT_EQ(-1, mergedBarcodeData.bcForward_.at(1));
+    EXPECT_EQ(92, mergedBarcodeData.bcForward_.at(4));  // file 2
+    EXPECT_EQ(92, mergedBarcodeData.bcForward_.at(5));
+    EXPECT_EQ(-1, mergedBarcodeData.bcForward_.at(12));  // file 3
+}
+
+TEST(PbiIndexCacheTest, LoadsExpectedIndexData)
+{
+    const DataSet dataset{PbbamTestsConfig::Data_Dir + "/chunking/chunking.subreadset.xml"};
+
+    // reference values: read count of each BAM's PBI, loaded one file at a time
+    std::vector<uint32_t> expectedCounts;
+    for (const BamFile& bam : dataset.BamFiles())
+        expectedCounts.push_back(PbiRawData{bam}.NumReads());
+
+    // the cache built from the whole dataset must report the same counts, in order
+    const auto cache = MakePbiIndexCache(dataset);
+    ASSERT_EQ(3, cache->size());
+    EXPECT_EQ(expectedCounts[0], cache->at(0)->NumReads());
+    EXPECT_EQ(expectedCounts[1], cache->at(1)->NumReads());
+    EXPECT_EQ(expectedCounts[2], cache->at(2)->NumReads());
+}
diff --git a/tests/src/test_PbiFilter.cpp b/tests/src/test_PbiFilter.cpp

new file mode 100644 (file)

index 0000000..b2a6b05
--- /dev/null
+++ b/tests/src/test_PbiFilter.cpp
@@ -0,0 +1,1364 @@
+// Author: Derek Barnett
+
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <stdexcept>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include "PbbamTestData.h"
+
+#include <pbbam/PbiFilter.h>
+
+// clang-format off
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+namespace PbiFilterTests {
+
+// helper structs & methods
+
+static
+PbiRawData test2Bam_RawIndex()
+{
+    PbiRawData index;
+    index.NumReads(4);
+
+    PbiRawBasicData& subreadData = index.BasicData();
+    subreadData.rgId_       = { -1197849594, -1197849594, -1197849594, -1197849594 };
+    subreadData.qStart_     = { 2114, 2579, 4101, 5615 };
+    subreadData.qEnd_       = { 2531, 4055, 5571, 6237 };
+    subreadData.holeNumber_ = { 14743, 14743, 14743, 14743 };
+    subreadData.readQual_   = { 0.901, 0.601, 0.901, 0.601 };
+    subreadData.ctxtFlag_   = { 0, 1, 2, 3 };
+    subreadData.fileOffset_ = { 35651584, 35655125, 35667128, 35679170 };
+
+    PbiRawMappedData& mappedData = index.MappedData();
+    mappedData.tId_       = { 0, 0, 0, 0 };
+    mappedData.tStart_    = { 9507, 8453, 8455, 9291 };
+    mappedData.tEnd_      = { 9903, 9902, 9893, 9900 };
+    mappedData.aStart_    = { 2130, 2581, 4102, 5619 };
+    mappedData.aEnd_      = { 2531, 4055, 5560, 6237 };
+    mappedData.revStrand_ = { 0, 1, 0, 1 };
+    mappedData.mapQV_     = { 254, 254, 254, 254 };
+    mappedData.nM_        = { 384, 1411, 1393, 598 };
+    mappedData.nMM_       = { 0, 0, 0, 0 };
+
+    PbiRawBarcodeData& barcodeData = index.BarcodeData();
+    barcodeData.bcForward_ = { 0, 17, 256, 17 };
+    barcodeData.bcReverse_ = { 1, 18, 257, 18 };
+    barcodeData.bcQual_    = { 42, 80, 42, 110 };
+
+    PbiRawReferenceData& referenceData = index.ReferenceData();
+    referenceData.entries_.emplace_back( 0, 0, 3 );
+    referenceData.entries_.emplace_back( 1 );
+    referenceData.entries_.emplace_back( PbiReferenceEntry::UNMAPPED_ID );
+
+    return index;
+}
+
+// Index fixture used by checkFilterRows(); built once from test2Bam_RawIndex().
+static const PbiRawData shared_index = test2Bam_RawIndex();
+
+static
+void checkFilterRows(const PbiFilter& filter, const std::vector<size_t> expectedRows)
+{
+    if (expectedRows.empty())
+    {
+        for (size_t row = 0; row < shared_index.NumReads(); ++row)
+            EXPECT_FALSE(filter.Accepts(shared_index, row));
+    } else {
+        for (size_t row : expectedRows)
+            EXPECT_TRUE(filter.Accepts(shared_index, row));
+    }
+}
+
+static
+void checkFilterInternals(const PbiFilter& filter,
+                          const PbiFilter::CompositionType expectedType,
+                          const size_t expectedNumChildren,
+                          const std::vector<size_t> expectedRows)
+{
+    EXPECT_EQ(expectedType,        filter.Type());
+    EXPECT_EQ(expectedNumChildren, filter.NumChildren());
+    checkFilterRows(filter, expectedRows);
+}
+
+// Minimal custom filter: provides an Accepts() member that accepts every row.
+struct SimpleFilter
+{
+    bool Accepts(const PbiRawData&, const size_t) const
+    {
+        // neither the index nor the row is consulted
+        return true;
+    }
+};
+
+// Deliberately has no Accepts() member; referenced only by the commented-out
+// "should not compile" lines in the CustomFilterOk test.
+struct NoncompliantFilter { };
+
+struct SortUniqueTestFilter
+{
+    bool Accepts(const PbiRawData& /* idx */, const size_t row) const
+    {
+//        ()idx;
+        switch(row) {
+            case 0: // fall through
+            case 1: // .
+            case 2: // .
+            case 3: // .
+            case 4: // .
+            case 7: // .
+            case 8: return true;
+            default:
+                return false;
+        }
+    }
+};
+
+struct SortUniqueTestFilter2
+{
+    bool Accepts(const PbiRawData& /* idx */, const size_t row) const
+    {
+//        ()idx;
+        switch(row) {
+            case 3: // fall through
+            case 7: // .
+            case 5: return true;
+            default:
+                return false;
+        }
+    }
+};
+
+// Returns a PbiFilter temporary with no children (used by the copy/move tests).
+static inline
+PbiFilter emptyFilter()
+{
+    return PbiFilter{ };
+}
+
+// Returns a PbiFilter temporary built from a SimpleFilter.
+static inline
+PbiFilter simpleFilter()
+{
+    return PbiFilter{ SimpleFilter{ } };
+}
+
+} // namespace PbiFilterTests
+
+// A default-constructed PbiFilter: INTERSECT type, no children, accepts rows 0-3.
+TEST(PbiFilterTest, DefaultCtorOk)
+{
+    using Rows = std::vector<size_t>;
+
+    const PbiFilter defaultFilter{ };
+    PbiFilterTests::checkFilterInternals(defaultFilter, PbiFilter::INTERSECT, 0, Rows{0, 1, 2, 3});
+}
+
+// Adding an empty PbiFilter as a child: one child is reported, rows 0-3 still accepted.
+TEST(PbiFilterTest, CompositionOk)
+{
+    using Rows = std::vector<size_t>;
+
+    PbiFilter parent;
+    parent.Add(PbiFilter{ });
+    PbiFilterTests::checkFilterInternals(parent, PbiFilter::INTERSECT, 1, Rows{0, 1, 2, 3});
+}
+
+TEST(PbiFilterTest, CustomFilterOk)
+{
+    { // ctor
+        auto filter = PbiFilter{ PbiFilterTests::SimpleFilter{ } };
+        PbiFilterTests::checkFilterInternals(filter, PbiFilter::INTERSECT, 1, std::vector<size_t>{0,1,2,3});
+    }
+    { // Add
+        auto filter = PbiFilter{ };
+        filter.Add(PbiFilterTests::SimpleFilter{ });
+        PbiFilterTests::checkFilterInternals(filter, PbiFilter::INTERSECT, 1, std::vector<size_t>{0,1,2,3});
+    }
+
+//    PbiFilter shouldNotCompile = PbiFilter{ PbiFilterTests::NoncompliantFilter{ } };                       // <-- when uncommented, should not compile
+//    PbiFilter shouldNotCompileEither; shouldNotCompileEither.Add(PbiFilterTests::NoncompliantFilter{ });   // <-- when uncommented, should not compile
+}
+
+TEST(PbiFilterTest, CopyOk)
+{
+    { // empty
+        const auto original = PbiFilter{ };
+
+        PbiFilter copyCtor(original);
+        PbiFilter copyAssign;
+        copyAssign = original;
+
+        PbiFilterTests::checkFilterInternals(original,   PbiFilter::INTERSECT, 0, std::vector<size_t>{0,1,2,3});
+        PbiFilterTests::checkFilterInternals(copyCtor,   PbiFilter::INTERSECT, 0, std::vector<size_t>{0,1,2,3});
+        PbiFilterTests::checkFilterInternals(copyAssign, PbiFilter::INTERSECT, 0, std::vector<size_t>{0,1,2,3});
+    }
+    { // with children
+        const auto original = PbiFilter{ PbiFilterTests::SimpleFilter{ } };
+
+        PbiFilter copyCtor(original);
+        PbiFilter copyAssign;
+        copyAssign = original;
+
+        PbiFilterTests::checkFilterInternals(original,   PbiFilter::INTERSECT, 1, std::vector<size_t>{0,1,2,3});
+        PbiFilterTests::checkFilterInternals(copyCtor,   PbiFilter::INTERSECT, 1, std::vector<size_t>{0,1,2,3});
+        PbiFilterTests::checkFilterInternals(copyAssign, PbiFilter::INTERSECT, 1, std::vector<size_t>{0,1,2,3});
+    }
+}
+
+TEST(PbiFilterTest, MoveOk)
+{
+    { // empty
+        const auto original = PbiFilterTests::emptyFilter();
+
+        PbiFilter moveCtor(PbiFilterTests::emptyFilter());
+        PbiFilter moveAssign;
+        moveAssign = PbiFilterTests::emptyFilter();
+
+        PbiFilterTests::checkFilterInternals(original,   PbiFilter::INTERSECT, 0, std::vector<size_t>{0,1,2,3});
+        PbiFilterTests::checkFilterInternals(moveCtor,   PbiFilter::INTERSECT, 0, std::vector<size_t>{0,1,2,3});
+        PbiFilterTests::checkFilterInternals(moveAssign, PbiFilter::INTERSECT, 0, std::vector<size_t>{0,1,2,3});
+    }
+    { // with children
+        const auto original = PbiFilterTests::simpleFilter();
+
+        PbiFilter moveCtor(PbiFilterTests::simpleFilter());
+        PbiFilter moveAssign;
+        moveAssign = PbiFilterTests::simpleFilter();
+
+        PbiFilterTests::checkFilterInternals(original,   PbiFilter::INTERSECT, 1, std::vector<size_t>{0,1,2,3});
+        PbiFilterTests::checkFilterInternals(moveCtor,   PbiFilter::INTERSECT, 1, std::vector<size_t>{0,1,2,3});
+        PbiFilterTests::checkFilterInternals(moveAssign, PbiFilter::INTERSECT, 1, std::vector<size_t>{0,1,2,3});
+    }
+}
+
+// Checks the rows accepted by SortUniqueTestFilter on its own (rows listed in
+// arbitrary order) and when wrapped in a PbiFilter (rows listed in sorted order).
+TEST(PbiFilterTest, SortsAndUniquesChildFilterResultsOk)
+{
+    using PbiFilterTests::checkFilterRows;
+    using Rows = std::vector<size_t>;
+
+    const PbiFilterTests::SortUniqueTestFilter child{ };
+    const PbiFilter wrapped{ child };
+
+    checkFilterRows(child,   Rows{2, 7, 0, 3, 4, 1, 8});
+    checkFilterRows(wrapped, Rows{0, 1, 2, 3, 4, 7, 8});
+}
+
+TEST(PbiFilterTest, UnionOk)
+{
+    { // empty
+        { // copy
+            const auto emptyFilter = PbiFilterTests::emptyFilter();
+            const auto emptyFilter2 = PbiFilterTests::emptyFilter();
+            const auto u = PbiFilter::Union({ emptyFilter, emptyFilter2 });
+            PbiFilterTests::checkFilterInternals(u, PbiFilter::UNION, 2, std::vector<size_t>{0,1,2,3});
+        }
+        { // move
+            const auto u = PbiFilter::Union({ PbiFilter{ }, PbiFilter{ } });
+            PbiFilterTests::checkFilterInternals(u, PbiFilter::UNION, 2, std::vector<size_t>{0,1,2,3});
+        }
+    }
+
+    { // with (no-data) children - just checking composition
+        { // copy
+            const auto simpleFilter = PbiFilterTests::SimpleFilter{ };
+            const auto simpleFilter2 = PbiFilterTests::SimpleFilter{ };
+            const auto u = PbiFilter::Union({ simpleFilter, simpleFilter2 });
+            PbiFilterTests::checkFilterInternals(u, PbiFilter::UNION, 2, std::vector<size_t>{0,1,2,3});
+        }
+        { // move
+            const auto u = PbiFilter::Union({ PbiFilterTests::SimpleFilter{ }, PbiFilterTests::SimpleFilter{ } });
+            PbiFilterTests::checkFilterInternals(u, PbiFilter::UNION, 2, std::vector<size_t>{0,1,2,3});
+        }
+    }
+
+    { // 2-child union, results sorted & unique-d by PbiFilter
+
+        const auto child1 = PbiFilterTests::SortUniqueTestFilter{ };
+        const auto child2 = PbiFilterTests::SortUniqueTestFilter2{ };
+        const auto u = PbiFilter::Union({ child1, child2 });
+
+        PbiFilterTests::checkFilterRows(child1, std::vector<size_t>{2, 7, 0, 3, 4, 1, 8});
+        PbiFilterTests::checkFilterRows(child2, std::vector<size_t>{3, 7, 5});
+        PbiFilterTests::checkFilterRows(u, std::vector<size_t>{0, 1, 2, 3, 4, 5, 7, 8});
+    }
+}
+
+TEST(PbiFilterTest, IntersectOk)
+{
+    { // empty
+        { // copy
+            const auto emptyFilter = PbiFilterTests::emptyFilter();
+            const auto emptyFilter2 = PbiFilterTests::emptyFilter();
+            const auto i = PbiFilter::Intersection({ emptyFilter, emptyFilter2 });
+            PbiFilterTests::checkFilterInternals(i, PbiFilter::INTERSECT, 2, std::vector<size_t>{0,1,2,3});
+        }
+        { // move
+            const auto i = PbiFilter::Intersection({ PbiFilter{ }, PbiFilter{ } });
+            PbiFilterTests::checkFilterInternals(i, PbiFilter::INTERSECT, 2, std::vector<size_t>{0,1,2,3});
+        }
+    }
+
+    { // with (no-data) children - just checking composition
+        { // copy
+            const auto simpleFilter = PbiFilterTests::SimpleFilter{ };
+            const auto simpleFilter2 = PbiFilterTests::SimpleFilter{ };
+            const auto i = PbiFilter::Intersection({ simpleFilter, simpleFilter2 });
+            PbiFilterTests::checkFilterInternals(i, PbiFilter::INTERSECT, 2, std::vector<size_t>{0,1,2,3});
+        }
+        { // move
+            const auto i = PbiFilter::Intersection({ PbiFilterTests::SimpleFilter{ }, PbiFilterTests::SimpleFilter{ } });
+            PbiFilterTests::checkFilterInternals(i, PbiFilter::INTERSECT, 2, std::vector<size_t>{0,1,2,3});
+        }
+    }
+
+    { // 2-child intersect, sorted & unique-d by PbiFilter
+
+        const auto child1 = PbiFilterTests::SortUniqueTestFilter{ };
+        const auto child2 = PbiFilterTests::SortUniqueTestFilter2{ };
+        const auto i = PbiFilter::Intersection({ child1, child2 });
+
+        PbiFilterTests::checkFilterRows(child1, std::vector<size_t>{2, 7, 0, 3, 4, 1, 8});
+        PbiFilterTests::checkFilterRows(child2, std::vector<size_t>{3, 7, 5 });
+        PbiFilterTests::checkFilterRows(i, std::vector<size_t>{3, 7});
+    }
+}
+
+TEST(PbiFilterTest, AlignedEndFilterOk)
+{
+    {
+        const auto filter = PbiFilter{ PbiAlignedEndFilter{ 4055 } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{1});
+    }
+    {
+        const auto filter = PbiFilter{ PbiAlignedEndFilter{ 4055, Compare::NOT_EQUAL } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{0,2,3});
+    }
+    {
+        const auto filter = PbiFilter{ PbiAlignedEndFilter{ 4000, Compare::LESS_THAN } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{0});
+    }
+    {
+        const auto filter = PbiFilter{ PbiAlignedEndFilter{ 5560, Compare::GREATER_THAN } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{3});
+    }
+    {
+        const auto filter = PbiFilter{ PbiAlignedEndFilter{ 5560, Compare::GREATER_THAN_EQUAL } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{2,3});
+    }
+
+    {
+        const auto filter = PbiFilter{ PbiAlignedEndFilter{ 7000, Compare::GREATER_THAN } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{});
+    }
+}
+
+TEST(PbiFilterTest, AlignedLengthFilterOk)
+{
+    {
+        const auto filter = PbiFilter{ PbiAlignedLengthFilter{ 500, Compare::GREATER_THAN_EQUAL } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{1,2,3});
+    }
+    {
+        const auto filter = PbiFilter{ PbiAlignedLengthFilter{ 1000, Compare::GREATER_THAN_EQUAL } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{1,2});
+    }
+}
+
+TEST(PbiFilterTest, AlignedStartFilterOk)
+{
+    {
+        const auto filter = PbiFilter{ PbiAlignedStartFilter{ 2600, Compare::LESS_THAN } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{0,1});
+    }
+    {
+        const auto filter = PbiFilter{ PbiAlignedStartFilter{ 4102, Compare::GREATER_THAN } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{3});
+    }
+    {
+        const auto filter = PbiFilter{ PbiAlignedStartFilter{ 4102, Compare::GREATER_THAN_EQUAL } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{2,3});
+    }
+    {
+        const auto filter = PbiFilter{ PbiAlignedStartFilter{ 6000, Compare::GREATER_THAN } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{ });
+    }
+}
+
+TEST(PbiFilterTest, AlignedStrandFilterOk)
+{
+    {
+        const auto filter = PbiFilter{ PbiAlignedStrandFilter{ Strand::FORWARD } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{0,2});
+    }
+    {
+        const auto filter = PbiFilter{ PbiAlignedStrandFilter{ Strand::REVERSE } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{1,3});
+    }
+    {
+        const auto filter = PbiFilter{ PbiAlignedStrandFilter{ Strand::FORWARD, Compare::NOT_EQUAL } }; // same as Strand::REVERSE
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{1,3});
+    }
+
+    // unsupported compare types throw
+    EXPECT_THROW(PbiAlignedStrandFilter(Strand::FORWARD, Compare::LESS_THAN),          std::runtime_error);
+    EXPECT_THROW(PbiAlignedStrandFilter(Strand::FORWARD, Compare::LESS_THAN_EQUAL),    std::runtime_error);
+    EXPECT_THROW(PbiAlignedStrandFilter(Strand::FORWARD, Compare::GREATER_THAN),       std::runtime_error);
+    EXPECT_THROW(PbiAlignedStrandFilter(Strand::FORWARD, Compare::GREATER_THAN_EQUAL), std::runtime_error);
+}
+
+TEST(PbiFilterTest, BarcodeFilterOk)
+{
+    {
+        const auto filter = PbiFilter{ PbiBarcodeFilter{ 17 } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{1,3});
+    }
+    {
+        const auto filter = PbiFilter{ PbiBarcodeFilter{ 18 } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{1,3});
+    }
+    {
+        const auto filter = PbiFilter{ PbiBarcodeFilter{ 0 } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{0});
+    }
+}
+
+TEST(PbiFilterTest, BarcodeForwardFilterOk)
+{
+    {
+        const auto filter = PbiFilter{ PbiBarcodeForwardFilter{ 17 } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{1,3});
+    }
+    {
+        const auto filter = PbiFilter{ PbiBarcodeForwardFilter{ 400 } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{});
+    }
+    {
+        const auto filter = PbiFilter{ PbiBarcodeForwardFilter{ {0, 256} } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{0,2});
+    }
+    {
+        //blacklist
+        const auto filter = PbiFilter{ PbiBarcodeForwardFilter{ {0, 256}, Compare::NOT_CONTAINS } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{1,3});
+    }
+}
+
+TEST(PbiFilterTest, BarcodeQualityFilterOk)
+{
+    {
+        const auto filter = PbiFilter{ PbiBarcodeQualityFilter{ 80, Compare::GREATER_THAN_EQUAL } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{1,3});
+    }
+    {
+        const auto filter = PbiFilter{ PbiBarcodeQualityFilter{ 40, Compare::LESS_THAN } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{});
+    }
+}
+
+TEST(PbiFilterTest, BarcodeReverseFilterOk)
+{
+    {
+        const auto filter = PbiFilter{ PbiBarcodeReverseFilter{ 18 } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{1,3});
+    }
+    {
+        const auto filter = PbiFilter{ PbiBarcodeReverseFilter{ 400 } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{ });
+    }
+    {
+        const auto filter = PbiFilter{ PbiBarcodeReverseFilter{ {1, 257} } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{0,2});
+    }
+    {
+        // blacklist
+        const auto filter = PbiFilter{ PbiBarcodeReverseFilter{ {1, 257}, Compare::NOT_CONTAINS } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{1,3});
+    }
+}
+
+TEST(PbiFilterTest, BarcodesFilterOk)
+{
+    {
+        const auto filter = PbiFilter{ PbiBarcodesFilter{ 17, 18 } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{1,3});
+    }
+    {
+        const auto filter = PbiFilter{ PbiBarcodesFilter{ 17, 19 } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{ });
+    }
+    {
+        const auto filter = PbiFilter{ PbiBarcodesFilter{ std::make_pair(17,18) } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{1,3});
+    }
+}
+
+// PbiIdentityFilter with a 0.95 minimum: row 3 of the shared index is expected to pass.
+TEST(PbiFilterTest, IdentityFilterOk)
+{
+    using Rows = std::vector<size_t>;
+
+    {
+        const PbiFilter identityFilter{ PbiIdentityFilter{ 0.95, Compare::GREATER_THAN_EQUAL } };
+        PbiFilterTests::checkFilterRows(identityFilter, Rows{3});
+    }
+}
+
+TEST(PbiFilterTest, LocalContextFilterOk)
+{
+    { // == NO_LOCAL_CONTEXT
+        const auto filter = PbiFilter { PbiLocalContextFilter{ LocalContextFlags::NO_LOCAL_CONTEXT } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{0});
+    }
+    { // != ADAPTER_BEFORE (exact match)
+        const auto filter = PbiFilter { PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE, Compare::NOT_EQUAL } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{0,2,3});
+    }
+    { // contains ADAPTER_BEFORE
+        const auto filter = PbiFilter { PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE, Compare::CONTAINS } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{1,3});
+    }
+    { // does not contain ADAPTER_BEFORE
+        const auto filter = PbiFilter { PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE, Compare::NOT_CONTAINS } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{0,2});
+    }
+    { // include both ADAPTER_BEFORE and ADAPTER_AFTER
+        const auto filter = PbiFilter::Intersection(
+        {
+            PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE, Compare::CONTAINS },
+            PbiLocalContextFilter{ LocalContextFlags::ADAPTER_AFTER,  Compare::CONTAINS }
+        });
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{3});
+    }
+    { // exclude both ADAPTER_BEFORE and ADAPTER_AFTER
+        const auto filter = PbiFilter::Intersection(
+        {
+            PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE, Compare::NOT_CONTAINS },
+            PbiLocalContextFilter{ LocalContextFlags::ADAPTER_AFTER,  Compare::NOT_CONTAINS }
+        });
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{0});
+    }
+    { // include everything with either ADAPTER_BEFORE or ADAPTER_AFTER
+        const auto filter = PbiFilter::Union(
+        {
+            PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE, Compare::CONTAINS },
+            PbiLocalContextFilter{ LocalContextFlags::ADAPTER_AFTER,  Compare::CONTAINS }
+        });
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{1,2,3});
+    }
+    { // include everything with either ADAPTER_BEFORE or ADAPTER_AFTER, but not both
+        const auto filter = PbiFilter::Intersection(
+        {
+                PbiLocalContextFilter{ LocalContextFlags::NO_LOCAL_CONTEXT, Compare::NOT_EQUAL },
+                PbiFilter::Union(
+                {
+                    PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE, Compare::NOT_CONTAINS },
+                    PbiLocalContextFilter{ LocalContextFlags::ADAPTER_AFTER,  Compare::NOT_CONTAINS }
+                })
+        });
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{1,2});
+    }
+}
+
+// PbiMapQualityFilter against the shared index, where every mapQV_ is 254.
+TEST(PbiFilterTest, MapQualityFilterOk)
+{
+    using PbiFilterTests::checkFilterRows;
+    using Rows = std::vector<size_t>;
+
+    {
+        const PbiFilter mapQvFilter{ PbiMapQualityFilter{ 254 } };
+        checkFilterRows(mapQvFilter, Rows{0, 1, 2, 3});
+    }
+    {
+        const PbiFilter mapQvFilter{ PbiMapQualityFilter{ 254, Compare::NOT_EQUAL } };
+        checkFilterRows(mapQvFilter, Rows{});
+    }
+}
+
+TEST(PbiFilterTest, MovieNameFilterOk)
+{
+    const auto bamFile = BamFile{ PbbamTestsConfig::Data_Dir + std::string{ "/group/test2.bam" } };
+    const auto index = PbiRawData{ bamFile.PacBioIndexFilename() };
+
+    {
+        const auto filter = PbiFilter{ PbiMovieNameFilter{ "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0" } };
+        const auto expectedRows = std::vector<size_t>{0,1,2,3};
+        for (size_t row : expectedRows)
+            EXPECT_TRUE(filter.Accepts(index, row));
+    }
+    {
+        const auto filter = PbiFilter{ PbiMovieNameFilter{ "does_not_exist" } };
+        const auto expectedRows = std::vector<size_t>{};
+        for (size_t row : expectedRows)
+            EXPECT_TRUE(filter.Accepts(index, row));
+    }
+    {
+        const auto names = std::vector<std::string>{"does_not_exist",
+                                          "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0"};
+        const auto filter = PbiFilter{ PbiMovieNameFilter{ names } };
+        const auto expectedRows = std::vector<size_t>{0,1,2,3};
+        for (size_t row : expectedRows)
+            EXPECT_TRUE(filter.Accepts(index, row));
+    }
+    {
+        // blacklist
+        const auto names = std::vector<std::string>{"does_not_exist",
+                                          "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0"};
+        const auto filter = PbiFilter{ PbiMovieNameFilter{ names, Compare::NOT_CONTAINS } };
+        for (size_t row = 0; row < index.NumReads(); ++row)
+            EXPECT_FALSE(filter.Accepts(index, row));
+    }
+}
+
+TEST(PbiFilterTest, NumDeletedBasesFilterOk)
+{
+    // del: { 12, 38, 45, 11} - calculated from raw data, not stored directly in testing object or read from PBI file
+
+    {
+        const auto filter = PbiFilter{ PbiNumDeletedBasesFilter{ 12, Compare::LESS_THAN_EQUAL } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{0,3});
+    }
+    {
+        const auto filter = PbiFilter{ PbiNumDeletedBasesFilter{ 45, Compare::EQUAL } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{2});
+    }
+}
+
+TEST(PbiFilterTest, NumInsertedBasesFilterOk)
+{
+    // ins: { 17, 63, 65, 20 }  - calculated from raw data, not stored directly testing object or read from PBI file
+
+    {
+        const auto filter = PbiFilter{ PbiNumInsertedBasesFilter{ 63, Compare::GREATER_THAN_EQUAL } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{1,2});
+    }
+    {
+        const auto filter = PbiFilter{ PbiNumInsertedBasesFilter{ 17, Compare::NOT_EQUAL } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{1,2,3});
+    }
+}
+
+TEST(PbiFilterTest, NumMatchesFilterOk)
+{
+    {
+        const auto filter = PbiFilter{ PbiNumMatchesFilter{ 1000, Compare::GREATER_THAN_EQUAL } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{1,2});
+    }
+    {
+        const auto filter = PbiFilter{ PbiNumMatchesFilter{ 400, Compare::LESS_THAN } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{0});
+    }
+}
+
+TEST(PbiFilterTest, NumMismatchesFilterOk)
+{
+    {
+        const auto filter = PbiFilter{ PbiNumMismatchesFilter{ 0, Compare::EQUAL } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{0,1,2,3});
+    }
+    {
+        const auto filter = PbiFilter{ PbiNumMismatchesFilter{ 0, Compare::NOT_EQUAL } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{});
+    }
+}
+
+// PbiQueryEndFilter against the shared index (qEnd_ = { 2531, 4055, 5571, 6237 }).
+TEST(PbiFilterTest, QueryEndFilterOk)
+{
+    using PbiFilterTests::checkFilterRows;
+    using Rows = std::vector<size_t>;
+
+    {   // value only, no comparison argument
+        const PbiFilter queryEndFilter{ PbiQueryEndFilter{ 4055 } };
+        checkFilterRows(queryEndFilter, Rows{1});
+    }
+    {
+        const PbiFilter queryEndFilter{ PbiQueryEndFilter{ 6200, Compare::GREATER_THAN_EQUAL } };
+        checkFilterRows(queryEndFilter, Rows{3});
+    }
+}
+
+TEST(PbiFilterTest, QueryLengthFilterOk)
+{
+    {
+        const auto filter = PbiFilter{ PbiQueryLengthFilter{ 500, Compare::GREATER_THAN_EQUAL } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{1,2,3});
+    }
+    {
+        const auto filter = PbiFilter{ PbiQueryLengthFilter{ 1000, Compare::GREATER_THAN_EQUAL } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{1,2});
+    }
+}
+
+TEST(PbiFilterTest, QueryNameFilterOk)
+{
+    {
+        const auto filter = PbiFilter{ PbiQueryNameFilter{ "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/14743/2579_4055" } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{1});
+    }
+    {
+        const auto filter = PbiFilter{ PbiQueryNameFilter{ "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/14743/5615_6237" } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{3});
+    }
+
+    {
+        const auto filter = PbiFilter{ PbiQueryNameFilter{ "does_not_exist/0/0_0" } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{});
+    }
+    {
+        const auto names = std::vector<std::string>{"m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/14743/2579_4055",
+                                          "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/14743/5615_6237"};
+        const auto filter = PbiFilter{ PbiQueryNameFilter{ names } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{1,3});
+    }
+
+    // invalid QNAME syntax throws
+    EXPECT_THROW(
+    {
+        const auto filter = PbiFilter{ PbiQueryNameFilter{ "" } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{});
+    },
+    std::runtime_error);
+    EXPECT_THROW(
+    {
+        const auto filter = PbiFilter{ PbiQueryNameFilter{ "foo" } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{});
+    },
+    std::runtime_error);
+    EXPECT_THROW(
+    {
+        const auto filter = PbiFilter{ PbiQueryNameFilter{ "foo/bar" } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{});
+    },
+    std::runtime_error);
+    EXPECT_THROW(
+    {
+        const auto filter = PbiFilter{ PbiQueryNameFilter{ "foo/bar/baz_bam" } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{});
+    },
+    std::exception); // come back to see why this is not runtime_error but something else
+}
+
+TEST(PbiFilterTest, QueryStartFilterOk)
+{
+    {
+        const auto filter = PbiFilter{ PbiQueryStartFilter{ 4101 } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{2});
+    }
+    {
+        const auto filter = PbiFilter{ PbiQueryStartFilter{ 5000 } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{});
+    }
+    {
+        const auto filter = PbiFilter{ PbiQueryStartFilter{ 5000, Compare::GREATER_THAN } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{3});
+    }
+}
+
+// PbiReadAccuracyFilter against the shared index
+// (readQual_ = { 0.901, 0.601, 0.901, 0.601 }).
+TEST(PbiFilterTest, ReadAccuracyFilterOk)
+{
+    using PbiFilterTests::checkFilterRows;
+    using Rows = std::vector<size_t>;
+
+    {   // value only, no comparison argument: no row expected
+        const PbiFilter accuracyFilter{ PbiReadAccuracyFilter{ 0.9 } };
+        checkFilterRows(accuracyFilter, Rows{});
+    }
+    {
+        const PbiFilter accuracyFilter{ PbiReadAccuracyFilter{ 0.9, Compare::GREATER_THAN } };
+        checkFilterRows(accuracyFilter, Rows{0, 2});
+    }
+}
+
+TEST(PbiFilterTest, ReadGroupFilterOk)
+{
+    { // numeric ID
+        const auto filter = PbiReadGroupFilter{ -1197849594 };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{0,1,2,3});
+
+        const auto filter2 = PbiReadGroupFilter{ 200 };
+        PbiFilterTests::checkFilterRows(filter2, std::vector<size_t>{});
+    }
+    { // string ID
+        const auto filter = PbiReadGroupFilter{ "b89a4406" };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{0,1,2,3});
+
+        const auto filter2 = PbiReadGroupFilter{ "b89a4406" };
+        PbiFilterTests::checkFilterRows(filter2, std::vector<size_t>{0,1,2,3});
+    }
+    { // ReadGroupInfo object
+        const auto rg = ReadGroupInfo{ "b89a4406" };
+        const auto filter = PbiReadGroupFilter{ rg };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{0,1,2,3});
+    }
+    { // multi-ID
+        const auto ids = std::vector<int32_t>({-1197849594, 200});
+        const auto filter = PbiReadGroupFilter{ ids };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{0,1,2,3});
+    }
+    { // multi-ID blacklist
+        const auto ids = std::vector<int32_t>({-1197849594, 200});
+        const auto filter = PbiReadGroupFilter{ ids, Compare::NOT_CONTAINS };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{});
+    }
+    { // multi-string
+        const auto ids = std::vector<std::string>({"b89a4406", "deadbeef"});
+        const auto filter = PbiReadGroupFilter{ ids };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{0,1,2,3});
+    }
+    { // multi-ReadGroupInfo
+        const auto ids = std::vector<ReadGroupInfo>({ ReadGroupInfo("b89a4406"), ReadGroupInfo("deadbeef")});
+        const auto filter = PbiReadGroupFilter{ ids };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{0,1,2,3});
+    }
+}
+
+// PbiReferenceEndFilter against the shared index (tEnd_ = { 9903, 9902, 9893, 9900 }).
+TEST(PbiFilterTest, ReferenceEndFilterOk)
+{
+    using PbiFilterTests::checkFilterRows;
+    using Rows = std::vector<size_t>;
+
+    {   // value only, no comparison argument
+        const PbiFilter refEndFilter{ PbiReferenceEndFilter{ 9900 } };
+        checkFilterRows(refEndFilter, Rows{3});
+    }
+    {
+        const PbiFilter refEndFilter{ PbiReferenceEndFilter{ 9900, Compare::GREATER_THAN_EQUAL } };
+        checkFilterRows(refEndFilter, Rows{0, 1, 3});
+    }
+}
+
+TEST(PbiFilterTest, ReferenceIdFilterOk)
+{
+    {
+        const auto filter = PbiFilter{ PbiReferenceIdFilter{ 0 } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{0,1,2,3});
+    }
+    {
+        const auto filter = PbiFilter{ PbiReferenceIdFilter{ 0, Compare::NOT_EQUAL } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{});
+    }
+    {
+        const auto ids = std::vector<int32_t>({0, 42});
+        const auto filter = PbiFilter{ PbiReferenceIdFilter{ ids } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{0,1,2,3});
+    }
+    {
+        const auto ids = std::vector<int32_t>({0});
+        const auto filter = PbiFilter{ PbiReferenceIdFilter{ ids, Compare::NOT_CONTAINS } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{});
+    }
+}
+
+TEST(PbiFilterTest, ReferenceNameFilterOk)
+{
+    const auto bamFile = BamFile{ PbbamTestsConfig::Data_Dir + std::string{ "/group/test2.bam" } };
+    const auto index = PbiRawData{ bamFile.PacBioIndexFilename() };
+
+    {
+        const auto filter = PbiFilter{ PbiReferenceNameFilter{ "lambda_NEB3011" } };
+        const auto expectedRows = std::vector<size_t>{0,1,2,3};
+        for (size_t row : expectedRows)
+            EXPECT_TRUE(filter.Accepts(index, row));
+
+    }
+    {
+        const auto filter = PbiFilter{ PbiReferenceNameFilter{ "lambda_NEB3011", Compare::NOT_EQUAL } };
+        const auto expectedRows = std::vector<size_t>{};
+        for (size_t row : expectedRows)
+            EXPECT_TRUE(filter.Accepts(index, row));
+    }
+    {
+        const auto names = std::vector<std::string>({ "lambda_NEB3011" }); // this file only has 1 :(
+        const auto filter = PbiFilter{ PbiReferenceNameFilter{ names } };
+        const auto expectedRows = std::vector<size_t>{0,1,2,3};
+        for (size_t row : expectedRows)
+            EXPECT_TRUE(filter.Accepts(index, row));
+    }
+
+    // unsupported compare types throw
+    EXPECT_THROW(PbiReferenceNameFilter("foo", Compare::LESS_THAN),          std::runtime_error);
+    EXPECT_THROW(PbiReferenceNameFilter("foo", Compare::LESS_THAN_EQUAL),    std::runtime_error);
+    EXPECT_THROW(PbiReferenceNameFilter("foo", Compare::GREATER_THAN),       std::runtime_error);
+    EXPECT_THROW(PbiReferenceNameFilter("foo", Compare::GREATER_THAN_EQUAL), std::runtime_error);
+}
+
+// PbiReferenceStartFilter: value-only construction and an explicit
+// GREATER_THAN_EQUAL comparison, each wrapped in a PbiFilter.
+TEST(PbiFilterTest, ReferenceStartFilterOk)
+{
+    {   // no compare type supplied
+        const PbiFilter startAt8453{ PbiReferenceStartFilter{ 8453 } };
+        const std::vector<size_t> rows{1};
+        PbiFilterTests::checkFilterRows(startAt8453, rows);
+    }
+    {   // reference start >= 9200
+        const PbiFilter startAtLeast9200{ PbiReferenceStartFilter{ 9200, Compare::GREATER_THAN_EQUAL } };
+        const std::vector<size_t> rows{0,3};
+        PbiFilterTests::checkFilterRows(startAtLeast9200, rows);
+    }
+}
+
+TEST(PbiFilterTest, ZmwFilterOk)
+{
+    {
+        const auto filter = PbiFilter{ PbiZmwFilter{ 14743 } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{0,1,2,3});
+    }
+    {
+        // blacklist
+        const auto filter = PbiFilter{ PbiZmwFilter{ 14743, Compare::NOT_EQUAL } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{});
+    }
+    {
+        const auto zmws = std::vector<int32_t>({14743,42,200});
+        const auto filter = PbiFilter{ PbiZmwFilter{ zmws } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{0,1,2,3});
+    }
+    {
+        // blacklist
+        const auto zmws = std::vector<int32_t>{14743};
+        const auto filter = PbiFilter{ PbiZmwFilter{zmws, Compare::NOT_CONTAINS}};
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{});
+    }
+}
+
+TEST(PbiFilterTest, FromDataSetOk)
+{
+    const auto expectedFilter =
+        PbiFilter::Union(
+        {
+            PbiFilter::Intersection(
+            {
+                PbiZmwFilter{ 14743 },
+                PbiReadAccuracyFilter { 0.9, Compare::GREATER_THAN_EQUAL }
+            }),
+
+            PbiReferenceStartFilter { 9200, Compare::GREATER_THAN_EQUAL }
+        });
+
+
+    auto properties1 = Properties{ };
+    properties1.Add(Property{ "zm", "14743",  "==" });
+    properties1.Add(Property{ "rq", "0.9", ">=" });
+
+    auto datasetFilter1 = Filter{ };
+    datasetFilter1.Properties(properties1);
+
+    auto properties2 = Properties{ };
+    properties2.Add(Property{ "pos", "9200", ">=" });
+
+    auto datasetFilter2 = Filter{ };
+    datasetFilter2.Properties(properties2);
+
+    auto datasetFilters = Filters{ };
+    datasetFilters.Add(datasetFilter1);
+    datasetFilters.Add(datasetFilter2);
+    auto dataset = DataSet{ };
+    dataset.Filters(datasetFilters);
+
+    const auto generatedFilter = PbiFilter::FromDataSet(dataset);
+
+    for (size_t i = 0; i < PbiFilterTests::shared_index.NumReads(); ++i) {
+        EXPECT_EQ(expectedFilter.Accepts(PbiFilterTests::shared_index, i),
+                  generatedFilter.Accepts(PbiFilterTests::shared_index, i));
+    }
+}
+
+TEST(PbiFilterTest, BarcodeListFromDataSetXmlOk)
+{
+    auto runner = [](const Property& property,
+                     const PbiFilter& expectedFilter,
+                     const std::vector<size_t>& expectedResults)
+    {
+        auto filter = Filter{ };
+        filter.Properties().Add(property);
+        DataSet dataset = DataSet{ };
+        dataset.Filters().Add(filter);
+
+        const auto generatedFilter = PbiFilter::FromDataSet(dataset);
+        PbiFilterTests::checkFilterRows(expectedFilter,  expectedResults);
+        PbiFilterTests::checkFilterRows(generatedFilter, expectedResults);
+    };
+
+    // single barcode
+    runner(Property{ "bc", "18", "==" },
+           PbiBarcodeFilter{ 18, Compare::EQUAL },
+           std::vector<size_t>{1,3});
+
+    // single barcode (bracketed)
+    runner(Property{ "bc", "[18]", "==" },
+           PbiBarcodeFilter{ 18, Compare::EQUAL },
+           std::vector<size_t>{1,3});
+
+    // barcode pair (square brackets)
+    runner(Property{ "bc", "[17,18]", "==" },
+           PbiBarcodesFilter{ {17, 18}, Compare::EQUAL },
+           std::vector<size_t>{1,3});
+
+    // barcode pair (parens)
+    runner(Property{ "bc", "(17,18)", "==" },
+           PbiBarcodesFilter{ {17, 18}, Compare::EQUAL },
+           std::vector<size_t>{1,3});
+
+    // barcode pair (curly brackets)
+    runner(Property{ "bc", "{17,18}", "==" },
+           PbiBarcodesFilter{ {17, 18}, Compare::EQUAL },
+           std::vector<size_t>{1,3});
+
+    // barcode pair (list, but no brackets)
+    runner(Property{ "bc", "17,18", "==" },
+           PbiBarcodesFilter{ {17, 18}, Compare::EQUAL },
+           std::vector<size_t>{1,3});
+
+    // barcode pair - same value
+    runner(Property{ "bc", "[18,18]", "==" },
+           PbiBarcodesFilter{ {18, 18}, Compare::EQUAL },
+           std::vector<size_t>{}); // none share forward & reverse
+
+    auto expectFail = [](const Property& property)
+    {
+        auto filter = Filter{ };
+        filter.Properties().Add(property);
+        DataSet dataset = DataSet{ };
+        dataset.Filters().Add(filter);
+
+        EXPECT_THROW(PbiFilter::FromDataSet(dataset), std::runtime_error);
+    };
+
+    // list-ish, but only one value
+    expectFail(Property{ "bc", "[18,]", "==" });
+
+    // too many barcodes
+    expectFail(Property{ "bc", "[18,18,18]", "==" });
+}
+
+TEST(PbiFilterTest, LocalContextFiltersFromDataSetXmlOk)
+{
+    {   // no adapters or barcodes
+
+        const auto expectedFilter =
+                PbiLocalContextFilter{ LocalContextFlags::NO_LOCAL_CONTEXT, Compare::EQUAL };
+
+        // XML: <Property Name="cx" Value="0" Operator="==" />
+        Property property("cx", "0", "==");
+
+        auto filter = Filter{ };
+        filter.Properties().Add(property);
+        DataSet dataset = DataSet{ };
+        dataset.Filters().Add(filter);
+
+        const auto generatedFilter = PbiFilter::FromDataSet(dataset);
+        PbiFilterTests::checkFilterRows(expectedFilter,  std::vector<size_t>{0});
+        PbiFilterTests::checkFilterRows(generatedFilter, std::vector<size_t>{0});
+    }
+    {   // any adapters or barcodes
+
+        const auto expectedFilter =
+                PbiLocalContextFilter{ LocalContextFlags::NO_LOCAL_CONTEXT, Compare::NOT_EQUAL };
+
+        // XML: <Property Name="cx" Value="0" Operator="!=" />
+        Property property("cx", "0", "!=");
+
+        auto filter = Filter{ };
+        filter.Properties().Add(property);
+        DataSet dataset = DataSet{ };
+        dataset.Filters().Add(filter);
+
+        const auto generatedFilter = PbiFilter::FromDataSet(dataset);
+        PbiFilterTests::checkFilterRows(expectedFilter,  std::vector<size_t>{1,2,3});
+        PbiFilterTests::checkFilterRows(generatedFilter, std::vector<size_t>{1,2,3});
+    }
+    {   // contains adapter_before
+
+        const auto expectedFilter =
+                PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE, Compare::CONTAINS };
+
+        // XML: <Property Name="cx" Value="1" Operator="&" />
+        Property property("cx", "1", "&");
+
+        auto filter = Filter{ };
+        filter.Properties().Add(property);
+        DataSet dataset = DataSet{ };
+        dataset.Filters().Add(filter);
+
+        const auto generatedFilter = PbiFilter::FromDataSet(dataset);
+        PbiFilterTests::checkFilterRows(expectedFilter,  std::vector<size_t>{1,3});
+        PbiFilterTests::checkFilterRows(generatedFilter, std::vector<size_t>{1,3});
+    }
+    {   // contains adapter_before
+
+        const auto expectedFilter =
+                PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE, Compare::CONTAINS };
+
+        // XML: <Property Name="cx" Value="ADAPTER_BEFORE" Operator="&" />
+        Property property("cx", "ADAPTER_BEFORE", "&");
+
+        auto filter = Filter{ };
+        filter.Properties().Add(property);
+        DataSet dataset = DataSet{ };
+        dataset.Filters().Add(filter);
+
+        const auto generatedFilter = PbiFilter::FromDataSet(dataset);
+        PbiFilterTests::checkFilterRows(expectedFilter,  std::vector<size_t>{1,3});
+        PbiFilterTests::checkFilterRows(generatedFilter, std::vector<size_t>{1,3});
+    }
+    {   // contains adapter_after
+
+        const auto expectedFilter =
+                PbiLocalContextFilter{ LocalContextFlags::ADAPTER_AFTER, Compare::CONTAINS };
+
+        // XML: <Property Name="cx" Value="2" Operator="&" />
+        Property property("cx", "2", "&");
+
+        auto filter = Filter{ };
+        filter.Properties().Add(property);
+        DataSet dataset = DataSet{ };
+        dataset.Filters().Add(filter);
+
+        const auto generatedFilter = PbiFilter::FromDataSet(dataset);
+        PbiFilterTests::checkFilterRows(expectedFilter,  std::vector<size_t>{2,3});
+        PbiFilterTests::checkFilterRows(generatedFilter, std::vector<size_t>{2,3});
+    }
+    {   // contains adapter_before or adapter_after
+
+        const auto expectedFilter =
+                PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE | LocalContextFlags::ADAPTER_AFTER,
+                                       Compare::CONTAINS };
+
+        // XML: <Property Name="cx" Value="3" Operator="&" />
+        Property property("cx", "3", "&");
+
+        auto filter = Filter{ };
+        filter.Properties().Add(property);
+        DataSet dataset = DataSet{ };
+        dataset.Filters().Add(filter);
+
+        const auto generatedFilter = PbiFilter::FromDataSet(dataset);
+        PbiFilterTests::checkFilterRows(expectedFilter,  std::vector<size_t>{1,2,3});
+        PbiFilterTests::checkFilterRows(generatedFilter, std::vector<size_t>{1,2,3});
+    }
+    {   // contains adapter_before or adapter_after
+
+        const auto expectedFilter =
+                PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE | LocalContextFlags::ADAPTER_AFTER,
+                                       Compare::CONTAINS };
+
+        // XML: <Property Name="cx" Value="ADAPTER_BEFORE | ADAPTER_AFTER" Operator="&" />
+        Property property("cx", "ADAPTER_BEFORE | ADAPTER_AFTER", "&");
+
+        auto filter = Filter{ };
+        filter.Properties().Add(property);
+        DataSet dataset = DataSet{ };
+        dataset.Filters().Add(filter);
+
+        const auto generatedFilter = PbiFilter::FromDataSet(dataset);
+        PbiFilterTests::checkFilterRows(expectedFilter,  std::vector<size_t>{1,2,3});
+        PbiFilterTests::checkFilterRows(generatedFilter, std::vector<size_t>{1,2,3});
+    }
+    {   // contains adapter_before or adapter_after - no whitespace separation
+
+        const auto expectedFilter =
+                PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE | LocalContextFlags::ADAPTER_AFTER,
+                                       Compare::CONTAINS };
+
+        // XML: <Property Name="cx" Value="ADAPTER_BEFORE|ADAPTER_AFTER" Operator="&" />
+        Property property("cx", "ADAPTER_BEFORE|ADAPTER_AFTER", "&");
+
+        auto filter = Filter{ };
+        filter.Properties().Add(property);
+        DataSet dataset = DataSet{ };
+        dataset.Filters().Add(filter);
+
+        const auto generatedFilter = PbiFilter::FromDataSet(dataset);
+        PbiFilterTests::checkFilterRows(expectedFilter,  std::vector<size_t>{1,2,3});
+        PbiFilterTests::checkFilterRows(generatedFilter, std::vector<size_t>{1,2,3});
+    }
+    {   // contains adapter_before or adapter_after - a lot of whitespace separation
+
+        const auto expectedFilter =
+                PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE | LocalContextFlags::ADAPTER_AFTER,
+                                       Compare::CONTAINS };
+
+        // XML: <Property Name="cx" Value="ADAPTER_BEFORE        |           ADAPTER_AFTER" Operator="&" />
+        Property property("cx", "ADAPTER_BEFORE        |           ADAPTER_AFTER", "&");
+
+        auto filter = Filter{ };
+        filter.Properties().Add(property);
+        DataSet dataset = DataSet{ };
+        dataset.Filters().Add(filter);
+
+        const auto generatedFilter = PbiFilter::FromDataSet(dataset);
+        PbiFilterTests::checkFilterRows(expectedFilter,  std::vector<size_t>{1,2,3});
+        PbiFilterTests::checkFilterRows(generatedFilter, std::vector<size_t>{1,2,3});
+    }
+    {   // contains adapter_before or adapter_after, but not both
+
+        const auto expectedFilter = PbiFilter::Union(
+        {
+            PbiFilter::Intersection(
+            {
+                PbiLocalContextFilter{ LocalContextFlags::NO_LOCAL_CONTEXT, Compare::NOT_EQUAL },
+                PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE, Compare::NOT_CONTAINS }
+            }),
+            PbiFilter::Intersection(
+            {
+                PbiLocalContextFilter{ LocalContextFlags::NO_LOCAL_CONTEXT, Compare::NOT_EQUAL },
+                PbiLocalContextFilter{ LocalContextFlags::ADAPTER_AFTER, Compare::NOT_CONTAINS }
+            })
+        });
+
+        // XML:
+        // <Filters>
+        //   <Filter>
+        //     <Properties>
+        //       <Property Name="cx" Value="0" Operator="!=" />
+        //       <Property Name="cx" Value="1" Operator="~" />
+        //     </Properties>
+        //   </Filter>
+        //   <Filter>
+        //     <Properties>
+        //       <Property Name="cx" Value="0" Operator="!=" />
+        //       <Property Name="cx" Value="2" Operator="~" />
+        //     </Properties>
+        //   </Filter>
+        // </Filters>
+
+        auto filter1 = Filter{ };
+        filter1.Properties().Add(Property("cx", "0", "!="));
+        filter1.Properties().Add(Property("cx", "1", "~"));
+
+        auto filter2 = Filter{ };
+        filter2.Properties().Add(Property("cx", "0", "!="));
+        filter2.Properties().Add(Property("cx", "2", "~"));
+
+        DataSet dataset = DataSet{ };
+        dataset.Filters().Add(filter1);
+        dataset.Filters().Add(filter2);
+
+        const auto generatedFilter = PbiFilter::FromDataSet(dataset);
+        PbiFilterTests::checkFilterRows(expectedFilter,  std::vector<size_t>{1,2});
+        PbiFilterTests::checkFilterRows(generatedFilter, std::vector<size_t>{1,2});
+
+    }
+    {   // contains adapter_before or adapter_after
+
+        const auto expectedFilter = PbiFilter::Union(
+        {
+            PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE, Compare::CONTAINS },
+            PbiLocalContextFilter{ LocalContextFlags::ADAPTER_AFTER,  Compare::CONTAINS }
+        });
+
+        // XML:
+        // <Filters>
+        //   <Filter>
+        //     <Properties>
+        //       <Property Name="cx" Value="1" Operator="&" />
+        //     </Properties>
+        //   </Filter>
+        //   <Filter>
+        //     <Properties>
+        //       <Property Name="cx" Value="2" Operator="&" />
+        //     </Properties>
+        //   </Filter>
+        // </Filters>
+
+        auto filter1 = Filter{ };
+        filter1.Properties().Add(Property("cx", "1", "&"));
+
+        auto filter2 = Filter{ };
+        filter2.Properties().Add(Property("cx", "2", "&"));
+
+        DataSet dataset = DataSet{ };
+        dataset.Filters().Add(filter1);
+        dataset.Filters().Add(filter2);
+
+        const auto generatedFilter = PbiFilter::FromDataSet(dataset);
+        PbiFilterTests::checkFilterRows(expectedFilter,  std::vector<size_t>{1,2,3});
+        PbiFilterTests::checkFilterRows(generatedFilter, std::vector<size_t>{1,2,3});
+    }
+    { // adapter_before and adapter_after
+
+        const auto expectedFilter = PbiFilter::Intersection(
+        {
+            PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE, Compare::CONTAINS },
+            PbiLocalContextFilter{ LocalContextFlags::ADAPTER_AFTER,  Compare::CONTAINS }
+        });
+
+        // XML:
+        // <Property Name="cx" Value="1" Operator="&" />
+        // <Property Name="cx" Value="2" Operator="&" />
+        Property property1("cx", "1", "&");
+        Property property2("cx", "2", "&");
+
+        auto filter = Filter{ };
+        filter.Properties().Add(property1);
+        filter.Properties().Add(property2);
+        DataSet dataset = DataSet{ };
+        dataset.Filters().Add(filter);
+
+        const auto generatedFilter = PbiFilter::FromDataSet(dataset);
+        PbiFilterTests::checkFilterRows(expectedFilter,  std::vector<size_t>{3});
+        PbiFilterTests::checkFilterRows(generatedFilter, std::vector<size_t>{3});
+    }
+    {   // adapter_before, but no adapter_after
+
+        const auto expectedFilter = PbiFilter::Intersection(
+        {
+            PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE, Compare::CONTAINS },
+            PbiLocalContextFilter{ LocalContextFlags::ADAPTER_AFTER,  Compare::NOT_CONTAINS }
+        });
+
+        // XML:
+        // <Property Name="cx" Value="1" Operator="&" />
+        // <Property Name="cx" Value="2" Operator="~" />
+        Property property1("cx", "1", "&");
+        Property property2("cx", "2", "~");
+
+        auto filter = Filter{ };
+        filter.Properties().Add(property1);
+        filter.Properties().Add(property2);
+        DataSet dataset = DataSet{ };
+        dataset.Filters().Add(filter);
+
+        const auto generatedFilter = PbiFilter::FromDataSet(dataset);
+        PbiFilterTests::checkFilterRows(expectedFilter,  std::vector<size_t>{1});
+        PbiFilterTests::checkFilterRows(generatedFilter, std::vector<size_t>{1});
+    }
+    {   // contains no adapter_before
+
+        const auto expectedFilter =
+                PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE, Compare::NOT_CONTAINS };
+
+        // XML: <Property Name="cx" Value="1" Operator="~" />
+        Property property("cx", "1", "~");
+
+        auto filter = Filter{ };
+        filter.Properties().Add(property);
+        DataSet dataset = DataSet{ };
+        dataset.Filters().Add(filter);
+
+        const auto generatedFilter = PbiFilter::FromDataSet(dataset);
+        PbiFilterTests::checkFilterRows(expectedFilter,  std::vector<size_t>{0,2});
+        PbiFilterTests::checkFilterRows(generatedFilter, std::vector<size_t>{0,2});
+    }
+    {   // contains no adapter_before or adapter_after
+
+        const auto expectedFilter = PbiFilter::Intersection(
+        {
+            PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE, Compare::NOT_CONTAINS },
+            PbiLocalContextFilter{ LocalContextFlags::ADAPTER_AFTER,  Compare::NOT_CONTAINS }
+        });
+
+        // XML:
+        // <Property Name="cx" Value="1" Operator="~" />
+        // <Property Name="cx" Value="2" Operator="~" />
+        Property property1("cx", "1", "~");
+        Property property2("cx", "2", "~");
+
+        auto filter = Filter{ };
+        filter.Properties().Add(property1);
+        filter.Properties().Add(property2);
+        DataSet dataset = DataSet{ };
+        dataset.Filters().Add(filter);
+
+        const auto generatedFilter = PbiFilter::FromDataSet(dataset);
+        PbiFilterTests::checkFilterRows(expectedFilter,  std::vector<size_t>{0});
+        PbiFilterTests::checkFilterRows(generatedFilter, std::vector<size_t>{0});
+    }
+    {   // contains no adapter_before or adapter_after
+
+        const auto expectedFilter =
+                PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE | LocalContextFlags::ADAPTER_AFTER,
+                                       Compare::NOT_CONTAINS };
+
+        // XML: <Property Name="cx" Value="3" Operator="~" />
+        Property property("cx", "3", "~");
+
+        auto filter = Filter{ };
+        filter.Properties().Add(property);
+        DataSet dataset = DataSet{ };
+        dataset.Filters().Add(filter);
+
+        const auto generatedFilter = PbiFilter::FromDataSet(dataset);
+        PbiFilterTests::checkFilterRows(expectedFilter,  std::vector<size_t>{0});
+        PbiFilterTests::checkFilterRows(generatedFilter, std::vector<size_t>{0});
+    }
+    {   // throws on invalid enum name
+
+        Property property("cx", "DOES_NOT_EXIST", "~");
+
+        auto filter = Filter{ };
+        filter.Properties().Add(property);
+        DataSet dataset = DataSet{ };
+        dataset.Filters().Add(filter);
+
+        EXPECT_THROW(PbiFilter::FromDataSet(dataset), std::runtime_error);
+    }
+}
+
+// clang-format on
diff --git a/tests/src/test_PbiFilterQuery.cpp b/tests/src/test_PbiFilterQuery.cpp

new file mode 100644 (file)

index 0000000..35771cb
--- /dev/null
+++ b/tests/src/test_PbiFilterQuery.cpp
@@ -0,0 +1,919 @@
+// Author: Derek Barnett
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <set>
+#include <string>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include "PbbamTestData.h"
+
+#include <pbbam/PbiFilterQuery.h>
+#include <pbbam/Unused.h>
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+TEST(PbiFilterQueryTest, QueryOk)
+{
+    const auto bamFile = BamFile{PbbamTestsConfig::Data_Dir + std::string{"/group/test2.bam"}};
+
+    {
+        PbiFilterQuery query(PbiQueryLengthFilter{500, Compare::GREATER_THAN_EQUAL}, bamFile);
+        const auto numReads = query.NumReads();
+        EXPECT_EQ(3, numReads);
+
+        int count = 0;
+        for (const auto& r : query) {
+            ++count;
+            EXPECT_GE((r.QueryEnd() - r.QueryStart()), 500);
+        }
+        EXPECT_EQ(3, count);
+    }
+    {
+        // all records aligned to reverse strand && pos >= 9200
+        const auto filter =
+            PbiFilter::Intersection({PbiAlignedStrandFilter{Strand::REVERSE},
+                                     PbiReferenceStartFilter{9200, Compare::GREATER_THAN_EQUAL}});
+
+        PbiFilterQuery query(filter, bamFile);
+        const auto numReads = query.NumReads();
+        EXPECT_EQ(1, numReads);
+
+        int count = 0;
+        for (const auto& r : query) {
+            ++count;
+            EXPECT_EQ(Strand::REVERSE, r.AlignedStrand());
+            EXPECT_GE((r.ReferenceStart()), 9200);
+            EXPECT_EQ(
+                std::string("m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/14743/"
+                            "5615_6237"),
+                r.FullName());
+        }
+        EXPECT_EQ(1, count);
+    }
+    {
+        // all records aligned to forward strand && pos >= 9200
+        const auto filter =
+            PbiFilter::Intersection({PbiAlignedStrandFilter{Strand::FORWARD},
+                                     PbiReferenceStartFilter{9200, Compare::GREATER_THAN_EQUAL}});
+
+        PbiFilterQuery query(filter, bamFile);
+        const auto numReads = query.NumReads();
+        EXPECT_EQ(1, numReads);
+
+        int count = 0;
+        for (const auto& r : query) {
+            ++count;
+            EXPECT_EQ(Strand::FORWARD, r.AlignedStrand());
+            EXPECT_GE((r.ReferenceStart()), 9200);
+            EXPECT_EQ(
+                std::string("m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/14743/"
+                            "2114_2531"),
+                r.FullName());
+        }
+        EXPECT_EQ(1, count);
+    }
+    {
+        // all records from RG ("b89a4406") with numMatches >= 1200
+        const auto filter =
+            PbiFilter::Intersection({PbiReadGroupFilter{"b89a4406"},
+                                     PbiNumMatchesFilter{1200, Compare::GREATER_THAN_EQUAL}});
+
+        PbiFilterQuery query(filter, bamFile);
+        const auto numReads = query.NumReads();
+        EXPECT_EQ(2, numReads);
+
+        int count = 0;
+        for (const auto& r : query) {
+            ++count;
+            EXPECT_EQ(std::string("b89a4406"), r.ReadGroupId());
+            EXPECT_GE((r.NumMatches()), 1200);
+            if (count == 1)
+                EXPECT_EQ(
+                    std::string("m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/"
+                                "14743/2579_4055"),
+                    r.FullName());
+            else {
+                if (count == 2) {
+                    EXPECT_EQ(
+                        std::string("m140905_042212_sidney_c100564852550000001823085912221377_s1_"
+                                    "X0/14743/4101_5571"),
+                        r.FullName());
+                }
+            }
+        }
+        EXPECT_EQ(2, count);
+    }
+}
+
+TEST(PbiFilterQueryTest, ZmwRangeFromDatasetOk)
+{
+    const std::string expectedMovieName{"m64004_190414_193017"};
+
+    const DataSet ds(PbbamTestsConfig::Data_Dir + "/chunking/chunking.subreadset.xml");
+    EXPECT_EQ(3, ds.BamFiles().size());
+
+    {  // movie name
+
+        PbiFilterQuery query{PbiMovieNameFilter{expectedMovieName}, ds};
+        const auto numReads = query.NumReads();
+        EXPECT_EQ(1220, numReads);
+
+        int count = 0;
+        for (const BamRecord& r : query) {
+            EXPECT_EQ(expectedMovieName, r.MovieName());
+            ++count;
+        }
+        EXPECT_EQ(1220, count);
+    }
+
+    {  // sequencing chemistries
+        std::set<std::string> chems{ds.SequencingChemistries()};
+        std::set<std::string> expected{"S/P3-C1/5.0-8M"};
+        EXPECT_TRUE(equal(chems.begin(), chems.end(), expected.begin()));
+    }
+
+    {  // min ZMW
+
+        PbiFilterQuery query{PbiZmwFilter{54, Compare::GREATER_THAN}, ds};
+        const auto numReads = query.NumReads();
+        EXPECT_EQ(1220, numReads);
+
+        int count = 0;
+        for (const BamRecord& r : query) {
+            EXPECT_GT(r.HoleNumber(), 54);
+            ++count;
+        }
+        EXPECT_EQ(1220, count);
+    }
+
+    {  // max ZMW
+
+        PbiFilterQuery query{PbiZmwFilter{1816, Compare::LESS_THAN}, ds};
+        const auto numReads = query.NumReads();
+        EXPECT_EQ(150, numReads);
+
+        int count = 0;
+        for (const BamRecord& r : query) {
+            EXPECT_LT(r.HoleNumber(), 1816);
+            ++count;
+        }
+        EXPECT_EQ(150, count);
+    }
+
+    {  // put all together, from DataSet XML
+
+        const PbiFilter filter = PbiFilter::FromDataSet(ds);
+        PbiFilterQuery query(filter, ds);
+        const auto numReads = query.NumReads();
+        EXPECT_EQ(150, numReads);
+
+        int count = 0;
+        for (const BamRecord& r : query) {
+            EXPECT_EQ(expectedMovieName, r.MovieName());
+            const auto zmw = r.HoleNumber();
+            EXPECT_GT(zmw, 54);
+            EXPECT_LT(zmw, 1816);
+            ++count;
+        }
+        EXPECT_EQ(150, count);
+    }
+    {  // empty filter object - should return all records from the same dataset
+
+        PbiFilterQuery query(PbiFilter{}, ds);
+        const auto numReads = query.NumReads();
+        EXPECT_EQ(1220, numReads);
+
+        int count = 0;
+        for (const BamRecord& r : query) {
+            UNUSED(r);
+            ++count;
+        }
+        EXPECT_EQ(1220, count);
+    }
+    {  // no <Filters> element present at all
+
+        const DataSet dsData(PbbamTestsConfig::GeneratedData_Dir +
+                             "/chunking_missingfilters.subreadset.xml");
+        const PbiFilter filter = PbiFilter::FromDataSet(dsData);
+        PbiFilterQuery query(filter, dsData);
+        const auto numReads = query.NumReads();
+        EXPECT_EQ(1220, numReads);
+
+        int count = 0;
+        for (const BamRecord& r : query) {
+            UNUSED(r);
+            ++count;
+        }
+        EXPECT_EQ(1220, count);
+    }
+    {  // <Filters> element contains no child <Filter> elements
+
+        const DataSet dsData(PbbamTestsConfig::GeneratedData_Dir +
+                             "/chunking_emptyfilters.subreadset.xml");
+        const PbiFilter filter = PbiFilter::FromDataSet(dsData);
+        PbiFilterQuery query(filter, dsData);
+        const auto numReads = query.NumReads();
+        EXPECT_EQ(1220, numReads);
+
+        int count = 0;
+        for (const BamRecord& r : query) {
+            UNUSED(r);
+            ++count;
+        }
+        EXPECT_EQ(1220, count);
+    }
+}
+
+TEST(PbiFilterQueryTest, MissingPbiShouldThrow)
+{
+    const PbiFilter filter{PbiZmwFilter{31883}};
+    const std::string phi29Bam = PbbamTestsConfig::GeneratedData_Dir + "/missing_pbi.bam";
+    const std::string hasPbiBam = PbbamTestsConfig::Data_Dir + "/polymerase/production.scraps.bam";
+
+    {  // single file, missing PBI
+
+        EXPECT_THROW(PbiFilterQuery(filter, phi29Bam), std::runtime_error);
+    }
+
+    {  // from dataset, all missing PBI
+
+        DataSet ds;
+        ds.ExternalResources().Add(ExternalResource("PacBio.SubreadFile.SubreadBamFile", phi29Bam));
+        ds.ExternalResources().Add(ExternalResource("PacBio.SubreadFile.SubreadBamFile", phi29Bam));
+        EXPECT_THROW(PbiFilterQuery(filter, ds), std::runtime_error);
+    }
+
+    {  // from dataset, mixed PBI presence
+
+        DataSet ds;
+        ds.ExternalResources().Add(ExternalResource("PacBio.SubreadFile.SubreadBamFile", phi29Bam));
+        ds.ExternalResources().Add(ExternalResource("PacBio.SubreadFile.ScrapsBamFile", hasPbiBam));
+        EXPECT_THROW(PbiFilterQuery(filter, ds), std::runtime_error);
+    }
+}
+
+// Applies the filter defined in qnameFiltered.subreads.dataset.xml and expects
+// three reads, both as reported by the query and as yielded by iteration.
+TEST(PbiFilterQueryTest, QNameWhitelistFile)
+{
+    const DataSet dataset(PbbamTestsConfig::Data_Dir +
+                          "/polymerase/qnameFiltered.subreads.dataset.xml");
+    const PbiFilter datasetFilter = PbiFilter::FromDataSet(dataset);
+
+    PbiFilterQuery query(datasetFilter, dataset);
+    EXPECT_EQ(3, query.NumReads());
+
+    // count the records actually produced
+    int seen = 0;
+    for (const BamRecord& record : query) {
+        UNUSED(record);
+        ++seen;
+    }
+    EXPECT_EQ(3, seen);
+}
+
+// Querying empty.bam with an empty filter is expected to report zero reads and
+// to yield no records on iteration.
+TEST(PbiFilterQueryTest, EmptyFiles)
+{
+    const BamFile emptyBam{PbbamTestsConfig::Data_Dir + "/empty.bam"};
+
+    PbiFilterQuery query{PbiFilter{}, emptyBam};
+    EXPECT_EQ(0, query.NumReads());
+
+    // count the records actually produced
+    size_t seen = 0;
+    for (const auto& record : query) {
+        UNUSED(record);
+        ++seen;
+    }
+    EXPECT_EQ(0, seen);
+}
+
+TEST(PbiFilterQueryTest, BarcodeData)
+{  // Each case checks NumReads() against the number of records actually iterated.
+    const BamFile file{PbbamTestsConfig::Data_Dir + "/phi29.bam"};
+
+    // bc_quality == 1: expects 120 records
+    {
+        PbiFilterQuery query{PbiBarcodeQualityFilter{1}, file};
+        const auto numReads = query.NumReads();
+        EXPECT_EQ(120, numReads);
+
+        size_t count = 0;
+        for (const auto& r : query) {
+            UNUSED(r);
+            ++count;
+        }
+        EXPECT_EQ(120, count);
+    }
+
+    // bc_quality != 1: the complement of the case above, expects no records
+    {
+        PbiFilterQuery query{PbiBarcodeQualityFilter{1, Compare::NOT_EQUAL}, file};
+        const auto numReads = query.NumReads();
+        EXPECT_EQ(0, numReads);
+
+        size_t count = 0;
+        for (const auto& r : query) {
+            UNUSED(r);
+            ++count;
+        }
+        EXPECT_EQ(0, count);
+    }
+
+    // bc_forward == 0: expects 40 records
+    {
+        PbiFilterQuery query{PbiBarcodeForwardFilter{0}, file};
+        const auto numReads = query.NumReads();
+        EXPECT_EQ(40, numReads);
+
+        size_t count = 0;
+        for (const auto& r : query) {
+            UNUSED(r);
+            ++count;
+        }
+        EXPECT_EQ(40, count);
+    }
+
+    // bc_forward == [0,2]: filter built from a list of barcode IDs, expects 80 records
+    {
+        const auto ids = std::vector<int16_t>{0, 2};
+        PbiFilterQuery query{PbiBarcodeForwardFilter{ids}, file};
+        const auto numReads = query.NumReads();
+        EXPECT_EQ(80, numReads);
+
+        size_t count = 0;
+        for (const auto& r : query) {
+            UNUSED(r);
+            ++count;
+        }
+        EXPECT_EQ(80, count);
+    }
+
+    // bc_reverse != 0: expects 80 records
+    {
+        PbiFilterQuery query{PbiBarcodeReverseFilter{0, Compare::NOT_EQUAL}, file};
+        const auto numReads = query.NumReads();
+        EXPECT_EQ(80, numReads);
+
+        size_t count = 0;
+        for (const auto& r : query) {
+            UNUSED(r);
+            ++count;
+        }
+        EXPECT_EQ(80, count);
+    }
+}
+
+TEST(PbiFilterQueryTest, BarcodeQualityFromXml)
+{
+    // Two dataset XML documents that differ only in the "bq" filter operator ("=" vs "!=").
+    const std::string xml_all = R"_XML_(
+<?xml version="1.0" encoding="utf-8"?>
+<pbds:SubreadSet
+   xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd"
+   xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+   xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd"
+   xmlns:pbsample="http://pacificbiosciences.com/PacBioSampleInfo.xsd"
+   xmlns:pbmeta="http://pacificbiosciences.com/PacBioCollectionMetadata.xsd"
+   xmlns:pbds="http://pacificbiosciences.com/PacBioDatasets.xsd"
+   xsi:schemaLocation="http://pacificbiosciences.com/PacBioDataModel.xsd"
+   UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe519c"
+   TimeStampedName="subreadset_150304_231155"
+   MetaType="PacBio.DataSet.SubreadSet"
+   Name="DataSet_SubreadSet"
+   Tags=""
+   Version="3.0.0"
+   CreatedAt="2015-01-27T09:00:01">
+<pbbase:ExternalResources>
+   <pbbase:ExternalResource
+       UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe5193"
+       TimeStampedName="subread_bam_150304_231155"
+       MetaType="PacBio.SubreadFile.SubreadBamFile"
+       ResourceId="m64004_190414_193017.1.subreads.bam">
+       <pbbase:FileIndices>
+           <pbbase:FileIndex
+               UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe5194"
+               TimeStampedName="bam_index_150304_231155"
+               MetaType="PacBio.Index.PacBioIndex"
+               ResourceId="m64004_190414_193017.1.subreads.bam.pbi"/>
+       </pbbase:FileIndices>
+   </pbbase:ExternalResource>
+</pbbase:ExternalResources>
+<pbds:Filters>
+    <pbds:Filter>
+        <pbbase:Properties>
+            <pbbase:Property Name="bq" Operator="=" Value="1"/>
+        </pbbase:Properties>
+    </pbds:Filter>
+</pbds:Filters>
+</pbds:SubreadSet>
+)_XML_";
+
+    const std::string xml_none = R"_XML_(
+<?xml version="1.0" encoding="utf-8"?>
+<pbds:SubreadSet
+   xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd"
+   xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+   xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd"
+   xmlns:pbsample="http://pacificbiosciences.com/PacBioSampleInfo.xsd"
+   xmlns:pbmeta="http://pacificbiosciences.com/PacBioCollectionMetadata.xsd"
+   xmlns:pbds="http://pacificbiosciences.com/PacBioDatasets.xsd"
+   xsi:schemaLocation="http://pacificbiosciences.com/PacBioDataModel.xsd"
+   UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe519c"
+   TimeStampedName="subreadset_150304_231155"
+   MetaType="PacBio.DataSet.SubreadSet"
+   Name="DataSet_SubreadSet"
+   Tags=""
+   Version="3.0.0"
+   CreatedAt="2015-01-27T09:00:01">
+<pbbase:ExternalResources>
+   <pbbase:ExternalResource
+       UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe5193"
+       TimeStampedName="subread_bam_150304_231155"
+       MetaType="PacBio.SubreadFile.SubreadBamFile"
+       ResourceId="m64004_190414_193017.1.subreads.bam">
+       <pbbase:FileIndices>
+           <pbbase:FileIndex
+               UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe5194"
+               TimeStampedName="bam_index_150304_231155"
+               MetaType="PacBio.Index.PacBioIndex"
+               ResourceId="m64004_190414_193017.1.subreads.bam.pbi"/>
+       </pbbase:FileIndices>
+   </pbbase:ExternalResource>
+</pbbase:ExternalResources>
+<pbds:Filters>
+    <pbds:Filter>
+        <pbbase:Properties>
+            <pbbase:Property Name="bq" Operator="!=" Value="1"/>
+        </pbbase:Properties>
+    </pbds:Filter>
+</pbds:Filters>
+</pbds:SubreadSet>
+)_XML_";
+
+    const BamFile file{PbbamTestsConfig::Data_Dir + "/phi29.bam"};
+    // Only the filter is taken from each XML dataset; the records queried come from `file`.
+    {  // filter allows all records
+        const DataSet ds = DataSet::FromXml(xml_all);
+        const PbiFilterQuery query{PbiFilter::FromDataSet(ds), file};
+        const auto numReads = query.NumReads();
+        EXPECT_EQ(120, numReads);
+
+        size_t count = 0;
+        for (const auto& r : query) {
+            UNUSED(r);
+            ++count;
+        }
+        EXPECT_EQ(120, count);
+    }
+    {  // filter allows no records
+        const DataSet ds = DataSet::FromXml(xml_none);
+        const PbiFilterQuery query{PbiFilter::FromDataSet(ds), file};
+        const auto numReads = query.NumReads();
+        EXPECT_EQ(0, numReads);
+
+        size_t count = 0;
+        for (const auto& r : query) {
+            UNUSED(r);
+            ++count;
+        }
+        EXPECT_EQ(0, count);
+    }
+}
+
+TEST(PbiFilterQueryTest, ReadGroupFilterFromXml)
+{  // "qid" filters parsed from dataset XML (header + property + footer), applied to phi29.bam.
+    const BamFile file{PbbamTestsConfig::Data_Dir + "/phi29.bam"};
+    const std::string xmlHeader = R"_XML_(
+        <?xml version="1.0" encoding="utf-8"?>
+        <pbds:SubreadSet
+           xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd"
+           xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+           xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd"
+           xmlns:pbsample="http://pacificbiosciences.com/PacBioSampleInfo.xsd"
+           xmlns:pbmeta="http://pacificbiosciences.com/PacBioCollectionMetadata.xsd"
+           xmlns:pbds="http://pacificbiosciences.com/PacBioDatasets.xsd"
+           xsi:schemaLocation="http://pacificbiosciences.com/PacBioDataModel.xsd"
+           UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe519c"
+           TimeStampedName="subreadset_150304_231155"
+           MetaType="PacBio.DataSet.SubreadSet"
+           Name="DataSet_SubreadSet"
+           Tags=""
+           Version="3.0.0"
+           CreatedAt="2015-01-27T09:00:01">
+        <pbbase:ExternalResources>
+           <pbbase:ExternalResource
+               UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe5193"
+               TimeStampedName="subread_bam_150304_231155"
+               MetaType="PacBio.SubreadFile.SubreadBamFile"
+               ResourceId="phi29.bam">
+               <pbbase:FileIndices>
+                   <pbbase:FileIndex
+                       UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe5194"
+                       TimeStampedName="bam_index_150304_231155"
+                       MetaType="PacBio.Index.PacBioIndex"
+                       ResourceId="phi29.bam.pbi"/>
+               </pbbase:FileIndices>
+           </pbbase:ExternalResource>
+        </pbbase:ExternalResources>
+        <pbds:Filters>
+            <pbds:Filter>
+                <pbbase:Properties>)_XML_";
+
+    const std::string xmlFooter = R"_XML_(
+                </pbbase:Properties>
+            </pbds:Filter>
+        </pbds:Filters>
+        </pbds:SubreadSet>
+        )_XML_";
+
+    {  // equal; NOTE(review): "\n" in the raw literal is a literal backslash-n - intended?
+        const std::string xmlProperty =
+            R"_XML_(<pbbase:Property Name="qid" Operator="==" Value="-1453990154"/>\n)_XML_";
+        const std::string xml = xmlHeader + xmlProperty + xmlFooter;
+        const DataSet ds = DataSet::FromXml(xml);
+        const PbiFilterQuery query{PbiFilter::FromDataSet(ds), file};
+        const auto numReads = query.NumReads();
+        EXPECT_EQ(120, numReads);
+
+        size_t observedReadCount = 0;
+        for (const auto& r : query) {
+            UNUSED(r);
+            ++observedReadCount;
+        }
+        EXPECT_EQ(120, observedReadCount);
+    }
+    {  // not equal: the complement of the case above, so no records are expected
+        const std::string xmlProperty =
+            R"_XML_(<pbbase:Property Name="qid" Operator="!=" Value="-1453990154"/>\n)_XML_";
+        const std::string xml = xmlHeader + xmlProperty + xmlFooter;
+        const DataSet ds = DataSet::FromXml(xml);
+        const PbiFilterQuery query{PbiFilter::FromDataSet(ds), file};
+        const auto numReads = query.NumReads();
+        EXPECT_EQ(0, numReads);
+
+        size_t observedReadCount = 0;
+        for (const auto& r : query) {
+            UNUSED(r);
+            ++observedReadCount;
+        }
+        EXPECT_EQ(0, observedReadCount);
+    }
+}
+
+TEST(PbiFilterQueryTest, ZmwWhitelistFromXml)
+{  // "zm" filters parsed from dataset XML: three single-ZMW cases, then a bracketed list.
+    const BamFile file{PbbamTestsConfig::Data_Dir + "/phi29.bam"};
+    const std::string xmlHeader = R"_XML_(
+        <?xml version="1.0" encoding="utf-8"?>
+        <pbds:SubreadSet
+           xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd"
+           xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+           xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd"
+           xmlns:pbsample="http://pacificbiosciences.com/PacBioSampleInfo.xsd"
+           xmlns:pbmeta="http://pacificbiosciences.com/PacBioCollectionMetadata.xsd"
+           xmlns:pbds="http://pacificbiosciences.com/PacBioDatasets.xsd"
+           xsi:schemaLocation="http://pacificbiosciences.com/PacBioDataModel.xsd"
+           UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe519c"
+           TimeStampedName="subreadset_150304_231155"
+           MetaType="PacBio.DataSet.SubreadSet"
+           Name="DataSet_SubreadSet"
+           Tags=""
+           Version="3.0.0"
+           CreatedAt="2015-01-27T09:00:01">
+        <pbbase:ExternalResources>
+           <pbbase:ExternalResource
+               UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe5193"
+               TimeStampedName="subread_bam_150304_231155"
+               MetaType="PacBio.SubreadFile.SubreadBamFile"
+               ResourceId="phi29.bam">
+               <pbbase:FileIndices>
+                   <pbbase:FileIndex
+                       UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe5194"
+                       TimeStampedName="bam_index_150304_231155"
+                       MetaType="PacBio.Index.PacBioIndex"
+                       ResourceId="phi29.bam.pbi"/>
+               </pbbase:FileIndices>
+           </pbbase:ExternalResource>
+        </pbbase:ExternalResources>
+        <pbds:Filters>
+            <pbds:Filter>
+                <pbbase:Properties>)_XML_";
+
+    const std::string xmlFooter = R"_XML_(
+                </pbbase:Properties>
+            </pbds:Filter>
+        </pbds:Filters>
+        </pbds:SubreadSet>
+        )_XML_";
+    // Per-ZMW iteration counts, kept so the whitelist case can be compared against their sum.
+    size_t count_30422 = 0;
+    size_t count_648 = 0;
+    size_t count_17299 = 0;
+    size_t count_whitelist = 0;
+
+    {  // 30422; NOTE(review): "\n" in the raw literal is a literal backslash-n - intended?
+        const std::string xmlProperty =
+            R"_XML_(<pbbase:Property Name="zm" Operator="=" Value="30422"/>\n)_XML_";
+        const std::string xml = xmlHeader + xmlProperty + xmlFooter;
+        const DataSet ds = DataSet::FromXml(xml);
+        const PbiFilterQuery query{PbiFilter::FromDataSet(ds), file};
+        const auto numReads = query.NumReads();
+        EXPECT_EQ(13, numReads);
+
+        for (const auto& r : query) {
+            UNUSED(r);
+            ++count_30422;
+        }
+        EXPECT_EQ(13, count_30422);
+    }
+    {  // 648
+        const std::string xmlProperty =
+            R"_XML_(<pbbase:Property Name="zm" Operator="=" Value="648"/>\n)_XML_";
+        const std::string xml = xmlHeader + xmlProperty + xmlFooter;
+        const DataSet ds = DataSet::FromXml(xml);
+        const PbiFilterQuery query{PbiFilter::FromDataSet(ds), file};
+        const auto numReads = query.NumReads();
+        EXPECT_EQ(11, numReads);
+
+        for (const auto& r : query) {
+            UNUSED(r);
+            ++count_648;
+        }
+        EXPECT_EQ(11, count_648);
+    }
+    {  // 17299
+        const std::string xmlProperty =
+            R"_XML_(<pbbase:Property Name="zm" Operator="=" Value="17299"/>\n)_XML_";
+        const std::string xml = xmlHeader + xmlProperty + xmlFooter;
+        const DataSet ds = DataSet::FromXml(xml);
+        const PbiFilterQuery query{PbiFilter::FromDataSet(ds), file};
+        const auto numReads = query.NumReads();
+        EXPECT_EQ(4, numReads);
+
+        for (const auto& r : query) {
+            UNUSED(r);
+            ++count_17299;
+        }
+        EXPECT_EQ(4, count_17299);
+    }
+    {  // now check whitelist: 28 == 13 + 11 + 4, the sum of the single-ZMW cases above
+        const std::string xmlProperty =
+            R"_XML_(<pbbase:Property Name="zm" Operator="=" Value="[30422,648,17299]"/>\n)_XML_";
+        const std::string xml = xmlHeader + xmlProperty + xmlFooter;
+        const DataSet ds = DataSet::FromXml(xml);
+        const PbiFilterQuery query{PbiFilter::FromDataSet(ds), file};
+        const auto numReads = query.NumReads();
+        EXPECT_EQ(28, numReads);
+
+        for (const auto& r : query) {
+            UNUSED(r);
+            ++count_whitelist;
+        }
+        EXPECT_EQ(count_30422 + count_648 + count_17299, count_whitelist);
+    }
+}
+
+TEST(PbiFilterQueryTest, TranscriptRecords)
+{  // Transcript records: hole numbers must be present; ZMW/QNAME/movie filters are exercised.
+    const std::string transcriptFn = PbbamTestsConfig::Data_Dir + "/transcript.subreads.bam";
+
+    PbiFilterQuery query{PbiFilter{}, transcriptFn};
+    for (const auto& b : query)
+        EXPECT_TRUE(b.HasHoleNumber());
+
+    {  // zmw whitelist
+        const std::vector<int32_t> whitelist = {1, 3};
+
+        std::vector<int32_t> observed;
+
+        PbiFilter filter{PbiZmwFilter{whitelist}};
+        PbiFilterQuery queryData{filter, transcriptFn};
+        for (const auto& b : queryData) {
+            observed.push_back(b.HoleNumber());
+        }
+
+        ASSERT_EQ(2, observed.size());
+        EXPECT_EQ(1, observed.at(0));
+        EXPECT_EQ(3, observed.at(1));
+    }
+    {  // zmw bounds: 2 <= hole number < 4
+        const PbiFilter filter{
+            {PbiZmwFilter{2, Compare::GREATER_THAN_EQUAL}, PbiZmwFilter{4, Compare::LESS_THAN}}};
+
+        std::vector<int32_t> observed;
+
+        PbiFilterQuery queryData{filter, transcriptFn};
+        for (const auto& b : queryData) {
+            observed.push_back(b.HoleNumber());
+        }
+
+        ASSERT_EQ(2, observed.size());
+        EXPECT_EQ(2, observed.at(0));
+        EXPECT_EQ(3, observed.at(1));
+    }
+    {  // QNAME
+
+        std::vector<int32_t> observed;
+
+        PbiFilter filter{PbiQueryNameFilter{"transcript/2"}};
+        PbiFilterQuery queryData{filter, transcriptFn};
+        for (const auto& b : queryData) {
+            observed.push_back(b.HoleNumber());
+        }
+
+        ASSERT_EQ(1, observed.size());
+        EXPECT_EQ(2, observed.at(0));
+    }
+    {  // QNAME whitelist
+
+        const std::vector<std::string> whitelist = {"transcript/1", "transcript/4"};
+
+        std::vector<int32_t> observed;
+
+        PbiFilter filter{PbiQueryNameFilter{whitelist}};
+        PbiFilterQuery queryData{filter, transcriptFn};
+        for (const auto& b : queryData) {
+            observed.push_back(b.HoleNumber());
+        }
+
+        ASSERT_EQ(2, observed.size());
+        EXPECT_EQ(1, observed.at(0));
+        EXPECT_EQ(4, observed.at(1));
+    }
+
+    {  // movie name: only the number of matching records (4) is checked
+        std::vector<int32_t> observed;
+
+        PbiFilter filter{PbiMovieNameFilter{"transcript"}};
+        PbiFilterQuery queryData{filter, transcriptFn};
+        for (const auto& b : queryData) {
+            observed.push_back(b.HoleNumber());
+        }
+
+        EXPECT_EQ(4, observed.size());
+    }
+
+    {  // movie name from DataSet
+
+        const std::string datasetFn = PbbamTestsConfig::Data_Dir + "/transcriptset.xml";
+
+        std::vector<int32_t> observed;
+
+        PacBio::BAM::DataSet ds(datasetFn);
+        PacBio::BAM::PbiFilter filter = PacBio::BAM::PbiFilter::FromDataSet(ds);
+        PbiFilterQuery queryData{filter, ds};
+        for (const auto& b : queryData) {
+            observed.push_back(b.HoleNumber());
+        }
+
+        EXPECT_EQ(4, observed.size());
+    }
+}
+
+TEST(PbiFilterQueryTest, BarcodedReadGroupId)
+{  // Read-group filtering when read group IDs carry a barcode label suffix ("<id>/<bc>--<bc>").
+    const BamFile bamFile{PbbamTestsConfig::Data_Dir + std::string{"/barcoded_read_groups.bam"}};
+
+    {  // query read group with no barcodes - should catch all, barcoded or not
+        const PbiReadGroupFilter filter{"0d7b28fa"};
+        PbiFilterQuery query{filter, bamFile};
+        size_t count = 0;
+        for (const auto& b : query) {
+            (void)b;
+            ++count;
+        }
+        EXPECT_EQ(5, count);
+    }
+    {  // query read group with barcode label
+
+        const ReadGroupInfo rg{"0d7b28fa/0--0"};
+        PbiReadGroupFilter filter{rg};
+        PbiFilterQuery query{filter, bamFile};
+        size_t count = 0;
+        for (const auto& b : query) {
+            (void)b;
+            ++count;
+        }
+        EXPECT_EQ(1, count);
+    }
+    {  // query multiple read groups with barcode label
+
+        const ReadGroupInfo rg{"0d7b28fa/0--0"};
+        const ReadGroupInfo rg1{"0d7b28fa/1--0"};
+        PbiReadGroupFilter filter{std::vector<ReadGroupInfo>{rg, rg1}};
+        PbiFilterQuery query{filter, bamFile};
+        size_t count = 0;
+        for (const auto& b : query) {
+            (void)b;
+            ++count;
+        }
+        EXPECT_EQ(2, count);
+    }
+}
+
+TEST(PbiFilterQueryTest, CanReusePbiIndexCache)
+{  // One index cache, built once from the dataset, is passed to two successive queries.
+    const DataSet ds(PbbamTestsConfig::Data_Dir + "/chunking/chunking.subreadset.xml");
+    const auto indexCache = MakePbiIndexCache(ds);
+
+    {
+        // min ZMW: every returned record must have a hole number above 54
+        PbiFilterQuery query{PbiZmwFilter{54, Compare::GREATER_THAN}, ds, indexCache};
+        const auto numReads = query.NumReads();
+        EXPECT_EQ(1220, numReads);
+
+        int count = 0;
+        for (const BamRecord& r : query) {
+            EXPECT_GT(r.HoleNumber(), 54);
+            ++count;
+        }
+        EXPECT_EQ(1220, count);
+    }
+
+    {  // max ZMW: every returned record must have a hole number below 1816
+        PbiFilterQuery query{PbiZmwFilter{1816, Compare::LESS_THAN}, ds, indexCache};
+        const auto numReads = query.NumReads();
+        EXPECT_EQ(150, numReads);
+
+        int count = 0;
+        for (const BamRecord& r : query) {
+            EXPECT_LT(r.HoleNumber(), 1816);
+            ++count;
+        }
+        EXPECT_EQ(150, count);
+    }
+}
+
+TEST(PbiFilterQueryTest, ConsistentWhitelistAndBlacklist)
+{  // Whitelisting two names and blacklisting the other two must select the same records.
+    const std::string fn{PbbamTestsConfig::Data_Dir + "/dataset/qname_filter.bam"};
+
+    const std::vector<std::string> recordNames{"singleInsertion/0/0_10", "singleInsertion/0/10_20",
+                                               "singleInsertion/1/0_10", "singleInsertion/1/10_20"};
+    const std::vector<std::string> whitelist{"singleInsertion/0/0_10", "singleInsertion/1/0_10"};
+    const std::vector<std::string> blacklist{"singleInsertion/0/10_20", "singleInsertion/1/10_20"};
+
+    {  // sanity check on input: unfiltered records appear in recordNames order
+        PbiFilter filter{};
+        PbiFilterQuery query{filter, fn};
+        EXPECT_EQ(4, query.NumReads());
+
+        size_t i = 0;
+        for (const auto& b : query) {
+            EXPECT_EQ(recordNames.at(i), b.FullName());
+            ++i;
+        }
+    }
+    {
+        // whitelist: expects recordNames[0] and recordNames[2]
+        PbiFilter filter{PbiQueryNameFilter{whitelist}};
+        PbiFilterQuery query{filter, fn};
+        EXPECT_EQ(2, query.NumReads());
+
+        size_t i = 0;
+        for (const auto& b : query) {
+            if (i == 0) {
+                EXPECT_EQ(recordNames.at(0), b.FullName());
+            }
+            if (i == 1) {
+                EXPECT_EQ(recordNames.at(2), b.FullName());
+            }
+            ++i;
+        }
+    }
+    {
+        // !whitelist: NOT_CONTAINS on the whitelist, expects recordNames[1] and recordNames[3]
+        PbiFilter filter{PbiQueryNameFilter{whitelist, Compare::NOT_CONTAINS}};
+        PbiFilterQuery query{filter, fn};
+        EXPECT_EQ(2, query.NumReads());
+
+        size_t i = 0;
+        for (const auto& b : query) {
+            if (i == 0) {
+                EXPECT_EQ(recordNames.at(1), b.FullName());
+            }
+            if (i == 1) {
+                EXPECT_EQ(recordNames.at(3), b.FullName());
+            }
+            ++i;
+        }
+    }
+    {
+        // blacklist: NOT_CONTAINS on the blacklist, expects the same records as the whitelist case
+        PbiFilter filter{PbiQueryNameFilter{blacklist, Compare::NOT_CONTAINS}};
+        PbiFilterQuery query{filter, fn};
+        EXPECT_EQ(2, query.NumReads());
+
+        size_t i = 0;
+        for (const auto& b : query) {
+            if (i == 0) {
+                EXPECT_EQ(recordNames.at(0), b.FullName());
+            }
+            if (i == 1) {
+                EXPECT_EQ(recordNames.at(2), b.FullName());
+            }
+            ++i;
+        }
+    }
+}
diff --git a/tests/src/test_Pulse2BaseCache.cpp b/tests/src/test_Pulse2BaseCache.cpp

new file mode 100644 (file)

index 0000000..400d381
--- /dev/null
+++ b/tests/src/test_Pulse2BaseCache.cpp
@@ -0,0 +1,47 @@
+// Author: Derek Barnett
+
+#include <cstdint>
+#include <string>
+
+#include <gtest/gtest.h>
+
+#include "PbbamTestData.h"
+
+#include <pbbam/../../src/Pulse2BaseCache.h>
+
+using Pulse2BaseCache = PacBio::BAM::Pulse2BaseCache;
+
+TEST(Pulse2BaseCacheTest, CountsDetectedInConstructor)
+{
+    // `basesOnly` is `pulses` with its four lowercase calls removed.
+    const std::string pulses{"ACccTTAGtTCAtG"};
+    const std::string basesOnly{"ACTTAGTCAG"};
+    const Pulse2BaseCache cache{pulses};
+
+    EXPECT_EQ(pulses.size(), cache.NumPulses());
+    EXPECT_EQ(basesOnly.size(), cache.NumBases());
+}
+
+TEST(Pulse2BaseCacheTest, RemovesSquashedPulsesFromString)
+{
+    // Per-pulse strings, and what should remain once the lowercase-call positions are dropped.
+    const std::string pulses{"ACccTTAGtTCAtG"};
+    const std::string pulsesKept{"ACTTAGTCAG"};
+    const std::string labels{"-G--A--T--AC--"};
+    const std::string labelsKept{"-GA--T-AC-"};
+
+    const Pulse2BaseCache cache{pulses};
+    EXPECT_EQ(pulsesKept, cache.RemoveSquashedPulses(pulses));
+    EXPECT_EQ(labelsKept, cache.RemoveSquashedPulses(labels));
+}
+
+TEST(Pulse2BaseCacheTest, RemovesSquashedPulsesFromVector)
+{
+    // Per-pulse values at the lowercase-call positions (indices 2, 3, 8, 12) should be dropped.
+    const std::string pulses{"ACccTTAGtTCAtG"};
+    const std::vector<uint16_t> perPulse{5, 4, 2, 2, 3, 8, 8, 8, 4, 7, 7, 7, 3, 4};
+    const std::vector<uint16_t> perBase{5, 4, 3, 8, 8, 8, 7, 7, 7, 4};
+
+    const Pulse2BaseCache cache{pulses};
+    EXPECT_EQ(perBase, cache.RemoveSquashedPulses(perPulse));
+}
diff --git a/tests/src/test_QNameQuery.cpp b/tests/src/test_QNameQuery.cpp

new file mode 100644 (file)

index 0000000..b051728
--- /dev/null
+++ b/tests/src/test_QNameQuery.cpp
@@ -0,0 +1,64 @@
+// Author: Yuan Li
+
+#include <string>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include "PbbamTestData.h"
+
+#include <pbbam/QNameQuery.h>
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+namespace QNameQueryTests {
+
+static const std::string dataDir = PbbamTestsConfig::Data_Dir + "/group/";
+static const std::string test1fn = dataDir + "test1.bam";
+static const std::string test2fn = dataDir + "test2.bam";
+static const std::string test3fn = dataDir + "test3.bam";
+// Iterates `fn` by const reference; the size of each yielded group must match `expected`.
+static void TestQNameQuery(const std::string& fn, const std::vector<int>& expected)
+{
+    EXPECT_NO_THROW({
+        QNameQuery query(fn);
+        std::vector<int> groupSizes;
+        for (const std::vector<BamRecord>& group : query)
+            groupSizes.push_back(static_cast<int>(group.size()));
+        EXPECT_EQ(expected, groupSizes);
+    });
+}
+// Same check as TestQNameQuery, but iterating by mutable reference.
+static void TestNoneConstQNameQuery(const std::string& fn, const std::vector<int>& expected)
+{
+    EXPECT_NO_THROW({
+        QNameQuery query(fn);
+        std::vector<int> groupSizes;
+        for (std::vector<BamRecord>& group : query)
+            groupSizes.push_back(static_cast<int>(group.size()));
+        EXPECT_EQ(expected, groupSizes);
+    });
+}
+
+}  // namespace QNameQueryTests
+
+TEST(QNameQueryTest, CountQSizes)
+{
+    // Each file is checked twice: once iterating by const reference and
+    // once iterating by mutable reference.
+    const auto checkBoth = [](const std::string& bamFn, const std::vector<int>& groupSizes) {
+        QNameQueryTests::TestQNameQuery(bamFn, groupSizes);
+        QNameQueryTests::TestNoneConstQNameQuery(bamFn, groupSizes);
+    };
+
+    // test case 1 has exactly one bamRecord.
+    checkBoth(QNameQueryTests::test1fn, {1});
+
+    // test case 2 has bamRecords of four subreads.
+    checkBoth(QNameQueryTests::test2fn, {1, 1, 1, 1});
+
+    // test case 3 has eleven name groups;
+    // the first and the eighth are expected to hold two records each.
+    checkBoth(QNameQueryTests::test3fn, {2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1});
+}
diff --git a/tests/src/test_QualityValues.cpp b/tests/src/test_QualityValues.cpp

new file mode 100644 (file)

index 0000000..c06fd09
--- /dev/null
+++ b/tests/src/test_QualityValues.cpp
@@ -0,0 +1,88 @@
+// Author: Derek Barnett
+
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <string>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include <pbbam/QualityValues.h>
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+TEST(QualityValueTest, DefaultsOk)
+{  // a default-constructed value is 0, which is '!' in FASTQ encoding
+    const QualityValue defaultQv;
+    EXPECT_EQ('!', defaultQv.Fastq());
+    EXPECT_EQ(0, defaultQv);
+}
+
+TEST(QualityValueTest, FromNumber)
+{
+    // Inputs above 93 are expected to be capped at 93 ('~' in FASTQ encoding).
+    const QualityValue qv0(0);
+    const QualityValue qv33(33);
+    const QualityValue qv42(42);
+    const QualityValue qv93(93);
+    const QualityValue qv94(94);
+    const QualityValue qvInt8Max(std::numeric_limits<int8_t>::max());
+
+    EXPECT_EQ(0, qv0);
+    EXPECT_EQ('!', qv0.Fastq());
+    EXPECT_EQ(33, qv33);
+    EXPECT_EQ('B', qv33.Fastq());
+    EXPECT_EQ(42, qv42);
+    EXPECT_EQ('K', qv42.Fastq());
+    EXPECT_EQ(93, qv93);
+    EXPECT_EQ('~', qv93.Fastq());
+    EXPECT_EQ(93, qv94);
+    EXPECT_EQ('~', qv94.Fastq());
+    EXPECT_EQ(93, qvInt8Max);
+    EXPECT_EQ('~', qvInt8Max.Fastq());
+}
+
+TEST(QualityValueTest, FromFastq)
+{
+    const QualityValue fromBang = QualityValue::FromFastq('!');
+    const QualityValue fromB = QualityValue::FromFastq('B');
+    const QualityValue fromK = QualityValue::FromFastq('K');
+    const QualityValue fromTilde = QualityValue::FromFastq('~');
+    // Each expected value is the character's ASCII code minus 33.
+    EXPECT_EQ(0, fromBang);
+    EXPECT_EQ(33, fromB);
+    EXPECT_EQ(42, fromK);
+    EXPECT_EQ(93, fromTilde);
+}
+
+TEST(QualityValuesTest, Default)
+{  // a default-constructed container is empty and encodes to an empty FASTQ string
+    const QualityValues emptyQvs;
+    EXPECT_EQ(std::string(), emptyQvs.Fastq());
+    EXPECT_TRUE(emptyQvs.empty());
+}
+
+TEST(QualityValuesTest, FromNumbers)
+{
+    // Numeric values appended one at a time must encode to the matching FASTQ string.
+    const std::vector<uint8_t> rawValues{93, 93, 93, 42, 42, 33, 33, 0, 0};
+    const std::string expectedFastq{"~~~KKBB!!"};
+    QualityValues qvs;
+    for (const auto rawQv : rawValues)
+        qvs.push_back(rawQv);
+    EXPECT_EQ(expectedFastq, qvs.Fastq());
+}
+
+TEST(QualityValuesTest, FromFastq)
+{
+    // Decoding a FASTQ string must give one numeric value per character, in order.
+    const std::vector<uint8_t> expectedValues{93, 93, 93, 42, 42, 33, 33, 0, 0};
+    const std::string encoded{"~~~KKBB!!"};
+    const QualityValues qvs = QualityValues::FromFastq(encoded);
+    EXPECT_EQ(encoded.size(), qvs.size());
+    EXPECT_EQ(expectedValues.size(), qvs.size());
+    for (size_t pos = 0; pos < encoded.size(); ++pos)
+        EXPECT_EQ(expectedValues.at(pos), qvs.at(pos));
+}
diff --git a/tests/src/test_ReadAccuracyQuery.cpp b/tests/src/test_ReadAccuracyQuery.cpp

new file mode 100644 (file)

index 0000000..f7b1f6d
--- /dev/null
+++ b/tests/src/test_ReadAccuracyQuery.cpp
@@ -0,0 +1,42 @@
+// Author: Derek Barnett
+
+#include <string>
+
+#include <gtest/gtest.h>
+
+#include "PbbamTestData.h"
+
+#include <pbbam/ReadAccuracyQuery.h>
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+TEST(ReadAccuracyQueryTest, QueryOk)
+{  // Reported and iterated record counts must agree for two minimum-accuracy thresholds.
+    const auto bamFile = BamFile{PbbamTestsConfig::Data_Dir + std::string{"/group/test2.bam"}};
+
+    {  // accuracy >= 0.901: 4 records are expected to pass
+        ReadAccuracyQuery query(0.901, Compare::GREATER_THAN_EQUAL, bamFile);
+        const auto numReads = query.NumReads();
+        EXPECT_EQ(4, numReads);
+
+        int count = 0;
+        for (const auto& r : query) {
+            ++count;
+            EXPECT_GE(r.ReadAccuracy(), 0.901);
+        }
+        EXPECT_EQ(4, count);
+    }
+    {  // accuracy >= 0.95: no records are expected to pass
+        ReadAccuracyQuery query(0.95, Compare::GREATER_THAN_EQUAL, bamFile);
+        const auto numReads = query.NumReads();
+        EXPECT_EQ(0, numReads);
+
+        int count = 0;
+        for (const auto& r : query) {
+            ++count;
+            EXPECT_GE(r.ReadAccuracy(), 0.95);  // check against this query's own threshold
+        }
+        EXPECT_EQ(0, count);
+    }
+}
diff --git a/tests/src/test_ReadGroupInfo.cpp b/tests/src/test_ReadGroupInfo.cpp

new file mode 100644 (file)

index 0000000..67acb51
--- /dev/null
+++ b/tests/src/test_ReadGroupInfo.cpp
@@ -0,0 +1,317 @@
+// Author: Derek Barnett, Lance Hepler
+
+#include <cstddef>
+#include <cstdlib>
+#include <string>
+#include <tuple>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include "PbbamTestData.h"
+
+#include <pbbam/ReadGroupInfo.h>
+#include <pbbam/exception/BundleChemistryMappingException.h>
+#include <pbbam/exception/InvalidSequencingChemistryException.h>
+
+// clang-format off
+
+using namespace PacBio::BAM;
+
+TEST(ReadGroupInfoTest, IdFromMovieNameAndReadType)
+{
+    // Constructing from a movie name plus a read type should yield this ID.
+    const std::string expectedId{"00082ba1"};
+
+    const ReadGroupInfo readGroup("m140905_042212_sidney_c100564852550000001823085912221377_s1_X0", "HQREGION");
+
+    EXPECT_EQ(expectedId, readGroup.Id());
+}
+
+TEST(ReadGroupInfoTest, FrameCodecSetOk)
+{
+    // After setting the IPD codec, the read group should report the IPD base
+    // feature under the "ip" tag and return the codec that was set.
+    ReadGroupInfo readGroup("test");
+    readGroup.IpdCodec(FrameCodec::V1);
+
+    const bool hasIpdFeature = readGroup.HasBaseFeature(BaseFeature::IPD);
+    EXPECT_TRUE(hasIpdFeature);
+
+    EXPECT_EQ("ip", readGroup.BaseFeatureTag(BaseFeature::IPD));
+    EXPECT_EQ(FrameCodec::V1, readGroup.IpdCodec());
+}
+
+TEST(ReadGroupInfoTest, SequencingChemistryOk)
+{
+    // For each chemistry below, checks that the static lookup
+    // (SequencingChemistryFromTriple) and the per-read-group accessor
+    // (SequencingChemistry) both resolve the same
+    // (binding kit, sequencing kit, basecaller version) triple to the expected name.
+    {   // S/P1-C1/beta
+        const std::string chem{"S/P1-C1/beta"};
+        EXPECT_EQ(chem, ReadGroupInfo::SequencingChemistryFromTriple("100-619-300","100-620-000","3.0"));
+        EXPECT_EQ(chem, ReadGroupInfo::SequencingChemistryFromTriple("100-619-300","100-620-000","3.1"));
+
+        ReadGroupInfo rg("dummy");
+        rg.BindingKit("100-619-300")
+          .SequencingKit("100-620-000")
+          .BasecallerVersion("3.0");
+        EXPECT_EQ(chem, rg.SequencingChemistry());
+    }
+
+    {   // S/P1-C1.1 (Echidna)
+        const std::string chem{"S/P1-C1.1"};
+        EXPECT_EQ(chem, ReadGroupInfo::SequencingChemistryFromTriple("100-619-300","100-867-300","3.1"));
+        EXPECT_EQ(chem, ReadGroupInfo::SequencingChemistryFromTriple("100-619-300","100-867-300","3.2"));
+        EXPECT_EQ(chem, ReadGroupInfo::SequencingChemistryFromTriple("100-619-300","100-867-300","3.3"));
+
+        ReadGroupInfo rg("dummy");
+        rg.BindingKit("100-619-300")
+          .SequencingKit("100-867-300")
+          .BasecallerVersion("3.1");
+        EXPECT_EQ(chem, rg.SequencingChemistry());
+    }
+
+    {   // S/P1-C1.2 (Flea)
+        const std::string chem{"S/P1-C1.2"};
+        EXPECT_EQ(chem, ReadGroupInfo::SequencingChemistryFromTriple("100-619-300","100-902-100","3.1"));
+        EXPECT_EQ(chem, ReadGroupInfo::SequencingChemistryFromTriple("100-619-300","100-902-100","3.2"));
+        EXPECT_EQ(chem, ReadGroupInfo::SequencingChemistryFromTriple("100-619-300","100-902-100","3.3"));
+
+        ReadGroupInfo rg("dummy");
+        rg.BindingKit("100-619-300")
+          .SequencingKit("100-902-100")
+          .BasecallerVersion("3.1");
+        EXPECT_EQ(chem, rg.SequencingChemistry());
+    }
+    {   // S/P1-C1.3 (Goat)
+        const std::string chem{"S/P1-C1.3"};
+        EXPECT_EQ(chem, ReadGroupInfo::SequencingChemistryFromTriple("100-619-300","100-972-200","3.2"));
+        EXPECT_EQ(chem, ReadGroupInfo::SequencingChemistryFromTriple("100-619-300","100-972-200","3.3"));
+
+        ReadGroupInfo rg("dummy");
+        rg.BindingKit("100-619-300")
+          .SequencingKit("100-972-200")
+          .BasecallerVersion("3.3");
+        EXPECT_EQ(chem, rg.SequencingChemistry());
+    }
+}
+
+#ifdef _WIN32
+// Stand-ins for setenv()/unsetenv(), which the tests below call, built on
+// getenv_s() and _putenv_s().
+
+// Sets `name` to `value`. When `overwrite` is zero the variable is queried
+// first: if the query fails or reports a non-zero size, the query's result is
+// returned and nothing is written.
+int setenv(const char* name, const char* value, int overwrite)
+{
+    if (overwrite == 0) {
+        size_t requiredSize = 0;
+        const int queryResult = getenv_s(&requiredSize, nullptr, 0, name);
+        if (queryResult != 0 || requiredSize != 0) return queryResult;
+    }
+    return _putenv_s(name, value);
+}
+
+// Assigns an empty string to `name`.
+int unsetenv(const char* name)
+{
+    return _putenv_s(name, "");
+}
+#endif
+
+TEST(ReadGroupInfoTest, SequencingChemistryFromMappingXml)
+{
+    // Exercises chemistry lookup driven by the SMRT_CHEMISTRY_BUNDLE_DIR
+    // environment variable. Each step relies on the environment state left by
+    // the previous one, so the statements must stay in this order.
+    ReadGroupInfo rg("MAYBE");
+    rg.BindingKit("1").SequencingKit("2").BasecallerVersion("3.4");
+    // presumably the variable is not set when the test starts, so this triple is unknown
+    EXPECT_THROW(rg.SequencingChemistry(), InvalidSequencingChemistryException);
+
+    // set the magic environment variable
+    // NOTE(review): overwrite=0 keeps any value the variable already has - confirm
+    // the test environment never pre-sets it.
+    const char* varname = "SMRT_CHEMISTRY_BUNDLE_DIR";
+    EXPECT_EQ(0, setenv(varname, PbbamTestsConfig::Data_Dir.c_str(), 0));
+
+    // with the variable pointing at the test data directory, the triple resolves
+    EXPECT_EQ("FOUND", rg.SequencingChemistry());
+
+    // unset the environment variable
+    EXPECT_EQ(0, unsetenv(varname));
+
+    // test memoization: the static lookup no longer knows the triple, but rg
+    // still reports the value it resolved earlier
+    EXPECT_THROW(ReadGroupInfo::SequencingChemistryFromTriple("1", "2", "3.4"),
+                 InvalidSequencingChemistryException);
+    EXPECT_EQ("FOUND", rg.SequencingChemistry());
+
+    EXPECT_EQ(0, setenv(varname, "/dev/null", 0));
+
+    // test that a bogus SMRT_CHEMISTRY_BUNDLE_DIR throws
+    EXPECT_THROW(ReadGroupInfo::SequencingChemistryFromTriple("1", "2", "3.4"),
+                 BundleChemistryMappingException);
+
+    // leave the environment clean for later tests
+    EXPECT_EQ(0, unsetenv(varname));
+}
+
+TEST(ReadGroupInfoTest, SequencingChemistryThrowsOnBadTriple)
+{
+    // An unrecognized (binding kit, sequencing kit, basecaller version) triple
+    // must make SequencingChemistry() throw, and the exception must carry the
+    // offending values.
+
+    // check that we actually throw
+    ReadGroupInfo rg("BAD");
+    rg.BindingKit("100372700")
+      .SequencingKit("100-619-400")
+      .BasecallerVersion("2.0");
+    EXPECT_THROW(rg.SequencingChemistry(), InvalidSequencingChemistryException);
+
+    // now check thrown contents
+    try {
+        ReadGroupInfo rg2("BAD");
+        rg2.BindingKit("100372700")
+          .SequencingKit("100-619-400")
+          .BasecallerVersion("2.0");
+
+        // the lookup is what throws; without this call the catch block is never reached
+        rg2.SequencingChemistry();
+        ADD_FAILURE() << "expected InvalidSequencingChemistryException was not thrown";
+    } catch (InvalidSequencingChemistryException& e) {
+        EXPECT_EQ(std::string("100372700"),   e.BindingKit());
+        EXPECT_EQ(std::string("100-619-400"), e.SequencingKit());
+        EXPECT_EQ(std::string("2.0"),         e.BasecallerVersion());
+    }
+}
+
+TEST(ReadGroupInfoTest, BasecallerVersion)
+{
+    // Checks the exception details reported for two problematic basecaller
+    // version strings.
+    // NOTE(review): neither try block fails the test when nothing is thrown,
+    // so both cases pass silently if SequencingChemistry() returns normally -
+    // confirm whether a throw is guaranteed and, if so, assert it.
+
+    // too short
+    try {
+        ReadGroupInfo rg("dummy");
+        rg.BindingKit("100-619-300")
+          .SequencingKit("100-867-300")
+          .BasecallerVersion("3");
+        const std::string chem = rg.SequencingChemistry();
+//        ()chem;
+
+    } catch (std::runtime_error& e) {
+        EXPECT_EQ(std::string("ReadGroupInfo: basecaller version is too short: 3"), std::string(e.what()));
+    }
+
+    // The initial implementation assumed single-digit version numbers:
+    //    const std::string ver{ basecallerVersion.substr(0, 3) };
+    // so '3.299.dummy' would incorrectly be interpreted as (OK) '3.2'.
+    // The case below uses a multi-digit minor version for that reason.
+
+    try {
+        ReadGroupInfo rg("dummy");
+        rg.BindingKit("100-619-300")
+          .SequencingKit("100-867-300")
+          .BasecallerVersion("3.199.dummy");
+        const std::string chem = rg.SequencingChemistry();
+//        ()chem;
+
+    } catch (InvalidSequencingChemistryException& e) {
+        // the exception must report the full, untruncated version string
+        EXPECT_EQ("100-619-300", e.BindingKit());
+        EXPECT_EQ("100-867-300", e.SequencingKit());
+        EXPECT_EQ("3.199.dummy", e.BasecallerVersion());
+    }
+    //EXPECT_THROW(rg.SequencingChemistry(), InvalidSequencingChemistryException);
+}
+
+TEST(ReadGroupInfoTest, ClearBaseFeatures)
+{
+    // Registers several base-feature tags, then verifies that
+    // ClearBaseFeatures() leaves none of them behind.
+    ReadGroupInfo readGroup("test");
+    readGroup.BaseFeatureTag(BaseFeature::DELETION_QV,     "dq");
+    readGroup.BaseFeatureTag(BaseFeature::DELETION_TAG,    "dt");
+    readGroup.BaseFeatureTag(BaseFeature::INSERTION_QV,    "iq");
+    readGroup.BaseFeatureTag(BaseFeature::MERGE_QV,        "mq");
+    readGroup.BaseFeatureTag(BaseFeature::SUBSTITUTION_QV, "sq");
+
+    // sanity check: a registered feature is visible before clearing
+    EXPECT_TRUE(readGroup.HasBaseFeature(BaseFeature::DELETION_QV));
+    EXPECT_EQ("dq", readGroup.BaseFeatureTag(BaseFeature::DELETION_QV));
+
+    readGroup.ClearBaseFeatures();
+
+    // every feature registered above must now be gone
+    for (const auto feature : {BaseFeature::DELETION_QV,
+                               BaseFeature::DELETION_TAG,
+                               BaseFeature::INSERTION_QV,
+                               BaseFeature::MERGE_QV,
+                               BaseFeature::SUBSTITUTION_QV}) {
+        EXPECT_FALSE(readGroup.HasBaseFeature(feature));
+    }
+}
+
+TEST(ReadGroupInfoTest, RemoveBaseFeature)
+{
+    // Registers several base-feature tags, removes one, and verifies that only
+    // the removed feature disappears.
+    ReadGroupInfo readGroup("test");
+    readGroup.BaseFeatureTag(BaseFeature::DELETION_QV,     "dq");
+    readGroup.BaseFeatureTag(BaseFeature::DELETION_TAG,    "dt");
+    readGroup.BaseFeatureTag(BaseFeature::INSERTION_QV,    "iq");
+    readGroup.BaseFeatureTag(BaseFeature::MERGE_QV,        "mq");
+    readGroup.BaseFeatureTag(BaseFeature::SUBSTITUTION_QV, "sq");
+    readGroup.BaseFeatureTag(BaseFeature::PULSE_EXCLUSION, "pe");
+
+    // sanity check: the feature is present before removal
+    EXPECT_TRUE(readGroup.HasBaseFeature(BaseFeature::DELETION_QV));
+    EXPECT_EQ("dq", readGroup.BaseFeatureTag(BaseFeature::DELETION_QV));
+
+    readGroup.RemoveBaseFeature(BaseFeature::DELETION_QV);
+    EXPECT_FALSE(readGroup.HasBaseFeature(BaseFeature::DELETION_QV));
+
+    // the other registered features must be untouched
+    for (const auto feature : {BaseFeature::DELETION_TAG,
+                               BaseFeature::INSERTION_QV,
+                               BaseFeature::MERGE_QV,
+                               BaseFeature::SUBSTITUTION_QV,
+                               BaseFeature::PULSE_EXCLUSION}) {
+        EXPECT_TRUE(readGroup.HasBaseFeature(feature));
+    }
+}
+
+TEST(ReadGroupInfoTest, BaseIdFromBarcodedId)
+{
+    // For a barcoded ID, Id() returns the full string and BaseId() the part
+    // before the '/'.
+    const std::string expectedId{"00082ba1/0--1"};
+    const std::string expectedBaseId{"00082ba1"};
+
+    const ReadGroupInfo readGroup{"00082ba1/0--1"};
+
+    EXPECT_EQ(expectedId, readGroup.Id());
+    EXPECT_EQ(expectedBaseId, readGroup.BaseId());
+}
+
+TEST(ReadGroupInfoTest, BaseIdFromNonBarcodedId)
+{
+    // Without a barcode suffix, Id() and BaseId() are the same string.
+    const std::string expected{"00082ba1"};
+
+    const ReadGroupInfo readGroup{"00082ba1"};
+
+    EXPECT_EQ(expected, readGroup.Id());
+    EXPECT_EQ(expected, readGroup.BaseId());
+}
+
+TEST(ReadGroupInfoTest, BarcodeDataFromBarcodedId)
+{
+    // A read group built from "<base>/<fwd>--<rev>" exposes the base ID and
+    // both barcode indices.
+    const ReadGroupInfo readGroup{"00082ba1/0--1"};
+    EXPECT_EQ("00082ba1/0--1", readGroup.Id());
+    EXPECT_EQ("00082ba1", readGroup.BaseId());
+
+    // the pair must be present before it is dereferenced
+    const auto barcodePair = readGroup.Barcodes();
+    ASSERT_TRUE(barcodePair);
+    EXPECT_EQ(0, barcodePair->first);
+    EXPECT_EQ(1, barcodePair->second);
+
+    // the individual accessors must agree with the pair
+    EXPECT_EQ(0, readGroup.BarcodeForward().get());
+    EXPECT_EQ(1, readGroup.BarcodeReverse().get());
+}
+
+TEST(ReadGroupInfoTest, BarcodeDataFromIdPlusBarcodesCtor)
+{
+    // Passing a base ID plus a barcode pair to the constructor yields the
+    // combined "<base>/<fwd>--<rev>" ID and the same barcode indices.
+    const ReadGroupInfo readGroup{"00082ba1", std::pair<uint16_t, uint16_t>(0,1)};
+
+    EXPECT_EQ("00082ba1/0--1", readGroup.Id());
+    EXPECT_EQ("00082ba1", readGroup.BaseId());
+
+    // the pair must be present before it is dereferenced
+    const auto barcodePair = readGroup.Barcodes();
+    ASSERT_TRUE(barcodePair);
+    EXPECT_EQ(0, barcodePair->first);
+    EXPECT_EQ(1, barcodePair->second);
+
+    // the individual accessors must agree with the pair
+    EXPECT_EQ(0, readGroup.BarcodeForward().get());
+    EXPECT_EQ(1, readGroup.BarcodeReverse().get());
+}
+
+TEST(ReadGroupInfoTest, NoBarcodeDataFromNonbarcodedId)
+{
+    // Shared check: none of the three barcode accessors holds a value.
+    const auto expectNoBarcodes = [](const ReadGroupInfo& readGroup) {
+        const auto barcodePair = readGroup.Barcodes();
+        EXPECT_EQ(boost::none, barcodePair);
+        EXPECT_EQ(boost::none, readGroup.BarcodeForward());
+        EXPECT_EQ(boost::none, readGroup.BarcodeReverse());
+    };
+
+    {   // "standard" ID
+        const ReadGroupInfo standard{"00082ba1"};
+        EXPECT_EQ("00082ba1", standard.Id());
+        EXPECT_EQ("00082ba1", standard.BaseId());
+        expectNoBarcodes(standard);
+    }
+    {   // no '/' found
+        const ReadGroupInfo withoutSlash{"00082ba1.0--1"};
+        expectNoBarcodes(withoutSlash);
+    }
+}
+
+TEST(ReadGroupInfoTest, NoBarcodeDataFromEmptyId)
+{
+    // An empty ID carries no barcode information through any accessor.
+    const ReadGroupInfo emptyIdGroup{""};
+
+    const auto barcodePair = emptyIdGroup.Barcodes();
+    EXPECT_EQ(boost::none, barcodePair);
+
+    EXPECT_EQ(boost::none, emptyIdGroup.BarcodeForward());
+    EXPECT_EQ(boost::none, emptyIdGroup.BarcodeReverse());
+}
+
+TEST(ReadGroupInfoTest, ThrowsOnConstructingIdFromMalformattedBarcodeLabels)
+{
+    // Each ID has a '/' suffix that is not of the "<number>--<number>" form; construction must throw.
+    EXPECT_THROW(ReadGroupInfo{"00082ba1/0-1"}, std::runtime_error);
+    EXPECT_THROW(ReadGroupInfo{"00082ba1/0---1"}, std::runtime_error);
+    EXPECT_THROW(ReadGroupInfo{"00082ba1/0..1"}, std::runtime_error);
+    EXPECT_THROW(ReadGroupInfo{"00082ba1/0"}, std::runtime_error);
+    EXPECT_THROW(ReadGroupInfo{"00082ba1/A--B"}, std::runtime_error);
+}
+
+// clang-format on
diff --git a/tests/src/test_SamWriter.cpp b/tests/src/test_SamWriter.cpp

new file mode 100644 (file)

index 0000000..6091618
--- /dev/null
+++ b/tests/src/test_SamWriter.cpp
@@ -0,0 +1,170 @@
+// Author: Derek Barnett
+
+#include <cstdint>
+#include <fstream>
+#include <iostream>
+#include <string>
+
+#include <gtest/gtest.h>
+
+#include <pbbam/BamFile.h>
+#include <pbbam/EntireFileQuery.h>
+#include <pbbam/SamWriter.h>
+#include <pbbam/StringUtilities.h>
+#include "PbbamTestData.h"
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+TEST(SamWriterTest, HeaderOk)
+{
+    // Writes a header-only SAM file and checks that the file's text is
+    // exactly the header text the writer was given.
+
+    // setup header
+    const std::string hdrText{
+        "@HD\tVN:1.1\tSO:unknown\tpb:3.0.3\n"
+        "@RG\tID:6002b307\tPL:PACBIO\tDS:READTYPE=SUBREAD;BINDINGKIT=100-619-300;"
+        "SEQUENCINGKIT=100-619-400;BASECALLERVERSION=3.0;FRAMERATEHZ=100\t"
+        "PU:test\tPM:SEQUEL\n"};
+
+    EXPECT_NO_THROW({
+        // write header to file
+        const std::string generatedFn =
+            PbbamTestsConfig::GeneratedData_Dir + "/samwriter_hdr_only.sam";
+        {
+            // the writer lives only in this scope, so it is destroyed before the file is read back
+            const BamHeader inputHeader(hdrText);
+            SamWriter writer(generatedFn, inputHeader);
+            //            ()writer;
+        };
+
+        // check header
+        {
+            // read the whole file into one string
+            std::ifstream f(generatedFn);
+            const std::string text((std::istreambuf_iterator<char>(f)),
+                                   std::istreambuf_iterator<char>());
+            EXPECT_EQ(hdrText, text);
+        }
+
+        // clean up
+        remove(generatedFn.c_str());
+    });
+}
+
+TEST(SamWriterTest, SingleRecordOk)
+{
+    // Writes a header plus one unmapped record and checks the three resulting
+    // SAM lines (two header lines, one record line) against expected text.
+
+    // setup header
+    const std::string hdrLine1{"@HD\tVN:1.1\tSO:unknown\tpb:3.0.3"};
+    const std::string hdrLine2{
+        "@RG\tID:6002b307\tPL:PACBIO\tDS:READTYPE=SUBREAD;BINDINGKIT=100-619-300;"
+        "SEQUENCINGKIT=100-619-400;BASECALLERVERSION=3.0;FRAMERATEHZ=100\t"
+        "PU:test\tPM:SEQUEL"};
+    const std::string hdrText = hdrLine1 + "\n" + hdrLine2 + "\n";
+    const BamHeader inputHeader(hdrText);
+
+    // setup record
+    BamRecord record(inputHeader);
+    record.Impl().Name("test/100/0_5");
+    record.Impl().SetSequenceAndQualities("ACGTC", 5, "@@@@@");
+    record.Impl().CigarData("");
+    record.Impl().Bin(0);
+    record.Impl().Flag(0);
+    record.Impl().InsertSize(0);
+    record.Impl().MapQuality(0);
+    record.Impl().MatePosition(-1);
+    record.Impl().MateReferenceId(-1);
+    record.Impl().Position(-1);
+    record.Impl().ReferenceId(-1);
+    record.Impl().SetMapped(false);
+
+    // tags are assigned in arbitrary order; the expected record text below
+    // lists them sorted by tag name
+    TagCollection tags;
+    tags["zm"] = int32_t{100};
+    tags["qs"] = int32_t{0};
+    tags["qe"] = int32_t{5};
+    tags["np"] = int32_t{1};
+    tags["rq"] = static_cast<float>(0.6);
+    tags["RG"] = std::string{"6002b307"};
+    tags["sn"] = std::vector<float>{0.2f, 0.2f, 0.2f, 0.2f};
+    record.Impl().Tags(tags);
+
+    const std::string expectedSamRecord{
+        "test/100/0_5\t4\t*\t0\t0\t*\t*\t0\t0\tACGTC\t@@@@@\tRG:Z:6002b307\t"
+        "np:i:1\tqe:i:5\tqs:i:0\trq:f:0.6\tsn:B:f,0.2,0.2,0.2,0.2\tzm:i:100"};
+
+    EXPECT_NO_THROW({
+        // write data to file
+        const std::string generatedFn =
+            PbbamTestsConfig::GeneratedData_Dir + "/samwriter_hdr_and_record.sam";
+        {
+            // the writer lives only in this scope, so it is destroyed before the file is read back
+            SamWriter writer(generatedFn, inputHeader);
+            writer.Write(record);
+        };
+
+        // check header & record
+        {
+            std::ifstream f(generatedFn);
+            std::string line1;
+            std::string line2;
+            std::string line3;
+            std::getline(f, line1);
+            std::getline(f, line2);
+            std::getline(f, line3);
+            EXPECT_EQ(hdrLine1, line1);
+            EXPECT_EQ(hdrLine2, line2);
+            EXPECT_EQ(expectedSamRecord, line3);
+        }
+
+        // cleanup
+        remove(generatedFn.c_str());
+    });
+}
+
+TEST(SamWriterTest, LongCigarFormatting)
+{
+    // Converts a BAM file containing a long-CIGAR record to SAM and checks
+    // that the SAM record carries the real CIGAR in its CIGAR column.
+    const std::string longCigarFn = PacBio::BAM::PbbamTestsConfig::Data_Dir + "/long-cigar-1.7.bam";
+    const std::string samFn =
+        PacBio::BAM::PbbamTestsConfig::GeneratedData_Dir + "/long-cigar-1.7.sam";
+
+    // CIGAR of the last record read from the BAM file
+    std::string originalCigar;
+
+    // Generate SAM from long CIGAR BAM
+    {
+        const BamFile inFile{longCigarFn};
+        SamWriter writer{samFn, inFile.Header()};
+        EntireFileQuery query{inFile};
+        for (auto record : query) {
+            originalCigar = record.CigarData().ToStdString();
+            writer.Write(record);
+        }
+    }
+
+    // Verify expected output
+    {
+        std::ifstream f{samFn};
+
+        std::string line1;
+        std::string line2;
+        std::string line3;
+        std::string line4;
+
+        std::getline(f, line1);
+        std::getline(f, line2);
+        std::getline(f, line3);
+        std::getline(f, line4);
+
+        // the first three lines are header lines; the fourth is the record
+        EXPECT_EQ(0, line1.find("@HD"));
+        EXPECT_EQ(0, line2.find("@SQ"));
+        EXPECT_EQ(0, line3.find("@PG"));
+
+        // This is the _literal_ value stored in the CIGAR field for this
+        // long-CIGAR record. The real CIGAR data is stored in the "CG" tag.
+        //
+        // Neither that literal value nor the CG tag belongs in SAM;
+        // the CIGAR data should be placed in the standard SAM field.
+        //
+        EXPECT_EQ(std::string::npos, line4.find("457350S497223N"));
+        EXPECT_EQ(std::string::npos, line4.find("CG:B:I,"));
+
+        // the record must have exactly 11 fields, and field index 5 must hold the CIGAR
+        const auto fields = PacBio::BAM::Split(line4);
+        ASSERT_EQ(11, fields.size());
+        EXPECT_EQ(originalCigar, fields.at(5));
+    }
+}
diff --git a/tests/src/test_SequenceUtils.cpp b/tests/src/test_SequenceUtils.cpp

new file mode 100644 (file)

index 0000000..0c6543c
--- /dev/null
+++ b/tests/src/test_SequenceUtils.cpp
@@ -0,0 +1,80 @@
+// Author: Derek Barnett
+
+#include <climits>
+#include <string>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include <pbbam/../../src/SequenceUtils.h>
+
+TEST(SequenceUtilsTest, ComplementChar)
+{
+    using PacBio::BAM::Complement;
+
+    // Every upper-case letter A-Z is checked: letters with a complement map
+    // to it, all other letters yield 0.
+    EXPECT_EQ('T', Complement('A'));
+    EXPECT_EQ('V', Complement('B'));
+    EXPECT_EQ('G', Complement('C'));
+    EXPECT_EQ('H', Complement('D'));
+    EXPECT_EQ(0, Complement('E'));
+    EXPECT_EQ(0, Complement('F'));
+    EXPECT_EQ('C', Complement('G'));
+    EXPECT_EQ('D', Complement('H'));
+    EXPECT_EQ(0, Complement('I'));
+    EXPECT_EQ(0, Complement('J'));
+    EXPECT_EQ('M', Complement('K'));
+    EXPECT_EQ(0, Complement('L'));
+    EXPECT_EQ('K', Complement('M'));
+    EXPECT_EQ('N', Complement('N'));
+    EXPECT_EQ(0, Complement('O'));
+    EXPECT_EQ(0, Complement('P'));
+    EXPECT_EQ(0, Complement('Q'));
+    EXPECT_EQ('Y', Complement('R'));
+    EXPECT_EQ('S', Complement('S'));
+    EXPECT_EQ('A', Complement('T'));
+    EXPECT_EQ('A', Complement('U'));
+    EXPECT_EQ('B', Complement('V'));
+    EXPECT_EQ('W', Complement('W'));
+    EXPECT_EQ(0, Complement('X'));
+    EXPECT_EQ('R', Complement('Y'));
+    EXPECT_EQ(0, Complement('Z'));
+}
+
+TEST(SequenceUtilsTest, ReverseComplement)
+{
+    using PacBio::BAM::ReverseComplement;
+
+    // The sequence is passed as a mutable string and inspected afterwards.
+    const std::string expected{"CGCCGGGATATAT"};
+    std::string sequence{"ATATATCCCGGCG"};
+
+    ReverseComplement(sequence);
+
+    EXPECT_EQ(expected, sequence);
+}
diff --git a/tests/src/test_StringUtils.cpp b/tests/src/test_StringUtils.cpp

new file mode 100644 (file)

index 0000000..7ae5af0
--- /dev/null
+++ b/tests/src/test_StringUtils.cpp
@@ -0,0 +1,81 @@
+// Author: Derek Barnett
+
+#include <gtest/gtest.h>
+
+#include <pbbam/StringUtilities.h>
+
+TEST(StringUtilsTest, BasicSplitWithDefaultDelim)
+{
+    using PacBio::BAM::Split;
+
+    // With no delimiter argument, tab-separated input yields one token per field.
+    const std::string input{"foo\tbar\tbaz"};
+    const char* const expected[] = {"foo", "bar", "baz"};
+
+    const auto fields = Split(input);
+
+    EXPECT_EQ(3, fields.size());
+    for (size_t i = 0; i < 3; ++i)
+        EXPECT_TRUE(fields.at(i) == expected[i]);
+}
+
+TEST(StringUtilsTest, BasicSplitWithProvidedDelim)
+{
+    using PacBio::BAM::Split;
+
+    // An explicit ':' delimiter splits colon-separated input into its fields.
+    const std::string input{"foo:bar:baz"};
+    const char* const expected[] = {"foo", "bar", "baz"};
+
+    const auto fields = Split(input, ':');
+
+    EXPECT_EQ(3, fields.size());
+    for (size_t i = 0; i < 3; ++i)
+        EXPECT_TRUE(fields.at(i) == expected[i]);
+}
+
+TEST(StringUtilsTest, SplitEmptyStringReturnsEmptyResult)
+{
+    using PacBio::BAM::Split;
+
+    // Splitting an empty string produces no tokens at all.
+    const std::string emptyInput;
+
+    const auto fields = Split(emptyInput);
+
+    EXPECT_TRUE(fields.empty());
+}
+
+TEST(StringUtilsTest, SplitKeepsEmptyTokens)
+{
+    using PacBio::BAM::Split;
+
+    // Two adjacent delimiters produce an empty token rather than being merged.
+    const std::string input{"foo\tbar\t\tbaz"};
+    const char* const expected[] = {"foo", "bar", "", "baz"};
+
+    const auto fields = Split(input);
+
+    EXPECT_EQ(4, fields.size());
+    for (size_t i = 0; i < 4; ++i)
+        EXPECT_TRUE(fields.at(i) == expected[i]);
+}
+
+TEST(StringUtilsTest, RemoveWhitespaceNormal)
+{
+    using PacBio::BAM::RemoveAllWhitespace;
+
+    // The same text is passed once as a named string and once as a temporary;
+    // both calls must strip every whitespace character.
+    const std::string expected{"Loremipsumdolorsitamet"};
+
+    {  // lvalue
+        const std::string text{" \f\r\v  Lorem ipsum     \tdolor sit\n\namet "};
+        const auto stripped = RemoveAllWhitespace(text);
+        EXPECT_EQ(expected, stripped);
+    }
+    {  // rvalue
+        const auto stripped = RemoveAllWhitespace(" \f\r\v  Lorem ipsum     \tdolor sit\n\namet ");
+        EXPECT_EQ(expected, stripped);
+    }
+}
+
+TEST(StringUtilsTest, RemoveWhitespaceOnEmptyString)
+{
+    using PacBio::BAM::RemoveAllWhitespace;
+
+    // Empty input, whether a named string or a temporary, gives an empty result.
+    {  // lvalue
+        const std::string emptyText;
+        const auto stripped = RemoveAllWhitespace(emptyText);
+        EXPECT_TRUE(stripped.empty());
+    }
+    {  // rvalue
+        const auto stripped = RemoveAllWhitespace("");
+        EXPECT_TRUE(stripped.empty());
+    }
+}
diff --git a/tests/src/test_SubreadLengthQuery.cpp b/tests/src/test_SubreadLengthQuery.cpp

new file mode 100644 (file)

index 0000000..81c6791
--- /dev/null
+++ b/tests/src/test_SubreadLengthQuery.cpp
@@ -0,0 +1,54 @@
+// Author: Derek Barnett
+
+#include <string>
+
+#include <gtest/gtest.h>
+
+#include "PbbamTestData.h"
+
+#include <pbbam/SubreadLengthQuery.h>
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+TEST(SubreadLengthQueryTest, QueryOk)
+{
+    const auto bamFile = BamFile{PbbamTestsConfig::Data_Dir + std::string{"/group/test2.bam"}};
+
+    // Runs a "length >= minLength" query and checks that NumReads(), the
+    // number of iterated records, and every record's query span agree.
+    const auto expectReadsAtLeast = [&bamFile](const int minLength, const int expectedReads) {
+        SubreadLengthQuery query(minLength, Compare::GREATER_THAN_EQUAL, bamFile);
+        const auto numReads = query.NumReads();
+        EXPECT_EQ(expectedReads, numReads);
+
+        int seen = 0;
+        for (const auto& record : query) {
+            ++seen;
+            EXPECT_GE((record.QueryEnd() - record.QueryStart()), minLength);
+        }
+        EXPECT_EQ(expectedReads, seen);
+    };
+
+    expectReadsAtLeast(500, 3);
+    expectReadsAtLeast(1000, 2);
+    expectReadsAtLeast(5000, 0);
+}
diff --git a/tests/src/test_Tags.cpp b/tests/src/test_Tags.cpp

new file mode 100644 (file)

index 0000000..dd6788b
--- /dev/null
+++ b/tests/src/test_Tags.cpp
@@ -0,0 +1,1115 @@
+// Author: Derek Barnett
+
+#include <algorithm>
+#include <cstdint>
+#include <iostream>
+#include <limits>
+#include <map>
+#include <sstream>
+#include <string>
+#include <typeinfo>
+#include <vector>
+
+#include <gtest/gtest.h>
+#include <boost/type_traits/is_convertible.hpp>
+
+#include <pbbam/BamTagCodec.h>
+#include <pbbam/SamTagCodec.h>
+#include <pbbam/TagCollection.h>
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+TEST(TagTest, TagConstruction)
+{
+    // Builds a Tag from each supported scalar, string, and array type and
+    // checks that Type() reports the matching TagDataType.
+
+    // source values, one per supported type
+    int8_t i8 = 0;
+    uint8_t u8 = 0;
+    int16_t i16 = 0;
+    uint16_t u16 = 0;
+    int32_t i32 = 0;
+    uint32_t u32 = 0;
+    float f = 0.0;
+    std::string str = "";
+    std::vector<int8_t> i8_array;
+    std::vector<uint8_t> u8_array;
+    std::vector<int16_t> i16_array;
+    std::vector<uint16_t> u16_array;
+    std::vector<int32_t> i32_array;
+    std::vector<uint32_t> u32_Array;
+    std::vector<float> float_array;
+
+    // character values, used for the ASCII-modifier tags below
+    signed char c = 'A';
+    unsigned char uc = 'A';
+
+    Tag i8Tag(i8);
+    Tag u8Tag(u8);
+    Tag i16Tag(i16);
+    Tag u16Tag(u16);
+    Tag i32Tag(i32);
+    Tag u32Tag(u32);
+    Tag floatTag(f);
+    Tag stringTag(str);
+    Tag i8_array_Tag(i8_array);
+    Tag u8_array_Tag(u8_array);
+    Tag i16_array_Tag(i16_array);
+    Tag u16_array_Tag(u16_array);
+    Tag i32_array_Tag(i32_array);
+    Tag u32_array_Tag(u32_Array);
+    Tag float_array_Tag(float_array);
+
+    Tag charTag(c, TagModifier::ASCII_CHAR);
+    Tag ucharTag(uc, TagModifier::ASCII_CHAR);
+
+    // each tag must report the data type of the value it was built from
+    EXPECT_TRUE(i8Tag.Type() == TagDataType::INT8);
+    EXPECT_TRUE(u8Tag.Type() == TagDataType::UINT8);
+    EXPECT_TRUE(i16Tag.Type() == TagDataType::INT16);
+    EXPECT_TRUE(u16Tag.Type() == TagDataType::UINT16);
+    EXPECT_TRUE(i32Tag.Type() == TagDataType::INT32);
+    EXPECT_TRUE(u32Tag.Type() == TagDataType::UINT32);
+    EXPECT_TRUE(floatTag.Type() == TagDataType::FLOAT);
+    EXPECT_TRUE(stringTag.Type() == TagDataType::STRING);
+    EXPECT_TRUE(i8_array_Tag.Type() == TagDataType::INT8_ARRAY);
+    EXPECT_TRUE(u8_array_Tag.Type() == TagDataType::UINT8_ARRAY);
+    EXPECT_TRUE(i16_array_Tag.Type() == TagDataType::INT16_ARRAY);
+    EXPECT_TRUE(u16_array_Tag.Type() == TagDataType::UINT16_ARRAY);
+    EXPECT_TRUE(i32_array_Tag.Type() == TagDataType::INT32_ARRAY);
+    EXPECT_TRUE(u32_array_Tag.Type() == TagDataType::UINT32_ARRAY);
+    EXPECT_TRUE(float_array_Tag.Type() == TagDataType::FLOAT_ARRAY);
+
+    // ASCII-modified tags must give back the original character
+    EXPECT_TRUE(charTag.ToAscii() == 'A');
+    EXPECT_TRUE(ucharTag.ToAscii() == 'A');
+}
+
+TEST(TagTest, CopyAndCompare)
+{
+    // Copy-constructs a Tag of every supported type and checks that each copy
+    // compares equal to its source.
+
+    // source values, one per supported type
+    int8_t i8 = 0;
+    uint8_t u8 = 0;
+    int16_t i16 = 0;
+    uint16_t u16 = 0;
+    int32_t i32 = 0;
+    uint32_t u32 = 0;
+    float f = 0.0;
+    std::string str = "";
+    std::vector<int8_t> i8_array;
+    std::vector<uint8_t> u8_array;
+    std::vector<int16_t> i16_array;
+    std::vector<uint16_t> u16_array;
+    std::vector<int32_t> i32_array;
+    std::vector<uint32_t> u32_Array;
+    std::vector<float> float_array;
+
+    // original tags
+    Tag i8Tag(i8);
+    Tag u8Tag(u8);
+    Tag i16Tag(i16);
+    Tag u16Tag(u16);
+    Tag i32Tag(i32);
+    Tag u32Tag(u32);
+    Tag floatTag(f);
+    Tag stringTag(str);
+    Tag i8_array_Tag(i8_array);
+    Tag u8_array_Tag(u8_array);
+    Tag i16_array_Tag(i16_array);
+    Tag u16_array_Tag(u16_array);
+    Tag i32_array_Tag(i32_array);
+    Tag u32_array_Tag(u32_Array);
+    Tag float_array_Tag(float_array);
+
+    // copies
+    Tag i8Tag2 = i8Tag;
+    Tag u8Tag2 = u8Tag;
+    Tag i16Tag2 = i16Tag;
+    Tag u16Tag2 = u16Tag;
+    Tag i32Tag2 = i32Tag;
+    Tag u32Tag2 = u32Tag;
+    Tag floatTag2 = floatTag;
+    Tag stringTag2 = stringTag;
+    Tag i8_array_Tag2 = i8_array_Tag;
+    Tag u8_array_Tag2 = u8_array_Tag;
+    Tag i16_array_Tag2 = i16_array_Tag;
+    Tag u16_array_Tag2 = u16_array_Tag;
+    Tag i32_array_Tag2 = i32_array_Tag;
+    Tag u32_array_Tag2 = u32_array_Tag;
+    Tag float_array_Tag2 = float_array_Tag;
+
+    // every copy must equal its original
+    EXPECT_EQ(i8Tag, i8Tag2);
+    EXPECT_EQ(u8Tag, u8Tag2);
+    EXPECT_EQ(i16Tag, i16Tag2);
+    EXPECT_EQ(u16Tag, u16Tag2);
+    EXPECT_EQ(i32Tag, i32Tag2);
+    EXPECT_EQ(u32Tag, u32Tag2);
+    EXPECT_EQ(floatTag, floatTag2);
+    EXPECT_EQ(stringTag, stringTag2);
+    EXPECT_EQ(i8_array_Tag, i8_array_Tag2);
+    EXPECT_EQ(u8_array_Tag, u8_array_Tag2);
+    EXPECT_EQ(i16_array_Tag, i16_array_Tag2);
+    EXPECT_EQ(u16_array_Tag, u16_array_Tag2);
+    EXPECT_EQ(i32_array_Tag, i32_array_Tag2);
+    EXPECT_EQ(u32_array_Tag, u32_array_Tag2);
+    EXPECT_EQ(float_array_Tag, float_array_Tag2);
+}
+
+TEST(TagTest, Type_None)
+{
+    // A default-constructed Tag is null: invalid type, typename "none", and
+    // not numeric, string, or array.
+    Tag emptyTag;
+
+    // what it is
+    EXPECT_TRUE(emptyTag.Type() == TagDataType::INVALID);
+    EXPECT_TRUE(emptyTag.IsNull());
+    EXPECT_TRUE(emptyTag.Typename() == "none");
+
+    // what it is not
+    EXPECT_FALSE(emptyTag.IsNumeric());
+    EXPECT_FALSE(emptyTag.IsString());
+    EXPECT_FALSE(emptyTag.IsArray());
+}
+
+TEST(TagTest, Type_Int8)
+{
+    // A Tag built from an int8_t classifies itself as a signed integral type
+    // and returns the stored value unchanged.
+    const int8_t stored = -42;
+    const Tag tag(stored);
+
+    // fetch the value back; the conversion itself must not throw
+    int8_t fetched{};
+    EXPECT_NO_THROW(fetched = tag.ToInt8());
+
+    // exact type
+    EXPECT_TRUE(tag.Type() == TagDataType::INT8);
+    EXPECT_TRUE(tag.Typename() == "int8_t");
+    EXPECT_TRUE(tag.IsInt8());
+
+    // categories it belongs to
+    EXPECT_TRUE(tag.IsSignedInt());
+    EXPECT_TRUE(tag.IsIntegral());
+    EXPECT_TRUE(tag.IsNumeric());
+
+    // categories it does not belong to
+    EXPECT_FALSE(tag.IsUnsignedInt());
+    EXPECT_FALSE(tag.IsNull());
+    EXPECT_FALSE(tag.IsFloat());
+    EXPECT_FALSE(tag.IsString());
+    EXPECT_FALSE(tag.IsArray());
+
+    // round trip
+    EXPECT_EQ(stored, fetched);
+}
+
+TEST(TagTest, Type_UInt8)
+{
+    // A Tag built from a uint8_t classifies itself as an unsigned integral
+    // type and returns the stored value unchanged.
+    const uint8_t stored = 42;
+    const Tag tag(stored);
+
+    // fetch the value back; the conversion itself must not throw
+    uint8_t fetched{};
+    EXPECT_NO_THROW(fetched = tag.ToUInt8());
+
+    // exact type
+    EXPECT_TRUE(tag.Type() == TagDataType::UINT8);
+    EXPECT_TRUE(tag.Typename() == "uint8_t");
+    EXPECT_TRUE(tag.IsUInt8());
+
+    // categories it belongs to
+    EXPECT_TRUE(tag.IsUnsignedInt());
+    EXPECT_TRUE(tag.IsIntegral());
+    EXPECT_TRUE(tag.IsNumeric());
+
+    // categories it does not belong to
+    EXPECT_FALSE(tag.IsSignedInt());
+    EXPECT_FALSE(tag.IsNull());
+    EXPECT_FALSE(tag.IsFloat());
+    EXPECT_FALSE(tag.IsString());
+    EXPECT_FALSE(tag.IsArray());
+
+    // round trip
+    EXPECT_EQ(stored, fetched);
+}
+
+TEST(TagTest, Type_Ascii)
+{
+    // Checks ASCII-character tags built from every 8-bit source type, using
+    // both ways of applying the ASCII_CHAR modifier, and checks that an
+    // unsuitable modifier is rejected.
+    const char c = '$';
+    const signed char sc = '$';
+    const unsigned char uc = '$';
+    const uint8_t u8 = 65;  // expected to read back as 'A'
+    const int8_t i8 = 66;   // expected to read back as 'B'
+
+    {  // old style: construct-then-modify
+
+        Tag fromPlainChar = Tag(c);
+        Tag fromSignedChar = Tag(sc);
+        Tag fromUnsignedChar = Tag(uc);
+        Tag fromUint8 = Tag(u8);
+        Tag fromInt8 = Tag(i8);
+        fromPlainChar.Modifier(TagModifier::ASCII_CHAR);
+        fromSignedChar.Modifier(TagModifier::ASCII_CHAR);
+        fromUnsignedChar.Modifier(TagModifier::ASCII_CHAR);
+        fromUint8.Modifier(TagModifier::ASCII_CHAR);
+        fromInt8.Modifier(TagModifier::ASCII_CHAR);
+
+        // each tag keeps its numeric classification and yields the character
+        EXPECT_TRUE(fromPlainChar.HasModifier(TagModifier::ASCII_CHAR));
+        EXPECT_TRUE(fromPlainChar.IsIntegral());
+        EXPECT_TRUE(fromPlainChar.IsNumeric());
+        EXPECT_EQ('$', fromPlainChar.ToAscii());
+
+        EXPECT_TRUE(fromSignedChar.HasModifier(TagModifier::ASCII_CHAR));
+        EXPECT_TRUE(fromSignedChar.IsIntegral());
+        EXPECT_TRUE(fromSignedChar.IsNumeric());
+        EXPECT_EQ('$', fromSignedChar.ToAscii());
+
+        EXPECT_TRUE(fromUnsignedChar.HasModifier(TagModifier::ASCII_CHAR));
+        EXPECT_TRUE(fromUnsignedChar.IsIntegral());
+        EXPECT_TRUE(fromUnsignedChar.IsNumeric());
+        EXPECT_EQ('$', fromUnsignedChar.ToAscii());
+
+        EXPECT_TRUE(fromUint8.HasModifier(TagModifier::ASCII_CHAR));
+        EXPECT_TRUE(fromUint8.IsIntegral());
+        EXPECT_TRUE(fromUint8.IsNumeric());
+        EXPECT_EQ('A', fromUint8.ToAscii());
+
+        EXPECT_TRUE(fromInt8.HasModifier(TagModifier::ASCII_CHAR));
+        EXPECT_TRUE(fromInt8.IsIntegral());
+        EXPECT_TRUE(fromInt8.IsNumeric());
+        EXPECT_EQ('B', fromInt8.ToAscii());
+    }
+
+    {  // new style: construct directly as ASCII
+
+        const Tag fromPlainChar = Tag(c, TagModifier::ASCII_CHAR);
+        const Tag fromSignedChar = Tag(sc, TagModifier::ASCII_CHAR);
+        const Tag fromUnsignedChar = Tag(uc, TagModifier::ASCII_CHAR);
+        const Tag fromUint8 = Tag(u8, TagModifier::ASCII_CHAR);
+        const Tag fromInt8 = Tag(i8, TagModifier::ASCII_CHAR);
+
+        // same expectations as the construct-then-modify case above
+        EXPECT_TRUE(fromPlainChar.HasModifier(TagModifier::ASCII_CHAR));
+        EXPECT_TRUE(fromPlainChar.IsIntegral());
+        EXPECT_TRUE(fromPlainChar.IsNumeric());
+        EXPECT_EQ('$', fromPlainChar.ToAscii());
+
+        EXPECT_TRUE(fromSignedChar.HasModifier(TagModifier::ASCII_CHAR));
+        EXPECT_TRUE(fromSignedChar.IsIntegral());
+        EXPECT_TRUE(fromSignedChar.IsNumeric());
+        EXPECT_EQ('$', fromSignedChar.ToAscii());
+
+        EXPECT_TRUE(fromUnsignedChar.HasModifier(TagModifier::ASCII_CHAR));
+        EXPECT_TRUE(fromUnsignedChar.IsIntegral());
+        EXPECT_TRUE(fromUnsignedChar.IsNumeric());
+        EXPECT_EQ('$', fromUnsignedChar.ToAscii());
+
+        EXPECT_TRUE(fromUint8.HasModifier(TagModifier::ASCII_CHAR));
+        EXPECT_TRUE(fromUint8.IsIntegral());
+        EXPECT_TRUE(fromUint8.IsNumeric());
+        EXPECT_EQ('A', fromUint8.ToAscii());
+
+        EXPECT_TRUE(fromInt8.HasModifier(TagModifier::ASCII_CHAR));
+        EXPECT_TRUE(fromInt8.IsIntegral());
+        EXPECT_TRUE(fromInt8.IsNumeric());
+        EXPECT_EQ('B', fromInt8.ToAscii());
+    }
+
+    // check invalid constructs: HEX_STRING is not accepted for a character value
+    EXPECT_THROW(Tag('A', TagModifier::HEX_STRING), std::runtime_error);
+}
+
+TEST(TagTest, Type_Int16)
+{  // int16_t round-trips through Tag and is classified as a signed integral numeric
+    const int16_t original = -42;
+    const Tag tag(original);
+
+    int16_t roundTripped{};
+    EXPECT_NO_THROW(roundTripped = tag.ToInt16());
+
+    EXPECT_TRUE(tag.IsInt16());
+    EXPECT_TRUE(tag.Type() == TagDataType::INT16);
+    EXPECT_TRUE(tag.Typename() == "int16_t");
+    EXPECT_TRUE(tag.IsNumeric());
+    EXPECT_TRUE(tag.IsIntegral());
+    EXPECT_TRUE(tag.IsSignedInt());
+
+    EXPECT_FALSE(tag.IsArray());
+    EXPECT_FALSE(tag.IsString());
+    EXPECT_FALSE(tag.IsFloat());
+    EXPECT_FALSE(tag.IsNull());
+    EXPECT_FALSE(tag.IsUnsignedInt());
+
+    EXPECT_EQ(original, roundTripped);
+}
+
+TEST(TagTest, Type_UInt16)
+{  // uint16_t round-trips through Tag and is classified as an unsigned integral numeric
+    const uint16_t v = 42;
+    const Tag tag(v);
+    // value-initialized: EXPECT_NO_THROW is non-fatal, so v2 is still compared after a throw
+    uint16_t v2{};
+    EXPECT_NO_THROW(v2 = tag.ToUInt16());
+
+    EXPECT_TRUE(tag.Type() == TagDataType::UINT16);
+    EXPECT_TRUE(tag.Typename() == "uint16_t");
+    EXPECT_TRUE(tag.IsUInt16());
+    EXPECT_TRUE(tag.IsUnsignedInt());
+    EXPECT_TRUE(tag.IsIntegral());
+    EXPECT_TRUE(tag.IsNumeric());
+
+    EXPECT_FALSE(tag.IsSignedInt());
+    EXPECT_FALSE(tag.IsNull());
+    EXPECT_FALSE(tag.IsFloat());
+    EXPECT_FALSE(tag.IsString());
+    EXPECT_FALSE(tag.IsArray());
+
+    EXPECT_EQ(v, v2);
+}
+
+TEST(TagTest, Type_Int32)
+{  // int32_t round-trips through Tag and is classified as a signed integral numeric
+    const int32_t v = -42;
+    const Tag tag(v);
+    // value-initialized: EXPECT_NO_THROW is non-fatal, so v2 is still compared after a throw
+    int32_t v2{};
+    EXPECT_NO_THROW(v2 = tag.ToInt32());
+
+    EXPECT_TRUE(tag.Type() == TagDataType::INT32);
+    EXPECT_TRUE(tag.Typename() == "int32_t");
+    EXPECT_TRUE(tag.IsInt32());
+    EXPECT_TRUE(tag.IsSignedInt());
+    EXPECT_TRUE(tag.IsIntegral());
+    EXPECT_TRUE(tag.IsNumeric());
+
+    EXPECT_FALSE(tag.IsUnsignedInt());
+    EXPECT_FALSE(tag.IsNull());
+    EXPECT_FALSE(tag.IsFloat());
+    EXPECT_FALSE(tag.IsString());
+    EXPECT_FALSE(tag.IsArray());
+
+    EXPECT_EQ(v, v2);
+}
+
+TEST(TagTest, Type_UInt32)
+{  // uint32_t round-trips through Tag and is classified as an unsigned integral numeric
+    const uint32_t v = 42;
+    const Tag tag(v);
+    // value-initialized: EXPECT_NO_THROW is non-fatal, so v2 is still compared after a throw
+    uint32_t v2{};
+    EXPECT_NO_THROW(v2 = tag.ToUInt32());
+
+    EXPECT_TRUE(tag.Type() == TagDataType::UINT32);
+    EXPECT_TRUE(tag.Typename() == "uint32_t");
+    EXPECT_TRUE(tag.IsUInt32());
+    EXPECT_TRUE(tag.IsUnsignedInt());
+    EXPECT_TRUE(tag.IsIntegral());
+    EXPECT_TRUE(tag.IsNumeric());
+
+    EXPECT_FALSE(tag.IsSignedInt());
+    EXPECT_FALSE(tag.IsNull());
+    EXPECT_FALSE(tag.IsFloat());
+    EXPECT_FALSE(tag.IsString());
+    EXPECT_FALSE(tag.IsArray());
+
+    EXPECT_EQ(v, v2);
+}
+
+TEST(TagTest, Type_Float)
+{  // float round-trips through Tag and is classified as a non-integral numeric
+    const float v = 3.141;
+    const Tag tag(v);
+    // value-initialized: EXPECT_NO_THROW is non-fatal, so v2 is still compared after a throw
+    float v2{};
+    EXPECT_NO_THROW(v2 = tag.ToFloat());
+
+    EXPECT_TRUE(tag.Type() == TagDataType::FLOAT);
+    EXPECT_TRUE(tag.Typename() == "float");
+    EXPECT_TRUE(tag.IsFloat());
+    EXPECT_TRUE(tag.IsNumeric());
+
+    EXPECT_FALSE(tag.IsNull());
+    EXPECT_FALSE(tag.IsIntegral());
+    EXPECT_FALSE(tag.IsString());
+    EXPECT_FALSE(tag.IsArray());
+
+    EXPECT_EQ(v, v2);
+}
+
+TEST(TagTest, Type_String)
+{
+    const std::string v = "foo_who";
+    const Tag tag(v);
+
+    std::string v2;
+    EXPECT_NO_THROW(v2 = tag.ToString());
+
+    EXPECT_TRUE(tag.Type() == TagDataType::STRING);
+    EXPECT_TRUE(tag.Typename() == "string");
+    EXPECT_TRUE(tag.IsString());
+
+    EXPECT_FALSE(tag.IsNull());
+    EXPECT_FALSE(tag.IsNumeric());
+    EXPECT_FALSE(tag.IsArray());
+
+    EXPECT_EQ(v, v2);
+
+    // "Hex format" string
+    const Tag hex("DEADBEEF", TagModifier::HEX_STRING);
+    EXPECT_TRUE(hex.Type() == TagDataType::STRING);
+    EXPECT_TRUE(hex.Typename() == "string");
+    EXPECT_TRUE(hex.IsString());
+    EXPECT_TRUE(hex.HasModifier(TagModifier::HEX_STRING));
+    EXPECT_FALSE(hex.IsNull());
+    EXPECT_FALSE(hex.IsNumeric());
+    EXPECT_FALSE(hex.IsArray());
+
+    // check invalid constructs
+    EXPECT_THROW(Tag("DEADBEEF", TagModifier::ASCII_CHAR), std::runtime_error);
+}
+
+TEST(TagTest, Type_Int8Array)
+{  // vector<int8_t> round-trips through Tag and is classified as a signed integral array
+    const std::vector<int8_t> original = {-42, 100, 0};
+    const Tag tag(original);
+
+    std::vector<int8_t> roundTripped;
+    EXPECT_NO_THROW(roundTripped = tag.ToInt8Array());
+
+    EXPECT_TRUE(tag.IsInt8Array());
+    EXPECT_TRUE(tag.Type() == TagDataType::INT8_ARRAY);
+    EXPECT_TRUE(tag.Typename() == "vector<int8_t>");
+    EXPECT_TRUE(tag.IsArray());
+    EXPECT_TRUE(tag.IsIntegralArray());
+    EXPECT_TRUE(tag.IsSignedArray());
+
+    EXPECT_FALSE(tag.IsNumeric());
+    EXPECT_FALSE(tag.IsNull());
+    EXPECT_FALSE(tag.IsString());
+    EXPECT_FALSE(tag.IsFloat());
+
+    EXPECT_EQ(original, roundTripped);
+}
+
+TEST(TagTest, Type_UInt8Array)
+{  // vector<uint8_t> round-trips through Tag and is classified as an unsigned integral array
+    const std::vector<uint8_t> original = {42, 200, 0};
+    const Tag tag(original);
+
+    std::vector<uint8_t> roundTripped;
+    EXPECT_NO_THROW(roundTripped = tag.ToUInt8Array());
+
+    EXPECT_TRUE(tag.IsUInt8Array());
+    EXPECT_TRUE(tag.Type() == TagDataType::UINT8_ARRAY);
+    EXPECT_TRUE(tag.Typename() == "vector<uint8_t>");
+    EXPECT_TRUE(tag.IsArray());
+    EXPECT_TRUE(tag.IsIntegralArray());
+    EXPECT_TRUE(tag.IsUnsignedArray());
+
+    EXPECT_FALSE(tag.IsNumeric());
+    EXPECT_FALSE(tag.IsNull());
+    EXPECT_FALSE(tag.IsString());
+    EXPECT_FALSE(tag.IsFloat());
+
+    EXPECT_EQ(original, roundTripped);
+}
+
+TEST(TagTest, Type_Int16Array)
+{  // vector<int16_t> round-trips through Tag and is classified as a signed integral array
+    const std::vector<int16_t> original = {42, -300, 0};
+    const Tag tag(original);
+
+    std::vector<int16_t> roundTripped;
+    EXPECT_NO_THROW(roundTripped = tag.ToInt16Array());
+
+    EXPECT_TRUE(tag.IsInt16Array());
+    EXPECT_TRUE(tag.Type() == TagDataType::INT16_ARRAY);
+    EXPECT_TRUE(tag.Typename() == "vector<int16_t>");
+    EXPECT_TRUE(tag.IsArray());
+    EXPECT_TRUE(tag.IsIntegralArray());
+    EXPECT_TRUE(tag.IsSignedArray());
+
+    EXPECT_FALSE(tag.IsNumeric());
+    EXPECT_FALSE(tag.IsNull());
+    EXPECT_FALSE(tag.IsString());
+    EXPECT_FALSE(tag.IsFloat());
+
+    EXPECT_EQ(original, roundTripped);
+}
+
+TEST(TagTest, Type_UInt16Array)
+{
+    const std::vector<uint16_t> v = {42, 300, 0};
+    const Tag tag(v);
+
+    std::vector<uint16_t> v2;
+    EXPECT_NO_THROW(v2 = tag.ToUInt16Array());
+
+    EXPECT_TRUE(tag.Type() == TagDataType::UINT16_ARRAY);
+    EXPECT_TRUE(tag.Typename() == "vector<uint16_t>");
+    EXPECT_TRUE(tag.IsUInt16Array());
+    EXPECT_TRUE(tag.IsUnsignedArray());
+    EXPECT_TRUE(tag.IsIntegralArray());
+    EXPECT_TRUE(tag.IsArray());
+
+    EXPECT_FALSE(tag.IsFloat());
+    EXPECT_FALSE(tag.IsString());
+    EXPECT_FALSE(tag.IsNull());
+    EXPECT_FALSE(tag.IsNumeric());
+
+    EXPECT_EQ(v, v2);
+    ;
+}
+
+TEST(TagTest, Type_Int32Array)
+{  // vector<int32_t> round-trips through Tag and is classified as a signed integral array
+    const std::vector<int32_t> original = {42, -300, 0};
+    const Tag tag(original);
+
+    std::vector<int32_t> roundTripped;
+    EXPECT_NO_THROW(roundTripped = tag.ToInt32Array());
+
+    EXPECT_TRUE(tag.IsInt32Array());
+    EXPECT_TRUE(tag.Type() == TagDataType::INT32_ARRAY);
+    EXPECT_TRUE(tag.Typename() == "vector<int32_t>");
+    EXPECT_TRUE(tag.IsArray());
+    EXPECT_TRUE(tag.IsIntegralArray());
+    EXPECT_TRUE(tag.IsSignedArray());
+
+    EXPECT_FALSE(tag.IsNumeric());
+    EXPECT_FALSE(tag.IsNull());
+    EXPECT_FALSE(tag.IsString());
+    EXPECT_FALSE(tag.IsFloat());
+
+    EXPECT_EQ(original, roundTripped);
+}
+
+TEST(TagTest, Type_UInt32Array)
+{  // vector<uint32_t> round-trips through Tag and is classified as an unsigned integral array
+    const std::vector<uint32_t> original = {42, 300, 0};
+    const Tag tag(original);
+
+    std::vector<uint32_t> roundTripped;
+    EXPECT_NO_THROW(roundTripped = tag.ToUInt32Array());
+
+    EXPECT_TRUE(tag.IsUInt32Array());
+    EXPECT_TRUE(tag.Type() == TagDataType::UINT32_ARRAY);
+    EXPECT_TRUE(tag.Typename() == "vector<uint32_t>");
+    EXPECT_TRUE(tag.IsArray());
+    EXPECT_TRUE(tag.IsIntegralArray());
+    EXPECT_TRUE(tag.IsUnsignedArray());
+
+    EXPECT_FALSE(tag.IsNumeric());
+    EXPECT_FALSE(tag.IsNull());
+    EXPECT_FALSE(tag.IsString());
+    EXPECT_FALSE(tag.IsFloat());
+
+    EXPECT_EQ(original, roundTripped);
+}
+
+TEST(TagTest, Type_FloatArray)
+{  // vector<float> round-trips through Tag and is an array, but not an integral array
+    const std::vector<float> original = {1.1f, 1.2f, 1.3f};
+    const Tag tag(original);
+
+    std::vector<float> roundTripped;
+    EXPECT_NO_THROW(roundTripped = tag.ToFloatArray());
+
+    EXPECT_TRUE(tag.IsFloatArray());
+    EXPECT_TRUE(tag.Type() == TagDataType::FLOAT_ARRAY);
+    EXPECT_TRUE(tag.Typename() == "vector<float>");
+    EXPECT_TRUE(tag.IsArray());
+
+    EXPECT_FALSE(tag.IsNumeric());
+    EXPECT_FALSE(tag.IsNull());
+    EXPECT_FALSE(tag.IsString());
+    EXPECT_FALSE(tag.IsFloat());
+    EXPECT_FALSE(tag.IsIntegralArray());
+
+    EXPECT_EQ(original, roundTripped);
+}
+
+TEST(TagTest, CastBackToOriginalOk)
+{
+    int8_t i8 = 0;
+    uint8_t u8 = 0;
+    int16_t i16 = 0;
+    uint16_t u16 = 0;
+    int32_t i32 = 0;
+    uint32_t u32 = 0;
+    float f = 0.0;
+    std::string str = "";
+    std::vector<int8_t> i8_array;
+    std::vector<uint8_t> u8_array;
+    std::vector<int16_t> i16_array;
+    std::vector<uint16_t> u16_array;
+    std::vector<int32_t> i32_array;
+    std::vector<uint32_t> u32_array;
+    std::vector<float> float_array;
+
+    Tag i8Tag(i8);
+    Tag u8Tag(u8);
+    Tag i16Tag(i16);
+    Tag u16Tag(u16);
+    Tag i32Tag(i32);
+    Tag u32Tag(u32);
+    Tag floatTag(f);
+    Tag stringTag(str);
+    Tag i8_array_Tag(i8_array);
+    Tag u8_array_Tag(u8_array);
+    Tag i16_array_Tag(i16_array);
+    Tag u16_array_Tag(u16_array);
+    Tag i32_array_Tag(i32_array);
+    Tag u32_array_Tag(u32_array);
+    Tag float_array_Tag(float_array);
+
+    EXPECT_NO_THROW({
+        i8 = i8Tag.ToInt8();
+        u8 = u8Tag.ToUInt8();
+        i16 = i16Tag.ToInt16();
+        u16 = u16Tag.ToUInt16();
+        i32 = i32Tag.ToInt32();
+        u32 = u32Tag.ToUInt32();
+        f = floatTag.ToFloat();
+        str = stringTag.ToString();
+        i8_array = i8_array_Tag.ToInt8Array();
+        u8_array = u8_array_Tag.ToUInt8Array();
+        i16_array = i16_array_Tag.ToInt16Array();
+        u16_array = u16_array_Tag.ToUInt16Array();
+        i32_array = i32_array_Tag.ToInt32Array();
+        u32_array = u32_array_Tag.ToUInt32Array();
+        float_array = float_array_Tag.ToFloatArray();
+    });
+}
+
+TEST(TagTest, ConvertToInt8)
+{
+    Tag zero(int32_t{0});
+    Tag min(int32_t{std::numeric_limits<int8_t>::min()});
+    Tag normal(int32_t{42});
+    Tag max(int32_t{std::numeric_limits<int8_t>::max()});
+    Tag floatTag(float{3.14});
+    Tag stringTag(std::string{"foo"});
+    Tag arrayTag(std::vector<int8_t>{{1, 2, 3}});
+
+    // allowed
+    EXPECT_NO_THROW({
+        zero.ToInt8();
+        min.ToInt8();
+        normal.ToInt8();
+        max.ToInt8();
+    });
+
+    // not allowed
+    EXPECT_THROW(floatTag.ToInt8(), std::exception);
+    EXPECT_THROW(stringTag.ToInt8(), std::exception);
+    EXPECT_THROW(arrayTag.ToInt8(), std::exception);
+}
+
+TEST(TagTest, ConvertToUInt8)
+{
+    Tag zero(int32_t{0});
+    Tag neg(int32_t{-1});
+    Tag normal(int32_t{42});
+    Tag max(int32_t{std::numeric_limits<uint8_t>::max()});
+    Tag floatTag(float{3.14});
+    Tag stringTag(std::string{"foo"});
+    Tag arrayTag(std::vector<uint8_t>{{1, 2, 3}});
+
+    // allowed
+    EXPECT_NO_THROW({
+        zero.ToUInt8();
+        normal.ToUInt8();
+        max.ToUInt8();
+    });
+
+    // not allowed
+    EXPECT_THROW(neg.ToUInt8(), std::exception);
+    EXPECT_THROW(floatTag.ToUInt8(), std::exception);
+    EXPECT_THROW(stringTag.ToUInt8(), std::exception);
+    EXPECT_THROW(arrayTag.ToUInt8(), std::exception);
+}
+
+TEST(TagTest, ConvertToInt16)
+{
+    Tag zero(int32_t{0});
+    Tag min(int32_t{std::numeric_limits<int16_t>::min()});
+    Tag normal(int32_t{42});
+    Tag max(int32_t{std::numeric_limits<int16_t>::max()});
+    Tag floatTag(float{3.14});
+    Tag stringTag(std::string{"foo"});
+    Tag arrayTag(std::vector<int16_t>{{1, 2, 3}});
+
+    // allowed
+    EXPECT_NO_THROW({
+        zero.ToInt16();
+        min.ToInt16();
+        normal.ToInt16();
+        max.ToInt16();
+    });
+
+    // not allowed
+    EXPECT_THROW(floatTag.ToInt16(), std::exception);
+    EXPECT_THROW(stringTag.ToInt16(), std::exception);
+    EXPECT_THROW(arrayTag.ToInt16(), std::exception);
+}
+
+TEST(TagTest, ConvertToUInt16)
+{
+    Tag zero(int32_t{0});
+    Tag neg(int32_t{-1});
+    Tag normal(int32_t{42});
+    Tag max(int32_t{std::numeric_limits<uint16_t>::max()});
+    Tag floatTag(float{3.14});
+    Tag stringTag(std::string{"foo"});
+    Tag arrayTag(std::vector<uint16_t>{{1, 2, 3}});
+
+    // allowed
+    EXPECT_NO_THROW({
+        zero.ToUInt16();
+        normal.ToUInt16();
+        max.ToUInt16();
+    });
+
+    // not allowed
+    EXPECT_THROW(neg.ToUInt16(), std::exception);
+    EXPECT_THROW(floatTag.ToUInt16(), std::exception);
+    EXPECT_THROW(stringTag.ToUInt16(), std::exception);
+    EXPECT_THROW(arrayTag.ToUInt16(), std::exception);
+}
+
+TEST(TagTest, ConvertToInt32)
+{
+    Tag zero(int32_t{0});
+    Tag min(int32_t{std::numeric_limits<int32_t>::min()});
+    Tag normal(int32_t{42});
+    Tag max(int32_t{std::numeric_limits<int32_t>::max()});
+    Tag floatTag(float{3.14});
+    Tag stringTag(std::string{"foo"});
+    Tag arrayTag(std::vector<int32_t>{{1, 2, 3}});
+
+    // allowed
+    EXPECT_NO_THROW({
+        zero.ToInt32();
+        min.ToInt32();
+        normal.ToInt32();
+        max.ToInt32();
+    });
+
+    // not allowed
+    EXPECT_THROW(floatTag.ToInt32(), std::exception);
+    EXPECT_THROW(stringTag.ToInt32(), std::exception);
+    EXPECT_THROW(arrayTag.ToInt32(), std::exception);
+}
+
+TEST(TagTest, ConvertToUInt32)
+{
+    Tag zero(int32_t{0});
+    Tag neg(int32_t{-1});
+    Tag normal(int32_t{42});
+    Tag max(uint32_t{std::numeric_limits<uint32_t>::max()});
+    Tag floatTag(float{3.14});
+    Tag stringTag(std::string{"foo"});
+    Tag arrayTag(std::vector<uint32_t>{{1, 2, 3}});
+
+    // allowed
+    EXPECT_NO_THROW({
+        zero.ToUInt32();
+        normal.ToUInt32();
+        max.ToUInt32();
+    });
+
+    // not allowed
+    EXPECT_THROW(neg.ToUInt32(), std::exception);
+    EXPECT_THROW(floatTag.ToUInt32(), std::exception);
+    EXPECT_THROW(stringTag.ToUInt32(), std::exception);
+    EXPECT_THROW(arrayTag.ToUInt32(), std::exception);
+}
+
+TEST(TagCollectionTest, DefaultConstruction)
+{  // a default-constructed collection is empty and does not contain an arbitrary tag name
+    TagCollection collection;
+    EXPECT_TRUE(collection.empty());
+    EXPECT_FALSE(collection.Contains("XY"));
+}
+
+TEST(TagCollectionTest, AddSimpleTags)
+{
+    const int32_t intValue = -42;
+    const std::string strValue = "foo";
+    const std::string hexStrValue = "1abc75";
+
+    TagCollection tags;
+    tags["ST"] = strValue;
+    tags["XY"] = intValue;
+    tags["HX"] = hexStrValue;
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+
+    EXPECT_EQ(3, tags.size());
+    EXPECT_TRUE(tags.Contains("XY"));
+    EXPECT_TRUE(tags.Contains("ST"));
+    EXPECT_TRUE(tags.Contains("HX"));
+    EXPECT_FALSE(tags.Contains("ZZ"));
+
+    EXPECT_TRUE(tags["XY"].ToInt32() == intValue);
+    EXPECT_TRUE(tags["ST"].ToString() == strValue);
+    EXPECT_TRUE(tags["HX"].ToString() == hexStrValue);
+    EXPECT_TRUE(tags["HX"].HasModifier(TagModifier::HEX_STRING));
+}
+
+TEST(SamTagCodecTest, DecodeTest)
+{
+    std::string tagString;
+    tagString.append("HX:H:1abc75");
+    tagString.append("\t");
+    tagString.append("ST:Z:foo");
+    tagString.append("\t");
+    tagString.append("VC:B:i,42,-100,37,2048");
+    tagString.append("\t");
+    tagString.append("XY:i:-42");
+
+    TagCollection expected;
+    expected["ST"] = std::string("foo");
+    expected["XY"] = int32_t{-42};
+    expected["HX"] = Tag("1abc75", TagModifier::HEX_STRING);
+    expected["VC"] = std::vector<int32_t>({42, -100, 37, 2048});
+
+    TagCollection tags = SamTagCodec::Decode(tagString);
+
+    EXPECT_TRUE(tags.Contains("ST"));
+    EXPECT_TRUE(tags.Contains("HX"));
+    EXPECT_TRUE(tags.Contains("XY"));
+    EXPECT_TRUE(tags.Contains("VC"));
+
+    EXPECT_EQ(std::string("foo"), tags["ST"].ToString());
+    EXPECT_TRUE(tags["HX"].HasModifier(TagModifier::HEX_STRING));
+    EXPECT_EQ(std::string("1abc75"), tags["HX"].ToString());
+    EXPECT_EQ(int8_t{-42}, tags["XY"].ToInt8());
+    EXPECT_EQ(std::vector<int32_t>({42, -100, 37, 2048}), tags["VC"].ToInt32Array());
+}
+
+TEST(SamTagCodecTest, EncodeSingleTag)
+{
+    {  // string
+        const std::string expected{"ST:Z:foo"};
+        const Tag t = std::string{"foo"};
+        EXPECT_EQ(expected, SamTagCodec::Encode("ST", t));
+    }
+    {  // int
+        const std::string expected{"XY:i:-42"};
+        const Tag t = int32_t{-42};
+        EXPECT_EQ(expected, SamTagCodec::Encode("XY", t));
+    }
+    {  // hex string
+        const std::string expected{"HX:H:1abc75"};
+        const Tag t = Tag("1abc75", TagModifier::HEX_STRING);
+        EXPECT_EQ(expected, SamTagCodec::Encode("HX", t));
+    }
+    {  // int array
+        const std::string expected{"VC:B:i,42,-100,37,2048"};
+        const Tag t = std::vector<int32_t>({42, -100, 37, 2048});
+        EXPECT_EQ(expected, SamTagCodec::Encode("VC", t));
+    }
+    {  // float
+        const std::string expected{"rq:f:0.99"};
+        const Tag t = 0.99f;
+        EXPECT_EQ(expected, SamTagCodec::Encode("rq", t));
+    }
+    {  // null tag
+        const std::string expected;
+        const Tag t;
+        EXPECT_TRUE(t.IsNull());
+        EXPECT_EQ(expected, SamTagCodec::Encode("no", t));
+    }
+    {
+        // invalid name
+        EXPECT_THROW(SamTagCodec::Encode("invalid", Tag{}), std::runtime_error);
+    }
+}
+
+TEST(SamTagCodecTest, EncodeTagCollection)
+{
+    TagCollection tags;
+    tags["ST"] = std::string("foo");
+    tags["XY"] = int32_t{-42};
+    tags["HX"] = Tag("1abc75", TagModifier::HEX_STRING);
+    tags["VC"] = std::vector<int32_t>({42, -100, 37, 2048});
+    tags["rq"] = 0.99f;
+
+    std::ostringstream expected;
+    expected << "HX:H:1abc75" << '\t' << "ST:Z:foo" << '\t' << "VC:B:i,42,-100,37,2048" << '\t'
+             << "XY:i:-42" << '\t' << "rq:f:0.99";
+
+    const std::string sam = SamTagCodec::Encode(tags);
+    EXPECT_EQ(expected.str(), sam);
+}
+
+TEST(BamTagCodecTest, DecodeTest)
+{  // hand-builds the binary form of three tags (HX, XY, CA), decodes it, and checks each value
+    std::vector<uint8_t> data;
+    data.push_back(uint8_t('H'));  // tag name "HX"
+    data.push_back(uint8_t('X'));
+    data.push_back(uint8_t('H'));  // type code 'H'
+    data.push_back(uint8_t('1'));  // payload characters "1abc75"
+    data.push_back(uint8_t('a'));
+    data.push_back(uint8_t('b'));
+    data.push_back(uint8_t('c'));
+    data.push_back(uint8_t('7'));
+    data.push_back(uint8_t('5'));
+    data.push_back(uint8_t(0));  // terminating zero byte
+
+    data.push_back(uint8_t('X'));  // tag name "XY"
+    data.push_back(uint8_t('Y'));
+    data.push_back(uint8_t('i'));  // type code 'i'
+    const int32_t x = -42;
+    char valueBytes[sizeof x];  // the bytes of x exactly as they sit in memory
+    std::copy(static_cast<const char*>(static_cast<const void*>(&x)),
+              static_cast<const char*>(static_cast<const void*>(&x)) + sizeof x, valueBytes);
+    data.push_back(valueBytes[0]);
+    data.push_back(valueBytes[1]);
+    data.push_back(valueBytes[2]);
+    data.push_back(valueBytes[3]);
+
+    data.push_back('C');  // tag name "CA"
+    data.push_back('A');
+    data.push_back('B');  // type code 'B', followed by element code 'C'
+    data.push_back('C');
+    const uint32_t numChars = 3;  // element count, written as its 4 in-memory bytes
+    char numCharsValueBytes[sizeof numChars];
+    std::copy(static_cast<const char*>(static_cast<const void*>(&numChars)),
+              static_cast<const char*>(static_cast<const void*>(&numChars)) + sizeof numChars,
+              numCharsValueBytes);
+    data.push_back(numCharsValueBytes[0]);
+    data.push_back(numCharsValueBytes[1]);
+    data.push_back(numCharsValueBytes[2]);
+    data.push_back(numCharsValueBytes[3]);
+
+    const std::vector<uint8_t> charArray = std::vector<uint8_t>({34, 5, 125});  // the 3 elements
+    data.push_back(charArray.at(0));
+    data.push_back(charArray.at(1));
+    data.push_back(charArray.at(2));
+
+    TagCollection tags = BamTagCodec::Decode(data);
+
+    EXPECT_TRUE(tags["HX"].HasModifier(TagModifier::HEX_STRING));
+    EXPECT_EQ(std::string("1abc75"), tags["HX"].ToString());
+    EXPECT_EQ(x, tags["XY"].ToInt32());
+    EXPECT_EQ(charArray, tags["CA"].ToUInt8Array());
+
+    // sanity check - convert tags back to SAM (expected text lists CA, HX, XY in name order)
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(tags);
+    EXPECT_EQ(expected, sam);
+}
+
+TEST(BamTagCodecTest, EncodeTest)
+{  // encodes HX, CA, XY tags and compares against hand-built bytes laid out as CA, HX, XY
+    std::vector<uint8_t> expected;
+
+    expected.push_back('C');  // tag name "CA"
+    expected.push_back('A');
+    expected.push_back('B');  // type code 'B', followed by element code 'C'
+    expected.push_back('C');
+    const uint32_t numChars = 3;  // element count, written as its 4 in-memory bytes
+    char numCharsValueBytes[sizeof numChars];
+    std::copy(static_cast<const char*>(static_cast<const void*>(&numChars)),
+              static_cast<const char*>(static_cast<const void*>(&numChars)) + sizeof numChars,
+              numCharsValueBytes);
+    expected.push_back(numCharsValueBytes[0]);
+    expected.push_back(numCharsValueBytes[1]);
+    expected.push_back(numCharsValueBytes[2]);
+    expected.push_back(numCharsValueBytes[3]);
+
+    const std::vector<uint8_t> charArray = std::vector<uint8_t>({34, 5, 125});  // the 3 elements
+    expected.push_back(charArray.at(0));
+    expected.push_back(charArray.at(1));
+    expected.push_back(charArray.at(2));
+
+    expected.push_back(uint8_t('H'));  // tag name "HX"
+    expected.push_back(uint8_t('X'));
+    expected.push_back(uint8_t('H'));  // type code 'H'
+    expected.push_back(uint8_t('1'));  // payload characters "1abc75"
+    expected.push_back(uint8_t('a'));
+    expected.push_back(uint8_t('b'));
+    expected.push_back(uint8_t('c'));
+    expected.push_back(uint8_t('7'));
+    expected.push_back(uint8_t('5'));
+    expected.push_back(uint8_t(0));  // terminating zero byte
+
+    expected.push_back(uint8_t('X'));  // tag name "XY"
+    expected.push_back(uint8_t('Y'));
+    expected.push_back(uint8_t('i'));  // type code 'i'
+    const int32_t x = -42;
+    char valueBytes[sizeof x];  // the bytes of x exactly as they sit in memory
+    std::copy(static_cast<const char*>(static_cast<const void*>(&x)),
+              static_cast<const char*>(static_cast<const void*>(&x)) + sizeof x, valueBytes);
+    expected.push_back(valueBytes[0]);
+    expected.push_back(valueBytes[1]);
+    expected.push_back(valueBytes[2]);
+    expected.push_back(valueBytes[3]);
+
+    TagCollection tags;  // inserted as HX, CA, XY; the expected bytes above are in name order
+    tags["HX"] = Tag("1abc75", TagModifier::HEX_STRING);
+    tags["CA"] = charArray;
+    tags["XY"] = x;
+
+    const std::vector<uint8_t> data = BamTagCodec::Encode(tags);
+    EXPECT_EQ(expected, data);
+}
+
+TEST(BamTagCodecTest, AsciiTagsTest)
+{  // five ASCII_CHAR tags, built two different ways, must both encode to the same 20 bytes
+    std::vector<uint8_t> expected;
+    expected.reserve(20);  // 5 tags x (2 name bytes + type code 'A' + 1 character)
+    expected.push_back('I');  // I8:A:B
+    expected.push_back('8');
+    expected.push_back('A');
+    expected.push_back('B');
+    expected.push_back('P');  // PC:A:$
+    expected.push_back('C');
+    expected.push_back('A');
+    expected.push_back('$');
+    expected.push_back('S');  // SC:A:$
+    expected.push_back('C');
+    expected.push_back('A');
+    expected.push_back('$');
+    expected.push_back('U');  // U8:A:A
+    expected.push_back('8');
+    expected.push_back('A');
+    expected.push_back('A');
+    expected.push_back('U');  // UC:A:$
+    expected.push_back('C');
+    expected.push_back('A');
+    expected.push_back('$');
+
+    const char c = '$';
+    const signed char sc = '$';
+    const unsigned char uc = '$';
+    const uint8_t u8 = 65;  // expected above as the character 'A'
+    const int8_t i8 = 66;   // expected above as the character 'B'
+
+    {  // old style: construct-then-modify
+
+        Tag fromPlainChar = Tag(c);
+        Tag fromSignedChar = Tag(sc);
+        Tag fromUnsignedChar = Tag(uc);
+        Tag fromUint8 = Tag(u8);
+        Tag fromInt8 = Tag(i8);
+        fromPlainChar.Modifier(TagModifier::ASCII_CHAR);
+        fromSignedChar.Modifier(TagModifier::ASCII_CHAR);
+        fromUnsignedChar.Modifier(TagModifier::ASCII_CHAR);
+        fromUint8.Modifier(TagModifier::ASCII_CHAR);
+        fromInt8.Modifier(TagModifier::ASCII_CHAR);
+
+        TagCollection tags;  // inserted as PC, SC, UC, U8, I8; expected bytes are in name order
+        tags["PC"] = fromPlainChar;
+        tags["SC"] = fromSignedChar;
+        tags["UC"] = fromUnsignedChar;
+        tags["U8"] = fromUint8;
+        tags["I8"] = fromInt8;
+
+        const std::vector<uint8_t> data = BamTagCodec::Encode(tags);
+        EXPECT_EQ(expected, data);
+    }
+
+    {  // new style: construct directly as ASCII
+
+        const Tag fromPlainChar = Tag(c, TagModifier::ASCII_CHAR);
+        const Tag fromSignedChar = Tag(sc, TagModifier::ASCII_CHAR);
+        const Tag fromUnsignedChar = Tag(uc, TagModifier::ASCII_CHAR);
+        const Tag fromUint8 = Tag(u8, TagModifier::ASCII_CHAR);
+        const Tag fromInt8 = Tag(i8, TagModifier::ASCII_CHAR);
+
+        TagCollection tags;  // same insertion order as the old-style case
+        tags["PC"] = fromPlainChar;
+        tags["SC"] = fromSignedChar;
+        tags["UC"] = fromUnsignedChar;
+        tags["U8"] = fromUint8;
+        tags["I8"] = fromInt8;
+
+        const std::vector<uint8_t> data = BamTagCodec::Encode(tags);
+        EXPECT_EQ(expected, data);
+    }
+}
diff --git a/tests/src/test_TextFileReader.cpp b/tests/src/test_TextFileReader.cpp

new file mode 100644 (file)

index 0000000..c1d9d79
--- /dev/null
+++ b/tests/src/test_TextFileReader.cpp
@@ -0,0 +1,140 @@
+// Author: Derek Barnett
+
+#include <string>
+
+#include <gtest/gtest.h>
+
+#include <pbbam/TextFileReader.h>
+
+#include "FastxTests.h"
+
+using TextFileReader = PacBio::BAM::TextFileReader;
+
+// clang-format off
+
+TEST(TextFileReaderTest, throws_on_empty_filename)
+{  // constructing a reader from an empty filename must throw std::runtime_error
+    EXPECT_THROW(TextFileReader emptyPathReader{""}, std::runtime_error);
+}
+
+TEST(TextFileReaderTest, can_open_plain_text)
+{  // constructing a reader on FastxTests::simpleFastaFn must not throw
+    const auto& path = FastxTests::simpleFastaFn;
+    EXPECT_NO_THROW(TextFileReader openedReader{path});
+}
+
+TEST(TextFileReaderTest, can_open_gzip_text)
+{  // constructing a reader on FastxTests::simpleFastaGzipFn must not throw
+    const auto& path = FastxTests::simpleFastaGzipFn;
+    EXPECT_NO_THROW(TextFileReader openedReader{path});
+}
+
+TEST(TextFileReaderTest, can_open_bgzf_text)
+{  // constructing a reader on FastxTests::simpleFastaBgzfFn must not throw
+    const auto& path = FastxTests::simpleFastaBgzfFn;
+    EXPECT_NO_THROW(TextFileReader openedReader{path});
+}
+
+TEST(TextFileReaderTest, can_iterate_manually_on_plain_text)
+{  // GetNext() hands out two lines per expected FASTA record
+    const auto& path = FastxTests::simpleFastaFn;
+    TextFileReader reader{path};
+
+    // keep pulling lines for as long as GetNext() reports success
+    std::string currentLine;
+    size_t numLines = 0;
+    while (reader.GetNext(currentLine)) ++numLines;
+
+    EXPECT_EQ(FastxTests::ExpectedFasta.size() * 2, numLines); // FASTA header + seq
+}
+
+TEST(TextFileReaderTest, can_iterate_manually_on_gzip_text)
+{  // GetNext() hands out two lines per expected FASTA record (gzip input)
+    const auto& path = FastxTests::simpleFastaGzipFn;
+    TextFileReader reader{path};
+
+    // keep pulling lines for as long as GetNext() reports success
+    std::string currentLine;
+    size_t numLines = 0;
+    while (reader.GetNext(currentLine)) ++numLines;
+
+    EXPECT_EQ(FastxTests::ExpectedFasta.size() * 2, numLines); // FASTA header + seq
+}
+
+TEST(TextFileReaderTest, can_iterate_manually_on_bgzf_text)
+{  // GetNext() hands out two lines per expected FASTA record (BGZF input)
+    const auto& path = FastxTests::simpleFastaBgzfFn;
+    TextFileReader reader{path};
+
+    // keep pulling lines for as long as GetNext() reports success
+    std::string currentLine;
+    size_t numLines = 0;
+    while (reader.GetNext(currentLine)) ++numLines;
+
+    EXPECT_EQ(FastxTests::ExpectedFasta.size() * 2, numLines); // FASTA header + seq
+}
+
+TEST(TextFileReaderTest, can_iterate_using_range_for_on_plain_text)
+{  // range-for over the reader visits two non-empty lines per expected FASTA record
+    const auto& path = FastxTests::simpleFastaFn;
+    TextFileReader reader{path};
+
+    size_t numLines = 0;
+    for (const auto& currentLine : reader) {
+        ++numLines;
+        EXPECT_FALSE(currentLine.empty());
+    }
+
+    EXPECT_EQ(FastxTests::ExpectedFasta.size() * 2, numLines); // FASTA header + seq
+}
+
+TEST(TextFileReaderTest, can_iterate_using_range_for_on_gzip_text)
+{  // range-for over the reader visits two non-empty lines per expected record (gzip input)
+    const auto& path = FastxTests::simpleFastaGzipFn;
+    TextFileReader reader{path};
+
+    size_t numLines = 0;
+    for (const auto& currentLine : reader) {
+        ++numLines;
+        EXPECT_FALSE(currentLine.empty());
+    }
+
+    EXPECT_EQ(FastxTests::ExpectedFasta.size() * 2, numLines); // FASTA header + seq
+}
+
+TEST(TextFileReaderTest, can_iterate_using_range_for_on_bgzf_text)
+{  // range-for over the reader visits two non-empty lines per expected record (BGZF input)
+    const auto& path = FastxTests::simpleFastaBgzfFn;
+    TextFileReader reader{path};
+
+    size_t numLines = 0;
+    for (const auto& currentLine : reader) {
+        ++numLines;
+        EXPECT_FALSE(currentLine.empty());
+    }
+
+    EXPECT_EQ(FastxTests::ExpectedFasta.size() * 2, numLines); // FASTA header + seq
+}
+
+TEST(TextFileReaderTest, can_read_all_from_plain_text)
+{  // ReadAll() returns two entries per expected FASTA record
+    const auto& path = FastxTests::simpleFastaFn;
+    const auto allLines = TextFileReader::ReadAll(path);
+    EXPECT_EQ(FastxTests::ExpectedFasta.size() * 2, allLines.size()); // FASTA header + seq
+}
+
+TEST(TextFileReaderTest, can_read_all_from_gzip_text)
+{  // ReadAll() returns two entries per expected FASTA record (gzip input)
+    const auto& path = FastxTests::simpleFastaGzipFn;
+    const auto allLines = TextFileReader::ReadAll(path);
+    EXPECT_EQ(FastxTests::ExpectedFasta.size() * 2, allLines.size()); // FASTA header + seq
+}
+
+TEST(TextFileReaderTest, can_read_all_from_bgzf_text)
+{  // ReadAll() returns two entries per expected FASTA record (BGZF input)
+    const auto& path = FastxTests::simpleFastaBgzfFn;
+    const auto allLines = TextFileReader::ReadAll(path);
+    EXPECT_EQ(FastxTests::ExpectedFasta.size() * 2, allLines.size()); // FASTA header + seq
+}
+
+// clang-format on
diff --git a/tests/src/test_TextFileWriter.cpp b/tests/src/test_TextFileWriter.cpp

new file mode 100644 (file)

index 0000000..b13ffb8
--- /dev/null
+++ b/tests/src/test_TextFileWriter.cpp
@@ -0,0 +1,61 @@
+// Author: Derek Barnett
+
+#include <cstdio>
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include <pbbam/FormatUtils.h>
+#include <pbbam/TextFileReader.h>
+#include <pbbam/TextFileWriter.h>
+
+#include "PbbamTestData.h"
+
+using TextFileReader = PacBio::BAM::TextFileReader;
+using TextFileWriter = PacBio::BAM::TextFileWriter;
+
+TEST(TextFileWriterTest, throws_on_empty_filename)
+{  // constructing a writer from an empty filename must throw std::runtime_error
+    EXPECT_THROW(TextFileWriter emptyPathWriter{""}, std::runtime_error);
+}
+
+TEST(TextFileWriterTest, can_write_plain_text)
+{
+    const std::string outFn = PacBio::BAM::PbbamTestsConfig::GeneratedData_Dir + "/out.txt";
+    const std::vector<std::string> lines{"foo", "bar", "baz"};
+
+    {
+        TextFileWriter writer{outFn};
+        for (const auto& line : lines)
+            writer.Write(line);
+    }
+    EXPECT_EQ(PacBio::BAM::HtslibCompression::NONE,
+              PacBio::BAM::FormatUtils::CompressionType(outFn));
+
+    const auto contents = TextFileReader::ReadAll(outFn);
+    EXPECT_TRUE(std::equal(lines.cbegin(), lines.cend(), contents.cbegin()));
+
+    remove(outFn.c_str());
+}
+
+TEST(TextFileWriterTest, can_write_gzipped_text)
+{
+    const std::string outFn = PacBio::BAM::PbbamTestsConfig::GeneratedData_Dir + "/out.txt.gz";
+    const std::vector<std::string> lines{"foo", "bar", "baz"};
+
+    {
+        TextFileWriter writer{outFn};
+        for (const auto& line : lines)
+            writer.Write(line);
+    }
+    EXPECT_EQ(PacBio::BAM::HtslibCompression::GZIP,
+              PacBio::BAM::FormatUtils::CompressionType(outFn));
+
+    const auto contents = TextFileReader::ReadAll(outFn);
+    EXPECT_TRUE(std::equal(lines.cbegin(), lines.cend(), contents.cbegin()));
+
+    remove(outFn.c_str());
+}
diff --git a/tests/src/test_TimeUtils.cpp b/tests/src/test_TimeUtils.cpp

new file mode 100644 (file)

index 0000000..e019d84
--- /dev/null
+++ b/tests/src/test_TimeUtils.cpp
@@ -0,0 +1,32 @@
+// Author: Derek Barnett
+
+#include <chrono>
+#include <ctime>
+#include <string>
+
+#include <gtest/gtest.h>
+
+#include <pbbam/../../src/TimeUtils.h>
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+TEST(TimeUtilsTest, ToIso8601)
+{  // a whole-second time point formats as "YYYY-MM-DDThh:mm:ssZ"
+    const time_t epochSeconds = 436428750L;
+    const auto timePoint = std::chrono::system_clock::from_time_t(epochSeconds);
+
+    const std::string expected{"1983-10-31T06:12:30Z"};  // no ms in test case
+    const auto actual = TimeUtils::ToIso8601(timePoint);
+    EXPECT_EQ(expected, actual);
+}
+
+TEST(TimeUtilsTest, ToDataSetFormat)
+{  // the same time point formats as "YYMMDD_hhmmss" in the DataSet style
+    const time_t epochSeconds = 436428750L;
+    const auto timePoint = std::chrono::system_clock::from_time_t(epochSeconds);
+
+    const std::string expected{"831031_061230"};  // no ms in test case
+    const std::string actual = TimeUtils::ToDataSetFormat(timePoint);
+    EXPECT_EQ(expected, actual);
+}
diff --git a/tests/src/test_Validator.cpp b/tests/src/test_Validator.cpp

new file mode 100644 (file)

index 0000000..af45cdc
--- /dev/null
+++ b/tests/src/test_Validator.cpp
@@ -0,0 +1,571 @@
+// Author: Derek Barnett
+
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include <pbbam/BamFile.h>
+#include <pbbam/BamHeader.h>
+#include <pbbam/BamRecord.h>
+#include <pbbam/Cigar.h>
+#include <pbbam/ReadGroupInfo.h>
+#include <pbbam/Validator.h>
+
+#include "../src/ValidationErrors.h"
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+namespace ValidatorTests {
+
+// Builds the mapped 'starter' record: a 10-base read placed on reference 0 with CIGAR "10=",
+// plus a full set of tags. Tests copy the result and then modify or remove a single field.
+static BamRecord makeValidMappedRecord()
+{
+    BamRecordImpl impl;
+    impl.Bin(4680);
+    impl.Flag(2);
+    impl.InsertSize(0);
+    impl.MapQuality(10);
+    impl.MatePosition(-1);
+    impl.MateReferenceId(-1);
+    impl.Name("movie1/54130/0_10");
+    impl.Position(1);
+    impl.ReferenceId(0);
+    impl.SetMapped(true);
+    impl.SetSequenceAndQualities("AATGAGGAGA");
+    impl.CigarData(Cigar{"10="});
+
+    // Per-base tags (dq, dt, iq, mq, sq, ip) each hold 10 entries, matching the 10-base
+    // sequence and the qs=0 / qe=10 query interval. RG matches the read group ID used by
+    // the headers in the record tests.
+    TagCollection tags;
+    tags["RG"] = std::string{"db972a04"};
+    tags["dq"] = std::string{"2222'$22'2"};
+    tags["dt"] = std::string{"NNNNAGNNGN"};
+    tags["iq"] = std::string{"(+#1'$#*1&"};
+    tags["mq"] = std::string{"&1~51*5&~2"};
+    tags["sq"] = std::string{"<32<4<<<<3"};
+    tags["ip"] = std::vector<uint8_t>{2, 0, 10, 22, 34, 0, 2, 3, 0, 16};
+    tags["np"] = int32_t{1};
+    tags["qe"] = int32_t{10};
+    tags["qs"] = int32_t{0};
+    tags["zm"] = int32_t{54130};
+    tags["cx"] = int32_t{2};
+    tags["AS"] = int32_t{-3020};
+    tags["NM"] = int32_t{134};
+    tags["rq"] = static_cast<float>(0.854);
+    tags["sn"] = std::vector<float>{2.0, 2.0, 2.0, 2.0};
+    impl.Tags(tags);
+
+    // wrap the low-level record in the BamRecord type that the validator tests operate on
+    return BamRecord(impl);
+}
+
+// Builds the unmapped 'starter' record: a 10-base read with no reference ID and no position
+// (both -1) and no CIGAR. Tests copy the result and then modify a single field or tag.
+static BamRecord makeValidUnmappedRecord()
+{
+    BamRecordImpl impl;
+    impl.Bin(4680);
+    impl.Flag(4);  // NOTE(review): presumably the SAM "segment unmapped" bit (0x4) - confirm
+    impl.InsertSize(0);
+    impl.MapQuality(10);
+    impl.MatePosition(-1);
+    impl.MateReferenceId(-1);
+    impl.Name("m140906_231018_42161_c100676332550000001823129611271486_s1_p0/8/0_10");
+    impl.Position(-1);
+    impl.ReferenceId(-1);
+    impl.SetSequenceAndQualities("AATGAGGAGA");
+
+    // Per-base tags (dq, dt, iq, mq, sq, ip) each hold 10 entries, matching the 10-base
+    // sequence and the qs=0 / qe=10 query interval.
+    TagCollection tags;
+    tags["RG"] = std::string{"b5482b33"};
+    tags["dq"] = std::string{"2222222222"};
+    tags["dt"] = std::string{"NNNNNNNNNN"};
+    tags["iq"] = std::string{",*11111001"};
+    tags["mq"] = std::string{"&47088')34"};
+    tags["sq"] = std::string{"8<4<:<6<0<"};
+    tags["ip"] = std::vector<uint8_t>{255, 9, 20, 43, 38, 12, 9, 30, 39, 22};
+    tags["np"] = int32_t{1};
+    tags["qe"] = int32_t{10};
+    tags["qs"] = int32_t{0};
+    tags["zm"] = int32_t{8};
+    tags["cx"] = int32_t{2};
+    tags["AS"] = int32_t{-3020};
+    tags["NM"] = int32_t{134};
+    tags["rq"] = static_cast<float>(0.811);
+    tags["sn"] = std::vector<float>{2.0, 2.0, 2.0, 2.0};
+    impl.Tags(tags);
+
+    // wrap the low-level record in the BamRecord type that the validator tests operate on
+    return BamRecord(impl);
+}
+
+// Builds the baseline read group that the read group tests copy and then alter.
+static ReadGroupInfo makeValidReadGroup()
+{
+    ReadGroupInfo readGroup("f5b4ffb6");
+
+    // movie & read type
+    readGroup.MovieName("movie32");
+    readGroup.ReadType("CCS");
+
+    // chemistry triple: binding kit, sequencing kit, basecaller version
+    readGroup.BindingKit("101-789-500");
+    readGroup.SequencingKit("101-789-300");
+    readGroup.BasecallerVersion("5.0");
+
+    readGroup.FrameRateHz("100");
+    readGroup.Control("TRUE");
+
+    return readGroup;
+}
+
+// valid, 'starter' objects: each is built once by the factory functions above; tests make a
+// copy and change one property at a time to provoke a specific validation failure
+static const ReadGroupInfo validReadGroup = makeValidReadGroup();
+static const BamRecord validMappedRecord = makeValidMappedRecord();
+static const BamRecord validUnmappedRecord = makeValidUnmappedRecord();
+
+}  // namespace ValidatorTests
+
+TEST(ValidatorErrorsTest, SetMaxNumErrors)
+{
+    {  // default - use "no max"
+        ValidationErrors errors;
+        EXPECT_EQ(ValidationErrors::MAX, errors.MaxNumErrors());
+    }
+    {  // max of zero doesn't make sense... make equivalent to "no max"
+        ValidationErrors errors(0);
+        EXPECT_EQ(ValidationErrors::MAX, errors.MaxNumErrors());
+    }
+    {  // max = 1
+        ValidationErrors errors(1);
+        EXPECT_EQ(1, errors.MaxNumErrors());
+    }
+    {  // max = 10
+        ValidationErrors errors(10);
+        EXPECT_EQ(10, errors.MaxNumErrors());
+    }
+}
+
+TEST(ValidatorErrorsTest, ThrowOnMaxReached)
+{
+    // limit of 1: the very first error added must throw
+    {
+        ValidationErrors tracker(1);
+        EXPECT_THROW(tracker.AddFileError("foo", "you"), ValidationException);
+    }
+
+    // limit of 2: the first error is accepted, the second one must throw
+    {
+        ValidationErrors tracker(2);
+        tracker.AddFileError("foo", "you");
+        EXPECT_THROW(tracker.AddFileError("foo", "me"), ValidationException);
+    }
+}
+
+TEST(ValidatorErrorsTest, ExceptionFromResults)
+{
+    const std::string error1 = "error1";
+    const std::string error2 = "error2";
+
+    try {
+
+        ValidationErrors errors(4);
+        errors.AddFileError("path/to/foo.bam", error1);
+        errors.AddFileError("path/to/foo.bam", error2);
+        errors.AddReadGroupError("deadbeef", "invalid sequencing chemistry combination detected");
+        errors.AddRecordError(
+            "m140906_231018_42161_c100676332550000001823129611271486_s1_p0/8/0_10",
+            "MergeQV does not match expected length");
+
+    } catch (ValidationException& e) {
+
+        EXPECT_EQ(1, e.FileErrors().size());                        // only 1 file
+        EXPECT_EQ(2, e.FileErrors().at("path/to/foo.bam").size());  // 2 errors for this file
+        EXPECT_EQ(1, e.ReadGroupErrors().size());
+        EXPECT_EQ(1, e.RecordErrors().size());
+    }
+}
+
+TEST(ValidatorTest, ValidReadGroup)
+{
+    // the unmodified 'starter' read group must pass validation
+    const ReadGroupInfo& starter = ValidatorTests::validReadGroup;
+    ASSERT_NO_THROW(Validator::Validate(starter));
+}
+
+TEST(ValidatorTest, ReadGroupRequiredComponents)
+{
+    {  // missing ID
+        ReadGroupInfo rg = ValidatorTests::validReadGroup;
+        rg.Id("");
+        EXPECT_THROW(Validator::Validate(rg), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(rg));
+    }
+    {  // missing movie name
+        ReadGroupInfo rg = ValidatorTests::validReadGroup;
+        rg.MovieName("");
+        EXPECT_THROW(Validator::Validate(rg), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(rg));
+    }
+    {  // missing read type
+        ReadGroupInfo rg = ValidatorTests::validReadGroup;
+        rg.ReadType("");
+        EXPECT_THROW(Validator::Validate(rg), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(rg));
+    }
+    {  // missing binding kit
+        ReadGroupInfo rg = ValidatorTests::validReadGroup;
+        rg.BindingKit("");
+        EXPECT_THROW(Validator::Validate(rg), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(rg));
+    }
+    {  // missing sequencing kit
+        ReadGroupInfo rg = ValidatorTests::validReadGroup;
+        rg.SequencingKit("");
+        EXPECT_THROW(Validator::Validate(rg), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(rg));
+    }
+    {  // missing basecaller version
+        ReadGroupInfo rg = ValidatorTests::validReadGroup;
+        rg.BasecallerVersion("");
+        EXPECT_THROW(Validator::Validate(rg), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(rg));
+    }
+    {  // missing frame rate
+        ReadGroupInfo rg = ValidatorTests::validReadGroup;
+        rg.FrameRateHz("");
+        EXPECT_THROW(Validator::Validate(rg), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(rg));
+    }
+}
+
+TEST(ValidatorTest, ReadGroupValues)
+{
+    {  // mismatch expected ID vs stored ID - change ID
+        ReadGroupInfo rg = ValidatorTests::validReadGroup;
+        rg.Id("deadbeef");
+        EXPECT_THROW(Validator::Validate(rg), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(rg));
+    }
+    {  // mismatch expected ID vs stored ID - change read type
+        ReadGroupInfo rg = ValidatorTests::validReadGroup;
+        rg.ReadType("SUBREAD");
+        EXPECT_THROW(Validator::Validate(rg), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(rg));
+    }
+    {  // mismatch expected ID vs stored ID - change movie name
+        ReadGroupInfo rg = ValidatorTests::validReadGroup;
+        rg.MovieName("foo");
+        EXPECT_THROW(Validator::Validate(rg), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(rg));
+    }
+    {  // unknown read type
+        ReadGroupInfo rg = ValidatorTests::validReadGroup;
+        rg.ReadType("FOO");
+
+        // recompute ID so we're only checking the new read type, not read ID
+        rg.Id(MakeReadGroupId(rg.MovieName(), rg.ReadType()));
+
+        EXPECT_THROW(Validator::Validate(rg), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(rg));
+    }
+    {  // invalid chemistry triple - change binding kit
+        ReadGroupInfo rg = ValidatorTests::validReadGroup;
+        rg.BindingKit("foo");
+        EXPECT_THROW(Validator::Validate(rg), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(rg));
+    }
+    {  // invalid chemistry triple - change sequencing kit
+        ReadGroupInfo rg = ValidatorTests::validReadGroup;
+        rg.SequencingKit("foo");
+        EXPECT_THROW(Validator::Validate(rg), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(rg));
+    }
+    {  // invalid chemistry triple - change basecaller version
+        ReadGroupInfo rg = ValidatorTests::validReadGroup;
+        rg.BasecallerVersion("0.42");
+        EXPECT_THROW(Validator::Validate(rg), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(rg));
+    }
+    {  // non-numeric frame rate
+        ReadGroupInfo rg = ValidatorTests::validReadGroup;
+        rg.FrameRateHz("foo");
+        EXPECT_THROW(Validator::Validate(rg), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(rg));
+    }
+}
+
+TEST(ValidatorTest, ValidHeader)
+{
+    static const BamHeader validMappedHeader{
+        "@HD\tVN:1.5\tSO:coordinate\tpb:3.0.7\n"
+        "@SQ\tSN:ecoliK12_pbi_March2013_2955000_to_2980000\tLN:25000\tM5:"
+        "734d5f3b2859595f4bd87a2fe6b7389b\n"
+        "@RG\tID:db972a04\tPL:PACBIO\tDS:READTYPE=SUBREAD;Ipd:CodecV1=ip;PulseWidth:CodecV1=pw;"
+        "BINDINGKIT=101-717-300;SEQUENCINGKIT=101-644-500;BASECALLERVERSION=5.0.0;FRAMERATEHZ=100."
+        "000000"
+        "\tPU:m64004_190414_193017\tPM:SEQUELII\n"};
+
+    static const BamHeader validUnmappedHeader{
+        "@HD\tVN:1.5\tSO:unknown\tpb:3.0.7\n"
+        "@RG\tID:db972a04\tPL:PACBIO\tDS:READTYPE=SUBREAD;Ipd:CodecV1=ip;PulseWidth:CodecV1=pw;"
+        "BINDINGKIT=101-717-300;SEQUENCINGKIT=101-644-500;BASECALLERVERSION=5.0.0;FRAMERATEHZ=100."
+        "000000\t"
+        "PU:m64004_190414_193017\tPM:SEQUELII\n"};
+
+    ASSERT_NO_THROW(Validator::Validate(validMappedHeader));
+    ASSERT_NO_THROW(Validator::Validate(validUnmappedHeader));
+}
+
+TEST(ValidatorTest, ValidateHeader)
+{
+    static const BamHeader validMappedHeader{
+        "@HD\tVN:1.5\tSO:coordinate\tpb:3.0.7\n"
+        "@SQ\tSN:ecoliK12_pbi_March2013_2955000_to_2980000\tLN:25000\tM5:"
+        "734d5f3b2859595f4bd87a2fe6b7389b\n"
+        "@RG\tID:db972a04\tPL:PACBIO\tDS:READTYPE=SUBREAD;Ipd:CodecV1=ip;PulseWidth:CodecV1=pw;"
+        "BINDINGKIT=101-717-300;SEQUENCINGKIT=101-644-500;BASECALLERVERSION=5.0.0;FRAMERATEHZ=100."
+        "000000"
+        "\tPU:m64004_190414_193017\tPM:SEQUELII\n"};
+
+    {  // invalid SAM version - non-numeric
+        BamHeader header = validMappedHeader.DeepCopy();
+        header.Version("foo");
+        EXPECT_THROW(Validator::Validate(header), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(header));
+    }
+    {  // invalid SAM version - negative version numbers
+        BamHeader header = validMappedHeader.DeepCopy();
+        header.Version("-1.4.0");
+        EXPECT_THROW(Validator::Validate(header), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(header));
+    }
+    {  // invalid sort order
+        BamHeader header = validMappedHeader.DeepCopy();
+        header.SortOrder("not_a_valid_sort_order");
+        EXPECT_THROW(Validator::Validate(header), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(header));
+    }
+
+    // invalid PacBioBamVersion numbers (non-numeric, negative, earlier than min)
+    // already throw when you try to set them... so we have to catch & ignore
+    // initial exception to get to validator
+
+    {  // invalid PacBioBAM version - non-numeric
+        BamHeader header = validMappedHeader.DeepCopy();
+        try {
+            header.PacBioBamVersion("foo");
+        } catch (...) {
+        }
+        EXPECT_THROW(Validator::Validate(header), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(header));
+    }
+    {  // invalid PacBioBAM version - negative version numbers
+        BamHeader header = validMappedHeader.DeepCopy();
+        try {
+            header.PacBioBamVersion("-1.4.0");
+        } catch (...) {
+        }
+        EXPECT_THROW(Validator::Validate(header), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(header));
+    }
+    {  // invalid PacBioBAM version - earlier than minimum allowed
+        BamHeader header = validMappedHeader.DeepCopy();
+        try {
+            header.PacBioBamVersion("3.0.0");
+        } catch (...) {
+        }
+        EXPECT_THROW(Validator::Validate(header), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(header));
+    }
+}
+
+TEST(ValidatorTest, ValidRecord)
+{
+    static const BamHeader mappedHeader{
+        "@HD\tVN:1.5\tSO:coordinate\tpb:3.0.7\n"
+        "@SQ\tSN:ecoliK12_pbi_March2013_2955000_to_2980000\tLN:25000\tM5:"
+        "734d5f3b2859595f4bd87a2fe6b7389b\n"
+        "@RG\tID:db972a04\tPL:PACBIO\tDS:READTYPE=SUBREAD;Ipd:CodecV1=ip;PulseWidth:CodecV1=pw;"
+        "BINDINGKIT=101-717-300;SEQUENCINGKIT=101-644-500;BASECALLERVERSION=5.0.0;FRAMERATEHZ=100."
+        "000000"
+        "\tPU:m64004_190414_193017\tPM:SEQUELII\n"};
+
+    // the untouched 'starter' record, paired with a valid header, must pass validation
+    BamRecord starter(ValidatorTests::validMappedRecord);
+    starter.header_ = mappedHeader;
+
+    ASSERT_NO_THROW(Validator::Validate(starter));
+}
+
+// Sets tag 'tagName' on 'record' to 'tag': edits the tag if the record already has it,
+// otherwise adds it.
+static inline void ModifyTag(BamRecord* record, const std::string& tagName, const Tag& tag)
+{
+    if (!record->Impl().HasTag(tagName)) {
+        record->Impl().AddTag(tagName, tag);
+        return;
+    }
+    record->Impl().EditTag(tagName, tag);
+}
+
+static inline void CheckInvalidTagLength(const std::string& tagName, const Tag& tag)
+{
+    static const BamHeader validUnmappedHeader{
+        "@HD\tVN:1.5\tSO:unknown\tpb:3.0.7\n"
+        "@RG\tID:db972a04\tPL:PACBIO\tDS:READTYPE=SUBREAD;Ipd:CodecV1=ip;PulseWidth:CodecV1=pw;"
+        "BINDINGKIT=101-717-300;SEQUENCINGKIT=101-644-500;BASECALLERVERSION=5.0.0;FRAMERATEHZ=100."
+        "000000\t"
+        "PU:m64004_190414_193017\tPM:SEQUELII\n"};
+    BamRecord record(ValidatorTests::validUnmappedRecord);
+    record.header_ = validUnmappedHeader;
+
+    ModifyTag(&record, tagName, tag);
+
+    EXPECT_THROW(Validator::Validate(record), ValidationException);
+    EXPECT_FALSE(Validator::IsValid(record));
+}
+
+TEST(ValidatorTest, TagDataLengths)
+{
+    static const BamHeader validUnmappedHeader{
+        "@HD\tVN:1.5\tSO:unknown\tpb:3.0.7\n"
+        "@RG\tID:db972a04\tPL:PACBIO\tDS:READTYPE=SUBREAD;Ipd:CodecV1=ip;PulseWidth:CodecV1=pw;"
+        "BINDINGKIT=101-717-300;SEQUENCINGKIT=101-644-500;BASECALLERVERSION=5.0.0;FRAMERATEHZ=100."
+        "000000\t"
+        "PU:m64004_190414_193017\tPM:SEQUELII\n"};
+
+    // make these "variable-length" SEQ/tags too short for the read's stated
+    // queryStart/queryEnd
+
+    {  // SEQ
+        BamRecord record(ValidatorTests::validUnmappedRecord);
+        record.header_ = validUnmappedHeader;
+        record.Impl().SetSequenceAndQualities("AA");
+        EXPECT_THROW(Validator::Validate(record), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(record));
+    }
+
+    CheckInvalidTagLength("dq", QualityValues("@@").Fastq());  // DeletionQV
+    CheckInvalidTagLength("iq", QualityValues("@@").Fastq());  // InsertionQV
+    CheckInvalidTagLength("mq", QualityValues("@@").Fastq());  // MergeQV
+    CheckInvalidTagLength("sq", QualityValues("@@").Fastq());  // SubstitutionQV
+    CheckInvalidTagLength("dt", std::string("AA"));            // DeletionTag
+    CheckInvalidTagLength("st", std::string("AA"));            // SubstitutionTag
+
+    const Frames f{{42, 42, 42}};
+    const auto& frames = f.Data();
+    CheckInvalidTagLength("ip", frames);  // IPD
+
+    // NOTE: disabling "internal" tag checks for now, only checking "standard"
+    //       PacBioBAM tags
+
+    //    const auto& pulses = vector<uint16_t>{42, 42, 42};
+    //    CheckInvalidTagLength("pv", QualityValues("@@").Fastq());  // AltLabelQV
+    //    CheckInvalidTagLength("pq", QualityValues("@@").Fastq());  // LabelQV
+    //    CheckInvalidTagLength("pg", QualityValues("@@").Fastq());  // PulseMergeQv
+    //    CheckInvalidTagLength("pt", string("AA")); // AltLabelTag
+    //    CheckInvalidTagLength("pc", string("AA")); // PulseCall
+    //    CheckInvalidTagLength("pd", frames); // PrePulseFrames
+    //    CheckInvalidTagLength("px", frames); // PulseCallWidth
+    //    CheckInvalidTagLength("pw", frames); // PulseWidth
+    //    CheckInvalidTagLength("pa", pulses); // Pkmean
+    //    CheckInvalidTagLength("ps", pulses); // Pkmean2
+    //    CheckInvalidTagLength("pm", pulses); // Pkmid
+    //    CheckInvalidTagLength("pi", pulses); // Pkmid2
+}
+
+TEST(ValidatorTest, TagDataValues)
+{
+    static const BamHeader validMappedHeader{
+        "@HD\tVN:1.5\tSO:coordinate\tpb:3.0.7\n"
+        "@SQ\tSN:ecoliK12_pbi_March2013_2955000_to_2980000\tLN:25000\tM5:"
+        "734d5f3b2859595f4bd87a2fe6b7389b\n"
+        "@RG\tID:db972a04\tPL:PACBIO\tDS:READTYPE=SUBREAD;Ipd:CodecV1=ip;PulseWidth:CodecV1=pw;"
+        "BINDINGKIT=101-717-300;SEQUENCINGKIT=101-644-500;BASECALLERVERSION=5.0.0;FRAMERATEHZ=100."
+        "000000"
+        "\tPU:m64004_190414_193017\tPM:SEQUELII\n"};
+
+    {  // missing qe
+        BamRecord record(ValidatorTests::validMappedRecord);
+        record.header_ = validMappedHeader;
+        record.Impl().RemoveTag("qe");
+        EXPECT_THROW(Validator::Validate(record), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(record));
+    }
+    {  // missing qs
+        BamRecord record(ValidatorTests::validMappedRecord);
+        record.header_ = validMappedHeader;
+        record.Impl().RemoveTag("qs");
+        EXPECT_THROW(Validator::Validate(record), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(record));
+    }
+    {  // queryStart should be < queryEnd
+        BamRecord record(ValidatorTests::validMappedRecord);
+        record.header_ = validMappedHeader;
+        record.QueryStart(10);
+        EXPECT_THROW(Validator::Validate(record), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(record));
+    }
+    {  // missing zm
+        BamRecord record(ValidatorTests::validMappedRecord);
+        record.header_ = validMappedHeader;
+        record.Impl().RemoveTag("zm");
+        EXPECT_THROW(Validator::Validate(record), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(record));
+    }
+    {  // missing np
+        BamRecord record(ValidatorTests::validMappedRecord);
+        record.header_ = validMappedHeader;
+        record.Impl().RemoveTag("np");
+        EXPECT_THROW(Validator::Validate(record), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(record));
+    }
+    {  // numPasses for SUBREAD type records should be 1
+        BamRecord record(ValidatorTests::validMappedRecord);
+        record.header_ = validMappedHeader;
+        record.NumPasses(42);
+        EXPECT_THROW(Validator::Validate(record), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(record));
+    }
+    {  // missing sn
+        BamRecord record(ValidatorTests::validMappedRecord);
+        record.header_ = validMappedHeader;
+        record.Impl().RemoveTag("sn");
+        EXPECT_THROW(Validator::Validate(record), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(record));
+    }
+}
+
+TEST(ValidatorTest, MappedRecords)
+{
+    static const BamHeader validMappedHeader{
+        "@HD\tVN:1.5\tSO:coordinate\tpb:3.0.7\n"
+        "@SQ\tSN:ecoliK12_pbi_March2013_2955000_to_2980000\tLN:25000\tM5:"
+        "734d5f3b2859595f4bd87a2fe6b7389b\n"
+        "@RG\tID:db972a04\tPL:PACBIO\tDS:READTYPE=SUBREAD;Ipd:CodecV1=ip;PulseWidth:CodecV1=pw;"
+        "BINDINGKIT=101-717-300;SEQUENCINGKIT=101-644-500;BASECALLERVERSION=5.0.0;FRAMERATEHZ=100."
+        "000000"
+        "\tPU:m64004_190414_193017\tPM:SEQUELII\n"};
+
+    {  // mapped record should have valid refID
+        BamRecord record(ValidatorTests::validMappedRecord);
+        record.header_ = validMappedHeader;
+        record.Impl().ReferenceId(-1);
+
+        EXPECT_THROW(Validator::Validate(record), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(record));
+    }
+    {  // mapped record should have valid position
+        BamRecord record(ValidatorTests::validMappedRecord);
+        record.header_ = validMappedHeader;
+        record.Impl().Position(-1);
+
+        EXPECT_THROW(Validator::Validate(record), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(record));
+    }
+}
+
+TEST(ValidatorTest, UnmappedRecords)
+{
+    static const BamHeader validUnmappedHeader{
+        "@HD\tVN:1.5\tSO:unknown\tpb:3.0.7\n"
+        "@RG\tID:db972a04\tPL:PACBIO\tDS:READTYPE=SUBREAD;Ipd:CodecV1=ip;PulseWidth:CodecV1=pw;"
+        "BINDINGKIT=101-717-300;SEQUENCINGKIT=101-644-500;BASECALLERVERSION=5.0.0;FRAMERATEHZ=100."
+        "000000\t"
+        "PU:m64004_190414_193017\tPM:SEQUELII\n"};
+
+    {  // unmapped should have no refID
+        BamRecord record(ValidatorTests::validUnmappedRecord);
+        record.header_ = validUnmappedHeader;
+        record.Impl().ReferenceId(0);
+
+        EXPECT_THROW(Validator::Validate(record), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(record));
+    }
+    {  // unmapped should have no position
+        BamRecord record(ValidatorTests::validUnmappedRecord);
+        record.header_ = validUnmappedHeader;
+        record.Impl().Position(42);
+
+        EXPECT_THROW(Validator::Validate(record), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(record));
+    }
+}
diff --git a/tests/src/test_VcfFile.cpp b/tests/src/test_VcfFile.cpp

new file mode 100644 (file)

index 0000000..793fb69
--- /dev/null
+++ b/tests/src/test_VcfFile.cpp
@@ -0,0 +1,44 @@
+// Author: Derek Barnett
+
+#include <gtest/gtest.h>
+
+#include <pbbam/vcf/VcfFile.h>
+#include <pbbam/vcf/VcfFormat.h>
+
+#include "PbbamTestData.h"
+
+using VcfFile = PacBio::VCF::VcfFile;
+using VcfFormat = PacBio::VCF::VcfFormat;
+
+// Shared fixtures for the VcfFile tests.
+namespace VcfFileTests {
+
+// Header text expected when the header of the VcfFn input is run through
+// VcfFormat::FormattedHeader. Note: no trailing newline after the #CHROM line.
+static const std::string BasicHeaderText{
+    "##fileformat=VCFv4.2\n"
+    "##fileDate=20180509\n"
+    "##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description=\"Imprecise structural variant\">\n"
+    "##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">\n"
+    "##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the structural variant "
+    "described in this record\">\n"
+    "##INFO=<ID=SVLEN,Number=.,Type=Integer,Description=\"Difference in length between REF and ALT "
+    "alleles\">\n"
+    "##INFO=<ID=SVANN,Number=.,Type=String,Description=\"Repeat annotation of structural "
+    "variant\">\n"
+    "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n"
+    "##FORMAT=<ID=AD,Number=1,Type=Integer,Description=\"Per-sample read depth of this structural "
+    "variant\">\n"
+    "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Read depth at this position for this "
+    "sample\">\n"
+    "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tUnnamedSample"};
+
+// Input VCF file, located under the test data directory.
+static const std::string VcfFn{PacBio::BAM::PbbamTestsConfig::Data_Dir +
+                               "/vcf/structural_variants.vcf"};
+
+}  // namespace VcfFileTests
+
+TEST(VCF_File, initializes_header_from_input_file)
+{
+    // the header obtained from the opened file, once formatted, must match the expected text
+    const VcfFile vcf{VcfFileTests::VcfFn};
+    EXPECT_EQ(VcfFileTests::BasicHeaderText, VcfFormat::FormattedHeader(vcf.Header()));
+}
diff --git a/tests/src/test_VcfFormat.cpp b/tests/src/test_VcfFormat.cpp

new file mode 100644 (file)

index 0000000..632594b
--- /dev/null
+++ b/tests/src/test_VcfFormat.cpp
@@ -0,0 +1,421 @@
+// Author: Derek Barnett
+
+#include <gtest/gtest.h>
+#include <pbbam/vcf/VcfFormat.h>
+#include <pbbam/vcf/VcfHeader.h>
+#include <pbbam/vcf/VcfVariant.h>
+
+#include "PbbamTestData.h"
+
+using ContigDefinition = PacBio::VCF::ContigDefinition;
+using FilterDefinition = PacBio::VCF::FilterDefinition;
+using FormatDefinition = PacBio::VCF::FormatDefinition;
+using GeneralDefinition = PacBio::VCF::GeneralDefinition;
+using InfoDefinition = PacBio::VCF::InfoDefinition;
+using Sample = PacBio::VCF::Sample;
+using VcfFormat = PacBio::VCF::VcfFormat;
+using VcfHeader = PacBio::VCF::VcfHeader;
+using VcfVariant = PacBio::VCF::VcfVariant;
+
+// Shared fixtures for the VcfFormat tests.
+namespace VcfFormatTests {
+
+// Complete header text (including one ##contig line) used for the header formatting
+// round-trip. Note: no trailing newline after the #CHROM line.
+static const std::string BasicHeaderText{
+    "##fileformat=VCFv4.2\n"
+    "##fileDate=20180509\n"
+    "##contig=<ID=ctg1,length=4200,assembly=foo,md5=dead123beef>\n"
+    "##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description=\"Imprecise structural variant\">\n"
+    "##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">\n"
+    "##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the structural variant "
+    "described in this record\">\n"
+    "##INFO=<ID=SVLEN,Number=.,Type=Integer,Description=\"Difference in length between REF and ALT "
+    "alleles\">\n"
+    "##INFO=<ID=SVANN,Number=.,Type=String,Description=\"Repeat annotation of structural "
+    "variant\">\n"
+    "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n"
+    "##FORMAT=<ID=AD,Number=1,Type=Integer,Description=\"Per-sample read depth of this structural "
+    "variant\">\n"
+    "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Read depth at this position for this "
+    "sample\">\n"
+    "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tUnnamedSample"};
+
+// does not have ##contig line(s) in file
+// (identical to BasicHeaderText apart from the missing ##contig line)
+static const std::string FileHeaderText{
+    "##fileformat=VCFv4.2\n"
+    "##fileDate=20180509\n"
+    "##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description=\"Imprecise structural variant\">\n"
+    "##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">\n"
+    "##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the structural variant "
+    "described in this record\">\n"
+    "##INFO=<ID=SVLEN,Number=.,Type=Integer,Description=\"Difference in length between REF and ALT "
+    "alleles\">\n"
+    "##INFO=<ID=SVANN,Number=.,Type=String,Description=\"Repeat annotation of structural "
+    "variant\">\n"
+    "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n"
+    "##FORMAT=<ID=AD,Number=1,Type=Integer,Description=\"Per-sample read depth of this structural "
+    "variant\">\n"
+    "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Read depth at this position for this "
+    "sample\">\n"
+    "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tUnnamedSample"};
+
+// One tab-separated variant line: 8 fixed columns, then a FORMAT column and one sample column.
+static const std::string BasicVariantText{
+    "chrXVI\t660831\tpbsv.INS.21\tC\tCAAAGGAATGGTAAAGATGGGGGGTCAACGGACAAGGGAAAGGATCCATGGGGGCA\t."
+    "\tPASS"
+    "\tIMPRECISE;SVTYPE=INS;END=660831;SVLEN=55;MULTI=1,2,3\tGT:AD:DP:AC\t0/1:2:5:1,2"};
+
+// Input VCF file, located under the test data directory.
+static const std::string VcfFn{PacBio::BAM::PbbamTestsConfig::Data_Dir +
+                               "/vcf/structural_variants.vcf"};
+
+}  // namespace VcfFormatTests
+
+TEST(VCF_Format, provides_current_version)
+{
+    // converted to std::string first, so the comparison is by content
+    const std::string reported(VcfFormat::CurrentVersion());
+    EXPECT_EQ("VCFv4.2", reported);
+}
+
+//## ----------------------------------------------------------------- ##
+//
+//              HEADER FORMATTING
+//
+//## ----------------------------------------------------------------- ##
+
+TEST(VCF_Format, can_format_contig_definition)
+{
+    // attributes are expected after the ID, in the order they were supplied
+    const ContigDefinition ctg{"ctg1",
+                               {{"length", "4200"}, {"assembly", "foo"}, {"md5", "dead123beef"}}};
+    EXPECT_EQ("##contig=<ID=ctg1,length=4200,assembly=foo,md5=dead123beef>",
+              VcfFormat::FormattedContigDefinition(ctg));
+}
+
+TEST(VCF_Format, can_format_filter_definition)
+{
+    // the description is expected to come out double-quoted
+    const FilterDefinition filter{"FILTER1", "Filter1"};
+    EXPECT_EQ("##FILTER=<ID=FILTER1,Description=\"Filter1\">",
+              VcfFormat::FormattedFilterDefinition(filter));
+}
+
+TEST(VCF_Format, can_format_format_definition)
+{
+    // constructor arguments, in order: ID, Number, Type, Description
+    const FormatDefinition genotype{"GT", "1", "String", "Genotype"};
+    EXPECT_EQ("##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">",
+              VcfFormat::FormattedFormatDefinition(genotype));
+}
+
+TEST(VCF_Format, can_format_general_header_definition)
+{
+    // a general definition is expected to be written as a plain "##key=value" line
+    const GeneralDefinition phasing{"phasing", "partial"};
+    EXPECT_EQ("##phasing=partial", VcfFormat::FormattedGeneralDefinition(phasing));
+}
+
+TEST(VCF_Format, can_format_info_definition)
+{
+    // only the four leading fields are supplied: ID, Number, Type, Description
+    const InfoDefinition imprecise{"IMPRECISE", "0", "Flag", "Imprecise structural variant"};
+    const auto formatted = VcfFormat::FormattedInfoDefinition(imprecise);
+
+    EXPECT_EQ(
+        "##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description=\"Imprecise structural variant\">",
+        formatted);
+}
+
+TEST(VCF_Format, can_format_info_definition_with_optional_fields)
+{
+    {  // with Source
+        const InfoDefinition def{"IMPRECISE", "0", "Flag", "Imprecise structural variant",
+                                 "source1"};
+        const auto text = VcfFormat::FormattedInfoDefinition(def);
+        EXPECT_EQ(
+            "##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description=\"Imprecise structural "
+            "variant\",Source=\"source1\">",
+            text);
+    }
+
+    {  // with Version
+        const InfoDefinition def{"IMPRECISE", "0",       "Flag", "Imprecise structural variant",
+                                 "",          "version1"};
+        const auto text = VcfFormat::FormattedInfoDefinition(def);
+        EXPECT_EQ(
+            "##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description=\"Imprecise structural "
+            "variant\",Version=\"version1\">",
+            text);
+    }
+    {  // with Source & Version
+        const InfoDefinition def{"IMPRECISE", "0",       "Flag", "Imprecise structural variant",
+                                 "source1",   "version1"};
+        const auto text = VcfFormat::FormattedInfoDefinition(def);
+        EXPECT_EQ(
+            "##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description=\"Imprecise structural "
+            "variant\",Source=\"source1\",Version=\"version1\">",
+            text);
+    }
+}
+
+TEST(VCF_Format, can_format_basic_header)
+{
+    // round trip: a header built from the text must format back to exactly that text
+    const VcfHeader parsed{VcfFormatTests::BasicHeaderText};
+    EXPECT_EQ(VcfFormatTests::BasicHeaderText, VcfFormat::FormattedHeader(parsed));
+}
+
+TEST(VCF_Format, format_basic_header_with_only_filedate)
+{
+    // only the file date is set; formatting just has to complete without throwing
+    VcfHeader dateOnly;
+    dateOnly.FileDate("1770704");
+
+    std::string formatted;
+    EXPECT_NO_THROW(formatted = VcfFormat::FormattedHeader(dateOnly));
+}
+
+TEST(VCF_Format, format_basic_header_with_only_version)
+{
+    // only the version is set; formatting just has to complete without throwing
+    VcfHeader versionOnly;
+    versionOnly.Version("3.14");
+
+    std::string formatted;
+    EXPECT_NO_THROW(formatted = VcfFormat::FormattedHeader(versionOnly));
+}
+
+//## ----------------------------------------------------------------- ##
+//
+//              HEADER PARSING
+//
+//## ----------------------------------------------------------------- ##
+
+TEST(VCF_Format, can_parse_general_header_definition)
+{
+    // "##<id>=<text>" should split into its ID and text parts
+    const auto parsed = VcfFormat::ParsedGeneralDefinition("##phasing=partial");
+
+    EXPECT_EQ("phasing", parsed.Id());
+    EXPECT_EQ("partial", parsed.Text());
+}
+
+TEST(VCF_Format, parsing_general_header_definition_throws_on_empty_string)
+{
+    // an empty line is not a valid header definition
+    EXPECT_THROW(VcfFormat::ParsedGeneralDefinition(""),
+                 std::runtime_error);
+}
+
+TEST(VCF_Format, parsing_general_header_definition_throws_on_non_vcf_input)
+{
+    // no leading "##" at all
+    EXPECT_THROW(VcfFormat::ParsedGeneralDefinition("not_vcf_header_line"), std::runtime_error);
+
+    // only a single leading '#'
+    EXPECT_THROW(VcfFormat::ParsedGeneralDefinition("#line=not_vcf_header_line"),
+                 std::runtime_error);
+
+    // leading "##", but no '=' between key and value
+    EXPECT_THROW(VcfFormat::ParsedGeneralDefinition("##line,not_vcf_header_line"),
+                 std::runtime_error);
+}
+
+TEST(VCF_Format, can_parse_contig_definition_with_id_only)
+{
+    // A contig line carrying only an ID yields an empty attribute list.
+    const std::string line{"##contig=<ID=ctg1>"};
+    const auto parsed = VcfFormat::ParsedContigDefinition(line);
+    EXPECT_TRUE(parsed.Attributes().empty());
+    EXPECT_EQ("ctg1", parsed.Id());
+}
+
+TEST(VCF_Format, can_parse_contig_definition_with_attributes)
+{
+    // Key/value pairs after the ID are exposed as attributes, in input order.
+    const std::string line{"##contig=<ID=ctg1,assembly=foo,length=3>"};
+    const auto parsed = VcfFormat::ParsedContigDefinition(line);
+    EXPECT_EQ("ctg1", parsed.Id());
+
+    const auto& attrs = parsed.Attributes();
+    ASSERT_EQ(2, attrs.size());
+
+    EXPECT_EQ("assembly", attrs.at(0).first);
+    EXPECT_EQ("foo", attrs.at(0).second);
+
+    EXPECT_EQ("length", attrs.at(1).first);
+    EXPECT_EQ("3", attrs.at(1).second);
+}
+
+TEST(VCF_Format, parsing_contig_header_definition_throws_on_malformed_contig_line)
+{
+    // The "##contig=<" prefix is already checked by internal code, so only lines
+    // that are broken after that prefix are exercised here.
+    const std::string unterminated{"##contig=<foo"};
+    const std::string emptyId{"##contig=<ID=,>"};
+
+    EXPECT_THROW(VcfFormat::ParsedContigDefinition(unterminated), std::runtime_error);
+    EXPECT_THROW(VcfFormat::ParsedContigDefinition(emptyId), std::runtime_error);
+}
+
+TEST(VCF_Format, can_parse_filter_definition)
+{
+    // ID and (unquoted) Description are extracted from a FILTER line.
+    const std::string line{"##FILTER=<ID=FILTER1,Description=\"Filter1\">\n"};
+    const auto parsed = VcfFormat::ParsedFilterDefinition(line);
+    EXPECT_EQ("FILTER1", parsed.Id());
+    EXPECT_EQ("Filter1", parsed.Description());
+}
+
+TEST(VCF_Format, parsing_filter_definition_throws_on_malformed_filter_line)
+{
+    // The "##FILTER=<" prefix is already checked by internal code, so only lines
+    // that are broken after that prefix are exercised here.
+    const std::string unterminated{"##FILTER=<foo"};
+    const std::string emptyId{"##FILTER=<ID=,>"};
+
+    EXPECT_THROW(VcfFormat::ParsedFilterDefinition(unterminated), std::runtime_error);
+    EXPECT_THROW(VcfFormat::ParsedFilterDefinition(emptyId), std::runtime_error);
+}
+
+TEST(VCF_Format, can_parse_format_definition)
+{
+    // ID, Number, Type and (unquoted) Description are extracted from a FORMAT line.
+    const std::string line{"##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n"};
+    const auto parsed = VcfFormat::ParsedFormatDefinition(line);
+    EXPECT_EQ("GT", parsed.Id());
+    EXPECT_EQ("1", parsed.Number());
+    EXPECT_EQ("String", parsed.Type());
+    EXPECT_EQ("Genotype", parsed.Description());
+}
+
+TEST(VCF_Format, parsing_format_definition_throws_on_malformed_filter_line)
+{
+    // NOTE: despite "filter" in the test name, the inputs below are FORMAT lines.
+    // The "##FORMAT=<" prefix is already checked by internal code, so only lines
+    // that are broken after that prefix are exercised here.
+    const std::string unterminated{"##FORMAT=<foo"};
+    const std::string emptyId{"##FORMAT=<ID=,>"};
+
+    EXPECT_THROW(VcfFormat::ParsedFormatDefinition(unterminated), std::runtime_error);
+    EXPECT_THROW(VcfFormat::ParsedFormatDefinition(emptyId), std::runtime_error);
+}
+
+TEST(VCF_Format, can_parse_info_definition)
+{
+    // The line carries the four required fields only, so the optional Source and
+    // Version must stay unset.
+    const std::string line{
+        "##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description=\"Imprecise structural variant\">\n"};
+    const auto parsed = VcfFormat::ParsedInfoDefinition(line);
+
+    EXPECT_EQ("IMPRECISE", parsed.Id());
+    EXPECT_EQ("0", parsed.Number());
+    EXPECT_EQ("Flag", parsed.Type());
+    EXPECT_EQ("Imprecise structural variant", parsed.Description());
+
+    EXPECT_FALSE(parsed.Source().is_initialized());
+    EXPECT_FALSE(parsed.Version().is_initialized());
+}
+
+TEST(VCF_Format, parsing_info_definition_throws_on_malformed_info_line)
+{
+    // The "##INFO=<" prefix is already checked by internal code, so only lines
+    // that are broken after that prefix are exercised here.
+    const std::string unterminated{"##INFO=<foo"};
+    const std::string emptyId{"##INFO=<ID=,>"};
+
+    EXPECT_THROW(VcfFormat::ParsedInfoDefinition(unterminated), std::runtime_error);
+    EXPECT_THROW(VcfFormat::ParsedInfoDefinition(emptyId), std::runtime_error);
+}
+
+TEST(VCF_Format, can_create_header_from_text)
+{
+    const VcfHeader hdr{VcfFormatTests::BasicHeaderText};
+
+    EXPECT_EQ("VCFv4.2", hdr.Version());
+    EXPECT_EQ("20180509", hdr.FileDate());
+
+    const auto& infos = hdr.InfoDefinitions();
+    ASSERT_EQ(5, infos.size());
+    EXPECT_EQ("IMPRECISE", infos.at(0).Id());
+    EXPECT_EQ("SVTYPE", infos.at(1).Id());
+    EXPECT_EQ("END", infos.at(2).Id());
+    EXPECT_EQ("SVLEN", infos.at(3).Id());
+    EXPECT_EQ("SVANN", infos.at(4).Id());
+
+    const auto& contigs = hdr.ContigDefinitions();
+    ASSERT_EQ(1, contigs.size());
+    EXPECT_EQ("ctg1", contigs.at(0).Id());
+
+    ASSERT_EQ(3, contigs.at(0).Attributes().size());
+    EXPECT_EQ("length", contigs.at(0).Attributes().at(0).first);
+    EXPECT_EQ("assembly", contigs.at(0).Attributes().at(1).first);
+    EXPECT_EQ("md5", contigs.at(0).Attributes().at(2).first);
+
+    const auto& filters = hdr.FilterDefinitions();
+    ASSERT_EQ(0, filters.size());
+
+    const auto& formats = hdr.FormatDefinitions();
+    ASSERT_EQ(3, formats.size());
+    EXPECT_EQ("GT", formats.at(0).Id());
+    EXPECT_EQ("AD", formats.at(1).Id());
+    EXPECT_EQ("DP", formats.at(2).Id());
+
+    const auto& samples = hdr.Samples();
+    ASSERT_EQ(1, samples.size());
+    EXPECT_EQ("UnnamedSample", samples[0]);
+}
+
+TEST(VCF_Format, header_parsing_throws_on_missing_fileformat_line)
+{
+    // Header text that never declares "##fileformat".
+    const std::string headerText{
+        "##fileDate=20180509\n"
+        "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tUnnamedSample\n"};
+
+    EXPECT_THROW({ VcfHeader header(headerText); }, std::runtime_error);
+}
+
+TEST(VCF_Format, header_parsing_throws_on_non_vcf_header_line)
+{
+    // A stray line without a leading '#' sits between otherwise valid header lines.
+    const std::string headerText{
+        "##fileformat=VCFv4.2\n"
+        " --- how did I get in here?? --- \n"
+        "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tUnnamedSample\n"};
+
+    EXPECT_THROW({ VcfHeader header(headerText); }, std::runtime_error);
+}
+
+TEST(VCF_Format, can_parse_header_from_stream)
+{
+    // Round trip through an in-memory stream: the formatted result must equal the input.
+    const auto& referenceText = VcfFormatTests::BasicHeaderText;
+    std::istringstream input(referenceText);
+    const auto parsed = VcfFormat::HeaderFromStream(input);
+    EXPECT_EQ(referenceText, VcfFormat::FormattedHeader(parsed));
+}
+
+TEST(VCF_Format, can_parse_header_from_file)
+{
+    // The header read from the test VCF file must format back to the expected text.
+    const std::string path{VcfFormatTests::VcfFn};
+    const auto formatted = VcfFormat::FormattedHeader(VcfFormat::HeaderFromFile(path));
+    EXPECT_EQ(VcfFormatTests::FileHeaderText, formatted);
+}
+
+//## ----------------------------------------------------------------- ##
+//
+//              VARIANT FORMATTING
+//
+//## ----------------------------------------------------------------- ##
+
+TEST(VCF_Format, can_format_basic_variant)
+{
+    // Round trip: parse the reference variant line, then format it back.
+    const auto& referenceText = VcfFormatTests::BasicVariantText;
+    const VcfVariant parsed = VcfFormat::ParsedVariant(referenceText);
+    EXPECT_EQ(referenceText, VcfFormat::FormattedVariant(parsed));
+}
+
+//## ----------------------------------------------------------------- ##
+//
+//              VARIANT PARSING
+//
+//## ----------------------------------------------------------------- ##
+
+TEST(VCF_Format, can_create_variant_from_text)
+{
+    const VcfVariant var = VcfFormat::ParsedVariant(VcfFormatTests::BasicVariantText);
+
+    // CHROM POS ID REF ALT REF QUAL FILTER
+    EXPECT_EQ("chrXVI", var.Chrom());
+    EXPECT_EQ(660831, var.Position());
+    EXPECT_EQ("pbsv.INS.21", var.Id());
+    EXPECT_EQ("C", var.RefAllele());
+    EXPECT_EQ("CAAAGGAATGGTAAAGATGGGGGGTCAACGGACAAGGGAAAGGATCCATGGGGGCA", var.AltAllele());
+    EXPECT_TRUE(var.IsQualityMissing());
+    EXPECT_EQ("PASS", var.Filter());
+
+    // INFO
+    const auto& infoFields = var.InfoFields();
+    ASSERT_EQ(5, infoFields.size());
+    EXPECT_EQ("IMPRECISE", infoFields.at(0).id);
+    EXPECT_EQ("SVTYPE", infoFields.at(1).id);
+    EXPECT_EQ("END", infoFields.at(2).id);
+    EXPECT_EQ("SVLEN", infoFields.at(3).id);
+    EXPECT_EQ("MULTI", infoFields.at(4).id);
+
+    // GENOTYPES
+    const auto& ids = var.GenotypeIds();
+    ASSERT_EQ(4, ids.size());
+    EXPECT_EQ("GT", ids.at(0));
+    EXPECT_EQ("AD", ids.at(1));
+    EXPECT_EQ("DP", ids.at(2));
+    EXPECT_EQ("AC", ids.at(3));
+
+    const auto& genotypes = var.Genotypes();
+    ASSERT_EQ(1, genotypes.size());
+
+    const auto& sampleGenotype = genotypes.at(0);
+    ASSERT_EQ(4, sampleGenotype.data.size());
+    EXPECT_EQ("0/1", sampleGenotype.data.at(0).value.get());
+    EXPECT_EQ("2", sampleGenotype.data.at(1).value.get());
+    EXPECT_EQ("5", sampleGenotype.data.at(2).value.get());
+    const auto& acData = sampleGenotype.data.at(3);
+    ASSERT_EQ(2, acData.values->size());
+    EXPECT_EQ("1", acData.values->at(0));
+    EXPECT_EQ("2", acData.values->at(1));
+
+    //    ASSERT_TRUE(sampleGenotype.values.is_initialized());
+}
diff --git a/tests/src/test_VcfHeader.cpp b/tests/src/test_VcfHeader.cpp

new file mode 100644 (file)

index 0000000..2f17f67
--- /dev/null
+++ b/tests/src/test_VcfHeader.cpp
@@ -0,0 +1,189 @@
+// Author: Derek Barnett
+
+#include <iostream>
+
+#include <gtest/gtest.h>
+#include <pbbam/vcf/VcfHeader.h>
+
+using ContigDefinition = PacBio::VCF::ContigDefinition;
+using FilterDefinition = PacBio::VCF::FilterDefinition;
+using FormatDefinition = PacBio::VCF::FormatDefinition;
+using GeneralDefinition = PacBio::VCF::GeneralDefinition;
+using InfoDefinition = PacBio::VCF::InfoDefinition;
+using VcfHeader = PacBio::VCF::VcfHeader;
+
+namespace VcfHeaderTests {
+
+// Reference header text used by the VCF_Header tests: fileformat and fileDate
+// lines, one contig, five INFO and three FORMAT definitions, and a column line
+// naming a single sample.
+static const std::string BasicHeaderText =
+    "##fileformat=VCFv4.2\n"
+    "##fileDate=20180509\n"
+    "##contig=<ID=ctg1,length=4200,assembly=foo,md5=dead123beef>\n"
+    "##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description=\"Imprecise structural variant\">\n"
+    "##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">\n"
+    "##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the structural variant "
+    "described in this record\">\n"
+    "##INFO=<ID=SVLEN,Number=.,Type=Integer,Description=\"Difference in length between REF and ALT "
+    "alleles\">\n"
+    "##INFO=<ID=SVANN,Number=.,Type=String,Description=\"Repeat annotation of structural "
+    "variant\">\n"
+    "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n"
+    "##FORMAT=<ID=AD,Number=1,Type=Integer,Description=\"Per-sample read depth of this structural "
+    "variant\">\n"
+    "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Read depth at this position for this "
+    "sample\">\n"
+    "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tUnnamedSample\n";
+
+}  // namespace VcfHeaderTests
+
+TEST(VCF_GeneralDefinition, throws_on_missing_required_fields)
+{
+    // Blank out one constructor argument at a time; each case must throw.
+    const std::string validId{"id"};
+    const std::string validText{"desc"};
+
+    EXPECT_THROW(GeneralDefinition("", validText), std::runtime_error);  // empty id
+    EXPECT_THROW(GeneralDefinition(validId, ""), std::runtime_error);    // empty text
+}
+
+TEST(VCF_ContigDefinition, throws_on_missing_required_fields)
+{
+    // Constructing a contig definition with an empty id must throw.
+    EXPECT_THROW({ ContigDefinition contig(""); }, std::runtime_error);
+}
+
+TEST(VCF_ContigDefinition, can_edit_and_query_attributes)
+{
+    ContigDefinition contig{"id"};
+
+    EXPECT_TRUE(contig.Attributes().empty());
+
+    const std::vector<std::pair<std::string, std::string>> attributes{{"assembly", "foo"},
+                                                                      {"length", "42"}};
+    contig.Attributes(attributes);
+    ASSERT_EQ(2, contig.Attributes().size());
+    EXPECT_EQ("foo", contig.Attributes().at(0).second);
+    EXPECT_EQ("42", contig.Attributes().at(1).second);
+
+    contig.AddAttribute({"md5", "dead123beef"});
+    ASSERT_EQ(3, contig.Attributes().size());
+    EXPECT_EQ("dead123beef", contig.Attributes().at(2).second);
+}
+
+TEST(VCF_FilterDefinition, throws_on_missing_required_fields)
+{
+    // Blank out one constructor argument at a time; each case must throw.
+    const std::string validId{"id"};
+    const std::string validDescription{"desc"};
+
+    EXPECT_THROW(FilterDefinition("", validDescription), std::runtime_error);  // empty id
+    EXPECT_THROW(FilterDefinition(validId, ""), std::runtime_error);           // empty description
+}
+
+TEST(VCF_InfoDefinition, throws_on_missing_required_fields)
+{
+    // Blank out one of the four constructor arguments at a time; each case must throw.
+    const std::string validId{"id"};
+    const std::string validNumber{"num"};
+    const std::string validType{"type"};
+    const std::string validDescription{"desc"};
+
+    // empty id
+    EXPECT_THROW(InfoDefinition("", validNumber, validType, validDescription), std::runtime_error);
+    // empty number
+    EXPECT_THROW(InfoDefinition(validId, "", validType, validDescription), std::runtime_error);
+    // empty type
+    EXPECT_THROW(InfoDefinition(validId, validNumber, "", validDescription), std::runtime_error);
+    // empty description
+    EXPECT_THROW(InfoDefinition(validId, validNumber, validType, ""), std::runtime_error);
+}
+
+TEST(VCF_InfoDefinition, missing_optional_fields_is_not_error)
+{
+    // Source and Version are unset after four-argument construction ...
+    InfoDefinition definition{"id", "num", "type", "description"};
+    EXPECT_FALSE(definition.Source().is_initialized());
+    EXPECT_FALSE(definition.Version().is_initialized());
+
+    // ... and report as set once a value has been assigned.
+    definition.Source("source");
+    definition.Version("version");
+    EXPECT_TRUE(definition.Source().is_initialized());
+    EXPECT_TRUE(definition.Version().is_initialized());
+}
+
+TEST(VCF_Header, defaults_to_current_version)
+{
+    // A default-constructed header reports version "VCFv4.2".
+    VcfHeader defaultHeader;
+    EXPECT_EQ("VCFv4.2", defaultHeader.Version());
+}
+
+TEST(VCF_Header, can_lookup_contig_defnition_by_id)
+{
+    // Looking up "ctg1" returns the contig with its three attributes in text order.
+    const VcfHeader header{VcfHeaderTests::BasicHeaderText};
+    const auto& attrs = header.ContigDefinition("ctg1").Attributes();
+    ASSERT_EQ(3, attrs.size());
+    EXPECT_EQ("length", attrs.at(0).first);
+    EXPECT_EQ("assembly", attrs.at(1).first);
+    EXPECT_EQ("md5", attrs.at(2).first);
+}
+
+TEST(VCF_Header, can_lookup_format_definition_by_id)
+{
+    // The FORMAT definition found under "GT" carries that same id.
+    const VcfHeader header{VcfHeaderTests::BasicHeaderText};
+    EXPECT_EQ("GT", header.FormatDefinition("GT").Id());
+}
+
+TEST(VCF_Header, can_lookup_general_definition_by_id)
+{
+    // The general definition found under "fileformat" carries that same id.
+    const VcfHeader header{VcfHeaderTests::BasicHeaderText};
+    EXPECT_EQ("fileformat", header.GeneralDefinition("fileformat").Id());
+}
+
+TEST(VCF_Header, can_lookup_info_definition_by_id)
+{
+    // The INFO definition found under "IMPRECISE" carries that same id.
+    const VcfHeader header{VcfHeaderTests::BasicHeaderText};
+    EXPECT_EQ("IMPRECISE", header.InfoDefinition("IMPRECISE").Id());
+}
+
+TEST(VCF_Header, can_lookup_sample)
+{
+    // Name -> index -> name must round-trip for the sample in the column line.
+    const VcfHeader header{VcfHeaderTests::BasicHeaderText};
+    const auto sampleIndex = header.IndexOfSample("UnnamedSample");
+    EXPECT_EQ("UnnamedSample", header.SampleAt(sampleIndex));
+}
+
+TEST(VCF_Header, add_duplicate_format_replaces_existing_definition)
+{
+    VcfHeader header{VcfHeaderTests::BasicHeaderText};
+
+    // before: "GT" carries the description from the reference text
+    EXPECT_EQ("Genotype", header.FormatDefinition("GT").Description());
+
+    // add a definition that reuses the existing id
+    const FormatDefinition replacement{"GT", "num", "type", "newDescription"};
+    header.AddFormatDefinition(replacement);
+
+    // after: the lookup returns the new description
+    EXPECT_EQ("newDescription", header.FormatDefinition("GT").Description());
+
+    // the definition count and the other entries are unchanged
+    const auto& allFormats = header.FormatDefinitions();
+    ASSERT_EQ(3, allFormats.size());
+    EXPECT_EQ("AD", allFormats.at(1).Id());
+    EXPECT_EQ("DP", allFormats.at(2).Id());
+}
+
+TEST(VCF_Header, add_duplicate_info_replaces_existing_definition)
+{
+    VcfHeader hdr{VcfHeaderTests::BasicHeaderText};
+    const auto initialInfo = hdr.InfoDefinition("IMPRECISE");
+    EXPECT_EQ("Imprecise structural variant", initialInfo.Description());
+
+    const InfoDefinition newInfo{"IMPRECISE", "num", "type", "newInfo"};
+    hdr.AddInfoDefinition(newInfo);
+
+    const auto nowInfo = hdr.InfoDefinition("IMPRECISE");
+    EXPECT_EQ("newInfo", nowInfo.Description());
+
+    // rest of defs unchanged
+    const auto& infoDefs = hdr.InfoDefinitions();
+    ASSERT_EQ(5, infoDefs.size());
+    EXPECT_EQ("SVTYPE", infoDefs.at(1).Id());
+    EXPECT_EQ("END", infoDefs.at(2).Id());
+    EXPECT_EQ("SVLEN", infoDefs.at(3).Id());
+    EXPECT_EQ("SVANN", infoDefs.at(4).Id());
+}
diff --git a/tests/src/test_VcfQuery.cpp b/tests/src/test_VcfQuery.cpp

new file mode 100644 (file)

index 0000000..8898add
--- /dev/null
+++ b/tests/src/test_VcfQuery.cpp
@@ -0,0 +1,49 @@
+// Author: Derek Barnett
+
+#include <string>
+#include <vector>
+
+#include <gtest/gtest.h>
+#include <pbbam/vcf/VcfFile.h>
+#include <pbbam/vcf/VcfQuery.h>
+
+#include "PbbamTestData.h"
+
+using VcfFile = PacBio::VCF::VcfFile;
+using VcfQuery = PacBio::VCF::VcfQuery;
+using VcfVariant = PacBio::VCF::VcfVariant;
+
+namespace VcfQueryTests {
+
+static const std::vector<std::string> ExpectedIds{
+    "pbsv.INS.1",  "pbsv.DEL.2",  "pbsv.INS.3",  "pbsv.INS.4",  "pbsv.DEL.5",  "pbsv.DEL.6",
+    "pbsv.DEL.7",  "pbsv.INS.8",  "pbsv.INS.9",  "pbsv.INS.10", "pbsv.INS.11", "pbsv.INS.12",
+    "pbsv.INS.13", "pbsv.INS.14", "pbsv.INS.15", "pbsv.INS.16", "pbsv.INS.17", "pbsv.INS.18",
+    "pbsv.INS.19", "pbsv.DEL.20", "pbsv.INS.21"};
+
+static const std::string VcfFn{PacBio::BAM::PbbamTestsConfig::Data_Dir +
+                               "/vcf/structural_variants.vcf"};
+
+}  // namespace VcfQueryTests
+
+TEST(VCF_Query, can_use_range_over_input_filename)
+{
+    // Iterates a query opened by filename and checks the variant ids in order.
+    const auto& expectedIds = VcfQueryTests::ExpectedIds;
+
+    size_t i = 0;
+    VcfQuery query{VcfQueryTests::VcfFn};
+    for (const auto& var : query) {
+        // fail cleanly on surplus records instead of letting at() throw
+        ASSERT_LT(i, expectedIds.size());
+        EXPECT_EQ(expectedIds.at(i), var.Id());
+        ++i;
+    }
+
+    // without this, a query that yields no (or too few) variants would pass
+    EXPECT_EQ(expectedIds.size(), i);
+}
+
+TEST(VCF_Query, can_use_range_over_input_file_object)
+{
+    // Iterates a query opened from a VcfFile object and checks the variant ids in order.
+    const auto& expectedIds = VcfQueryTests::ExpectedIds;
+    const VcfFile file{VcfQueryTests::VcfFn};
+
+    size_t i = 0;
+    VcfQuery query{file};
+    for (const auto& var : query) {
+        // fail cleanly on surplus records instead of letting at() throw
+        ASSERT_LT(i, expectedIds.size());
+        EXPECT_EQ(expectedIds.at(i), var.Id());
+        ++i;
+    }
+
+    // without this, a query that yields no (or too few) variants would pass
+    EXPECT_EQ(expectedIds.size(), i);
+}
diff --git a/tests/src/test_VcfReader.cpp b/tests/src/test_VcfReader.cpp

new file mode 100644 (file)

index 0000000..5050eee
--- /dev/null
+++ b/tests/src/test_VcfReader.cpp
@@ -0,0 +1,51 @@
+// Author: Derek Barnett
+
+#include <string>
+#include <vector>
+
+#include <gtest/gtest.h>
+#include <pbbam/vcf/VcfFile.h>
+#include <pbbam/vcf/VcfReader.h>
+
+#include "PbbamTestData.h"
+
+using VcfFile = PacBio::VCF::VcfFile;
+using VcfReader = PacBio::VCF::VcfReader;
+using VcfVariant = PacBio::VCF::VcfVariant;
+
+namespace VcfReaderTests {
+
+static const std::vector<std::string> ExpectedIds{
+    "pbsv.INS.1",  "pbsv.DEL.2",  "pbsv.INS.3",  "pbsv.INS.4",  "pbsv.DEL.5",  "pbsv.DEL.6",
+    "pbsv.DEL.7",  "pbsv.INS.8",  "pbsv.INS.9",  "pbsv.INS.10", "pbsv.INS.11", "pbsv.INS.12",
+    "pbsv.INS.13", "pbsv.INS.14", "pbsv.INS.15", "pbsv.INS.16", "pbsv.INS.17", "pbsv.INS.18",
+    "pbsv.INS.19", "pbsv.DEL.20", "pbsv.INS.21"};
+
+static const std::string VcfFn{PacBio::BAM::PbbamTestsConfig::Data_Dir +
+                               "/vcf/structural_variants.vcf"};
+
+}  // namespace VcfReaderTests
+
+TEST(VCF_Reader, can_fetch_variants_from_vcf_filename)
+{
+    // Reads every variant through a reader opened by filename and checks the ids in order.
+    const auto& expectedIds = VcfReaderTests::ExpectedIds;
+
+    size_t i = 0;
+    VcfReader rdr{VcfReaderTests::VcfFn};
+    VcfVariant var;
+    while (rdr.GetNext(var)) {
+        // fail cleanly on surplus records instead of letting at() throw
+        ASSERT_LT(i, expectedIds.size());
+        EXPECT_EQ(expectedIds.at(i), var.Id());
+        ++i;
+    }
+
+    // without this, a reader that yields no (or too few) variants would pass
+    EXPECT_EQ(expectedIds.size(), i);
+}
+
+TEST(VCF_Reader, can_fetch_variants_from_vcf_file_object)
+{
+    // Reads every variant through a reader opened from a VcfFile object and checks
+    // the ids in order.
+    const auto& expectedIds = VcfReaderTests::ExpectedIds;
+    const VcfFile file{VcfReaderTests::VcfFn};
+
+    size_t i = 0;
+    VcfReader rdr{file};
+    VcfVariant var;
+    while (rdr.GetNext(var)) {
+        // fail cleanly on surplus records instead of letting at() throw
+        ASSERT_LT(i, expectedIds.size());
+        EXPECT_EQ(expectedIds.at(i), var.Id());
+        ++i;
+    }
+
+    // without this, a reader that yields no (or too few) variants would pass
+    EXPECT_EQ(expectedIds.size(), i);
+}
diff --git a/tests/src/test_VcfSort.cpp b/tests/src/test_VcfSort.cpp

new file mode 100644 (file)

index 0000000..4272991
--- /dev/null
+++ b/tests/src/test_VcfSort.cpp
@@ -0,0 +1,51 @@
+// Author: Derek Barnett
+
+#include <cstdio>
+
+#include <gtest/gtest.h>
+
+#include <pbbam/vcf/VcfQuery.h>
+#include <pbbam/vcf/VcfSort.h>
+
+#include "PbbamTestData.h"
+
+using VcfFile = PacBio::VCF::VcfFile;
+using VcfQuery = PacBio::VCF::VcfQuery;
+
+// clang-format off
+
+namespace VcfSortTests {
+
+// Unsorted input, read from the test data directory.
+static const std::string inputFn{PacBio::BAM::PbbamTestsConfig::Data_Dir + "/vcf/unsorted.vcf"};
+
+// Sorted output, written to the generated-data directory.
+static const std::string outputFn{PacBio::BAM::PbbamTestsConfig::GeneratedData_Dir + "/sorted.vcf"};
+
+} // namespace VcfSortTests
+
+TEST(VCF_Sort, sorts_input_file)
+{
+    const VcfFile file{VcfSortTests::inputFn};
+    PacBio::VCF::SortFile(file, VcfSortTests::outputFn);
+
+    const std::vector<std::string> expectedIds{
+        "variant0",
+        "variant5",
+        "variant1",
+        "variant3",
+        "variant4",
+        "variant2"
+    };
+
+    size_t i= 0;
+    VcfQuery query{VcfSortTests::outputFn};
+    for (const auto& var : query)
+    {
+        EXPECT_EQ(expectedIds.at(i), var.Id());
+        ++i;
+    }
+
+    // remove temp file
+    remove(VcfSortTests::outputFn.c_str());
+}
+
+// clang-format on
diff --git a/tests/src/test_VcfVariant.cpp b/tests/src/test_VcfVariant.cpp

new file mode 100644 (file)

index 0000000..879940d
--- /dev/null
+++ b/tests/src/test_VcfVariant.cpp
@@ -0,0 +1,234 @@
+// Author: Derek Barnett
+
+#include <gtest/gtest.h>
+#include <pbbam/vcf/VcfVariant.h>
+
+using InfoField = PacBio::VCF::InfoField;
+using VcfVariant = PacBio::VCF::VcfVariant;
+
+namespace VcfVariantTests {
+
+// Tab-separated variant record used by the tests: fixed columns, five INFO
+// fields (one flag, three single-value, one multi-value) and one sample with
+// GT:AD:DP:AC genotype data.
+static const std::string BasicVariantText =
+    "chrXVI\t660831\tpbsv.INS.21\tC\tCAAAGGAATGGTAAAGATGGGGGGTCAACGGACAAGGGAAAGGATCCATGGGGGCA\t."
+    "\tPASS"
+    "\tIMPRECISE;SVTYPE=INS;END=660831;SVLEN=55;MULTI=1,2,3\tGT:AD:DP:AC\t0/1:2:5:1,2";
+
+}  // namespace VcfVariantTests
+
+TEST(VCF_Variant, default_ctor_provides_proper_default_values)
+{
+    VcfVariant variant;
+
+    // text columns are empty, position is unmapped
+    EXPECT_TRUE(variant.Chrom().empty());
+    EXPECT_EQ(PacBio::BAM::UnmappedPosition, variant.Position());
+    EXPECT_TRUE(variant.Id().empty());
+    EXPECT_TRUE(variant.RefAllele().empty());
+    EXPECT_TRUE(variant.AltAllele().empty());
+
+    // quality is missing, filter is "PASS"
+    EXPECT_TRUE(variant.IsQualityMissing());
+    EXPECT_EQ("PASS", variant.Filter());
+
+    // not classified as any variant type
+    EXPECT_FALSE(variant.IsDeletion());
+    EXPECT_FALSE(variant.IsInsertion());
+    EXPECT_FALSE(variant.IsSnp());
+}
+
+TEST(VCF_Variant, can_create_snp)
+{
+    // REF "C" and ALT "G" have equal length: classified as SNP only
+    const VcfVariant snp{"var_snp", "3", 3000, "C", "G"};
+
+    EXPECT_EQ("var_snp", snp.Id());
+    EXPECT_EQ("3", snp.Chrom());
+    EXPECT_EQ(3000, snp.Position());
+    EXPECT_EQ("C", snp.RefAllele());
+    EXPECT_EQ("G", snp.AltAllele());
+    EXPECT_TRUE(snp.IsQualityMissing());
+    EXPECT_EQ("PASS", snp.Filter());
+
+    EXPECT_TRUE(snp.IsSnp());
+    EXPECT_FALSE(snp.IsInsertion());
+    EXPECT_FALSE(snp.IsDeletion());
+}
+
+TEST(VCF_Variant, can_create_insertion)
+{
+    // ALT "CTAG" is longer than REF "C": classified as insertion only
+    const VcfVariant insertion{"var_ins", "3", 3000, "C", "CTAG"};
+
+    EXPECT_EQ("var_ins", insertion.Id());
+    EXPECT_EQ("3", insertion.Chrom());
+    EXPECT_EQ(3000, insertion.Position());
+    EXPECT_EQ("C", insertion.RefAllele());
+    EXPECT_EQ("CTAG", insertion.AltAllele());
+    EXPECT_TRUE(insertion.IsQualityMissing());
+    EXPECT_EQ("PASS", insertion.Filter());
+
+    EXPECT_TRUE(insertion.IsInsertion());
+    EXPECT_FALSE(insertion.IsDeletion());
+    EXPECT_FALSE(insertion.IsSnp());
+}
+
+TEST(VCF_Variant, can_create_deletion)
+{
+    // ALT "T" is shorter than REF "TCG": classified as deletion only
+    const VcfVariant deletion{"var_del", "3", 3000, "TCG", "T"};
+
+    EXPECT_EQ("var_del", deletion.Id());
+    EXPECT_EQ("3", deletion.Chrom());
+    EXPECT_EQ(3000, deletion.Position());
+    EXPECT_EQ("TCG", deletion.RefAllele());
+    EXPECT_EQ("T", deletion.AltAllele());
+    EXPECT_TRUE(deletion.IsQualityMissing());
+    EXPECT_EQ("PASS", deletion.Filter());
+
+    EXPECT_TRUE(deletion.IsDeletion());
+    EXPECT_FALSE(deletion.IsInsertion());
+    EXPECT_FALSE(deletion.IsSnp());
+}
+
+TEST(VCF_Variant, can_determine_if_info_field_is_present)
+{
+    // "SVLEN" appears in the reference record's INFO column, "nope" does not.
+    const VcfVariant variant{VcfVariantTests::BasicVariantText};
+    EXPECT_FALSE(variant.HasInfoField("nope"));
+    EXPECT_TRUE(variant.HasInfoField("SVLEN"));
+}
+
+TEST(VCF_Variant, can_fetch_single_value_info_field)
+{
+    // "SVTYPE=INS" in the reference record is a single-value INFO field.
+    const VcfVariant v{VcfVariantTests::BasicVariantText};
+    const auto& value = v.InfoValue("SVTYPE");
+
+    // fatal check: get() must not be reached on an empty optional
+    ASSERT_TRUE(value.is_initialized());
+    EXPECT_EQ("INS", value.get());
+}
+
+TEST(VCF_Variant, can_add_single_value_info_field)
+{
+    // Adds a new single-value INFO field and reads it back.
+    VcfVariant v{VcfVariantTests::BasicVariantText};
+
+    InfoField i;
+    i.id = "NEW";
+    i.value = "42";
+    v.AddInfoField(i);
+
+    // fatal checks: do not look up or dereference a value that was never stored
+    ASSERT_TRUE(v.HasInfoField("NEW"));
+    const auto value = v.InfoValue("NEW");
+    ASSERT_TRUE(value.is_initialized());
+    EXPECT_EQ("42", value.get());
+}
+
+TEST(VCF_Variant, can_fetch_multi_value_info_field)
+{
+    // "MULTI=1,2,3" in the reference record is a multi-value INFO field.
+    const VcfVariant v{VcfVariantTests::BasicVariantText};
+    const auto& values = v.InfoValues("MULTI");
+
+    // fatal checks: the optional is dereferenced and indexed below
+    ASSERT_TRUE(values.is_initialized());
+    ASSERT_EQ(3, values->size());
+    EXPECT_EQ("1", values->at(0));
+    EXPECT_EQ("2", values->at(1));
+    EXPECT_EQ("3", values->at(2));
+}
+
+TEST(VCF_Variant, can_edit_single_value_info_field)
+{
+    // Overwrites the single-value "SVTYPE" field and reads the new value back.
+    VcfVariant v{VcfVariantTests::BasicVariantText};
+
+    // initial value; fatal check because get() follows
+    auto value = v.InfoValue("SVTYPE");
+    ASSERT_TRUE(value.is_initialized());
+    EXPECT_EQ("INS", value.get());
+
+    v.InfoValue("SVTYPE", std::string{"FOO"});
+
+    // edited value
+    value = v.InfoValue("SVTYPE");
+    ASSERT_TRUE(value.is_initialized());
+    EXPECT_EQ("FOO", value.get());
+}
+
+TEST(VCF_Variant, can_edit_multi_value_info_field)
+{
+    VcfVariant v{VcfVariantTests::BasicVariantText};
+
+    auto values = v.InfoValues("MULTI");
+    EXPECT_TRUE(values.is_initialized());
+    EXPECT_EQ(3, values->size());
+    EXPECT_EQ("1", values->at(0));
+    EXPECT_EQ("2", values->at(1));
+    EXPECT_EQ("3", values->at(2));
+
+    std::vector<std::string> newData{"42", "42", "42"};
+    v.InfoValues("MULTI", newData);
+
+    values = v.InfoValues("MULTI");
+    EXPECT_TRUE(values.is_initialized());
+    EXPECT_EQ(3, values->size());
+    EXPECT_EQ("42", values->at(0));
+    EXPECT_EQ("42", values->at(1));
+    EXPECT_EQ("42", values->at(2));
+}
+
+TEST(VCF_Variant, can_add_multi_value_info_field)
+{
+    // Adds a new multi-value INFO field and reads it back.
+    VcfVariant v{VcfVariantTests::BasicVariantText};
+    InfoField i;
+    i.id = "NEW";
+    i.values = std::vector<std::string>{"42", "42", "42"};
+    v.AddInfoField(i);
+
+    // fatal checks: the optional is dereferenced and indexed below
+    ASSERT_TRUE(v.HasInfoField("NEW"));
+    const auto& values = v.InfoValues("NEW");
+    ASSERT_TRUE(values.is_initialized());
+    ASSERT_EQ(3, values->size());
+    EXPECT_EQ("42", values->at(0));
+    EXPECT_EQ("42", values->at(1));
+    EXPECT_EQ("42", values->at(2));
+}
+
+TEST(VCF_Variant, can_remove_info_field)
+{
+    VcfVariant variant{VcfVariantTests::BasicVariantText};
+
+    // before: SVLEN is present, SVTYPE reads "INS"
+    EXPECT_TRUE(variant.HasInfoField("SVLEN"));
+    EXPECT_EQ("INS", variant.InfoValue("SVTYPE").get());
+
+    variant.RemoveInfoField("SVLEN");
+
+    // after: SVLEN is gone, SVTYPE still reads "INS"
+    EXPECT_FALSE(variant.HasInfoField("SVLEN"));
+    EXPECT_EQ("INS", variant.InfoValue("SVTYPE").get());
+}
+
+TEST(VCF_Variant, can_fetch_all_genotype_ids)
+{
+    // The FORMAT column "GT:AD:DP:AC" yields four ids, in that order.
+    const VcfVariant variant{VcfVariantTests::BasicVariantText};
+    const auto& ids = variant.GenotypeIds();
+    ASSERT_EQ(4, ids.size());
+
+    const char* const expectedIds[] = {"GT", "AD", "DP", "AC"};
+    for (size_t i = 0; i < 4; ++i) {
+        EXPECT_EQ(expectedIds[i], ids.at(i));
+    }
+}
+
+TEST(VCF_Variant, can_fetch_all_genotype_fields)
+{
+    // The reference record has one sample column, so one genotype entry is expected.
+    const VcfVariant variant{VcfVariantTests::BasicVariantText};
+    const auto& perSample = variant.Genotypes();
+    ASSERT_EQ(1, perSample.size());
+}
+
+TEST(VCF_Variant, can_fetch_single_value_genotype_field)
+{
+    // Sample 0 carries "AD" as a single value ("2") in the reference record.
+    const VcfVariant v{VcfVariantTests::BasicVariantText};
+    const auto& value = v.GenotypeValue(0, "AD");
+
+    // fatal check: get() must not be reached on an empty optional
+    ASSERT_TRUE(value.is_initialized());
+    EXPECT_EQ("2", value.get());
+}
+
+TEST(VCF_Variant, can_fetch_multi_value_genotype_field)
+{
+    // Sample 0 carries "AC" as two comma-separated values in the reference record.
+    const VcfVariant v{VcfVariantTests::BasicVariantText};
+    const auto& values = v.GenotypeValues(0, "AC");
+
+    // fatal check: the optional is dereferenced on the next line
+    ASSERT_TRUE(values.is_initialized());
+    ASSERT_EQ(2, values->size());
+}
+
+TEST(VCF_Variant, can_determine_if_sample_is_heterozygous)
+{
+    // Sample 0 has GT "0/1" in the reference record.
+    const VcfVariant variant{VcfVariantTests::BasicVariantText};
+    const bool heterozygous = variant.IsSampleHeterozygous(0);
+    EXPECT_TRUE(heterozygous);
+}
+
+TEST(VCF_Variant, can_determine_if_sample_is_phased)
+{
+    // Sample 0 has GT "0/1" in the reference record and must not report as phased.
+    const VcfVariant variant{VcfVariantTests::BasicVariantText};
+    const bool phased = variant.IsSamplePhased(0);
+    EXPECT_FALSE(phased);
+}
diff --git a/tests/src/test_VcfWriter.cpp b/tests/src/test_VcfWriter.cpp

new file mode 100644 (file)

index 0000000..9dbaa1e
--- /dev/null
+++ b/tests/src/test_VcfWriter.cpp
@@ -0,0 +1,55 @@
+// Author: Derek Barnett
+
+#include <cstdio>
+
+#include <gtest/gtest.h>
+#include <pbbam/vcf/VcfFile.h>
+#include <pbbam/vcf/VcfFormat.h>
+#include <pbbam/vcf/VcfQuery.h>
+#include <pbbam/vcf/VcfWriter.h>
+
+#include "PbbamTestData.h"
+
+using VcfFile = PacBio::VCF::VcfFile;
+using VcfFormat = PacBio::VCF::VcfFormat;
+using VcfQuery = PacBio::VCF::VcfQuery;
+using VcfWriter = PacBio::VCF::VcfWriter;
+
+namespace VcfWriterTests {
+
+// Input VCF that the writer test copies, located under the test data directory.
+static const std::string VcfFn =
+    PacBio::BAM::PbbamTestsConfig::Data_Dir + "/vcf/structural_variants.vcf";
+
+}  // namespace VcfWriterTests
+
+TEST(VCF_Writer, correctly_copies_vcf_file)
+{
+    const std::string intitialFn{VcfWriterTests::VcfFn};
+    const std::string newFn{PacBio::BAM::PbbamTestsConfig::GeneratedData_Dir + "/temp.vcf"};
+
+    const VcfFile initialFile{VcfWriterTests::VcfFn};
+
+    const std::string expectedHeaderText = VcfFormat::FormattedHeader(initialFile.Header());
+    std::vector<std::string> expectedVariantsText;
+
+    {  // store contents of intitial file & write to a new file
+        VcfWriter writer{newFn, initialFile.Header()};
+        VcfQuery query{initialFile};
+        for (const auto& var : query) {
+            expectedVariantsText.push_back(VcfFormat::FormattedVariant(var));
+            writer.Write(var);
+        }
+    }
+    {  // read new file & compare against original
+
+        const VcfFile newFile{newFn};
+        EXPECT_EQ(expectedHeaderText, VcfFormat::FormattedHeader(newFile.Header()));
+
+        size_t i = 0;
+        for (const auto& var : VcfQuery{newFile}) {
+            EXPECT_EQ(expectedVariantsText.at(i), VcfFormat::FormattedVariant(var));
+            ++i;
+        }
+    }
+    ::remove(newFn.c_str());
+}
diff --git a/tests/src/test_Version.cpp b/tests/src/test_Version.cpp

new file mode 100644 (file)

index 0000000..bc16857
--- /dev/null
+++ b/tests/src/test_Version.cpp
@@ -0,0 +1,294 @@
+// Author: Derek Barnett
+
+#include <sstream>
+#include <string>
+
+#include <gtest/gtest.h>
+
+#include "../src/Version.h"
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+namespace VersionTests {
+// Builds a Version from three ints and returns it by value, giving tests a temporary to use.
+static inline Version MakeVersion(int x, int y, int z) { return Version(x, y, z); }
+
+}  // namespace VersionTests
+
+TEST(VersionTest, DefaultOk)
+{
+    Version fresh;  // default-constructed: every component is expected to be zero
+    EXPECT_EQ(0, fresh.Major());
+    EXPECT_EQ(0, fresh.Minor());
+    EXPECT_EQ(0, fresh.Revision());
+}
+
+TEST(VersionTest, CopyAndMoveOk)
+{
+    {  // copy ctor
+        Version v1(3, 1, 1);
+        EXPECT_EQ(3, v1.Major());
+        EXPECT_EQ(1, v1.Minor());
+        EXPECT_EQ(1, v1.Revision());
+
+        Version v2(v1);
+        EXPECT_EQ(3, v2.Major());
+        EXPECT_EQ(1, v2.Minor());
+        EXPECT_EQ(1, v2.Revision());
+    }
+    {  // copy assign
+        Version v1(3, 1, 1);
+        EXPECT_EQ(3, v1.Major());
+        EXPECT_EQ(1, v1.Minor());
+        EXPECT_EQ(1, v1.Revision());
+
+        Version v2;
+        v2 = v1;
+        EXPECT_EQ(3, v2.Major());
+        EXPECT_EQ(1, v2.Minor());
+        EXPECT_EQ(1, v2.Revision());
+    }
+    {  // move ctor
+        Version v(VersionTests::MakeVersion(3, 1, 1));
+        EXPECT_EQ(3, v.Major());
+        EXPECT_EQ(1, v.Minor());
+        EXPECT_EQ(1, v.Revision());
+    }
+    {  // move assign
+        Version v1(3, 1, 1);
+        EXPECT_EQ(3, v1.Major());
+        EXPECT_EQ(1, v1.Minor());
+        EXPECT_EQ(1, v1.Revision());
+
+        Version v2;
+        v2 = std::move(v1);
+        EXPECT_EQ(3, v2.Major());
+        EXPECT_EQ(1, v2.Minor());
+        EXPECT_EQ(1, v2.Revision());
+    }
+}
+
+TEST(VersionTest, FromIntsOk)
+{
+    {  // normal
+        Version fromInts(3, 1, 1);
+        EXPECT_EQ(3, fromInts.Major());
+        EXPECT_EQ(1, fromInts.Minor());
+        EXPECT_EQ(1, fromInts.Revision());
+    }
+
+    // a negative component must make construction throw
+    EXPECT_THROW(Version(-3, 1, 1), std::runtime_error);
+}
+
+TEST(VersionTest, FromStringOk)
+{
+    {  // normal
+        Version fromText("3.1.1");
+        EXPECT_EQ(3, fromText.Major());
+        EXPECT_EQ(1, fromText.Minor());
+        EXPECT_EQ(1, fromText.Revision());
+    }
+
+    // a negative component must make construction throw
+    EXPECT_THROW(Version("-3.1.1"), std::runtime_error);
+
+    // so must components that are not numbers
+    EXPECT_THROW(Version("foo.bar.baz"), std::runtime_error);
+
+    // and so must the empty string
+    EXPECT_THROW(Version(""), std::runtime_error);
+}
+
+TEST(VersionTest, SettersOk)
+{
+    Version v(3, 1, 1);
+
+    v.Major(4);
+
+    EXPECT_EQ(4, v.Major());
+    EXPECT_EQ(1, v.Minor());
+    EXPECT_EQ(1, v.Revision());
+
+    v.Minor(7);
+
+    EXPECT_EQ(4, v.Major());
+    EXPECT_EQ(7, v.Minor());
+    EXPECT_EQ(1, v.Revision());
+
+    v.Revision(23);
+
+    EXPECT_EQ(4, v.Major());
+    EXPECT_EQ(7, v.Minor());
+    EXPECT_EQ(23, v.Revision());
+
+    {  // invalid
+        Version v1(3, 1, 1);
+        Version v2(3, 1, 1);
+        Version v3(3, 1, 1);
+        EXPECT_THROW(v1.Major(-1), std::runtime_error);
+        EXPECT_THROW(v2.Minor(-1), std::runtime_error);
+        EXPECT_THROW(v3.Revision(-1), std::runtime_error);
+    }
+}
+
+TEST(VersionTest, ComparisonsOk)
+{
+    const Version v0_0_0 = Version(0, 0, 0);  // fixture names spell out the three ctor arguments
+    const Version v0_0_4 = Version(0, 0, 4);
+    const Version v0_1_0 = Version(0, 1, 0);
+    const Version v0_1_4 = Version(0, 1, 4);
+    const Version v3_0_0 = Version(3, 0, 0);
+    const Version v3_0_4 = Version(3, 0, 4);
+    const Version v3_1_0 = Version(3, 1, 0);
+    const Version v3_1_4 = Version(3, 1, 4);
+    const Version v3_1_5 = Version(3, 1, 5);
+
+    // operator==: each fixture equals itself ...
+    EXPECT_TRUE(v0_0_0 == v0_0_0);
+    EXPECT_TRUE(v3_0_0 == v3_0_0);
+    EXPECT_TRUE(v0_1_0 == v0_1_0);
+    EXPECT_TRUE(v0_0_4 == v0_0_4);
+    EXPECT_TRUE(v3_1_0 == v3_1_0);
+    EXPECT_TRUE(v3_1_4 == v3_1_4);
+    // ... and 3.1.4 differs from fixtures that disagree in at least one component
+    EXPECT_FALSE(v3_1_4 == v0_0_0);
+    EXPECT_FALSE(v3_1_4 == v3_0_0);
+    EXPECT_FALSE(v3_1_4 == v0_1_0);
+    EXPECT_FALSE(v3_1_4 == v0_0_4);
+    EXPECT_FALSE(v3_1_4 == v3_1_0);
+    EXPECT_FALSE(v3_1_4 == v3_1_5);
+
+    // operator!=: the same pairs as above with the expected results inverted
+    EXPECT_FALSE(v0_0_0 != v0_0_0);
+    EXPECT_FALSE(v3_0_0 != v3_0_0);
+    EXPECT_FALSE(v0_1_0 != v0_1_0);
+    EXPECT_FALSE(v0_0_4 != v0_0_4);
+    EXPECT_FALSE(v3_1_0 != v3_1_0);
+    EXPECT_FALSE(v3_1_4 != v3_1_4);
+
+    EXPECT_TRUE(v3_1_4 != v0_0_0);
+    EXPECT_TRUE(v3_1_4 != v3_0_0);
+    EXPECT_TRUE(v3_1_4 != v0_1_0);
+    EXPECT_TRUE(v3_1_4 != v0_0_4);
+    EXPECT_TRUE(v3_1_4 != v3_1_0);
+    EXPECT_TRUE(v3_1_4 != v3_1_5);
+
+    // operator<: 0.0.0 against every fixture, then every fixture against 3.1.4
+    EXPECT_FALSE(v0_0_0 < v0_0_0);
+    EXPECT_TRUE(v0_0_0 < v0_0_4);
+    EXPECT_TRUE(v0_0_0 < v0_1_0);
+    EXPECT_TRUE(v0_0_0 < v3_0_0);
+    EXPECT_TRUE(v0_0_0 < v0_1_4);
+    EXPECT_TRUE(v0_0_0 < v3_0_4);
+    EXPECT_TRUE(v0_0_0 < v3_1_0);
+    EXPECT_TRUE(v0_0_0 < v3_1_4);
+
+    EXPECT_TRUE(v0_0_4 < v3_1_4);
+    EXPECT_TRUE(v0_1_0 < v3_1_4);
+    EXPECT_TRUE(v0_1_4 < v3_1_4);
+    EXPECT_TRUE(v3_0_0 < v3_1_4);
+    EXPECT_TRUE(v3_0_4 < v3_1_4);
+    EXPECT_TRUE(v3_1_0 < v3_1_4);
+    EXPECT_FALSE(v3_1_4 < v3_1_4);
+    EXPECT_FALSE(v3_1_5 < v3_1_4);
+
+    EXPECT_FALSE(v3_1_4 < v0_0_0);
+
+    // operator<=: same pairs as operator<, except that equal operands now yield true
+    EXPECT_TRUE(v0_0_0 <= v0_0_0);
+    EXPECT_TRUE(v0_0_0 <= v0_0_4);
+    EXPECT_TRUE(v0_0_0 <= v0_1_0);
+    EXPECT_TRUE(v0_0_0 <= v3_0_0);
+    EXPECT_TRUE(v0_0_0 <= v0_1_4);
+    EXPECT_TRUE(v0_0_0 <= v3_0_4);
+    EXPECT_TRUE(v0_0_0 <= v3_1_0);
+    EXPECT_TRUE(v0_0_0 <= v3_1_4);
+
+    EXPECT_TRUE(v0_0_4 <= v3_1_4);
+    EXPECT_TRUE(v0_1_0 <= v3_1_4);
+    EXPECT_TRUE(v0_1_4 <= v3_1_4);
+    EXPECT_TRUE(v3_0_0 <= v3_1_4);
+    EXPECT_TRUE(v3_0_4 <= v3_1_4);
+    EXPECT_TRUE(v3_1_0 <= v3_1_4);
+    EXPECT_TRUE(v3_1_4 <= v3_1_4);
+    EXPECT_FALSE(v3_1_5 <= v3_1_4);
+
+    EXPECT_FALSE(v3_1_4 <= v0_0_0);
+
+    // operator>: same pairs again; each expectation is the negation of the operator<= one
+    EXPECT_FALSE(v0_0_0 > v0_0_0);
+    EXPECT_FALSE(v0_0_0 > v0_0_4);
+    EXPECT_FALSE(v0_0_0 > v0_1_0);
+    EXPECT_FALSE(v0_0_0 > v3_0_0);
+    EXPECT_FALSE(v0_0_0 > v0_1_4);
+    EXPECT_FALSE(v0_0_0 > v3_0_4);
+    EXPECT_FALSE(v0_0_0 > v3_1_0);
+    EXPECT_FALSE(v0_0_0 > v3_1_4);
+
+    EXPECT_FALSE(v0_0_4 > v3_1_4);
+    EXPECT_FALSE(v0_1_0 > v3_1_4);
+    EXPECT_FALSE(v0_1_4 > v3_1_4);
+    EXPECT_FALSE(v3_0_0 > v3_1_4);
+    EXPECT_FALSE(v3_0_4 > v3_1_4);
+    EXPECT_FALSE(v3_1_0 > v3_1_4);
+    EXPECT_FALSE(v3_1_4 > v3_1_4);
+    EXPECT_TRUE(v3_1_5 > v3_1_4);
+
+    EXPECT_TRUE(v3_1_4 > v0_0_0);
+
+    // operator>=: same pairs again; each expectation is the negation of the operator< one
+    EXPECT_TRUE(v0_0_0 >= v0_0_0);
+    EXPECT_FALSE(v0_0_0 >= v0_0_4);
+    EXPECT_FALSE(v0_0_0 >= v0_1_0);
+    EXPECT_FALSE(v0_0_0 >= v3_0_0);
+    EXPECT_FALSE(v0_0_0 >= v0_1_4);
+    EXPECT_FALSE(v0_0_0 >= v3_0_4);
+    EXPECT_FALSE(v0_0_0 >= v3_1_0);
+    EXPECT_FALSE(v0_0_0 >= v3_1_4);
+
+    EXPECT_FALSE(v0_0_4 >= v3_1_4);
+    EXPECT_FALSE(v0_1_0 >= v3_1_4);
+    EXPECT_FALSE(v0_1_4 >= v3_1_4);
+    EXPECT_FALSE(v3_0_0 >= v3_1_4);
+    EXPECT_FALSE(v3_0_4 >= v3_1_4);
+    EXPECT_FALSE(v3_1_0 >= v3_1_4);
+    EXPECT_TRUE(v3_1_4 >= v3_1_4);
+    EXPECT_TRUE(v3_1_5 >= v3_1_4);
+
+    EXPECT_TRUE(v3_1_4 >= v0_0_0);
+}
+
+TEST(VersionTest, ToStringOk)
+{
+    {
+        Version v(0, 0, 0);
+        EXPECT_EQ(std::string("0.0.0"), v.ToString());
+    }
+    {
+        Version v(3, 1, 4);
+        EXPECT_EQ(std::string("3.1.4"), v.ToString());
+    }
+    {
+        Version v;
+        v.Major(4);
+        EXPECT_EQ(std::string("4.0.0"), v.ToString());
+    }
+    {
+        const std::string s = "1.2.3";
+        Version v(s);
+        EXPECT_EQ(s, v.ToString());
+    }
+}
+
+TEST(VersionTest, OutputStreamOk)
+{
+    Version v(3, 1, 4);
+    Version v2(4, 10, 0);
+
+    std::ostringstream s;
+    s << v << ", " << v2 << ", " << v << '\n';  // plain newline: a string stream needs no flush
+
+    EXPECT_EQ(std::string("3.1.4, 4.10.0, 3.1.4\n"), s.str());
+}
diff --git a/tests/src/test_WhitelistedZmwReadStitcher.cpp b/tests/src/test_WhitelistedZmwReadStitcher.cpp

new file mode 100644 (file)

index 0000000..64359d9
--- /dev/null
+++ b/tests/src/test_WhitelistedZmwReadStitcher.cpp
@@ -0,0 +1,220 @@
+// Author: Derek Barnett
+
+#include <cstdint>
+#include <string>
+
+#include <gtest/gtest.h>
+
+#include "PbbamTestData.h"
+
+#include <pbbam/BamFile.h>
+#include <pbbam/EntireFileQuery.h>
+#include <pbbam/PbiRawData.h>
+#include <pbbam/virtual/WhitelistedZmwReadStitcher.h>
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+namespace WhitelistedZmwReadStitcherTests {
+// Expects that b1 and b2 each report every Has*() field below, then that their values match.
+static void Compare(const BamRecord& b1, const BamRecord& b2)
+{
+    EXPECT_TRUE(b1.HasDeletionQV());
+    EXPECT_TRUE(b1.HasDeletionTag());
+    EXPECT_TRUE(b1.HasInsertionQV());
+    EXPECT_TRUE(b1.HasMergeQV());
+    EXPECT_TRUE(b1.HasSubstitutionQV());
+    EXPECT_TRUE(b1.HasSubstitutionTag());
+    EXPECT_TRUE(b1.HasLabelQV());
+    EXPECT_TRUE(b1.HasAltLabelQV());
+    EXPECT_TRUE(b1.HasAltLabelTag());
+    EXPECT_TRUE(b1.HasPkmean());
+    EXPECT_TRUE(b1.HasPkmid());
+    EXPECT_TRUE(b1.HasPulseCall());
+    EXPECT_TRUE(b1.HasIPD());
+    EXPECT_TRUE(b1.HasPulseWidth());
+    EXPECT_TRUE(b1.HasPrePulseFrames());
+    EXPECT_TRUE(b1.HasPulseCallWidth());
+    EXPECT_TRUE(b1.HasPulseMergeQV());
+    // the same presence checks for the second record
+    EXPECT_TRUE(b2.HasDeletionQV());
+    EXPECT_TRUE(b2.HasDeletionTag());
+    EXPECT_TRUE(b2.HasInsertionQV());
+    EXPECT_TRUE(b2.HasMergeQV());
+    EXPECT_TRUE(b2.HasSubstitutionQV());
+    EXPECT_TRUE(b2.HasSubstitutionTag());
+    EXPECT_TRUE(b2.HasLabelQV());
+    EXPECT_TRUE(b2.HasAltLabelQV());
+    EXPECT_TRUE(b2.HasAltLabelTag());
+    EXPECT_TRUE(b2.HasPkmean());
+    EXPECT_TRUE(b2.HasPkmid());
+    EXPECT_TRUE(b2.HasPulseCall());
+    EXPECT_TRUE(b2.HasIPD());
+    EXPECT_TRUE(b2.HasPulseWidth());
+    EXPECT_TRUE(b2.HasPrePulseFrames());
+    EXPECT_TRUE(b2.HasPulseCallWidth());
+    EXPECT_TRUE(b2.HasPulseMergeQV());
+    // value comparison; EXPECT_* is non-fatal, so every mismatching field gets reported
+    EXPECT_EQ(b1.FullName(), b2.FullName());
+    EXPECT_EQ(b1.HoleNumber(), b2.HoleNumber());
+    EXPECT_EQ(b1.NumPasses(), b2.NumPasses());
+    EXPECT_EQ(b1.Sequence(), b2.Sequence());
+    EXPECT_EQ(b1.Qualities(), b2.Qualities());
+    EXPECT_EQ(b1.DeletionQV(), b2.DeletionQV());
+    EXPECT_EQ(b1.DeletionTag(), b2.DeletionTag());
+    EXPECT_EQ(b1.InsertionQV(), b2.InsertionQV());
+    EXPECT_EQ(b1.MergeQV(), b2.MergeQV());
+    EXPECT_EQ(b1.SubstitutionQV(), b2.SubstitutionQV());
+    EXPECT_EQ(b1.SubstitutionTag(), b2.SubstitutionTag());
+    EXPECT_EQ(b1.LabelQV(), b2.LabelQV());
+    EXPECT_EQ(b1.AltLabelQV(), b2.AltLabelQV());
+    EXPECT_EQ(b1.AltLabelTag(), b2.AltLabelTag());
+    EXPECT_EQ(b1.Pkmean(), b2.Pkmean());
+    EXPECT_EQ(b1.Pkmid(), b2.Pkmid());
+    EXPECT_EQ(b1.PulseCall(), b2.PulseCall());
+    EXPECT_EQ(b1.IPD(), b2.IPD());
+    EXPECT_EQ(b1.PulseWidth(), b2.PulseWidth());
+    EXPECT_EQ(b1.PrePulseFrames(), b2.PrePulseFrames());
+    EXPECT_EQ(b1.PulseCallWidth(), b2.PulseCallWidth());
+    EXPECT_EQ(b1.ReadGroup(), b2.ReadGroup());
+    EXPECT_EQ(b1.PulseMergeQV(), b2.PulseMergeQV());
+}
+
+}  // namespace WhitelistedZmwReadStitcherTests
+
+TEST(WhitelistedZmwReadStitching, EmptyList)
+{
+    const std::string subreadsFn = PbbamTestsConfig::Data_Dir + "/polymerase/internal.subreads.bam";
+    const std::string scrapsFn = PbbamTestsConfig::Data_Dir + "/polymerase/internal.scraps.bam";
+    const std::vector<int32_t> noZmws = {};  // nothing whitelisted
+    WhitelistedZmwReadStitcher stitcher(noZmws, subreadsFn, scrapsFn);
+    EXPECT_FALSE(stitcher.HasNext());  // no ZMW requested, so no record is offered
+    EXPECT_TRUE(stitcher.NextRaw().empty());
+}
+
+TEST(WhitelistedZmwReadStitching, SingleValue)
+{
+    const std::vector<int32_t> whitelist = {200000};
+    WhitelistedZmwReadStitcher stitcher(
+        whitelist, PbbamTestsConfig::Data_Dir + "/polymerase/internal.subreads.bam",
+        PbbamTestsConfig::Data_Dir + "/polymerase/internal.scraps.bam");
+
+    // create virtual record
+    EXPECT_TRUE(stitcher.HasNext());
+    auto virtualRecord = stitcher.Next();
+    EXPECT_FALSE(stitcher.HasNext());
+
+    // fetch original polymerase read (2nd record)
+    BamFile polyBam(PbbamTestsConfig::Data_Dir + "/polymerase/internal.polymerase.bam");
+    EntireFileQuery polyQuery(polyBam);
+    auto begin = polyQuery.begin();
+    auto end = polyQuery.end();
+    EXPECT_TRUE(begin != end);
+    ++begin;
+    EXPECT_TRUE(begin != end);
+    auto polyRecord = *begin++;
+
+    EXPECT_EQ(200000, virtualRecord.HoleNumber());
+
+    WhitelistedZmwReadStitcherTests::Compare(polyRecord, virtualRecord);
+}
+
+TEST(WhitelistedZmwReadStitching, UnknownZmw)
+{
+    const std::string subreadsFn = PbbamTestsConfig::Data_Dir + "/polymerase/internal.subreads.bam";
+    const std::string scrapsFn = PbbamTestsConfig::Data_Dir + "/polymerase/internal.scraps.bam";
+    const std::vector<int32_t> absentZmws = {42};  // ZMW not in our files
+    WhitelistedZmwReadStitcher stitcher(absentZmws, subreadsFn, scrapsFn);
+
+    EXPECT_FALSE(stitcher.HasNext());
+    EXPECT_TRUE(stitcher.NextRaw().empty());
+}
+
+TEST(WhitelistedZmwReadStitching, MultiValue)
+{
+    const std::vector<int32_t> whitelist = {100000, 300000};
+    WhitelistedZmwReadStitcher stitcher(
+        whitelist, PbbamTestsConfig::Data_Dir + "/polymerase/internal.subreads.bam",
+        PbbamTestsConfig::Data_Dir + "/polymerase/internal.scraps.bam");
+
+    // create virtual records
+    EXPECT_TRUE(stitcher.HasNext());
+    auto virtualRecord1 = stitcher.Next();
+    EXPECT_TRUE(stitcher.HasNext());
+    auto virtualRecord2 = stitcher.Next();
+    EXPECT_FALSE(stitcher.HasNext());
+
+    // fetch original polymerase reads (2nd record)
+    BamFile polyBam(PbbamTestsConfig::Data_Dir + "/polymerase/internal.polymerase.bam");
+    EntireFileQuery polyQuery(polyBam);
+    auto begin = polyQuery.begin();
+    auto end = polyQuery.end();
+
+    EXPECT_TRUE(begin != end);
+    auto polyRecord1 = *begin++;
+    EXPECT_TRUE(begin != end);
+    ++begin;
+    EXPECT_TRUE(begin != end);
+    auto polyRecord2 = *begin++;
+    EXPECT_TRUE(begin == end);
+
+    EXPECT_EQ(100000, virtualRecord1.HoleNumber());
+    EXPECT_EQ(300000, virtualRecord2.HoleNumber());
+
+    WhitelistedZmwReadStitcherTests::Compare(polyRecord1, virtualRecord1);
+    WhitelistedZmwReadStitcherTests::Compare(polyRecord2, virtualRecord2);
+}
+
+TEST(WhitelistedZmwReadStitching, MultiValue_MixedKnownAndUnknown)
+{
+    const std::vector<int32_t> whitelist{42, 200000, 24};
+    WhitelistedZmwReadStitcher stitcher(
+        whitelist, PbbamTestsConfig::Data_Dir + "/polymerase/internal.subreads.bam",
+        PbbamTestsConfig::Data_Dir + "/polymerase/internal.scraps.bam");
+
+    // everything below should behave exactly as 'SingleValueOk' test,
+    // as the unknown ZMWs will have been removed during construction
+
+    // create virtual record
+    EXPECT_TRUE(stitcher.HasNext());
+    auto virtualRecord = stitcher.Next();
+    EXPECT_FALSE(stitcher.HasNext());
+
+    // fetch original polymerase read (2nd record)
+    BamFile polyBam(PbbamTestsConfig::Data_Dir + "/polymerase/internal.polymerase.bam");
+    EntireFileQuery polyQuery(polyBam);
+    auto begin = polyQuery.begin();
+    auto end = polyQuery.end();
+    EXPECT_TRUE(begin != end);
+    ++begin;
+    EXPECT_TRUE(begin != end);
+    auto polyRecord = *begin++;
+
+    EXPECT_EQ(200000, virtualRecord.HoleNumber());
+
+    WhitelistedZmwReadStitcherTests::Compare(polyRecord, virtualRecord);
+}
+
+TEST(WhitelistedZmwReadStitching, EmptyScrapsFileOk)
+{
+    const std::vector<int32_t> whitelist = {10944689, 10944690};
+    const std::string primaryBamFn =
+        PbbamTestsConfig::Data_Dir + "/polymerase/scrapless.subreads.bam";
+    const std::string scrapsBamFn = PbbamTestsConfig::Data_Dir + "/polymerase/scrapless.scraps.bam";
+
+    int count = 0;
+    WhitelistedZmwReadStitcher stitcher(whitelist, primaryBamFn, scrapsBamFn);
+    while (stitcher.HasNext()) {
+        auto record = stitcher.Next();
+        //        ()record;
+        ++count;
+    }
+    EXPECT_EQ(2, count);
+
+    const BamFile primaryBam(primaryBamFn);
+    const BamFile scrapsBam(scrapsBamFn);
+    const PbiRawData primaryIdx(primaryBam.PacBioIndexFilename());
+    const PbiRawData scrapsIdx(scrapsBam.PacBioIndexFilename());
+    EXPECT_EQ(3, primaryIdx.NumReads());
+    EXPECT_EQ(0, scrapsIdx.NumReads());
+}
diff --git a/tests/src/test_ZmwChunkedFastxReader.cpp b/tests/src/test_ZmwChunkedFastxReader.cpp

new file mode 100644 (file)

index 0000000..9ac0d4c
--- /dev/null
+++ b/tests/src/test_ZmwChunkedFastxReader.cpp
@@ -0,0 +1,121 @@
+// Author: Derek Barnett
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include "PbbamTestData.h"
+
+#include "pbbam/ZmwChunkedFastaReader.h"
+#include "pbbam/ZmwChunkedFastqReader.h"
+
+#include "FastxTests.h"
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+TEST(ZmwChunkedFastaReader, standard_fasta_from_chunk)
+{
+    ZmwChunkedFastaReader reader{FastxTests::chunkingFastaFn, 5};
+
+    {
+        const std::vector<std::string> expectedNames{"seq/0", "seq/1", "seq/2", "seq/3",
+                                                     "seq/4", "seq/5", "seq/6"};
+
+        reader.Chunk(0);
+
+        std::vector<std::string> names;
+        for (const auto& seq : reader)
+            names.push_back(seq.Name());
+
+        ASSERT_EQ(expectedNames.size(), names.size());
+        for (size_t i = 0; i < names.size(); ++i)
+            EXPECT_EQ(expectedNames.at(i), names.at(i));
+    }
+
+    {
+        const std::vector<std::string> expectedNames{"seq/14", "seq/15", "seq/16",
+                                                     "seq/17", "seq/18", "seq/19"};
+
+        reader.Chunk(2);
+
+        std::vector<std::string> names;
+        for (const auto& seq : reader)
+            names.push_back(seq.Name());
+
+        ASSERT_EQ(expectedNames.size(), names.size());
+        for (size_t i = 0; i < names.size(); ++i)
+            EXPECT_EQ(expectedNames.at(i), names.at(i));
+    }
+
+    {
+        const std::vector<std::string> expectedNames{
+            "seq/50",          "seq/100/0_100",      "seq/100/100_200",
+            "seq/100/200_300", "seq/100/300_400",    "seq/110/ccs",
+            "seq/120/ccs",     "seq/130/transcript", "seq/140/transcript"};
+
+        reader.Chunk(4);
+
+        std::vector<std::string> names;
+        for (const auto& seq : reader)
+            names.push_back(seq.Name());
+
+        ASSERT_EQ(expectedNames.size(), names.size());
+        for (size_t i = 0; i < names.size(); ++i)
+            EXPECT_EQ(expectedNames.at(i), names.at(i));
+    }
+}
+
+TEST(ZmwChunkedFastqReader, standard_fastq_from_chunk)
+{
+    ZmwChunkedFastqReader reader{FastxTests::chunkingFastqFn, 5};
+
+    {
+        const std::vector<std::string> expectedNames{"seq/0", "seq/1", "seq/2", "seq/3",
+                                                     "seq/4", "seq/5", "seq/6"};
+
+        reader.Chunk(0);
+
+        std::vector<std::string> names;
+        for (const auto& seq : reader)
+            names.push_back(seq.Name());
+
+        ASSERT_EQ(expectedNames.size(), names.size());
+        for (size_t i = 0; i < names.size(); ++i)
+            EXPECT_EQ(expectedNames.at(i), names.at(i));
+    }
+
+    {
+        const std::vector<std::string> expectedNames{"seq/14", "seq/15", "seq/16",
+                                                     "seq/17", "seq/18", "seq/19"};
+
+        reader.Chunk(2);
+
+        std::vector<std::string> names;
+        for (const auto& seq : reader)
+            names.push_back(seq.Name());
+
+        ASSERT_EQ(expectedNames.size(), names.size());
+        for (size_t i = 0; i < names.size(); ++i)
+            EXPECT_EQ(expectedNames.at(i), names.at(i));
+    }
+
+    {
+        const std::vector<std::string> expectedNames{
+            "seq/50",          "seq/100/0_100",      "seq/100/100_200",
+            "seq/100/200_300", "seq/100/300_400",    "seq/110/ccs",
+            "seq/120/ccs",     "seq/130/transcript", "seq/140/transcript"};
+
+        reader.Chunk(4);
+
+        std::vector<std::string> names;
+        for (const auto& seq : reader)
+            names.push_back(seq.Name());
+
+        ASSERT_EQ(expectedNames.size(), names.size());
+        for (size_t i = 0; i < names.size(); ++i)
+            EXPECT_EQ(expectedNames.at(i), names.at(i));
+    }
+}
+\ No newline at end of file
diff --git a/tests/src/test_ZmwQuery.cpp b/tests/src/test_ZmwQuery.cpp

new file mode 100644 (file)

index 0000000..77ee1c2
--- /dev/null
+++ b/tests/src/test_ZmwQuery.cpp
@@ -0,0 +1,31 @@
+// Author: Derek Barnett
+
+#include <string>
+
+#include <gtest/gtest.h>
+
+#include "PbbamTestData.h"
+
+#include <pbbam/ZmwQuery.h>
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+//TEST(EntireFileQueryTest, CountRecords)
+//{
+//    EXPECT_NO_THROW(
+//    {
+//        // open input BAM file
+//        BamFile bamFile(inputBamFn);
+
+//        // count records
+//        int count = 0;
+//        EntireFileQuery entireFile(bamFile);
+//        for (const BamRecord& record : entireFile) {
+//            (void)record;
+//            ++count;
+//        }
+
+//        EXPECT_EQ(3307, count);
+//    });
+//}
diff --git a/tests/src/test_ZmwReadStitcher.cpp b/tests/src/test_ZmwReadStitcher.cpp

new file mode 100644 (file)

index 0000000..2e634e4
--- /dev/null
+++ b/tests/src/test_ZmwReadStitcher.cpp
@@ -0,0 +1,486 @@
+// Author: Derek Barnett
+
+#include <cstddef>
+#include <string>
+
+#include <gtest/gtest.h>
+
+#include "PbbamTestData.h"
+
+#include <pbbam/EntireFileQuery.h>
+#include <pbbam/PbiFilter.h>
+#include <pbbam/virtual/VirtualPolymeraseCompositeReader.h>
+#include <pbbam/virtual/VirtualPolymeraseReader.h>
+#include <pbbam/virtual/ZmwReadStitcher.h>
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+namespace ZmwReadStitcherTests {
+// Expects that b1 and b2 each report every Has*() field below, then that their values match.
+static void Compare(const BamRecord& b1, const BamRecord& b2)
+{
+    EXPECT_TRUE(b1.HasDeletionQV());
+    EXPECT_TRUE(b1.HasDeletionTag());
+    EXPECT_TRUE(b1.HasInsertionQV());
+    EXPECT_TRUE(b1.HasMergeQV());
+    EXPECT_TRUE(b1.HasSubstitutionQV());
+    EXPECT_TRUE(b1.HasSubstitutionTag());
+    EXPECT_TRUE(b1.HasLabelQV());
+    EXPECT_TRUE(b1.HasAltLabelQV());
+    EXPECT_TRUE(b1.HasAltLabelTag());
+    EXPECT_TRUE(b1.HasPkmean());
+    EXPECT_TRUE(b1.HasPkmid());
+    EXPECT_TRUE(b1.HasPulseCall());
+    EXPECT_TRUE(b1.HasIPD());
+    EXPECT_TRUE(b1.HasPulseWidth());
+    EXPECT_TRUE(b1.HasPrePulseFrames());
+    EXPECT_TRUE(b1.HasPulseCallWidth());
+    EXPECT_TRUE(b1.HasPulseMergeQV());
+    // the same presence checks for the second record
+    EXPECT_TRUE(b2.HasDeletionQV());
+    EXPECT_TRUE(b2.HasDeletionTag());
+    EXPECT_TRUE(b2.HasInsertionQV());
+    EXPECT_TRUE(b2.HasMergeQV());
+    EXPECT_TRUE(b2.HasSubstitutionQV());
+    EXPECT_TRUE(b2.HasSubstitutionTag());
+    EXPECT_TRUE(b2.HasLabelQV());
+    EXPECT_TRUE(b2.HasAltLabelQV());
+    EXPECT_TRUE(b2.HasAltLabelTag());
+    EXPECT_TRUE(b2.HasPkmean());
+    EXPECT_TRUE(b2.HasPkmid());
+    EXPECT_TRUE(b2.HasPulseCall());
+    EXPECT_TRUE(b2.HasIPD());
+    EXPECT_TRUE(b2.HasPulseWidth());
+    EXPECT_TRUE(b2.HasPrePulseFrames());
+    EXPECT_TRUE(b2.HasPulseCallWidth());
+    EXPECT_TRUE(b2.HasPulseMergeQV());
+    // value comparison; EXPECT_* is non-fatal, so every mismatching field gets reported
+    EXPECT_EQ(b1.FullName(), b2.FullName());
+    EXPECT_EQ(b1.HoleNumber(), b2.HoleNumber());
+    EXPECT_EQ(b1.NumPasses(), b2.NumPasses());
+    EXPECT_EQ(b1.Sequence(), b2.Sequence());
+    EXPECT_EQ(b1.Qualities(), b2.Qualities());
+    EXPECT_EQ(b1.DeletionQV(), b2.DeletionQV());
+    EXPECT_EQ(b1.DeletionTag(), b2.DeletionTag());
+    EXPECT_EQ(b1.InsertionQV(), b2.InsertionQV());
+    EXPECT_EQ(b1.MergeQV(), b2.MergeQV());
+    EXPECT_EQ(b1.SubstitutionQV(), b2.SubstitutionQV());
+    EXPECT_EQ(b1.SubstitutionTag(), b2.SubstitutionTag());
+    EXPECT_EQ(b1.LabelQV(), b2.LabelQV());
+    EXPECT_EQ(b1.AltLabelQV(), b2.AltLabelQV());
+    EXPECT_EQ(b1.AltLabelTag(), b2.AltLabelTag());
+    EXPECT_EQ(b1.Pkmean(), b2.Pkmean());
+    EXPECT_EQ(b1.Pkmid(), b2.Pkmid());
+    EXPECT_EQ(b1.PulseCall(), b2.PulseCall());
+    EXPECT_EQ(b1.IPD(), b2.IPD());
+    EXPECT_EQ(b1.PulseWidth(), b2.PulseWidth());
+    EXPECT_EQ(b1.PrePulseFrames(), b2.PrePulseFrames());
+    EXPECT_EQ(b1.PulseCallWidth(), b2.PulseCallWidth());
+    EXPECT_EQ(b1.ReadGroup(), b2.ReadGroup());
+    EXPECT_EQ(b1.PulseMergeQV(), b2.PulseMergeQV());
+}
+// Stitches the given primary/scraps BAM pair and returns how many records Next() yielded.
+static size_t NumVirtualRecords(const std::string& primaryBamFn, const std::string& scrapsBamFn)
+{
+    ZmwReadStitcher stitcher(primaryBamFn, scrapsBamFn);
+    size_t count = 0;
+    while (stitcher.HasNext()) {
+        const auto record = stitcher.Next();
+        // record is deliberately left unused; only the number of records matters
+        ++count;
+    }
+    return count;
+}
+
+}  // namespace ZmwReadStitcherTests
+
+TEST(ZmwReadStitching, FromBams_NoFilter)
+{
+    ZmwReadStitcher stitcher(PbbamTestsConfig::Data_Dir + "/polymerase/internal.subreads.bam",
+                             PbbamTestsConfig::Data_Dir + "/polymerase/internal.scraps.bam");
+    size_t count = 0;
+    while (stitcher.HasNext()) {
+        const auto record = stitcher.Next();
+        //        ()record;
+        ++count;
+    }
+    EXPECT_EQ(3, count);
+}
+
+TEST(ZmwReadStitching, FromBams_Filtered)
+{
+    PbiFilter filter{PbiZmwFilter{100000}};  // setup to match DataSet w/ filter
+    ZmwReadStitcher stitcher(PbbamTestsConfig::Data_Dir + "/polymerase/internal.subreads.bam",
+                             PbbamTestsConfig::Data_Dir + "/polymerase/internal.scraps.bam",
+                             filter);
+    size_t count = 0;
+    while (stitcher.HasNext()) {
+        const auto record = stitcher.Next();
+        EXPECT_EQ(100000, record.HoleNumber());
+        ++count;
+    }
+    EXPECT_EQ(1, count);
+}
+
+TEST(ZmwReadStitching, FromDataSet_NoFilter)
+{
+    // dataset contains these resources (subreads/scraps + hqregion/scraps BAMs)
+    const std::string primaryFn1 =
+        PbbamTestsConfig::Data_Dir + "/polymerase/production.subreads.bam";
+    const std::string scrapsFn1 = PbbamTestsConfig::Data_Dir + "/polymerase/production.scraps.bam";
+    const std::string primaryFn2 =
+        PbbamTestsConfig::Data_Dir + "/polymerase/production_hq.hqregion.bam";
+    const std::string scrapsFn2 =
+        PbbamTestsConfig::Data_Dir + "/polymerase/production_hq.scraps.bam";
+    const size_t numExpectedRecords =
+        ZmwReadStitcherTests::NumVirtualRecords(primaryFn1, scrapsFn1) +
+        ZmwReadStitcherTests::NumVirtualRecords(primaryFn2, scrapsFn2);
+
+    const std::string datasetFn =
+        PbbamTestsConfig::Data_Dir + "/polymerase/multiple_resources.subread.dataset.xml";
+
+    DataSet ds{datasetFn};
+    ZmwReadStitcher stitcher{ds};
+    size_t numObservedRecords = 0;
+    while (stitcher.HasNext()) {
+        const auto record = stitcher.Next();
+        //        ()record;
+        ++numObservedRecords;
+    }
+    EXPECT_EQ(numExpectedRecords, numObservedRecords);
+}
+
+TEST(ZmwReadStitching, FromDataSet_Filtered)
+{
+    // dataset contains these resources (subreads/scraps + hqregion/scraps BAMs)
+    const std::string primaryFn1 =
+        PbbamTestsConfig::Data_Dir + "/polymerase/production.subreads.bam";
+    const std::string scrapsFn1 = PbbamTestsConfig::Data_Dir + "/polymerase/production.scraps.bam";
+    const std::string primaryFn2 = PbbamTestsConfig::Data_Dir + "/polymerase/internal.subreads.bam";
+    const std::string scrapsFn2 = PbbamTestsConfig::Data_Dir + "/polymerase/internal.scraps.bam";
+    const std::string primaryFn3 =
+        PbbamTestsConfig::Data_Dir + "/polymerase/production_hq.hqregion.bam";
+    const std::string scrapsFn3 =
+        PbbamTestsConfig::Data_Dir + "/polymerase/production_hq.scraps.bam";
+    const size_t totalRecords = ZmwReadStitcherTests::NumVirtualRecords(primaryFn1, scrapsFn1) +
+                                ZmwReadStitcherTests::NumVirtualRecords(primaryFn2, scrapsFn2) +
+                                ZmwReadStitcherTests::NumVirtualRecords(primaryFn3, scrapsFn3);
+    EXPECT_EQ(5, totalRecords);
+
+    // our filter will remove the 2 "production" BAM pairs
+    // using a ZMW filter that only the "internal" pair should pass
+    const std::string datasetFn =
+        PbbamTestsConfig::Data_Dir + "/polymerase/filtered_resources.subread.dataset.xml";
+
+    DataSet ds{datasetFn};
+    ZmwReadStitcher stitcher{ds};
+    size_t numObservedRecords = 0;
+    while (stitcher.HasNext()) {
+        const auto record = stitcher.Next();
+        //        ()record;
+        ++numObservedRecords;
+    }
+    EXPECT_EQ(1, numObservedRecords);
+}
+
+TEST(ZmwReadStitching, FromDataSet_EmptyDataSet)
+{
+    ZmwReadStitcher stitcher{DataSet{}};  // built from a default-constructed DataSet
+    EXPECT_FALSE(stitcher.HasNext());     // such a stitcher must report nothing to read
+}
+
+TEST(ZmwReadStitching, EmptyScrapsFile)
+{
+    const std::string primaryBamFn =
+        PbbamTestsConfig::Data_Dir + "/polymerase/scrapless.subreads.bam";
+    const std::string scrapsBamFn = PbbamTestsConfig::Data_Dir + "/polymerase/scrapless.scraps.bam";
+
+    const BamFile primaryBam(primaryBamFn);
+    const BamFile scrapsBam(scrapsBamFn);
+    const PbiRawData primaryIdx(primaryBam.PacBioIndexFilename());
+    const PbiRawData scrapsIdx(scrapsBam.PacBioIndexFilename());
+    EXPECT_EQ(3, primaryIdx.NumReads());
+    EXPECT_EQ(0, scrapsIdx.NumReads());
+
+    int count = 0;
+    ZmwReadStitcher stitcher(primaryBamFn, scrapsBamFn);
+    while (stitcher.HasNext()) {
+        auto record = stitcher.Next();
+        //        ()record;
+        ++count;
+    }
+    EXPECT_EQ(3, count);
+}
+
+TEST(ZmwReadStitching, VirtualRegions)
+{
+    // Create virtual polymerase read
+    ZmwReadStitcher stitcher(PbbamTestsConfig::Data_Dir + "/polymerase/internal.subreads.bam",
+                             PbbamTestsConfig::Data_Dir + "/polymerase/internal.scraps.bam");
+    auto virtualRecord = stitcher.Next();
+    // NOTE(review): Next() is called without a preceding HasNext() check
+    auto regionMap = virtualRecord.VirtualRegionsMap();
+    auto adapter = virtualRecord.VirtualRegionsTable(VirtualRegionType::ADAPTER);
+
+    // Compare different accessors to same source
+    EXPECT_EQ(regionMap[VirtualRegionType::ADAPTER], adapter);
+    // NOTE(review): the tables below are indexed without a size check - consider asserting sizes
+    // Compare to truth
+    EXPECT_EQ(3047, adapter[0].beginPos);
+    EXPECT_EQ(3095, adapter[0].endPos);
+    EXPECT_EQ(3650, adapter[1].beginPos);
+    EXPECT_EQ(3700, adapter[1].endPos);
+    EXPECT_EQ(4289, adapter[2].beginPos);
+    EXPECT_EQ(4335, adapter[2].endPos);
+    EXPECT_EQ(4888, adapter[3].beginPos);
+    EXPECT_EQ(4939, adapter[3].endPos);
+    EXPECT_EQ(5498, adapter[4].beginPos);
+    EXPECT_EQ(5546, adapter[4].endPos);
+    EXPECT_EQ(6116, adapter[5].beginPos);
+    EXPECT_EQ(6173, adapter[5].endPos);
+    EXPECT_EQ(6740, adapter[6].beginPos);
+    EXPECT_EQ(6790, adapter[6].endPos);
+
+    auto barcode = virtualRecord.VirtualRegionsTable(VirtualRegionType::BARCODE);
+    EXPECT_EQ(regionMap[VirtualRegionType::BARCODE], barcode);
+    EXPECT_EQ(3025, barcode[0].beginPos);
+    EXPECT_EQ(3047, barcode[0].endPos);
+    EXPECT_EQ(3095, barcode[1].beginPos);
+    EXPECT_EQ(3116, barcode[1].endPos);
+    EXPECT_EQ(3628, barcode[2].beginPos);
+    EXPECT_EQ(3650, barcode[2].endPos);
+    EXPECT_EQ(3700, barcode[3].beginPos);
+    EXPECT_EQ(3722, barcode[3].endPos);
+    EXPECT_EQ(4267, barcode[4].beginPos);
+    EXPECT_EQ(4289, barcode[4].endPos);
+    EXPECT_EQ(4335, barcode[5].beginPos);
+    EXPECT_EQ(4356, barcode[5].endPos);
+    EXPECT_EQ(4864, barcode[6].beginPos);
+    EXPECT_EQ(4888, barcode[6].endPos);
+    EXPECT_EQ(4939, barcode[7].beginPos);
+    EXPECT_EQ(4960, barcode[7].endPos);
+    EXPECT_EQ(5477, barcode[8].beginPos);
+    EXPECT_EQ(5498, barcode[8].endPos);
+    EXPECT_EQ(5546, barcode[9].beginPos);
+    EXPECT_EQ(5571, barcode[9].endPos);
+    EXPECT_EQ(6087, barcode[10].beginPos);
+    EXPECT_EQ(6116, barcode[10].endPos);
+    EXPECT_EQ(6173, barcode[11].beginPos);
+    EXPECT_EQ(6199, barcode[11].endPos);
+    EXPECT_EQ(6719, barcode[12].beginPos);
+    EXPECT_EQ(6740, barcode[12].endPos);
+    EXPECT_EQ(6790, barcode[13].beginPos);
+    EXPECT_EQ(6812, barcode[13].endPos);
+
+    auto lqregion = virtualRecord.VirtualRegionsTable(VirtualRegionType::LQREGION);
+    EXPECT_EQ(regionMap[VirtualRegionType::LQREGION], lqregion);
+    EXPECT_EQ(0, lqregion[0].beginPos);
+    EXPECT_EQ(2659, lqregion[0].endPos);
+    EXPECT_EQ(7034, lqregion[1].beginPos);
+    EXPECT_EQ(7035, lqregion[1].endPos);
+
+    auto hqregion = virtualRecord.VirtualRegionsTable(VirtualRegionType::HQREGION);
+    EXPECT_EQ(regionMap[VirtualRegionType::HQREGION], hqregion);
+    EXPECT_EQ(2659, hqregion[0].beginPos);  // the expected HQ span [2659, 7034) sits exactly
+    EXPECT_EQ(7034, hqregion[0].endPos);    // between the two expected LQ regions above
+}
+
+TEST(ZmwReadStitching, InternalSubreadsToOriginal)
+{
+    // Stitch the subreads + scraps files back into a virtual polymerase read
+    ZmwReadStitcher stitcher(PbbamTestsConfig::Data_Dir + "/polymerase/internal.subreads.bam",
+                             PbbamTestsConfig::Data_Dir + "/polymerase/internal.scraps.bam");
+    // fatal on failure: there is nothing to fetch with Next() otherwise
+    ASSERT_TRUE(stitcher.HasNext());
+    auto virtualRecord = stitcher.Next();
+
+    // Read the first record of the original polymerase file
+    BamFile polyBam(PbbamTestsConfig::Data_Dir + "/polymerase/internal.polymerase.bam");
+    EntireFileQuery polyQuery(polyBam);
+    auto begin = polyQuery.begin();
+    auto end = polyQuery.end();
+    // fatal on failure: 'begin' must not be dereferenced when it equals 'end'
+    ASSERT_TRUE(begin != end);
+    auto polyRecord = *begin;
+
+    // check original against stitched record
+    ZmwReadStitcherTests::Compare(polyRecord, virtualRecord);
+}
+
+TEST(ZmwReadStitching, InternalHQToOriginal)
+{
+    // Stitch the hqregions + lqregions files back into a virtual polymerase read
+    ZmwReadStitcher stitcher(PbbamTestsConfig::Data_Dir + "/polymerase/internal.hqregions.bam",
+                             PbbamTestsConfig::Data_Dir + "/polymerase/internal.lqregions.bam");
+    // fatal on failure: there is nothing to fetch with Next() otherwise
+    ASSERT_TRUE(stitcher.HasNext());
+    auto virtualRecord = stitcher.Next();
+
+    // Read the first record of the original polymerase file
+    BamFile polyBam(PbbamTestsConfig::Data_Dir + "/polymerase/internal.polymerase.bam");
+    EntireFileQuery polyQuery(polyBam);
+    auto begin = polyQuery.begin();
+    auto end = polyQuery.end();
+    // fatal on failure: 'begin' must not be dereferenced when it equals 'end'
+    ASSERT_TRUE(begin != end);
+    auto polyRecord = *begin;
+
+    // check original against stitched record
+    ZmwReadStitcherTests::Compare(polyRecord, virtualRecord);
+}
+
+TEST(ZmwReadStitching, ProductionSubreadsToOriginal)
+{
+    // Stitch the subreads + scraps files back into a virtual polymerase read
+    ZmwReadStitcher stitcher(PbbamTestsConfig::Data_Dir + "/polymerase/production.subreads.bam",
+                             PbbamTestsConfig::Data_Dir + "/polymerase/production.scraps.bam");
+
+    // fatal on failure: there is nothing to fetch with Next() otherwise
+    ASSERT_TRUE(stitcher.HasNext());
+    auto virtualRecord = stitcher.Next();
+    // exactly one stitched record is expected from these inputs
+    EXPECT_FALSE(stitcher.HasNext());
+
+    // Read the first record of the original polymerase file
+    BamFile polyBam(PbbamTestsConfig::Data_Dir + "/polymerase/production.polymerase.bam");
+    EntireFileQuery polyQuery(polyBam);
+
+    auto begin = polyQuery.begin();
+    auto end = polyQuery.end();
+    // fatal on failure: 'begin' must not be dereferenced when it equals 'end'
+    ASSERT_TRUE(begin != end);
+    auto polyRecord = *begin;
+
+    // field-by-field comparison of original vs. stitched record
+    EXPECT_EQ(polyRecord.FullName(), virtualRecord.FullName());
+    EXPECT_EQ(polyRecord.HoleNumber(), virtualRecord.HoleNumber());
+    EXPECT_FLOAT_EQ(polyRecord.ReadAccuracy(), virtualRecord.ReadAccuracy());
+    EXPECT_EQ(polyRecord.NumPasses(), virtualRecord.NumPasses());
+    EXPECT_EQ(polyRecord.Sequence(), virtualRecord.Sequence());
+    EXPECT_EQ(polyRecord.Qualities(), virtualRecord.Qualities());
+    EXPECT_EQ(polyRecord.DeletionQV(), virtualRecord.DeletionQV());
+    EXPECT_EQ(polyRecord.DeletionTag(), virtualRecord.DeletionTag());
+    EXPECT_EQ(polyRecord.InsertionQV(), virtualRecord.InsertionQV());
+    EXPECT_EQ(polyRecord.MergeQV(), virtualRecord.MergeQV());
+    EXPECT_EQ(polyRecord.SubstitutionQV(), virtualRecord.SubstitutionQV());
+    EXPECT_EQ(polyRecord.SubstitutionTag(), virtualRecord.SubstitutionTag());
+    EXPECT_EQ(polyRecord.IPD(), virtualRecord.IPDV1Frames());
+    EXPECT_EQ(polyRecord.ReadGroup(), virtualRecord.ReadGroup());
+}
+
+TEST(ZmwReadStitching, ProductionHQToOriginal)
+{
+    // Stitch the hqregion + scraps files back into a virtual polymerase read
+    ZmwReadStitcher stitcher(PbbamTestsConfig::Data_Dir + "/polymerase/production_hq.hqregion.bam",
+                             PbbamTestsConfig::Data_Dir + "/polymerase/production_hq.scraps.bam");
+    // fatal on failure: there is nothing to fetch with Next() otherwise
+    ASSERT_TRUE(stitcher.HasNext());
+    auto virtualRecord = stitcher.Next();
+    // exactly one stitched record is expected from these inputs
+    EXPECT_FALSE(stitcher.HasNext());
+
+    // Read the first record of the original polymerase file
+    BamFile polyBam(PbbamTestsConfig::Data_Dir + "/polymerase/production.polymerase.bam");
+    EntireFileQuery polyQuery(polyBam);
+
+    auto begin = polyQuery.begin();
+    auto end = polyQuery.end();
+    // fatal on failure: 'begin' must not be dereferenced when it equals 'end'
+    ASSERT_TRUE(begin != end);
+    auto polyRecord = *begin;
+
+    // field-by-field comparison of original vs. stitched record
+    EXPECT_FALSE(polyRecord.HasPulseCall());
+    EXPECT_FALSE(virtualRecord.HasPulseCall());
+    EXPECT_EQ(polyRecord.FullName(), virtualRecord.FullName());
+    EXPECT_EQ(polyRecord.HoleNumber(), virtualRecord.HoleNumber());
+    // same floating-point comparison as used in ProductionSubreadsToOriginal
+    EXPECT_FLOAT_EQ(polyRecord.ReadAccuracy(), virtualRecord.ReadAccuracy());
+    EXPECT_EQ(polyRecord.NumPasses(), virtualRecord.NumPasses());
+    EXPECT_EQ(polyRecord.Sequence(), virtualRecord.Sequence());
+    EXPECT_EQ(polyRecord.Qualities(), virtualRecord.Qualities());
+    EXPECT_EQ(polyRecord.DeletionQV(), virtualRecord.DeletionQV());
+    EXPECT_EQ(polyRecord.DeletionTag(), virtualRecord.DeletionTag());
+    EXPECT_EQ(polyRecord.InsertionQV(), virtualRecord.InsertionQV());
+    EXPECT_EQ(polyRecord.MergeQV(), virtualRecord.MergeQV());
+    EXPECT_EQ(polyRecord.SubstitutionQV(), virtualRecord.SubstitutionQV());
+    EXPECT_EQ(polyRecord.SubstitutionTag(), virtualRecord.SubstitutionTag());
+    EXPECT_EQ(polyRecord.IPD(), virtualRecord.IPDV1Frames());
+    EXPECT_EQ(polyRecord.ReadGroup(), virtualRecord.ReadGroup());
+
+    // tag presence on the original record
+    EXPECT_TRUE(polyRecord.HasDeletionQV());
+    EXPECT_TRUE(polyRecord.HasDeletionTag());
+    EXPECT_TRUE(polyRecord.HasInsertionQV());
+    EXPECT_TRUE(polyRecord.HasMergeQV());
+    EXPECT_TRUE(polyRecord.HasSubstitutionQV());
+    EXPECT_TRUE(polyRecord.HasSubstitutionTag());
+    EXPECT_TRUE(polyRecord.HasIPD());
+    EXPECT_FALSE(polyRecord.HasLabelQV());
+    EXPECT_FALSE(polyRecord.HasAltLabelQV());
+    EXPECT_FALSE(polyRecord.HasAltLabelTag());
+    EXPECT_FALSE(polyRecord.HasPkmean());
+    EXPECT_FALSE(polyRecord.HasPkmid());
+    EXPECT_FALSE(polyRecord.HasPulseCall());
+    EXPECT_FALSE(polyRecord.HasPulseWidth());
+    EXPECT_FALSE(polyRecord.HasPrePulseFrames());
+    EXPECT_FALSE(polyRecord.HasPulseCallWidth());
+
+    // the stitched record must expose the same set of tags
+    EXPECT_TRUE(virtualRecord.HasDeletionQV());
+    EXPECT_TRUE(virtualRecord.HasDeletionTag());
+    EXPECT_TRUE(virtualRecord.HasInsertionQV());
+    EXPECT_TRUE(virtualRecord.HasMergeQV());
+    EXPECT_TRUE(virtualRecord.HasSubstitutionQV());
+    EXPECT_TRUE(virtualRecord.HasSubstitutionTag());
+    EXPECT_TRUE(virtualRecord.HasIPD());
+    EXPECT_FALSE(virtualRecord.HasLabelQV());
+    EXPECT_FALSE(virtualRecord.HasAltLabelQV());
+    EXPECT_FALSE(virtualRecord.HasAltLabelTag());
+    EXPECT_FALSE(virtualRecord.HasPkmean());
+    EXPECT_FALSE(virtualRecord.HasPkmid());
+    EXPECT_FALSE(virtualRecord.HasPulseCall());
+    EXPECT_FALSE(virtualRecord.HasPulseWidth());
+    EXPECT_FALSE(virtualRecord.HasPrePulseFrames());
+    EXPECT_FALSE(virtualRecord.HasPulseCallWidth());
+}
+
+TEST(ZmwReadStitching, VirtualRecord_VirtualRegionsTable)
+{
+    ZmwReadStitcher stitcher(PbbamTestsConfig::Data_Dir + "/polymerase/production.subreads.bam",
+                             PbbamTestsConfig::Data_Dir + "/polymerase/production.scraps.bam");
+    // fatal on failure: there is nothing to fetch with Next() otherwise
+    ASSERT_TRUE(stitcher.HasNext());
+    const auto virtualRecord = stitcher.Next();
+
+    // fetch the region table for every region type
+    const auto subreads = virtualRecord.VirtualRegionsTable(VirtualRegionType::SUBREAD);
+    const auto adapters = virtualRecord.VirtualRegionsTable(VirtualRegionType::ADAPTER);
+    const auto hqRegions = virtualRecord.VirtualRegionsTable(VirtualRegionType::HQREGION);
+    const auto lqRegions = virtualRecord.VirtualRegionsTable(VirtualRegionType::LQREGION);
+    const auto barcodes = virtualRecord.VirtualRegionsTable(VirtualRegionType::BARCODE);
+    const auto filtered = virtualRecord.VirtualRegionsTable(VirtualRegionType::FILTERED);
+
+    EXPECT_FALSE(subreads.empty());
+    EXPECT_FALSE(adapters.empty());
+    EXPECT_FALSE(hqRegions.empty());
+    EXPECT_FALSE(lqRegions.empty());
+    EXPECT_FALSE(barcodes.empty());
+    EXPECT_TRUE(filtered.empty());  // this type not present in this data
+}
+
+TEST(ZmwReadStitching, LegacyTypedefsOk)
+{
+    // The legacy reader name must still be usable and iterate the data.
+    {
+        VirtualPolymeraseReader reader(
+            PbbamTestsConfig::Data_Dir + "/polymerase/internal.subreads.bam",
+            PbbamTestsConfig::Data_Dir + "/polymerase/internal.scraps.bam");
+        size_t count = 0;
+        while (reader.HasNext()) {
+            const auto record = reader.Next();
+            (void)record;  // only the number of records is checked
+            ++count;
+        }
+        // unsigned literal keeps the comparison with size_t free of sign mismatch
+        EXPECT_EQ(size_t{3}, count);
+    }
+
+    // The legacy composite reader name, built from an empty DataSet, yields nothing.
+    {
+        VirtualPolymeraseCompositeReader reader{DataSet{}};
+        EXPECT_FALSE(reader.HasNext());
+    }
+}
+
+TEST(ZmwReadStitching, EmptyScrapSorting)
+{
+    // input pair for this case
+    const auto subreadsFn = PbbamTestsConfig::Data_Dir + "/stitching/test_qstart.subreads.bam";
+    const auto scrapsFn = PbbamTestsConfig::Data_Dir + "/stitching/test_qstart.scraps.bam";
+
+    ZmwReadStitcher stitcher{subreadsFn, scrapsFn};
+    ASSERT_TRUE(stitcher.HasNext());
+
+    // the stitched record is expected to span [0, 1397)
+    const auto stitched = stitcher.Next();
+    EXPECT_EQ(0, stitched.QueryStart());
+    EXPECT_EQ(1397, stitched.QueryEnd());
+}
diff --git a/tools/bam2sam/src/Bam2SamSettings.cpp b/tools/bam2sam/src/Bam2SamSettings.cpp

new file mode 100644 (file)

index 0000000..a62d191
--- /dev/null
+++ b/tools/bam2sam/src/Bam2SamSettings.cpp
@@ -0,0 +1,81 @@
+// Author: Derek Barnett
+
+#include "Bam2SamSettings.h"
+
+#include <stdexcept>
+
+#include "Bam2SamVersion.h"
+
+namespace PacBio {
+namespace Bam2Sam {
+namespace Options {
+
+// clang-format off
+// Command-line switch "--no-header": omit the header from the output.
+const CLI_v2::Option NoHeader{
+R"({
+    "names" : ["no-header"],
+    "description" : "Omit header from output."
+})"};
+
+// Command-line switch "--header-only": print the header and no records.
+const CLI_v2::Option HeaderOnly{
+R"({
+    "names" : ["header-only"],
+    "description" : "Print only the header (no records)."
+})"};
+
+// Optional positional argument naming the input BAM file; declared as
+// not required, so the tool can be run without it.
+const CLI_v2::PositionalArgument InputFile{
+R"({
+    "name" : "IN.bam",
+    "description" : "Input BAM file. If not provided, stdin will be used as input.",
+    "type" : "file",
+    "required" : false
+})"};
+// clang-format on
+
+}  // namespace Options
+
+// Builds the command-line interface description for bam2sam: tool name,
+// help text, version, the two header switches and the optional input file.
+CLI_v2::Interface Settings::CreateCLI()
+{
+    // clang-format off
+    const std::string helpText{
+        "bam2sam converts a BAM file to SAM. It is essentially a stripped-down\n"
+        "'samtools view', mostly useful for testing/debugging without requiring samtools.\n"
+        "Input BAM file is read from a file or stdin, and SAM output is written to stdout."};
+
+    CLI_v2::Interface cli{"bam2sam", helpText, Bam2Sam::Version};
+
+    // turn off the stock log-file, log-level and thread-count options
+    cli.DisableLogFileOption()
+       .DisableLogLevelOption()
+       .DisableNumThreadsOption();
+
+    cli.AddOptionGroup("Options", {Options::NoHeader, Options::HeaderOnly});
+    cli.AddPositionalArguments({Options::InputFile});
+    // clang-format on
+
+    return cli;
+}
+
+// Populates the settings from parsed command-line results.
+// Throws std::runtime_error when both header switches are set.
+Settings::Settings(const CLI_v2::Results& args)
+    : NoHeader{args[Options::NoHeader]}, HeaderOnly{args[Options::HeaderOnly]}
+{
+    // first positional argument is the input; "-" is used when none was given
+    const auto& positionals = args.PositionalArguments();
+    if (!positionals.empty())
+        InputFilename = positionals.front();
+    else
+        InputFilename = "-";
+
+    // the two header print modes exclude each other
+    const bool conflictingHeaderModes = NoHeader && HeaderOnly;
+    if (conflictingHeaderModes) {
+        throw std::runtime_error{
+            "conflicting arguments requested '--no-header' and '--header-only'"};
+    }
+}
+
+}  // namespace Bam2Sam
+}  // namespace PacBio
diff --git a/tools/bam2sam/src/Bam2SamSettings.h b/tools/bam2sam/src/Bam2SamSettings.h

new file mode 100644 (file)

index 0000000..68dc6b9
--- /dev/null
+++ b/tools/bam2sam/src/Bam2SamSettings.h
@@ -0,0 +1,27 @@
+// Author: Derek Barnett
+
+#ifndef BAM2SAM_SETTINGS_H
+#define BAM2SAM_SETTINGS_H
+
+#include <string>
+
+#include <pbcopper/cli2/CLI.h>
+
+namespace PacBio {
+namespace Bam2Sam {
+
+// Run-time settings of the bam2sam tool, filled in from the command line.
+struct Settings
+{
+    // Returns the command-line interface definition for the tool.
+    static CLI_v2::Interface CreateCLI();
+
+    // Builds the settings from parsed command-line results.
+    explicit Settings(const CLI_v2::Results& args);
+
+    // Name of the input to read.
+    std::string InputFilename;
+    // Set from the "no-header" option.
+    bool NoHeader = false;
+    // Set from the "header-only" option.
+    bool HeaderOnly = false;
+};
+
+}  // namespace Bam2Sam
+}  // namespace PacBio
+
+#endif  // BAM2SAM_SETTINGS_H
diff --git a/tools/bam2sam/src/Bam2SamVersion.h.in b/tools/bam2sam/src/Bam2SamVersion.h.in

new file mode 100644 (file)

index 0000000..6f4bb9b
--- /dev/null
+++ b/tools/bam2sam/src/Bam2SamVersion.h.in
@@ -0,0 +1,16 @@
+// Author: Derek Barnett
+
+#ifndef BAM2SAM_VERSION_H
+#define BAM2SAM_VERSION_H
+
+#include <string>
+
+namespace PacBio {
+namespace Bam2Sam {
+
+// Tool version; the @...@ placeholder is a template token in this .in file.
+const std::string Version{"@Bam2Sam_VERSION@"};
+
+} // namespace Bam2Sam
+} // namespace PacBio
+
+#endif // BAM2SAM_VERSION_H
diff --git a/tools/bam2sam/src/Bam2SamWorkflow.cpp b/tools/bam2sam/src/Bam2SamWorkflow.cpp

new file mode 100644 (file)

index 0000000..dd4d385
--- /dev/null
+++ b/tools/bam2sam/src/Bam2SamWorkflow.cpp
@@ -0,0 +1,88 @@
+// Author: Derek Barnett
+
+#include "Bam2SamWorkflow.h"
+
+#include <memory>
+#include <stdexcept>
+
+#include <htslib/sam.h>
+
+#include "Bam2SamSettings.h"
+
+namespace PacBio {
+namespace Bam2Sam {
+namespace {
+
+// Deleter for use with std::unique_ptr<samFile, ...>: closes the file handle.
+struct HtslibFileDeleter
+{
+    // The pointer is received by value, so there is nothing to reset afterwards.
+    // The return value of sam_close() is not inspected here.
+    void operator()(samFile* file) const
+    {
+        if (file) sam_close(file);
+    }
+};
+
+// Deleter for use with std::unique_ptr<bam_hdr_t, ...>: destroys the header.
+struct HtslibHeaderDeleter
+{
+    // The pointer is received by value, so there is nothing to reset afterwards.
+    void operator()(bam_hdr_t* hdr) const
+    {
+        if (hdr) bam_hdr_destroy(hdr);
+    }
+};
+
+// Deleter for use with std::unique_ptr<bam1_t, ...>: destroys the record.
+struct HtslibRecordDeleter
+{
+    // The pointer is received by value, so there is nothing to reset afterwards.
+    void operator()(bam1_t* b) const
+    {
+        if (b) bam_destroy1(b);
+    }
+};
+
+}  // namespace
+
+// Converts the input BAM to SAM text on stdout.
+//
+// Returns EXIT_SUCCESS when everything requested was written. Throws
+// std::runtime_error when the input cannot be opened or read, or when the
+// output cannot be opened, written or closed.
+int Workflow::Runner(const CLI_v2::Results& args)
+{
+    const Settings settings{args};
+
+    // open files
+
+    std::unique_ptr<samFile, HtslibFileDeleter> inFileWrapper(
+        sam_open(settings.InputFilename.c_str(), "rb"));
+    samFile* in = inFileWrapper.get();
+    if (!in || !in->fp.bgzf) {
+        // Settings stores "-" when no input file was given; only then is stdin the culprit
+        if (settings.InputFilename == "-") throw std::runtime_error("could not read from stdin");
+        throw std::runtime_error("could not read from input file: " + settings.InputFilename);
+    }
+
+    std::unique_ptr<samFile, HtslibFileDeleter> outFileWrapper(sam_open("-", "w"));
+    samFile* out = outFileWrapper.get();
+    if (!out) throw std::runtime_error("could not write to stdout");
+
+    // Close the output by hand on the success paths so that a failing close
+    // is reported; the deleter would drop that result. On error paths the
+    // wrapper still owns the handle and closes it during unwinding.
+    const auto closeOutput = [&outFileWrapper]() {
+        if (sam_close(outFileWrapper.release()) < 0)
+            throw std::runtime_error("error closing stdout, output may be incomplete");
+    };
+
+    // fetch & write header
+
+    std::unique_ptr<bam_hdr_t, HtslibHeaderDeleter> headerWrapper(bam_hdr_read(in->fp.bgzf));
+    bam_hdr_t* hdr = headerWrapper.get();
+    if (!hdr) throw std::runtime_error("could not read header");
+
+    if (!settings.NoHeader) {
+        if (sam_hdr_write(out, hdr) != 0) throw std::runtime_error("could not write header");
+        if (settings.HeaderOnly) {
+            closeOutput();
+            return EXIT_SUCCESS;
+        }
+    }
+
+    // fetch & write records
+
+    std::unique_ptr<bam1_t, HtslibRecordDeleter> recordWrapper(bam_init1());
+    bam1_t* b = recordWrapper.get();
+    if (!b) throw std::runtime_error("could not allocate record");
+
+    int readResult = 0;
+    while ((readResult = sam_read1(in, hdr, b)) >= 0) {
+        if (sam_write1(out, hdr, b) < 0)
+            throw std::runtime_error("error writing record to stdout");
+    }
+
+    // sam_read1 reports end of input as -1; anything lower is a read failure
+    // and must not be mistaken for a clean end of file
+    if (readResult < -1) throw std::runtime_error("error reading record from input");
+
+    closeOutput();
+    return EXIT_SUCCESS;
+}
+
+}  // namespace Bam2Sam
+}  // namespace PacBio
diff --git a/tools/bam2sam/src/Bam2SamWorkflow.h b/tools/bam2sam/src/Bam2SamWorkflow.h

new file mode 100644 (file)

index 0000000..71d5fec
--- /dev/null
+++ b/tools/bam2sam/src/Bam2SamWorkflow.h
@@ -0,0 +1,19 @@
+// Author: Derek Barnett
+
+#ifndef BAM2SAM_WORKFLOW_H
+#define BAM2SAM_WORKFLOW_H
+
+#include <pbcopper/cli2/Results.h>
+
+namespace PacBio {
+namespace Bam2Sam {
+
+// Entry point of the bam2sam conversion.
+struct Workflow
+{
+    // Runs the tool with the parsed command-line results; the returned int
+    // is handed back by main() as the process exit code.
+    static int Runner(const CLI_v2::Results& args);
+};
+
+}  // namespace Bam2Sam
+}  // namespace PacBio
+
+#endif  // BAM2SAM_WORKFLOW_H
diff --git a/tools/bam2sam/src/main.cpp b/tools/bam2sam/src/main.cpp

new file mode 100644 (file)

index 0000000..be5df58
--- /dev/null
+++ b/tools/bam2sam/src/main.cpp
@@ -0,0 +1,21 @@
+// Author: Derek Barnett
+
+#include <cstdlib>
+#include <iostream>
+#include <stdexcept>
+
+#include <pbcopper/cli2/CLI.h>
+
+#include "Bam2SamSettings.h"
+#include "Bam2SamWorkflow.h"
+
+// Program entry: hands the command line to the CLI runner and turns any
+// std::exception into an error message plus a failing exit code.
+int main(int argc, char* argv[])
+{
+    namespace Bam2Sam = PacBio::Bam2Sam;
+
+    int exitCode = EXIT_FAILURE;
+    try {
+        exitCode = PacBio::CLI_v2::Run(argc, argv, Bam2Sam::Settings::CreateCLI(),
+                                       &Bam2Sam::Workflow::Runner);
+    } catch (const std::exception& e) {
+        std::cerr << "bam2sam ERROR: " << e.what() << '\n';
+    }
+    return exitCode;
+}
diff --git a/tools/check-formatting b/tools/check-formatting

new file mode 100755 (executable)

index 0000000..95a52e8
--- /dev/null
+++ b/tools/check-formatting
@@ -0,0 +1,34 @@
+#!/usr/bin/env bash
+#
+# Reports whether C++ sources need clang-format changes; nothing is modified.
+#
+#   --all     check every *.cpp / *.h under include, src, tests/src and tools
+#   --staged  check only the *.h / *.cpp files staged in git
+#
+# Exit code: 0 when no changes are needed, 1 otherwise (or on bad usage).
+
+PLATFORM=$(uname)
+TOOLSPATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd -P)"
+# An array keeps the binary path intact even if it contains spaces.
+CLANGFORMAT=("${TOOLSPATH}/${PLATFORM}/clang-format" -style=file)
+
+if [ "$1" == "--all" ]
+then
+    # Patterns are quoted so that find receives them verbatim, rather than
+    # whatever the shell would expand them to in the current directory.
+    find include src tests/src tools \( -name '*.cpp' -or -name '*.h' \) -not -name 'pugi*' -print0 \
+    | xargs -n1 -0 "${CLANGFORMAT[@]}" -output-replacements-xml \
+    | grep -c "<replacement " > /dev/null
+    grepCode=$?
+elif [ "$1" == "--staged" ]
+then
+    # Keep headers and sources, then drop third-party paths in a second
+    # grep: a pattern placed after -v on the first one would be taken as a
+    # file name, and the staged list on stdin would never be read.
+    git diff --cached --name-only --diff-filter=ACMRT \
+    | grep -e '\.h$' -e '\.cpp$' \
+    | grep -v 'third-party/' \
+    | xargs -n1 "${CLANGFORMAT[@]}" -output-replacements-xml \
+    | grep -c "<replacement " >/dev/null
+    grepCode=$?
+else
+    echo "Please specify --all or --staged"
+    exit 1
+fi
+
+# grep exits 0 => found needed formatting changes
+if [ $grepCode -ne 0 ]
+then
+    echo "Formatting looks good!"
+    exit 0
+else
+    echo "****************************************************"
+    echo "Code needs formatting!  Please use 'tools/format-all'"
+    echo "****************************************************"
+    exit 1
+fi
diff --git a/tools/format-all b/tools/format-all

new file mode 100755 (executable)

index 0000000..27a11b4
--- /dev/null
+++ b/tools/format-all
@@ -0,0 +1,9 @@
+#!/usr/bin/env bash
+
+# This command can be run by the user to clang-format everything.
+# It rewrites the matching files in place.
+
+PLATFORM=$(uname)
+TOOLSPATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd -P)"
+# An array keeps the binary path intact even if it contains spaces.
+CLANGFORMAT=("${TOOLSPATH}/${PLATFORM}/clang-format" -style=file)
+
+# Patterns are quoted so that find receives them verbatim, rather than
+# whatever the shell would expand them to in the current directory.
+find include src tests/src tools \( -name '*.cpp' -or -name '*.h' \) -not -name 'pugi*' -print0 \
+| xargs -n1 -0 "${CLANGFORMAT[@]}" -i
diff --git a/tools/git-clang-format b/tools/git-clang-format

new file mode 100755 (executable)

index 0000000..0c45762
--- /dev/null
+++ b/tools/git-clang-format
@@ -0,0 +1,485 @@
+#!/usr/bin/env python
+#
+#===- git-clang-format - ClangFormat Git Integration ---------*- python -*--===#
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+#===------------------------------------------------------------------------===#
+
+r"""                                                                             
+clang-format git integration                                                     
+============================                                                     
+                                                                                 
+This file provides a clang-format integration for git. Put it somewhere in your  
+path and ensure that it is executable. Then, "git clang-format" will invoke      
+clang-format on the changes in current files or a specific commit.               
+                                                                                 
+For further details, run:                                                        
+git clang-format -h                                                              
+                                                                                 
+Requires Python 2.7                                                              
+"""               
+
+import argparse
+import collections
+import contextlib
+import errno
+import os
+import re
+import subprocess
+import sys
+
+# One-line synopsis shown by argparse.
+usage = 'git clang-format [OPTIONS] [<commit>] [--] [<file>...]'
+
+# Long description shown by argparse; the listed git-config keys are the ones
+# main() looks up for option defaults.
+desc = '''
+Run clang-format on all lines that differ between the working directory
+and <commit>, which defaults to HEAD.  Changes are only applied to the working
+directory.
+
+The following git-config settings set the default of the corresponding option:
+  clangFormat.binary
+  clangFormat.commit
+  clangFormat.extensions
+  clangFormat.style
+'''
+
+# Name of the temporary index file in which to save the output of clang-format.
+# This file is created within the .git directory.
+temp_index_basename = 'clang-format-index'
+
+
+# A run of `count` lines beginning at line number `start`.
+Range = collections.namedtuple('Range', 'start, count')
+
+
+def main():
+  """Command-line entry point.
+
+  Parses the options (defaults come from git config), works out which lines
+  changed relative to the chosen commit, runs clang-format on those lines and
+  then either prints the resulting diff or applies it to the working
+  directory."""
+  config = load_git_config()
+
+  # In order to keep '--' yet allow options after positionals, we need to
+  # check for '--' ourselves.  (Setting nargs='*' throws away the '--', while
+  # nargs=argparse.REMAINDER disallows options after positionals.)
+  argv = sys.argv[1:]
+  try:
+    idx = argv.index('--')
+  except ValueError:
+    dash_dash = []
+  else:
+    dash_dash = argv[idx:]
+    argv = argv[:idx]
+
+  default_extensions = ','.join([
+      # From clang/lib/Frontend/FrontendOptions.cpp, all lower case
+      'c', 'h',  # C
+      'm',  # ObjC
+      'mm',  # ObjC++
+      'cc', 'cp', 'cpp', 'c++', 'cxx', 'hpp',  # C++
+      # Other languages that clang-format supports
+      'proto', 'protodevel',  # Protocol Buffers
+      'js',  # JavaScript
+      'ts',  # TypeScript
+      ])
+
+  p = argparse.ArgumentParser(
+    usage=usage, formatter_class=argparse.RawDescriptionHelpFormatter,
+    description=desc)
+  p.add_argument('--binary',
+                 default=config.get('clangformat.binary', 'clang-format'),
+                 help='path to clang-format')
+  p.add_argument('--commit',
+                 default=config.get('clangformat.commit', 'HEAD'),
+                 help='default commit to use if none is specified')
+  p.add_argument('--diff', action='store_true',
+                 help='print a diff instead of applying the changes')
+  p.add_argument('--extensions',
+                 default=config.get('clangformat.extensions',
+                                    default_extensions),
+                 help=('comma-separated list of file extensions to format, '
+                       'excluding the period and case-insensitive'))
+  p.add_argument('-f', '--force', action='store_true',
+                 help='allow changes to unstaged files')
+  p.add_argument('-p', '--patch', action='store_true',
+                 help='select hunks interactively')
+  p.add_argument('-q', '--quiet', action='count', default=0,
+                 help='print less information')
+  p.add_argument('--style',
+                 default=config.get('clangformat.style', None),
+                 help='passed to clang-format')
+  p.add_argument('-v', '--verbose', action='count', default=0,
+                 help='print extra information')
+  # We gather all the remaining positional arguments into 'args' since we need
+  # to use some heuristics to determine whether or not <commit> was present.
+  # However, to print pretty messages, we make use of metavar and help.
+  p.add_argument('args', nargs='*', metavar='<commit>',
+                 help='revision from which to compute the diff')
+  p.add_argument('ignored', nargs='*', metavar='<file>...',
+                 help='if specified, only consider differences in these files')
+  opts = p.parse_args(argv)
+
+  # Fold -q into the verbosity level: each -q cancels one -v.
+  opts.verbose -= opts.quiet
+  del opts.quiet
+
+  commit, files = interpret_args(opts.args, dash_dash, opts.commit)
+  changed_lines = compute_diff_and_extract_lines(commit, files)
+  if opts.verbose >= 1:
+    # Remember every changed file so the ones filtered out can be reported.
+    ignored_files = set(changed_lines)
+  filter_by_extension(changed_lines, opts.extensions.lower().split(','))
+  if opts.verbose >= 1:
+    ignored_files.difference_update(changed_lines)
+    if ignored_files:
+      print 'Ignoring changes in the following files (wrong extension):'
+      for filename in ignored_files:
+        print '   ', filename
+    if changed_lines:
+      print 'Running clang-format on the following files:'
+      for filename in changed_lines:
+        print '   ', filename
+  if not changed_lines:
+    print 'no modified files to format'
+    return
+  # The computed diff outputs absolute paths, so we must cd before accessing
+  # those files.
+  cd_to_toplevel()
+  old_tree = create_tree_from_workdir(changed_lines)
+  new_tree = run_clang_format_and_save_to_tree(changed_lines,
+                                               binary=opts.binary,
+                                               style=opts.style)
+  if opts.verbose >= 1:
+    print 'old tree:', old_tree
+    print 'new tree:', new_tree
+  if old_tree == new_tree:
+    if opts.verbose >= 0:
+      print 'clang-format did not modify any files'
+  elif opts.diff:
+    print_diff(old_tree, new_tree)
+  else:
+    changed_files = apply_changes(old_tree, new_tree, force=opts.force,
+                                  patch_mode=opts.patch)
+    if (opts.verbose >= 0 and not opts.patch) or opts.verbose >= 1:
+      print 'changed files:'
+      for filename in changed_files:
+        print '   ', filename
+
+
+def load_git_config(non_string_options=None):
+  """Return the git configuration as a dictionary.
+
+  All options are assumed to be strings unless in `non_string_options`, which
+  is a dictionary mapping option name (in lower case) to either "--bool" or
+  "--int"."""
+  if non_string_options is None:
+    non_string_options = {}
+  out = {}
+  for entry in run('git', 'config', '--list', '--null').split('\0'):
+    if entry:
+      if '\n' in entry:
+        name, value = entry.split('\n', 1)
+      else:
+        # A key listed without a value has no newline separator; git treats
+        # such a setting as boolean true.
+        name = entry
+        value = 'true'
+      if name in non_string_options:
+        # Let git itself convert typed options.
+        value = run('git', 'config', non_string_options[name], name)
+      out[name] = value
+  return out
+
+
+def interpret_args(args, dash_dash, default_commit):
+  """Split the positional arguments into a (commit, files) pair.
+
+  `args` holds what came before any "--"; `dash_dash` holds the "--" itself
+  plus everything after it (empty when no "--" was given).
+
+  With "--": at most one argument may precede it and it must name a commit or
+  tag; everything after "--" is the file list.  Without "--": the first
+  argument is the commit if it resolves to a revision, otherwise all
+  arguments are files.  `default_commit` is used whenever no commit is
+  given."""
+  if dash_dash:
+    given = len(args)
+    if given > 1:
+      die('at most one commit allowed; %d given' % len(args))
+    elif given == 1:
+      commit = args[0]
+    else:
+      commit = default_commit
+    object_type = get_object_type(commit)
+    if object_type is None:
+      die("'%s' is not a commit" % commit)
+    elif object_type not in ('commit', 'tag'):
+      die("'%s' is a %s, but a commit was expected" % (commit, object_type))
+    return commit, dash_dash[1:]
+
+  if not args:
+    return default_commit, []
+  if disambiguate_revision(args[0]):
+    return args[0], args[1:]
+  return default_commit, args
+
+
+def disambiguate_revision(value):
+  """Tell a revision from a file name.
+
+  Returns True when `value` names a commit or tag and False when it is not a
+  git object at all; any other object type is a fatal error."""
+  # `git rev-parse` is run first: for an ambiguous `value` (neither a commit
+  # nor a file) it fails with a suitable message of its own.
+  run('git', 'rev-parse', value, verbose=False)
+  kind = get_object_type(value)
+  if kind in ('commit', 'tag'):
+    return True
+  if kind is None:
+    return False
+  die('`%s` is a %s, but a commit or filename was expected' %
+      (value, kind))
+
+
+def get_object_type(value):
+  """Ask `git cat-file -t` for the type of `value`.
+
+  Returns the stripped output of the command, or None when the command exits
+  with a non-zero status."""
+  query = subprocess.Popen(['git', 'cat-file', '-t', value],
+                           stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+  # stderr is captured only to keep git's complaint off the terminal.
+  type_name, _ = query.communicate()
+  return type_name.strip() if query.returncode == 0 else None
+
+
+def compute_diff_and_extract_lines(commit, files):
+  """Run compute_diff() and feed its output through extract_lines().
+
+  Exits with status 2 when the diff process reports failure."""
+  differ = compute_diff(commit, files)
+  line_ranges = extract_lines(differ.stdout)
+  differ.stdout.close()
+  status = differ.wait()
+  if status != 0:
+    # Assume error was already printed to stderr.
+    sys.exit(2)
+  return line_ranges
+
+
+def compute_diff(commit, files):
+  """Return a subprocess object producing the diff from `commit`.
+
+  The return value's `stdout` file object will produce a patch with the
+  differences between the working directory and `commit`, filtered on `files`
+  (if non-empty).  Zero context lines are used in the patch."""
+  cmd = ['git', 'diff-index', '-p', '-U0', commit, '--']
+  cmd.extend(files)
+  p = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
+  # The child gets no input; close its stdin right away.
+  p.stdin.close()
+  return p
+
+
+def extract_lines(patch_file):
+  """Collect the changed line ranges from a patch.
+
+  `patch_file` is iterated line by line and must be a unified diff produced
+  with ``-U0`` (zero lines of context).
+
+  Returns a dict mapping each filename to a list of `Range`s, one per hunk
+  that has at least one line on the new side."""
+  file_header = re.compile(r'^\+\+\+\ [^/]+/(.*)')
+  hunk_header = re.compile(r'^@@ -[0-9,]+ \+(\d+)(,(\d+))?')
+  ranges_by_file = {}
+  for line in patch_file:
+    header = file_header.search(line)
+    if header:
+      filename = header.group(1).rstrip('\r\n')
+    hunk = hunk_header.search(line)
+    if not hunk:
+      continue
+    first_line = int(hunk.group(1))
+    # A hunk header without an explicit count covers exactly one line.
+    num_lines = int(hunk.group(3)) if hunk.group(3) else 1
+    if num_lines > 0:
+      ranges_by_file.setdefault(filename, []).append(
+          Range(first_line, num_lines))
+  return ranges_by_file
+
+
+def filter_by_extension(dictionary, allowed_extensions):
+  """Delete every key in `dictionary` that doesn't have an allowed extension.
+
+  `allowed_extensions` must be a collection of lowercase file extensions,
+  excluding the period.  Keys without any period are deleted as well.  The
+  dictionary is modified in place; nothing is returned."""
+  allowed_extensions = frozenset(allowed_extensions)
+  # Iterate over a snapshot of the keys: entries are deleted inside the loop,
+  # which is only safe when the key collection is a separate list.
+  for filename in list(dictionary.keys()):
+    base_ext = filename.rsplit('.', 1)
+    if len(base_ext) == 1 or base_ext[1].lower() not in allowed_extensions:
+      del dictionary[filename]
+
+
+def cd_to_toplevel():
+  """Make the directory reported by `git rev-parse --show-toplevel` the
+  current working directory."""
+  os.chdir(run('git', 'rev-parse', '--show-toplevel'))
+
+
+def create_tree_from_workdir(filenames):
+  """Build a git tree out of the given working-directory files.
+
+  Returns whatever create_tree() returns for them in '--stdin' mode, i.e. the
+  ID of the new tree."""
+  tree_id = create_tree(filenames, '--stdin')
+  return tree_id
+
+
+def run_clang_format_and_save_to_tree(changed_lines, binary='clang-format',
+                                      style=None):
+  """Run clang-format on each file and save the result to a git tree.
+
+  `changed_lines` maps each filename to the line ranges to format; `binary`
+  and `style` are passed through to clang_format_to_blob().
+
+  Returns the object ID (SHA-1) of the created tree."""
+  def index_info_generator():
+    # items() is available on every Python version, unlike iteritems().
+    for filename, line_ranges in changed_lines.items():
+      # Spell the mode as a plain octal number with one leading zero.  This
+      # is the text Python 2's oct() yields, and it avoids the '0o' prefix
+      # newer interpreters would put into the index line.
+      mode = '0%o' % os.stat(filename).st_mode
+      blob_id = clang_format_to_blob(filename, line_ranges, binary=binary,
+                                     style=style)
+      yield '%s %s\t%s' % (mode, blob_id, filename)
+  return create_tree(index_info_generator(), '--index-info')
+
+
+def create_tree(input_lines, mode):
+  """Feed `input_lines` to `git update-index` and write the result as a tree.
+
+  `mode` selects how git reads the input and must be '--stdin' (each item is
+  a filename) or '--index-info' (each item is a line suitable for "git
+  update-index --index-info", such as "<mode> <SP> <sha1> <TAB> <filename>").
+  Items are sent NUL-terminated.  The work is done against a temporary index
+  file.
+
+  Returns the output of `git write-tree`."""
+  assert mode in ('--stdin', '--index-info')
+  update_index_cmd = ['git', 'update-index', '--add', '-z', mode]
+  with temporary_index_file():
+    updater = subprocess.Popen(update_index_cmd, stdin=subprocess.PIPE)
+    for entry in input_lines:
+      updater.stdin.write('%s\0' % entry)
+    updater.stdin.close()
+    if updater.wait() != 0:
+      die('`%s` failed' % ' '.join(update_index_cmd))
+    return run('git', 'write-tree')
+
+
+def clang_format_to_blob(filename, line_ranges, binary='clang-format',
+                         style=None):
+  """Format the given line ranges of `filename` and store the result as a blob.
+
+  The output of clang-format is piped straight into `git hash-object -w`.
+
+  Returns the output of `git hash-object` (the ID of the new blob) with the
+  trailing line break removed."""
+  format_cmd = [binary, filename]
+  if style:
+    format_cmd.append('-style=' + style)
+  for start_line, line_count in line_ranges:
+    format_cmd.append(
+        '-lines=%s:%s' % (start_line, start_line + line_count - 1))
+  try:
+    formatter = subprocess.Popen(format_cmd, stdin=subprocess.PIPE,
+                                 stdout=subprocess.PIPE)
+  except OSError as e:
+    if e.errno != errno.ENOENT:
+      raise
+    die('cannot find executable "%s"' % binary)
+  formatter.stdin.close()
+  hash_cmd = ['git', 'hash-object', '-w', '--path=' + filename, '--stdin']
+  hasher = subprocess.Popen(hash_cmd, stdin=formatter.stdout,
+                            stdout=subprocess.PIPE)
+  # The hasher holds its own handle on the pipe now; drop ours.
+  formatter.stdout.close()
+  blob_id = hasher.communicate()[0]
+  if hasher.returncode != 0:
+    die('`%s` failed' % ' '.join(hash_cmd))
+  if formatter.wait() != 0:
+    die('`%s` failed' % ' '.join(format_cmd))
+  return blob_id.rstrip('\r\n')
+
+
+@contextlib.contextmanager
+def temporary_index_file(tree=None):
+  """Context manager for setting GIT_INDEX_FILE to a temporary file and deleting
+  the file afterward."""
+  index_path = create_temporary_index(tree)
+  old_index_path = os.environ.get('GIT_INDEX_FILE')
+  os.environ['GIT_INDEX_FILE'] = index_path
+  try:
+    yield
+  finally:
+    if old_index_path is None:
+      del os.environ['GIT_INDEX_FILE']
+    else:
+      os.environ['GIT_INDEX_FILE'] = old_index_path
+    os.remove(index_path)
+
+
+def create_temporary_index(tree=None):
+  """Create a temporary index file and return the created file's path.
+
+  If `tree` is not None, use that as the tree to read in.  Otherwise, an
+  empty index is created."""
+  gitdir = run('git', 'rev-parse', '--git-dir')
+  path = os.path.join(gitdir, temp_index_basename)
+  if tree is None:
+    tree = '--empty'
+  run('git', 'read-tree', '--index-output='+path, tree)
+  return path
+
+
+def print_diff(old_tree, new_tree):
+  """Print the diff between the two trees to stdout."""
+  # We use the porcelain 'diff' and not plumbing 'diff-tree' because the output
+  # is expected to be viewed by the user, and only the former does nice things
+  # like color and pagination.
+  subprocess.check_call(['git', 'diff', old_tree, new_tree, '--'])
+
+
+def apply_changes(old_tree, new_tree, force=False, patch_mode=False):
+  """Apply the changes in `new_tree` to the working directory.
+
+  Bails if there are local changes in those files and not `force`.  If
+  `patch_mode`, runs `git checkout --patch` to select hunks interactively."""
+  changed_files = run('git', 'diff-tree', '-r', '-z', '--name-only', old_tree,
+                      new_tree).rstrip('\0').split('\0')
+  if not force:
+    unstaged_files = run('git', 'diff-files', '--name-status', *changed_files)
+    if unstaged_files:
+      print >>sys.stderr, ('The following files would be modified but '
+                           'have unstaged changes:')
+      print >>sys.stderr, unstaged_files
+      print >>sys.stderr, 'Please commit, stage, or stash them first.'
+      sys.exit(2)
+  if patch_mode:
+    # In patch mode, we could just as well create an index from the new tree
+    # and checkout from that, but then the user will be presented with a
+    # message saying "Discard ... from worktree".  Instead, we use the old
+    # tree as the index and checkout from new_tree, which gives the slightly
+    # better message, "Apply ... to index and worktree".  This is not quite
+    # right, since it won't be applied to the user's index, but oh well.
+    with temporary_index_file(old_tree):
+      subprocess.check_call(['git', 'checkout', '--patch', new_tree])
+    index_tree = old_tree
+  else:
+    with temporary_index_file(new_tree):
+      run('git', 'checkout-index', '-a', '-f')
+  return changed_files
+
+
+def run(*args, **kwargs):
+  stdin = kwargs.pop('stdin', '')
+  verbose = kwargs.pop('verbose', True)
+  strip = kwargs.pop('strip', True)
+  for name in kwargs:
+    raise TypeError("run() got an unexpected keyword argument '%s'" % name)
+  p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
+                       stdin=subprocess.PIPE)
+  stdout, stderr = p.communicate(input=stdin)
+  if p.returncode == 0:
+    if stderr:
+      if verbose:
+        print >>sys.stderr, '`%s` printed to stderr:' % ' '.join(args)
+      print >>sys.stderr, stderr.rstrip()
+    if strip:
+      stdout = stdout.rstrip('\r\n')
+    return stdout
+  if verbose:
+    print >>sys.stderr, '`%s` returned %s' % (' '.join(args), p.returncode)
+  if stderr:
+    print >>sys.stderr, stderr.rstrip()
+  sys.exit(2)
+
+
+def die(message):
+  """Print `message` to stderr, prefixed with 'error:', and exit the script
+  with status 2."""
+  print >>sys.stderr, 'error:', message
+  sys.exit(2)
+
+
+# Run main() only when this file is executed as a script, not when imported.
+if __name__ == '__main__':
+  main()
diff --git a/tools/meson.build b/tools/meson.build

new file mode 100644 (file)

index 0000000..4f267c5
--- /dev/null
+++ b/tools/meson.build
@@ -0,0 +1,239 @@
+###########
+# bam2sam #
+###########
+
+# Substitution values used for Bam2SamVersion.h and, when tests are enabled,
+# for the generated bam2sam.t cram test.
+pbbam_Bam2SamVersion_h_config = configuration_data()
+pbbam_Bam2SamVersion_h_config.set('Bam2Sam_VERSION', meson.project_version())
+pbbam_Bam2SamVersion_h_config.set('PacBioBAM_BinDir', meson.current_build_dir())
+pbbam_Bam2SamVersion_h_config.set('PacBioBAM_TestsDir', join_paths([meson.current_source_dir(), '../tests']))
+
+# Generate the version header from its template.
+pbbam_Bam2SamVersion_h = configure_file(
+  input : files('bam2sam/src/Bam2SamVersion.h.in'),
+  output : 'Bam2SamVersion.h',
+  configuration : pbbam_Bam2SamVersion_h_config)
+
+# The generated header is listed with the sources of the executable.
+pbbam_bam2sam_cpp_sources = [pbbam_Bam2SamVersion_h]
+pbbam_bam2sam_cpp_sources += files([
+  'bam2sam/src/main.cpp',
+  'bam2sam/src/Bam2SamSettings.cpp',
+  'bam2sam/src/Bam2SamWorkflow.cpp'])
+
+# Unlike the other tools in this file, bam2sam is not linked with pbbam_lib.
+# It is installed only when the 'build-tools' option is enabled.
+pbbam_bam2sam = executable(
+  'bam2sam',
+  pbbam_bam2sam_cpp_sources,
+  dependencies : [pbbam_htslib_dep, pbbam_thread_dep, pbbam_zlib_dep, pbbam_boost_dep, pbbam_pbcopper_dep],
+  include_directories : [pbbam_include_directories, include_directories('bam2sam')],
+  install : get_option('build-tools'),
+  cpp_args : pbbam_warning_flags)
+
+# tests
+if get_option('tests')
+  # Generate the cram test from its template, then register it with a
+  # 30-minute timeout; results go to an xunit file in the build root.
+  bam2sam_t = configure_file(
+    input : pbbam_cram_bam2sam_t_in,
+    output : 'bam2sam.t',
+    configuration : pbbam_Bam2SamVersion_h_config)
+
+  test(
+    'bam2sam_CramTests',
+    pbbam_cram_script,
+    args : [
+      '--xunit-file=' + join_paths(meson.build_root(), 'pbbam-cram-bam2sam.xml'),
+      '--verbose'] + [
+        bam2sam_t],
+    timeout : 1800)
+endif
+
+###########
+# pbindex #
+###########
+
+# Generate PbIndexVersion.h from its template with the project version.
+pbbam_PbIndexVersion_h_config = configuration_data()
+pbbam_PbIndexVersion_h_config.set('PbIndex_VERSION', meson.project_version())
+pbbam_PbIndexVersion_h = configure_file(
+  input : files('pbindex/src/PbIndexVersion.h.in'),
+  output : 'PbIndexVersion.h',
+  configuration : pbbam_PbIndexVersion_h_config)
+
+# The generated header is listed with the sources of the executable.
+pbbam_pbindex_cpp_sources = [pbbam_PbIndexVersion_h]
+pbbam_pbindex_cpp_sources += files([
+  'pbindex/src/main.cpp',
+  'pbindex/src/PbIndexSettings.cpp',
+  'pbindex/src/PbIndexWorkflow.cpp'])
+
+# Linked with pbbam_lib; installed only when 'build-tools' is enabled.
+# No tests are registered for pbindex in this file.
+pbbam_pbindex = executable(
+  'pbindex',
+  pbbam_pbindex_cpp_sources,
+  dependencies : [pbbam_htslib_dep, pbbam_zlib_dep, pbbam_boost_dep, pbbam_pbcopper_dep],
+  include_directories : [pbbam_include_directories, include_directories('pbindex')],
+  link_with : pbbam_lib,
+  install : get_option('build-tools'),
+  cpp_args : pbbam_warning_flags)
+
+###############
+# pbindexdump #
+###############
+
+# Substitution values used for PbIndexDumpVersion.h and, when tests are
+# enabled, for the generated pbindexdump cram tests.
+pbbam_PbIndexDumpVersion_h_config = configuration_data()
+pbbam_PbIndexDumpVersion_h_config.set('PbIndexDump_VERSION', meson.project_version())
+pbbam_PbIndexDumpVersion_h_config.set('PacBioBAM_BinDir', meson.current_build_dir())
+pbbam_PbIndexDumpVersion_h_config.set('PacBioBAM_TestsDir', join_paths([meson.current_source_dir(), '../tests']))
+pbbam_PbIndexDumpVersion_h = configure_file(
+  input : files('pbindexdump/src/PbIndexDumpVersion.h.in'),
+  output : 'PbIndexDumpVersion.h',
+  configuration : pbbam_PbIndexDumpVersion_h_config)
+
+# The generated header is listed with the sources of the executable.
+pbbam_pbindexdump_cpp_sources = [pbbam_PbIndexDumpVersion_h]
+pbbam_pbindexdump_cpp_sources += files([
+  'pbindexdump/src/CppFormatter.cpp',
+  'pbindexdump/src/JsonFormatter.cpp',
+  'pbindexdump/src/PbIndexDumpSettings.cpp',
+  'pbindexdump/src/PbIndexDumpWorkflow.cpp',
+  'pbindexdump/src/main.cpp'])
+
+# Linked with pbbam_lib; installed only when 'build-tools' is enabled.
+pbbam_pbindexdump = executable(
+  'pbindexdump',
+  pbbam_pbindexdump_cpp_sources,
+  dependencies : [pbbam_htslib_dep, pbbam_zlib_dep, pbbam_boost_dep, pbbam_pbcopper_dep],
+  include_directories : [pbbam_include_directories, include_directories('pbindexdump')],
+  link_with : pbbam_lib,
+  install : get_option('build-tools'),
+  cpp_args : pbbam_warning_flags)
+
+# tests
+if get_option('tests')
+  # One generated cram test per output format (JSON and C++).
+  pbindexdump_json_t = configure_file(
+    input : pbbam_cram_pbindexdump_json_t_in,
+    output : 'pbindexdump_json.t',
+    configuration : pbbam_PbIndexDumpVersion_h_config)
+  pbindexdump_cpp_t = configure_file(
+    input : pbbam_cram_pbindexdump_cpp_t_in,
+    output : 'pbindexdump_cpp.t',
+    configuration : pbbam_PbIndexDumpVersion_h_config)
+
+  # Both generated tests run in a single test target with a 30-minute timeout.
+  test(
+    'pbindexdump_CramTests',
+    pbbam_cram_script,
+    args : [
+      '--xunit-file=' + join_paths(meson.build_root(), 'pbbam-cram-pbindexdump.xml'),
+      '--verbose'] + [
+        pbindexdump_json_t,
+        pbindexdump_cpp_t],
+    timeout : 1800)
+endif
+
+###########
+# pbmerge #
+###########
+
+# Substitution values used for PbMergeVersion.h and, when the tests below are
+# enabled, for the generated pbmerge cram tests.
+pbbam_PbMergeVersion_h_config = configuration_data()
+pbbam_PbMergeVersion_h_config.set('PbMerge_VERSION', meson.project_version())
+pbbam_PbMergeVersion_h_config.set('PacBioBAM_VERSION', meson.project_version())
+pbbam_PbMergeVersion_h_config.set('PacBioBAM_BinDir', meson.current_build_dir())
+pbbam_PbMergeVersion_h_config.set('PacBioBAM_TestsDir', join_paths([meson.current_source_dir(), '../tests']))
+pbbam_PbMergeVersion_h_config.set('GeneratedTestDataDir', join_paths(meson.current_build_dir()))
+pbbam_PbMergeVersion_h = configure_file(
+  input : files('pbmerge/src/PbMergeVersion.h.in'),
+  output : 'PbMergeVersion.h',
+  configuration : pbbam_PbMergeVersion_h_config)
+
+# The generated header is listed with the sources of the executable.
+pbbam_pbmerge_cpp_sources = [pbbam_PbMergeVersion_h]
+pbbam_pbmerge_cpp_sources += files([
+  'pbmerge/src/main.cpp',
+  'pbmerge/src/PbMergeSettings.cpp',
+  'pbmerge/src/PbMergeWorkflow.cpp'])
+
+# Linked with pbbam_lib; installed only when 'build-tools' is enabled.
+pbbam_pbmerge = executable(
+  'pbmerge',
+  pbbam_pbmerge_cpp_sources,
+  dependencies : [pbbam_boost_dep, pbbam_htslib_dep, pbbam_zlib_dep, pbbam_pbcopper_dep],
+  include_directories : [pbbam_include_directories, include_directories('pbmerge')],
+  link_with : pbbam_lib,
+  install : get_option('build-tools'),
+  cpp_args : pbbam_warning_flags)
+
+# tests
+# These cram tests are skipped when the 'auto-validate' option is enabled.
+if get_option('tests') and not get_option('auto-validate')
+  pbmerge_pacbio_ordering_t = configure_file(
+    input : pbbam_cram_pbmerge_pacbio_ordering_t_in,
+    output : 'pbmerge_pacbio_ordering.t', configuration : pbbam_PbMergeVersion_h_config)
+  pbmerge_aligned_ordering_t = configure_file(
+    input : pbbam_cram_pbmerge_aligned_ordering_t_in,
+    output : 'pbmerge_aligned_ordering.t', configuration : pbbam_PbMergeVersion_h_config)
+  pbmerge_mixed_ordering_t = configure_file(
+    input : pbbam_cram_pbmerge_mixed_ordering_t_in,
+    output : 'pbmerge_mixed_ordering.t', configuration : pbbam_PbMergeVersion_h_config)
+  pbmerge_dataset_t = configure_file(
+    input : pbbam_cram_pbmerge_dataset_t_in,
+    output : 'pbmerge_dataset.t', configuration : pbbam_PbMergeVersion_h_config)
+  pbmerge_fofn_t = configure_file(
+    input : pbbam_cram_pbmerge_fofn_t_in,
+    output : 'pbmerge_fofn.t', configuration : pbbam_PbMergeVersion_h_config)
+
+  # All generated tests run in a single test target with a 30-minute timeout.
+  test(
+    'pbmerge_CramTests',
+    pbbam_cram_script,
+    args : [
+      '--xunit-file=' + join_paths(meson.build_root(), 'pbbam-cram-pbmerge.xml'),
+      '--verbose'] + [
+        pbmerge_pacbio_ordering_t,
+        pbmerge_aligned_ordering_t,
+        pbmerge_mixed_ordering_t,
+        pbmerge_dataset_t,
+        pbmerge_fofn_t],
+    timeout : 1800)
+endif
+
+############
+# pbbamify #
+############
+
+# Substitution values used for PbBamifyVersion.h and, when tests are enabled,
+# for the generated pbbamify.t cram test.
+pbbam_PbBamifyVersion_h_config = configuration_data()
+pbbam_PbBamifyVersion_h_config.set('PbBamify_VERSION', meson.project_version())
+pbbam_PbBamifyVersion_h_config.set('PacBioBAM_VERSION', meson.project_version())
+pbbam_PbBamifyVersion_h_config.set('PacBioBAM_BinDir', meson.current_build_dir())
+pbbam_PbBamifyVersion_h_config.set('PacBioBAM_TestsDir', join_paths([meson.current_source_dir(), '../tests']))
+pbbam_PbBamifyVersion_h_config.set('GeneratedTestDataDir', join_paths(meson.current_build_dir()))
+pbbam_PbBamifyVersion_h_config.set('GeneratedDir', join_paths(meson.current_build_dir(), '../tests'))
+pbbam_PbBamifyVersion_h = configure_file(
+  input : files('pbbamify/src/PbBamifyVersion.h.in'),
+  output : 'PbBamifyVersion.h',
+  configuration : pbbam_PbBamifyVersion_h_config)
+
+# The generated header is listed with the sources of the executable.
+pbbam_pbbamify_cpp_sources = [pbbam_PbBamifyVersion_h]
+pbbam_pbbamify_cpp_sources += files([
+  'pbbamify/src/main.cpp',
+  'pbbamify/src/PbBamifySettings.cpp',
+  'pbbamify/src/PbBamifyWorkflow.cpp',
+  'pbbamify/src/QueryLookup.cpp'
+  ])
+
+# Linked with pbbam_lib; installed only when 'build-tools' is enabled.
+pbbam_pbbamify = executable(
+  'pbbamify',
+  pbbam_pbbamify_cpp_sources,
+  dependencies : [pbbam_boost_dep, pbbam_htslib_dep, pbbam_zlib_dep, pbbam_pbcopper_dep],
+  include_directories : [pbbam_include_directories, include_directories('pbbamify')],
+  link_with : pbbam_lib,
+  install : get_option('build-tools'),
+  cpp_args : pbbam_warning_flags)
+
+
+# tests
+if get_option('tests')
+  # samtools is mandatory for these tests; configuration fails without it.
+  pbbam_test_samtools = find_program('samtools', required : true)
+
+  pbbamify_t = configure_file(
+    input : pbbam_cram_pbbamify_t_in,
+    output : 'pbbamify.t',
+    configuration : pbbam_PbBamifyVersion_h_config)
+
+  # The located samtools path is exported to the test through the SAMTOOLS
+  # environment variable.
+  test(
+    'pbbamify_CramTests',
+    pbbam_cram_script,
+    args : [
+      '--xunit-file=' + join_paths(meson.build_root(), 'pbbam-cram-pbbamify.xml'),
+      '--verbose'] + [
+        pbbamify_t],
+    env : [
+      'SAMTOOLS=' + pbbam_test_samtools.path()],
+    timeout : 1800)
+endif
diff --git a/tools/pbbamify/src/PbBamifySettings.cpp b/tools/pbbamify/src/PbBamifySettings.cpp

new file mode 100644 (file)

index 0000000..bf628c6
--- /dev/null
+++ b/tools/pbbamify/src/PbBamifySettings.cpp
@@ -0,0 +1,120 @@
+// Author: Ivan Sovic
+
+#include "PbBamifySettings.h"
+
+#include <stdexcept>
+
+#include <pbcopper/data/CigarOperation.h>
+
+#include "PbBamifyVersion.h"
+
+namespace PacBio {
+namespace PbBamify {
+// Command-line option and positional-argument definitions for pbbamify,
+// expressed as JSON snippets for the CLI_v2 interface.
+namespace Options {
+
+// clang-format off
+// Aligned input BAM; an empty value is later mapped to "-" by Settings.
+const CLI_v2::Option InputFile{
+R"({
+    "names" : ["input"],
+    "description" : "The aligned non-PacBio BAM file. If not provided, stdin will be used as input.",
+    "type" : "file",
+    "default" : ""
+})"};
+
+// Output BAM; an empty value is later mapped to "-" by Settings.
+const CLI_v2::Option OutputFile{
+R"({
+    "names" : ["output"],
+    "description" : "Path to the output BAM file. If not specified, output will be to the stdout.",
+    "type" : "file",
+    "default" : ""
+})"};
+
+// Verbosity level. The per-level status-message intervals listed here are
+// kept in sync with the intervals selected in Workflow::AugmentAlignments.
+const CLI_v2::Option VerboseLevel{
+R"({
+    "names" : ["verbose-level"],
+    "description" : [
+        "Specifies the level of info which will be output produced on stderr. ",
+        "0 turns all output off, 1 outputs only warnings, while levels 2 and ",
+        "above outputs a status message every 1000000 (2), 100000 (3), 10000 (4), ",
+        "1000 (5), 100 (6), 10 (7) and 1 (8 and above) reads."
+    ],
+    "type" : "int",
+    "default" : 3
+})"};
+
+// First positional argument: the reference FASTA.
+const CLI_v2::PositionalArgument ReferenceFile{
+R"({
+    "name" : "ref.fa",
+    "description" : "Reference used to align the input.",
+    "type" : "file"
+})"};
+
+// Second positional argument: the original PacBio reads.
+const CLI_v2::PositionalArgument PbbamReadFile{
+R"({
+    "name" : "IN.bam",
+    "description" : "Input file(s). Maybe one of: DataSetXML, BAM file(s), or FOFN",
+    "type" : "file"
+})"};
+
+// clang-format on
+
+}  // namespace Options
+
+CLI_v2::Interface Settings::CreateCLI()
+{
+    // clang-format off
+    const std::string description{
+        "pbbamify converts an arbitray aligned BAM file to a PacBio-compatible BAM file."
+        "Input BAM file is read from a file or stdin, the raw-reads PacBio BAM is given"
+        "as a parameter, and BAM output is written to stdout."
+    };
+
+    CLI_v2::Interface interface{"pbbamify", description, PbBamify::Version};
+    interface.DisableNumThreadsOption();
+
+    interface.AddOptions({
+        Options::InputFile,
+        Options::OutputFile,
+        Options::VerboseLevel
+    });
+    interface.AddPositionalArguments({
+        Options::ReferenceFile,
+        Options::PbbamReadFile
+    });
+
+    Logging::LogConfig logConfig{Logging::LogLevel::INFO};
+    logConfig.Fields = Logging::LogField::TIMESTAMP | Logging::LogField::LOG_LEVEL;
+    interface.LogConfig(logConfig);
+    interface.DefaultLogLevel(Logging::LogLevel::INFO); // fixme
+    // clang-format on
+
+    return interface;
+}
+
+Settings::Settings(const CLI_v2::Results& args)
+    : InputFilename(args[Options::InputFile])
+    , OutputFilename(args[Options::OutputFile])
+    , VerboseLevel{args[Options::VerboseLevel]}
+{
+    // Reference & unaligned PacBio BAM files
+    const auto& posArgs = args.PositionalArguments();
+    if (posArgs.size() != 2)
+        throw std::runtime_error{"exactly two positional arguments must be provided"};
+    ReferenceFilename = posArgs[0];
+    PbbamFilename = posArgs[1];
+
+    // Input non-PacBio BAM
+    if (InputFilename.empty()) InputFilename = "-";
+
+    // Output aligned PacBio BAM
+    if (OutputFilename.empty()) OutputFilename = "-";
+
+    // Verbosity
+    if (VerboseLevel < 0) VerboseLevel = 0;
+
+    // Allow 'M' tags
+    Data::CigarOperation::DisableAutoValidation();
+}
+
+}  // namespace PbBamify
+}  // namespace PacBio
diff --git a/tools/pbbamify/src/PbBamifySettings.h b/tools/pbbamify/src/PbBamifySettings.h

new file mode 100644 (file)

index 0000000..6ad1ea0
--- /dev/null
+++ b/tools/pbbamify/src/PbBamifySettings.h
@@ -0,0 +1,32 @@
+// Author: Ivan Sovic
+
+#ifndef PBBAMIFY_SETTINGS_H
+#define PBBAMIFY_SETTINGS_H
+
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include <pbcopper/cli2/CLI.h>
+
+namespace PacBio {
+namespace PbBamify {
+
+///
+/// \brief Run-time settings of the pbbamify tool, built from the parsed
+///        command line.
+///
+struct Settings
+{
+    /// \returns the command-line interface definition for pbbamify.
+    static CLI_v2::Interface CreateCLI();
+
+    /// \brief Extracts the settings from parsed command-line results.
+    explicit Settings(const CLI_v2::Results& args);
+
+    /// Aligned non-PacBio input BAM (value of the "input" option).
+    std::string InputFilename;
+    /// Output BAM (value of the "output" option).
+    std::string OutputFilename;
+    /// Reference file (first positional argument).
+    std::string ReferenceFilename;
+    /// PacBio reads: DataSetXML, BAM or FOFN (second positional argument).
+    std::string PbbamFilename;
+    /// NOTE(review): not written by the constructor in PbBamifySettings.cpp;
+    /// confirm whether anything still uses this member.
+    std::vector<std::string> Errors;
+    /// Verbosity level (value of the "verbose-level" option).
+    int32_t VerboseLevel;
+};
+
+}  // namespace PbBamify
+}  // namespace PacBio
+
+#endif  // PBBAMIFY_SETTINGS_H
diff --git a/tools/pbbamify/src/PbBamifyVersion.h.in b/tools/pbbamify/src/PbBamifyVersion.h.in

new file mode 100644 (file)

index 0000000..3c53f7e
--- /dev/null
+++ b/tools/pbbamify/src/PbBamifyVersion.h.in
@@ -0,0 +1,16 @@
+// Author: Ivan Sovic
+
+#ifndef PBBAMIFY_VERSION_H
+#define PBBAMIFY_VERSION_H
+
+#include <string>
+
+namespace PacBio {
+namespace PbBamify {
+
+/// Version string of the pbbamify tool; the PbBamify_VERSION placeholder below
+/// is substituted when this template is configured by the build.
+const std::string Version = std::string("@PbBamify_VERSION@");
+
+} // namespace PbBamify
+} // namespace PacBio
+
+#endif // PBBAMIFY_VERSION_H
diff --git a/tools/pbbamify/src/PbBamifyWorkflow.cpp b/tools/pbbamify/src/PbBamifyWorkflow.cpp

new file mode 100644 (file)

index 0000000..71c0c51
--- /dev/null
+++ b/tools/pbbamify/src/PbBamifyWorkflow.cpp
@@ -0,0 +1,404 @@
+// Author: Ivan Sovic
+
+#include "PbBamifyWorkflow.h"
+
+#include <cstdint>
+#include <ctime>
+
+#include <istream>
+#include <ostream>
+#include <string>
+
+#include <pbbam/BamReader.h>
+#include <pbbam/BamRecord.h>
+#include <pbbam/BamWriter.h>
+#include <pbbam/DataSet.h>
+#include <pbbam/FastaReader.h>
+#include <pbbam/IndexedFastaReader.h>
+#include <pbbam/MD5.h>
+#include <pbbam/PbiFilter.h>
+#include <pbbam/PbiFilterQuery.h>
+#include <pbbam/PbiFilterTypes.h>
+
+#include <pbcopper/data/Cigar.h>
+#include <pbcopper/logging/Logging.h>
+#include <pbcopper/utility/SequenceUtils.h>
+
+#include "PbBamifySettings.h"
+#include "PbBamifyVersion.h"
+
+namespace PacBio {
+namespace PbBamify {
+
+int Workflow::Runner(const CLI_v2::Results& args)
+{
+    const Settings settings{args};
+
+    // setup our @PG entry to add to header
+    BAM::ProgramInfo pbbamifyProgram;
+    pbbamifyProgram.Id(std::string{"pbbamify-"} + PbBamify::Version)
+        .Name("pbbamify")
+        .Version(PbBamify::Version);
+
+    BAM::DataSet dataset{settings.PbbamFilename};
+    BAM::BamReader inputBamReader{settings.InputFilename};
+    BAM::BamHeader newHeader;
+
+    {  // A separate block to close the reference file after the header is formed.
+        // Using a sequential reader to construct the header SN lines in order, fast.
+        BAM::FastaReader ref_reader{settings.ReferenceFilename};
+        newHeader = ComposeHeader(dataset, ref_reader, inputBamReader);
+    }
+
+    auto queryLookup = std::make_shared<QueryLookup>(std::move(dataset));
+    queryLookup->Load();
+
+    {  // A block is used here to close the bamWriter and the reference reader.
+        // (Even though this will be done as soon as the 'try' block ends, this safeguards if any
+        // code should be added in between at some point.)
+        BAM::IndexedFastaReader indexedRefReader{settings.ReferenceFilename};
+        BAM::BamWriter bamWriter{settings.OutputFilename, newHeader};
+        const bool result = AugmentAlignments(queryLookup, indexedRefReader, inputBamReader,
+                                              bamWriter, settings.VerboseLevel);
+        if (result == false) return EXIT_FAILURE;
+    }
+
+    return EXIT_SUCCESS;
+}
+
+BAM::BamHeader Workflow::ComposeHeader(const BAM::DataSet& dataset, BAM::FastaReader& refReader,
+                                       const BAM::BamReader& input)
+{
+
+    BAM::BamHeader retHeader;
+    bool headerInitialized = false;
+
+    // Merge all the read groups and additional PacBio info.
+    const auto& bamFiles = dataset.BamFiles();
+    for (auto& bamFile : bamFiles) {
+
+        auto header = bamFile.Header();
+        if (!headerInitialized) {
+            retHeader = header.DeepCopy();
+            headerInitialized = true;
+        } else
+            retHeader += header;
+    }
+
+    // Merge the alignment PG to the header.
+    auto inputHeader = input.Header();
+    for (auto& program : inputHeader.Programs())
+        retHeader.AddProgram(program);
+
+    // Add the sequence info to the header.
+    BAM::FastaSequence record;
+    while (refReader.GetNext(record)) {
+
+        // Convert the sequence length to string,
+        // as required by SequenceInfo.
+        std::ostringstream ossLength;
+        ossLength << record.Bases().size();
+
+        // Clip on whitespace.
+        std::istringstream issHeader{record.Name()};
+        std::string header;
+        issHeader >> header;
+
+        // Calculate the MD5 and append to retHeader.
+        BAM::SequenceInfo seq{header, ossLength.str()};
+        auto hash = BAM::MD5Hash(record.Bases());
+        seq.Checksum(hash);
+        retHeader.AddSequence(seq);
+    }
+
+    return retHeader;
+}
+
+// Returns true if the first or the last operation of `cigarData` is a hard
+// clip; an empty CIGAR is reported as not hard-clipped.
+bool Workflow::IsHardClipped(const Data::Cigar& cigarData)
+{
+    // Nothing to inspect in an empty CIGAR.
+    if (cigarData.size() == 0) return false;
+
+    // Only the two ends of the CIGAR are inspected.
+    return cigarData.front().Type() == Data::CigarOperationType::HARD_CLIP ||
+           cigarData.back().Type() == Data::CigarOperationType::HARD_CLIP;
+}
+
+Data::Cigar Workflow::ConvertHardToSoftClipping(const Data::Cigar& cigarData)
+{
+    Data::Cigar softCigar;
+
+    // If it's empty, just return.
+    if (cigarData.size() == 0) return softCigar;
+
+    Data::CigarOperationType prevOp = Data::CigarOperationType::UNKNOWN_OP;
+
+    for (const auto& cigar : cigarData) {
+
+        // Change H to S.
+        Data::CigarOperationType op = (cigar.Type() == Data::CigarOperationType::HARD_CLIP)
+                                          ? Data::CigarOperationType::SOFT_CLIP
+                                          : cigar.Type();
+        auto len = cigar.Length();
+
+        // Merge or add.
+        if (softCigar.size() > 0 && op == prevOp) {
+            auto prevLen = softCigar.back().Length();
+            softCigar.back() = Data::CigarOperation{op, len + prevLen};
+        } else
+            softCigar.emplace_back(Data::CigarOperation{op, len});
+
+        prevOp = op;
+    }
+
+    return softCigar;
+}
+
+size_t Workflow::SequenceLengthFromCigar(const Data::Cigar& cigarData)
+{
+    size_t len = 0;
+
+    if (cigarData.size() == 0) return len;
+
+    for (const auto& cigar : cigarData) {
+        if (Data::ConsumesQuery(cigar.Type()) ||
+            cigar.Type() == Data::CigarOperationType::HARD_CLIP) {
+            len += cigar.Length();
+        }
+    }
+
+    return len;
+}
+
+// Returns true if `cigarData` contains at least one ALIGNMENT_MATCH ('M')
+// operation, i.e. the CIGAR is in the "basic" rather than the extended form.
+bool Workflow::CheckIsCigarBasic(const Data::Cigar& cigarData)
+{
+    bool hasAlignmentMatch = false;
+    for (const auto& op : cigarData) {
+        if (op.Type() == Data::CigarOperationType::ALIGNMENT_MATCH) {
+            hasAlignmentMatch = true;
+            break;
+        }
+    }
+    return hasAlignmentMatch;
+}
+
+/*
+ * Rewrites a "basic" CIGAR into its extended form: every ALIGNMENT_MATCH
+ * operation is split into runs of SEQUENCE_MATCH / SEQUENCE_MISMATCH by
+ * comparing the record's sequence with the reference subsequence it is
+ * aligned to. All other operations are copied unchanged; zero-length
+ * operations are dropped.
+ *
+ * Takes the pre-calculated cigarData object so that it's
+ * more efficient (it could always be obtained from the record
+ * at any time).
+ *
+ * NOTE(review): qseq/rseq are indexed without bounds checks; this assumes the
+ * CIGAR is consistent with both sequences - confirm against callers.
+*/
+Data::Cigar Workflow::BasicToExtendedCigar(const BAM::IndexedFastaReader& indexedRefReader,
+                                           const BAM::BamRecord& record,
+                                           const Data::Cigar& cigarData)
+{
+    Data::Cigar extCigar;
+
+    // Query bases as stored in the record, and the reference bases fetched for
+    // this record.
+    std::string qseq{record.Impl().Sequence()};
+    std::string rseq =
+        indexedRefReader.ReferenceSubsequence(record, BAM::Orientation::GENOMIC, false, false);
+
+    size_t qpos = 0, rpos = 0;  // The rpos should be 0 because the reference portion is yanked out.
+    for (const auto& cigar : cigarData) {
+
+        // This shouldn't happen, but let's keep it safe.
+        if (cigar.Length() == 0) continue;
+
+        if (cigar.Type() == Data::CigarOperationType::ALIGNMENT_MATCH) {
+            // Decode the prev op.
+            // Seeded from the first base pair of this block, so the first loop
+            // iteration always extends the run (prevCount goes from 0 to 1).
+            Data::CigarOperationType prevOp = (qseq[qpos] == rseq[rpos])
+                                                  ? Data::CigarOperationType::SEQUENCE_MATCH
+                                                  : Data::CigarOperationType::SEQUENCE_MISMATCH;
+            uint32_t prevCount = 0;
+            for (size_t i = 0; i < cigar.Length(); ++i) {
+
+                // Decode the new op.
+                Data::CigarOperationType op = (qseq[qpos + i] == rseq[rpos + i])
+                                                  ? Data::CigarOperationType::SEQUENCE_MATCH
+                                                  : Data::CigarOperationType::SEQUENCE_MISMATCH;
+
+                if (op == prevOp) {
+                    ++prevCount;
+                } else {
+                    // The run ended: emit it and start a new run of length 1.
+                    extCigar.emplace_back(Data::CigarOperation{prevOp, prevCount});
+                    prevOp = op;
+                    prevCount = 1;
+                }
+            }
+
+            // Add the last operation.
+            extCigar.emplace_back(Data::CigarOperation{prevOp, prevCount});
+        } else
+            extCigar.emplace_back(cigar);
+
+        // Advance the positions by the original operation, not the split runs.
+        if (Data::ConsumesQuery(cigar.Type())) qpos += cigar.Length();
+        if (Data::ConsumesReference(cigar.Type())) rpos += cigar.Length();
+    }
+
+    return extCigar;
+}
+
+bool Workflow::AugmentAlignments(const std::shared_ptr<QueryLookup> queryLookup,
+                                 const BAM::IndexedFastaReader& indexedRefReader,
+                                 BAM::BamReader& input, BAM::BamWriter& writer,
+                                 int32_t verboseLevel)
+{
+
+    // Clock is just for the verbose functionality.
+    clock_t timerStart = clock();
+
+    // Sets the frequency of the proof of life when
+    // processing larger input BAMs.
+    int32_t verboseFrequency =
+        (verboseLevel <= 2)
+            ? 1000000
+            : (verboseLevel == 3)
+                  ? 100000
+                  : (verboseLevel == 4)
+                        ? 10000
+                        : (verboseLevel == 5)
+                              ? 1000
+                              : (verboseLevel == 6) ? 100 : (verboseLevel == 7) ? 10 : 1;
+
+    // Counters for verbose output.
+    size_t numRecords = 0, numWithoutSeq = 0;
+
+    // Holder for the current record.
+    BAM::BamRecord record;
+    while (input.GetNext(record)) {
+        ++numRecords;
+
+        // Proof of life.
+        if (verboseLevel > 1 && (numRecords % verboseFrequency) == 0) {
+            double elapsedTime =
+                static_cast<double>(clock() - timerStart) / (60.0 * CLOCKS_PER_SEC);
+            elapsedTime = static_cast<int64_t>(elapsedTime * 100.0) / 100.0;
+            PBLOG_INFO << "Processed " << numRecords << " alignments in " << elapsedTime << " min.";
+        }
+
+        // Some mappers do not output sequences for secondary alignments.
+        if (record.Impl().SequenceLength() == 0) {
+            ++numWithoutSeq;
+            continue;
+        }
+
+        // Update the BAM record with additional data from the PacBio dataset.
+        // In case of failure, skip the alignment. Failures should be reported by AugmentAlignment.
+        const bool rv = AugmentAlignment(queryLookup, indexedRefReader, record, verboseLevel);
+        if (rv == false) continue;
+
+        // Finally, write the output.
+        writer.Write(record);
+    }
+
+    if (verboseLevel > 0 && numWithoutSeq) {
+        PBLOG_WARN << "Found " << numWithoutSeq
+                   << " alignments without a seq field which were not converted (most likely "
+                      "secondary alignments).";
+    }
+
+    if (verboseLevel > 1) {
+        double elapsedTime = static_cast<double>(clock() - timerStart) / (60.0 * CLOCKS_PER_SEC);
+        elapsedTime = static_cast<int64_t>(elapsedTime * 100.0) / 100.0;
+        PBLOG_INFO << "Done processing " << numRecords << " alignments in " << elapsedTime
+                   << " min.";
+    }
+
+    return true;
+}
+
+// Augments one aligned `record` in place with the data of the matching record
+// from the PacBio dataset (looked up by full name through `queryLookup`).
+//
+// Returns false - after a warning if verboseLevel > 0 - when no dataset record
+// is found or when the sequence length implied by the CIGAR differs from the
+// length of the dataset sequence. Returns true otherwise.
+bool Workflow::AugmentAlignment(const std::shared_ptr<QueryLookup> queryLookup,
+                                const BAM::IndexedFastaReader& indexedRefReader,
+                                BAM::BamRecord& record, int32_t verboseLevel)
+{
+
+    // Find the BAM record in the original PacBio dataset.
+    BAM::BamRecord datasetRecord;
+    const bool isFound = queryLookup->Find(record.FullName(), datasetRecord);
+    if (!isFound) {
+        if (verboseLevel > 0)
+            PBLOG_WARN << "No records found for query '" << record.FullName() << "'. Skipping.";
+        return false;
+    }
+
+    // If it's not mapped, just output the original.
+    if (!record.IsMapped()) {
+        record = datasetRecord;
+        return true;
+    }
+
+    // Keep the cigar object since we'll reuse it. More efficient.
+    auto cigar = record.Impl().CigarData();
+
+    // Sanity check that the mapper did not produce something funky.
+    // The CIGAR-derived length includes hard clips, so it is compared with
+    // the full dataset sequence length.
+    const size_t recordSeqLen = SequenceLengthFromCigar(cigar);
+    if (recordSeqLen != datasetRecord.Impl().SequenceLength()) {
+        if (verboseLevel > 0) {
+            PBLOG_WARN << "Sequence '" << record.FullName() << "' (length " << recordSeqLen
+                       << ") is not of the same length as the PacBio BAM sequence (length "
+                       << datasetRecord.Impl().SequenceLength() << ")! Skipping.";
+        }
+        return false;
+    }
+
+    // Update the CIGAR only if necessary.
+    // This runs before the sequence replacement below, so the comparison uses
+    // the sequence delivered by the mapper.
+    if (CheckIsCigarBasic(cigar)) {
+        cigar = BasicToExtendedCigar(indexedRefReader, record, cigar);
+        record.Impl().CigarData(cigar);
+    }
+
+    // Stomp over any existing tags with matching IDs and add those
+    // which do not yet exist in the aligned BAM. We consider the PacBio
+    // dataset to be the correct answer to any of these. The rest are
+    // produced by a mapper.
+    // For example, BLASR will generate a RG tag even if the input was FASTA.
+    for (auto& tag : datasetRecord.Impl().Tags()) {
+        if (record.Impl().Tags().Contains(tag.first))
+            record.Impl().EditTag(tag.first, tag.second);
+        else
+            record.Impl().AddTag(tag.first, tag.second);
+    }
+
+    // Some downstream tools might not work well with the
+    // "undefined" mapping quality value of 255. Here
+    // we set it to a valid arbitrary value.
+    if (record.Impl().MapQuality() == 255) record.Impl().MapQuality(254);
+
+    // If the alignment has hard clipping, simply take both the seq and
+    // qual fields from the dataset. This will stomp over any custom
+    // qual values in the input BAM file.
+    if (IsHardClipped(cigar)) {
+        // Take the seq and qual fields from the dataset to override
+        // any hard clippings induced by the mapper.
+        std::string qseq{datasetRecord.Impl().Sequence()};
+        std::string quals{datasetRecord.Impl().Qualities().Fastq()};
+
+        // Reverse if needed.
+        // The dataset sequence is brought into the orientation of the aligned
+        // record.
+        if (record.Impl().IsReverseStrand()) {
+            Utility::ReverseComplement(qseq);
+            std::reverse(quals.begin(), quals.end());
+        }
+
+        // PacBio datasets, when converted to SAM, contain '!' ASCII QVs.
+        // In case QVs aren't provided otherwise, this block adds the '!' values.
+        if (quals.size() == 0) quals = std::string(qseq.size(), '!');
+
+        // Replace the seq, qual, & cigar fields.
+        // The CIGAR has to switch from hard to soft clips because the record
+        // now contains the full sequence.
+        record.Impl().SetSequenceAndQualities(qseq, quals);
+        cigar = ConvertHardToSoftClipping(cigar);
+        record.Impl().CigarData(cigar);
+
+    } else {
+        // PacBio datasets, when converted to SAM, contain '!' ASCII QVs.
+        // In case QVs aren't provided otherwise, this block adds the '!' values.
+        if (record.Impl().Qualities().size() == 0) {
+            std::string qseq{record.Impl().Sequence()};
+            std::string quals(qseq.size(), '!');
+            record.Impl().SetSequenceAndQualities(qseq, quals);
+        }
+    }
+
+    return true;
+}
+
+}  // namespace PbBamify
+}  // namespace PacBio
diff --git a/tools/pbbamify/src/PbBamifyWorkflow.h b/tools/pbbamify/src/PbBamifyWorkflow.h

new file mode 100644 (file)

index 0000000..6b75f81
--- /dev/null
+++ b/tools/pbbamify/src/PbBamifyWorkflow.h
@@ -0,0 +1,125 @@
+// Author: Ivan Sovic
+
+#ifndef PBBAMIFY_WORKFLOW_H
+#define PBBAMIFY_WORKFLOW_H
+
+#include <pbcopper/cli2/Results.h>
+
+#include "QueryLookup.h"
+
+namespace PacBio {
+namespace BAM {
+
+class BamHeader;
+class BamReader;
+class BamRecord;
+class BamWriter;
+class DataSet;
+class FastaReader;
+class IndexedFastaReader;
+
+}  // namespace BAM
+
+namespace Data {
+
+class Cigar;
+
+}  // namespace Data
+
+namespace PbBamify {
+
+///
+/// \brief Groups the static functions that make up the pbbamify workflow.
+///        Runner is the entry point handed to the command-line driver.
+///
+struct Workflow
+{
+    ///
+    /// \brief Takes a PacBio dataset, a reference file and an input arbitrary
+    ///        aligned BAM. Produces a new PacBio-compatible aligned BAM.
+    ///
+    /// \throws std::runtime_error
+    ///
+    static int Runner(const CLI_v2::Results& args);
+
+    ///
+    /// \brief Merges all the headers from the dataset and the input, adds the
+    ///        SQ fields with lengths and MD5 checksums.
+    ///
+    /// \returns A BAM header which is composed of: merged headers from BAMs in
+    ///          the dataset, ProgramInfo from the input BAM, and SQ lines
+    ///          formed from the refReader object (together with their length and
+    ///          MD5 checksum).
+    ///
+    static BAM::BamHeader ComposeHeader(const BAM::DataSet& dataset, BAM::FastaReader& refReader,
+                                        const BAM::BamReader& input);
+
+    ///
+    /// \brief Converts a set of generic BAM records into a PacBio compatible
+    ///        BAM by calling AugmentAlignment for each BAM record in the input
+    ///        BAM file. If a BAM record was not mapped, then the original record
+    ///        from the dataset will be set to `record`.
+    ///
+    /// \returns true if the record was successfully augmented, false otherwise.
+    ///
+    static bool AugmentAlignments(const std::shared_ptr<QueryLookup> queryLookup,
+                                  const BAM::IndexedFastaReader& indexedRefReader,
+                                  BAM::BamReader& input, BAM::BamWriter& writer,
+                                  int32_t verboseLevel);
+
+    ///
+    /// \brief Converts a generic BAM record into a PacBio compatible BAM by:
+    ///        adding tags from the PacBio dataset, replacing the read group,
+    ///        clipping the tags if needed, converting the CIGAR from basic to
+    ///        extended format if needed, changing the mapq from 255 to another
+    ///        value to avoid potential downstream issues, etc.
+    ///
+    /// \returns true if the record was successfully augmented, false otherwise.
+    ///
+    static bool AugmentAlignment(const std::shared_ptr<QueryLookup> queryLookup,
+                                 const BAM::IndexedFastaReader& indexedRefReader,
+                                 BAM::BamRecord& record, int32_t verboseLevel);
+
+    ///
+    /// \brief Checks whether the alignment was hard clipped.
+    ///
+    /// \returns true if the front or back CIGAR op is 'H', false otherwise.
+    ///
+    static bool IsHardClipped(const Data::Cigar& cigarData);
+
+    ///
+    /// \brief If the CIGAR string contains hard clipping operation at the beginning
+    ///        or end of the cigarData vector, these are turned to soft clips and
+    ///        merged with any potential existing soft clipping operations.
+    ///
+    /// \returns a new CIGAR string with only soft clipped bases.
+    ///
+    static Data::Cigar ConvertHardToSoftClipping(const Data::Cigar& cigarData);
+
+    ///
+    /// \brief Calculates the total sequence length from CIGAR (including clipping),
+    ///        and not just the aligned length. This is used for sanity checking
+    ///        the input BAM records.
+    ///
+    /// \returns The length of the query sequence calculated from the CIGAR string.
+    ///
+    static size_t SequenceLengthFromCigar(const Data::Cigar& cigarData);
+
+    ///
+    /// \brief Linear pass over the Cigar operations to see if there are any 'M' ops.
+    ///
+    /// \returns true if there are 'M' operations in the CIGAR object.
+    ///
+    static bool CheckIsCigarBasic(const Data::Cigar& cigarData);
+
+    ///
+    /// \brief Takes the index and a BAM record, and creates a new Cigar object
+    ///        with extended CIGAR operations ('=' and 'X' instead of 'M').
+    ///
+    /// \returns A new Cigar object with '=' and 'X' operations instead of 'M's.
+    ///
+    static Data::Cigar BasicToExtendedCigar(const BAM::IndexedFastaReader& indexedRefReader,
+                                            const BAM::BamRecord& record,
+                                            const Data::Cigar& cigarData);
+};
+
+}  // namespace PbBamify
+}  // namespace PacBio
+
+#endif  // PBBAMIFY_WORKFLOW_H
diff --git a/tools/pbbamify/src/QueryLookup.cpp b/tools/pbbamify/src/QueryLookup.cpp

new file mode 100644 (file)

index 0000000..ba6e559
--- /dev/null
+++ b/tools/pbbamify/src/QueryLookup.cpp
@@ -0,0 +1,104 @@
+// Author: Ivan Sovic
+
+#include "QueryLookup.h"
+
+#include <algorithm>
+#include <cctype>
+#include <iostream>
+#include <memory>
+#include <ostream>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include <pbbam/BamFile.h>
+#include <pbbam/BamHeader.h>
+#include <pbbam/PbiRawData.h>
+#include <pbbam/ReadGroupInfo.h>
+
+namespace PacBio {
+namespace PbBamify {
+
+// Stores the dataset by moving it into dataset_. The constructor body is
+// empty: readers_ and lookup_ are only populated once Load() is called.
+QueryLookup::QueryLookup(BAM::DataSet dataset) : dataset_{std::move(dataset)} {}
+
+// Opens a reader for every BAM file in the dataset and indexes every read
+// listed in the dataset's PBI data by its reconstructed qname:
+//   subread: <movie>/<zmw>/<qStart>_<qEnd>
+//   ccs:     <movie>/<zmw>/ccs
+// Calling Load() again rebuilds readers_ and lookup_ from scratch.
+//
+// Throws std::runtime_error if a read group type is neither "subread" nor
+// "ccs" (compared case-insensitively), or if two reads produce the same qname.
+void QueryLookup::Load()
+{
+    std::vector<BAM::BamFile> bamFiles{dataset_.BamFiles()};
+
+    // Merge all the read groups for a unified read group lookup.
+    BAM::BamHeader jointHeader;
+    bool headerInitialized = false;
+    for (auto& bamFile : bamFiles) {
+        auto header = bamFile.Header();
+        if (!headerInitialized) {
+            jointHeader = header.DeepCopy();
+            headerInitialized = true;
+        } else {
+            jointHeader += header;
+        }
+    }
+
+    // Set-up a vector of readers for each BAM in the PacBio dataset
+    // to allow for random access.
+    readers_.clear();
+    readers_.reserve(bamFiles.size());
+    for (auto& file : bamFiles) {
+        readers_.push_back(std::make_shared<BAM::BamReader>(file));
+    }
+
+    // Get the PacBio index.
+    const BAM::PbiRawData pbi{dataset_};
+    const auto& basicData = pbi.BasicData();
+
+    // Clear everything just in case the user called Load() twice.
+    lookup_.clear();
+    lookup_.reserve(pbi.NumReads());
+
+    // Process each read in the dataset and reconstruct its original
+    // qname. Place the read in the lookup, together with the ID
+    // of the source BAM file and the virtual file offset where
+    // the read is located.
+    std::ostringstream out;
+    for (size_t i = 0; i < pbi.NumReads(); ++i) {
+        const auto zmw = basicData.holeNumber_.at(i);
+        const auto qStart = basicData.qStart_.at(i);
+        const auto qEnd = basicData.qEnd_.at(i);
+
+        const auto rgId = basicData.rgId_.at(i);
+        const auto rgString = BAM::ReadGroupInfo::IntToId(rgId);
+        const auto rgInfo = jointHeader.ReadGroup(rgString);
+        const auto movieName = rgInfo.MovieName();
+        std::string type{rgInfo.ReadType()};
+
+        // std::tolower has undefined behavior for negative char values, so
+        // each character is routed through unsigned char first.
+        std::transform(type.begin(), type.end(), type.begin(), [](const unsigned char c) {
+            return static_cast<char>(std::tolower(c));
+        });
+
+        out.str("");
+        if (type == "subread")
+            out << movieName << '/' << zmw << '/' << qStart << '_' << qEnd;
+        else if (type == "ccs")
+            out << movieName << '/' << zmw << '/' << "ccs";
+        else {
+            out << "Unknown read group type '" << type << "'.";
+            throw std::runtime_error(out.str());
+        }
+
+        const auto qName = out.str();
+        const auto fileNumber = basicData.fileNumber_.at(i);
+        const auto fileOffset = basicData.fileOffset_.at(i);
+
+        // Sanity check. emplace() reports an already-present key through
+        // .second, so the duplicate test and the insertion share one lookup.
+        const auto inserted = lookup_.emplace(qName, QueryLocation{fileNumber, fileOffset});
+        if (!inserted.second) {
+            const std::string message = std::string{"More than 1 occurrence of qname '"} + qName +
+                                        std::string{"'. Duplicate reads in the dataset?"};
+            throw std::runtime_error(message);
+        }
+    }
+}
+
+// Looks qName up in lookup_. On a hit, seeks the reader selected by the stored
+// file number to the stored virtual offset and reads the next record into
+// `record`. Returns false when the name is unknown or no record could be read.
+bool QueryLookup::Find(const std::string& qName, BAM::BamRecord& record) const
+{
+    const auto hit = lookup_.find(qName);
+    if (hit == lookup_.end()) {
+        return false;
+    }
+
+    const QueryLocation& where = hit->second;
+    const auto& reader = readers_.at(where.fileNumber);
+    reader->VirtualSeek(where.fileOffset);
+    return reader->GetNext(record);
+}
+
+}  // namespace PbBamify
+}  // namespace PacBio
+\ No newline at end of file
diff --git a/tools/pbbamify/src/QueryLookup.h b/tools/pbbamify/src/QueryLookup.h

new file mode 100644 (file)

index 0000000..6efe820
--- /dev/null
+++ b/tools/pbbamify/src/QueryLookup.h
@@ -0,0 +1,66 @@
+// Author: Ivan Sovic
+
+#ifndef PBBAMIFY_QUERYLOOKUP_H
+#define PBBAMIFY_QUERYLOOKUP_H
+
+#include <cstdint>
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include <pbbam/BamReader.h>
+#include <pbbam/DataSet.h>
+
+namespace PacBio {
+namespace PbBamify {
+
+///
+/// \brief A simple container to hold the location of a read.
+///
+struct QueryLocation
+{
+    // Index of the reader (in QueryLookup's reader list) that holds the read.
+    uint16_t fileNumber = 0;
+    // Virtual file offset of the record; passed to BamReader::VirtualSeek().
+    int64_t fileOffset = 0;
+};
+
+///
+/// \brief QueryLookup parses all reads from PacBio indexes and creates a
+///        hash lookup where the key is the read's qname, and the value is a
+///        QueryLocation object pointing to the exact location of the read. The BAM
+///        record can then be loaded by setting the virtual offset and calling GetNext().
+///
+class QueryLookup
+{
+public:
+    /// Stores the dataset; nothing is indexed until Load() is called.
+    explicit QueryLookup(BAM::DataSet dataset);
+    // Non-copyable.
+    QueryLookup(const QueryLookup&) = delete;
+    QueryLookup& operator=(const QueryLookup&) = delete;
+
+    ///
+    /// \brief  Load() performs the work of setting up the BamReaders and constructing
+    ///         the hash table lookup.
+    ///
+    /// \throws std::runtime_error if there are more than 1 record for a given qname,
+    ///         or if a read group's type is neither "subread" nor "ccs".
+    ///
+    void Load();
+
+    ///
+    /// \brief Find(...) attempts to find a given qName in the lookup and return
+    ///        the related BAM record. If it cannot be found, the function returns false.
+    ///
+    /// \returns true if the record was found and loaded, false otherwise.
+    ///
+    bool Find(const std::string& qName, BAM::BamRecord& record) const;
+
+private:
+    // Dataset whose BAM files and PBI data are indexed by Load().
+    BAM::DataSet dataset_;
+    // One reader per BAM file in the dataset; indexed by QueryLocation::fileNumber.
+    std::vector<std::shared_ptr<BAM::BamReader>> readers_;
+    // Maps a read's qname to the file/offset where its record is stored.
+    std::unordered_map<std::string, QueryLocation> lookup_;
+};
+
+}  // namespace PbBamify
+}  // namespace PacBio
+
+#endif  // PBBAMIFY_QUERYLOOKUP_H
diff --git a/tools/pbbamify/src/main.cpp b/tools/pbbamify/src/main.cpp

new file mode 100644 (file)

index 0000000..5c0e034
--- /dev/null
+++ b/tools/pbbamify/src/main.cpp
@@ -0,0 +1,21 @@
+// Author: Ivan Sovic
+
+#include <cstdlib>
+#include <iostream>
+#include <stdexcept>
+
+#include <pbcopper/cli2/CLI.h>
+
+#include "PbBamifySettings.h"
+#include "PbBamifyWorkflow.h"
+
+// pbbamify entry point: passes the CLI definition and Workflow::Runner to
+// CLI_v2::Run and returns its result. A std::exception escaping that call is
+// reported on stderr and turned into EXIT_FAILURE.
+int main(int argc, char* argv[])
+{
+    try {
+        return PacBio::CLI_v2::Run(argc, argv, PacBio::PbBamify::Settings::CreateCLI(),
+                                   &PacBio::PbBamify::Workflow::Runner);
+    } catch (const std::exception& e) {
+        std::cerr << "pbbamify ERROR: " << e.what() << '\n';
+        return EXIT_FAILURE;
+    }
+}
diff --git a/tools/pbindex/src/PbIndexSettings.cpp b/tools/pbindex/src/PbIndexSettings.cpp

new file mode 100644 (file)

index 0000000..b85f352
--- /dev/null
+++ b/tools/pbindex/src/PbIndexSettings.cpp
@@ -0,0 +1,49 @@
+// Author: Derek Barnett
+
+#include "PbIndexSettings.h"
+
+#include <stdexcept>
+
+#include "PbIndexVersion.h"
+
+namespace PacBio {
+namespace PbIndex {
+namespace Options {
+
+// clang-format off
+// Positional argument naming the BAM file to index; read back in the Settings
+// constructor via args[Options::InputFile].
+const CLI_v2::PositionalArgument InputFile{
+R"({
+    "name" : "IN.bam",
+    "description" : "Input BAM file",
+    "type" : "file"
+})"};
+// clang-format on
+
+}  // namespace Options
+
+// Builds the pbindex command-line interface: name, description and version,
+// with the log-file, log-level and thread-count options switched off, and the
+// input BAM as the only positional argument.
+PacBio::CLI_v2::Interface Settings::CreateCLI()
+{
+    // clang-format off
+    const std::string description{
+        "pbindex creates a index file that enables random-access to PacBio-specific "
+        "data in BAM files. Generated index filename will be the same as input BAM "
+        "plus .pbi suffix."
+    };
+    // clang-format on
+
+    CLI_v2::Interface cli{"pbindex", description, PbIndex::Version};
+
+    cli.DisableLogFileOption();
+    cli.DisableLogLevelOption();
+    cli.DisableNumThreadsOption();
+
+    cli.AddPositionalArguments({Options::InputFile});
+
+    return cli;
+}
+
+// Copies the input BAM filename out of the parsed command-line results.
+Settings::Settings(const CLI_v2::Results& args) : InputFile(args[Options::InputFile]) {}
+
+}  // namespace PbIndex
+}  // namespace PacBio
diff --git a/tools/pbindex/src/PbIndexSettings.h b/tools/pbindex/src/PbIndexSettings.h

new file mode 100644 (file)

index 0000000..fc2dfc9
--- /dev/null
+++ b/tools/pbindex/src/PbIndexSettings.h
@@ -0,0 +1,25 @@
+// Author: Derek Barnett
+
+#ifndef PBINDEX_SETTINGS_H
+#define PBINDEX_SETTINGS_H
+
+#include <string>
+
+#include <pbcopper/cli2/CLI.h>
+
+namespace PacBio {
+namespace PbIndex {
+
+// Command-line configuration for pbindex.
+struct Settings
+{
+    // Returns the command-line interface definition for the tool.
+    static CLI_v2::Interface CreateCLI();
+
+    // Populates the settings from parsed command-line results.
+    explicit Settings(const CLI_v2::Results& args);
+
+    // BAM file for which the index is created.
+    std::string InputFile;
+};
+
+}  // namespace PbIndex
+}  // namespace PacBio
+
+#endif  // PBINDEX_SETTINGS_H
diff --git a/tools/pbindex/src/PbIndexVersion.h.in b/tools/pbindex/src/PbIndexVersion.h.in

new file mode 100644 (file)

index 0000000..9964f49
--- /dev/null
+++ b/tools/pbindex/src/PbIndexVersion.h.in
@@ -0,0 +1,16 @@
+// Author: Derek Barnett
+
+#ifndef PBINDEX_VERSION_H
+#define PBINDEX_VERSION_H
+
+#include <string>
+
+namespace PacBio {
+namespace PbIndex {
+
+// Tool version string; the placeholder is substituted when this template is configured.
+const std::string Version{"@PbIndex_VERSION@"};
+
+} // namespace PbIndex
+} // namespace PacBio
+
+#endif // PBINDEX_VERSION_H
diff --git a/tools/pbindex/src/PbIndexWorkflow.cpp b/tools/pbindex/src/PbIndexWorkflow.cpp

new file mode 100644 (file)

index 0000000..88e31b4
--- /dev/null
+++ b/tools/pbindex/src/PbIndexWorkflow.cpp
@@ -0,0 +1,21 @@
+// Author: Derek Barnett
+
+#include "PbIndexWorkflow.h"
+
+#include <pbbam/BamFile.h>
+#include <pbbam/PbiFile.h>
+
+#include "PbIndexSettings.h"
+
+namespace PacBio {
+namespace PbIndex {
+
+// Runs pbindex: builds Settings from the parsed arguments and asks
+// PbiFile::CreateFrom to generate the index for the input BAM file.
+// Returns EXIT_SUCCESS; failures are expected to surface as exceptions,
+// which main() catches.
+int Workflow::Runner(const CLI_v2::Results& args)
+{
+    const Settings settings{args};
+    BAM::PbiFile::CreateFrom(settings.InputFile);
+    return EXIT_SUCCESS;
+}
+
+}  // namespace PbIndex
+}  // namespace PacBio
+\ No newline at end of file
diff --git a/tools/pbindex/src/PbIndexWorkflow.h b/tools/pbindex/src/PbIndexWorkflow.h

new file mode 100644 (file)

index 0000000..4202f31
--- /dev/null
+++ b/tools/pbindex/src/PbIndexWorkflow.h
@@ -0,0 +1,19 @@
+// Author: Derek Barnett
+
+#ifndef PBINDEX_WORKFLOW_H
+#define PBINDEX_WORKFLOW_H
+
+#include <pbcopper/cli2/Results.h>
+
+namespace PacBio {
+namespace PbIndex {
+
+// Holds the pbindex entry point that main() hands to CLI_v2::Run.
+struct Workflow
+{
+    // Executes the tool for the given parsed arguments; returns the exit code.
+    static int Runner(const CLI_v2::Results& args);
+};
+
+}  // namespace PbIndex
+}  // namespace PacBio
+
+#endif  // PBINDEX_WORKFLOW_H
diff --git a/tools/pbindex/src/main.cpp b/tools/pbindex/src/main.cpp

new file mode 100644 (file)

index 0000000..f58e8db
--- /dev/null
+++ b/tools/pbindex/src/main.cpp
@@ -0,0 +1,21 @@
+// Author: Derek Barnett
+
+#include <cstdlib>
+#include <iostream>
+#include <stdexcept>
+
+#include <pbcopper/cli2/CLI.h>
+
+#include "PbIndexSettings.h"
+#include "PbIndexWorkflow.h"
+
+// pbindex entry point: passes the CLI definition and Workflow::Runner to
+// CLI_v2::Run and returns its result. A std::exception escaping that call is
+// reported on stderr and turned into EXIT_FAILURE.
+int main(int argc, char* argv[])
+{
+    try {
+        return PacBio::CLI_v2::Run(argc, argv, PacBio::PbIndex::Settings::CreateCLI(),
+                                   &PacBio::PbIndex::Workflow::Runner);
+    } catch (const std::exception& e) {
+        std::cerr << "pbindex ERROR: " << e.what() << '\n';
+        return EXIT_FAILURE;
+    }
+}
diff --git a/tools/pbindexdump/src/CppFormatter.cpp b/tools/pbindexdump/src/CppFormatter.cpp

new file mode 100644 (file)

index 0000000..53aee2a
--- /dev/null
+++ b/tools/pbindexdump/src/CppFormatter.cpp
@@ -0,0 +1,145 @@
+// Author: Derek Barnett
+
+#include "CppFormatter.h"
+
+#include <cstdint>
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include <pbbam/PbiFile.h>
+#include <pbbam/PbiRawData.h>
+
+namespace PacBio {
+namespace PbIndexDump {
+namespace {
+
+// Renders every reference entry as an indented
+// "PbiReferenceEntry{tId,beginRow,endRow}" initializer. Entries are separated
+// by ",\n" and a final '\n' is appended when at least one entry was written.
+// Returns an empty string when there are no entries.
+std::string printReferenceData(const BAM::PbiRawReferenceData& referenceData)
+{
+    std::ostringstream out;
+
+    // Track "first entry" with a flag: calling out.str() inside the loop would
+    // copy the whole buffer on every iteration.
+    bool wroteEntry = false;
+    for (const auto& entry : referenceData.entries_) {
+        if (wroteEntry) out << ",\n";
+        wroteEntry = true;
+
+        out << "    PbiReferenceEntry{" << entry.tId_ << "," << entry.beginRow_ << ","
+            << entry.endRow_ << "}";
+    }
+    if (wroteEntry) out << '\n';
+    return out.str();
+}
+
+// Joins the elements of `c` into a comma-separated string (no trailing comma),
+// formatting each element with operator<<. An empty vector gives "".
+template <typename T>
+std::string printField(const std::vector<T>& c)
+{
+    std::ostringstream joined;
+    const char* separator = "";
+    for (const auto& value : c) {
+        joined << separator << value;
+        separator = ",";
+    }
+    return joined.str();
+}
+
+// uint8_t specialization: each element is widened to uint16_t first so the
+// stream prints it as a number rather than as a character.
+template <>
+std::string printField(const std::vector<uint8_t>& c)
+{
+    std::ostringstream joined;
+    const char* separator = "";
+    for (const auto& value : c) {
+        joined << separator << static_cast<uint16_t>(value);
+        separator = ",";
+    }
+    return joined.str();
+}
+
+// int8_t specialization: each element is widened to int16_t first so the
+// stream prints it as a number rather than as a character.
+template <>
+std::string printField(const std::vector<int8_t>& c)
+{
+    std::ostringstream joined;
+    const char* separator = "";
+    for (const auto& value : c) {
+        joined << separator << static_cast<int16_t>(value);
+        separator = ",";
+    }
+    return joined.str();
+}
+
+}  // namespace
+
+// Loads the PBI file named by settings.InputFile and prints, to stdout, C++
+// statements that rebuild an equivalent PbiRawData object: version, section
+// flags, read count and the per-field vectors of each section that is present.
+//
+// Throws std::runtime_error if the PBI version is not one of the known values.
+void CppFormatter::Run(const Settings& settings)
+{
+    const BAM::PbiRawData rawData{settings.InputFile};
+    const auto& barcodeData = rawData.BarcodeData();
+    const auto& basicData = rawData.BasicData();
+    const auto& mappedData = rawData.MappedData();
+    const auto& referenceData = rawData.ReferenceData();
+
+    std::string version;
+    switch (rawData.Version()) {
+        case BAM::PbiFile::Version_3_0_0:
+            version = "PbiFile::Version_3_0_0";
+            break;
+        case BAM::PbiFile::Version_3_0_1:
+            version = "PbiFile::Version_3_0_1";
+            break;
+        case BAM::PbiFile::Version_3_0_2:
+            version = "PbiFile::Version_3_0_2";
+            break;
+        default:
+            throw std::runtime_error("unsupported PBI version encountered");
+    }
+
+    // The basic section is always listed; optional sections are OR-ed in.
+    std::string fileSections{"PbiFile::BASIC"};
+    if (rawData.HasBarcodeData()) fileSections += " | PbiFile::BARCODE";
+    if (rawData.HasMappedData()) fileSections += " | PbiFile::MAPPED";
+    if (rawData.HasReferenceData()) fileSections += " | PbiFile::REFERENCE";
+
+    // Everything is composed in memory and written to stdout in one go.
+    std::ostringstream s;
+    s << "PbiRawData rawData;\n"
+      << "rawData.Version(" << version << ");\n"
+      << "rawData.FileSections(" << fileSections << ");\n"
+      << "rawData.NumReads(" << rawData.NumReads() << ");\n"
+      << '\n'
+      << "PbiRawBasicData& basicData = rawData.BasicData();\n"
+      << "basicData.rgId_       = {" << printField(basicData.rgId_) << "};\n"
+      << "basicData.qStart_     = {" << printField(basicData.qStart_) << "};\n"
+      << "basicData.qEnd_       = {" << printField(basicData.qEnd_) << "};\n"
+      << "basicData.holeNumber_ = {" << printField(basicData.holeNumber_) << "};\n"
+      << "basicData.readQual_   = {" << printField(basicData.readQual_) << "};\n"
+      << "basicData.ctxtFlag_   = {" << printField(basicData.ctxtFlag_) << "};\n"
+      << "basicData.fileOffset_ = {" << printField(basicData.fileOffset_) << "};\n";
+
+    if (rawData.HasBarcodeData()) {
+        s << '\n'
+          << "PbiRawBarcodeData& barcodeData = rawData.BarcodeData();\n"
+          << "barcodeData.bcForward_ = {" << printField(barcodeData.bcForward_) << "};\n"
+          << "barcodeData.bcReverse_ = {" << printField(barcodeData.bcReverse_) << "};\n"
+          << "barcodeData.bcQual_    = {" << printField(barcodeData.bcQual_) << "};\n";
+    }
+
+    if (rawData.HasMappedData()) {
+        // Plain "\n" like every other line: flushing a string stream via
+        // std::endl serves no purpose.
+        s << '\n'
+          << "PbiRawMappedData& mappedData = rawData.MappedData();\n"
+          << "mappedData.tId_       = {" << printField(mappedData.tId_) << "};\n"
+          << "mappedData.tStart_    = {" << printField(mappedData.tStart_) << "};\n"
+          << "mappedData.tEnd_      = {" << printField(mappedData.tEnd_) << "};\n"
+          << "mappedData.aStart_    = {" << printField(mappedData.aStart_) << "};\n"
+          << "mappedData.aEnd_      = {" << printField(mappedData.aEnd_) << "};\n"
+          << "mappedData.revStrand_ = {" << printField(mappedData.revStrand_) << "};\n"
+          << "mappedData.nM_        = {" << printField(mappedData.nM_) << "};\n"
+          << "mappedData.nMM_       = {" << printField(mappedData.nMM_) << "};\n"
+          << "mappedData.mapQV_     = {" << printField(mappedData.mapQV_) << "};\n";
+    }
+
+    if (rawData.HasReferenceData()) {
+        s << '\n'
+          << "PbiRawReferenceData& referenceData = rawData.ReferenceData();\n"
+          << "referenceData.entries_ = { \n"
+          << printReferenceData(referenceData) << "};\n";
+    }
+
+    std::cout << s.str();
+}
+
+}  // namespace PbIndexDump
+}  // namespace PacBio
diff --git a/tools/pbindexdump/src/CppFormatter.h b/tools/pbindexdump/src/CppFormatter.h

new file mode 100644 (file)

index 0000000..9787c1f
--- /dev/null
+++ b/tools/pbindexdump/src/CppFormatter.h
@@ -0,0 +1,19 @@
+// Author: Derek Barnett
+
+#ifndef PBINDEXDUMP_CPPFORMATTER_H
+#define PBINDEXDUMP_CPPFORMATTER_H
+
+#include "PbIndexDumpSettings.h"
+
+namespace PacBio {
+namespace PbIndexDump {
+
+// Output backend selected when the requested format is "cpp".
+struct CppFormatter
+{
+    // Prints the PBI file named in `settings` as C++ source on stdout.
+    static void Run(const Settings& settings);
+};
+
+}  // namespace PbIndexDump
+}  // namespace PacBio
+
+#endif  // PBINDEXDUMP_CPPFORMATTER_H
diff --git a/tools/pbindexdump/src/JsonFormatter.cpp b/tools/pbindexdump/src/JsonFormatter.cpp

new file mode 100644 (file)

index 0000000..6f3e604
--- /dev/null
+++ b/tools/pbindexdump/src/JsonFormatter.cpp
@@ -0,0 +1,171 @@
+// Author: Derek Barnett
+
+#include "JsonFormatter.h"
+
+#include <cmath>
+#include <cstdint>
+#include <iomanip>
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+
+#include <pbbam/PbiFile.h>
+#include <pbbam/PbiRawData.h>
+
+#include <pbcopper/json/JSON.h>
+
+namespace PacBio {
+namespace PbIndexDump {
+namespace {
+
+// Writes the index-wide fields into `result`: "version" (dotted string),
+// "fileSections" (names of the sections present) and "numReads".
+// Throws std::runtime_error for a PBI version that is not recognised.
+void FormatMetadata(const BAM::PbiRawData& index, JSON::Json& result)
+{
+    // Translate the version enum into its dotted form.
+    const auto versionText = [&index]() -> std::string {
+        switch (index.Version()) {
+            case BAM::PbiFile::Version_3_0_0:
+                return "3.0.0";
+            case BAM::PbiFile::Version_3_0_1:
+                return "3.0.1";
+            case BAM::PbiFile::Version_3_0_2:
+                return "3.0.2";
+            default:
+                break;
+        }
+        throw std::runtime_error{"unsupported PBI version encountered"};
+    }();
+
+    // BasicData is always listed; the others only when the index has them.
+    JSON::Json sections;
+    sections.push_back("BasicData");
+    if (index.HasBarcodeData()) {
+        sections.push_back("BarcodeData");
+    }
+    if (index.HasMappedData()) {
+        sections.push_back("MappedData");
+    }
+    if (index.HasReferenceData()) {
+        sections.push_back("ReferenceData");
+    }
+
+    result["version"] = versionText;
+    result["fileSections"] = sections;
+    result["numReads"] = index.NumReads();
+}
+
+// Writes the index into `result` in column-oriented form: one JSON member per
+// PBI section ("basicData", "barcodeData", "mappedData"), each holding one
+// array per field. Barcode and mapped sections are emitted only when present.
+void FormatRaw(const BAM::PbiRawData& index, JSON::Json& result)
+{
+    const BAM::PbiRawBasicData& basicData = index.BasicData();
+    result["basicData"]["rgId"] = basicData.rgId_;
+    result["basicData"]["qStart"] = basicData.qStart_;
+    result["basicData"]["qEnd"] = basicData.qEnd_;
+    result["basicData"]["holeNumber"] = basicData.holeNumber_;
+    result["basicData"]["readQual"] = basicData.readQual_;
+    result["basicData"]["ctxtFlag"] = basicData.ctxtFlag_;
+    result["basicData"]["fileOffset"] = basicData.fileOffset_;
+
+    if (index.HasBarcodeData()) {
+        const BAM::PbiRawBarcodeData& barcodeData = index.BarcodeData();
+        result["barcodeData"]["bcForward"] = barcodeData.bcForward_;
+        result["barcodeData"]["bcReverse"] = barcodeData.bcReverse_;
+        result["barcodeData"]["bcQuality"] = barcodeData.bcQual_;
+    }
+
+    if (index.HasMappedData()) {
+        const BAM::PbiRawMappedData& mappedData = index.MappedData();
+
+        // NOTE: unlike FormatRecords, no int32_t cast is applied here; the
+        // vectors are stored with their own element types.
+        result["mappedData"]["tId"] = mappedData.tId_;
+        result["mappedData"]["tStart"] = mappedData.tStart_;
+        result["mappedData"]["tEnd"] = mappedData.tEnd_;
+
+        result["mappedData"]["aStart"] = mappedData.aStart_;
+        result["mappedData"]["aEnd"] = mappedData.aEnd_;
+        result["mappedData"]["revStrand"] = mappedData.revStrand_;
+        result["mappedData"]["nM"] = mappedData.nM_;
+        result["mappedData"]["nMM"] = mappedData.nMM_;
+        result["mappedData"]["mapQV"] = mappedData.mapQV_;
+    }
+}
+
+// Writes the index into `result` in record-oriented form: result["reads"] is
+// an array with one object per read, holding the basic fields plus the
+// barcode and mapping fields when the index has those sections.
+void FormatRecords(const BAM::PbiRawData& index, JSON::Json& result)
+{
+    JSON::Json reads;
+    const uint32_t numReads = index.NumReads();
+    const bool hasBarcodeData = index.HasBarcodeData();
+    const bool hasMappedData = index.HasMappedData();
+
+    // The section accessors do not depend on the read, so fetch them once.
+    // Barcode and mapped data are only read when the matching flag is set.
+    const BAM::PbiRawBasicData& basicData = index.BasicData();
+    const BAM::PbiRawBarcodeData& barcodeData = index.BarcodeData();
+    const BAM::PbiRawMappedData& mappedData = index.MappedData();
+
+    for (uint32_t i = 0; i < numReads; ++i) {
+
+        JSON::Json read;
+
+        // common data
+        read["rgId"] = basicData.rgId_[i];
+        read["qStart"] = basicData.qStart_[i];
+        read["qEnd"] = basicData.qEnd_[i];
+        read["holeNumber"] = basicData.holeNumber_[i];
+        read["readQuality"] = basicData.readQual_[i];
+        read["contextFlag"] = basicData.ctxtFlag_[i];
+        read["fileOffset"] = basicData.fileOffset_[i];
+
+        // barcode data, if present
+        if (hasBarcodeData) {
+            read["bcForward"] = barcodeData.bcForward_[i];
+            read["bcReverse"] = barcodeData.bcReverse_[i];
+            read["bcQuality"] = barcodeData.bcQual_[i];
+        }
+
+        // mapping data, if present
+        if (hasMappedData) {
+            // casts to force -1 if unmapped
+            read["tId"] = static_cast<int32_t>(mappedData.tId_[i]);
+            read["tStart"] = static_cast<int32_t>(mappedData.tStart_[i]);
+            read["tEnd"] = static_cast<int32_t>(mappedData.tEnd_[i]);
+
+            read["aStart"] = mappedData.aStart_[i];
+            read["aEnd"] = mappedData.aEnd_[i];
+            read["nM"] = mappedData.nM_[i];
+            read["nMM"] = mappedData.nMM_[i];
+            read["mapQuality"] = mappedData.mapQV_[i];
+            read["reverseStrand"] = mappedData.revStrand_[i];
+        }
+
+        reads.push_back(std::move(read));
+    }
+
+    // Hand the array over instead of deep-copying every read object.
+    result["reads"] = std::move(reads);
+}
+
+// Adds result["references"], an array of {tId, beginRow, endRow} objects, one
+// per reference entry. Does nothing when the index has no reference section.
+void FormatReferences(const BAM::PbiRawData& index, JSON::Json& result)
+{
+    if (!index.HasReferenceData()) return;
+
+    JSON::Json references;
+    const auto& referenceData = index.ReferenceData();
+    for (const auto& entry : referenceData.entries_) {
+        JSON::Json element;
+        element["tId"] = static_cast<int32_t>(entry.tId_);
+        element["beginRow"] = static_cast<int32_t>(entry.beginRow_);
+        element["endRow"] = static_cast<int32_t>(entry.endRow_);
+        references.push_back(std::move(element));
+    }
+
+    // Hand the array over instead of deep-copying it.
+    result["references"] = std::move(references);
+}
+}  // namespace
+
+// Loads the PBI file named by settings.InputFile, builds a JSON document with
+// the metadata, the reference entries and the read data, and prints it to
+// stdout. settings.JsonRaw selects the per-field (raw) layout over the
+// per-record one; settings.JsonIndentLevel is passed to dump().
+void JsonFormatter::Run(const Settings& settings)
+{
+    const BAM::PbiRawData index{settings.InputFile};
+    JSON::Json result;
+
+    FormatMetadata(index, result);
+    FormatReferences(index, result);
+
+    if (settings.JsonRaw)
+        FormatRaw(index, result);
+    else
+        FormatRecords(index, result);
+
+    // print
+    std::cout << result.dump(settings.JsonIndentLevel) << '\n';
+}
+
+}  // namespace PbIndexDump
+}  // namespace PacBio
diff --git a/tools/pbindexdump/src/JsonFormatter.h b/tools/pbindexdump/src/JsonFormatter.h

new file mode 100644 (file)

index 0000000..dcd79ec
--- /dev/null
+++ b/tools/pbindexdump/src/JsonFormatter.h
@@ -0,0 +1,19 @@
+// Author: Derek Barnett
+
+#ifndef PBINDEXDUMP_JSONFORMATTER_H
+#define PBINDEXDUMP_JSONFORMATTER_H
+
+#include "PbIndexDumpSettings.h"
+
+namespace PacBio {
+namespace PbIndexDump {
+
+// Output backend selected when the requested format is "json".
+struct JsonFormatter
+{
+    // Prints the PBI file named in `settings` as JSON on stdout.
+    static void Run(const Settings& settings);
+};
+
+}  // namespace PbIndexDump
+}  // namespace PacBio
+
+#endif  // PBINDEXDUMP_JSONFORMATTER_H
diff --git a/tools/pbindexdump/src/PbIndexDumpSettings.cpp b/tools/pbindexdump/src/PbIndexDumpSettings.cpp

new file mode 100644 (file)

index 0000000..15e1015
--- /dev/null
+++ b/tools/pbindexdump/src/PbIndexDumpSettings.cpp
@@ -0,0 +1,113 @@
+// Author: Derek Barnett
+
+#include "PbIndexDumpSettings.h"
+
+#include <stdexcept>
+
+#include "PbIndexDumpVersion.h"
+
+namespace PacBio {
+namespace PbIndexDump {
+namespace Options {
+
+// clang-format off
+// Optional positional argument naming the PBI file to dump. When it is absent,
+// the Settings constructor stores "-" as the input file name.
+const CLI_v2::PositionalArgument InputFile{
+R"({
+    "name" : "input.bam.pbi",
+    "description" : "Input PBI file. If not provided, stdin will be used as input.",
+    "type" : "file",
+    "required" : false
+})"};
+
+// Output format selector ("json" or "cpp"); the Settings constructor checks
+// the value again and Workflow::Runner dispatches on it.
+const CLI_v2::Option Format{
+R"({
+    "names" : ["format"],
+    "description" : "Output format.",
+    "type" : "string",
+    "choices" : ["json", "cpp"],
+    "default" : "json"
+})"};
+
+// Indentation passed to the JSON dump; rejected by the Settings constructor
+// when the user provides it together with a non-JSON format.
+const CLI_v2::Option JsonIndentLevel{
+R"({
+    "names" : ["json-indent-level"],
+    "description" : "JSON indent level.",
+    "type" : "int",
+    "default" : 4
+})"};
+
+// Flag selecting the per-field ("raw") JSON layout instead of the per-record
+// one; rejected by the Settings constructor when the user provides it together
+// with a non-JSON format. The help text now closes its parenthesis.
+const CLI_v2::Option JsonRaw{
+R"({
+    "names" : ["json-raw"],
+    "description" : [
+        "Print fields in a layout that more closely reflects the PBI ",
+        "file format (per-field columns, not per-record objects)."
+    ]
+})"};
+
+// clang-format on
+
+}  // namespace Options
+
+// Builds the pbindexdump command-line interface: name, description and
+// version, with the log-file, log-level and thread-count options switched off,
+// the optional input file, the output option group and a help footer that
+// describes the supported formats.
+CLI_v2::Interface Settings::CreateCLI()
+{
+    // clang-format off
+    const std::string description{
+        "pbindexdump prints a human-readable view of PBI data to stdout."
+    };
+
+    CLI_v2::Interface cli{"pbindexdump", description, PbIndexDump::Version};
+
+    cli.DisableLogFileOption();
+    cli.DisableLogLevelOption();
+    cli.DisableNumThreadsOption();
+
+    cli.AddPositionalArguments({Options::InputFile});
+
+    cli.AddOptionGroup("Output Options", {
+        Options::Format,
+        Options::JsonIndentLevel,
+        Options::JsonRaw
+    });
+
+    cli.HelpFooter({
+        "Supported output formats:\n"
+        "  json: 'pretty-printed' JSON\n"
+        "  cpp:  copy/paste-able C++ code that can be used to construct the\n"
+        "        equivalent BAM::PbiRawData object."
+    });
+    // clang-format on
+
+    return cli;
+}
+
+// Reads the options from the parsed command line and validates them.
+// Throws std::runtime_error when more than one positional argument is given,
+// when the format is neither "json" nor "cpp", or when a JSON-only option was
+// supplied by the user together with a non-JSON format.
+Settings::Settings(const CLI_v2::Results& args)
+    : Format(args[Options::Format])
+    , JsonIndentLevel{args[Options::JsonIndentLevel]}
+    , JsonRaw(args[Options::JsonRaw])
+{
+    // Input file: "-" when no positional argument was supplied.
+    const auto& positional = args.PositionalArguments();
+    switch (positional.size()) {
+        case 0:
+            InputFile = "-";
+            break;
+        case 1:
+            InputFile = positional[0];
+            break;
+        default:
+            throw std::runtime_error{"too many arguments provided."};
+    }
+
+    // Only the two known formats are accepted.
+    const bool wantsJson = (Format == "json");
+    if (!wantsJson && Format != "cpp") {
+        throw std::runtime_error{"unsupported format requested: '" + Format + "'"};
+    }
+
+    // JSON-specific options make no sense for any other output format.
+    if (!wantsJson) {
+        const bool jsonOptionGiven = args[Options::JsonRaw].IsUserProvided() ||
+                                     args[Options::JsonIndentLevel].IsUserProvided();
+        if (jsonOptionGiven) {
+            throw std::runtime_error{"JSON formatting options are not valid on non-JSON output"};
+        }
+    }
+}
+
+}  // namespace PbIndexDump
+}  // namespace PacBio
diff --git a/tools/pbindexdump/src/PbIndexDumpSettings.h b/tools/pbindexdump/src/PbIndexDumpSettings.h

new file mode 100644 (file)

index 0000000..665eea7
--- /dev/null
+++ b/tools/pbindexdump/src/PbIndexDumpSettings.h
@@ -0,0 +1,28 @@
+// Author: Derek Barnett
+
+#ifndef PBINDEXDUMP_SETTINGS_H
+#define PBINDEXDUMP_SETTINGS_H
+
+#include <string>
+
+#include <pbcopper/cli2/CLI.h>
+
+namespace PacBio {
+namespace PbIndexDump {
+
+struct Settings  // parsed command-line settings for pbindexdump
+{
+    static CLI_v2::Interface CreateCLI();  // builds the tool's command-line interface
+
+    explicit Settings(const CLI_v2::Results& args);  // throws std::runtime_error on bad input
+
+    std::string InputFile;    // input name; "-" when no positional argument was given
+    std::string Format;       // "json" or "cpp" (checked by the constructor)
+    int JsonIndentLevel = 4;  // in-class default; the constructor sets it from the CLI results
+    bool JsonRaw = false;     // in-class default; the constructor sets it from the CLI results
+};
+
+}  // namespace PbIndexDump
+}  // namespace PacBio
+
+#endif  // PBINDEXDUMP_SETTINGS_H
+\ No newline at end of file
diff --git a/tools/pbindexdump/src/PbIndexDumpVersion.h.in b/tools/pbindexdump/src/PbIndexDumpVersion.h.in

new file mode 100644 (file)

index 0000000..1cd1b25
--- /dev/null
+++ b/tools/pbindexdump/src/PbIndexDumpVersion.h.in
@@ -0,0 +1,16 @@
+// Author: Derek Barnett
+
+#ifndef PBINDEXDUMP_VERSION_H
+#define PBINDEXDUMP_VERSION_H
+
+#include <string>
+
+namespace PacBio {
+namespace PbIndexDump {
+
+const std::string Version{"@PbIndexDump_VERSION@"};  // placeholder, presumably filled in by the build
+
+} // namespace PbIndexDump
+} // namespace PacBio
+
+#endif // PBINDEXDUMP_VERSION_H
diff --git a/tools/pbindexdump/src/PbIndexDumpWorkflow.cpp b/tools/pbindexdump/src/PbIndexDumpWorkflow.cpp

new file mode 100644 (file)

index 0000000..15a16ad
--- /dev/null
+++ b/tools/pbindexdump/src/PbIndexDumpWorkflow.cpp
@@ -0,0 +1,28 @@
+// Author: Derek Barnett
+
+#include "PbIndexDumpWorkflow.h"
+
+#include <cassert>
+#include <cstdlib>
+
+#include "CppFormatter.h"
+#include "JsonFormatter.h"
+#include "PbIndexDumpSettings.h"
+
+namespace PacBio {
+namespace PbIndexDump {
+
+int Workflow::Runner(const CLI_v2::Results& args)
+{
+    const Settings settings{args};  // option validation happens in the Settings constructor
+    if (settings.Format != "json") {
+        assert(settings.Format == "cpp");
+        CppFormatter::Run(settings);
+    } else {
+        JsonFormatter::Run(settings);
+    }
+    return EXIT_SUCCESS;
+}
+
+}  // namespace PbIndexDump
+}  // namespace PacBio
+\ No newline at end of file
diff --git a/tools/pbindexdump/src/PbIndexDumpWorkflow.h b/tools/pbindexdump/src/PbIndexDumpWorkflow.h

new file mode 100644 (file)

index 0000000..3941f25
--- /dev/null
+++ b/tools/pbindexdump/src/PbIndexDumpWorkflow.h
@@ -0,0 +1,19 @@
+// Author: Derek Barnett
+
+#ifndef PBINDEXDUMP_WORKFLOW_H
+#define PBINDEXDUMP_WORKFLOW_H
+
+#include <pbcopper/cli2/Results.h>
+
+namespace PacBio {
+namespace PbIndexDump {
+
+struct Workflow
+{
+    static int Runner(const CLI_v2::Results& args);  // callback handed to CLI_v2::Run by main()
+};
+
+}  // namespace PbIndexDump
+}  // namespace PacBio
+
+#endif  // PBINDEXDUMP_WORKFLOW_H
diff --git a/tools/pbindexdump/src/main.cpp b/tools/pbindexdump/src/main.cpp

new file mode 100644 (file)

index 0000000..29d8610
--- /dev/null
+++ b/tools/pbindexdump/src/main.cpp
@@ -0,0 +1,21 @@
+// Author: Derek Barnett
+
+#include <cstdlib>
+#include <iostream>
+#include <stdexcept>
+
+#include <pbcopper/cli2/CLI.h>
+
+#include "PbIndexDumpSettings.h"
+#include "PbIndexDumpWorkflow.h"
+
+int main(int argc, char* argv[])
+{
+    namespace App = PacBio::PbIndexDump;  // shorthand for the tool's own namespace
+    try {
+        return PacBio::CLI_v2::Run(argc, argv, App::Settings::CreateCLI(), &App::Workflow::Runner);
+    } catch (const std::exception& err) {  // report on stderr, exit with EXIT_FAILURE
+        std::cerr << "pbindexdump ERROR: " << err.what() << '\n';
+        return EXIT_FAILURE;
+    }
+}
diff --git a/tools/pbmerge/src/PbMergeSettings.cpp b/tools/pbmerge/src/PbMergeSettings.cpp

new file mode 100644 (file)

index 0000000..d464df9
--- /dev/null
+++ b/tools/pbmerge/src/PbMergeSettings.cpp
@@ -0,0 +1,84 @@
+// Author: Derek Barnett
+
+#include "PbMergeSettings.h"
+
+#include <stdexcept>
+
+#include "PbMergeVersion.h"
+
+namespace PacBio {
+namespace PbMerge {
+namespace Options {
+
+// clang-format off
+const CLI_v2::Option OutputFile{  // output BAM path; Settings maps the empty default to "-"
+R"({
+    "names" : ["o"],
+    "description" : "Output BAM filename. Writes to stdout if not provided.",
+    "type" : "string",
+    "default" : ""
+})"};
+
+const CLI_v2::Option NoPbi{  // when set, the Settings constructor leaves CreatePbi false
+R"({
+    "names" : ["no-pbi"],
+    "description" : "Disables creation of PBI index file. PBI always disabled when writing to stdout."
+})"};
+
+const CLI_v2::PositionalArgument InputFiles{  // positional inputs; Settings requires at least one
+R"({
+    "name" : "INPUT",
+    "description" : "Input file(s). May be one of: DataSetXML, BAM file(s), or FOFN",
+    "type" : "file"
+})"};
+// clang-format on
+
+}  // namespace Options
+
+// Assembles the pbmerge command-line interface: one option group, the positional
+// input arguments, and a usage-examples footer.
+CLI_v2::Interface Settings::CreateCLI()
+{
+    // clang-format off
+    const std::string summary{
+        "pbmerge merges PacBio BAM files. If the input is DataSetXML, any filters will be applied."
+    };
+
+    // the stock log-file, log-level and thread-count options are switched off for this tool
+    CLI_v2::Interface cli{"pbmerge", summary, PbMerge::Version};
+    cli.DisableLogFileOption();
+    cli.DisableLogLevelOption();
+    cli.DisableNumThreadsOption();
+
+    cli.AddOptionGroup("Input/Output", {Options::OutputFile, Options::NoPbi});
+    cli.AddPositionalArguments({Options::InputFiles});
+
+    // usage examples for the help footer
+    cli.HelpFooter(R"(Examples:
+    $ pbmerge -o merged.bam data.subreadset.xml
+    $ pbmerge -o merged.bam data_1.bam data_2.bam data_3.bam
+    $ pbmerge -o merged.bam data_bams.fofn)");
+
+    // clang-format on
+    return cli;
+}
+
+// Builds pbmerge settings from CLI results; throws std::runtime_error if no input is given.
+Settings::Settings(const CLI_v2::Results& args) : OutputFile(args[Options::OutputFile])
+{
+    const auto& posArgs = args.PositionalArguments();
+    if (posArgs.empty()) throw std::runtime_error{"at least one input file must be specified"};
+    InputFiles = posArgs;
+
+    // an empty output name selects stdout, spelled "-"
+    if (OutputFile.empty()) OutputFile = "-";
+
+    // PBI is never written alongside stdout output; otherwise only the no-pbi option disables it
+    if (OutputFile == "-")
+        CreatePbi = false;
+    else
+        CreatePbi = !args[Options::NoPbi];
+}
+
+}  // namespace PbMerge
+}  // namespace PacBio
diff --git a/tools/pbmerge/src/PbMergeSettings.h b/tools/pbmerge/src/PbMergeSettings.h

new file mode 100644 (file)

index 0000000..81274fd
--- /dev/null
+++ b/tools/pbmerge/src/PbMergeSettings.h
@@ -0,0 +1,29 @@
+// Author: Derek Barnett
+
+#ifndef PBMERGE_SETTINGS_H
+#define PBMERGE_SETTINGS_H
+
+#include <string>
+#include <vector>
+
+#include <pbcopper/cli2/CLI.h>
+
+namespace PacBio {
+namespace PbMerge {
+
+struct Settings  // parsed command-line settings for pbmerge
+{
+    static CLI_v2::Interface CreateCLI();  // builds the tool's command-line interface
+
+    explicit Settings(const CLI_v2::Results& args);  // throws std::runtime_error without inputs
+
+    std::vector<std::string> InputFiles;  // positional arguments; the constructor requires one+
+    std::string OutputFile;               // output name; "-" stands for stdout
+    bool CreatePbi = true;                // never left indeterminate; the constructor overwrites it
+    std::vector<std::string> errors_;     // NOTE(review): unused in the visible sources - confirm
+};
+
+}  // namespace PbMerge
+}  // namespace PacBio
+
+#endif  // PBMERGE_SETTINGS_H
diff --git a/tools/pbmerge/src/PbMergeVersion.h.in b/tools/pbmerge/src/PbMergeVersion.h.in

new file mode 100644 (file)

index 0000000..5730f08
--- /dev/null
+++ b/tools/pbmerge/src/PbMergeVersion.h.in
@@ -0,0 +1,16 @@
+// Author: Derek Barnett
+
+#ifndef PBMERGE_VERSION_H
+#define PBMERGE_VERSION_H
+
+#include <string>
+
+namespace PacBio {
+namespace PbMerge {
+
+const std::string Version{"@PbMerge_VERSION@"};  // placeholder, presumably filled in by the build
+
+} // namespace PbMerge
+} // namespace PacBio
+
+#endif // PBMERGE_VERSION_H
diff --git a/tools/pbmerge/src/PbMergeWorkflow.cpp b/tools/pbmerge/src/PbMergeWorkflow.cpp

new file mode 100644 (file)

index 0000000..c4358fa
--- /dev/null
+++ b/tools/pbmerge/src/PbMergeWorkflow.cpp
@@ -0,0 +1,37 @@
+// Author: Derek Barnett
+
+#include "PbMergeWorkflow.h"
+
+#include <pbbam/BamFileMerger.h>
+#include <pbbam/DataSet.h>
+#include <pbbam/ProgramInfo.h>
+
+#include "PbMergeSettings.h"
+#include "PbMergeVersion.h"
+
+namespace PacBio {
+namespace PbMerge {
+
+// Merges the requested inputs into the configured output, adding a pbmerge @PG entry.
+int Workflow::Runner(const CLI_v2::Results& args)
+{
+    const Settings config{args};
+    const auto& inputs = config.InputFiles;
+
+    // @PG record (id, name, version) handed to the merger for the output header
+    BAM::ProgramInfo pgEntry;
+    pgEntry.Id(std::string("pbmerge-") + PbMerge::Version);
+    pgEntry.Name("pbmerge");
+    pgEntry.Version(PbMerge::Version);
+
+    // one input uses the single-filename constructor; several use the list constructor
+    BAM::DataSet dataset =
+        (inputs.size() == 1) ? BAM::DataSet(inputs.front()) : BAM::DataSet(inputs);
+
+    BAM::BamFileMerger::Merge(dataset, config.OutputFile, config.CreatePbi, pgEntry);
+
+    return EXIT_SUCCESS;
+}
+
+}  // namespace PbMerge
+}  // namespace PacBio
+\ No newline at end of file
diff --git a/tools/pbmerge/src/PbMergeWorkflow.h b/tools/pbmerge/src/PbMergeWorkflow.h

new file mode 100644 (file)

index 0000000..dbf7fb1
--- /dev/null
+++ b/tools/pbmerge/src/PbMergeWorkflow.h
@@ -0,0 +1,22 @@
+// Author: Derek Barnett
+
+#ifndef PBMERGE_WORKFLOW_H
+#define PBMERGE_WORKFLOW_H
+
+#include <string>
+#include <vector>
+
+#include <pbcopper/cli2/Results.h>
+
+namespace PacBio {
+namespace PbMerge {
+
+struct Workflow
+{
+    static int Runner(const CLI_v2::Results& args);  // callback handed to CLI_v2::Run by main()
+};
+
+}  // namespace PbMerge
+}  // namespace PacBio
+
+#endif  // PBMERGE_WORKFLOW_H
diff --git a/tools/pbmerge/src/main.cpp b/tools/pbmerge/src/main.cpp

new file mode 100644 (file)

index 0000000..832e018
--- /dev/null
+++ b/tools/pbmerge/src/main.cpp
@@ -0,0 +1,21 @@
+// Author: Derek Barnett
+
+#include <cstdlib>
+#include <iostream>
+#include <stdexcept>
+
+#include <pbcopper/cli2/CLI.h>
+
+#include "PbMergeSettings.h"
+#include "PbMergeWorkflow.h"
+
+int main(int argc, char* argv[])
+{
+    namespace App = PacBio::PbMerge;  // shorthand for the tool's own namespace
+    try {
+        return PacBio::CLI_v2::Run(argc, argv, App::Settings::CreateCLI(), &App::Workflow::Runner);
+    } catch (const std::exception& err) {  // report on stderr, exit with EXIT_FAILURE
+        std::cerr << "pbmerge ERROR: " << err.what() << '\n';
+        return EXIT_FAILURE;
+    }
+}
author	Andreas Tille <tille@debian.org>
	Fri, 20 Dec 2019 10:27:03 +0000 (10:27 +0000)
committer	Andreas Tille <tille@debian.org>
	Fri, 20 Dec 2019 10:27:03 +0000 (10:27 +0000)