From: Afif Elghraoui Date: Sun, 22 Jan 2017 07:48:21 +0000 (+0000) Subject: Import pbbam_0.7.4+ds.orig.tar.gz X-Git-Tag: archive/raspbian/0.19.0+dfsg-4+rpi1~5 X-Git-Url: https://dgit.raspbian.org/?a=commitdiff_plain;h=a4848319cda7670ead169ad844e7cfb3060c19be;p=pbbam.git Import pbbam_0.7.4+ds.orig.tar.gz [dgit import orig pbbam_0.7.4+ds.orig.tar.gz] --- a4848319cda7670ead169ad844e7cfb3060c19be diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..2bb3384 --- /dev/null +++ b/.gitignore @@ -0,0 +1,16 @@ +*.o +*.pico +*.so +*.a +*.dylib +*.pyc +*~ +CMakeLists.txt.user +bin/ +build/ +docs/Doxyfile +lib/ +tests/bin/test_pbbam +tests/data/test_group_query/group.fofn +tests/src/TestData.h + diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..b1990e9 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,64 @@ +language: cpp +compiler: + - gcc + +before_install: + + # Travis's default installs of gcc, boost, & cmake currently lag behind the minimums we need. + # So we need to manually setup them up. + # + # - gcc 4.8 (current default on Travis is 4.7, which is no good for C++11 work) + # - boost 1.55 + # - cmake 3.x + + # add external repos + - sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test # gcc + - sudo add-apt-repository -y ppa:boost-latest/ppa # boost + - sudo add-apt-repository -y ppa:george-edison55/precise-backports # cmake + + # remove existing cmake install + - sudo apt-get remove -qq cmake cmake-data + - sudo apt-get autoremove -qq + + # update apt + - sudo apt-get update -y -qq + + # install + - sudo apt-get install -y -qq g++-4.8 boost1.55 cmake-data cmake + + # make sure we're using new gcc tools + - sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-4.8 90 + - sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.8 90 + - sudo update-alternatives --install /usr/bin/gcov gcov /usr/bin/gcov-4.8 90 + + # prep zlib + - sudo apt-get install -y -qq zlib1g-dev + + # prep htslib + - "cd .. 
&& git clone https://github.com/PacificBiosciences/htslib.git && cd htslib && make && sudo make install; cd $TRAVIS_BUILD_DIR" + + # prep GoogleTest + - sudo apt-get install -y -qq libgtest-dev + +before_script: + # run cmake + - mkdir build + - cd build + - cmake .. -DGTEST_SRC_DIR=/usr/src/gtest -DCMAKE_BUILD_TYPE=Debug + +script: + # build & test + - make -j 3 + - make test + +branches: + only: + - master + +notifications: + recipients: + - dbarnett@pacb.com + email: + on_success: change + on_failure: always + diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..7704263 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,396 @@ +# PacBio::BAM - change log + +All notable changes to this project will be documented in this file. +This project adheres to [Semantic Versioning](http://semver.org/). + +**NOTE:** The current series (0.y.z) is under initial development. Anything may +change at any time. The public API should not be considered stable yet. Once we +lock down a version 1.0.0, this will define a reference point & compatibility +guarantees will be maintained within each major version series. + +## Active + +### Added +- Default DataSet 'Version' attribute if none already present (currently 4.0.0) + +## [0.7.4] - 2016-11-18 + +### Changed +- Compatibility for merging BAM files no longer requires exact match of PacBioBAM +version number (header @HD:pb tag). As long as both files meet the minimum +supported version number, the merge is allowed. + +## [0.7.3] - 2016-11-11 + +### Added +- Support for S/P2-C2 chemistry and forthcoming 4.0 basecaller + +## [0.7.2] - 2016-11-10 + +### Removed +- SAM header version equality check for merging BAM files. PacBioBAM version +number carries more meaning for PacBio data and thus will be the basis of +ensuring compatible merging. + +## [0.7.1] - 2016-11-09 + +### Added +- (Unindexed) FASTA reader & FastaSequence data structure. +- Missing unit tests for internal BAM tag access. 
+- Chemistry data for basecaller v3.3. +- Missing parsers for filtering barcode quality ("bq"), barcode forward ("bcf"), +and barcode reverse ("bcr") from DataSetXML. +- Integrated htslib into project. + +### Fixed +- Reverse complement on padding base. + +## [0.7.0] - 2016-09-26 + +### Added +- Clipping for CCS records + +### Fixed +- Cached position data leaking across records while iterating. +- Rolled back default pulse behavior in internal BAM API, to be backward- +compatible with existing client code (for now at least). v0.6.0 introduced +returning basecalled positions ONLY by default, rather than return ALL +pulses. +- Fixed crash when attempting to read from empty BAM/PBI files using the +PbiFilter-enabled APIs. + +## [0.6.0] - 2016-09-13 + +### Added +- BamWriter writes to a BAM file with the target name plus a ".tmp" suffix. On +successful completion (i.e. normal BamWriter destruction, not triggered by a +thrown exception) the file is renamed to the actual requested filename. +- PBI file creation follows the same temporary naming convention. +- Support for barcode pair (forward, reverse) in DataSetXML filter. +- Validation API & 'auto-validate' compile-time switch. +- Added support for a batched QNAME whitelist filter in DataSet XML. Uses (new) +Property name 'qname_file', with the value being the filepath containing the +whitelist. +- Exposed MD5 hashing to API. +- Ability to remove base features from a ReadGroupInfo object. +- Can construct an aggregate PbiRawData index object from a DataSet: essentially +concatenates all PBI data within the dataset. +- New SamWriter class to create SAM-formatted output of PacBio BAM data. +- Extended APIs for accessing "internal BAM" data, including PulseBehavior +switch for selecting between all pulses & basecalls only. + +### Fixed +- Improper 'clip to reference' product for BamRecord in some cases. +- Improper behavior in tag accessors (e.g. BamRecord::IPD()) on reverse strand- +aligned reads (bug 31339). 
+- Improper basecaller version parsing in ReadGroupInfo. + +### Changed +- RecordType::POLYMERASE renamed to RecordType::ZMW to reflect changes in +PacBio BAM spec v3.0.4 +- Refactored the 'virtual' reader classes - to match the new nomenclature, +and to combine the virtual reader & composite readers behind a shared +interface. The old class names still exist, as typedefs to the new ones, +and the interfaces are completely source-compatible - so as not to break +existing code. However, the old classes should be considered deprecated and +the new ones preferred. Below is the mapping of old -> new: + + VirtualPolymeraseBamRecord -> VirtualZmwBamRecord + VirtualPolymeraseReader -> ZmwReadStitcher + VirtualPolymeraseCompositeReader -> ZmwReadStitcher + ZmwWhitelistVirtualReader -> WhitelistedZmwReadStitcher + + +## [0.5.0] - 2016-02-22 + +### Added +- Platform model tag added to read group as RG::PM +- New scrap zmw type sz +- pbmerge accepts DataSetXML as input - using top-level resource BAMs as input, +applying filters, and generating a merged BAM. Also added FOFN support, instead +of listing out BAMs as command line args. +- PbiLocalContextFilter to allow filtering on subread local context. +- PbiBuilder: multithreading & zlib compression-level tuning for PBI output + +### Fixed +- Fixed mishandling of relative BAM filenames in the filename constructor for +DataSet (e.g. DataSet ds("../data.bam")). + +## [0.4.5] - 2016-01-14 + +### Changed +- PbiFilterQuery (and any other PBI-backed query, e.g. ZmwQuery ) now throws if +PBI file(s) missing instead of returning empty result. +- GenomicIntervalQuery now throws if BAI file(s) missing instead of returning +empty result. +- BamFile will throw if file is truncated (e.g. missing the EOF block). Disable +by defining PBBAM_NO_CHECK_EOF . + +## [0.4.4] - 2016-01-07 + +### Added +- bam2sam command line utility.
The primary benefit is removing the dependency +on samtools during tests, but also provides users a functioning BAM -> SAM +converter in the absence of samtools. +- pbmerge command line utility. Allows merging N BAM files into one, optionally +creating the PBI file alongside. +- Added BamRecord::Pkmean2 & Pkmid2, 2D equivalent of Pkmean/Pkmid, for internal +BAMs. + +### Removed +- samtools dependency + +## [0.4.3] - 2015-12-22 + +### Added +- Compile using ccache by default, if available. Can be manually disabled using +-DPacBioBAM_use_ccache=OFF with cmake. +- pbindexdump: command-line utility that converts PBI file data into human- +readable formats. (JSON by default). + +### Changed +- CMake option PacBioBAM_build_pbindex is being deprecated. Use +PacBioBAM_build_tools instead. + +## [0.4.2] - 2015-12-22 + +### Changed +- BamFile::PacBioIndexExists & StandardIndexExists no longer check timestamps. +Copying/moving files around can yield timestamps that are not helpful (no longer +guaranteed that the .pbi will be "newer" than the .bam, even though no content +changed). Added methods (e.g. bool BamFile::PacBioIndexIsNewer()) to do that +lookup if needed, but it is no longer done automatically. + +## [0.4.1] - 2015-12-18 + +### Added +- BamRecord::HasNumPasses + +### Changed +- VirtualPolymeraseBamRecord::VirtualRegionsTable(type) returns an empty vector +of regions if none are associated with the requested type, instead of throwing. + +## [0.4.0] - 2015-12-15 + +### Changed +- Redesigned PbiFilter interface and backend. Previous implementation did not +scale well as intermediate results were far too unwieldy. This redesign provides +speedups of orders of magnitude in many cases. + +## [0.3.2] - 2015-12-10 + +### Added +- Support for ReadGroupInfo sequencing chemistry data. +InvalidSequencingChemistryException thrown if an unsupported combination is +encountered. +- VirtualPolymeraseCompositeReader - for re-stitching records, across multiple +resources (e.g. 
from DataSetXML). Reader respects DataSet filter criteria. + +## [0.3.1] - 2015-10-30 + +### Added +- ZmwWhitelistVirtualReader: similar to VirtualPolymeraseReader but restricts +iteration to a whitelist of ZMW hole numbers, leveraging PBI index data for +random-access. + +### Fixed +- Fixed error in PBI construction, in which entire file sections (e.g. +BarcodeData or MappedData) were being dropped when any one record lacked data. +Correct behavior is to allow file section omission if all records lack that +data type. + +## [0.3.0] - 2015-10-29 + +### Fixed +- Improper reporting of current offset from multi-threaded BamWriter. This had +the effect of creating broken PBIs that were written alongside the BAM. Added a +flush step, which incurs a performance hit, but restores correctness. + +## [0.2.4] - 2015-10-26 + +### Fixed +- Empty PbiFilter now returns all records, instead of filtering away all records. + +## [0.2.3] - 2015-10-26 + +### Added/Fixed +- Syncing DataSetXML across APIs. Primary changes include output of Version +attribute ("3.0.1") on appropriate elements, as well as resolution of namespace +issues. + +## [0.2.2] - 2015-10-22 + +### Added +- Added BAI bin calculation to BamWriter::Write, to ensure maximal compatibility +with downstream tools (e.g. 'samtools index'). A new BinCalculationMode enum +flag in BamWriter constructor controls whether this behavior is enabled [default] +or not. + +## [0.2.1] - 2015-10-19 + +### Added +- Exposed the following classes to public API: + - BamReader + - BaiIndexedBamReader + - PbiIndexedBamReader + - GenomicIntervalCompositeBamReader + - PbiFilterCompositeBamReader + +## [0.2.0] - 2015-10-09 + +### Changed +- BAM spec v3.0.1 compliance. Previous (beta) versions of the BAM spec are not +supported and will cause an exception to be thrown if encountered. +- PBI lookup interface & backend, see PbiIndex.h & PbiLookupData.h for details.
+ +### Added +- BamFile::PacBioIndexExists() & BamFile::StandardIndexExists() - query the +existence of index files without auto-building them if they are missing, as in +BamFile::Ensure*IndexExists(). +- GenomicInterval now accepts an htslib/samtools-style REGION string in the +constructor: GenomicInterval("chr1:1000-2000"). Please note though, that pbbam +uses 0-based coordinates throughout, whereas samtools expects 1-based. The above +string is equivalent to "chr1:1001-2000" in samtools. +- Built-in PBI filters. See PbiFilter.h & PbiFilterTypes.h for built-in filters +and constructing composite filters. These can be used in conjunction with the +new PbiFilterQuery, which takes a generic PbiFilter and applies that to a +DataSet for iteration. +- New built-in queries: BarcodeQuery, ReadAccuracyQuery, SubreadLengthQuery. +These leverage the new filter API to construct a PbiFilter and apply to a +DataSet. +- Built-in BamRecord comparators that are STL-compatible. See Compare.h for full +list. This allows for statements like the following, which sorts records by ZMW +number: +``` c++ + vector<BamRecord> data; + std::sort(data.begin(), data.end(), Compare::Zmw()); +``` +- "exciseSoftClips" option to BamRecord::CigarData() + +## [0.1.0] - 2015-07-17 + +### Changed +- BAM spec v3.0b7 compliance + - Removal of 'M' as allowed CIGAR operation. Attempt to use such a CIGAR op + will throw an exception. + - Addition of IPD/PulseWidth codec version info in header + +### Added +- Auto-generation of UTC timestamp for DataSet objects +- PbiBuilder - allows generation of PBI index data alongside generation or +modification of BAM record data. This obviates the need to wait for a completed +BAM, then go through the zlib decompression, etc. +- Added DataSet::FromXml(string xml) to create DataSets from "raw" XML string, +rather than building up using DataSet API or loading from existing file. +- "pbindex" command line tool to generate ".pbi" files from BAM data.
The +executable is built by default, but can be disabled using the cmake option +"-DPacBioBAM_build_pbindex=OFF". + +### Fixed +- PBI construction failing on CCS reads + +## [0.0.8] - 2015-07-02 + +### Changed +- Build system refactoring. + +## [0.0.7] - 2015-07-02 + +### Added +- PBI index lookup API. Not so much intended for client use directly, but will +enable construction of higher-level semantic queries: grouping by, filtering, +etc. +- DataSet & PBI-aware queries (e.g. ZmwGroupQuery). More PBI-enabled queries to +follow. +- More flexibility in tag access. Samtools has a habit of performing a +"shrink-to-fit" when it handles integer-valued tag data. Thus we cannot +**guarantee** the binary type that our API will have to process. Safe +conversions are allowed on integer-like data only. Under- or overflows in +casting will trigger an exception. All other tag data types must be asked for +explicitly, or else an exception will be raised, as before. +- BamHeader::DeepCopy - allows creation of editable header data, without +overwriting all shared instances + +### Fixed +- XSD compliance for DataSet APIs. + +### Changed +- The functionality provided by ZmwQuery (group by hole number), is now +available using the ZmwGroupQuery object. The new ZmwQuery returns a single- +record iterator (a la EntireFileQuery), but limited to a whitelist of requested +hole numbers. + +### Removed +- XSD non-compliant classes (e.g. ExternalDataReference) + +## [0.0.6] - 2015-06-07 + +### Added + +- Accessor methods for pulse bam support: + - LabelQV() + - AltLabelQV() + - LabelTag() + - AltLabelTag() + - Pkmean() + - Pkmid() + - PrePulseFrames() only RC, no clipping + - PulseCallWidth() only RC, no clipping + - PulseCall() case-sensitive RC, no clipping + - IPDRaw() to avoid up and downscaling for stitching +- BamRecord::ParseTagName and BamRecord::ParseTagString to convert a two + character tag string to a TagName enum and back. Allows a switch over tags. 
+- VirtualPolymeraseReader to create VirtualPolymeraseBamRecord from a + subreads|hqregion+scraps.bam +- VirtualRegion represents annotations of the polymerase reads, for adapters, + barcodes, lqregions, and hqregions. +- ReadGroupInfo operator== + +### Fixed + +- Reimplemented QueryStart(int), QueryEnd(int), UpdateName(void), + ReadGroup(ReadGroupInfo&), ReadGroupId(std::string&); + +## [0.0.5] - 2015-05-29 + +### Added + +- DataSet support. This includes XML I/O, basic dataset query/manipulation, and +multi-BAM-file queries. New classes are located in . DataSet- +capable queries currently reside in the PacBio::BAM::staging namespace. These +will be ported over to the main namespace once the support is stabilized and +works seamlessly with either a single BamFile or DataSet object as input. (bug +25941) +- PBI support. This includes read/write raw data & building from a BamFile. The +lookup API for random-access queries is under development, but the raw data is +available - for creating PBI files & generating summary statistics. (bug 26025) +- C# SWIG bindings, alongside existing Python and R wrappers. +- LocalContextFlags support in BamRecord (bug 26623) + +### Fixed + +- BamRecord[Impl] map quality now initialized with 255 (missing) value, instead +of 0. (bug 26228) +- ReadGroupId calculation. (bug 25940) + +## [0.0.4] - 2015-04-22 + +### Added + +- This changelog. Hope it helps. +- Hook to set verbosity of underlying htslib warnings. +- Grouped queries. (bug 26361) + +### Changed + +- Now using exceptions instead of return codes, output parameters, etc. +- Removed "messy" shared_ptrs across interface (see especially BamHeader). These +are now taken care of within the API, not exposed to client code. + +### Removed + +- BamReader + +### Fixed + +- ASCII tag output. 
(bug 26381) diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..48eeee8 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,75 @@ +######################################################################## +# CMake build script for PacBioBAM library. +######################################################################## + +cmake_policy(SET CMP0048 NEW) # lets us set version in project() +project(PacBioBAM VERSION 0.7.4 LANGUAGES CXX C) +cmake_minimum_required(VERSION 3.0) + +# project name & version +set(PacBioBAM_NAME pbbam) +set(PacBioBAM_VERSION + "${PacBioBAM_VERSION_MAJOR}.${PacBioBAM_VERSION_MINOR}.${PacBioBAM_VERSION_PATCH}" +) + +# list build-time options +option(PacBioBAM_build_docs "Build PacBioBAM's API documentation." ON) +option(PacBioBAM_build_tests "Build PacBioBAM's unit tests." ON) +option(PacBioBAM_build_shared "Build PacBioBAM as shared library as well." OFF) +option(PacBioBAM_build_tools "Build PacBioBAM command line utilities (e.g. pbindex)" ON) +option(PacBioBAM_wrap_csharp "Build PacBioBAM with SWIG bindings for C#." OFF) +option(PacBioBAM_wrap_python "Build PacBioBAM with SWIG bindings for Python." OFF) +option(PacBioBAM_wrap_r "Build PacBioBAM with SWIG bindings for R." OFF) +option(PacBioBAM_use_modbuild "Build PacBioBAM using Modular Build System." OFF) +option(PacBioBAM_use_ccache "Build PacBioBAM using ccache, if available." ON) +option(PacBioBAM_auto_validate "Build PacBioBAM with auto-validation enabled." 
OFF) + +if (PacBioBAM_wrap_csharp OR PacBioBAM_wrap_r OR PacBioBAM_wrap_python) + set(wrapping_swig TRUE) +else() + set(wrapping_swig FALSE) +endif() + +if(PacBioBAM_build_tests) + enable_testing() +endif() + +# project paths +set(PacBioBAM_RootDir ${CMAKE_CURRENT_LIST_DIR}) +set(PacBioBAM_DocsDir ${PacBioBAM_RootDir}/docs) +set(PacBioBAM_IncludeDir ${PacBioBAM_RootDir}/include) +set(PacBioBAM_SourceDir ${PacBioBAM_RootDir}/src) +set(PacBioBAM_SwigSourceDir ${PacBioBAM_RootDir}/src/swig) +set(PacBioBAM_TestsDir ${PacBioBAM_RootDir}/tests) +set(PacBioBAM_ThirdPartyDir ${PacBioBAM_RootDir}/third-party) +set(PacBioBAM_ToolsDir ${PacBioBAM_RootDir}/tools) + +if(NOT PacBioBAM_OutputDir) + set(PacBioBAM_OutputDir ${CMAKE_CURRENT_BINARY_DIR}) +else() + if(${wrapping_swig}) + message(FATAL_ERROR "SWIG bindings not currently supported in modular build.") + endif() +endif() +set(PacBioBAM_BinDir ${PacBioBAM_OutputDir}/bin) +set(PacBioBAM_LibDir ${PacBioBAM_OutputDir}/lib) + +set(GeneratedDir ${CMAKE_BINARY_DIR}/generated) +set(GeneratedTestDataDir ${GeneratedDir}/data) +file(MAKE_DIRECTORY ${PacBioBAM_BinDir}) +file(MAKE_DIRECTORY ${PacBioBAM_LibDir}) +file(MAKE_DIRECTORY ${GeneratedDir}) +file(MAKE_DIRECTORY ${GeneratedTestDataDir}) + +# project configuration (keep this order) +set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}/cmake ${CMAKE_MODULE_PATH}) +include(pbbam-ccache) +include(pbbam-compilerflags) +include(pbbam-libtype) +include(pbbam-dependencies) + +# project components (keep this order) +add_subdirectory(src) +add_subdirectory(tools) +add_subdirectory(docs) +add_subdirectory(tests) diff --git a/INSTALL.md b/INSTALL.md new file mode 100644 index 0000000..86dddda --- /dev/null +++ b/INSTALL.md @@ -0,0 +1,3 @@ +# PacBio::BAM - building & integrating + +Detailed build instructions can be found [here](http://pbbam.readthedocs.org/en/latest/getting_started.html). 
diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 0000000..77e9557 --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,34 @@ +Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted (subject to the limitations in the +disclaimer below) provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Neither the name of Pacific Biosciences nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +SUCH DAMAGE. 
diff --git a/README.md b/README.md new file mode 100644 index 0000000..046296e --- /dev/null +++ b/README.md @@ -0,0 +1,29 @@ +# pbbam + +[![Build Status](https://travis-ci.org/PacificBiosciences/pbbam.svg?branch=master)](https://travis-ci.org/PacificBiosciences/pbbam) [![Documentation Status](https://readthedocs.org/projects/pbbam/badge/?version=latest)](http://pbbam.readthedocs.org/en/latest/?badge=latest) + +As of the 3.0 release of SMRTanalysis, PacBio is embracing the industry standard BAM +format for (both aligned and unaligned) basecall data files. We have also formulated +a BAM companion file format (bam.pbi) enabling fast access to a richer set of per-read +information as well as compatibility for software built around the legacy cmp.h5 format. + +The **pbbam** software package provides components to create, query, & edit PacBio BAM +files and associated indices. These components include a core C++ library, bindings for +additional languages, and command-line utilities. + +### Note: + +This library is **not** intended to be used as a general-purpose BAM utility - all input & output BAMs must adhere to the [PacBio BAM format specification](https://github.com/PacificBiosciences/PacBioFileFormats/blob/3.0/BAM.rst). Non-PacBio BAMs will cause exceptions to be thrown. + +## Documentation + + - [Documentation Home](http://pbbam.readthedocs.org/en/latest/index.html) + - [Getting Started](http://pbbam.readthedocs.org/en/latest/getting_started.html) + - [C++ API Reference](http://pbbam.readthedocs.org/en/latest/api_reference.html) + + - [Changelog](https://github.com/PacificBiosciences/pbbam/blob/master/CHANGELOG.md) + +## License + + - [PacBio open source license](https://github.com/PacificBiosciences/pbbam/blob/master/LICENSE.txt) + diff --git a/cmake/FindCSharp.cmake b/cmake/FindCSharp.cmake new file mode 100644 index 0000000..08d09a7 --- /dev/null +++ b/cmake/FindCSharp.cmake @@ -0,0 +1,72 @@ +# +# A CMake Module for finding and using C# (.NET and Mono). 
+# +# The following variables are set: +# CSHARP_FOUND - set to ON if C# is found +# CSHARP_USE_FILE - the path to the C# use file +# CSHARP_TYPE - the type of the C# compiler (eg. ".NET" or "Mono") +# CSHARP_VERSION - the version of the C# compiler (eg. "v4.0" or "2.10.2") +# CSHARP_COMPILER - the path to the C# compiler executable (eg. "C:/Windows/Microsoft.NET/Framework/v4.0.30319/csc.exe" or "/usr/bin/gmcs") +# CSHARP_INTERPRETER - the path to interpreter needed to run CSharp executables +# CSHARP_PLATFORM - the C# target platform +# CSHARP_SDK - the SDK commandline switch (empty for .NET, for Mono eg. "/sdk:2" or "/sdk:4") +# +# This file is based on the work of GDCM: +# http://gdcm.svn.sf.net/viewvc/gdcm/trunk/CMake/FindCSharp.cmake +# Copyright (c) 2006-2010 Mathieu Malaterre +# + +# TODO: ADD ABILITY TO SELECT WHICH C# COMPILER eg. .NET or Mono (if both exist). For the moment, .NET is selected above Mono. + +# Make sure find package macros are included +include( FindPackageHandleStandardArgs ) + +unset( CSHARP_COMPILER CACHE ) +unset( CSHARP_INTERPRETER CACHE ) +unset( CSHARP_TYPE CACHE ) +unset( CSHARP_VERSION CACHE ) +unset( CSHARP_FOUND CACHE ) + +# By default use anycpu platform, allow the user to override +set( CSHARP_PLATFORM "anycpu" CACHE STRING "C# target platform: x86, x64, anycpu, or itanium" ) +if( NOT ${CSHARP_PLATFORM} MATCHES "x86|x64|anycpu|itanium" ) + message( FATAL_ERROR "The C# target platform '${CSHARP_PLATFORM}' is not valid. 
Please enter one of the following: x86, x64, anycpu, or itanium" ) +endif( ) + +if( WIN32 ) + find_package( DotNetFrameworkSdk ) + if( NOT CSHARP_DOTNET_FOUND ) + find_package( Mono ) + endif( ) +else( UNIX ) + find_package( Mono ) +endif( ) + +if( CSHARP_DOTNET_FOUND ) + set( CSHARP_TYPE ".NET" CACHE STRING "Using the .NET compiler" ) + set( CSHARP_VERSION ${CSHARP_DOTNET_VERSION} CACHE STRING "C# .NET compiler version" FORCE ) + set( CSHARP_COMPILER ${CSHARP_DOTNET_COMPILER_${CSHARP_DOTNET_VERSION}} CACHE STRING "Full path to .NET compiler" FORCE ) + set( CSHARP_INTERPRETER "" CACHE INTERNAL "Interpretor not required for .NET" FORCE ) +elseif( CSHARP_MONO_FOUND ) + set( CSHARP_TYPE "Mono" CACHE STRING "Using the Mono compiler" ) + set( CSHARP_VERSION ${CSHARP_MONO_VERSION} CACHE STRING "C# Mono compiler version" FORCE ) + set( CSHARP_COMPILER ${CSHARP_MONO_COMPILER_${CSHARP_MONO_VERSION}} CACHE STRING "Full path to Mono compiler" FORCE ) + set( CSHARP_INTERPRETER ${CSHARP_MONO_INTERPRETER_${CSHARP_MONO_VERSION}} CACHE STRING "Full path to Mono interpretor" FORCE ) + set( CSHARP_SDK "/sdk:4.5" CACHE STRING "C# Mono SDK commandline switch (e.g. 
/sdk:2, /sdk:4, /sdk:5)" ) +endif( ) + +# Handle WIN32 specific issues +if ( WIN32 ) + if ( CSHARP_COMPILER MATCHES "bat" ) + set( CSHARP_COMPILER "call ${CSHARP_COMPILER}" ) + endif ( ) +endif( ) + +FIND_PACKAGE_HANDLE_STANDARD_ARGS(CSharp DEFAULT_MSG CSHARP_TYPE CSHARP_VERSION CSHARP_COMPILER) + +mark_as_advanced( CSHARP_TYPE CSHARP_VERSION CSHARP_COMPILER CSHARP_INTERPRETER CSHARP_PLATFORM CSHARP_SDK ) + +# Set the USE_FILE path +# http://public.kitware.com/Bug/view.php?id=7757 +get_filename_component( current_list_path ${CMAKE_CURRENT_LIST_FILE} PATH ) +set( CSHARP_USE_FILE ${current_list_path}/UseCSharp.cmake ) diff --git a/cmake/FindDotNetFrameworkSdk.cmake b/cmake/FindDotNetFrameworkSdk.cmake new file mode 100644 index 0000000..8e12c70 --- /dev/null +++ b/cmake/FindDotNetFrameworkSdk.cmake @@ -0,0 +1,29 @@ +# Set paths and vars for .NET compilers +# This is hand-rolled because I had problems with the one from SimpleITK + +# +# The following variables are set: +# CSHARP_DOTNET_FOUND +# CSHARP_DOTNET_COMPILER_${version} eg. "CSHARP_DOTNET_COMPILER_v4.0.30319" +# CSHARP_DOTNET_VERSION eg. "v4.0.30319" +# CSHARP_DOTNET_VERSIONS eg. 
"v2.0.50727, v3.5, v4.0.30319" +# DotNetFrameworkSdk_USE_FILE +# +# CSHARP_PROJECT_BUILDER (xbuild/msbuild) + +set(framework_dir "C:/Windows/Microsoft.NET/Framework") + +set(CSHARP_DOTNET_VERSION "v4.0.30319") +set(CSHARP_DOTNET_VERSIONS "") +set(CSHARP_DOTNET_COMPILER_${CSHARP_DOTNET_VERSION} "${framework_dir}/${CSHARP_DOTNET_VERSION}/csc.exe") +set(CSHARP_PROJECT_BUILDER "${framework_dir}/${CSHARP_DOTNET_VERSION}/MSBuild.exe") + +if(EXISTS ${CSHARP_DOTNET_COMPILER_${CSHARP_DOTNET_VERSION}}) + set(CSHARP_DOTNET_FOUND 1) +else() + set(CSHARP_DOTNET_FOUND 0) +endif() + +# Set USE_FILE +get_filename_component( current_list_path ${CMAKE_CURRENT_LIST_FILE} PATH ) +set( DotNetFrameworkSdk_USE_FILE ${current_list_path}/UseDotNetFrameworkSdk.cmake ) \ No newline at end of file diff --git a/cmake/FindMono.cmake b/cmake/FindMono.cmake new file mode 100644 index 0000000..0fab116 --- /dev/null +++ b/cmake/FindMono.cmake @@ -0,0 +1,167 @@ +# +# A CMake Module for finding Mono. +# +# The following variables are set: +# CSHARP_MONO_FOUND +# CSHARP_MONO_COMPILER_${version} eg. "CSHARP_MONO_COMPILER_2.10.2" +# CSHARP_MONO_INTERPRETOR_${version} eg. "CSHARP_MONO_INTERPRETOR_2.10.2" +# CSHARP_MONO_VERSION eg. "2.10.2" +# CSHARP_MONO_VERSIONS eg. 
"2.10.2, 2.6.7" +# +# Additional references can be found here: +# http://www.mono-project.com/Main_Page +# http://www.mono-project.com/CSharp_Compiler +# http://mono-project.com/FAQ:_Technical (How can I tell where the Mono runtime is installed) +# +# This file is based on the work of GDCM: +# http://gdcm.svn.sf.net/viewvc/gdcm/trunk/CMake/FindMono.cmake +# Copyright (c) 2006-2010 Mathieu Malaterre +# + +set( csharp_mono_valid 1 ) +if( DEFINED CSHARP_MONO_FOUND ) + # The Mono compiler has already been found + # It may have been reset by the user, verify it is correct + if( NOT DEFINED CSHARP_MONO_COMPILER_${CSHARP_MONO_VERSION} ) + set( csharp_mono_version_user ${CSHARP_MONO_VERSION} ) + set( csharp_mono_valid 0 ) + set( CSHARP_MONO_FOUND 0 ) + set( CSHARP_MONO_VERSION "CSHARP_MONO_VERSION-NOTVALID" CACHE STRING "C# Mono compiler version, choices: ${CSHARP_MONO_VERSIONS}" FORCE ) + message( FATAL_ERROR "The C# Mono version '${csharp_mono_version_user}' is not valid. Please enter one of the following: ${CSHARP_MONO_VERSIONS}" ) + endif( NOT DEFINED CSHARP_MONO_COMPILER_${CSHARP_MONO_VERSION} ) +endif( DEFINED CSHARP_MONO_FOUND ) + +unset( CSHARP_MONO_VERSIONS CACHE ) # Clear versions +if( WIN32 ) + # Search for Mono on Win32 systems + # See http://mono-project.com/OldReleases and http://www.go-mono.com/mono-downloads/download.html + set( csharp_mono_bin_dirs ) + set( csharp_mono_search_hints + "[HKEY_LOCAL_MACHINE\\SOFTWARE\\Novell\\Mono\\2.11.2;SdkInstallRoot]/bin" + "[HKEY_LOCAL_MACHINE\\SOFTWARE\\Novell\\Mono\\2.10.9;SdkInstallRoot]/bin" + "[HKEY_LOCAL_MACHINE\\SOFTWARE\\Novell\\Mono\\2.10.8;SdkInstallRoot]/bin" + "[HKEY_LOCAL_MACHINE\\SOFTWARE\\Novell\\Mono\\2.10.7;SdkInstallRoot]/bin" + "[HKEY_LOCAL_MACHINE\\SOFTWARE\\Novell\\Mono\\2.10.6;SdkInstallRoot]/bin" + "[HKEY_LOCAL_MACHINE\\SOFTWARE\\Novell\\Mono\\2.10.5;SdkInstallRoot]/bin" + "[HKEY_LOCAL_MACHINE\\SOFTWARE\\Novell\\Mono\\2.10.4;SdkInstallRoot]/bin" + 
"[HKEY_LOCAL_MACHINE\\SOFTWARE\\Novell\\Mono\\2.10.3;SdkInstallRoot]/bin" + "[HKEY_LOCAL_MACHINE\\SOFTWARE\\Novell\\Mono\\2.10.2;SdkInstallRoot]/bin" + "[HKEY_LOCAL_MACHINE\\SOFTWARE\\Novell\\Mono\\2.10.1;SdkInstallRoot]/bin" + "[HKEY_LOCAL_MACHINE\\SOFTWARE\\Novell\\Mono\\2.10;SdkInstallRoot]/bin" + "[HKEY_LOCAL_MACHINE\\SOFTWARE\\Novell\\Mono\\2.8;SdkInstallRoot]/bin" + "[HKEY_LOCAL_MACHINE\\SOFTWARE\\Novell\\Mono\\2.6.7;SdkInstallRoot]/bin" + "[HKEY_LOCAL_MACHINE\\SOFTWARE\\Novell\\Mono\\2.6.4;SdkInstallRoot]/bin" + "[HKEY_LOCAL_MACHINE\\SOFTWARE\\Novell\\Mono\\2.6.3;SdkInstallRoot]/bin" + "[HKEY_LOCAL_MACHINE\\SOFTWARE\\Novell\\Mono\\2.6.1;SdkInstallRoot]/bin" + "[HKEY_LOCAL_MACHINE\\SOFTWARE\\Novell\\Mono\\2.6;SdkInstallRoot]/bin" + ) + foreach( csharp_mono_search_hint ${csharp_mono_search_hints} ) + get_filename_component( csharp_mono_bin_dir "${csharp_mono_search_hint}" ABSOLUTE ) + if ( EXISTS "${csharp_mono_bin_dir}" ) + set( csharp_mono_bin_dirs ${csharp_mono_bin_dirs} ${csharp_mono_bin_dir} ) + endif ( EXISTS "${csharp_mono_bin_dir}" ) + endforeach( csharp_mono_search_hint ) + # TODO: Use HKLM_LOCAL_MACHINE\Software\Novell\Mono\DefaultCLR to specify default version + # get_filename_component( test "[HKEY_LOCAL_MACHINE\\SOFTWARE\\Novell\\Mono;DefaultCLR]" NAME ) + + foreach ( csharp_mono_bin_dir ${csharp_mono_bin_dirs} ) + string( REPLACE "\\" "/" csharp_mono_bin_dir ${csharp_mono_bin_dir} ) + if (EXISTS "${csharp_mono_bin_dir}/dmcs.bat") + set( csharp_mono_executable "${csharp_mono_bin_dir}/dmcs.bat") + elseif (EXISTS "${csharp_mono_bin_dir}/gmcs.bat") + set( csharp_mono_executable "${csharp_mono_bin_dir}/gmcs.bat") + elseif (EXISTS "${csharp_mono_bin_dir}/mcs.bat") + set( csharp_mono_executable "${csharp_mono_bin_dir}/mcs.bat") + endif (EXISTS "${csharp_mono_bin_dir}/dmcs.bat") + + if( csharp_mono_valid ) + # Extract version number (eg. 
2.10.2) + string(REGEX MATCH "([0-9]*)([.])([0-9]*)([.]*)([0-9]*)" csharp_mono_version_temp ${csharp_mono_bin_dir}) + set( CSHARP_MONO_VERSION ${csharp_mono_version_temp} CACHE STRING "C# Mono compiler version" ) + mark_as_advanced( CSHARP_MONO_VERSION ) + + # Add variable holding executable + set( CSHARP_MONO_COMPILER_${csharp_mono_version_temp} ${csharp_mono_executable} CACHE STRING "C# Mono compiler ${csharp_mono_version_temp}" FORCE ) + mark_as_advanced( CSHARP_MONO_COMPILER_${csharp_mono_version_temp} ) + + # Set interpreter + if (EXISTS "${csharp_mono_bin_dir}/mono.exe") + set( CSHARP_MONO_INTERPRETER_${csharp_mono_version_temp} "${csharp_mono_bin_dir}/mono.exe" CACHE STRING "C# Mono interpreter ${csharp_mono_version_temp}" FORCE ) + mark_as_advanced( CSHARP_MONO_INTERPRETER_${csharp_mono_version_temp} ) + endif (EXISTS "${csharp_mono_bin_dir}/mono.exe") + endif( csharp_mono_valid ) + + # Create a list of supported compiler versions + if( NOT DEFINED CSHARP_MONO_VERSIONS ) + set( CSHARP_MONO_VERSIONS "${csharp_mono_version_temp}" CACHE STRING "Available C# Mono compiler versions" FORCE ) + else( NOT DEFINED CSHARP_MONO_VERSIONS ) + set( CSHARP_MONO_VERSIONS "${CSHARP_MONO_VERSIONS}, ${csharp_mono_version_temp}" CACHE STRING "Available C# Mono versions" FORCE ) + endif( NOT DEFINED CSHARP_MONO_VERSIONS ) + mark_as_advanced( CSHARP_MONO_VERSIONS ) + + # We found at least one Mono compiler version + set( CSHARP_MONO_FOUND 1 CACHE INTERNAL "Boolean indicating if C# Mono was found" ) + endforeach( csharp_mono_bin_dir ) + +else( UNIX ) + # Search for Mono on non-Win32 systems + set( chsarp_mono_names "mcs" "mcs.exe" "dmcs" "dmcs.exe" "smcs" "smcs.exe" "gmcs" "gmcs.exe" ) + set( + csharp_mono_paths + "/usr/bin/" + "/usr/local/bin/" + "/usr/lib/mono/2.0" + "/opt/novell/mono/bin" + ) + find_program( + csharp_mono_compiler # variable is added to the cache, we removed it below + NAMES ${chsarp_mono_names} + PATHS ${csharp_mono_paths} + ) + + if( EXISTS 
${csharp_mono_compiler} ) + # Determine version by running the interpreter (mono -V) and parsing its output + find_program( + csharp_mono_interpreter # variable is added to the cache, we remove it below + NAMES mono + PATHS ${csharp_mono_paths} + ) + if ( EXISTS ${csharp_mono_interpreter} ) + execute_process( + COMMAND ${csharp_mono_interpreter} -V + OUTPUT_VARIABLE csharp_mono_version_string + ) + string( REGEX MATCH "([0-9]*)([.])([0-9]*)([.]*)([0-9]*)" csharp_mono_version_temp ${csharp_mono_version_string} ) + set( CSHARP_MONO_INTERPRETER_${csharp_mono_version_temp} ${csharp_mono_interpreter} CACHE STRING "C# Mono interpreter ${csharp_mono_version_temp}" FORCE ) # keyed by the version just parsed: CSHARP_MONO_VERSION is only set further down + mark_as_advanced( CSHARP_MONO_INTERPRETER_${csharp_mono_version_temp} ) + endif ( EXISTS ${csharp_mono_interpreter} ) + unset( csharp_mono_interpreter CACHE ) + + # We found Mono compiler + set( CSHARP_MONO_VERSION ${csharp_mono_version_temp} CACHE STRING "C# Mono compiler version" ) + mark_as_advanced( CSHARP_MONO_VERSION ) + set( CSHARP_MONO_COMPILER_${CSHARP_MONO_VERSION} ${csharp_mono_compiler} CACHE STRING "C# Mono compiler ${CSHARP_MONO_VERSION}" FORCE ) + mark_as_advanced( CSHARP_MONO_COMPILER_${CSHARP_MONO_VERSION} ) + set( CSHARP_MONO_VERSIONS ${CSHARP_MONO_VERSION} CACHE STRING "Available C# Mono compiler versions" FORCE ) + mark_as_advanced( CSHARP_MONO_VERSIONS ) + set( CSHARP_MONO_FOUND 1 CACHE INTERNAL "Boolean indicating if C# Mono was found" ) + + # Assume xbuild is just xbuild.
+ set(CSHARP_PROJECT_BUILDER "xbuild") + + + endif( EXISTS ${csharp_mono_compiler} ) + + # Remove temp variable from cache + unset( csharp_mono_compiler CACHE ) + +endif( WIN32 ) + +if( CSHARP_MONO_FOUND ) + # Report the found versions + message( STATUS "Found the following C# Mono versions: ${CSHARP_MONO_VERSIONS}" ) +endif( CSHARP_MONO_FOUND ) + +# Set USE_FILE +get_filename_component( current_list_path ${CMAKE_CURRENT_LIST_FILE} PATH ) +set( Mono_USE_FILE ${current_list_path}/UseMono.cmake ) diff --git a/cmake/FindR.cmake b/cmake/FindR.cmake new file mode 100644 index 0000000..6ae4354 --- /dev/null +++ b/cmake/FindR.cmake @@ -0,0 +1,48 @@ + +# +# - This module locates an installed R distribution. +# +# Defines the following: +# +# R_INCLUDE_DIR - Path to R include directory +# R_LIBRARIES - Path to R library +# R_LIBRARY_BASE - +# R_COMMAND - Path to R command +# RSCRIPT_EXECUTABLE - Path to Rscript command +# + + +# Make sure find package macros are included +include( FindPackageHandleStandardArgs ) + +set(TEMP_CMAKE_FIND_APPBUNDLE ${CMAKE_FIND_APPBUNDLE}) +set(CMAKE_FIND_APPBUNDLE "NEVER") +find_program(R_COMMAND R DOC "R executable.") +if(R_COMMAND) + execute_process(WORKING_DIRECTORY . 
COMMAND ${R_COMMAND} RHOME OUTPUT_VARIABLE R_BASE_DIR OUTPUT_STRIP_TRAILING_WHITESPACE) + set(R_HOME ${R_BASE_DIR} CACHE PATH "R home directory obtained from R RHOME") + mark_as_advanced(R_HOME) +endif(R_COMMAND) + +find_program(RSCRIPT_EXECUTABLE Rscript DOC "Rscript executable.") + +set(CMAKE_FIND_APPBUNDLE ${TEMP_CMAKE_FIND_APPBUNDLE}) + +# R.h gets installed in all sorts of places - +# ubuntu: /usr/share/R/include, RHEL/Fedora: /usr/include/R/R.h +find_path(R_INCLUDE_DIR R.h PATHS ${R_INCLUDE_DIR_HINT} /usr/local/lib /usr/local/lib64 /usr/share /usr/include ${R_BASE_DIR} PATH_SUFFIXES include R R/include DOC "Path to file R.h") +find_library(R_LIBRARY_BASE R PATHS ${R_BASE_DIR} PATH_SUFFIXES /lib DOC "R library (example libR.a, libR.dylib, etc.).") + +set(R_LIBRARIES ${R_LIBRARY_BASE}) +mark_as_advanced(RSCRIPT_EXECUTABLE R_LIBRARIES R_INCLUDE_DIR R_COMMAND R_LIBRARY_BASE) + + +set( _REQUIRED_R_VARIABLES R_INCLUDE_DIR R_COMMAND ) + +if( APPLE ) + # On linux platform some times the libR.so is not available, however + # on apple a link error results if the library is linked. 
+ list( APPEND _REQUIRED_R_VARIABLES R_LIBRARIES R_LIBRARY_BASE ) +endif() + +find_package_handle_standard_args(R DEFAULT_MSG ${_REQUIRED_R_VARIABLES} ) diff --git a/cmake/PbbamTool.cmake b/cmake/PbbamTool.cmake new file mode 100644 index 0000000..daed917 --- /dev/null +++ b/cmake/PbbamTool.cmake @@ -0,0 +1,23 @@ +include(CMakeParseArguments) + +function(create_pbbam_tool) + + # parse args + set(oneValueArgs TARGET) + set(multiValueArgs SOURCES) + cmake_parse_arguments(create_pbbam_tool "" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + # create executable + include_directories( + ${ToolsCommonDir} # shared tool code + ${GeneratedDir} # generated version headers + ${PacBioBAM_INCLUDE_DIRS} # pbbam/htslib includes + ) + add_executable(${create_pbbam_tool_TARGET} ${create_pbbam_tool_SOURCES}) + set_target_properties( + ${create_pbbam_tool_TARGET} PROPERTIES + RUNTIME_OUTPUT_DIRECTORY ${PacBioBAM_BinDir} + ) + target_link_libraries(${create_pbbam_tool_TARGET} pbbam) + +endfunction(create_pbbam_tool) diff --git a/cmake/UseCSharp.cmake b/cmake/UseCSharp.cmake new file mode 100644 index 0000000..dac4537 --- /dev/null +++ b/cmake/UseCSharp.cmake @@ -0,0 +1,111 @@ +# CMake Module for finding and using C# (.NET and Mono). +# +# The following global variables are assumed to exist: +# CSHARP_SOURCE_DIRECTORY - path to C# sources +# CSHARP_BINARY_DIRECTORY - path to place resultant C# binary files +# +# The following variables are set: +# CSHARP_TYPE - the type of the C# compiler (eg. ".NET" or "Mono") +# CSHARP_COMPILER - the path to the C# compiler executable (eg. "C:/Windows/Microsoft.NET/Framework/v4.0.30319/csc.exe") +# CSHARP_VERSION - the version number of the C# compiler (eg. 
"v4.0.30319") +# +# The following macros are defined: +# CSHARP_ADD_EXECUTABLE( name references [files] [output_dir] ) - Define C# executable with the given name +# CSHARP_ADD_LIBRARY( name references [files] [output_dir] ) - Define C# library with the given name +# +# Examples: +# CSHARP_ADD_EXECUTABLE( MyExecutable "" "Program.cs" ) +# CSHARP_ADD_EXECUTABLE( MyExecutable "ref1.dll ref2.dll" "Program.cs File1.cs" ) +# CSHARP_ADD_EXECUTABLE( MyExecutable "ref1.dll;ref2.dll" "Program.cs;File1.cs" ) +# +# This file is based on the work of GDCM: +# http://gdcm.svn.sf.net/viewvc/gdcm/trunk/CMake/UseCSharp.cmake +# Copyright (c) 2006-2010 Mathieu Malaterre +# + +# TODO: ADD SUPPORT FOR LINK LIBRARIES + +# Check something was found +if( NOT CSHARP_COMPILER ) + message( WARNING "A C# compiler executable was not found on your system" ) +endif( NOT CSHARP_COMPILER ) + +# Include type-based USE_FILE +if( CSHARP_TYPE MATCHES ".NET" ) + include( ${DotNetFrameworkSdk_USE_FILE} ) +elseif ( CSHARP_TYPE MATCHES "Mono" ) + include( ${Mono_USE_FILE} ) +endif ( CSHARP_TYPE MATCHES ".NET" ) + +macro( CSHARP_ADD_LIBRARY name ) + CSHARP_ADD_PROJECT( "library" ${name} ${ARGN} ) +endmacro( CSHARP_ADD_LIBRARY ) + +macro( CSHARP_ADD_EXECUTABLE name ) + CSHARP_ADD_PROJECT( "exe" ${name} ${ARGN} ) +endmacro( CSHARP_ADD_EXECUTABLE ) + +# Private macro +macro( CSHARP_ADD_PROJECT type name ) + set( refs "/reference:System.dll" ) + set( sources ) + set( sources_dep ) + + if( ${type} MATCHES "library" ) + set( output "dll" ) + elseif( ${type} MATCHES "exe" ) + set( output "exe" ) + endif( ${type} MATCHES "library" ) + + # Step through each argument + foreach( it ${ARGN} ) + if( ${it} MATCHES "(.*)(dll)" ) + # Argument is a dll, add reference + list( APPEND refs /reference:${it} ) + else( ) + # Argument is a source file + if( EXISTS ${it} ) + list( APPEND sources ${it} ) + list( APPEND sources_dep ${it} ) + elseif( EXISTS ${CSHARP_SOURCE_DIRECTORY}/${it} ) + list( APPEND sources 
${CSHARP_SOURCE_DIRECTORY}/${it} ) + list( APPEND sources_dep ${CSHARP_SOURCE_DIRECTORY}/${it} ) + elseif( ${it} MATCHES "[*]" ) + # For dependencies, we need to expand wildcards + FILE( GLOB it_glob ${it} ) + list( APPEND sources ${it} ) + list( APPEND sources_dep ${it_glob} ) + endif( ) + endif ( ) + endforeach( ) + + # Check we have at least one source + list( LENGTH sources_dep sources_length ) + if ( ${sources_length} LESS 1 ) + MESSAGE( SEND_ERROR "No C# sources were specified for ${type} ${name}" ) + endif () + list( SORT sources_dep ) + + # Perform platform specific actions (normalise path separators for the compiler command line) + if (WIN32) + string( REPLACE "/" "\\" sources "${sources}" ) # quoted: unquoted list items are concatenated into a single string + else (UNIX) + string( REPLACE "\\" "/" sources "${sources}" ) # quoted for the same reason; also keeps the call well-formed when the list is empty + endif (WIN32) + + # Add custom target and command + MESSAGE( STATUS "Adding C# ${type} ${name}: '${CSHARP_COMPILER} /t:${type} /out:${name}.${output} /platform:${CSHARP_PLATFORM} ${CSHARP_SDK} ${refs} ${sources}'" ) + add_custom_command( + COMMENT "Compiling C# ${type} ${name}: '${CSHARP_COMPILER} /t:${type} /out:${name}.${output} /platform:${CSHARP_PLATFORM} ${CSHARP_SDK} ${refs} ${sources}'" + OUTPUT ${CSHARP_BINARY_DIRECTORY}/${name}.${output} + COMMAND ${CSHARP_COMPILER} + ARGS /t:${type} /out:${name}.${output} /platform:${CSHARP_PLATFORM} ${CSHARP_SDK} ${refs} ${sources} + WORKING_DIRECTORY ${CSHARP_BINARY_DIRECTORY} + DEPENDS ${sources_dep} + ) + add_custom_target( + ${name} ALL + DEPENDS ${CSHARP_BINARY_DIRECTORY}/${name}.${output} + SOURCES ${sources_dep} + ) +endmacro( CSHARP_ADD_PROJECT ) diff --git a/cmake/UseDotNetFrameworkSdk.cmake b/cmake/UseDotNetFrameworkSdk.cmake new file mode 100644 index 0000000..6be4027 --- /dev/null +++ b/cmake/UseDotNetFrameworkSdk.cmake @@ -0,0 +1,16 @@ +# +# A CMake Module for using the .NET Framework SDK.
+# +# The following variables are set: +# (none) +# +# Additional references can be found here: +# http://www.mono-project.com/Main_Page +# http://www.mono-project.com/CSharp_Compiler +# +# This file is based on the work of GDCM: +# http://gdcm.svn.sf.net/viewvc/gdcm/trunk/CMake/FindMono.cmake +# Copyright (c) 2006-2010 Mathieu Malaterre +# + +message( STATUS "Using .NET compiler version ${CSHARP_DOTNET_VERSION}" ) \ No newline at end of file diff --git a/cmake/UseMono.cmake b/cmake/UseMono.cmake new file mode 100644 index 0000000..16a80ae --- /dev/null +++ b/cmake/UseMono.cmake @@ -0,0 +1,16 @@ +# +# A CMake Module for using Mono. +# +# The following variables are set: +# (none) +# +# Additional references can be found here: +# http://www.mono-project.com/Main_Page +# http://www.mono-project.com/CSharp_Compiler +# +# This file is based on the work of GDCM: +# http://gdcm.svn.sf.net/viewvc/gdcm/trunk/CMake/FindMono.cmake +# Copyright (c) 2006-2010 Mathieu Malaterre +# + +message( STATUS "Using Mono compiler version ${CSHARP_MONO_VERSION}" ) diff --git a/cmake/pbbam-ccache.cmake b/cmake/pbbam-ccache.cmake new file mode 100644 index 0000000..21b8ac5 --- /dev/null +++ b/cmake/pbbam-ccache.cmake @@ -0,0 +1,8 @@ + +if(PacBioBAM_use_ccache) + find_program(CCACHE_FOUND ccache) + if(CCACHE_FOUND) + set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache) + set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK ccache) + endif() +endif() diff --git a/cmake/pbbam-compilerflags.cmake b/cmake/pbbam-compilerflags.cmake new file mode 100644 index 0000000..fcfd321 --- /dev/null +++ b/cmake/pbbam-compilerflags.cmake @@ -0,0 +1,44 @@ + +include(CheckCXXCompilerFlag) + +# C++11 check & enabling (CMake older than 3.1 has no CMAKE_CXX_STANDARD, so pass the flags by hand) +if (CMAKE_VERSION VERSION_LESS "3.1") + if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -stdlib=libc++") # clang: -stdlib alone does not select the language standard + else() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") # gcc + endif() +else() # 3.1+ + set(CMAKE_CXX_STANDARD 11) +
set(CMAKE_CXX_STANDARD_REQUIRED ON) +endif() + +# shared CXX flags for src & tests +if (MSVC) + set(PacBioBAM_CXX_FLAGS "/Wall") +else() + set(PacBioBAM_CXX_FLAGS "-Wall") +endif() + +# NOTE: -Wno-unused-local-typedefs used to quash clang warnings w/ Boost +check_cxx_compiler_flag("-Wno-unused-local-typedefs" HAS_NO_UNUSED_LOCAL_TYPEDEFS) +if(HAS_NO_UNUSED_LOCAL_TYPEDEFS) + set(PacBioBAM_CXX_FLAGS "${PacBioBAM_CXX_FLAGS} -Wno-unused-local-typedefs") +endif() + +check_cxx_compiler_flag("-Wno-sign-compare" HAS_NO_SIGN_COMPARE) +if(HAS_NO_SIGN_COMPARE) + set(PacBioBAM_CXX_FLAGS "${PacBioBAM_CXX_FLAGS} -Wno-sign-compare") +endif() + +# Turn on windows-style filepath resolution. +# We need to add this #define early (not just in the C# SWIG wrapper) +if(WIN32 AND PacBioBAM_wrap_csharp) + add_definitions(-DPBBAM_WIN_FILEPATHS) +endif() + +# For now, keep @rpath out of install names on OS X, as it causes SWIG +# tests to fail. +if(APPLE) + set(CMAKE_MACOSX_RPATH OFF) +endif() diff --git a/cmake/pbbam-dependencies.cmake b/cmake/pbbam-dependencies.cmake new file mode 100644 index 0000000..c2e21e6 --- /dev/null +++ b/cmake/pbbam-dependencies.cmake @@ -0,0 +1,23 @@ + +# pthreads +find_package(Threads REQUIRED) + +# boost +if(NOT Boost_INCLUDE_DIRS) + find_package(Boost REQUIRED) +endif() + +# Winsock for htslib on Windows +if(WIN32) + set(SOCKET_LIBRARIES "ws2_32") +endif() + +# zlib +if(NOT ZLIB_INCLUDE_DIRS OR NOT ZLIB_LIBRARIES) + find_package(ZLIB REQUIRED) +endif() + +# htslib +if(NOT HTSLIB_INCLUDE_DIRS OR NOT HTSLIB_LIBRARIES) + add_subdirectory(third-party/htslib external/htslib) +endif() diff --git a/cmake/pbbam-libtype.cmake b/cmake/pbbam-libtype.cmake new file mode 100644 index 0000000..4b9c0dd --- /dev/null +++ b/cmake/pbbam-libtype.cmake @@ -0,0 +1,21 @@ + +# determine if we need a shared lib +if(PacBioBAM_build_shared OR ${wrapping_swig}) + set(BUILD_SHARED_LIBS ON) + set(htslib_build_shared ON CACHE BOOL "force htslibConfig to export proper library name") + 
set(PB_LIB_MODE SHARED) + set(PB_LIB_SUFFIX ${CMAKE_SHARED_LIBRARY_SUFFIX}) +else() + set(BUILD_SHARED_LIBS OFF) + set(PB_LIB_MODE STATIC) + set(PB_LIB_SUFFIX ${CMAKE_STATIC_LIBRARY_SUFFIX}) +endif() + +if(WIN32) + # Limit the number of DLLs we will have to bundle: link the GCC runtimes statically into shared libraries and modules + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -static-libgcc -static-libstdc++") + set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} -static-libgcc -static-libstdc++") +endif() + + + diff --git a/docs/CMakeLists.txt b/docs/CMakeLists.txt new file mode 100644 index 0000000..ff044b9 --- /dev/null +++ b/docs/CMakeLists.txt @@ -0,0 +1,11 @@ +find_package(Doxygen) + +if(DOXYGEN_FOUND) + configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/Doxyfile.in ${PacBioBAM_DocsDir}/Doxyfile @ONLY ) + add_custom_target(doc + ${DOXYGEN_EXECUTABLE} ${PacBioBAM_DocsDir}/Doxyfile + WORKING_DIRECTORY ${PacBioBAM_DocsDir} + COMMENT "Generating API documentation with Doxygen" + VERBATIM + ) +endif() diff --git a/docs/Doxyfile.in b/docs/Doxyfile.in new file mode 100644 index 0000000..90f6f63 --- /dev/null +++ b/docs/Doxyfile.in @@ -0,0 +1,1602 @@ +# Doxyfile 1.6.3 + +# This file describes the settings to be used by the documentation system +# doxygen (www.doxygen.org) for a project +# +# All text after a hash (#) is considered a comment and will be ignored +# The format is: +# TAG = value [value, ...] +# For lists items can also be appended using: +# TAG += value [value, ...] +# Values that contain spaces should be placed between quotes (" ") + +#--------------------------------------------------------------------------- +# Project related configuration options +#--------------------------------------------------------------------------- + +# This tag specifies the encoding used for all characters in the config file +# that follow. The default is UTF-8 which is also the encoding used for all +# text before the first occurrence of this tag.
Doxygen uses libiconv (or the +# iconv built into libc) for the transcoding. See +# http://www.gnu.org/software/libiconv for the list of possible encodings. + +DOXYFILE_ENCODING = UTF-8 + +# The PROJECT_NAME tag is a single word (or a sequence of words surrounded +# by quotes) that should identify the project. + +PROJECT_NAME = @PacBioBAM_NAME@ + +# The PROJECT_NUMBER tag can be used to enter a project or revision number. +# This could be handy for archiving the generated documentation or +# if some version control system is used. + +PROJECT_NUMBER = @PacBioBAM_VERSION@ + +# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) +# base path where the generated documentation will be put. +# If a relative path is entered, it will be relative to the location +# where doxygen was started. If left blank the current directory will be used. + +OUTPUT_DIRECTORY = @PacBioBAM_DocsDir@ + +# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create +# 4096 sub-directories (in 2 levels) under the output directory of each output +# format and will distribute the generated files over these directories. +# Enabling this option can be useful when feeding doxygen a huge amount of +# source files, where putting all generated files in the same directory would +# otherwise cause performance problems for the file system. + +CREATE_SUBDIRS = NO + +# The OUTPUT_LANGUAGE tag is used to specify the language in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all constant output in the proper language. 
+# The default language is English, other supported languages are: +# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional, +# Croatian, Czech, Danish, Dutch, Esperanto, Farsi, Finnish, French, German, +# Greek, Hungarian, Italian, Japanese, Japanese-en (Japanese with English +# messages), Korean, Korean-en, Lithuanian, Norwegian, Macedonian, Persian, +# Polish, Portuguese, Romanian, Russian, Serbian, Serbian-Cyrilic, Slovak, +# Slovene, Spanish, Swedish, Ukrainian, and Vietnamese. + +OUTPUT_LANGUAGE = English + +# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will +# include brief member descriptions after the members that are listed in +# the file and class documentation (similar to JavaDoc). +# Set to NO to disable this. + +BRIEF_MEMBER_DESC = YES + +# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend +# the brief description of a member or function before the detailed description. +# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the +# brief descriptions will be completely suppressed. + +REPEAT_BRIEF = YES + +# This tag implements a quasi-intelligent brief description abbreviator +# that is used to form the text in various listings. Each string +# in this list, if found as the leading text of the brief description, will be +# stripped from the text and the result after processing the whole list, is +# used as the annotated text. Otherwise, the brief description is used as-is. 
+# If left blank, the following values are used ("$name" is automatically +# replaced with the name of the entity): "The $name class" "The $name widget" +# "The $name file" "is" "provides" "specifies" "contains" +# "represents" "a" "an" "the" + +ABBREVIATE_BRIEF = "The $name class" \ + "The $name widget" \ + "The $name file" \ + is \ + provides \ + specifies \ + contains \ + represents \ + a \ + an \ + the + +# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then +# Doxygen will generate a detailed section even if there is only a brief +# description. + +ALWAYS_DETAILED_SEC = NO + +# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all +# inherited members of a class in the documentation of that class as if those +# members were ordinary class members. Constructors, destructors and assignment +# operators of the base classes will not be shown. + +INLINE_INHERITED_MEMB = NO + +# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full +# path before files name in the file list and in the header files. If set +# to NO the shortest path that makes the file name unique will be used. + +FULL_PATH_NAMES = YES + +# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag +# can be used to strip a user-defined part of the path. Stripping is +# only done if one of the specified strings matches the left-hand part of +# the path. The tag can be used to show relative paths in the file list. +# If left blank the directory from which doxygen is run is used as the +# path to strip. + +STRIP_FROM_PATH = + +# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of +# the path mentioned in the documentation of a class, which tells +# the reader which header file to include in order to use a class. +# If left blank only the name of the header file containing the class +# definition is used. Otherwise one should specify the include paths that +# are normally passed to the compiler using the -I flag. 
+ +STRIP_FROM_INC_PATH = @PacBioBAM_IncludeDir@ + +# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter +# (but less readable) file names. This can be useful if your file system +# doesn't support long names like on DOS, Mac, or CD-ROM. + +SHORT_NAMES = NO + +# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen +# will interpret the first line (until the first dot) of a JavaDoc-style +# comment as the brief description. If set to NO, the JavaDoc +# comments will behave just like regular Qt-style comments +# (thus requiring an explicit @brief command for a brief description.) + +JAVADOC_AUTOBRIEF = NO + +# If the QT_AUTOBRIEF tag is set to YES then Doxygen will +# interpret the first line (until the first dot) of a Qt-style +# comment as the brief description. If set to NO, the comments +# will behave just like regular Qt-style comments (thus requiring +# an explicit \brief command for a brief description.) + +QT_AUTOBRIEF = NO + +# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen +# treat a multi-line C++ special comment block (i.e. a block of //! or /// +# comments) as a brief description. This used to be the default behaviour. +# The new default is to treat a multi-line C++ comment block as a detailed +# description. Set this tag to YES if you prefer the old behaviour instead. + +MULTILINE_CPP_IS_BRIEF = NO + +# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented +# member inherits the documentation from any documented member that it +# re-implements. + +INHERIT_DOCS = YES + +# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce +# a new page for each member. If set to NO, the documentation of a member will +# be part of the file/class/namespace that contains it. + +SEPARATE_MEMBER_PAGES = NO + +# The TAB_SIZE tag can be used to set the number of spaces in a tab. +# Doxygen uses this value to replace tabs by spaces in code fragments.
+ +TAB_SIZE = 1 + +# This tag can be used to specify a number of aliases that acts +# as commands in the documentation. An alias has the form "name=value". +# For example adding "sideeffect=\par Side Effects:\n" will allow you to +# put the command \sideeffect (or @sideeffect) in the documentation, which +# will result in a user-defined paragraph with heading "Side Effects:". +# You can put \n's in the value part of an alias to insert newlines. + +ALIASES = + +#samSpecURL=http://samtools.sourceforge.net/SAM1.pdf + +# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C +# sources only. Doxygen will then generate output that is more tailored for C. +# For instance, some of the names that are used will be different. The list +# of all members will be omitted, etc. + +OPTIMIZE_OUTPUT_FOR_C = NO + +# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java +# sources only. Doxygen will then generate output that is more tailored for +# Java. For instance, namespaces will be presented as packages, qualified +# scopes will look different, etc. + +OPTIMIZE_OUTPUT_JAVA = NO + +# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran +# sources only. Doxygen will then generate output that is more tailored for +# Fortran. + +OPTIMIZE_FOR_FORTRAN = NO + +# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL +# sources. Doxygen will then generate output that is tailored for +# VHDL. + +OPTIMIZE_OUTPUT_VHDL = NO + +# Doxygen selects the parser to use depending on the extension of the files it parses. +# With this tag you can assign which parser to use for a given extension. +# Doxygen has a built-in mapping, but you can override or extend it using this tag. +# The format is ext=language, where ext is a file extension, and language is one of +# the parsers supported by doxygen: IDL, Java, Javascript, C#, C, C++, D, PHP, +# Objective-C, Python, Fortran, VHDL, C, C++. 
For instance to make doxygen treat +# .inc files as Fortran files (default is PHP), and .f files as C (default is Fortran), +# use: inc=Fortran f=C. Note that for custom extensions you also need to set +# FILE_PATTERNS otherwise the files are not read by doxygen. + +EXTENSION_MAPPING = + +# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want +# to include (a tag file for) the STL sources as input, then you should +# set this tag to YES in order to let doxygen match functions declarations and +# definitions whose arguments contain STL classes (e.g. func(std::string); v.s. +# func(std::string) {}). This also make the inheritance and collaboration +# diagrams that involve STL classes more complete and accurate. + +BUILTIN_STL_SUPPORT = YES + +# If you use Microsoft's C++/CLI language, you should set this option to YES to +# enable parsing support. + +CPP_CLI_SUPPORT = NO + +# Set the SIP_SUPPORT tag to YES if your project consists of sip sources only. +# Doxygen will parse them like normal C++ but will assume all classes use public +# instead of private inheritance when no explicit protection keyword is present. + +SIP_SUPPORT = NO + +# For Microsoft's IDL there are propget and propput attributes to indicate getter +# and setter methods for a property. Setting this option to YES (the default) +# will make doxygen to replace the get and set methods by a property in the +# documentation. This will only work if the methods are indeed getting or +# setting a simple type. If this is not the case, or you want to show the +# methods anyway, you should set this option to NO. + +IDL_PROPERTY_SUPPORT = YES + +# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC +# tag is set to YES, then doxygen will reuse the documentation of the first +# member in the group (if any) for the other members of the group. By default +# all members of a group must be documented explicitly. 
+ +DISTRIBUTE_GROUP_DOC = NO + +# Set the SUBGROUPING tag to YES (the default) to allow class member groups of +# the same type (for instance a group of public functions) to be put as a +# subgroup of that type (e.g. under the Public Functions section). Set it to +# NO to prevent subgrouping. Alternatively, this can be done per class using +# the \nosubgrouping command. + +SUBGROUPING = YES + +# When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum +# is documented as struct, union, or enum with the name of the typedef. So +# typedef struct TypeS {} TypeT, will appear in the documentation as a struct +# with name TypeT. When disabled the typedef will appear as a member of a file, +# namespace, or class. And the struct will be named TypeS. This can typically +# be useful for C code in case the coding convention dictates that all compound +# types are typedef'ed and only the typedef is referenced, never the tag name. + +TYPEDEF_HIDES_STRUCT = NO + +# The SYMBOL_CACHE_SIZE determines the size of the internal cache used to +# determine which symbols to keep in memory and which to flush to disk. +# When the cache is full, less often used symbols will be written to disk. +# For small to medium size projects (<1000 input files) the default value is +# probably good enough. For larger projects a too small cache size can cause +# doxygen to be busy swapping symbols to and from disk most of the time +# causing a significant performance penalty. +# If the system has enough physical memory increasing the cache will improve the +# performance by keeping more symbols in memory. Note that the value works on +# a logarithmic scale so increasing the size by one will roughly double the +# memory usage. The cache size is given by this formula: +# 2^(16+SYMBOL_CACHE_SIZE).
The valid range is 0..9, the default is 0, +# corresponding to a cache size of 2^16 = 65536 symbols + +SYMBOL_CACHE_SIZE = 0 + +#--------------------------------------------------------------------------- +# Build related configuration options +#--------------------------------------------------------------------------- + +# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in +# documentation are documented, even if no documentation was available. +# Private class members and static file members will be hidden unless +# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES + +EXTRACT_ALL = NO + +# If the EXTRACT_PRIVATE tag is set to YES all private members of a class +# will be included in the documentation. + +EXTRACT_PRIVATE = NO + +# If the EXTRACT_STATIC tag is set to YES all static members of a file +# will be included in the documentation. + +EXTRACT_STATIC = YES + +# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) +# defined locally in source files will be included in the documentation. +# If set to NO only classes defined in header files are included. + +EXTRACT_LOCAL_CLASSES = NO + +# This flag is only useful for Objective-C code. When set to YES local +# methods, which are defined in the implementation section but not in +# the interface are included in the documentation. +# If set to NO (the default) only methods in the interface are included. + +EXTRACT_LOCAL_METHODS = NO + +# If this flag is set to YES, the members of anonymous namespaces will be +# extracted and appear in the documentation as a namespace called +# 'anonymous_namespace{file}', where file will be replaced with the base +# name of the file that contains the anonymous namespace. By default +# anonymous namespace are hidden. + +EXTRACT_ANON_NSPACES = NO + +# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all +# undocumented members of documented classes, files or namespaces. 
+# If set to NO (the default) these members will be included in the +# various overviews, but no documentation section is generated. +# This option has no effect if EXTRACT_ALL is enabled. + +HIDE_UNDOC_MEMBERS = NO + +# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all +# undocumented classes that are normally visible in the class hierarchy. +# If set to NO (the default) these classes will be included in the various +# overviews. This option has no effect if EXTRACT_ALL is enabled. + +HIDE_UNDOC_CLASSES = NO + +# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all +# friend (class|struct|union) declarations. +# If set to NO (the default) these declarations will be included in the +# documentation. + +HIDE_FRIEND_COMPOUNDS = NO + +# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any +# documentation blocks found inside the body of a function. +# If set to NO (the default) these blocks will be appended to the +# function's detailed documentation block. + +HIDE_IN_BODY_DOCS = NO + +# The INTERNAL_DOCS tag determines if documentation +# that is typed after a \internal command is included. If the tag is set +# to NO (the default) then the documentation will be excluded. +# Set it to YES to include the internal documentation. + +INTERNAL_DOCS = NO + +# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate +# file names in lower-case letters. If set to YES upper-case letters are also +# allowed. This is useful if you have classes or files whose names only differ +# in case and if your file system supports case sensitive file names. Windows +# and Mac users are advised to set this option to NO. + +CASE_SENSE_NAMES = NO + +# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen +# will show members with their full class and namespace scopes in the +# documentation. If set to YES the scope will be hidden. 
+ +HIDE_SCOPE_NAMES = NO + +# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen +# will put a list of the files that are included by a file in the documentation +# of that file. + +SHOW_INCLUDE_FILES = YES + +# If the FORCE_LOCAL_INCLUDES tag is set to YES then Doxygen +# will list include files with double quotes in the documentation +# rather than with sharp brackets. + +FORCE_LOCAL_INCLUDES = NO + +# If the INLINE_INFO tag is set to YES (the default) then a tag [inline] +# is inserted in the documentation for inline members. + +INLINE_INFO = YES + +# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen +# will sort the (detailed) documentation of file and class members +# alphabetically by member name. If set to NO the members will appear in +# declaration order. + +SORT_MEMBER_DOCS = YES + +# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the +# brief documentation of file, namespace and class members alphabetically +# by member name. If set to NO (the default) the members will appear in +# declaration order. + +SORT_BRIEF_DOCS = NO + +# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen +# will sort the (brief and detailed) documentation of class members so that +# constructors and destructors are listed first. If set to NO (the default) +# the constructors will appear in the respective orders defined by +# SORT_MEMBER_DOCS and SORT_BRIEF_DOCS. +# This tag will be ignored for brief docs if SORT_BRIEF_DOCS is set to NO +# and ignored for detailed docs if SORT_MEMBER_DOCS is set to NO. + +SORT_MEMBERS_CTORS_1ST = NO + +# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the +# hierarchy of group names into alphabetical order. If set to NO (the default) +# the group names will appear in their defined order. + +SORT_GROUP_NAMES = NO + +# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be +# sorted by fully-qualified names, including namespaces. 
If set to +# NO (the default), the class list will be sorted only by class name, +# not including the namespace part. +# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. +# Note: This option applies only to the class list, not to the +# alphabetical list. + +SORT_BY_SCOPE_NAME = NO + +# The GENERATE_TODOLIST tag can be used to enable (YES) or +# disable (NO) the todo list. This list is created by putting \todo +# commands in the documentation. + +GENERATE_TODOLIST = YES + +# The GENERATE_TESTLIST tag can be used to enable (YES) or +# disable (NO) the test list. This list is created by putting \test +# commands in the documentation. + +GENERATE_TESTLIST = YES + +# The GENERATE_BUGLIST tag can be used to enable (YES) or +# disable (NO) the bug list. This list is created by putting \bug +# commands in the documentation. + +GENERATE_BUGLIST = YES + +# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or +# disable (NO) the deprecated list. This list is created by putting +# \deprecated commands in the documentation. + +GENERATE_DEPRECATEDLIST= YES + +# The ENABLED_SECTIONS tag can be used to enable conditional +# documentation sections, marked by \if sectionname ... \endif. + +ENABLED_SECTIONS = + +# The MAX_INITIALIZER_LINES tag determines the maximum number of lines +# the initial value of a variable or define consists of for it to appear in +# the documentation. If the initializer consists of more lines than specified +# here it will be hidden. Use a value of 0 to hide initializers completely. +# The appearance of the initializer of individual variables and defines in the +# documentation can be controlled using \showinitializer or \hideinitializer +# command in the documentation regardless of this setting. + +MAX_INITIALIZER_LINES = 30 + +# Set the SHOW_USED_FILES tag to NO to disable the list of files generated +# at the bottom of the documentation of classes and structs. 
If set to YES the +# list will mention the files that were used to generate the documentation. + +SHOW_USED_FILES = YES + +# If the sources in your project are distributed over multiple directories +# then setting the SHOW_DIRECTORIES tag to YES will show the directory hierarchy +# in the documentation. The default is NO. + +SHOW_DIRECTORIES = YES + +# Set the SHOW_FILES tag to NO to disable the generation of the Files page. +# This will remove the Files entry from the Quick Index and from the +# Folder Tree View (if specified). The default is YES. + +SHOW_FILES = YES + +# Set the SHOW_NAMESPACES tag to NO to disable the generation of the +# Namespaces page. This will remove the Namespaces entry from the Quick Index +# and from the Folder Tree View (if specified). The default is YES. + +SHOW_NAMESPACES = YES + +# The FILE_VERSION_FILTER tag can be used to specify a program or script that +# doxygen should invoke to get the current version for each file (typically from +# the version control system). Doxygen will invoke the program by executing (via +# popen()) the command <command> <input-file>, where <command> is the value of +# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file +# provided by doxygen. Whatever the program writes to standard output +# is used as the file version. See the manual for examples. + +FILE_VERSION_FILTER = + +# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed by +# doxygen. The layout file controls the global structure of the generated output files +# in an output format independent way. To create the layout file that represents +# doxygen's defaults, run doxygen with the -l option. You can optionally specify a +# file name after the option, if omitted DoxygenLayout.xml will be used as the name +# of the layout file.
+ +LAYOUT_FILE = + +#--------------------------------------------------------------------------- +# configuration options related to warning and progress messages +#--------------------------------------------------------------------------- + +# The QUIET tag can be used to turn on/off the messages that are generated +# by doxygen. Possible values are YES and NO. If left blank NO is used. + +QUIET = NO + +# The WARNINGS tag can be used to turn on/off the warning messages that are +# generated by doxygen. Possible values are YES and NO. If left blank +# NO is used. + +WARNINGS = YES + +# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings +# for undocumented members. If EXTRACT_ALL is set to YES then this flag will +# automatically be disabled. + +WARN_IF_UNDOCUMENTED = YES + +# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for +# potential errors in the documentation, such as not documenting some +# parameters in a documented function, or documenting parameters that +# don't exist or using markup commands wrongly. + +WARN_IF_DOC_ERROR = YES + +# The WARN_NO_PARAMDOC option can be enabled to get warnings for +# functions that are documented, but have no documentation for their parameters +# or return value. If set to NO (the default) doxygen will only warn about +# wrong or incomplete parameter documentation, but not about the absence of +# documentation. + +WARN_NO_PARAMDOC = NO + +# The WARN_FORMAT tag determines the format of the warning messages that +# doxygen can produce. The string should contain the $file, $line, and $text +# tags, which will be replaced by the file and line number from which the +# warning originated and the warning text.
Optionally the format may contain +# $version, which will be replaced by the version of the file (if it could +# be obtained via FILE_VERSION_FILTER) + +WARN_FORMAT = "$file:$line: $text" + +# The WARN_LOGFILE tag can be used to specify a file to which warning +# and error messages should be written. If left blank the output is written +# to stderr. + +WARN_LOGFILE = + +#--------------------------------------------------------------------------- +# configuration options related to the input files +#--------------------------------------------------------------------------- + +# The INPUT tag can be used to specify the files and/or directories that contain +# documented source files. You may enter file names like "myfile.cpp" or +# directories like "/usr/src/myproject". Separate the files or directories +# with spaces. + +INPUT = @PacBioBAM_IncludeDir@ + +# This tag can be used to specify the character encoding of the source files +# that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is +# also the default input encoding. Doxygen uses libiconv (or the iconv built +# into libc) for the transcoding. See http://www.gnu.org/software/libiconv for +# the list of possible encodings. + +INPUT_ENCODING = UTF-8 + +# If the value of the INPUT tag contains directories, you can use the +# FILE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp +# and *.h) to filter out the source-files in the directories. 
If left +# blank the following patterns are tested: +# *.c *.cc *.cxx *.cpp *.c++ *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh *.hxx +# *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.py *.f90 + +FILE_PATTERNS = *.c \ + *.cc \ + *.cxx \ + *.cpp \ + *.c++ \ + *.d \ + *.java \ + *.ii \ + *.ixx \ + *.ipp \ + *.i++ \ + *.inl \ + *.h \ + *.hh \ + *.hxx \ + *.hpp \ + *.h++ \ + *.idl \ + *.odl \ + *.cs \ + *.php \ + *.php3 \ + *.inc \ + *.m \ + *.mm \ + *.dox \ + *.py \ + *.f90 \ + *.f \ + *.vhd \ + *.vhdl + +# The RECURSIVE tag can be used to specify whether or not subdirectories +# should be searched for input files as well. Possible values are YES and NO. +# If left blank NO is used. + +RECURSIVE = YES + +# The EXCLUDE tag can be used to specify files and/or directories that should be +# excluded from the INPUT source files. This way you can easily exclude a +# subdirectory from a directory tree whose root is specified with the INPUT tag. + +EXCLUDE = @PacBioBAM_IncludeDir@/pbbam/internal + +# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or +# directories that are symbolic links (a Unix filesystem feature) are excluded +# from the input. + +EXCLUDE_SYMLINKS = NO + +# If the value of the INPUT tag contains directories, you can use the +# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude +# certain files from those directories. Note that the wildcards are matched +# against the file with absolute path, so to exclude all test directories +# for example use the pattern */test/* + +EXCLUDE_PATTERNS = + +# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names +# (namespaces, classes, functions, etc.) that should be excluded from the +# output. The symbol name can be a fully qualified name, a word, or if the +# wildcard * is used, a substring.
Examples: ANamespace, AClass, +# AClass::ANamespace, ANamespace::*Test + +EXCLUDE_SYMBOLS = pugi, PacBio::BAM::internal + +# The EXAMPLE_PATH tag can be used to specify one or more files or +# directories that contain example code fragments that are included (see +# the \include command). + +EXAMPLE_PATH = examples + +# If the value of the EXAMPLE_PATH tag contains directories, you can use the +# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp +# and *.h) to filter out the source-files in the directories. If left +# blank all files are included. + +EXAMPLE_PATTERNS = * + +# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be +# searched for input files to be used with the \include or \dontinclude +# commands irrespective of the value of the RECURSIVE tag. +# Possible values are YES and NO. If left blank NO is used. + +EXAMPLE_RECURSIVE = NO + +# The IMAGE_PATH tag can be used to specify one or more files or +# directories that contain images that are included in the documentation (see +# the \image command). + +IMAGE_PATH = + +# The INPUT_FILTER tag can be used to specify a program that doxygen should +# invoke to filter for each input file. Doxygen will invoke the filter program +# by executing (via popen()) the command <filter> <input-file>, where <filter> +# is the value of the INPUT_FILTER tag, and <input-file> is the name of an +# input file. Doxygen will then use the output that the filter program writes +# to standard output. If FILTER_PATTERNS is specified, this tag will be +# ignored. + +INPUT_FILTER = + +# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern +# basis. Doxygen will compare the file name with each pattern and apply the +# filter if there is a match. The filters are a list of the form: +# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further +# info on how filters are used. If FILTER_PATTERNS is empty, INPUT_FILTER +# is applied to all files.
+ +FILTER_PATTERNS = + +# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using +# INPUT_FILTER) will be used to filter the input files when producing source +# files to browse (i.e. when SOURCE_BROWSER is set to YES). + +FILTER_SOURCE_FILES = NO + +#--------------------------------------------------------------------------- +# configuration options related to source browsing +#--------------------------------------------------------------------------- + +# If the SOURCE_BROWSER tag is set to YES then a list of source files will +# be generated. Documented entities will be cross-referenced with these sources. +# Note: To get rid of all source code in the generated output, make sure also +# VERBATIM_HEADERS is set to NO. + +SOURCE_BROWSER = NO + +# Setting the INLINE_SOURCES tag to YES will include the body +# of functions and classes directly in the documentation. + +INLINE_SOURCES = NO + +# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct +# doxygen to hide any special comment blocks from generated source code +# fragments. Normal C and C++ comments will always remain visible. + +STRIP_CODE_COMMENTS = YES + +# If the REFERENCED_BY_RELATION tag is set to YES +# then for each documented function all documented +# functions referencing it will be listed. + +REFERENCED_BY_RELATION = NO + +# If the REFERENCES_RELATION tag is set to YES +# then for each documented function all documented entities +# called/used by that function will be listed. + +REFERENCES_RELATION = NO + +# If the REFERENCES_LINK_SOURCE tag is set to YES (the default) +# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from +# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will +# link to the source code. Otherwise they will link to the documentation. 
+ +REFERENCES_LINK_SOURCE = YES + +# If the USE_HTAGS tag is set to YES then the references to source code +# will point to the HTML generated by the htags(1) tool instead of doxygen +# built-in source browser. The htags tool is part of GNU's global source +# tagging system (see http://www.gnu.org/software/global/global.html). You +# will need version 4.8.6 or higher. + +USE_HTAGS = NO + +# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen +# will generate a verbatim copy of the header file for each class for +# which an include is specified. Set to NO to disable this. + +VERBATIM_HEADERS = YES + +#--------------------------------------------------------------------------- +# configuration options related to the alphabetical class index +#--------------------------------------------------------------------------- + +# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index +# of all compounds will be generated. Enable this if the project +# contains a lot of classes, structs, unions or interfaces. + +ALPHABETICAL_INDEX = NO + +# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then +# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns +# in which this list will be split (can be a number in the range [1..20]) + +COLS_IN_ALPHA_INDEX = 5 + +# In case all classes in a project start with a common prefix, all +# classes will be put under the same header in the alphabetical index. +# The IGNORE_PREFIX tag can be used to specify one or more prefixes that +# should be ignored while generating the index headers. + +IGNORE_PREFIX = + +#--------------------------------------------------------------------------- +# configuration options related to the HTML output +#--------------------------------------------------------------------------- + +# If the GENERATE_HTML tag is set to YES (the default) Doxygen will +# generate HTML output. 
+ +GENERATE_HTML = YES + +# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `html' will be used as the default path. + +HTML_OUTPUT = html + +# The HTML_FILE_EXTENSION tag can be used to specify the file extension for +# each generated HTML page (for example: .htm,.php,.asp). If it is left blank +# doxygen will generate files with .html extension. + +HTML_FILE_EXTENSION = .html + +# The HTML_HEADER tag can be used to specify a personal HTML header for +# each generated HTML page. If it is left blank doxygen will generate a +# standard header. + +HTML_HEADER = + +# The HTML_FOOTER tag can be used to specify a personal HTML footer for +# each generated HTML page. If it is left blank doxygen will generate a +# standard footer. + +HTML_FOOTER = + +# The HTML_STYLESHEET tag can be used to specify a user-defined cascading +# style sheet that is used by each HTML page. It can be used to +# fine-tune the look of the HTML output. If the tag is left blank doxygen +# will generate a default style sheet. Note that doxygen will try to copy +# the style sheet file to the HTML output directory, so don't put your own +# stylesheet in the HTML output directory as well, or it will be erased! + +HTML_STYLESHEET = + +# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML +# page will contain the date and time when the page was generated. Setting +# this to NO can help when comparing the output of multiple runs. + +HTML_TIMESTAMP = YES + +# If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes, +# files or namespaces will be aligned in HTML using tables. If set to +# NO a bullet list will be used. + +HTML_ALIGN_MEMBERS = YES + +# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML +# documentation will contain sections that can be hidden and shown after the +# page has loaded. 
For this to work a browser that supports +# JavaScript and DHTML is required (for instance Mozilla 1.0+, Firefox +# Netscape 6.0+, Internet explorer 5.0+, Konqueror, or Safari). + +HTML_DYNAMIC_SECTIONS = YES + +# If the GENERATE_DOCSET tag is set to YES, additional index files +# will be generated that can be used as input for Apple's Xcode 3 +# integrated development environment, introduced with OSX 10.5 (Leopard). +# To create a documentation set, doxygen will generate a Makefile in the +# HTML output directory. Running make will produce the docset in that +# directory and running "make install" will install the docset in +# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find +# it at startup. +# See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html for more information. + +GENERATE_DOCSET = NO + +# When GENERATE_DOCSET tag is set to YES, this tag determines the name of the +# feed. A documentation feed provides an umbrella under which multiple +# documentation sets from a single provider (such as a company or product suite) +# can be grouped. + +DOCSET_FEEDNAME = "Doxygen generated docs" + +# When GENERATE_DOCSET tag is set to YES, this tag specifies a string that +# should uniquely identify the documentation set bundle. This should be a +# reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen +# will append .docset to the name. + +DOCSET_BUNDLE_ID = org.doxygen.Project + +# If the GENERATE_HTMLHELP tag is set to YES, additional index files +# will be generated that can be used as input for tools like the +# Microsoft HTML help workshop to generate a compiled HTML help file (.chm) +# of the generated HTML documentation. + +GENERATE_HTMLHELP = NO + +# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can +# be used to specify the file name of the resulting .chm file. You +# can add a path in front of the file if the result should not be +# written to the html output directory. 
+ +CHM_FILE = + +# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can +# be used to specify the location (absolute path including file name) of +# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run +# the HTML help compiler on the generated index.hhp. + +HHC_LOCATION = + +# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag +# controls if a separate .chi index file is generated (YES) or that +# it should be included in the master .chm file (NO). + +GENERATE_CHI = NO + +# If the GENERATE_HTMLHELP tag is set to YES, the CHM_INDEX_ENCODING +# is used to encode HtmlHelp index (hhk), content (hhc) and project file +# content. + +CHM_INDEX_ENCODING = + +# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag +# controls whether a binary table of contents is generated (YES) or a +# normal table of contents (NO) in the .chm file. + +BINARY_TOC = NO + +# The TOC_EXPAND flag can be set to YES to add extra items for group members +# to the contents of the HTML help documentation and to the tree view. + +TOC_EXPAND = NO + +# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and QHP_VIRTUAL_FOLDER +# are set, an additional index file will be generated that can be used as input for +# Qt's qhelpgenerator to generate a Qt Compressed Help (.qch) of the generated +# HTML documentation. + +GENERATE_QHP = NO + +# If the QHG_LOCATION tag is specified, the QCH_FILE tag can +# be used to specify the file name of the resulting .qch file. +# The path specified is relative to the HTML output folder. + +QCH_FILE = + +# The QHP_NAMESPACE tag specifies the namespace to use when generating +# Qt Help Project output. For more information please see +# http://doc.trolltech.com/qthelpproject.html#namespace + +QHP_NAMESPACE = org.doxygen.Project + +# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating +# Qt Help Project output. 
For more information please see +# http://doc.trolltech.com/qthelpproject.html#virtual-folders + +QHP_VIRTUAL_FOLDER = doc + +# If QHP_CUST_FILTER_NAME is set, it specifies the name of a custom filter to add. +# For more information please see +# http://doc.trolltech.com/qthelpproject.html#custom-filters + +QHP_CUST_FILTER_NAME = + +# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the custom filter to add. For more information please see +# Qt Help Project / Custom Filters. + +QHP_CUST_FILTER_ATTRS = + +# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this project's +# filter section matches. For more information please see +# Qt Help Project / Filter Attributes. + +QHP_SECT_FILTER_ATTRS = + +# If the GENERATE_QHP tag is set to YES, the QHG_LOCATION tag can +# be used to specify the location of Qt's qhelpgenerator. +# If non-empty doxygen will try to run qhelpgenerator on the generated +# .qhp file. + +QHG_LOCATION = + +# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files +# will be generated, which together with the HTML files, form an Eclipse help +# plugin. To install this plugin and make it available under the help contents +# menu in Eclipse, the contents of the directory containing the HTML and XML +# files needs to be copied into the plugins directory of eclipse. The name of +# the directory within the plugins directory should be the same as +# the ECLIPSE_DOC_ID value. After copying Eclipse needs to be restarted before +# the help appears. + +GENERATE_ECLIPSEHELP = NO + +# A unique identifier for the eclipse help plugin. When installing the plugin +# the directory name containing the HTML and XML files should also have +# this name. + +ECLIPSE_DOC_ID = org.doxygen.Project + +# The DISABLE_INDEX tag can be used to turn on/off the condensed index at +# top of each HTML page. The value NO (the default) enables the index and +# the value YES disables it.
+ +DISABLE_INDEX = NO + +# This tag can be used to set the number of enum values (range [1..20]) +# that doxygen will group on one line in the generated HTML documentation. + +ENUM_VALUES_PER_LINE = 4 + +# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index +# structure should be generated to display hierarchical information. +# If the tag value is set to YES, a side panel will be generated +# containing a tree-like index structure (just like the one that +# is generated for HTML Help). For this to work a browser that supports +# JavaScript, DHTML, CSS and frames is required (i.e. any modern browser). +# Windows users are probably better off using the HTML help feature. + +GENERATE_TREEVIEW = NO + +# By enabling USE_INLINE_TREES, doxygen will generate the Groups, Directories, +# and Class Hierarchy pages using a tree view instead of an ordered list. + +USE_INLINE_TREES = NO + +# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be +# used to set the initial width (in pixels) of the frame in which the tree +# is shown. + +TREEVIEW_WIDTH = 250 + +# Use this tag to change the font size of Latex formulas included +# as images in the HTML documentation. The default is 10. Note that +# when you change the font size after a successful doxygen run you need +# to manually remove any form_*.png images from the HTML output directory +# to force them to be regenerated. + +FORMULA_FONTSIZE = 10 + +# When the SEARCHENGINE tag is enabled doxygen will generate a search box +# for the HTML output. The underlying search engine uses javascript +# and DHTML and should work on any modern browser. Note that when using +# HTML help (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets +# (GENERATE_DOCSET) there is already a search function so this one should +# typically be disabled. For large projects the javascript based search engine +# can be slow, then enabling SERVER_BASED_SEARCH may provide a better solution. 
+ +SEARCHENGINE = YES + +# When the SERVER_BASED_SEARCH tag is enabled the search engine will be +# implemented using a PHP enabled web server instead of at the web client +# using Javascript. Doxygen will generate the search PHP script and index +# file to put on the web server. The advantage of the server +# based approach is that it scales better to large projects and allows +# full text search. The disadvantages are that it is more difficult to set up +# and does not have live searching capabilities. + +SERVER_BASED_SEARCH = NO + +#--------------------------------------------------------------------------- +# configuration options related to the LaTeX output +#--------------------------------------------------------------------------- + +# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will +# generate Latex output. + +GENERATE_LATEX = NO + +# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `latex' will be used as the default path. + +LATEX_OUTPUT = latex + +# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be +# invoked. If left blank `latex' will be used as the default command name. +# Note that when enabling USE_PDFLATEX this option is only used for +# generating bitmaps for formulas in the HTML output, but not in the +# Makefile that is written to the output directory. + +LATEX_CMD_NAME = latex + +# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to +# generate index for LaTeX. If left blank `makeindex' will be used as the +# default command name. + +MAKEINDEX_CMD_NAME = makeindex + +# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact +# LaTeX documents. This may be useful for small projects and may help to +# save some trees in general. + +COMPACT_LATEX = NO + +# The PAPER_TYPE tag can be used to set the paper type that is used +# by the printer.
Possible values are: a4, a4wide, letter, legal and +# executive. If left blank a4wide will be used. + +PAPER_TYPE = a4wide + +# The EXTRA_PACKAGES tag can be used to specify one or more names of LaTeX +# packages that should be included in the LaTeX output. + +EXTRA_PACKAGES = + +# The LATEX_HEADER tag can be used to specify a personal LaTeX header for +# the generated latex document. The header should contain everything until +# the first chapter. If it is left blank doxygen will generate a +# standard header. Notice: only use this tag if you know what you are doing! + +LATEX_HEADER = + +# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated +# is prepared for conversion to pdf (using ps2pdf). The pdf file will +# contain links (just like the HTML output) instead of page references. +# This makes the output suitable for online browsing using a pdf viewer. + +PDF_HYPERLINKS = YES + +# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of +# plain latex in the generated Makefile. Set this option to YES to get a +# higher quality PDF documentation. + +USE_PDFLATEX = YES + +# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode +# command to the generated LaTeX files. This will instruct LaTeX to keep +# running if errors occur, instead of asking the user for help. +# This option is also used when generating formulas in HTML. + +LATEX_BATCHMODE = NO + +# If LATEX_HIDE_INDICES is set to YES then doxygen will not +# include the index chapters (such as File Index, Compound Index, etc.) +# in the output. + +LATEX_HIDE_INDICES = NO + +# If LATEX_SOURCE_CODE is set to YES then doxygen will include +# source code with syntax highlighting in the LaTeX output. +# Note that which sources are shown also depends on other settings +# such as SOURCE_BROWSER.
+ +LATEX_SOURCE_CODE = NO + +#--------------------------------------------------------------------------- +# configuration options related to the RTF output +#--------------------------------------------------------------------------- + +# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output +# The RTF output is optimized for Word 97 and may not look very pretty with +# other RTF readers or editors. + +GENERATE_RTF = NO + +# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `rtf' will be used as the default path. + +RTF_OUTPUT = rtf + +# If the COMPACT_RTF tag is set to YES Doxygen generates more compact +# RTF documents. This may be useful for small projects and may help to +# save some trees in general. + +COMPACT_RTF = NO + +# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated +# will contain hyperlink fields. The RTF file will +# contain links (just like the HTML output) instead of page references. +# This makes the output suitable for online browsing using WORD or other +# programs which support those fields. +# Note: wordpad (write) and others do not support links. + +RTF_HYPERLINKS = NO + +# Load stylesheet definitions from file. Syntax is similar to doxygen's +# config file, i.e. a series of assignments. You only have to provide +# replacements, missing definitions are set to their default value. + +RTF_STYLESHEET_FILE = + +# Set optional variables used in the generation of an rtf document. +# Syntax is similar to doxygen's config file. 
+ +RTF_EXTENSIONS_FILE = + +#--------------------------------------------------------------------------- +# configuration options related to the man page output +#--------------------------------------------------------------------------- + +# If the GENERATE_MAN tag is set to YES (the default) Doxygen will +# generate man pages + +GENERATE_MAN = NO + +# The MAN_OUTPUT tag is used to specify where the man pages will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `man' will be used as the default path. + +MAN_OUTPUT = man + +# The MAN_EXTENSION tag determines the extension that is added to +# the generated man pages (default is the subroutine's section .3) + +MAN_EXTENSION = .3 + +# If the MAN_LINKS tag is set to YES and Doxygen generates man output, +# then it will generate one additional man file for each entity +# documented in the real man page(s). These additional files +# only source the real man page, but without them the man command +# would be unable to find the correct page. The default is NO. + +MAN_LINKS = NO + +#--------------------------------------------------------------------------- +# configuration options related to the XML output +#--------------------------------------------------------------------------- + +# If the GENERATE_XML tag is set to YES Doxygen will +# generate an XML file that captures the structure of +# the code including all documentation. + +GENERATE_XML = YES + +# The XML_OUTPUT tag is used to specify where the XML pages will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `xml' will be used as the default path. + +XML_OUTPUT = xml + +# The XML_SCHEMA tag can be used to specify an XML schema, +# which can be used by a validating XML parser to check the +# syntax of the XML files. 
+ +XML_SCHEMA = + +# The XML_DTD tag can be used to specify an XML DTD, +# which can be used by a validating XML parser to check the +# syntax of the XML files. + +XML_DTD = + +# If the XML_PROGRAMLISTING tag is set to YES Doxygen will +# dump the program listings (including syntax highlighting +# and cross-referencing information) to the XML output. Note that +# enabling this will significantly increase the size of the XML output. + +XML_PROGRAMLISTING = YES + +#--------------------------------------------------------------------------- +# configuration options for the AutoGen Definitions output +#--------------------------------------------------------------------------- + +# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will +# generate an AutoGen Definitions (see autogen.sf.net) file +# that captures the structure of the code including all +# documentation. Note that this feature is still experimental +# and incomplete at the moment. + +GENERATE_AUTOGEN_DEF = NO + +#--------------------------------------------------------------------------- +# configuration options related to the Perl module output +#--------------------------------------------------------------------------- + +# If the GENERATE_PERLMOD tag is set to YES Doxygen will +# generate a Perl module file that captures the structure of +# the code including all documentation. Note that this +# feature is still experimental and incomplete at the +# moment. + +GENERATE_PERLMOD = NO + +# If the PERLMOD_LATEX tag is set to YES Doxygen will generate +# the necessary Makefile rules, Perl scripts and LaTeX code to be able +# to generate PDF and DVI output from the Perl module output. + +PERLMOD_LATEX = NO + +# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be +# nicely formatted so it can be parsed by a human reader. This is useful +# if you want to understand what is going on. 
On the other hand, if this +# tag is set to NO the size of the Perl module output will be much smaller +# and Perl will parse it just the same. + +PERLMOD_PRETTY = YES + +# The names of the make variables in the generated doxyrules.make file +# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. +# This is useful so different doxyrules.make files included by the same +# Makefile don't overwrite each other's variables. + +PERLMOD_MAKEVAR_PREFIX = + +#--------------------------------------------------------------------------- +# Configuration options related to the preprocessor +#--------------------------------------------------------------------------- + +# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will +# evaluate all C-preprocessor directives found in the sources and include +# files. + +ENABLE_PREPROCESSING = YES + +# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro +# names in the source code. If set to NO (the default) only conditional +# compilation will be performed. Macro expansion can be done in a controlled +# way by setting EXPAND_ONLY_PREDEF to YES. + +MACRO_EXPANSION = NO + +# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES +# then the macro expansion is limited to the macros specified with the +# PREDEFINED and EXPAND_AS_DEFINED tags. + +EXPAND_ONLY_PREDEF = NO + +# If the SEARCH_INCLUDES tag is set to YES (the default) the includes files +# in the INCLUDE_PATH (see below) will be search if a #include is found. + +SEARCH_INCLUDES = YES + +# The INCLUDE_PATH tag can be used to specify one or more directories that +# contain include files that are not input files but should be processed by +# the preprocessor. + +INCLUDE_PATH = + +# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard +# patterns (like *.h and *.hpp) to filter out the header-files in the +# directories. If left blank, the patterns specified with FILE_PATTERNS will +# be used. 
+ +INCLUDE_FILE_PATTERNS = + +# The PREDEFINED tag can be used to specify one or more macro names that +# are defined before the preprocessor is started (similar to the -D option of +# gcc). The argument of the tag is a list of macros of the form: name +# or name=definition (no spaces). If the definition and the = are +# omitted =1 is assumed. To prevent a macro definition from being +# undefined via #undef or recursively expanded use the := operator +# instead of the = operator. + +PREDEFINED = + +# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then +# this tag can be used to specify a list of macro names that should be expanded. +# The macro definition that is found in the sources will be used. +# Use the PREDEFINED tag if you want to use a different macro definition. + +EXPAND_AS_DEFINED = + +# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then +# doxygen's preprocessor will remove all function-like macros that are alone +# on a line, have an all uppercase name, and do not end with a semicolon. Such +# function macros are typically used for boiler-plate code, and will confuse +# the parser if not removed. + +SKIP_FUNCTION_MACROS = YES + +#--------------------------------------------------------------------------- +# Configuration::additions related to external references +#--------------------------------------------------------------------------- + +# The TAGFILES option can be used to specify one or more tagfiles. +# Optionally an initial location of the external documentation +# can be added for each tagfile. The format of a tag file without +# this location is as follows: +# TAGFILES = file1 file2 ... +# Adding location for the tag files is done as follows: +# TAGFILES = file1=loc1 "file2 = loc2" ... +# where "loc1" and "loc2" can be relative or absolute paths or +# URLs. If a location is present for each tag, the installdox tool +# does not have to be run to correct the links. 
+# Note that each tag file must have a unique name +# (where the name does NOT include the path) +# If a tag file is not located in the directory in which doxygen +# is run, you must also specify the path to the tagfile here. + +TAGFILES = + +# When a file name is specified after GENERATE_TAGFILE, doxygen will create +# a tag file that is based on the input files it reads. + +GENERATE_TAGFILE = + +# If the ALLEXTERNALS tag is set to YES all external classes will be listed +# in the class index. If set to NO only the inherited external classes +# will be listed. + +ALLEXTERNALS = NO + +# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed +# in the modules index. If set to NO, only the current project's groups will +# be listed. + +EXTERNAL_GROUPS = NO + +# The PERL_PATH should be the absolute path and name of the perl script +# interpreter (i.e. the result of `which perl'). + +PERL_PATH = /usr/bin/perl + +#--------------------------------------------------------------------------- +# Configuration options related to the dot tool +#--------------------------------------------------------------------------- + +# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will +# generate a inheritance diagram (in HTML, RTF and LaTeX) for classes with base +# or super classes. Setting the tag to NO turns the diagrams off. Note that +# this option is superseded by the HAVE_DOT option below. This is only a +# fallback. It is recommended to install and use dot, since it yields more +# powerful graphs. + +CLASS_DIAGRAMS = NO + +# You can define message sequence charts within doxygen comments using the \msc +# command. Doxygen will then run the mscgen tool (see +# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the +# documentation. The MSCGEN_PATH tag allows you to specify the directory where +# the mscgen tool resides. If left empty the tool is assumed to be found in the +# default search path. 
+ +MSCGEN_PATH = + +# If set to YES, the inheritance and collaboration graphs will hide +# inheritance and usage relations if the target is undocumented +# or is not a class. + +HIDE_UNDOC_RELATIONS = YES + +# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is +# available from the path. This tool is part of Graphviz, a graph visualization +# toolkit from AT&T and Lucent Bell Labs. The other options in this section +# have no effect if this option is set to NO (the default) + +HAVE_DOT = NO + +# By default doxygen will write a font called FreeSans.ttf to the output +# directory and reference it in all dot files that doxygen generates. This +# font does not include all possible unicode characters however, so when you need +# these (or just want a differently looking font) you can specify the font name +# using DOT_FONTNAME. You need to make sure dot is able to find the font, +# which can be done by putting it in a standard location or by setting the +# DOTFONTPATH environment variable or by setting DOT_FONTPATH to the directory +# containing the font. + +DOT_FONTNAME = FreeSans + +# The DOT_FONTSIZE tag can be used to set the size of the font of dot graphs. +# The default size is 10pt. + +DOT_FONTSIZE = 10 + +# By default doxygen will tell dot to use the output directory to look for the +# FreeSans.ttf font (which doxygen will put there itself). If you specify a +# different font using DOT_FONTNAME you can set the path where dot +# can find it using this tag. + +DOT_FONTPATH = + +# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen +# will generate a graph for each documented class showing the direct and +# indirect inheritance relations. Setting this tag to YES will force the +# CLASS_DIAGRAMS tag to NO.
+ +CLASS_GRAPH = YES + +# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen +# will generate a graph for each documented class showing the direct and +# indirect implementation dependencies (inheritance, containment, and +# class references variables) of the class with other documented classes. + +COLLABORATION_GRAPH = YES + +# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen +# will generate a graph for groups, showing the direct groups dependencies + +GROUP_GRAPHS = YES + +# If the UML_LOOK tag is set to YES doxygen will generate inheritance and +# collaboration diagrams in a style similar to the OMG's Unified Modeling +# Language. + +UML_LOOK = NO + +# If set to YES, the inheritance and collaboration graphs will show the +# relations between templates and their instances. + +TEMPLATE_RELATIONS = NO + +# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT +# tags are set to YES then doxygen will generate a graph for each documented +# file showing the direct and indirect include dependencies of the file with +# other documented files. + +INCLUDE_GRAPH = YES + +# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and +# HAVE_DOT tags are set to YES then doxygen will generate a graph for each +# documented header file showing the documented files that directly or +# indirectly include this file. + +INCLUDED_BY_GRAPH = YES + +# If the CALL_GRAPH and HAVE_DOT options are set to YES then +# doxygen will generate a call dependency graph for every global function +# or class method. Note that enabling this option will significantly increase +# the time of a run. So in most cases it will be better to enable call graphs +# for selected functions only using the \callgraph command. + +CALL_GRAPH = NO + +# If the CALLER_GRAPH and HAVE_DOT tags are set to YES then +# doxygen will generate a caller dependency graph for every global function +# or class method. 
Note that enabling this option will significantly increase +# the time of a run. So in most cases it will be better to enable caller +# graphs for selected functions only using the \callergraph command. + +CALLER_GRAPH = NO + +# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen +# will generate a graphical hierarchy of all classes instead of a textual one. + +GRAPHICAL_HIERARCHY = YES + +# If the DIRECTORY_GRAPH, SHOW_DIRECTORIES and HAVE_DOT tags are set to YES +# then doxygen will show the dependencies a directory has on other directories +# in a graphical way. The dependency relations are determined by the #include +# relations between the files in the directories. + +DIRECTORY_GRAPH = YES + +# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images +# generated by dot. Possible values are png, jpg, or gif. +# If left blank png will be used. + +DOT_IMAGE_FORMAT = png + +# The tag DOT_PATH can be used to specify the path where the dot tool can be +# found. If left blank, it is assumed the dot tool can be found in the path. + +DOT_PATH = + +# The DOTFILE_DIRS tag can be used to specify one or more directories that +# contain dot files that are included in the documentation (see the +# \dotfile command). + +DOTFILE_DIRS = + +# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of +# nodes that will be shown in the graph. If the number of nodes in a graph +# becomes larger than this value, doxygen will truncate the graph, which is +# visualized by representing a node as a red box. Note that if the +# number of direct children of the root node in a graph is already larger than +# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note +# that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH. + +DOT_GRAPH_MAX_NODES = 50 + +# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the +# graphs generated by dot.
A depth value of 3 means that only nodes reachable +# from the root by following a path via at most 3 edges will be shown. Nodes +# that lay further from the root node will be omitted. Note that setting this +# option to 1 or 2 may greatly reduce the computation time needed for large +# code bases. Also note that the size of a graph can be further restricted by +# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction. + +MAX_DOT_GRAPH_DEPTH = 0 + +# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent +# background. This is disabled by default, because dot on Windows does not +# seem to support this out of the box. Warning: Depending on the platform used, +# enabling this option may lead to badly anti-aliased labels on the edges of +# a graph (i.e. they become hard to read). + +DOT_TRANSPARENT = NO + +# Set the DOT_MULTI_TARGETS tag to YES allow dot to generate multiple output +# files in one run (i.e. multiple -o and -T options on the command line). This +# makes dot run faster, but since only newer versions of dot (>1.8.10) +# support this, this feature is disabled by default. + +DOT_MULTI_TARGETS = NO + +# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will +# generate a legend page explaining the meaning of the various boxes and +# arrows in the dot generated graphs. + +GENERATE_LEGEND = YES + +# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will +# remove the intermediate dot files that are used to generate +# the various graphs. + +DOT_CLEANUP = YES diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..14e0fb1 --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,168 @@ +# Makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +PAPER = +BUILDDIR = build +SOURCEDIR = source + +# Internal variables. 
+PAPEROPT_a4 = -D latex_paper_size=a4 +PAPEROPT_letter = -D latex_paper_size=letter +ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) $(SOURCEDIR) +# the i18n builder cannot share the environment and doctrees with the others +I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) $(SOURCEDIR) + +.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext fig + +help: + @echo "Please use \`make <target>' where <target> is one of" + @echo " html to make standalone HTML files" + @echo " dirhtml to make HTML files named index.html in directories" + @echo " singlehtml to make a single large HTML file" + @echo " pickle to make pickle files" + @echo " json to make JSON files" + @echo " htmlhelp to make HTML files and a HTML help project" + @echo " qthelp to make HTML files and a qthelp project" + @echo " devhelp to make HTML files and a Devhelp project" + @echo " epub to make an epub" + @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" + @echo " latexpdf to make LaTeX files and run them through pdflatex" + @echo " text to make text files" + @echo " man to make manual pages" + @echo " texinfo to make Texinfo files" + @echo " info to make Texinfo files and run them through makeinfo" + @echo " gettext to make PO message catalogs" + @echo " changes to make an overview of all changed/added/deprecated items" + @echo " linkcheck to check all external links for integrity" + @echo " doctest to run all doctests embedded in the documentation (if enabled)" + +clean: + -rm -rf $(BUILDDIR)/* + +html: basefig MANY_CLUSTER.png + $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." + +dirhtml: + $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
+ +singlehtml: + $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml + @echo + @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." + +pickle: + $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle + @echo + @echo "Build finished; now you can process the pickle files." + +json: + $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json + @echo + @echo "Build finished; now you can process the JSON files." + +htmlhelp: + $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp + @echo + @echo "Build finished; now you can run HTML Help Workshop with the" \ + ".hhp project file in $(BUILDDIR)/htmlhelp." + +qthelp: + $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp + @echo + @echo "Build finished; now you can run "qcollectiongenerator" with the" \ + ".qhcp project file in $(BUILDDIR)/qthelp, like this:" + @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/pbtoolkits.qhcp" + @echo "To view the help file:" + @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/pbtoolkits.qhc" + +devhelp: + $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp + @echo + @echo "Build finished." + @echo "To view the help file:" + @echo "# mkdir -p $$HOME/.local/share/devhelp/pbtoolkits" + @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/pbtoolkits" + @echo "# devhelp" + +epub: + $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub + @echo + @echo "Build finished. The epub file is in $(BUILDDIR)/epub." + +latex: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo + @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." + @echo "Run \`make' in that directory to run these through (pdf)latex" \ + "(use \`make latexpdf' here to do that automatically)." + +latexpdf: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through pdflatex..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 
+ +text: + $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text + @echo + @echo "Build finished. The text files are in $(BUILDDIR)/text." + +man: + $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man + @echo + @echo "Build finished. The manual pages are in $(BUILDDIR)/man." + +texinfo: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo + @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." + @echo "Run \`make' in that directory to run these through makeinfo" \ + "(use \`make info' here to do that automatically)." + +info: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo "Running Texinfo files through makeinfo..." + make -C $(BUILDDIR)/texinfo info + @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." + +gettext: + $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale + @echo + @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." + +changes: + $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes + @echo + @echo "The overview file is in $(BUILDDIR)/changes." + +linkcheck: + $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck + @echo + @echo "Link check complete; look for any errors in the above output " \ + "or in $(BUILDDIR)/linkcheck/output.txt." + +doctest: + $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest + @echo "Testing of doctests in the sources finished, look at the " \ + "results in $(BUILDDIR)/doctest/output.txt." + +basefig: + dot -Tpng $(SOURCEDIR)/dependencies.dot > $(SOURCEDIR)/$@ + grep -v "\"pbsmrtpipe\" ->" $(SOURCEDIR)/dependencies.dot \ + | grep -v "> \"pbcore\"" \ + | sed 's/All/Sparse/' > $(SOURCEDIR)/sparse_dependencies.dot + dot -Tpng $(SOURCEDIR)/sparse_dependencies.dot \ + > $(SOURCEDIR)/sparse_dependencies.png + +%.png: basefig + grep -v $* $(SOURCEDIR)/sparse_dependencies.dot | \ + grep -v \? 
| sed 's/Sparse dependencies/Module bundles/' | \ + dot -Tpng > $(SOURCEDIR)/$@ + diff --git a/docs/examples/code/BarcodeQuery.txt b/docs/examples/code/BarcodeQuery.txt new file mode 100644 index 0000000..3fe8fce --- /dev/null +++ b/docs/examples/code/BarcodeQuery.txt @@ -0,0 +1,17 @@ +// using C++11 range-based for loop +BarcodeQuery query(42, dataset); +for (const BamRecord& r : query) { + assert(r.HasBarcodes()); + assert(r.BarcodeForward() == 42 || r.BarcodeReverse() == 42); +} + +// OR + +// using iterators directly +BarcodeQuery query(42, dataset); +auto iter = query.cbegin(); +auto end = query.cend(); +for (; iter != end; ++iter) { + assert(iter->HasBarcodes()); + assert(iter->BarcodeForward() == 42 || iter->BarcodeReverse() == 42); +} diff --git a/docs/examples/code/Compare.txt b/docs/examples/code/Compare.txt new file mode 100644 index 0000000..deecd8d --- /dev/null +++ b/docs/examples/code/Compare.txt @@ -0,0 +1,3 @@ +// sort on increasing ZMW hole number +std::vector<BamRecord> records; +std::sort(records.begin(), records.end(), Compare::Zmw()); diff --git a/docs/examples/code/Compare_AlignedEnd.txt b/docs/examples/code/Compare_AlignedEnd.txt new file mode 100644 index 0000000..d34ed67 --- /dev/null +++ b/docs/examples/code/Compare_AlignedEnd.txt @@ -0,0 +1,2 @@ +std::vector<BamRecord> records; +std::sort(records.begin(), records.end(), Compare::AlignedEnd()); diff --git a/docs/examples/code/Compare_AlignedStart.txt b/docs/examples/code/Compare_AlignedStart.txt new file mode 100644 index 0000000..68de3e2 --- /dev/null +++ b/docs/examples/code/Compare_AlignedStart.txt @@ -0,0 +1,2 @@ +std::vector<BamRecord> records; +std::sort(records.begin(), records.end(), Compare::AlignedStart()); diff --git a/docs/examples/code/Compare_AlignedStrand.txt b/docs/examples/code/Compare_AlignedStrand.txt new file mode 100644 index 0000000..6c22cdc --- /dev/null +++ b/docs/examples/code/Compare_AlignedStrand.txt @@ -0,0 +1,2 @@ +std::vector<BamRecord> records; +std::sort(records.begin(), records.end(),
Compare::AlignedStrand()); diff --git a/docs/examples/code/Compare_BarcodeForward.txt b/docs/examples/code/Compare_BarcodeForward.txt new file mode 100644 index 0000000..1967341 --- /dev/null +++ b/docs/examples/code/Compare_BarcodeForward.txt @@ -0,0 +1,2 @@ +std::vector<BamRecord> records; +std::sort(records.begin(), records.end(), Compare::BarcodeForward()); diff --git a/docs/examples/code/Compare_BarcodeQuality.txt b/docs/examples/code/Compare_BarcodeQuality.txt new file mode 100644 index 0000000..144f483 --- /dev/null +++ b/docs/examples/code/Compare_BarcodeQuality.txt @@ -0,0 +1,2 @@ +std::vector<BamRecord> records; +std::sort(records.begin(), records.end(), Compare::BarcodeQuality()); diff --git a/docs/examples/code/Compare_BarcodeReverse.txt b/docs/examples/code/Compare_BarcodeReverse.txt new file mode 100644 index 0000000..9d3b245 --- /dev/null +++ b/docs/examples/code/Compare_BarcodeReverse.txt @@ -0,0 +1,2 @@ +std::vector<BamRecord> records; +std::sort(records.begin(), records.end(), Compare::BarcodeReverse()); diff --git a/docs/examples/code/Compare_FullName.txt b/docs/examples/code/Compare_FullName.txt new file mode 100644 index 0000000..4b392b9 --- /dev/null +++ b/docs/examples/code/Compare_FullName.txt @@ -0,0 +1,2 @@ +std::vector<BamRecord> records; +std::sort(records.begin(), records.end(), Compare::FullName()); diff --git a/docs/examples/code/Compare_LocalContextFlag.txt b/docs/examples/code/Compare_LocalContextFlag.txt new file mode 100644 index 0000000..aeab944 --- /dev/null +++ b/docs/examples/code/Compare_LocalContextFlag.txt @@ -0,0 +1,2 @@ +std::vector<BamRecord> records; +std::sort(records.begin(), records.end(), Compare::LocalContextFlag()); diff --git a/docs/examples/code/Compare_MapQuality.txt b/docs/examples/code/Compare_MapQuality.txt new file mode 100644 index 0000000..fe22821 --- /dev/null +++ b/docs/examples/code/Compare_MapQuality.txt @@ -0,0 +1,2 @@ +std::vector<BamRecord> records; +std::sort(records.begin(), records.end(), Compare::MapQuality()); diff --git
a/docs/examples/code/Compare_MovieName.txt b/docs/examples/code/Compare_MovieName.txt new file mode 100644 index 0000000..cddcb64 --- /dev/null +++ b/docs/examples/code/Compare_MovieName.txt @@ -0,0 +1,2 @@ +std::vector<BamRecord> records; +std::sort(records.begin(), records.end(), Compare::MovieName()); diff --git a/docs/examples/code/Compare_NumDeletedBases.txt b/docs/examples/code/Compare_NumDeletedBases.txt new file mode 100644 index 0000000..aa6dd4b --- /dev/null +++ b/docs/examples/code/Compare_NumDeletedBases.txt @@ -0,0 +1,2 @@ +std::vector<BamRecord> records; +std::sort(records.begin(), records.end(), Compare::NumDeletedBases()); diff --git a/docs/examples/code/Compare_NumInsertedBases.txt b/docs/examples/code/Compare_NumInsertedBases.txt new file mode 100644 index 0000000..917d87f --- /dev/null +++ b/docs/examples/code/Compare_NumInsertedBases.txt @@ -0,0 +1,2 @@ +std::vector<BamRecord> records; +std::sort(records.begin(), records.end(), Compare::NumInsertedBases()); diff --git a/docs/examples/code/Compare_NumMatches.txt b/docs/examples/code/Compare_NumMatches.txt new file mode 100644 index 0000000..47e3081 --- /dev/null +++ b/docs/examples/code/Compare_NumMatches.txt @@ -0,0 +1,2 @@ +std::vector<BamRecord> records; +std::sort(records.begin(), records.end(), Compare::NumMatches()); diff --git a/docs/examples/code/Compare_NumMismatches.txt b/docs/examples/code/Compare_NumMismatches.txt new file mode 100644 index 0000000..12affb1 --- /dev/null +++ b/docs/examples/code/Compare_NumMismatches.txt @@ -0,0 +1,2 @@ +std::vector<BamRecord> records; +std::sort(records.begin(), records.end(), Compare::NumMismatches()); diff --git a/docs/examples/code/Compare_QueryEnd.txt b/docs/examples/code/Compare_QueryEnd.txt new file mode 100644 index 0000000..d664d28 --- /dev/null +++ b/docs/examples/code/Compare_QueryEnd.txt @@ -0,0 +1,2 @@ +std::vector<BamRecord> records; +std::sort(records.begin(), records.end(), Compare::QueryEnd()); diff --git a/docs/examples/code/Compare_QueryStart.txt b/docs/examples/code/Compare_QueryStart.txt new
file mode 100644 index 0000000..12f6244 --- /dev/null +++ b/docs/examples/code/Compare_QueryStart.txt @@ -0,0 +1,2 @@ +std::vector<BamRecord> records; +std::sort(records.begin(), records.end(), Compare::QueryStart()); diff --git a/docs/examples/code/Compare_ReadAccuracy.txt b/docs/examples/code/Compare_ReadAccuracy.txt new file mode 100644 index 0000000..9454309 --- /dev/null +++ b/docs/examples/code/Compare_ReadAccuracy.txt @@ -0,0 +1,2 @@ +std::vector<BamRecord> records; +std::sort(records.begin(), records.end(), Compare::ReadAccuracy()); diff --git a/docs/examples/code/Compare_ReadGroupId.txt b/docs/examples/code/Compare_ReadGroupId.txt new file mode 100644 index 0000000..dab3497 --- /dev/null +++ b/docs/examples/code/Compare_ReadGroupId.txt @@ -0,0 +1,2 @@ +std::vector<BamRecord> records; +std::sort(records.begin(), records.end(), Compare::ReadGroupId()); diff --git a/docs/examples/code/Compare_ReadGroupNumericId.txt b/docs/examples/code/Compare_ReadGroupNumericId.txt new file mode 100644 index 0000000..5ad8f9d --- /dev/null +++ b/docs/examples/code/Compare_ReadGroupNumericId.txt @@ -0,0 +1,2 @@ +std::vector<BamRecord> records; +std::sort(records.begin(), records.end(), Compare::ReadGroupNumericId()); diff --git a/docs/examples/code/Compare_ReferenceEnd.txt b/docs/examples/code/Compare_ReferenceEnd.txt new file mode 100644 index 0000000..ed42d05 --- /dev/null +++ b/docs/examples/code/Compare_ReferenceEnd.txt @@ -0,0 +1,2 @@ +std::vector<BamRecord> records; +std::sort(records.begin(), records.end(), Compare::ReferenceEnd()); diff --git a/docs/examples/code/Compare_ReferenceId.txt b/docs/examples/code/Compare_ReferenceId.txt new file mode 100644 index 0000000..5628427 --- /dev/null +++ b/docs/examples/code/Compare_ReferenceId.txt @@ -0,0 +1,2 @@ +std::vector<BamRecord> records; +std::sort(records.begin(), records.end(), Compare::ReferenceId()); diff --git a/docs/examples/code/Compare_ReferenceName.txt b/docs/examples/code/Compare_ReferenceName.txt new file mode 100644 index 0000000..1f76e7e --- /dev/null +++
b/docs/examples/code/Compare_ReferenceName.txt @@ -0,0 +1,2 @@ +std::vector<BamRecord> records; +std::sort(records.begin(), records.end(), Compare::ReferenceName()); diff --git a/docs/examples/code/Compare_ReferenceStart.txt b/docs/examples/code/Compare_ReferenceStart.txt new file mode 100644 index 0000000..0ccaf36 --- /dev/null +++ b/docs/examples/code/Compare_ReferenceStart.txt @@ -0,0 +1,2 @@ +std::vector<BamRecord> records; +std::sort(records.begin(), records.end(), Compare::ReferenceStart()); diff --git a/docs/examples/code/Compare_TypeFromOperator.txt b/docs/examples/code/Compare_TypeFromOperator.txt new file mode 100644 index 0000000..afb0848 --- /dev/null +++ b/docs/examples/code/Compare_TypeFromOperator.txt @@ -0,0 +1,2 @@ +Compare::Type type = Compare::TypeFromOperator("!="); +assert(type == Compare::NOT_EQUAL); diff --git a/docs/examples/code/Compare_TypeToName.txt b/docs/examples/code/Compare_TypeToName.txt new file mode 100644 index 0000000..c44e1cb --- /dev/null +++ b/docs/examples/code/Compare_TypeToName.txt @@ -0,0 +1,2 @@ +string name = Compare::TypeToName(Compare::LESS_THAN); +assert(name == "Compare::LESS_THAN"); diff --git a/docs/examples/code/Compare_Zmw.txt b/docs/examples/code/Compare_Zmw.txt new file mode 100644 index 0000000..b02c426 --- /dev/null +++ b/docs/examples/code/Compare_Zmw.txt @@ -0,0 +1,2 @@ +std::vector<BamRecord> records; +std::sort(records.begin(), records.end(), Compare::Zmw()); diff --git a/docs/examples/code/EntireFileQuery.txt b/docs/examples/code/EntireFileQuery.txt new file mode 100644 index 0000000..d3fcc2c --- /dev/null +++ b/docs/examples/code/EntireFileQuery.txt @@ -0,0 +1,15 @@ +// using C++11 range-based for loop +EntireFileQuery query(dataset); +for (const BamRecord& record : query) { + // ... do stuff ... +} + +// OR + +// using iterators +EntireFileQuery query(dataset); +auto iter = query.cbegin(); +auto end = query.cend(); +for (; iter != end; ++iter) { + // ... do stuff ...
+} diff --git a/docs/examples/code/EntireFileQuery_BamFilename.txt b/docs/examples/code/EntireFileQuery_BamFilename.txt new file mode 100644 index 0000000..484db61 --- /dev/null +++ b/docs/examples/code/EntireFileQuery_BamFilename.txt @@ -0,0 +1,4 @@ +EntireFileQuery query("foo.bam"); +for (const BamRecord& record : query) { + // do stuff +} diff --git a/docs/examples/code/EntireFileQuery_NonConst.txt b/docs/examples/code/EntireFileQuery_NonConst.txt new file mode 100644 index 0000000..a0a092e --- /dev/null +++ b/docs/examples/code/EntireFileQuery_NonConst.txt @@ -0,0 +1,4 @@ +EntireFileQuery query("foo.bam"); +for (BamRecord& record : query) { + // ok to modify 'record' here +} diff --git a/docs/examples/code/GenomicIntervalQuery.txt b/docs/examples/code/GenomicIntervalQuery.txt new file mode 100644 index 0000000..651f254 --- /dev/null +++ b/docs/examples/code/GenomicIntervalQuery.txt @@ -0,0 +1,16 @@ +// using C++11 range-based for loop +GenomicIntervalQuery query(GenomicInterval("chr1:1000-2000"), dataset); +for (const BamRecord& record : query) { + // ... do stuff ... +} + +// OR + +// using iterators directly +GenomicIntervalQuery query(GenomicInterval("chr1:1000-2000"), dataset); +auto iter = query.cbegin(); +auto end = query.cend(); +for (; iter != end; ++iter) { + // ... do stuff ... 
+} + diff --git a/docs/examples/code/GenomicIntervalQuery_Reuse.txt b/docs/examples/code/GenomicIntervalQuery_Reuse.txt new file mode 100644 index 0000000..339ae95 --- /dev/null +++ b/docs/examples/code/GenomicIntervalQuery_Reuse.txt @@ -0,0 +1,8 @@ +DataSet ds("data.xml"); +GenomicIntervalQuery query(GenomicInterval(), ds); +for (const GenomicInterval& interval : intervals) { + query.Interval(interval); + for (const BamRecord& record : query) { + // do stuff + } +} \ No newline at end of file diff --git a/docs/examples/code/PbiAlignedEndFilter.txt b/docs/examples/code/PbiAlignedEndFilter.txt new file mode 100644 index 0000000..bac1a46 --- /dev/null +++ b/docs/examples/code/PbiAlignedEndFilter.txt @@ -0,0 +1,4 @@ +PbiFilterQuery query(PbiAlignedEndFilter{3000, Compare::GREATER_THAN}); +for (const BamRecord& record : query) { + assert(record.AlignedEnd() > 3000); +} diff --git a/docs/examples/code/PbiAlignedLengthFilter.txt b/docs/examples/code/PbiAlignedLengthFilter.txt new file mode 100644 index 0000000..38dc3ff --- /dev/null +++ b/docs/examples/code/PbiAlignedLengthFilter.txt @@ -0,0 +1,4 @@ +PbiFilterQuery query(PbiAlignedLengthFilter{1000, Compare::GREATER_THAN}); +for (const BamRecord& record : query) { + assert((record.AlignedEnd() - record.AlignedStart()) > 1000); +} diff --git a/docs/examples/code/PbiAlignedStartFilter.txt b/docs/examples/code/PbiAlignedStartFilter.txt new file mode 100644 index 0000000..b78bb2c --- /dev/null +++ b/docs/examples/code/PbiAlignedStartFilter.txt @@ -0,0 +1,4 @@ +PbiFilterQuery query(PbiAlignedStartFilter{3000, Compare::GREATER_THAN}); +for (const BamRecord& record : query) { + assert(record.AlignedStart() > 3000); +} diff --git a/docs/examples/code/PbiAlignedStrandFilter.txt b/docs/examples/code/PbiAlignedStrandFilter.txt new file mode 100644 index 0000000..9f9a885 --- /dev/null +++ b/docs/examples/code/PbiAlignedStrandFilter.txt @@ -0,0 +1,5 @@ +PbiFilterQuery query(PbiAlignedStrandFilter{Strand::FORWARD}); +for (const 
BamRecord& record : query) { + assert(record.AlignedStrand() == Strand::FORWARD); +} + diff --git a/docs/examples/code/PbiBarcodeFilter.txt b/docs/examples/code/PbiBarcodeFilter.txt new file mode 100644 index 0000000..c7ce5cb --- /dev/null +++ b/docs/examples/code/PbiBarcodeFilter.txt @@ -0,0 +1,17 @@ +// single value +PbiFilter filter{ PbiBarcodeFilter{17} }; +PbiFilterQuery query(filter); +for (const BamRecord& record : query) { + const auto barcodes = record.Barcodes(); + assert(barcodes.first == 17 || barcodes.second == 17); +} + +// whitelist +vector whitelist = { 50, 100 }; +PbiFilter filter{ PbiBarcodeFilter{whitelist} }; +PbiFilterQuery query(filter); +for (const BamRecord& record : query) { + const auto barcodes = record.Barcodes(); + assert(barcodes.first == 50 || barcodes.second == 50 || + barcodes.first == 100 || barcodes.second == 100); +} diff --git a/docs/examples/code/PbiBarcodeForwardFilter.txt b/docs/examples/code/PbiBarcodeForwardFilter.txt new file mode 100644 index 0000000..a6c12fd --- /dev/null +++ b/docs/examples/code/PbiBarcodeForwardFilter.txt @@ -0,0 +1,15 @@ +// single value +PbiFilter filter{ PbiBarcodeForwardFilter{50} }; +PbiFilterQuery query(filter); +for (const BamRecord& record : query) { + assert(record.BarcodeForward() == 50); +} + +// whitelist +vector whitelist = { 50, 100 }; +PbiFilter filter{ PbiBarcodeForwardFilter{whitelist} }; +PbiFilterQuery query(filter); +for (const BamRecord& record : query) { + assert(record.BarcodeForward() == 50 || record.BarcodeForward() == 100); +} + diff --git a/docs/examples/code/PbiBarcodeQualityFilter.txt b/docs/examples/code/PbiBarcodeQualityFilter.txt new file mode 100644 index 0000000..34311d0 --- /dev/null +++ b/docs/examples/code/PbiBarcodeQualityFilter.txt @@ -0,0 +1,5 @@ +PbiFilter filter{ PbiBarcodeQualityFilter{42, Compare::GREATER_THAN_EQUAL} }; +PbiFilterQuery query(filter); +for (const BamRecord& record : query) { + assert(record.BarcodeQuality() >= 42); +} diff --git 
a/docs/examples/code/PbiBarcodeReverseFilter.txt b/docs/examples/code/PbiBarcodeReverseFilter.txt new file mode 100644 index 0000000..24134f8 --- /dev/null +++ b/docs/examples/code/PbiBarcodeReverseFilter.txt @@ -0,0 +1,15 @@ +// single value +PbiFilter filter{ PbiBarcodeReverseFilter{50} }; +PbiFilterQuery query(filter); +for (const BamRecord& record : query) { + assert(record.BarcodeReverse() == 50); +} + +// whitelist +vector whitelist = { 50, 100 }; +PbiFilter filter{ PbiBarcodeReverseFilter{whitelist} }; +PbiFilterQuery query(filter); +for (const BamRecord& record : query) { + assert(record.BarcodeReverse() == 50 || record.BarcodeReverse() == 100); +} + diff --git a/docs/examples/code/PbiBarcodesFilter.txt b/docs/examples/code/PbiBarcodesFilter.txt new file mode 100644 index 0000000..a655c57 --- /dev/null +++ b/docs/examples/code/PbiBarcodesFilter.txt @@ -0,0 +1,6 @@ +PbiFilter filter{ PbiBarcodesFilter{17, 18} }; +PbiFilterQuery query(filter); +for (const BamRecord& record : query) { + assert(record.BarcodeForward() == 17 && + record.BarcodeReverse() == 18); +} diff --git a/docs/examples/code/PbiBuilder_WithReader.txt b/docs/examples/code/PbiBuilder_WithReader.txt new file mode 100644 index 0000000..e2748c2 --- /dev/null +++ b/docs/examples/code/PbiBuilder_WithReader.txt @@ -0,0 +1,30 @@ +// To simply create a PBI file from BAM, the following is the easiest method: +// +#include <pbbam/BamFile.h> +#include <pbbam/PbiFile.h> + +BamFile bamFile("data.bam"); +PbiFile::CreateFrom(bamFile); + + +// However if you need to perform additional operations while reading the BAM file, +// you can do something like the following: +// +{ + BamFile bamFile("data.bam"); + PbiBuilder builder(bamFile.PacBioIndexFilename(), + bamFile.Header().Sequences().size()); + BamReader reader(bamFile); + BamRecord b; + int64_t offset = reader.VirtualTell(); // first record's vOffset + while (reader.GetNext(b)) { + + // store PBI record entry & get next record's vOffset + builder.AddRecord(b, offset); + offset = 
reader.VirtualTell(); + + // ... additional stuff as needed ... + } + +} // <-- PBI data will only be written here, as PbiBuilder goes out of scope + diff --git a/docs/examples/code/PbiBuilder_WithWriter.txt b/docs/examples/code/PbiBuilder_WithWriter.txt new file mode 100644 index 0000000..0c7d6d1 --- /dev/null +++ b/docs/examples/code/PbiBuilder_WithWriter.txt @@ -0,0 +1,12 @@ +BamWriter writer(...); +PbiBuilder pbiBuilder(...); +int64_t vOffset; +BamRecord record; +while (...) { + + // ... populate record data ... + + // write record to BAM and add PBI entry + writer.Write(record, &vOffset); + pbiBuilder.AddRecord(record, vOffset); +} diff --git a/docs/examples/code/PbiFilterQuery.txt b/docs/examples/code/PbiFilterQuery.txt new file mode 100644 index 0000000..4914eab --- /dev/null +++ b/docs/examples/code/PbiFilterQuery.txt @@ -0,0 +1,22 @@ +// setup filter +PbiFilter filter; +filter.Add(PbiZmwFilter(42)); +filter.Add(PbiReadAccuracyFilter(0.9, Compare::GREATER_THAN_EQUAL)); + +// using C++11 range-based for loop +PbiFilterQuery query(filter, dataset); +for (const BamRecord& r : query) { + assert(r.HoleNumber() == 42); + assert(r.ReadAccuracy() >= 0.9); +} + +// OR + +// using iterators directly +PbiFilterQuery query(filter, dataset); +auto iter = query.cbegin(); +auto end = query.cend(); +for (; iter != end; ++iter) { + assert(iter->HoleNumber() == 42); + assert(iter->ReadAccuracy() >= 0.9); +} diff --git a/docs/examples/code/PbiFilter_Composition.txt b/docs/examples/code/PbiFilter_Composition.txt new file mode 100644 index 0000000..22cc6ff --- /dev/null +++ b/docs/examples/code/PbiFilter_Composition.txt @@ -0,0 +1,8 @@ +// (f1 && f2) || f3 + +PbiFilter f1; +PbiFilter f2; +PbiFilter intersect_f1_f2 = PbiFilter::Intersection(f1, f2); + +PbiFilter f3; +PbiFilter final = PbiFilter::Union(intersect_f1_f2, f3); diff --git a/docs/examples/code/PbiFilter_CustomFilter.txt b/docs/examples/code/PbiFilter_CustomFilter.txt new file mode 100644 index 0000000..f9cdd21 --- 
/dev/null +++ b/docs/examples/code/PbiFilter_CustomFilter.txt @@ -0,0 +1,21 @@ +struct MyCustomFilter +{ + bool Accepts(const PbiRawData& index, const size_t row) const + { + // Look up data for record at the provided row. Do any calculations + // necessary, then return whether that record passes your + // filter criteria. + + return true; + } +}; + +// use in composite filters +PbiFilter f; +f.Add(PbiMovieNameFilter("foo")); +f.Add(MyCustomFilter()); + +// pass directly to PbiFilterQuery +PbiFilterQuery query(MyCustomFilter(), "foo.bam"); +for (const BamRecord& record : query) + // ... do stuff ... diff --git a/docs/examples/code/PbiFilter_Interface.txt b/docs/examples/code/PbiFilter_Interface.txt new file mode 100644 index 0000000..0fea900 --- /dev/null +++ b/docs/examples/code/PbiFilter_Interface.txt @@ -0,0 +1 @@ +bool Accepts(const PbiRawData& index, const size_t row) const; diff --git a/docs/examples/code/PbiFilter_Intersection_Copy.txt b/docs/examples/code/PbiFilter_Intersection_Copy.txt new file mode 100644 index 0000000..ec0a7ac --- /dev/null +++ b/docs/examples/code/PbiFilter_Intersection_Copy.txt @@ -0,0 +1,3 @@ +PbiFilter result{ PbiFilter::INTERSECT }; +result.Add(filters); +return result; diff --git a/docs/examples/code/PbiFilter_Intersection_Move.txt b/docs/examples/code/PbiFilter_Intersection_Move.txt new file mode 100644 index 0000000..2b06106 --- /dev/null +++ b/docs/examples/code/PbiFilter_Intersection_Move.txt @@ -0,0 +1,3 @@ +PbiFilter result{ PbiFilter::INTERSECT }; +result.Add(std::move(filters)); +return result; diff --git a/docs/examples/code/PbiFilter_Union_Copy.txt b/docs/examples/code/PbiFilter_Union_Copy.txt new file mode 100644 index 0000000..7e2a192 --- /dev/null +++ b/docs/examples/code/PbiFilter_Union_Copy.txt @@ -0,0 +1,3 @@ +PbiFilter result{ PbiFilter::UNION }; +result.Add(filters); +return result; diff --git a/docs/examples/code/PbiFilter_Union_Move.txt b/docs/examples/code/PbiFilter_Union_Move.txt new file mode 100644 index 
0000000..2e98d91 --- /dev/null +++ b/docs/examples/code/PbiFilter_Union_Move.txt @@ -0,0 +1,3 @@ +PbiFilter result{ PbiFilter::UNION }; +result.Add(std::move(filters)); +return result; diff --git a/docs/examples/code/PbiIdentityFilter.txt b/docs/examples/code/PbiIdentityFilter.txt new file mode 100644 index 0000000..6fcb8d0 --- /dev/null +++ b/docs/examples/code/PbiIdentityFilter.txt @@ -0,0 +1,6 @@ +// single value +PbiFilter filter{ PbiIdentityFilter{ 0.5, Compare::GREATER_THAN_EQUAL } }; +PbiFilterQuery query(filter); +for (const BamRecord& record : query) { + // ... at least 50% of record was aligned ... +} diff --git a/docs/examples/code/PbiLocalContextFilter.txt b/docs/examples/code/PbiLocalContextFilter.txt new file mode 100644 index 0000000..0aaa3eb --- /dev/null +++ b/docs/examples/code/PbiLocalContextFilter.txt @@ -0,0 +1,22 @@ + +// -------------------- +// has adapter_before +// -------------------- + +PbiFilter filter{ PbiLocalContextFilter{LocalContextFlags::ADAPTER_BEFORE, Compare::CONTAINS} }; +PbiFilterQuery query(filter); +for (const BamRecord& record : query) { + const bool hasAdapterBefore = (record.LocalContextFlags() & LocalContextFlags::ADAPTER_BEFORE) != 0; + assert(hasAdapterBefore); +} + +// ---------------------------------- +// has any adapters, barcodes, etc. 
+// ---------------------------------- + +PbiFilter filter{ PbiLocalContextFilter{LocalContextFlags::NO_LOCAL_CONTEXT, Compare::NOT_EQUAL} }; +PbiFilterQuery query(filter); +for (const BamRecord& record : query) { + const bool hasContext = (record.LocalContextFlags() != LocalContextFlags::NO_LOCAL_CONTEXT); + assert(hasContext); +} diff --git a/docs/examples/code/PbiMapQualityFilter.txt b/docs/examples/code/PbiMapQualityFilter.txt new file mode 100644 index 0000000..67fb5dc --- /dev/null +++ b/docs/examples/code/PbiMapQualityFilter.txt @@ -0,0 +1,5 @@ +PbiFilter filter{ PbiMapQualityFilter{75, Compare::GREATER_THAN_EQUAL} }; +PbiFilterQuery query(filter); +for (const BamRecord& record : query) { + assert(record.MapQuality() >= 75); +} diff --git a/docs/examples/code/PbiMovieNameFilter.txt b/docs/examples/code/PbiMovieNameFilter.txt new file mode 100644 index 0000000..dd124e2 --- /dev/null +++ b/docs/examples/code/PbiMovieNameFilter.txt @@ -0,0 +1,14 @@ +// single value +PbiFilter filter{ PbiMovieNameFilter{ "foo" } }; +PbiFilterQuery query(filter); +for (const BamRecord& record : query) { + assert(record.MovieName() == "foo"); +} + +// whitelist +vector<string> whitelist = { "foo", "bar" }; +PbiFilter filter{ PbiMovieNameFilter{whitelist} }; +PbiFilterQuery query(filter); +for (const BamRecord& record : query) { + assert(record.MovieName() == "foo" || record.MovieName() == "bar"); +} diff --git a/docs/examples/code/PbiNumDeletedBasesFilter.txt b/docs/examples/code/PbiNumDeletedBasesFilter.txt new file mode 100644 index 0000000..e1e3d1f --- /dev/null +++ b/docs/examples/code/PbiNumDeletedBasesFilter.txt @@ -0,0 +1,6 @@ +PbiFilter filter{ PbiNumDeletedBasesFilter{50, Compare::LESS_THAN} }; +PbiFilterQuery query(filter); +for (const BamRecord& record : query) { + assert(record.NumDeletedBases() < 50); +} + diff --git a/docs/examples/code/PbiNumInsertedBasesFilter.txt b/docs/examples/code/PbiNumInsertedBasesFilter.txt new file mode 100644 index 0000000..ab385e4 --- /dev/null +++ 
b/docs/examples/code/PbiNumInsertedBasesFilter.txt @@ -0,0 +1,6 @@ +PbiFilter filter{ PbiNumInsertedBasesFilter{50, Compare::LESS_THAN} }; +PbiFilterQuery query(filter); +for (const BamRecord& record : query) { + assert(record.NumInsertedBases() < 50); +} + diff --git a/docs/examples/code/PbiNumMatchesFilter.txt b/docs/examples/code/PbiNumMatchesFilter.txt new file mode 100644 index 0000000..4e1b97d --- /dev/null +++ b/docs/examples/code/PbiNumMatchesFilter.txt @@ -0,0 +1,6 @@ +PbiFilter filter{ PbiNumMatchesFilter{2000, Compare::GREATER_THAN_EQUAL} }; +PbiFilterQuery query(filter); +for (const BamRecord& record : query) { + assert(record.NumMatches() >= 2000); +} + diff --git a/docs/examples/code/PbiNumMismatchesFilter.txt b/docs/examples/code/PbiNumMismatchesFilter.txt new file mode 100644 index 0000000..690e4a1 --- /dev/null +++ b/docs/examples/code/PbiNumMismatchesFilter.txt @@ -0,0 +1,6 @@ +PbiFilter filter{ PbiNumMismatchesFilter{500, Compare::LESS_THAN} }; +PbiFilterQuery query(filter); +for (const BamRecord& record : query) { + assert(record.NumMismatches() < 500); +} + diff --git a/docs/examples/code/PbiQueryEndFilter.txt b/docs/examples/code/PbiQueryEndFilter.txt new file mode 100644 index 0000000..f85166b --- /dev/null +++ b/docs/examples/code/PbiQueryEndFilter.txt @@ -0,0 +1,5 @@ +PbiFilter filter{ PbiQueryEndFilter{3000, Compare::GREATER_THAN} }; +PbiFilterQuery query(filter); +for (const BamRecord& record : query) { + assert(record.QueryEnd() > 3000); +} diff --git a/docs/examples/code/PbiQueryLengthFilter.txt b/docs/examples/code/PbiQueryLengthFilter.txt new file mode 100644 index 0000000..123412a --- /dev/null +++ b/docs/examples/code/PbiQueryLengthFilter.txt @@ -0,0 +1,5 @@ +PbiFilter filter{ PbiQueryLengthFilter{2000, Compare::GREATER_THAN} }; +PbiFilterQuery query(filter); +for (const BamRecord& record : query) { + assert( (record.QueryEnd() - record.QueryStart()) > 2000 ); +} diff --git a/docs/examples/code/PbiQueryNameFilter.txt 
b/docs/examples/code/PbiQueryNameFilter.txt new file mode 100644 index 0000000..f1e51c7 --- /dev/null +++ b/docs/examples/code/PbiQueryNameFilter.txt @@ -0,0 +1,15 @@ +// single value +PbiFilter filter{ PbiQueryNameFilter{ "movie_1/42/100_200" } }; +PbiFilterQuery query(filter); +for (const BamRecord& record : query) { + assert(record.FullName() == "movie_1/42/100_200"); +} + +// whitelist +vector whitelist = { "movie_1/42/100_200", "movie_3/24/300_500" }; +PbiFilter filter{ PbiQueryNameFilter{whitelist} }; +PbiFilterQuery query(filter); +for (const BamRecord& record : query) { + assert(record.FullName() == "movie_1/42/100_200" || + record.FullName() == "movie_3/24/300_500"); +} diff --git a/docs/examples/code/PbiQueryStartFilter.txt b/docs/examples/code/PbiQueryStartFilter.txt new file mode 100644 index 0000000..56353df --- /dev/null +++ b/docs/examples/code/PbiQueryStartFilter.txt @@ -0,0 +1,5 @@ +PbiFilter filter{ PbiQueryStartFilter{3000, Compare::GREATER_THAN} }; +PbiFilterQuery query(filter); +for (const BamRecord& record : query) { + assert(record.QueryStart() > 3000); +} diff --git a/docs/examples/code/PbiReadAccuracyFilter.txt b/docs/examples/code/PbiReadAccuracyFilter.txt new file mode 100644 index 0000000..dd2df32 --- /dev/null +++ b/docs/examples/code/PbiReadAccuracyFilter.txt @@ -0,0 +1,5 @@ +PbiFilter filter{ PbiReadAccuracyFilter{0.8, Compare::GREATER_THAN_EQUAL} }; +PbiFilterQuery query(filter); +for (const BamRecord& record : query) { + assert(record.ReadAccuracy() >= 0.8); +} diff --git a/docs/examples/code/PbiReadGroupFilter.txt b/docs/examples/code/PbiReadGroupFilter.txt new file mode 100644 index 0000000..9af096d --- /dev/null +++ b/docs/examples/code/PbiReadGroupFilter.txt @@ -0,0 +1,64 @@ +// ------------------------- +// numeric ID +// ------------------------- + +// single value +PbiFilter filter{ PbiReadGroupFilter{ 2458765 } }; +PbiFilterQuery query(filter); +for (const BamRecord& record : query) { + assert(record.ReadGroupNumericId() == 
2458765); +} + +// whitelist +vector whitelist = { 2458765, -32143 }; +PbiFilter filter{ PbiReadGroupFilter{whitelist} }; +PbiFilterQuery query(filter); +for (const BamRecord& record : query) { + assert(record.ReadGroupNumericId() == 2458765 || + record.ReadGroupNumericId() == -32143); +} + +// ------------------------- +// printable ID +// ------------------------- + +// single value +PbiFilter filter{ PbiReadGroupFilter{ "12B33F00" } }; +PbiFilterQuery query(filter); +for (const BamRecord& record : query) { + assert(record.ReadGroupId() == "12B33F00"); +} + +// whitelist +vector whitelist = { "12B33F00", "123ABC77" }; +PbiFilter filter{ PbiReadGroupFilter{whitelist} }; +PbiFilterQuery query(filter); +for (const BamRecord& record : query) { + assert(record.ReadGroupId() == "12B33F00" || + record.ReadGroupId() == "123ABC77"); +} + + +// ------------------------- +// read group +// ------------------------- + +BamFile file("foo.bam"); +BamHeader header = file.Header(); +assert(header.ReadGroups().size() > 1); + +// single value +PbiFilter filter{ PbiReadGroupFilter{ header.ReadGroups()[0] } }; +PbiFilterQuery query(filter); +for (const BamRecord& record : query) { + assert(record.ReadGroup() == header.ReadGroups()[0]); +} + +// whitelist +vector whitelist = { header.ReadGroups()[0], header.ReadGroups()[1] }; +PbiFilter filter{ PbiReadGroupFilter{whitelist} }; +PbiFilterQuery query(filter); +for (const BamRecord& record : query) { + assert(record.ReadGroup() == header.ReadGroups()[0] || + record.ReadGroup() == header.ReadGroups()[1]); +} diff --git a/docs/examples/code/PbiReferenceEndFilter.txt b/docs/examples/code/PbiReferenceEndFilter.txt new file mode 100644 index 0000000..ce005c6 --- /dev/null +++ b/docs/examples/code/PbiReferenceEndFilter.txt @@ -0,0 +1,5 @@ +PbiFilter filter{ PbiReferenceEndFilter{ 2000 } }; +PbiFilterQuery query(filter); +for (const BamRecord& record : query) { + assert(record.ReferenceEnd() == 2000); +} diff --git 
a/docs/examples/code/PbiReferenceIdFilter.txt b/docs/examples/code/PbiReferenceIdFilter.txt new file mode 100644 index 0000000..d963d28 --- /dev/null +++ b/docs/examples/code/PbiReferenceIdFilter.txt @@ -0,0 +1,16 @@ +// single value +PbiFilter filter{ PbiReferenceIdFilter{ 4 } }; +PbiFilterQuery query(filter); +for (const BamRecord& record : query) { + assert(record.ReferenceId() == 4); +} + +// whitelist +vector whitelist = { 0, 1 }; +PbiFilter filter{ PbiReferenceIdFilter{whitelist} }; +PbiFilterQuery query(filter); +for (const BamRecord& record : query) { + assert(record.ReferenceId() == 0 || + record.ReferenceId() == 1); +} + diff --git a/docs/examples/code/PbiReferenceNameFilter.txt b/docs/examples/code/PbiReferenceNameFilter.txt new file mode 100644 index 0000000..c86b14a --- /dev/null +++ b/docs/examples/code/PbiReferenceNameFilter.txt @@ -0,0 +1,15 @@ +// single value +PbiFilter filter{ PbiReferenceNameFilter{ "chr1" } }; +PbiFilterQuery query(filter); +for (const BamRecord& record : query) { + assert(record.ReferenceName() == "chr1"); +} + +// whitelist +vector whitelist = { "chr1", "chr5" }; +PbiFilter filter{ PbiReferenceNameFilter{whitelist} }; +PbiFilterQuery query(filter); +for (const BamRecord& record : query) { + assert(record.ReferenceName() == "chr1" || + record.ReferenceName() == "chr5"); +} diff --git a/docs/examples/code/PbiReferenceStartFilter.txt b/docs/examples/code/PbiReferenceStartFilter.txt new file mode 100644 index 0000000..d3ffdbb --- /dev/null +++ b/docs/examples/code/PbiReferenceStartFilter.txt @@ -0,0 +1,5 @@ +PbiFilter filter{ PbiReferenceStartFilter{ 2000 } }; +PbiFilterQuery query(filter); +for (const BamRecord& record : query) { + assert(record.ReferenceStart() == 2000); +} diff --git a/docs/examples/code/PbiZmwFilter.txt b/docs/examples/code/PbiZmwFilter.txt new file mode 100644 index 0000000..c63a804 --- /dev/null +++ b/docs/examples/code/PbiZmwFilter.txt @@ -0,0 +1,16 @@ +// single value +PbiFilter filter{ PbiZmwFilter{ 4000 
} }; +PbiFilterQuery query(filter); +for (const BamRecord& record : query) { + assert(record.HoleNumber() == 4000); +} + +// whitelist +vector whitelist = { 4000, 8000 }; +PbiFilter filter{ PbiZmwFilter{whitelist} }; +PbiFilterQuery query(filter); +for (const BamRecord& record : query) { + assert(record.HoleNumber() == 4000 || + record.HoleNumber() == 8000); +} + diff --git a/docs/examples/code/ReadAccuracyQuery.txt b/docs/examples/code/ReadAccuracyQuery.txt new file mode 100644 index 0000000..5b0404f --- /dev/null +++ b/docs/examples/code/ReadAccuracyQuery.txt @@ -0,0 +1,15 @@ +// using C++11 range-based for loop +ReadAccuracyQuery query(0.9, Compare::GREATER_THAN_EQUAL, dataset); +for (const BamRecord& r : query) { + assert(r.ReadAccuracy() >= 0.9); +} + +// OR + +// using iterators directly +ReadAccuracyQuery query(0.9, Compare::GREATER_THAN_EQUAL, dataset); +auto iter = query.cbegin(); +auto end = query.cend(); +for (; iter != end; ++iter) { + assert(iter->ReadAccuracy() >= 0.9); +} diff --git a/docs/examples/code/SubreadLengthQuery.txt b/docs/examples/code/SubreadLengthQuery.txt new file mode 100644 index 0000000..466a1d9 --- /dev/null +++ b/docs/examples/code/SubreadLengthQuery.txt @@ -0,0 +1,15 @@ +// using C++11 range-based for loop +SubreadLengthQuery query(500, Compare::GREATER_THAN_EQUAL, dataset); +for (const BamRecord& r : query) { + assert((r.QueryEnd() - r.QueryStart()) >= 500); +} + +// OR + +// using iterators directly +SubreadLengthQuery query(500, Compare::GREATER_THAN_EQUAL, dataset); +auto iter = query.cbegin(); +auto end = query.cend(); +for (; iter != end; ++iter) { + assert((iter->QueryEnd() - iter->QueryStart()) >= 500); +} diff --git a/docs/examples/code/Tag_AsciiCtor.txt b/docs/examples/code/Tag_AsciiCtor.txt new file mode 100644 index 0000000..057d22f --- /dev/null +++ b/docs/examples/code/Tag_AsciiCtor.txt @@ -0,0 +1,10 @@ +// One-step construction +// +// This is useful in situations that require a const Tag. 
+// +const auto t = Tag('A', TagModifier::ASCII_CHAR); + +// or two-step construction +auto t = Tag('A'); +t.Modifier(TagModifier::ASCII_CHAR); + diff --git a/docs/examples/code/WhitelistedZmwReadStitcher.txt b/docs/examples/code/WhitelistedZmwReadStitcher.txt new file mode 100644 index 0000000..a94c27b --- /dev/null +++ b/docs/examples/code/WhitelistedZmwReadStitcher.txt @@ -0,0 +1,6 @@ +vector zmws = { ... }; +WhitelistedZmwReadStitcher reader(zmws, "primary.bam", "scraps.bam"); +while(reader.HasNext()) { + auto virtualRecord = reader.Next(); + // ... do stuff ... +} diff --git a/docs/examples/code/ZmwGroupQuery.txt b/docs/examples/code/ZmwGroupQuery.txt new file mode 100644 index 0000000..1d728ac --- /dev/null +++ b/docs/examples/code/ZmwGroupQuery.txt @@ -0,0 +1,23 @@ +bool allHoleNumbersEqual(const vector& group) +{ + if (group.empty()) + return true; + const auto firstHoleNumber = group[0].HoleNumber(); + for (size_t i = 1; i < group.size(); ++i) { + if (group[i].HoleNumber() != firstHoleNumber) + return false; + } + return true; +} + +vector whitelist = { 50, 100 }; +ZmwGroupQuery query(whitelist, dataset); +for(const vector& group : query) { + + assert(allHoleNumbersEqual(group)); + + for (const BamRecord& record : group) { + assert(record.HoleNumber() == 50 || + record.HoleNumber() == 100); + } +} diff --git a/docs/examples/code/ZmwQuery.txt b/docs/examples/code/ZmwQuery.txt new file mode 100644 index 0000000..59c22c4 --- /dev/null +++ b/docs/examples/code/ZmwQuery.txt @@ -0,0 +1,6 @@ +vector whitelist = { 50, 100 }; +ZmwQuery query(whitelist, dataset); +for (const BamRecord& record : query) { + assert(record.HoleNumber() == 50 || + record.HoleNumber() == 100); +} diff --git a/docs/examples/plaintext/AlignmentPrinterOutput.txt b/docs/examples/plaintext/AlignmentPrinterOutput.txt new file mode 100644 index 0000000..21d948b --- /dev/null +++ b/docs/examples/plaintext/AlignmentPrinterOutput.txt @@ -0,0 +1,13 @@ +Read : singleInsertion2 +Reference : 
lambda_NEB3011 + +Read-length : 49 +Concordance : 0.96 + +5210 : GGCTGCAGTGTACAGCGGTCAGGAGGCC-ATTGATGCCGG : 5249 + |||||||| ||||||||||||||||||| ||||||||||| + 0 : GGCTGCAG-GTACAGCGGTCAGGAGGCCAATTGATGCCGG : 39 + +5249 : ACTGGCTGAT : 5259 + |||||||||| + 39 : ACTGGCTGAT : 49 diff --git a/docs/examples/plaintext/PbiFilter_DataSetXmlFilters.txt b/docs/examples/plaintext/PbiFilter_DataSetXmlFilters.txt new file mode 100644 index 0000000..5b5e8c2 --- /dev/null +++ b/docs/examples/plaintext/PbiFilter_DataSetXmlFilters.txt @@ -0,0 +1,14 @@ + + + + # A + # B + + + + + # C + # D + + + diff --git a/docs/source/api/Accuracy.rst b/docs/source/api/Accuracy.rst new file mode 100644 index 0000000..f88b722 --- /dev/null +++ b/docs/source/api/Accuracy.rst @@ -0,0 +1,11 @@ +Accuracy +======== + +.. code-block:: cpp + + #include + +.. doxygenclass:: PacBio::BAM::Accuracy + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/AlignmentPrinter.rst b/docs/source/api/AlignmentPrinter.rst new file mode 100644 index 0000000..ef0b191 --- /dev/null +++ b/docs/source/api/AlignmentPrinter.rst @@ -0,0 +1,11 @@ +AlignmentPrinter +================ + +.. code-block:: cpp + + #include + +.. doxygenclass:: PacBio::BAM::AlignmentPrinter + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/AlignmentSet.rst b/docs/source/api/AlignmentSet.rst new file mode 100644 index 0000000..1817962 --- /dev/null +++ b/docs/source/api/AlignmentSet.rst @@ -0,0 +1,11 @@ +AlignmentSet +============ + +.. code-block:: cpp + + #include + +.. 
doxygenclass:: PacBio::BAM::AlignmentSet + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/BaiIndexedBamReader.rst b/docs/source/api/BaiIndexedBamReader.rst new file mode 100644 index 0000000..aab136f --- /dev/null +++ b/docs/source/api/BaiIndexedBamReader.rst @@ -0,0 +1,11 @@ +BaiIndexedBamReader +=================== + +.. code-block:: cpp + + #include + +.. doxygenclass:: PacBio::BAM::BaiIndexedBamReader + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/BamFile.rst b/docs/source/api/BamFile.rst new file mode 100644 index 0000000..c7e48fb --- /dev/null +++ b/docs/source/api/BamFile.rst @@ -0,0 +1,11 @@ +BamFile +======= + +.. code-block:: cpp + + #include + +.. doxygenclass:: PacBio::BAM::BamFile + :members: + :protected-members: + :undoc-members: diff --git a/docs/source/api/BamHeader.rst b/docs/source/api/BamHeader.rst new file mode 100644 index 0000000..6cf06af --- /dev/null +++ b/docs/source/api/BamHeader.rst @@ -0,0 +1,11 @@ +BamHeader +========= + +.. code-block:: cpp + + #include + +.. doxygenclass:: PacBio::BAM::BamHeader + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/BamReader.rst b/docs/source/api/BamReader.rst new file mode 100644 index 0000000..e0b6f3c --- /dev/null +++ b/docs/source/api/BamReader.rst @@ -0,0 +1,11 @@ +BamReader +========= + +.. code-block:: cpp + + #include + +.. doxygenclass:: PacBio::BAM::BamReader + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/BamRecord.rst b/docs/source/api/BamRecord.rst new file mode 100644 index 0000000..a749775 --- /dev/null +++ b/docs/source/api/BamRecord.rst @@ -0,0 +1,17 @@ +BamRecord +========= + +.. code-block:: cpp + + #include + +.. doxygenenum:: PacBio::BAM::ClipType + +.. doxygenenum:: PacBio::BAM::RecordType + +.. doxygenenum:: PacBio::BAM::FrameEncodingType + +.. 
doxygenclass:: PacBio::BAM::BamRecord + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/BamRecordBuilder.rst b/docs/source/api/BamRecordBuilder.rst new file mode 100644 index 0000000..ce477b4 --- /dev/null +++ b/docs/source/api/BamRecordBuilder.rst @@ -0,0 +1,11 @@ +BamRecordBuilder +================ + +.. code-block:: cpp + + #include + +.. doxygenclass:: PacBio::BAM::BamRecordBuilder + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/BamRecordImpl.rst b/docs/source/api/BamRecordImpl.rst new file mode 100644 index 0000000..92b6759 --- /dev/null +++ b/docs/source/api/BamRecordImpl.rst @@ -0,0 +1,11 @@ +BamRecordImpl +============= + +.. code-block:: cpp + + #include + +.. doxygenclass:: PacBio::BAM::BamRecordImpl + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/BamRecordView.rst b/docs/source/api/BamRecordView.rst new file mode 100644 index 0000000..2bc8fc4 --- /dev/null +++ b/docs/source/api/BamRecordView.rst @@ -0,0 +1,11 @@ +BamRecordView +============= + +.. code-block:: cpp + + #include + +.. doxygenclass:: PacBio::BAM::BamRecordView + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/BamTagCodec.rst b/docs/source/api/BamTagCodec.rst new file mode 100644 index 0000000..9307421 --- /dev/null +++ b/docs/source/api/BamTagCodec.rst @@ -0,0 +1,11 @@ +BamTagCodec +=========== + +.. code-block:: cpp + + #include + +.. doxygenclass:: PacBio::BAM::BamTagCodec + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/BamWriter.rst b/docs/source/api/BamWriter.rst new file mode 100644 index 0000000..2e2951b --- /dev/null +++ b/docs/source/api/BamWriter.rst @@ -0,0 +1,11 @@ +BamWriter +========= + +.. code-block:: cpp + + #include + +.. 
doxygenclass:: PacBio::BAM::BamWriter + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/BarcodeLookupData.rst b/docs/source/api/BarcodeLookupData.rst new file mode 100644 index 0000000..2dac47d --- /dev/null +++ b/docs/source/api/BarcodeLookupData.rst @@ -0,0 +1,11 @@ +BarcodeLookupData +================= + +.. code-block:: cpp + + #include + +.. doxygenclass:: PacBio::BAM::BarcodeLookupData + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/BarcodeQuery.rst b/docs/source/api/BarcodeQuery.rst new file mode 100644 index 0000000..5836059 --- /dev/null +++ b/docs/source/api/BarcodeQuery.rst @@ -0,0 +1,11 @@ +BarcodeQuery +============ + +.. code-block:: cpp + + #include + +.. doxygenclass:: PacBio::BAM::BarcodeQuery + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/BarcodeSet.rst b/docs/source/api/BarcodeSet.rst new file mode 100644 index 0000000..a7ee056 --- /dev/null +++ b/docs/source/api/BarcodeSet.rst @@ -0,0 +1,11 @@ +BarcodeSet +========== + +.. code-block:: cpp + + #include + +.. doxygenclass:: PacBio::BAM::BarcodeSet + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/BasicLookupData.rst b/docs/source/api/BasicLookupData.rst new file mode 100644 index 0000000..b991fdf --- /dev/null +++ b/docs/source/api/BasicLookupData.rst @@ -0,0 +1,11 @@ +BasicLookupData +=============== + +.. code-block:: cpp + + #include + +.. doxygenclass:: PacBio::BAM::BasicLookupData + :members: + :protected-members: + :undoc-members: diff --git a/docs/source/api/Cigar.rst b/docs/source/api/Cigar.rst new file mode 100644 index 0000000..cea30d5 --- /dev/null +++ b/docs/source/api/Cigar.rst @@ -0,0 +1,11 @@ +Cigar +===== + +.. code-block:: cpp + + #include + +.. 
doxygenclass:: PacBio::BAM::Cigar + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/CigarOperation.rst b/docs/source/api/CigarOperation.rst new file mode 100644 index 0000000..856400a --- /dev/null +++ b/docs/source/api/CigarOperation.rst @@ -0,0 +1,13 @@ +CigarOperation +============== + +.. code-block:: cpp + + #include + +.. doxygenenum:: PacBio::BAM::CigarOperationType + +.. doxygenclass:: PacBio::BAM::CigarOperation + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/Compare.rst b/docs/source/api/Compare.rst new file mode 100644 index 0000000..bb28a7e --- /dev/null +++ b/docs/source/api/Compare.rst @@ -0,0 +1,8 @@ +Compare +======= + +.. code-block:: cpp + + #include + +.. doxygenfile:: Compare.h \ No newline at end of file diff --git a/docs/source/api/Config.rst b/docs/source/api/Config.rst new file mode 100644 index 0000000..c4be9e4 --- /dev/null +++ b/docs/source/api/Config.rst @@ -0,0 +1,8 @@ +Config +======= + +.. code-block:: cpp + + #include + +.. doxygenfile:: Config.h \ No newline at end of file diff --git a/docs/source/api/ConsensusAlignmentSet.rst b/docs/source/api/ConsensusAlignmentSet.rst new file mode 100644 index 0000000..bc5a7e5 --- /dev/null +++ b/docs/source/api/ConsensusAlignmentSet.rst @@ -0,0 +1,11 @@ +ConsensusAlignmentSet +===================== + +.. code-block:: cpp + + #include + +.. doxygenclass:: PacBio::BAM::ConsensusAlignmentSet + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/ConsensusReadSet.rst b/docs/source/api/ConsensusReadSet.rst new file mode 100644 index 0000000..846698d --- /dev/null +++ b/docs/source/api/ConsensusReadSet.rst @@ -0,0 +1,11 @@ +ConsensusReadSet +================ + +.. code-block:: cpp + + #include + +.. 
doxygenclass:: PacBio::BAM::ConsensusReadSet + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/ContigSet.rst b/docs/source/api/ContigSet.rst new file mode 100644 index 0000000..96bb20b --- /dev/null +++ b/docs/source/api/ContigSet.rst @@ -0,0 +1,11 @@ +ContigSet +========= + +.. code-block:: cpp + + #include + +.. doxygenclass:: PacBio::BAM::ContigSet + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/DataSet.rst b/docs/source/api/DataSet.rst new file mode 100644 index 0000000..8b3f0db --- /dev/null +++ b/docs/source/api/DataSet.rst @@ -0,0 +1,11 @@ +DataSet +======= + +.. code-block:: cpp + + #include + +.. doxygenclass:: PacBio::BAM::DataSet + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/DataSetBase.rst b/docs/source/api/DataSetBase.rst new file mode 100644 index 0000000..f23fbb5 --- /dev/null +++ b/docs/source/api/DataSetBase.rst @@ -0,0 +1,11 @@ +DataSetBase +=========== + +.. code-block:: cpp + + #include + +.. doxygenclass:: PacBio::BAM::DataSetBase + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/DataSetMetadata.rst b/docs/source/api/DataSetMetadata.rst new file mode 100644 index 0000000..eea260d --- /dev/null +++ b/docs/source/api/DataSetMetadata.rst @@ -0,0 +1,11 @@ +DataSetMetadata +=============== + +.. code-block:: cpp + + #include + +.. doxygenclass:: PacBio::BAM::DataSetMetadata + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/EntireFileQuery.rst b/docs/source/api/EntireFileQuery.rst new file mode 100644 index 0000000..4e7b86b --- /dev/null +++ b/docs/source/api/EntireFileQuery.rst @@ -0,0 +1,11 @@ +EntireFileQuery +=============== + +.. code-block:: cpp + + #include + +.. 
doxygenclass:: PacBio::BAM::EntireFileQuery + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/ExtensionElement.rst b/docs/source/api/ExtensionElement.rst new file mode 100644 index 0000000..980303e --- /dev/null +++ b/docs/source/api/ExtensionElement.rst @@ -0,0 +1,11 @@ +ExtensionElement +================ + +.. code-block:: cpp + + #include + +.. doxygenclass:: PacBio::BAM::ExtensionElement + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/Extensions.rst b/docs/source/api/Extensions.rst new file mode 100644 index 0000000..6704807 --- /dev/null +++ b/docs/source/api/Extensions.rst @@ -0,0 +1,11 @@ +Extensions +================ + +.. code-block:: cpp + + #include + +.. doxygenclass:: PacBio::BAM::Extensions + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/ExternalResource.rst b/docs/source/api/ExternalResource.rst new file mode 100644 index 0000000..03ab0d3 --- /dev/null +++ b/docs/source/api/ExternalResource.rst @@ -0,0 +1,11 @@ +ExternalResource +================ + +.. code-block:: cpp + + #include + +.. doxygenclass:: PacBio::BAM::ExternalResource + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/ExternalResources.rst b/docs/source/api/ExternalResources.rst new file mode 100644 index 0000000..bd72ea4 --- /dev/null +++ b/docs/source/api/ExternalResources.rst @@ -0,0 +1,11 @@ +ExternalResources +================= + +.. code-block:: cpp + + #include + +.. doxygenclass:: PacBio::BAM::ExternalResources + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/FileIndex.rst b/docs/source/api/FileIndex.rst new file mode 100644 index 0000000..c117214 --- /dev/null +++ b/docs/source/api/FileIndex.rst @@ -0,0 +1,11 @@ +FileIndex +========= + +.. code-block:: cpp + + #include + +.. 
doxygenclass:: PacBio::BAM::FileIndex + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/FileIndices.rst b/docs/source/api/FileIndices.rst new file mode 100644 index 0000000..b25720c --- /dev/null +++ b/docs/source/api/FileIndices.rst @@ -0,0 +1,11 @@ +FileIndices +=========== + +.. code-block:: cpp + + #include + +.. doxygenclass:: PacBio::BAM::FileIndices + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/Filter.rst b/docs/source/api/Filter.rst new file mode 100644 index 0000000..6faa8aa --- /dev/null +++ b/docs/source/api/Filter.rst @@ -0,0 +1,11 @@ +Filter +====== + +.. code-block:: cpp + + #include + +.. doxygenclass:: PacBio::BAM::Filter + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/Filters.rst b/docs/source/api/Filters.rst new file mode 100644 index 0000000..7ea1620 --- /dev/null +++ b/docs/source/api/Filters.rst @@ -0,0 +1,11 @@ +Filters +======= + +.. code-block:: cpp + + #include + +.. doxygenclass:: PacBio::BAM::Filters + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/Frames.rst b/docs/source/api/Frames.rst new file mode 100644 index 0000000..cf260f2 --- /dev/null +++ b/docs/source/api/Frames.rst @@ -0,0 +1,11 @@ +Frames +====== + +.. code-block:: cpp + + #include + +.. doxygenclass:: PacBio::BAM::Frames + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/GenomicInterval.rst b/docs/source/api/GenomicInterval.rst new file mode 100644 index 0000000..811b83a --- /dev/null +++ b/docs/source/api/GenomicInterval.rst @@ -0,0 +1,11 @@ +GenomicInterval +=============== + +.. code-block:: cpp + + #include + +.. 
doxygenclass:: PacBio::BAM::GenomicInterval + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/GenomicIntervalCompositeBamReader.rst b/docs/source/api/GenomicIntervalCompositeBamReader.rst new file mode 100644 index 0000000..f658621 --- /dev/null +++ b/docs/source/api/GenomicIntervalCompositeBamReader.rst @@ -0,0 +1,11 @@ +GenomicIntervalCompositeBamReader +================================= + +.. code-block:: cpp + + #include + +.. doxygenclass:: PacBio::BAM::GenomicIntervalCompositeBamReader + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/GenomicIntervalQuery.rst b/docs/source/api/GenomicIntervalQuery.rst new file mode 100644 index 0000000..7bae558 --- /dev/null +++ b/docs/source/api/GenomicIntervalQuery.rst @@ -0,0 +1,11 @@ +GenomicIntervalQuery +==================== + +.. code-block:: cpp + + #include + +.. doxygenclass:: PacBio::BAM::GenomicIntervalQuery + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/HdfSubreadSet.rst b/docs/source/api/HdfSubreadSet.rst new file mode 100644 index 0000000..88bf008 --- /dev/null +++ b/docs/source/api/HdfSubreadSet.rst @@ -0,0 +1,11 @@ +HdfSubreadSet +============= + +.. code-block:: cpp + + #include + +.. doxygenclass:: PacBio::BAM::HdfSubreadSet + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/IndexResultBlock.rst b/docs/source/api/IndexResultBlock.rst new file mode 100644 index 0000000..fac804a --- /dev/null +++ b/docs/source/api/IndexResultBlock.rst @@ -0,0 +1,17 @@ +IndexResultBlock +================ + +.. code-block:: cpp + + #include + +.. doxygenstruct:: PacBio::BAM::IndexResultBlock + :members: + :protected-members: + :undoc-members: + +.. doxygentypedef:: PacBio::BAM::IndexResultBlocks + +.. doxygentypedef:: PacBio::BAM::IndexList + +.. 
doxygentypedef:: PacBio::BAM::IndexRange \ No newline at end of file diff --git a/docs/source/api/IndexedFastaReader.rst b/docs/source/api/IndexedFastaReader.rst new file mode 100644 index 0000000..7c46064 --- /dev/null +++ b/docs/source/api/IndexedFastaReader.rst @@ -0,0 +1,11 @@ +IndexedFastaReader +================== + +.. code-block:: cpp + + #include + +.. doxygenclass:: PacBio::BAM::IndexedFastaReader + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/Interval.rst b/docs/source/api/Interval.rst new file mode 100644 index 0000000..f506a19 --- /dev/null +++ b/docs/source/api/Interval.rst @@ -0,0 +1,11 @@ +Interval +======== + +.. code-block:: cpp + + #include + +.. doxygenclass:: PacBio::BAM::Interval + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/InvalidSequencingChemistryException.rst b/docs/source/api/InvalidSequencingChemistryException.rst new file mode 100644 index 0000000..d521ecc --- /dev/null +++ b/docs/source/api/InvalidSequencingChemistryException.rst @@ -0,0 +1,11 @@ +InvalidSequencingChemistryException +=================================== + +.. code-block:: cpp + + #include + +.. doxygenclass:: PacBio::BAM::InvalidSequencingChemistryException + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/LocalContextFlags.rst b/docs/source/api/LocalContextFlags.rst new file mode 100644 index 0000000..8cd63be --- /dev/null +++ b/docs/source/api/LocalContextFlags.rst @@ -0,0 +1,8 @@ +LocalContextFlags +================= + +.. code-block:: cpp + + #include + +.. doxygenenum:: PacBio::BAM::LocalContextFlags diff --git a/docs/source/api/MappedLookupData.rst b/docs/source/api/MappedLookupData.rst new file mode 100644 index 0000000..7cf3c8b --- /dev/null +++ b/docs/source/api/MappedLookupData.rst @@ -0,0 +1,11 @@ +MappedLookupData +================ + +.. code-block:: cpp + + #include + +.. 
doxygenclass:: PacBio::BAM::MappedLookupData + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/NamespaceInfo.rst b/docs/source/api/NamespaceInfo.rst new file mode 100644 index 0000000..c7613ec --- /dev/null +++ b/docs/source/api/NamespaceInfo.rst @@ -0,0 +1,11 @@ +NamespaceInfo +============= + +.. code-block:: cpp + + #include + +.. doxygenclass:: PacBio::BAM::NamespaceInfo + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/NamespaceRegistry.rst b/docs/source/api/NamespaceRegistry.rst new file mode 100644 index 0000000..2f8f9a7 --- /dev/null +++ b/docs/source/api/NamespaceRegistry.rst @@ -0,0 +1,11 @@ +NamespaceRegistry +================= + +.. code-block:: cpp + + #include + +.. doxygenclass:: PacBio::BAM::NamespaceRegistry + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/OrderedLookup.rst b/docs/source/api/OrderedLookup.rst new file mode 100644 index 0000000..d5b81b6 --- /dev/null +++ b/docs/source/api/OrderedLookup.rst @@ -0,0 +1,11 @@ +OrderedLookup +============= + +.. code-block:: cpp + + #include + +.. doxygenclass:: PacBio::BAM::OrderedLookup + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/Orientation.rst b/docs/source/api/Orientation.rst new file mode 100644 index 0000000..e9bbc42 --- /dev/null +++ b/docs/source/api/Orientation.rst @@ -0,0 +1,8 @@ +Orientation +=========== + +.. code-block:: cpp + + #include + +.. doxygenenum:: PacBio::BAM::Orientation diff --git a/docs/source/api/ParentTool.rst b/docs/source/api/ParentTool.rst new file mode 100644 index 0000000..e2ffa1b --- /dev/null +++ b/docs/source/api/ParentTool.rst @@ -0,0 +1,11 @@ +ParentTool +========== + +.. code-block:: cpp + + #include + +.. 
doxygenclass:: PacBio::BAM::ParentTool + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/PbiBuilder.rst b/docs/source/api/PbiBuilder.rst new file mode 100644 index 0000000..d795d0f --- /dev/null +++ b/docs/source/api/PbiBuilder.rst @@ -0,0 +1,11 @@ +PbiBuilder +========== + +.. code-block:: cpp + + #include + +.. doxygenclass:: PacBio::BAM::PbiBuilder + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/PbiFile.rst b/docs/source/api/PbiFile.rst new file mode 100644 index 0000000..5a8b85a --- /dev/null +++ b/docs/source/api/PbiFile.rst @@ -0,0 +1,14 @@ +PbiFile +======= + +.. code-block:: cpp + + #include + +.. doxygenenum:: PacBio::BAM::PbiFile::Section + +.. doxygentypedef:: PacBio::BAM::PbiFile::Sections + +.. doxygenenum:: PacBio::BAM::PbiFile::VersionEnum + +.. doxygenfunction:: PacBio::BAM::PbiFile::CreateFrom diff --git a/docs/source/api/PbiFilter.rst b/docs/source/api/PbiFilter.rst new file mode 100644 index 0000000..261498b --- /dev/null +++ b/docs/source/api/PbiFilter.rst @@ -0,0 +1,11 @@ +PbiFilter +========= + +.. code-block:: cpp + + #include + +.. doxygenclass:: PacBio::BAM::PbiFilter + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/PbiFilterCompositeBamReader.rst b/docs/source/api/PbiFilterCompositeBamReader.rst new file mode 100644 index 0000000..7a69df3 --- /dev/null +++ b/docs/source/api/PbiFilterCompositeBamReader.rst @@ -0,0 +1,11 @@ +PbiFilterCompositeBamReader +=========================== + +.. code-block:: cpp + + #include + +.. 
doxygenclass:: PacBio::BAM::PbiFilterCompositeBamReader + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/PbiFilterQuery.rst b/docs/source/api/PbiFilterQuery.rst new file mode 100644 index 0000000..75bbc12 --- /dev/null +++ b/docs/source/api/PbiFilterQuery.rst @@ -0,0 +1,11 @@ +PbiFilterQuery +============== + +.. code-block:: cpp + + #include + +.. doxygenclass:: PacBio::BAM::PbiFilterQuery + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/PbiFilterTypes.rst b/docs/source/api/PbiFilterTypes.rst new file mode 100644 index 0000000..052389b --- /dev/null +++ b/docs/source/api/PbiFilterTypes.rst @@ -0,0 +1,8 @@ +PbiFilterTypes +============== + +.. code-block:: cpp + + #include + +.. doxygenfile:: PbiFilterTypes.h \ No newline at end of file diff --git a/docs/source/api/PbiIndex.rst b/docs/source/api/PbiIndex.rst new file mode 100644 index 0000000..811bc68 --- /dev/null +++ b/docs/source/api/PbiIndex.rst @@ -0,0 +1,11 @@ +PbiIndex +======== + +.. code-block:: cpp + + #include + +.. doxygenclass:: PacBio::BAM::PbiIndex + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/PbiIndexedBamReader.rst b/docs/source/api/PbiIndexedBamReader.rst new file mode 100644 index 0000000..5450c8a --- /dev/null +++ b/docs/source/api/PbiIndexedBamReader.rst @@ -0,0 +1,11 @@ +PbiIndexedBamReader +=================== + +.. code-block:: cpp + + #include + +.. doxygenclass:: PacBio::BAM::PbiIndexedBamReader + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/PbiRawBarcodeData.rst b/docs/source/api/PbiRawBarcodeData.rst new file mode 100644 index 0000000..c72ebfb --- /dev/null +++ b/docs/source/api/PbiRawBarcodeData.rst @@ -0,0 +1,11 @@ +PbiRawBarcodeData +================= + +.. code-block:: cpp + + #include + +.. 
doxygenclass:: PacBio::BAM::PbiRawBarcodeData + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/PbiRawBasicData.rst b/docs/source/api/PbiRawBasicData.rst new file mode 100644 index 0000000..2282387 --- /dev/null +++ b/docs/source/api/PbiRawBasicData.rst @@ -0,0 +1,11 @@ +PbiRawBasicData +=============== + +.. code-block:: cpp + + #include + +.. doxygenclass:: PacBio::BAM::PbiRawBasicData + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/PbiRawData.rst b/docs/source/api/PbiRawData.rst new file mode 100644 index 0000000..1a974e8 --- /dev/null +++ b/docs/source/api/PbiRawData.rst @@ -0,0 +1,11 @@ +PbiRawData +========== + +.. code-block:: cpp + + #include + +.. doxygenclass:: PacBio::BAM::PbiRawData + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/PbiRawMappedData.rst b/docs/source/api/PbiRawMappedData.rst new file mode 100644 index 0000000..42e1de1 --- /dev/null +++ b/docs/source/api/PbiRawMappedData.rst @@ -0,0 +1,11 @@ +PbiRawMappedData +================ + +.. code-block:: cpp + + #include + +.. doxygenclass:: PacBio::BAM::PbiRawMappedData + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/PbiRawReferenceData.rst b/docs/source/api/PbiRawReferenceData.rst new file mode 100644 index 0000000..460cde4 --- /dev/null +++ b/docs/source/api/PbiRawReferenceData.rst @@ -0,0 +1,11 @@ +PbiRawReferenceData +=================== + +.. code-block:: cpp + + #include + +.. doxygenclass:: PacBio::BAM::PbiRawReferenceData + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/PbiReferenceEntry.rst b/docs/source/api/PbiReferenceEntry.rst new file mode 100644 index 0000000..472e586 --- /dev/null +++ b/docs/source/api/PbiReferenceEntry.rst @@ -0,0 +1,11 @@ +PbiReferenceEntry +================= + +.. 
code-block:: cpp + + #include + +.. doxygenclass:: PacBio::BAM::PbiReferenceEntry + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/Position.rst b/docs/source/api/Position.rst new file mode 100644 index 0000000..3c945f2 --- /dev/null +++ b/docs/source/api/Position.rst @@ -0,0 +1,10 @@ +Position +======== + +.. code-block:: cpp + + #include + +.. doxygentypedef:: PacBio::BAM::Position + +.. doxygenvariable:: PacBio::BAM::UnmappedPosition \ No newline at end of file diff --git a/docs/source/api/ProgramInfo.rst b/docs/source/api/ProgramInfo.rst new file mode 100644 index 0000000..b58c93a --- /dev/null +++ b/docs/source/api/ProgramInfo.rst @@ -0,0 +1,11 @@ +ProgramInfo +=========== + +.. code-block:: cpp + + #include + +.. doxygenclass:: PacBio::BAM::ProgramInfo + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/QNameQuery.rst b/docs/source/api/QNameQuery.rst new file mode 100644 index 0000000..b549436 --- /dev/null +++ b/docs/source/api/QNameQuery.rst @@ -0,0 +1,11 @@ +QNameQuery +========== + +.. code-block:: cpp + + #include + +.. doxygenclass:: PacBio::BAM::QNameQuery + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/QualityValue.rst b/docs/source/api/QualityValue.rst new file mode 100644 index 0000000..3520c5a --- /dev/null +++ b/docs/source/api/QualityValue.rst @@ -0,0 +1,11 @@ +QualityValue +============ + +.. code-block:: cpp + + #include + +.. doxygenclass:: PacBio::BAM::QualityValue + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/QualityValues.rst b/docs/source/api/QualityValues.rst new file mode 100644 index 0000000..8f6dfa5 --- /dev/null +++ b/docs/source/api/QualityValues.rst @@ -0,0 +1,11 @@ +QualityValues +============= + +.. code-block:: cpp + + #include + +.. 
doxygenclass:: PacBio::BAM::QualityValues + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/ReadAccuracyQuery.rst b/docs/source/api/ReadAccuracyQuery.rst new file mode 100644 index 0000000..abfd1e6 --- /dev/null +++ b/docs/source/api/ReadAccuracyQuery.rst @@ -0,0 +1,11 @@ +ReadAccuracyQuery +================= + +.. code-block:: cpp + + #include + +.. doxygenclass:: PacBio::BAM::ReadAccuracyQuery + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/ReadGroupInfo.rst b/docs/source/api/ReadGroupInfo.rst new file mode 100644 index 0000000..7fb4f69 --- /dev/null +++ b/docs/source/api/ReadGroupInfo.rst @@ -0,0 +1,21 @@ +ReadGroupInfo +============= + +.. code-block:: cpp + + #include + +.. doxygenenum:: PacBio::BAM::BaseFeature + +.. doxygenenum:: PacBio::BAM::FrameCodec + +.. doxygenenum:: PacBio::BAM::BarcodeModeType + +.. doxygenenum:: PacBio::BAM::BarcodeQualityType + +.. doxygenclass:: PacBio::BAM::ReadGroupInfo + :members: + :protected-members: + :undoc-members: + +.. doxygenfunction:: PacBio::BAM::MakeReadGroupId \ No newline at end of file diff --git a/docs/source/api/ReferenceLookupData.rst b/docs/source/api/ReferenceLookupData.rst new file mode 100644 index 0000000..20316fc --- /dev/null +++ b/docs/source/api/ReferenceLookupData.rst @@ -0,0 +1,11 @@ +ReferenceLookupData +=================== + +.. code-block:: cpp + + #include + +.. doxygenclass:: PacBio::BAM::ReferenceLookupData + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/ReferenceSet.rst b/docs/source/api/ReferenceSet.rst new file mode 100644 index 0000000..22e4703 --- /dev/null +++ b/docs/source/api/ReferenceSet.rst @@ -0,0 +1,11 @@ +ReferenceSet +============ + +.. code-block:: cpp + + #include + +.. 
doxygenclass:: PacBio::BAM::ReferenceSet + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/SamTagCodec.rst b/docs/source/api/SamTagCodec.rst new file mode 100644 index 0000000..4f8d65d --- /dev/null +++ b/docs/source/api/SamTagCodec.rst @@ -0,0 +1,11 @@ +SamTagCodec +=========== + +.. code-block:: cpp + + #include + +.. doxygenclass:: PacBio::BAM::SamTagCodec + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/SequenceInfo.rst b/docs/source/api/SequenceInfo.rst new file mode 100644 index 0000000..393d5bb --- /dev/null +++ b/docs/source/api/SequenceInfo.rst @@ -0,0 +1,11 @@ +SequenceInfo +============ + +.. code-block:: cpp + + #include + +.. doxygenclass:: PacBio::BAM::SequenceInfo + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/SequentialCompositeBamReader.rst b/docs/source/api/SequentialCompositeBamReader.rst new file mode 100644 index 0000000..31ed3b1 --- /dev/null +++ b/docs/source/api/SequentialCompositeBamReader.rst @@ -0,0 +1,11 @@ +SequentialCompositeBamReader +============================ + +.. code-block:: cpp + + #include + +.. doxygenclass:: PacBio::BAM::SequentialCompositeBamReader + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/Strand.rst b/docs/source/api/Strand.rst new file mode 100644 index 0000000..4978f72 --- /dev/null +++ b/docs/source/api/Strand.rst @@ -0,0 +1,8 @@ +Strand +====== + +.. code-block:: cpp + + #include + +.. doxygenenum:: PacBio::BAM::Strand diff --git a/docs/source/api/SubDataSets.rst b/docs/source/api/SubDataSets.rst new file mode 100644 index 0000000..d179065 --- /dev/null +++ b/docs/source/api/SubDataSets.rst @@ -0,0 +1,11 @@ +SubDataSets +=========== + +.. code-block:: cpp + + #include + +.. 
doxygenclass:: PacBio::BAM::SubDataSets + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/SubreadLengthQuery.rst b/docs/source/api/SubreadLengthQuery.rst new file mode 100644 index 0000000..23000b3 --- /dev/null +++ b/docs/source/api/SubreadLengthQuery.rst @@ -0,0 +1,11 @@ +SubreadLengthQuery +================== + +.. code-block:: cpp + + #include + +.. doxygenclass:: PacBio::BAM::SubreadLengthQuery + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/SubreadSet.rst b/docs/source/api/SubreadSet.rst new file mode 100644 index 0000000..bfc3c13 --- /dev/null +++ b/docs/source/api/SubreadSet.rst @@ -0,0 +1,11 @@ +SubreadSet +========== + +.. code-block:: cpp + + #include + +.. doxygenclass:: PacBio::BAM::SubreadSet + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/Tag.rst b/docs/source/api/Tag.rst new file mode 100644 index 0000000..50b85c7 --- /dev/null +++ b/docs/source/api/Tag.rst @@ -0,0 +1,15 @@ +Tag +=== + +.. code-block:: cpp + + #include + +.. doxygenenum:: PacBio::BAM::TagDataType + +.. doxygenenum:: PacBio::BAM::TagModifier + +.. doxygenclass:: PacBio::BAM::Tag + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/TagCollection.rst b/docs/source/api/TagCollection.rst new file mode 100644 index 0000000..1314b13 --- /dev/null +++ b/docs/source/api/TagCollection.rst @@ -0,0 +1,11 @@ +TagCollection +============= + +.. code-block:: cpp + + #include + +.. doxygenclass:: PacBio::BAM::TagCollection + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/UnorderedLookup.rst b/docs/source/api/UnorderedLookup.rst new file mode 100644 index 0000000..718e4e7 --- /dev/null +++ b/docs/source/api/UnorderedLookup.rst @@ -0,0 +1,11 @@ +UnorderedLookup +=============== + +.. 
code-block:: cpp + + #include + +.. doxygenclass:: PacBio::BAM::UnorderedLookup + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/VirtualPolymeraseBamRecord.rst b/docs/source/api/VirtualPolymeraseBamRecord.rst new file mode 100644 index 0000000..06d5531 --- /dev/null +++ b/docs/source/api/VirtualPolymeraseBamRecord.rst @@ -0,0 +1,11 @@ +VirtualPolymeraseBamRecord +========================== + +.. code-block:: cpp + + #include + +.. doxygenclass:: PacBio::BAM::VirtualPolymeraseBamRecord + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/VirtualPolymeraseCompositeReader.rst b/docs/source/api/VirtualPolymeraseCompositeReader.rst new file mode 100644 index 0000000..e6cab4e --- /dev/null +++ b/docs/source/api/VirtualPolymeraseCompositeReader.rst @@ -0,0 +1,11 @@ +VirtualPolymeraseCompositeReader +================================ + +.. code-block:: cpp + + #include + +.. doxygenclass:: PacBio::BAM::VirtualPolymeraseCompositeReader + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/VirtualPolymeraseReader.rst b/docs/source/api/VirtualPolymeraseReader.rst new file mode 100644 index 0000000..14a46e8 --- /dev/null +++ b/docs/source/api/VirtualPolymeraseReader.rst @@ -0,0 +1,11 @@ +VirtualPolymeraseReader +======================= + +.. code-block:: cpp + + #include + +.. doxygenclass:: PacBio::BAM::VirtualPolymeraseReader + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/VirtualRegion.rst b/docs/source/api/VirtualRegion.rst new file mode 100644 index 0000000..7a09846 --- /dev/null +++ b/docs/source/api/VirtualRegion.rst @@ -0,0 +1,11 @@ +VirtualRegion +============= + +.. code-block:: cpp + + #include + +.. 
doxygenclass:: PacBio::BAM::VirtualRegion + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/VirtualRegionType.rst b/docs/source/api/VirtualRegionType.rst new file mode 100644 index 0000000..4279200 --- /dev/null +++ b/docs/source/api/VirtualRegionType.rst @@ -0,0 +1,8 @@ +VirtualRegionType +================= + +.. code-block:: cpp + + #include + +.. doxygenenum:: PacBio::BAM::VirtualRegionType diff --git a/docs/source/api/VirtualRegionTypeMap.rst b/docs/source/api/VirtualRegionTypeMap.rst new file mode 100644 index 0000000..eebe637 --- /dev/null +++ b/docs/source/api/VirtualRegionTypeMap.rst @@ -0,0 +1,11 @@ +VirtualRegionTypeMap +==================== + +.. code-block:: cpp + + #include + +.. doxygenclass:: PacBio::BAM::VirtualRegionTypeMap + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/ZmwGroupQuery.rst b/docs/source/api/ZmwGroupQuery.rst new file mode 100644 index 0000000..01fc18a --- /dev/null +++ b/docs/source/api/ZmwGroupQuery.rst @@ -0,0 +1,11 @@ +ZmwGroupQuery +============= + +.. code-block:: cpp + + #include + +.. doxygenclass:: PacBio::BAM::ZmwGroupQuery + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/ZmwQuery.rst b/docs/source/api/ZmwQuery.rst new file mode 100644 index 0000000..375fcb0 --- /dev/null +++ b/docs/source/api/ZmwQuery.rst @@ -0,0 +1,11 @@ +ZmwQuery +======== + +.. code-block:: cpp + + #include + +.. doxygenclass:: PacBio::BAM::ZmwQuery + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api/ZmwWhitelistVirtualReader.rst b/docs/source/api/ZmwWhitelistVirtualReader.rst new file mode 100644 index 0000000..95d2d1a --- /dev/null +++ b/docs/source/api/ZmwWhitelistVirtualReader.rst @@ -0,0 +1,11 @@ +ZmwWhitelistVirtualReader +========================= + +.. code-block:: cpp + + #include + +.. 
doxygenclass:: PacBio::BAM::ZmwWhitelistVirtualReader + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/docs/source/api_reference.rst b/docs/source/api_reference.rst new file mode 100644 index 0000000..354c0de --- /dev/null +++ b/docs/source/api_reference.rst @@ -0,0 +1,12 @@ +.. _api_reference: + +C++ API Reference +================= + +Watch this space for more recipes & how-tos. + +.. toctree:: + :maxdepth: 1 + :glob: + + api/* diff --git a/docs/source/commandline_utilities.rst b/docs/source/commandline_utilities.rst new file mode 100644 index 0000000..7f1bdaf --- /dev/null +++ b/docs/source/commandline_utilities.rst @@ -0,0 +1,15 @@ +.. _command_line: + +Command Line Utilities +====================== + +In addition to the main library and wrappers, pbbam also provides a few basic +utilities for working with PacBio indices (".pbi" files). + +.. toctree:: + :maxdepth: 1 + + tools/bam2sam + tools/pbindex + tools/pbindexdump + tools/pbmerge diff --git a/docs/source/conf.py b/docs/source/conf.py new file mode 100755 index 0000000..c1de190 --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,332 @@ +# -*- coding: utf-8 -*- +# +# pbbam documentation build configuration file, created by +# sphinx-quickstart on Fri Dec 4 10:08:52 2015. +# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +import sys +import os +import shlex +import re +import subprocess + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. 
+#sys.path.insert(0, os.path.abspath('.')) + +# get RTD to run doxygen first, per http://breathe.readthedocs.org/en/latest/readthedocs.html +# but... we generate our actual Doxyfile via CMake in a normal build, +# so we need to create one here, subbing actual values +read_the_docs_build = os.environ.get('READTHEDOCS', None) == 'True' +if read_the_docs_build: + + # fetch directory info + this_dir = os.path.abspath(os.getcwd()) + docs_dir = os.path.abspath(os.path.join(this_dir, '..')) + root_dir = os.path.abspath(os.path.join(docs_dir, '..')) + include_dir = os.path.abspath(os.path.join(root_dir, 'include')) + + # get project version + version = '' + with open(os.path.abspath(os.path.join(root_dir, 'CMakeLists.txt')), 'r') as cmakeFile: + for line in cmakeFile: + if line.startswith('project'): + version = re.search(r'VERSION\s*([\d.]+)', line).group(1) + break + + # read Doxyfile.in, replace markers with real values, and write Doxyfile + inDoxyfile = open(os.path.abspath(os.path.join(docs_dir, 'Doxyfile.in')), 'r') + configIn = inDoxyfile.read() + configOut = re.sub('@PacBioBAM_NAME@', 'pbbam', \ + re.sub('@PacBioBAM_VERSION@', version, \ + re.sub('@PacBioBAM_DocsDir@', docs_dir, \ + re.sub('@PacBioBAM_IncludeDir@', include_dir, configIn)))) + outDoxyfile = open(os.path.abspath(os.path.join(docs_dir, 'Doxyfile')), 'w') + #print(configOut, outDoxyfile) + print >>outDoxyfile, configOut + outDoxyfile.close() + + # now run Doxygen + subprocess.call('cd ..; doxygen', shell=True) + +# -- General configuration ------------------------------------------------ + +# If your documentation needs a minimal Sphinx version, state it here. +#needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. 
+extensions = ['breathe'] +#extensions = [ +# 'sphinx.ext.autodoc', + # 'sphinx.ext.coverage', + # 'breathe', +#] + +# Setup Breathe extension variables +breathe_projects = { 'pbbam' : os.path.join(os.getcwd(), '..', 'xml') + os.path.sep } +breathe_default_project = 'pbbam' +breathe_default_members = ('members', 'undoc-members') +breathe_implementation_filename_extensions = [ '.cpp', '.inl' ] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# source_suffix = ['.rst', '.md'] +source_suffix = '.rst' + +# The encoding of source files. +#source_encoding = 'utf-8-sig' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = u'pbbam' +copyright = u'2015, Derek Barnett' +author = u'Derek Barnett' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = '0.7.4' +# The full version, including alpha/beta/rc tags. +release = '0.7.4' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +#today = '' +# Else, today_fmt is used as the format for a strftime call. +#today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +exclude_patterns = [] + +# The reST default role (used for this markup: `text`) to use for all +# documents. 
+#default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +#add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +#add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +#show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +#modindex_common_prefix = [] + +# If true, keep warnings as "system message" paragraphs in the built documents. +#keep_warnings = False + +# If true, `todo` and `todoList` produce output, else they produce nothing. +todo_include_todos = False + + +# -- Options for HTML output ---------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +html_theme = 'pacbio-theme' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +#html_theme_options = {} + +# Add any paths that contain custom themes here, relative to this directory. +html_theme_path = ['.'] + +# The name for this set of Sphinx documents. If None, it defaults to +# " v documentation". +#html_title = None + +# A shorter title for the navigation bar. Default is the same as html_title. +#html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +#html_logo = None + +# The name of an image file (within the static path) to use as favicon of the +# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +#html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. 
They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# Add any extra paths that contain custom files (such as robots.txt or +# .htaccess) here, relative to this directory. These files are copied +# directly to the root of the documentation. +#html_extra_path = [] + +# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, +# using the given strftime format. +#html_last_updated_fmt = '%b %d, %Y' + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +#html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +#html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +#html_additional_pages = {} + +# If false, no module index is generated. +#html_domain_indices = True + +# If false, no index is generated. +#html_use_index = True + +# If true, the index is split into individual pages for each letter. +#html_split_index = False + +# If true, links to the reST sources are added to the pages. +#html_show_sourcelink = True + +# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. +#html_show_sphinx = True + +# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. +#html_show_copyright = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +#html_use_opensearch = '' + +# This is the file name suffix for HTML files (e.g. ".xhtml"). +#html_file_suffix = None + +# Language to be used for generating the HTML full-text search index. 
+# Sphinx supports the following languages: +# 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja' +# 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr' +#html_search_language = 'en' + +# A dictionary with options for the search language support, empty by default. +# Now only 'ja' uses this config value +#html_search_options = {'type': 'default'} + +# The name of a javascript file (relative to the configuration directory) that +# implements a search results scorer. If empty, the default will be used. +#html_search_scorer = 'scorer.js' + +# Output file base name for HTML help builder. +htmlhelp_basename = 'pbbamdoc' + +# -- Options for LaTeX output --------------------------------------------- + +latex_elements = { +# The paper size ('letterpaper' or 'a4paper'). +#'papersize': 'letterpaper', + +# The font size ('10pt', '11pt' or '12pt'). +#'pointsize': '10pt', + +# Additional stuff for the LaTeX preamble. +#'preamble': '', + +# Latex figure (float) alignment +#'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, 'pbbam.tex', u'pbbam Documentation', + u'Derek Barnett', 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +#latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +#latex_use_parts = False + +# If true, show page references after internal links. +#latex_show_pagerefs = False + +# If true, show URL addresses after external links. +#latex_show_urls = False + +# Documents to append as an appendix to all manuals. +#latex_appendices = [] + +# If false, no module index is generated. +#latex_domain_indices = True + + +# -- Options for manual page output --------------------------------------- + +# One entry per manual page. 
List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + (master_doc, 'pbbam', u'pbbam Documentation', + [author], 1) +] + +# If true, show URL addresses after external links. +#man_show_urls = False + + +# -- Options for Texinfo output ------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + (master_doc, 'pbbam', u'pbbam Documentation', + author, 'pbbam', 'One line description of project.', + 'Miscellaneous'), +] + +# Documents to append as an appendix to all manuals. +#texinfo_appendices = [] + +# If false, no module index is generated. +#texinfo_domain_indices = True + +# How to display URL addresses: 'footnote', 'no', or 'inline'. +#texinfo_show_urls = 'footnote' + +# If true, do not generate a @detailmenu in the "Top" node's menu. +#texinfo_no_detailmenu = False diff --git a/docs/source/getting_started.rst b/docs/source/getting_started.rst new file mode 100644 index 0000000..6860f9f --- /dev/null +++ b/docs/source/getting_started.rst @@ -0,0 +1,144 @@ + +.. _getting_started: + +Getting Started +=============== + +.. _getting_started-requirements: + +Requirements +------------ + +These components will almost certainly already be on your system. + +* `gcc`_ (4.8+) OR `clang`_ (v3.1+) +* pthreads +* zlib + +Double-check your compiler version, to be sure it is compatible. + +.. code-block:: console + + $ g++ -v + $ clang -v + +Additional requirements: + +* `Boost`_ (1.55+) +* `CMake`_ (3.0+) +* `Google Test`_ +* `htslib`_ (PacBio fork) + +For additional languages: + +* `SWIG`_ (3.0.5+) + +For building API documentation locally: + +* `Doxygen`_ + +For maximal convenience, install htslib and google test in the same parent directory you plan to install pbbam. + +.. _Boost: http://www.boost.org/ +.. _clang: http://clang.llvm.org/ +.. 
_CMake: https://cmake.org/ +.. _Doxygen: http://www.stack.nl/~dimitri/doxygen/ +.. _gcc: https://gcc.gnu.org/ +.. _Google Test: https://github.com/google/googletest +.. _htslib: https://github.com/PacificBiosciences/htslib.git +.. _SWIG: http://www.swig.org/ + +.. _getting_started-build: + +Clone & Build +------------- + +.. note:: + + The following steps are for building the C++ library and command-line utilities. + If you are integrating pbbam into a C#, Python, or R project, take a look at the + instructions for :ref:`additional languages <swig_bindings>`. + +The basic steps for obtaining pbbam and building it from source are as follows: + +.. code-block:: console + + $ git clone https://github.com/PacificBiosciences/pbbam.git + $ cd pbbam + $ mkdir build + $ cd build + $ cmake .. + $ make -j 4 # compiles using 4 threads + +Output: + + * Library : /lib + * Headers : /include + * Utilities : /bin + +You may need to set a few options on the cmake command, to point to dependencies' install locations. +Common installation-related options include: + + * HTSLIB_ROOTDIR + * GTEST_SRC_DIR + +Add these using the '-D' argument, like this: + +.. code-block:: console + + $ cmake .. -DHTSLIB_ROOTDIR="path/to/htslib" + +To run the test suite, run: + +.. code-block:: console + + $ make test + +To build a local copy of the (Doxygen-style) API documentation, run: + +.. code-block:: console + + $ make doc + +And then open /docs/html/index.html in your favorite browser. + +.. _getting_started-integrate: + +Integrate +--------- + +CMake-based projects +```````````````````` + +For CMake-based projects that will "ship with" or otherwise live alongside pbbam, you can +use the approach described here. + +Before defining your library or executable, add the following: + +.. code-block:: cmake + + add_subdirectory( external/build/pbbam) + +When it's time to run "make" this will ensure that pbbam will be built, inside your own project's +build directory. 
After this point in the CMakeLists.txt file(s), a few variables will be available +that can be used to setup your include paths and library linking targets: + +.. code-block:: cmake + + include_directories( + ${PacBioBAM_INCLUDE_DIRS} + # other includes that your project needs + ) + + add_executable(foo) + + target_link_libraries(foo + ${PacBioBAM_LIBRARIES} + # other libs that your project needs + ) + +Non-CMake projects +`````````````````` + +If you're using something other than CMake for your project's build system, then you need to point +it to pbbam's include directory & library, as well as those of its dependencies (primarily htslib). diff --git a/docs/source/index.rst b/docs/source/index.rst new file mode 100644 index 0000000..426c3c5 --- /dev/null +++ b/docs/source/index.rst @@ -0,0 +1,33 @@ +.. pbbam documentation master file, created by + sphinx-quickstart on Fri Dec 4 10:08:52 2015. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +.. _home: + +pbbam documentation +=================== + +As of the 3.0 release of SMRTanalysis, PacBio is embracing the industry standard BAM +format for (both aligned and unaligned) basecall data files. We have also formulated +a BAM companion file format (bam.pbi) enabling fast access to a richer set of per-read +information as well as compatibility for software built around the legacy cmp.h5 format. + +The **pbbam** software package provides components to create, query, & edit PacBio BAM +files and associated indices. These components include a core C++ library, bindings for +additional languages, and command-line utilities. + +.. 
toctree:: + :maxdepth: 1 + + getting_started + api_reference + swig_bindings + commandline_utilities + + +Search: + +* :ref:`genindex` +* :ref:`search` + diff --git a/docs/source/pacbio-theme/static/headerGradient.jpg b/docs/source/pacbio-theme/static/headerGradient.jpg new file mode 100644 index 0000000..883f147 Binary files /dev/null and b/docs/source/pacbio-theme/static/headerGradient.jpg differ diff --git a/docs/source/pacbio-theme/static/pacbio.css b/docs/source/pacbio-theme/static/pacbio.css new file mode 100644 index 0000000..b4ab87f --- /dev/null +++ b/docs/source/pacbio-theme/static/pacbio.css @@ -0,0 +1,238 @@ +/** + * Sphinx stylesheet -- default theme + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + */ + +@import url("basic.css"); + +/* -- page layout ----------------------------------------------------------- */ + +body { + font-family: Arial, sans-serif; + font-size: 100%; + background-color: #555; + color: #555; + margin: 0; + padding: 0; + min-width: 500px; + max-width: 956px; + margin: 0 auto; +} + +div.documentwrapper { + float: left; + width: 100%; +} + +div.bodywrapper { + margin: 0 0 0 230px; +} + +hr{ + border: 1px solid #B1B4B6; + +} + +div.document { + background-color: #eee; +} + +div.body { + background-color: #ffffff; + color: #3E4349; + padding: 30px 30px 30px 30px; + font-size: 0.8em; +} + +div.footer { + color: #555; + background-color: #fff; + padding: 13px 0; + text-align: center; + font-size: 75%; + +} +div.footer a { + color: #444; + text-decoration: underline; +} + +div.related { + background: #fff url(headerGradient.jpg); + line-height: 80px; + color: #fff; + font-size: 0.80em; + height: 79px; + z-index: -1; +} + +div.related ul { + background: url(pacbioLogo.png) 10px no-repeat; + padding: 0 0 0 200px; +} + +div.related a { + color: #E2F3CC; +} + +div.sphinxsidebar { + font-size: 0.75em; + line-height: 1.5em; +} + +div.sphinxsidebarwrapper{ + padding: 20px 0; +} + +div.sphinxsidebar h3, +div.sphinxsidebar h4 { + font-family: Arial, 
sans-serif; + color: #222; + font-size: 1.2em; + font-weight: bold; + margin: 0; + padding: 5px 10px 0 10px; +} + +div.sphinxsidebar h4{ + font-size: 1.1em; +} + +div.sphinxsidebar h3 a { + color: #444; +} + + +div.sphinxsidebar p { + color: #888; + padding: 0px 20px; + margin-top: 5px; +} + +div.sphinxsidebar p.topless { +} + +div.sphinxsidebar ul { + margin: 5px 20px 10px 20px; + padding: 0; + color: #000; +} + +div.sphinxsidebar a { + color: #444; +} + +div.sphinxsidebar input { + border: 1px solid #ccc; + font-family: sans-serif; + font-size: 1em; +} + +div.sphinxsidebar input[type=text]{ + margin-left: 20px; +} + +/* -- body styles ----------------------------------------------------------- */ + +a { + color: #005B81; + text-decoration: none; +} + +a:hover { + color: #E32E00; + text-decoration: underline; +} + +div.body h1, +div.body h2, +div.body h3, +div.body h4, +div.body h5, +div.body h6 { + font-family: Arial, sans-serif; + font-weight: bold; + color: #264868; + margin: 30px 0px 10px 0px; + padding: 5px 0 5px 0px; +} + +div.body h1 { border-top: 20px solid white; margin-top: 0; font-size: 180%; font-weight: normal; } +div.body h2 { font-size: 125%; } +div.body h3 { font-size: 110%; } +div.body h4 { font-size: 100%; } +div.body h5 { font-size: 100%; } +div.body h6 { font-size: 100%; } + +a.headerlink { + color: #c60f0f; + font-size: 0.8em; + padding: 0 4px 0 4px; + text-decoration: none; +} + +a.headerlink:hover { + background-color: #c60f0f; + color: white; +} + +div.body p, div.body dd, div.body li { + line-height: 1.5em; + font-size: 1em; +} + +div.admonition p.admonition-title + p { + display: inline; +} + +div.highlight{ + background-color: white; +} + +div.note { + background-color: #eee; + border: 1px solid #ccc; +} + +div.seealso { + background-color: #ffc; + border: 1px solid #ff6; +} + +div.topic { + background-color: #eee; +} + +div.warning { + background-color: #ffe4e4; + border: 1px solid #f66; +} + +p.admonition-title { + display: inline; +} 
+ +p.admonition-title:after { + content: ":"; +} + +pre { + padding: 10px; + background-color: White; + color: #222; + line-height: 1.2em; + border: 1px solid #C6C9CB; + font-size: 1.2em; + margin: 1.5em 0 1.5em 0; + -webkit-box-shadow: 1px 1px 1px #d8d8d8; + -moz-box-shadow: 1px 1px 1px #d8d8d8; +} + +tt { + background-color: #ecf0f3; + color: #222; + padding: 1px 2px; + font-size: 1.2em; + font-family: monospace; +} + diff --git a/docs/source/pacbio-theme/static/pacbioLogo.png b/docs/source/pacbio-theme/static/pacbioLogo.png new file mode 100644 index 0000000..b2e4887 Binary files /dev/null and b/docs/source/pacbio-theme/static/pacbioLogo.png differ diff --git a/docs/source/pacbio-theme/static/pygments.css b/docs/source/pacbio-theme/static/pygments.css new file mode 100644 index 0000000..4588cde --- /dev/null +++ b/docs/source/pacbio-theme/static/pygments.css @@ -0,0 +1,55 @@ +.c { color: #999988; font-style: italic } /* Comment */ +.k { font-weight: bold } /* Keyword */ +.o { font-weight: bold } /* Operator */ +.cm { color: #999988; font-style: italic } /* Comment.Multiline */ +.cp { color: #999999; font-weight: bold } /* Comment.preproc */ +.c1 { color: #999988; font-style: italic } /* Comment.Single */ +.gd { color: #000000; background-color: #ffdddd } /* Generic.Deleted */ +.ge { font-style: italic } /* Generic.Emph */ +.gr { color: #aa0000 } /* Generic.Error */ +.gh { color: #999999 } /* Generic.Heading */ +.gi { color: #000000; background-color: #ddffdd } /* Generic.Inserted */ +.go { color: #111 } /* Generic.Output */ +.gp { color: #555555 } /* Generic.Prompt */ +.gs { font-weight: bold } /* Generic.Strong */ +.gu { color: #aaaaaa } /* Generic.Subheading */ +.gt { color: #aa0000 } /* Generic.Traceback */ +.kc { font-weight: bold } /* Keyword.Constant */ +.kd { font-weight: bold } /* Keyword.Declaration */ +.kp { font-weight: bold } /* Keyword.Pseudo */ +.kr { font-weight: bold } /* Keyword.Reserved */ +.kt { color: #445588; font-weight: bold } /* 
Keyword.Type */ +.m { color: #009999 } /* Literal.Number */ +.s { color: #bb8844 } /* Literal.String */ +.na { color: #008080 } /* Name.Attribute */ +.nb { color: #999999 } /* Name.Builtin */ +.nc { color: #445588; font-weight: bold } /* Name.Class */ +.no { color: #ff99ff } /* Name.Constant */ +.ni { color: #800080 } /* Name.Entity */ +.ne { color: #990000; font-weight: bold } /* Name.Exception */ +.nf { color: #990000; font-weight: bold } /* Name.Function */ +.nn { color: #555555 } /* Name.Namespace */ +.nt { color: #000080 } /* Name.Tag */ +.nv { color: purple } /* Name.Variable */ +.ow { font-weight: bold } /* Operator.Word */ +.mf { color: #009999 } /* Literal.Number.Float */ +.mh { color: #009999 } /* Literal.Number.Hex */ +.mi { color: #009999 } /* Literal.Number.Integer */ +.mo { color: #009999 } /* Literal.Number.Oct */ +.sb { color: #bb8844 } /* Literal.String.Backtick */ +.sc { color: #bb8844 } /* Literal.String.Char */ +.sd { color: #bb8844 } /* Literal.String.Doc */ +.s2 { color: #bb8844 } /* Literal.String.Double */ +.se { color: #bb8844 } /* Literal.String.Escape */ +.sh { color: #bb8844 } /* Literal.String.Heredoc */ +.si { color: #bb8844 } /* Literal.String.Interpol */ +.sx { color: #bb8844 } /* Literal.String.Other */ +.sr { color: #808000 } /* Literal.String.Regex */ +.s1 { color: #bb8844 } /* Literal.String.Single */ +.ss { color: #bb8844 } /* Literal.String.Symbol */ +.bp { color: #999999 } /* Name.Builtin.Pseudo */ +.vc { color: #ff99ff } /* Name.Variable.Class */ +.vg { color: #ff99ff } /* Name.Variable.Global */ +.vi { color: #ff99ff } /* Name.Variable.Instance */ +.il { color: #009999 } /* Literal.Number.Integer.Long */ + diff --git a/docs/source/pacbio-theme/theme.conf b/docs/source/pacbio-theme/theme.conf new file mode 100644 index 0000000..dd24a1a --- /dev/null +++ b/docs/source/pacbio-theme/theme.conf @@ -0,0 +1,4 @@ +[theme] +inherit = default +stylesheet = pacbio.css +pygments_style = tango diff --git a/docs/source/requirements.txt 
b/docs/source/requirements.txt new file mode 100644 index 0000000..cd6467e --- /dev/null +++ b/docs/source/requirements.txt @@ -0,0 +1 @@ +breathe diff --git a/docs/source/swig_bindings.rst b/docs/source/swig_bindings.rst new file mode 100644 index 0000000..e9dc33a --- /dev/null +++ b/docs/source/swig_bindings.rst @@ -0,0 +1,257 @@ +.. _swig_bindings: + +Additional Languages +==================== + +pbbam uses SWIG to generate bindings for other languages. Currently this includes support for C#, Python, and R. + +These bindings are disabled by default. See the entry below for your target language to configure pbbam & integrate +the bindings into your project. + +.. _swig_bindings-csharp: + +C# +------ + +Building +```````` + +To build the support for C#, you need to tell CMake to enable it before building: + +.. code-block:: console + + $ cmake .. -DPacBioBAM_wrap_csharp + $ make + +The 'make' step will build relevant libraries/wrappers, and then run a simple program using them, +as a quick sanity-check. + +After building, the libraries and wrappers can be found under the pbbam/lib/csharp directory. + +API Example +``````````` + +.. code-block:: c# + + using PacBio.BAM; + + namespace TestStuff + { + public class TestPbbam + { + public static void TestZmwQuery() + { + var d = new DataSet("foo.bam"); + var q = new ZmwQuery(new IntList {1, 2, 3}, d); + var q2 = new ZmwQuery(new IntList { 14743 }, d); + if (0 != q.Count() || 4 != q2.Count()) + { + throw new Exception("ZmwQuery not working"); + } + Console.WriteLine("TestZmwQuery - OK!"); + } + } + } + +.. _swig_bindings-python: + +Python +------ + +Building +```````` + +To build the support for Python, you need to tell CMake to enable it: + +.. code-block:: console + + $ cmake .. -DPacBioBAM_wrap_python + $ make + +The 'make' step will build relevant libraries/wrappers, and then run a simple program using them, +as a quick sanity-check. 
+ +After building, the libraries and wrappers can be found in the pbbam/lib/python directory. +'make test' will also include some Python-side unit tests as well. + +To use the PacBioBam module, you can set your PYTHONPATH before invoking your script: + +.. code-block:: console + + $ PYTHONPATH="path/to/pbbam/lib/python" python myScript.py + +Or otherwise configure your environment to find the PacBioBam module. + +API Example +``````````` + +.. code-block:: python + + import PacBioBam + + try: + file = PacBioBam.BamFile('foo.bam') + writer = PacBioBam.BamWriter('new.bam', file.Header()) + dataset = PacBioBam.DataSet(file) + entireFile = PacBioBam.EntireFileQuery(dataset) + for record in PacBioBam.Iterate(entireFile): + writer.Write(record) + except RuntimeError: + # found error + +Python-Specific Notes +````````````````````` + +Iteration +......... + +Iteration over dataset queries in Python will likely need to use the PacBioBam.Iterate() method. Thus +file iteration loops will look something like the following: + +.. code-block:: python + + entireFile = PacBioBam.EntireFileQuery("input.bam") + for record in PacBioBam.Iterate(entireFile): + foo.bar(record) + +Exception Handling +.................. + +Exceptions are used widely by the C++ library. To handle them from Python, you can use try blocks, looking for +any RuntimeError: + +.. code-block:: python + + try: + file = PacBioBam.BamFile("does_not_exist.bam") + except RuntimeError: + print("caught expected error") + +.. _swig_bindings-r: + +R +------ + +Building +```````` + +To build the support for R, you need to tell CMake to enable it: + +.. code-block:: console + + $ cmake .. -DPacBioBAM_wrap_r + $ make + +The 'make' step will build relevant libraries/wrappers, and then run a simple program using them, +as a quick sanity-check. + +After building, the libraries and wrappers can be found in the pbbam/lib/R directory. +'make test' will also include some R-side unit tests as well. 
+ +To use the PacBioBam module in your script, nothing should be needed up front - simply invoke 'R' as normal. +You'll do the dynamic load of the R module near the beginning of your script: + +.. code-block:: r + + # load pbbam R library + lib_path <- "path/to/pbbam/lib/R" + pbbam_libname <- paste(lib_path, "PacBioBam", sep="/") + pbbam_wrapper <- paste(lib_path, "PacBioBam.R", sep="/") + dyn.load(paste(pbbam_libname, .Platform$dynlib.ext, sep="")) + source(pbbam_wrapper) + cacheMetaData(1) + + +API Example +``````````` + +.. code-block:: r + + # load pbbam R library + lib_path <- "path/to/pbbam/lib/R" + pbbam_libname <- paste(lib_path, "PacBioBam", sep="/") + pbbam_wrapper <- paste(lib_path, "PacBioBam.R", sep="/") + dyn.load(paste(pbbam_libname, .Platform$dynlib.ext, sep="")) + source(pbbam_wrapper) + cacheMetaData(1) + + # sample method + copyFileAndFetchRecordNames <-function(inputFn, outputFn) { + + result <- tryCatch( + { + file <- BamFile(inputFn) + writer <- BamWriter(outputFn, file$Header()) + ds <- DataSet(file) + + entireFile <- EntireFileQuery(ds) + iter <- entireFile$begin() + end <- entireFile$end() + + while ( iter$'__ne__'(end) ) { + record <- iter$value() + + names_in <- c(names_in, record$FullName()) + writer$Write(record) + iter$incr() + } + writer$TryFlush() + return(names_in) + }, + error = function(e) { + # handle error + return(list()) + }) + return(result) + } + +R-Specific Notes +```````````````` + +Iteration +......... + +To compare iterators, you'll need to explicitly use the '__eq__' or '__ne__' methods. Thus iterating over +a data query, will look something like this: + +.. code-block:: r + + iter <- query$begin() + end <- query$end() + while ( iter$'__ne__'(end) ) { + record <- iter$value() + + # do stuff with record + } + +operator[] +.......... + +In C++, operator[] can be used in some classes to directly access elements in a sequence, e.g. Cigar string + +.. 
code-block:: cpp + + CigarOperation op = cigar[0]; + +For the R wrapper, if you want to do the same sort of thing, you'll need to use the '__getitem__' method. +Please note that these are **0-based** indices, not 1-based as in much of R. + +.. code-block:: r + + op <- cigar$'__getitem__'(0) + +Exception Handling +.................. + +Exceptions are used widely by the C++ library. To handle them from R, you can use the 'tryCatch' block, listening for +'error' type exceptions. + + .. code-block:: r + + result <- tryCatch( + { + f <- BamFile("does_not_exist.bam") # this statement will throw + }, + error = function(e) { + print(paste("caught expected error: ",e)) + }) diff --git a/docs/source/tools/bam2sam.rst b/docs/source/tools/bam2sam.rst new file mode 100644 index 0000000..4577686 --- /dev/null +++ b/docs/source/tools/bam2sam.rst @@ -0,0 +1,21 @@ +.. _bam2sam: + +bam2sam +======= + +:: + + Usage: bam2sam [options] [input] + + bam2sam converts a BAM file to SAM. It is essentially a stripped-down 'samtools + view', mostly useful for testing/debugging without requiring samtools. Input BAM + file is read from a file or stdin, and SAM output is written to stdout. + + Options: + -h, --help show this help message and exit + --version show program's version number and exit + + Options: + input Input BAM file. If not provided, stdin will be used as input. + --no-header Omit header from output. + --header-only Print only the header (no records). diff --git a/docs/source/tools/pbindex.rst b/docs/source/tools/pbindex.rst new file mode 100644 index 0000000..e7c491f --- /dev/null +++ b/docs/source/tools/pbindex.rst @@ -0,0 +1,18 @@ +.. _pbindex: + +pbindex +======= + +:: + + Usage: pbindex + + pbindex creates an index file that enables random-access to PacBio-specific data + in BAM files. Generated index filename will be the same as input BAM plus .pbi suffix. 
+ + Options: + -h, --help show this help message and exit + --version show program's version number and exit + + Input/Output: + input Input BAM file diff --git a/docs/source/tools/pbindexdump.rst b/docs/source/tools/pbindexdump.rst new file mode 100644 index 0000000..6829064 --- /dev/null +++ b/docs/source/tools/pbindexdump.rst @@ -0,0 +1,233 @@ +.. _pbindexdump: + +pbindexdump +=========== + +:: + + Usage: pbindexdump [options] [input] + + pbindexdump prints a human-readable view of PBI data to stdout. + + Options: + -h, --help show this help message and exit + --version show program's version number and exit + + Input/Output: + input Input PBI file. If not provided, stdin will be used as input. + --format=STRING Output format, one of: + json, cpp + + json: pretty-printed JSON [default] + + cpp: copy/paste-able C++ code that can be used to + construct the equivalent PacBio::BAM::PbiRawData object + + JSON Formatting: + --json-indent-level=INT + JSON indent level [4] + --json-raw Prints fields in a manner that more closely reflects the + PBI file format - presenting data as per-field columns, + not per-record objects. + +JSON Output Schemas +------------------- + +Normal JSON: + +.. 
code-block:: JSON + + { + "type": "object", + "properties": { + "fileSections": { + "type": "array", + "items": { "type": "string" } + }, + "numReads": { "type": "integer" }, + "reads": { + "type": "array", + "items": { + "type": "object", + "properties": { + "aEnd": { "type": "integer" }, + "aStart": { "type": "integer" }, + "bcForward": { "type": "integer" }, + "bcQuality": { "type": "integer" }, + "bcReverse": { "type": "integer" }, + "contextFlag": { "type": "integer" }, + "fileOffset": { "type": "integer" }, + "holeNumber": { "type": "integer" }, + "mapQuality": { "type": "integer" }, + "nM": { "type": "integer" }, + "nMM": { "type": "integer" }, + "qEnd": { "type": "integer" }, + "qStart": { "type": "integer" }, + "readQuality": { "type": "number" }, + "reverseStrand": { "type": "integer" }, + "rgId": { "type": "integer" }, + "tEnd": { "type": "integer" }, + "tId": { "type": "integer" }, + "tStart": { "type": "integer" } + }, + "required": [ + "contextFlag", + "fileOffset", + "holeNumber", + "qEnd", + "qStart", + "readQuality", + "rgId" + ] + } + }, + "references": { + "type": "array", + "items": { + "type": "object", + "properties": { + "beginRow": { "type": "integer" }, + "endRow": { "type": "integer" }, + "tId": { "type": "integer" } + }, + "required" : [ "beginRow", "endRow","tId" ] + } + }, + "version": { "type": "string" } + }, + "required": [ + "fileSections", + "numReads", + "reads", + "version" + ] + } + +"Raw" JSON: + +.. 
code-block:: JSON + + { + "type": "object", + "properties": { + "barcodeData" : { + "type" : "object", + "properties": { + "bcForward" : { + "type": "array", + "items" : { "type": "integer" } + }, + "bcQuality" : { + "type": "array", + "items" : { "type": "integer" } + }, + "bcReverse" : { + "type": "array", + "items" : { "type": "integer" } + } + } + }, + "basicData" : { + "type" : "object", + "properties": { + "contextFlag" : { + "type": "array", + "items" : { "type": "integer" } + }, + "fileOffset" : { + "type": "array", + "items" : { "type": "integer" } + }, + "holeNumber" : { + "type": "array", + "items" : { "type": "integer" } + }, + "qEnd" : { + "type": "array", + "items" : { "type": "integer" } + }, + "qStart" : { + "type": "array", + "items" : { "type": "integer" } + }, + "readQuality" : { + "type": "array", + "items" : { "type": "number" } + }, + "rgId" : { + "type": "array", + "items" : { "type": "integer" } + } + } + }, + "fileSections": { + "type": "array", + "items": { "type": "string" } + }, + "mappedData" : { + "type" : "object", + "properties": { + "aEnd" : { + "type": "array", + "items" : { "type": "integer" } + }, + "aStart" : { + "type": "array", + "items" : { "type": "integer" } + }, + "mapQuality" : { + "type": "array", + "items" : { "type": "integer" } + }, + "nM" : { + "type": "array", + "items" : { "type": "integer" } + }, + "nMM" : { + "type": "array", + "items" : { "type": "integer" } + }, + "readQuality" : { + "type": "array", + "items" : { "type": "number" } + }, + "reverseStrand" : { + "type": "array", + "items" : { "type": "integer" } + }, + "tEnd" : { + "type": "array", + "items" : { "type": "integer" } + }, + "tId" : { + "type": "array", + "items" : { "type": "integer" } + }, + "tStart" : { + "type": "array", + "items" : { "type": "integer" } + } + } + }, + "numReads": { "type": "integer" }, + "references": { + "type": "array", + "items": { + "type": "object", + "properties": { + "beginRow": { "type": "integer" }, + "endRow": { "type": 
"integer" }, + "tId": { "type": "integer" } + }, + "required" : [ "beginRow", "endRow","tId" ] + } + }, + "version" : { "type": "string" } + }, + "required": [ + "fileSections", + "numReads", + "basicData", + "version" + ] + } diff --git a/docs/source/tools/pbmerge.rst b/docs/source/tools/pbmerge.rst new file mode 100644 index 0000000..937ec56 --- /dev/null +++ b/docs/source/tools/pbmerge.rst @@ -0,0 +1,30 @@ +.. _pbmerge: + +pbmerge +======= + +:: + + Usage: pbmerge [options] [-o ] + + pbmerge merges PacBio BAM files. If the input is DataSetXML, any filters will be + applied. If no output filename is specified, new BAM will be written to stdout. + + Options: + -h, --help show this help message and exit + --version show program's version number and exit + + Input/Output: + -o output Output BAM filename. + --no-pbi Set this option to skip PBI index file creation. PBI + creation is automatically skipped if no output filename + is provided. + INPUT Input may be one of: + DataSetXML, list of BAM files, or FOFN + + fofn: pbmerge -o merged.bam bams.fofn + + bams: pbmerge -o merged.bam 1.bam 2.bam 3.bam + + xml: pbmerge -o merged.bam foo.subreadset.xml + diff --git a/docs/specs/pbbam.rst b/docs/specs/pbbam.rst new file mode 100644 index 0000000..6842371 --- /dev/null +++ b/docs/specs/pbbam.rst @@ -0,0 +1,631 @@ +================================================================= +**pbbam Software Design & Functional Specification** +================================================================= +| *Version 0.1* +| *Pacific Biosciences Engineering Group* +| *Jan 29, 2016* + +1. 
Revision History +=================== + ++-------------+---------------+--------------------+---------------------------+ +| **Date** | **Revision** | **Author(s)** | **Comments** | ++=============+===============+====================+===========================+ +| 01-29-2016 | 0.1 | Derek Barnett | Initial draft created | +| | | | | ++-------------+---------------+--------------------+---------------------------+ + +2. Introduction +=============== + +2.1. Document Specification Identifier +-------------------------------------- + ++-----------------------------------+------------------------------------------+ +| **Document Specification Prefix** | **Description** | ++===================================+==========================================+ +| FS\_SA\_PBBAM\_ | Functional spec for pbbam | ++-----------------------------------+------------------------------------------+ + +2.2. Purpose +------------ + +This document is intended to describe the requirements and interface of the pbbam +library, which provides functionality for creating, querying, and editing PacBio +BAM files and associated file formats. + +2.3. Scope of Document +---------------------- + +This document covers the expected usage of the pbbam library, as well as any +desired or required performance characteristics with respect to quality or speed. + +This document does not provide installation instructions or API documentation. + +2.4. Glossary of Terms +---------------------- + +The table below specifies only terms specific to this document, and skips +acronyms/terms that are specified in `Pacific Biosciences Software Glossary`_. + +.. 
_Pacific Biosciences Software Glossary: http://smrtanalysis-docs/pb_sw_glossary.html + ++------------------+-----------------------------------------------------------+ +| **Acronym/Term** | **Description** | ++==================+===========================================================+ +| API | Application Programming Interface - a set of routines, | +| | protocols, and tools for building software applications. | +| | In this document , this will consist of one or more | +| | cooperating libraries that specify data structures, | +| | methods, etc. for use within a target programming | +| | language. | ++------------------+-----------------------------------------------------------+ +| Client | An application that uses the library. | ++------------------+-----------------------------------------------------------+ +| I/O | Input/output of data. | ++------------------+-----------------------------------------------------------+ + +2.5. References +--------------- + ++-------------+------------------------------+--------------------------------------+ +| **Ref No.** | **Document Name, Link** | **Description** | ++=============+==============================+======================================+ +| (1) | `BAM format`_ | General SAM/BAM specification | ++-------------+------------------------------+--------------------------------------+ +| (2) | `PacBio BAM`_ | PacBio BAM specification | ++-------------+------------------------------+--------------------------------------+ +| (3) | `PacBio BAM index`_ | PacBio BAM index specification | ++-------------+------------------------------+--------------------------------------+ +| (4) | `DataSet XML`_ | PacBio DataSet XML specification | ++-------------+------------------------------+--------------------------------------+ +| (5) | `Software Style Guide`_ | PacBio coding standards | ++-------------+------------------------------+--------------------------------------+ +| (6) | `SMRT Analysis`_ | General SMRT Analysis 
infrastructure | ++-------------+------------------------------+--------------------------------------+ + +.. _BAM format: https://samtools.github.io/hts-specs/SAMv1.pdf +.. _PacBio BAM: http://pacbiofileformats.readthedocs.org/en/3.0/BAM.html +.. _PacBio BAM index: http://pacbiofileformats.readthedocs.org/en/3.0/PacBioBamIndex.html +.. _DataSet XML: https://github.com/PacificBiosciences/PacBioFileFormats/blob/3.0/DataSet.rst +.. _Software Style Guide: http://smrtanalysis-docs/_downloads/PBISoftwareStyleGuide.doc +.. _SMRT Analysis: http://smrtanalysis-docs/smrt_docs.html + +3. Software Overview +==================== + +3.1. Product Description +------------------------ + +As of the 3.0 release of SMRTanalysis, PacBio is embracing the industry standard +`BAM format`_ (1) for (both aligned and unaligned) basecall data files. We have +also formulated a BAM companion file format (.bam.pbi) enabling fast access to a +richer set of per-read information as well as compatibility for software built +around the legacy cmp.h5 format. + +The pbbam library provides components to create, query, & transform PacBio BAM +data: sequence files and their associated indices. This includes a core C++ +library as well as bindings for additional programming languages. + +3.2. Product Functional Capabilities +------------------------------------ + +The library must be able to read and write BAM files that conform to the +`PacBio BAM`_ specification (2). BAM records must be editable e.g. adding +alignment information. Random access must be supported, whether by genomic +region or by filtering record features. To this end, the library will be able to +read, write, and create associated index files - both the standard BAM index +(.bai) and the `PacBio BAM index`_ (.pbi) (3). In addition to working with +individual files, datasets of related BAM files will be supported. These are +described in a `DataSet XML`_ document. (4) + +3.3. 
User Characteristics +------------------------- + ++---------------------+--------------------------------------------------------+ +| **User Class/Role** | **User Knowledge and Skill Levels** | ++=====================+========================================================+ +| Developer | Competence in one or more programming languages | +| | supported (C++, R, Python, C#). No knowledge of | +| | molecular biology wet lab techniques required. | ++---------------------+--------------------------------------------------------+ + +3.4. User Operations and Practices +---------------------------------- + +Developer users will interact with the software by incorporating the library +into a client application. + +3.5. Operating Environment +-------------------------- + +The software is intended to be run in a Linux or OSX environment, with ideally 4 +or more cores. + +3.6. Design and Implementation Constraints +------------------------------------------ + +Currently there are no constraints outside the operating environment and speed +requirements. In particular, as the library will be used for writing the BAM +files coming off a Sequel instrument, it should be able to keep pace. + +3.7. Assumptions and Dependencies +--------------------------------- + +Input routines for the library will expect to receive files that conform to the +`PacBio BAM`_ (2) or `DataSet XML`_ (4) specifications. + +The pbbam library depends on Boost, zlib, and htslib libraries. + +3.8. Other Software +------------------- + +Output PacBio BAMs will be compatible with the `PacBio BAM`_ specification (2) +and thus compatible with the general `BAM format`_ specification (1). This +ensures that a wide variety of downstream tools can interact with data files. + +The software uses `CMake`_ as its build system. 
+ +The core C++ API relies on the following 3rd party components: + +* `zlib`_ +* `htslib`_ +* `Boost`_ (header-only modules) + +Wrapper APIs for additional languages (Python, R, C#) are generated by `SWIG`_. + +API documentation is generated via `Doxygen`_. + +.. _CMake: https://cmake.org/ +.. _zlib: http://www.zlib.net/ +.. _htslib: https://github.com/samtools/htslib +.. _Boost: http://www.boost.org/ +.. _SWIG: http://www.swig.org/ +.. _Doxygen: http://www.stack.nl/~dimitri/doxygen/ + +4. External Interfaces +====================== + +4.1. User Interfaces +-------------------- + +N/A + +4.2. Software Interfaces +------------------------ + +pbbam will require the following software: + +* `htslib`_ & `zlib`_ - provides low-level handling of compressed BAM data +* `Boost`_ - provides utility classes + +Incoming data from upstream components will be compliant with +PacBio BAM format - see `PacBio BAM`_ specification (2) for more detail. + +4.3. Hardware Interfaces +------------------------ + +N/A + +4.4. Communications Interfaces +------------------------------ + +N/A + +5. Functional Requirements +========================== + +5.1. Query BAM data by genomic region +----------------------------------------- + +5.1.1. Description +~~~~~~~~~~~~~~~~~~ + +pbbam shall allow client applications to query data, limited to some genomic +region of interest. + +5.1.2. Inputs +~~~~~~~~~~~~~ + +* BAM file(s) or DataSet XML +* a standard index (.bai) for each source BAM file +* genomic interval (e.g. "chr1:1000-2000") + +5.1.3. Processing +~~~~~~~~~~~~~~~~~ + +Obtain an `htslib`_ "iterator" object for a given file and region. This will be +wrapped by pbbam to hide the low-level nature of this type, as well as handling +memory lifetime. + +5.1.4. Outputs +~~~~~~~~~~~~~~ + +Iterator providing access to individual BAM records from the input data sources, +which are aligned to the requested genomic interval. + +For example: + +.. 
code:: c++ + + GenomicIntervalQuery query(interval, dataset); + for (const BamRecord& record : query) { + // ... do stuff ... + } + + +5.1.5. Regulatory Compliance +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +N/A + +5.2. Query BAM data by filter criteria +----------------------------------------- + +5.2.1. Description +~~~~~~~~~~~~~~~~~~ + +pbbam shall allow client applications to query data, limited to some filter +criteria (e.g. only reads from ZMW hole number 200 with a read quality of >0.5). + +5.2.2. Inputs +~~~~~~~~~~~~~ + +* BAM file(s) or DataSet XML +* a `PacBio BAM index`_ (.pbi) for each source BAM file +* filters supported by data contained in the PBI + +5.2.3. Processing +~~~~~~~~~~~~~~~~~ + +Query PBI file(s) for records that match the provided filter criteria. Merge +contiguous runs of records into record blocks, to minimize seeks. Advancing the +iterator either reads the next read from the current block or seeks to the next +block and fetches the next record. + +5.2.4. Outputs +~~~~~~~~~~~~~~ + +Iterator providing access to individual BAM records from the input data sources, +which satisfy the requested filter criteria. + +For example: + +.. code:: c++ + + PbiFilterQuery query(filter, dataset); + for (const BamRecord& record : query) { + // ... do stuff ... + } + +5.2.5. Regulatory Compliance +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +N/A + +5.3. Write PacBio BAM data +------------------------------------------ + +5.3.1. Description +~~~~~~~~~~~~~~~~~~ + +pbbam shall be able to write `PacBio BAM`_ files conforming to the specification. + +5.3.2. Inputs +~~~~~~~~~~~~~ + +* filename +* header information +* BAM records + +5.3.3. Processing +~~~~~~~~~~~~~~~~~ + +Create file handle for the provided filename, output initial header information. +As records are passed in, write to file. Upon completion, flush any buffers and +close file handle. 
+ +Multithreading, provided by `htslib`_, will be utilized where possible to speed +up the compression process - often the main bottleneck of BAM throughput. + +5.3.4. Outputs +~~~~~~~~~~~~~~ + +BAM file conforming to the `PacBio BAM`_ specification. + +5.3.5. Regulatory Compliance +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +N/A + +5.4. Create PacBio BAM index file +------------------------------------------ + +5.4.1. Description +~~~~~~~~~~~~~~~~~~ + +Much of PacBio BAM data processing relies on the presence of a `PacBio BAM index`_ +file. pbbam shall be able to generate this file type for a `PacBio BAM`_ file. + +5.4.2. Inputs +~~~~~~~~~~~~~ + +`PacBio BAM`_ file + +5.4.3. Processing +~~~~~~~~~~~~~~~~~ + +Read through the input BAM records, storing the values relevant to a PBI index. +At end of file, write the index contents to a file and close. + +5.4.4. Outputs +~~~~~~~~~~~~~~ + +`PacBio BAM index`_ file + +5.4.5. Regulatory Compliance +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +N/A + +6. Non-Functional Requirements +============================== + +6.1. Performance Requirements +----------------------------- + +Since pbbam will be used to write all BAM files coming off a Sequel device, the +library must keep pace with data generation requirements. + +** come back to this, hard numbers ?? ** + +6.2. Safety Requirements +------------------------ + +N/A + +6.3. Security Requirements +-------------------------- + +N/A + +6.4. Quality Attributes +----------------------- + +6.4.1. Availability +~~~~~~~~~~~~~~~~~~~ + +N/A + +6.4.2. Integrity +~~~~~~~~~~~~~~~~ + +Inputs and outputs shall adhere to the PacBio BAM or DataSet XML specifications. +Files that do not meet this requirement will raise exceptions and will not be +accepted. + +6.4.3. Interoperability +~~~~~~~~~~~~~~~~~~~~~~~ + +Inputs and outputs shall adhere to the PacBio BAM or DataSet XML specifications. + +6.4.4. Reliability +~~~~~~~~~~~~~~~~~~ + +The developed software shall meet the overall product reliability requirements. 
+ +6.4.5. Robustness +~~~~~~~~~~~~~~~~~ + +pbbam will raise exceptions upon encountering failure cases, allowing client +applications to recover or report the error to a UI. + +6.4.6. Usability +~~~~~~~~~~~~~~~~ + +pbbam shall have comprehensive API documentation, available both on- and offline. +Further documentation will be provided for installation, API usage tips, etc. + +Raised exceptions shall carry as much information as possible so that client +applications can respond with appropriate actions or display useful messages. + +6.4.7. Maintainability +~~~~~~~~~~~~~~~~~~~~~~ + +The source code of the software covered in this functional specification shall +adhere to the PacBio `Software Style Guide`_ (5) work instruction, to guarantee +high quality of code that facilitates maintainability. + +6.4.8. Customizability +~~~~~~~~~~~~~~~~~~~~~~ + +N/A + +6.5. Business Rules +------------------- + +N/A + +6.6. Installation and Upgrade +----------------------------- + +Installation and Upgrade of this software will be handled as part of the SMRT +Analysis subsystem. See `SMRT Analysis`_ (6) specifications for more detail. + +Additionally, the library may be built independently, either from internal +version control (Perforce) or from the public-facing GitHub repository. In +either case, `CMake`_ is used to drive the build process. + +6.7. Administration +------------------- + +N/A + +6.8. User Documentation +----------------------- + +pbbam shall have comprehensive API documentation, available both on- and offline. +Further documentation will be provided for installation, API usage tips, etc. + +The "offline" API documentation may be built directly from the source code, using +`Doxygen`_. Online documentation will be generated via a continuous integration +server, thus ensuring it is always pointing to the current codebase. + +7. High Level Design +==================== + +7.1. 
Top Level Context +---------------------- + +The pbbam library is intended to be linked in with client applications, +providing programmatic access to data files. + +7.2. Use Cases +-------------- + +Primary use cases for pbbam include: + +* BAM file creation +* BAM file query - iterable access to various subsets of data + +8. Detailed Design +================== + +8.1. Structural Representation +------------------------------ + + *image(s) here* + +8.2. Behavioral Representation +------------------------------ + +This section provides behavioral (dynamic) representation of how the +elements of the system realize the required use cases. + +Describe how the significant subsystems and classes interact with each +other to realize the architecturally significant use cases. + +Provide a link to a file containing Sequence Diagram or Activity Diagram, when applicable. +The link may be provided with use of 'image' directive. + +Sequence Diagram shows one use case scenario, executed by class model, +with sequence of operations over period of time (time increased from top +to bottom). It shows interactions between objects, but does not show +relationships between them. + +Activity Diagram is a virtual representation of the sequential flow and +control logic of a set of related activities or actions. It is a type of +flowchart, frequently called Swim Lane Diagram, because activities of +each entity are presented within its swim lane. + +Note: You may use http://wsd tool to auto-generate a sequence diagram from +a descriptive text file, save the diagram to the wsd site, get link to the image, +and add this link to the document with use of 'image' directive. + +8.3. Information Storage +------------------------ + +pbbam software requires no persistent storage outside of availability of input +and output during analysis. + +8.4. 
Technology Overview +------------------------ + +pbbam is implemented in C++11 and should perform as designed on any UNIX-like +operating system (Linux distributions, Apple OSX, etc.). + +8.5. SOUP Components +-------------------- + +pbbam utilizes CMake for its build system. The C++ library uses the following +3rd-party software components: Boost, htslib, & zlib. Wrappers for additional +languages are generated using SWIG. + +8.6. Deployment and Configuration +--------------------------------- + +Please refer to `SMRT Analysis`_ (6) documentation. + +9. Automated Tests +================== + +9.1. Unit Testing +----------------- + +The library shall have unit tests for all classes & components. + +9.2. Performance Testing +------------------------ + +Unit tests may evaluate performance requirements as desired. + +9.3. Regression Testing +----------------------- + +As its role is primarily in data I/O, pbbam has no "scientific quality/validity" +metrics that would indicate a regression. Instead, passing its unit tests and +end-to-end tests will indicate that a regression has not been introduced. + +These tests will be run after each check-in and nightly. + +10. Requirements Traceability Matrices +====================================== + +This section provides traces from requirements specified in PRD/DIR documents to the +requirements covered in this functional specification, and from these +functional requirements to corresponding Test Cases/Procedures. + +10.1. 
HPQC Functional Specifications +------------------------------------ + ++-------------+---------------------------+---------------------------------------------------+-------------+------------+-----------+-------------------------------------------+ +| **PBI_ID** | **Name** | **Description** | **Comment** | **Metric** | **Owner** | **PRD/DIR Path** | ++=============+===========================+===================================================+=============+============+===========+===========================================+ +| 5.1 | Query BAM data by | pbbam shall allow client applications to query | | | dbarnett | | +| | genomic region | data, limited to some genomic region of interest. | | | | | +| | | | | | | | ++-------------+---------------------------+---------------------------------------------------+-------------+------------+-----------+-------------------------------------------+ +| 5.2 | Query BAM data by | pbbam shall allow client applications to query | | | dbarnett | | +| | filter criteria | data, limited to some filter criteria (e.g. only | | | | | +| | | reads from ZMW hole number 200 with a read | | | | | +| | | quality of >0.5). | | | | | +| | | | | | | | ++-------------+---------------------------+---------------------------------------------------+-------------+------------+-----------+-------------------------------------------+ +| 5.3 | Write PacBio BAM data | pbbam shall be able to write files conforming to | | | dbarnett | | +| | | the `PacBio BAM`_ specification. | | | | | +| | | | | | | | ++-------------+---------------------------+---------------------------------------------------+-------------+------------+-----------+-------------------------------------------+ +| 5.4 | Create PacBio BAM index | Much of PacBio BAM data processing relies on the | | | dbarnett | | +| | file | presence of a `PacBio BAM index`_ file. pbbam | | | | | +| | | shall be able to generate this file type for a | | | | | +| | | `PacBio BAM`_ file. 
| | | | | +| | | | | | | | ++-------------+---------------------------+---------------------------------------------------+-------------+------------+-----------+-------------------------------------------+ + +10.2. Automated Tests Coverage +------------------------------ + ++-------------+---------------------------+----------------------------------------------------+------------------------------------------------------------------+ +| **FS Item** | **FS Item Title** | **Use Case Description** | **Test Case Name/ID** | ++=============+===========================+====================================================+==================================================================+ +| 5.1 | Query BAM data by | pbbam shall allow client applications to query | TODO | +| | genomic region | data, limited to some genomic region of interest. | | +| | | | | ++-------------+---------------------------+----------------------------------------------------+------------------------------------------------------------------+ +| 5.2 | Query BAM data by | pbbam shall allow client applications to query | TODO | +| | filter criteria | data, limited to some filter criteria (e.g. only | | +| | | reads from ZMW hole number 200 with a read | | +| | | quality of >0.5). | | +| | | | | ++-------------+---------------------------+----------------------------------------------------+------------------------------------------------------------------+ +| 5.3 | Write PacBio BAM data | pbbam shall be able to write files conforming to | TODO | +| | | the `PacBio BAM`_ specification. | | +| | | | | ++-------------+---------------------------+----------------------------------------------------+------------------------------------------------------------------+ +| 5.4 | Create PacBio BAM index | Much of PacBio BAM data processing relies on the | TODO | +| | file | presence of a `PacBio BAM index`_ file. 
pbbam | | +| | | shall be able to generate this file type for a | | +| | | `PacBio BAM`_ file. | | +| | | | | ++-------------+---------------------------+----------------------------------------------------+------------------------------------------------------------------+ + diff --git a/docs/specs/pbbam_structure.png b/docs/specs/pbbam_structure.png new file mode 100755 index 0000000..40f50cf Binary files /dev/null and b/docs/specs/pbbam_structure.png differ diff --git a/docs/specs/pbbam_updated_release3_2.rst b/docs/specs/pbbam_updated_release3_2.rst new file mode 100755 index 0000000..72d9b76 --- /dev/null +++ b/docs/specs/pbbam_updated_release3_2.rst @@ -0,0 +1,618 @@ +============================================================= +**Pbbam Core API Software Design & Functional Specification** +============================================================= +| *Version 0.2* +| *Pacific Biosciences Engineering Group* +| *Oct 17, 2016* + +1. Revision History +=================== + ++-------------+---------------+--------------------+---------------------------------+ +| **Date** | **Revision** | **Author(s)** | **Comments** | ++=============+===============+====================+=================================+ +| 01-29-2016 | 0.1 | Derek Barnett | Initial draft created | +| | | | | ++-------------+---------------+--------------------+---------------------------------+ +| 10-17-2016 | 0.2 | Derek Barnett | Added behavioral representation | +| | | | and structural representation | +| | | | diagram | ++-------------+---------------+--------------------+---------------------------------+ + +2. Introduction +=============== + +2.1. 
Document Specification Identifier +-------------------------------------- + ++-----------------------------------+------------------------------------------+ +| **Document Specification Prefix** | **Description** | ++===================================+==========================================+ +| FS\_SA\_PBBAM\_ | Functional spec for pbbam | ++-----------------------------------+------------------------------------------+ + +2.2. Purpose +------------ + +This document is intended to describe the requirements and interface of the pbbam +library, which provides functionality for creating, querying, and editing PacBio +BAM files and associated file formats. + +2.3. Scope of Document +---------------------- + +This document covers the expected usage of the pbbam library, as well as any +desired or required performance characteristics with respect to quality or speed. + +This document does not provide installation instructions or API documentation. + +2.4. Glossary of Terms +---------------------- + +The table below specifies only terms specific to this document, and skips +acronyms/terms that are specified in `Pacific Biosciences Software Glossary`_. + +.. _Pacific Biosciences Software Glossary: http://smrtanalysis-docs/pb_sw_glossary.html + ++------------------+-----------------------------------------------------------+ +| **Acronym/Term** | **Description** | ++==================+===========================================================+ +| API | Application Programming Interface - a set of routines, | +| | protocols, and tools for building software applications. | +| | In this document, this will consist of one or more | +| | cooperating libraries that specify data structures, | +| | methods, etc. for use within a target programming | +| | language. | ++------------------+-----------------------------------------------------------+ +| Client | An application that uses the library. 
| ++------------------+-----------------------------------------------------------+ +| I/O | Input/output of data. | ++------------------+-----------------------------------------------------------+ + +2.5. References +--------------- + ++-------------+------------------------------+--------------------------------------+ +| **Ref No.** | **Document Name, Link** | **Description** | ++=============+==============================+======================================+ +| (1) | `BAM format`_ | General SAM/BAM specification | ++-------------+------------------------------+--------------------------------------+ +| (2) | `PacBio BAM`_ | PacBio BAM specification | ++-------------+------------------------------+--------------------------------------+ +| (3) | `PacBio BAM index`_ | PacBio BAM index specification | ++-------------+------------------------------+--------------------------------------+ +| (4) | `DataSet XML`_ | PacBio DataSet XML specification | ++-------------+------------------------------+--------------------------------------+ +| (5) | `Software Style Guide`_ | PacBio coding standards | ++-------------+------------------------------+--------------------------------------+ +| (6) | `SMRT Analysis`_ | General SMRT Analysis infrastructure | ++-------------+------------------------------+--------------------------------------+ + +.. _BAM format: https://samtools.github.io/hts-specs/SAMv1.pdf +.. _PacBio BAM: http://pacbiofileformats.readthedocs.org/en/3.0/BAM.html +.. _PacBio BAM index: http://pacbiofileformats.readthedocs.org/en/3.0/PacBioBamIndex.html +.. _DataSet XML: https://github.com/PacificBiosciences/PacBioFileFormats/blob/3.0/DataSet.rst +.. _Software Style Guide: http://smrtanalysis-docs/_downloads/PBISoftwareStyleGuide.doc +.. _SMRT Analysis: http://smrtanalysis-docs/smrt_docs.html + +3. Software Overview +==================== + +3.1. 
Software Module Description +-------------------------------- + +As of the 3.0 release of SMRT Analysis, PacBio is embracing the industry standard +`BAM format`_ (1) for (both aligned and unaligned) basecall data files. We have +also formulated a BAM companion file format (.bam.pbi) enabling fast access to a +richer set of per-read information as well as compatibility for software built +around the legacy cmp.h5 format. + +The pbbam library provides components to create, query, & transform PacBio BAM +data: sequence files and their associated indices. This includes a core C++ +library as well as bindings for additional programming languages. + +3.2. Software Module Functional Capabilities +-------------------------------------------- + +The library must be able to read and write BAM files that conform to the +`PacBio BAM`_ specification (2). BAM records must be editable e.g. adding +alignment information. Random access must be supported, whether by genomic +region or by filtering record features. To this end, the library will be able to +read, write, and create associated index files - both the standard BAM index +(.bai) and the `PacBio BAM index`_ (.pbi) (3). In addition to working with +individual files, datasets of related BAM files will be supported. These are +described in a `DataSet XML`_ document. (4) + +3.3. User Characteristics +------------------------- + ++---------------------+--------------------------------------------------------+ +| **User Class/Role** | **User Knowledge and Skill Levels** | ++=====================+========================================================+ +| Developer | Competence in one or more programming languages | +| | supported (C++, R, Python, C#). No knowledge of | +| | molecular biology wet lab techniques required. | ++---------------------+--------------------------------------------------------+ + +3.4. 
User Operations and Practices +---------------------------------- + +Developer users will interact with the software by incorporating the library +into a client application. + +3.5. Operating Environment +-------------------------- + +The software is intended to be run in a Linux or OSX environment, with ideally 4 +or more cores. + +3.6. General Constraints +------------------------ + +Currently there are no constraints outside the operating environment and speed +requirements. In particular, as the library will be used for writing the BAM +files coming off a Sequel instrument, it should be able to keep pace. + +3.7. Assumptions and Dependencies +--------------------------------- + +Input routines for the library will expect to receive files that conform to the +`PacBio BAM`_ (2) or `DataSet XML`_ (4) specifications. + +The pbbam library depends on Boost, zlib, and htslib libraries. + +3.8. Other Software +------------------- + +Output PacBio BAMs will be compatible with the `PacBio BAM`_ specification (2) +and thus compatible with the general `BAM format`_ specification (1). This +ensures that a wide variety of downstream tools can interact with data files. + +The software uses `CMake`_ as its build system. + +The core C++ API relies on the following 3rd party components: + +* `zlib`_ +* `htslib`_ +* `Boost`_ (header-only modules) + +Wrapper APIs for additional languages (Python, R, C#) are generated by `SWIG`_. + +API documentation is generated via `Doxygen`_. + +.. _CMake: https://cmake.org/ +.. _zlib: http://www.zlib.net/ +.. _htslib: https://github.com/samtools/htslib +.. _Boost: http://www.boost.org/ +.. _SWIG: http://www.swig.org/ +.. _Doxygen: http://www.stack.nl/~dimitri/doxygen/ + +4. External Interfaces +====================== + +4.1. User Interfaces +-------------------- + +N/A + +4.2. 
Software Interfaces +------------------------ + +pbbam will require the following software: + +* `htslib`_ & `zlib`_ - provides low-level handling of compressed BAM data +* `Boost`_ - provides utility classes + +Incoming data from upstream components will be compliant with +PacBio BAM format - see `PacBio BAM`_ specification (2) for more detail. + +4.3. Hardware Interfaces +------------------------ + +N/A + +4.4. Communications Interfaces +------------------------------ + +N/A + +5. Functional Requirements +========================== + +5.1. Query BAM data by genomic region +------------------------------------- + +5.1.1. Description +~~~~~~~~~~~~~~~~~~ + +pbbam shall allow client applications to query data, limited to some genomic +region of interest. + +5.1.2. Inputs +~~~~~~~~~~~~~ + +* BAM file(s) or DataSet XML +* a standard index (.bai) for each source BAM file +* genomic interval (e.g. "chr1:1000-2000") + +5.1.3. Processing +~~~~~~~~~~~~~~~~~ + +Obtain an `htslib`_ "iterator" object for a given file and region. This will be +wrapped by pbbam to hide the low-level nature of this type, as well as handling +memory lifetime. + +5.1.4. Outputs +~~~~~~~~~~~~~~ + +Iterator providing access to individual BAM records from the input data sources, +which are aligned to the requested genomic interval. + +For example: + +.. code:: c++ + + GenomicIntervalQuery query(interval, dataset); + for (const BamRecord& record : query) { + // ... use record data ... + } + + +5.2. Query BAM data by filter criteria +-------------------------------------- + +5.2.1. Description +~~~~~~~~~~~~~~~~~~ + +pbbam shall allow client applications to query data, limited to some filter +criteria (e.g. only reads from ZMW hole number 200 with a read quality of >0.5). + +5.2.2. Inputs +~~~~~~~~~~~~~ + +* BAM file(s) or DataSet XML +* a `PacBio BAM index`_ (.pbi) for each source BAM file +* filters supported by data contained in the PBI + +5.2.3. 
Processing +~~~~~~~~~~~~~~~~~ + +Query PBI file(s) for records that match the provided filter criteria. Merge +contiguous runs of records into record blocks, to minimize seeks. Advancing the +iterator either reads the next read from the current block or seeks to the next +block and fetches the next record. + +5.2.4. Outputs +~~~~~~~~~~~~~~ + +Iterator providing access to individual BAM records from the input data sources, +which satisfy the requested filter criteria. + +For example: + +.. code:: c++ + + PbiFilterQuery query(filter, dataset); + for (const BamRecord& record : query) { + // ... do stuff ... + } + + +5.3. Write PacBio BAM data +-------------------------- + +5.3.1. Description +~~~~~~~~~~~~~~~~~~ + +pbbam shall be able to write `PacBio BAM`_ files conforming to the specification. + +5.3.2. Inputs +~~~~~~~~~~~~~ + +* filename +* header information +* BAM records + +5.3.3. Processing +~~~~~~~~~~~~~~~~~ + +Create file handle for the provided filename, output initial header information. +As records are passed in, write to file. Upon completion, flush any buffers and +close file handle. + +Multithreading, provided by `htslib`_, will be utilized where possible to speed +up the compression process - often the main bottleneck of BAM throughput. + +5.3.4. Outputs +~~~~~~~~~~~~~~ + +BAM file conforming to the `PacBio BAM`_ specification. + +5.4. Create PacBio BAM index file +--------------------------------- + +5.4.1. Description +~~~~~~~~~~~~~~~~~~ + +Much of PacBio BAM data processing relies on the presence of a `PacBio BAM index`_ +file. pbbam shall be able to generate this file type for a `PacBio BAM`_ file. + +5.4.2. Inputs +~~~~~~~~~~~~~ + +`PacBio BAM`_ file + +5.4.3. Processing +~~~~~~~~~~~~~~~~~ + +Read through the input BAM records, storing the values relevant to a PBI index. +At end of file, write the index contents to a file and close. + +5.4.4. Outputs +~~~~~~~~~~~~~~ + +`PacBio BAM index`_ file + +6. 
Non-Functional Requirements +============================== + +6.1. Performance Requirements +----------------------------- + +Since pbbam will be used to write all BAM files coming off a Sequel instrument, the +library must keep pace with data generation requirements. + +6.2. Safety Requirements +------------------------ + +N/A + +6.3. Security Requirements +-------------------------- + +N/A + +6.4. Quality Attributes +----------------------- + +6.4.1. Availability +~~~~~~~~~~~~~~~~~~~ + +The developed software shall meet the overall product availability requirements. + +6.4.2. Data Integrity +~~~~~~~~~~~~~~~~~~~~~ + +Inputs and outputs shall adhere to the PacBio BAM or DataSet XML specifications. +Files that do not meet this requirement will raise exceptions and will not be +accepted. + +6.4.3. Interoperability +~~~~~~~~~~~~~~~~~~~~~~~ + +Inputs and outputs shall adhere to the PacBio BAM or DataSet XML specifications. + +6.4.4. Reliability +~~~~~~~~~~~~~~~~~~ + +The developed software shall meet the overall product reliability requirements. + +6.4.5. Robustness +~~~~~~~~~~~~~~~~~ + +pbbam will raise exceptions upon encountering failure cases, allowing client +applications to recover or report the error to a UI. + +6.4.6. Usability +~~~~~~~~~~~~~~~~ + +pbbam shall have comprehensive API documentation, available both on- and offline. +Further documentation will be provided for installation, API usage tips, etc. + +Raised exceptions shall carry as much information as possible so that client +applications can respond with appropriate actions or display useful messages. + +6.4.7. Maintainability +~~~~~~~~~~~~~~~~~~~~~~ + +The source code of the software covered in this functional specification shall +adhere to the PacBio `Software Style Guide`_ (5) work instruction, to guarantee +high quality of code that facilitates maintainability. + +6.4.8. Customizability +~~~~~~~~~~~~~~~~~~~~~~ + +N/A + +6.4.9. 
Compatibility +~~~~~~~~~~~~~~~~~~~~ + +pbbam shall support backward compatibility of the API and BAM format versions +in order not to break existing clients. + +6.5. Business Rules +------------------- + +N/A + +6.6. Compliance Requirements +---------------------------- + +N/A + +6.7. Alarms and Error Handling +------------------------------ + +Raised exceptions shall carry as much information as possible so that client +applications can respond with appropriate actions or display useful messages. + +6.8. Persistence Requirements +----------------------------- + +pbbam software requires no persistent storage outside of availability of input +and output during analysis. + +6.9. Installation and Upgrade +----------------------------- + +Installation and Upgrade of this software will be handled as part of the SMRT +Analysis subsystem. See `SMRT Analysis`_ (6) specifications for more detail. + +Additionally, the library may be built independently, either from internal +version control (Perforce) or from the public-facing Github repository. In +either case, `CMake`_ is used to drive the build process. + +6.10. Administration and Maintenance +------------------------------------ + +N/A + +6.11. User Documentation +------------------------ + +pbbam shall have comprehensive API documentation, available both on- and offline. +Further documentation will be provided for installation, API usage tips, etc. + +The "offline" API documentation may be built directly from the source code, using +`Doxygen`_. Online documentation will be generated via a continuous integration +server, thus ensuring it is always pointing to the current codebase. + +7. High Level Design +==================== + +7.1. Top Level Context +---------------------- + +The pbbam library is intended to be linked in with client applications, +providing programmatic access to data files. + +7.2. 
Use Cases +-------------- + +Primary use cases for pbbam include: + +* BAM file creation +* BAM file query - iterable access to various subsets of data + +8. Detailed Design +================== + +8.1. Structural Representation +------------------------------ + +.. image:: ./pbbam_structure.png + +8.2. Behavioral Representation +------------------------------ + +The typical access pattern involves a client query against BAM data, optionally +described in DataSet XML. The query may involve some number of filter criteria. + +pbbam queries the associated index files (*.pbi) to pre-determine which records +pass filtering criteria and where they reside on disk. The client code is given +an iterable object, such that each iteration of the main access loop returns a +valid BAM record for analysis, modification, etc. + +8.3. Information Storage +------------------------ + +pbbam software requires no persistent storage outside of availability of input +and output during analysis. + +8.4. Technology Overview +------------------------ + +pbbam is implemented in C++11 and should perform as designed on any UNIX-like +operating system (Linux distributions, Apple OSX, etc.). + +8.5. SOUP Components +-------------------- + +pbbam utilizes CMake for its build system. The C++ library uses the following +3rd-party software components: `Boost`_, `htslib`_, & `zlib`_. Wrappers for additional +languages are generated using SWIG. + +8.6. Deployment and Configuration +--------------------------------- + +Please refer to `SMRT Analysis`_ (6) documentation. + +9. Automated Tests +================== + +9.1. Unit Testing +----------------- + +The library shall have unit tests for all classes & components. + +9.2. Performance Testing +------------------------ + +Unit tests may evaluate performance requirements as desired. + +9.3. 
Regression Testing +----------------------- + +As its role is primarily in data I/O, pbbam has no "scientific quality/validity" +metrics that would indicate a regression. Instead, passing its unit tests and +end-to-end tests will indicate that a regression has not been introduced. + +These tests will be run after each check-in and nightly. + +10. Requirements Traceability Matrices +====================================== + +This section provides traces from requirements specified in PRD/DIR documents to the +requirements covered in this functional specification, and from these +functional requirements to corresponding Test Cases/Procedures. + +10.1. HPQC Functional Specifications +------------------------------------ + ++-------------+---------------------------+---------------------------------------------------+-------------+------------+-----------+--------------------------------------------------+ +| **PBI_ID** | **Name** | **Description** | **Comment** | **Metric** | **Owner** | **PRD/DIR Path** | ++=============+===========================+===================================================+=============+============+===========+==================================================+ +| 5.1 | Query BAM data by | pbbam shall allow client applications to query | | Yes | dbarnett | \\DIR\\Functionality\\Software\Common\APIs\\ | +| | genomic region | data, limited to some genomic region of interest. | | | | Software shall provide an API to allow 3rd | +| | | | | | | party software to extract all run information | +| | | | | | | including summary reports and locations | ++-------------+---------------------------+---------------------------------------------------+-------------+------------+-----------+--------------------------------------------------+ +| 5.2 | Query BAM data by | pbbam shall allow client applications to query | | Yes | dbarnett | \\DIR\\Functionality\\Software\Common\APIs\\ | +| | filter criteria | data, limited to some filter criteria (e.g. 
only | | | | Software shall provide an API to allow 3rd | +| | | reads from ZMW hole number 200 with a read | | | | party software to extract all run information | +| | | quality of >0.5). | | | | including summary reports and locations | ++-------------+---------------------------+---------------------------------------------------+-------------+------------+-----------+--------------------------------------------------+ +| 5.3 | Write PacBio BAM data | pbbam shall be able to write files conforming to | | Yes | dbarnett | \\DIR\\Functionality\\Software\\PostProcessing\\ | +| | | the `PacBio BAM`_ specification. | | | | Software shall provide base files including | +| | | | | | | kinetic information in industry standard format | +| | | | | | | such as SAM/BAM using current specifications | ++-------------+---------------------------+---------------------------------------------------+-------------+------------+-----------+--------------------------------------------------+ +| 5.4 | Create PacBio BAM index | Much of PacBio BAM data processing relies on the | | Yes | dbarnett | \\DIR\\Functionality\\Software\\PostProcessing\\ | +| | file | presence of a `PacBio BAM index`_ file. pbbam | | | | Software shall provide base files including | +| | | shall be able to generate this file type for a | | | | kinetic information in industry standard format | +| | | `PacBio BAM`_ file. | | | | such as SAM/BAM using current specifications | ++-------------+---------------------------+---------------------------------------------------+-------------+------------+-----------+--------------------------------------------------+ + +10.2. 
Automated Tests Coverage +------------------------------ + ++-------------+---------------------------+----------------------------------------------------+------------------------------------------------------------------+ +| **FS Item** | **FS Item Title** | **Use Case Description** | **Test Case Name/ID** | ++=============+===========================+====================================================+==================================================================+ +| 5.1 | Query BAM data by | pbbam shall allow client applications to query | See section 9.1. Unit Testing. | +| | genomic region | data, limited to some genomic region of interest. | | ++-------------+---------------------------+----------------------------------------------------+------------------------------------------------------------------+ +| 5.2 | Query BAM data by | pbbam shall allow client applications to query | See section 9.1. Unit Testing. | +| | filter criteria | data, limited to some filter criteria (e.g. only | | +| | | reads from ZMW hole number 200 with a read | | +| | | quality of >0.5). | | ++-------------+---------------------------+----------------------------------------------------+------------------------------------------------------------------+ +| 5.3 | Write PacBio BAM data | pbbam shall be able to write files conforming to | See section 9.1. Unit Testing. | +| | | the `PacBio BAM`_ specification. | | ++-------------+---------------------------+----------------------------------------------------+------------------------------------------------------------------+ +| 5.4 | Create PacBio BAM index | Much of PacBio BAM data processing relies on the | See section 9.1. Unit Testing. | +| | file | presence of a `PacBio BAM index`_ file. pbbam | | +| | | shall be able to generate this file type for a | | +| | | `PacBio BAM`_ file. 
| | ++-------------+---------------------------+----------------------------------------------------+------------------------------------------------------------------+ + diff --git a/include/pbbam/Accuracy.h b/include/pbbam/Accuracy.h new file mode 100644 index 0000000..f1db014 --- /dev/null +++ b/include/pbbam/Accuracy.h @@ -0,0 +1,89 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file Accuracy.h +/// \brief Defines the Accuracy class. +// +// Author: Derek Barnett + +#ifndef ACCURACY_H +#define ACCURACY_H + +#include "pbbam/Config.h" + +namespace PacBio { +namespace BAM { + +/// \brief The Accuracy class represents the expected accuracy of a BamRecord. +/// +/// Values are clamped to fall within [0,1]. +/// +class PBBAM_EXPORT Accuracy +{ +public: + static const float MIN; ///< Minimum valid accuracy value [0.0] + static const float MAX; ///< Maximum valid accuracy value [1.0] + +public: + /// \name Constructors & Related Methods + /// \{ + + /// Constructs an Accuracy object from a floating-point number. + /// + /// \note This is not an \b explicit ctor, to make it as easy as + /// possible to use in numeric operations. We really just want + /// to make sure that the acceptable range is respected. 
+ /// + Accuracy(float accuracy); + Accuracy(const Accuracy& other); + ~Accuracy(void); + + /// \} + +public: + /// \returns Accuracy as float primitive + operator float(void) const; + +private: + float accuracy_; +}; + +} // namespace BAM +} // namespace PacBio + +#include "pbbam/internal/Accuracy.inl" + +#endif // ACCURACY_H diff --git a/include/pbbam/AlignmentPrinter.h b/include/pbbam/AlignmentPrinter.h new file mode 100644 index 0000000..4dda6cd --- /dev/null +++ b/include/pbbam/AlignmentPrinter.h @@ -0,0 +1,110 @@ +// Copyright (c) 2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file AlignmentPrinter.h +/// \brief Defines the AlignmentPrinter class. +// +// Author: Armin Töpfer + +#ifndef ALIGNMENTPRINTER_H +#define ALIGNMENTPRINTER_H + +#include +#include +#include "pbbam/BamRecord.h" +#include "pbbam/IndexedFastaReader.h" +#include "pbbam/Orientation.h" + +namespace PacBio { +namespace BAM { + +class BamRecord; + +/// \brief The AlignmentPrinter class "pretty-prints" an alignment with respect +/// to its associated reference sequence. +/// +/// Example output: +/// \verbinclude plaintext/AlignmentPrinterOutput.txt +/// +class AlignmentPrinter +{ +public: + /// \name Constructors & Related Methods + /// \{ + + /// Constructs the alignment printer with an associated FASTA file reader. + /// + /// \param[in] ifr FASTA reader + /// + /// \throws std::runtime_error if FASTA file cannot be opened for reading. + /// + AlignmentPrinter(const IndexedFastaReader& ifr); + + AlignmentPrinter(void) = delete; + AlignmentPrinter(const AlignmentPrinter&) = delete; + AlignmentPrinter(AlignmentPrinter&&) = default; + AlignmentPrinter& operator=(const AlignmentPrinter&) = delete; + AlignmentPrinter& operator=(AlignmentPrinter&&) = default; + ~AlignmentPrinter(void) = default; + + /// \} + +public: + /// \name Printing + /// \{ + + /// Pretty-prints an aligned BamRecord to std::string. 
+ /// + /// \note The current implementation includes ANSI escape sequences for + /// coloring terminal output. Future versions of this method will + /// likely make this optional. + /// + /// \returns formatted string containing the alignment and summary + /// information + /// + std::string Print(const BamRecord& record, + const Orientation orientation = Orientation::GENOMIC); + + /// \} + +private: + const std::unique_ptr ifr_; +}; + +} // namespace BAM +} // namespace PacBio + +#endif // ALIGNMENTPRINTER_H diff --git a/include/pbbam/BaiIndexedBamReader.h b/include/pbbam/BaiIndexedBamReader.h new file mode 100644 index 0000000..7441c69 --- /dev/null +++ b/include/pbbam/BaiIndexedBamReader.h @@ -0,0 +1,130 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file BaiIndexedBamReader.h +/// \brief Defines the BaiIndexedBamReader class. +// +// Author: Derek Barnett + +#ifndef BAIINDEXEDBAMREADER_H +#define BAIINDEXEDBAMREADER_H + +#include "pbbam/BamReader.h" +#include "pbbam/BamFile.h" +#include "pbbam/GenomicInterval.h" + +namespace PacBio { +namespace BAM { + +namespace internal { struct BaiIndexedBamReaderPrivate; } + +/// \brief The BaiIndexedBamReader class provides read-only iteration over %BAM +/// records, bounded by a particular genomic interval. +/// +/// The SAM/BAM standard index (*.bai) is used to allow random-access operations. +/// +class PBBAM_EXPORT BaiIndexedBamReader : public BamReader +{ +public: + /// \name Constructors & Related Methods + /// \{ + + /// \brief Constructs %BAM reader, bounded by a genomic interval. + /// + /// All reads that overlap the interval will be available. + /// + /// \param[in] interval iteration will be bounded by this GenomicInterval. + /// \param[in] filename input %BAM filename + /// + /// \throws std::runtime_error if either file (*.bam or *.bai) fails to open + /// for reading, or if the interval is invalid + /// + BaiIndexedBamReader(const GenomicInterval& interval, + const std::string& filename); + + /// \brief Constructs BAM reader, bounded by a genomic interval. + /// + /// All reads that overlap the interval will be available. 
+ /// + /// \param[in] interval iteration will be bounded by this GenomicInterval. + /// \param[in] bamFile input BamFile object + /// + /// \throws std::runtime_error if either file (*.bam or *.bai) fails to open + /// for reading, or if the interval is invalid + /// + BaiIndexedBamReader(const GenomicInterval& interval, const BamFile& bamFile); + + /// \brief Constructs %BAM reader, bounded by a genomic interval. + /// + /// All reads that overlap the interval will be available. + /// + /// \param[in] interval iteration will be bounded by this GenomicInterval. + /// \param[in] bamFile input BamFile object + /// + /// \throws std::runtime_error if either file (*.bam or *.bai) fails to open + /// for reading, or if the interval is invalid + /// + BaiIndexedBamReader(const GenomicInterval& interval, BamFile&& bamFile); + + /// \} + +public: + /// \name Random-Access + /// \{ + + /// \returns the current GenomicInterval in use by this reader + const GenomicInterval& Interval(void) const; + + /// \brief Sets a new genomic interval on the reader. + /// + /// \param[in] interval + /// \returns reference to this reader + /// + BaiIndexedBamReader& Interval(const GenomicInterval& interval); + + /// \} + +protected: + int ReadRawData(BGZF* bgzf, bam1_t* b); + +private: + std::unique_ptr d_; +}; + +} // namespace BAM +} // namespace PacBio + +#endif // BAIINDEXEDBAMREADER_H diff --git a/include/pbbam/BamFile.h b/include/pbbam/BamFile.h new file mode 100644 index 0000000..d7c6811 --- /dev/null +++ b/include/pbbam/BamFile.h @@ -0,0 +1,218 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file BamFile.h +/// \brief Defines the BamFile class. 
+// +// Author: Derek Barnett + +#ifndef BAMFILE_H +#define BAMFILE_H + +#include "pbbam/Config.h" +#include "pbbam/BamHeader.h" +#include + +namespace PacBio { +namespace BAM { + +namespace internal { class BamFilePrivate; } + +/// \brief The BamFile class represents a %BAM file. +/// +/// It provides access to header metadata and methods for finding/creating +/// associated index files. +/// +class PBBAM_EXPORT BamFile +{ +public: + + /// \name Constructors & Related Methods + /// \{ + + /// \brief Creates a BamFile object on the provided \p filename & + /// loads header information. + /// + /// \param[in] filename %BAM filename + /// \throws std::exception on failure to open %BAM file for reading + /// + BamFile(const std::string& filename); + + BamFile(const BamFile& other); + BamFile(BamFile&& other); + BamFile& operator=(const BamFile& other); + BamFile& operator=(BamFile&& other); + ~BamFile(void); + + /// \} + +public: + + /// \name Index & Filename Methods + /// \{ + + /// \brief Creates a ".pbi" file for this %BAM file. + /// + /// \note Existing index file will be overwritten. Use + /// EnsurePacBioIndexExists() if this is not desired. + /// + /// \throws if PBI file could not be properly created and/or + /// written to disk + /// + void CreatePacBioIndex(void) const; + + /// \brief Creates a ".bai" file for this %BAM file. + /// + /// \note Existing index file will be overwritten. Use + /// EnsureStandardIndexExists() if this is not desired. + /// + /// \throws if BAI file could not be properly created (e.g. this + /// %BAM is not coordinate-sorted) or could not be written to disk + /// + void CreateStandardIndex(void) const; + + /// \brief Creates a ".pbi" file if one does not exist or is older than its + /// %BAM file. + /// + /// Equivalent to: + /// \code{.cpp} + /// if (!file.PacBioIndexExists()) + /// file.CreatePacBioIndex(); + /// \endcode + /// + /// \note As of v0.4.02+, no timestamp check is performed. 
Earlier versions + /// performed an additional timestamp check. + /// + /// \throws if PBI file could not be properly created and/or + /// written to disk + /// + void EnsurePacBioIndexExists(void) const; + + /// \brief Creates a ".bai" file if one does not exist or is older than its + /// %BAM file. + /// + /// Equivalent to: + /// \code{.cpp} + /// if (!file.StandardIndexExists()) + /// file.CreateStandardIndex(); + /// \endcode + /// + /// \note As of v0.4.2, no timestamp check is performed. + /// + /// \throws if BAI file could not be properly created (e.g. this + /// %BAM is not coordinate-sorted) or could not be written to disk + /// + void EnsureStandardIndexExists(void) const; + + /// \returns %BAM filename + std::string Filename(void) const; + + /// \returns true if %BAM file has EOF marker (empty BGZF block). Streamed + /// input (filename: "-") + bool HasEOF(void) const; + + /// \returns true if ".pbi" exists and is newer than this %BAM file. + bool PacBioIndexExists(void) const; + + /// \returns filename of %PacBio index file (".pbi") + /// \note No guarantee is made on the existence of this file. + /// This method simply returns the expected filename. + std::string PacBioIndexFilename(void) const; + + /// \returns true if ".pbi" has a more recent timestamp than this file + bool PacBioIndexIsNewer(void) const; + + /// \returns true if ".bai" exists + bool StandardIndexExists(void) const; + + /// \returns filename of standard index file (".bai") + /// \note No guarantee is made on the existence of this file. + /// This method simply returns the expected filename.
+ std::string StandardIndexFilename(void) const; + + /// \returns true if ".bai" has a more recent timestamp than this file + bool StandardIndexIsNewer(void) const; + + /// \} + +public: + /// \name File Header Data + /// \{ + + /// \returns true if header metadata has this reference name + bool HasReference(const std::string& name) const; + + /// \returns const reference to BamHeader containing the file's metadata + const BamHeader& Header(void) const; + + /// \returns true if file is a %PacBio %BAM file (i.e. has non-empty version + /// associated with header "pb" tag) + bool IsPacBioBAM(void) const; + + /// \returns ID for reference \p name (can be used for e.g. + /// GenomicIntervalQuery), or -1 if not found + int ReferenceId(const std::string& name) const; + + /// \return name of reference matching \p id, empty string if not found + std::string ReferenceName(const int id) const; + + /// \returns length of requested reference \p name. 0 if not found + uint32_t ReferenceLength(const std::string& name) const; + + /// \returns length of requested reference \p id. 0 if not found + uint32_t ReferenceLength(const int id) const; + + /// \} + +public: + /// \name Additional Attributes + /// \{ + + /// \returns virtual offset of first alignment. Intended mostly for internal + /// use. Note that this is a BGZF \b virtual offset, not a + /// 'normal' file position. + /// + int64_t FirstAlignmentOffset(void) const; + + /// \} + +private: + std::unique_ptr d_; +}; + +} // namespace BAM +} // namespace PacBio + +#endif // BAMFILE_H diff --git a/include/pbbam/BamHeader.h b/include/pbbam/BamHeader.h new file mode 100644 index 0000000..eada466 --- /dev/null +++ b/include/pbbam/BamHeader.h @@ -0,0 +1,418 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file BamHeader.h +/// \brief Defines the BamHeader class. 
+// // Author: Derek Barnett + +#ifndef BAMHEADER_H +#define BAMHEADER_H + +#include "pbbam/Config.h" +#include "pbbam/ProgramInfo.h" +#include "pbbam/ReadGroupInfo.h" +#include "pbbam/SequenceInfo.h" +#include +#include +#include + +namespace PacBio { +namespace BAM { + +namespace internal { class BamHeaderPrivate; } + +/// \brief The BamHeader class represents the header section of the %BAM file. +/// +/// It provides metadata about the file including file version, reference +/// sequences, read groups, comments, etc. +/// +/// A BamHeader may be fetched from a BamFile to view an existing file's +/// metadata. Or one may be created/edited for use with writing to a new file +/// (via BamWriter). +/// +/// \note A particular BamHeader is likely to be re-used in lots of places +/// throughout the library, for read-only purposes. For this reason, even +/// though a BamHeader may be returned by value, it is essentially a thin +/// wrapper for a shared-pointer to the actual data. This means, though, +/// that if you need to edit an existing BamHeader for use with a +/// BamWriter, please consider using BamHeader::DeepCopy. Otherwise any +/// modifications will affect all BamHeaders that are sharing its +/// underlying data. +/// +class PBBAM_EXPORT BamHeader +{ +public: + /// \name Constructors & Related Methods + /// \{ + + BamHeader(void); + BamHeader(const std::string& samHeaderText); + BamHeader(const BamHeader& other); + BamHeader(BamHeader&& other); + BamHeader& operator=(const BamHeader& other); + BamHeader& operator=(BamHeader&& other); + ~BamHeader(void); + + /// \brief Detaches underlying data from the shared-pointer, returning an + /// independent copy of the header contents. + /// + /// This ensures that any modifications to the newly returned BamHeader do + /// not affect other BamHeader objects that were sharing its underlying data.
+ /// + BamHeader DeepCopy(void) const; + + /// \} + +public: + /// \name Operators + /// \{ + + /// \brief Merges another header with this one. + /// + /// Headers must be compatible for merging. This means that their SortOrder + /// (and in the case of aligned BAM data, Sequences) must match, and both + /// PacBioBamVersion values must meet the minimum supported version. If not, + /// an exception will be thrown. + /// + /// \note As of v0.7.2 the SAM/BAM Version is no longer compared, and as of + /// v0.7.4 the PacBioBamVersion values need not be identical. + /// + /// \param[in] other header to merge with this one + /// \returns reference to this header + /// + /// \throws std::runtime_error if the headers are not compatible + /// + BamHeader& operator+=(const BamHeader& other); + + /// \brief Creates a new, merged header. + /// + /// Headers must be compatible for merging. This means that their SortOrder + /// (and in the case of aligned BAM data, Sequences) must match, and both + /// PacBioBamVersion values must meet the minimum supported version. If not, + /// an exception will be thrown. + /// + /// \note As of v0.7.2 the SAM/BAM Version is no longer compared, and as of + /// v0.7.4 the PacBioBamVersion values need not be identical. + /// + /// Both original headers (this header and \p other) will not be modified. + /// + /// \param[in] other header to merge with this one + /// \returns merged header + /// + /// \throws std::runtime_error if the headers are not compatible + /// + BamHeader operator+(const BamHeader& other) const; + + /// \} + +public: + /// \name General Attributes + /// \{ + + /// \returns the %PacBio %BAM version number (\@HD:pb) + /// + /// \note This is different from the SAM/BAM version number + /// \sa BamHeader::Version.
+ /// + std::string PacBioBamVersion(void) const; + + /// \returns the sort order used + /// + /// Valid values: "unknown", "unsorted", "queryname", or "coordinate" + /// + std::string SortOrder(void) const; + + /// \returns the SAM/BAM version number (\@HD:VN) + /// + /// \note This is different from the %PacBio %BAM version number + /// \sa BamHeader::PacBioBamVersion + /// + std::string Version(void) const; + + /// \} + +public: + /// \name Read Groups + /// \{ + + /// \returns true if the header contains a read group with \p id (\@RG:ID) + bool HasReadGroup(const std::string& id) const; + + /// \returns a ReadGroupInfo object representing the read group matching + /// \p id (\@RG:ID) + /// \throws std::runtime_error if \p id is unknown + /// + ReadGroupInfo ReadGroup(const std::string& id) const; + + /// \returns vector of read group IDs listed in this header + std::vector ReadGroupIds(void) const; + + /// \returns vector of ReadGroupInfo objects, representing all read groups + /// listed in this header + /// + std::vector ReadGroups(void) const; + + /// \} + +public: + /// \name Sequences + /// \{ + + /// \returns true if header contains a sequence with \p name (\@SQ:SN) + bool HasSequence(const std::string& name) const; + + /// \returns number of sequences (\@SQ entries) stored in this header + size_t NumSequences(void) const; + + /// \returns numeric ID for sequence matching \p name (\@SQ:SN) + /// + /// This is the numeric ID used elsewhere throughout the API. + /// + /// \throws std::runtime_error if \p name is unknown + /// \sa BamReader::ReferenceId, PbiReferenceIdFilter, + /// PbiRawMappedData::tId_ + /// + int32_t SequenceId(const std::string& name) const; + + /// \returns the length of the sequence (\@SQ:LN, e.g. 
chromosome length) at + /// index \p id + /// + /// \sa SequenceInfo::Length, BamHeader::SequenceId + /// + std::string SequenceLength(const int32_t id) const; + + /// \returns the name of the sequence (\@SQ:SN) at index \p id + /// + /// \sa SequenceInfo::Name, BamHeader::SequenceId + /// + std::string SequenceName(const int32_t id) const; + + /// \returns vector of sequence names (\@SQ:SN) stored in this header + /// + /// Position in the vector is equivalent to SequenceId. + /// + std::vector SequenceNames(void) const; + + /// \returns SequenceInfo object at index \p id + /// + /// \throws std::out_of_range if \p id is an invalid or unknown index + /// \sa BamHeader::SequenceId + /// + SequenceInfo Sequence(const int32_t id) const; + + /// \returns SequenceInfo for the sequence matching \p name + SequenceInfo Sequence(const std::string& name) const; + + /// \returns vector of SequenceInfo objects representing the sequences + /// (\@SQ entries) stored in this header + /// + std::vector Sequences(void) const; + + /// \} + +public: + /// \name Programs + /// \{ + + /// \returns true if this header contains a program entry with ID (\@PG:ID) + /// matching \p id + /// + bool HasProgram(const std::string& id) const; + + /// \returns ProgramInfo object for the program entry matching \p id + /// \throws std::runtime_error if \p id is unknown + /// + ProgramInfo Program(const std::string& id) const; + + /// \returns vector of program IDs (\@PG:ID) + std::vector ProgramIds(void) const; + + /// \returns vector of ProgramInfo objects representing program entries + /// (\@PG) stored in this header + /// + std::vector Programs(void) const; + + /// \} + +public: + /// \name Comments + /// \{ + + /// \returns vector of comment (\@CO) strings + std::vector Comments(void) const; + + /// \} + +public: + /// \name Conversion Methods + /// \{ + + /// \returns SAM-header-formatted string representing this header's data + std::string ToSam(void) const; + + /// \} + +public: + + /// \name
General Attributes + /// \{ + + /// \brief Sets this header's PacBioBAM version number (\@HD:pb). + /// + /// \returns reference to this object + /// \throws std::runtime_error if version number cannot be parsed or + /// is less than the minimum version allowed. + /// + BamHeader& PacBioBamVersion(const std::string& version); + + /// \brief Sets this header's sort order label (\@HD:SO). + /// + /// Valid values: "unknown", "unsorted", "queryname", or "coordinate" + /// + /// \returns reference to this object + /// + BamHeader& SortOrder(const std::string& order); + + /// \brief Sets this header's SAM/BAM version number (\@HD:VN). + /// + /// \returns reference to this object + /// + BamHeader& Version(const std::string& version); + + /// \} + +public: + /// \name Read Groups + /// \{ + + /// \brief Appends a read group entry (\@RG) to this header. + /// + /// \returns reference to this object + /// + BamHeader& AddReadGroup(const ReadGroupInfo& readGroup); + + /// \brief Removes all read group entries from this header. + /// + /// \returns reference to this object + /// + BamHeader& ClearReadGroups(void); + + /// \brief Replaces this header's list of read group entries with those in + /// \p readGroups. + /// + /// \returns reference to this object + /// + BamHeader& ReadGroups(const std::vector& readGroups); + + /// \} + +public: + /// \name Sequences + /// \{ + + /// \brief Appends a sequence entry (\@SQ) to this header. + /// + /// \returns reference to this object + /// + BamHeader& AddSequence(const SequenceInfo& sequence); + + /// \brief Removes all sequence entries from this header. + /// + /// \returns reference to this object + /// + BamHeader& ClearSequences(void); + + /// \brief Replaces this header's list of sequence entries with those in + /// \p sequences. 
+ /// + /// \returns reference to this object + /// + BamHeader& Sequences(const std::vector& sequences); + + /// \} + +public: + /// \name Programs + /// \{ + + /// \brief Appends a program entry (\@PG) to this header. + /// + /// \returns reference to this object + /// + BamHeader& AddProgram(const ProgramInfo& pg); + + /// \brief Removes all program entries from this header. + /// + /// \returns reference to this object + /// + BamHeader& ClearPrograms(void); + + /// \brief Replaces this header's list of program entries with those in + /// \p programs. + /// + /// \returns reference to this object + /// + BamHeader& Programs(const std::vector& programs); + + /// \} + +public: + /// \name Comments + /// \{ + + /// \brief Appends a comment (\@CO) to this header. + /// + /// \returns reference to this object + /// + BamHeader& AddComment(const std::string& comment); + + /// \brief Removes all comments from this header. + /// + /// \returns reference to this object + /// + BamHeader& ClearComments(void); + + /// \brief Replaces this header's list of comments with those in \p comments. + /// + /// \returns reference to this object + /// + BamHeader& Comments(const std::vector& comments); + + /// \} + +private: + PBBAM_SHARED_PTR d_; +}; + +} // namespace BAM +} // namespace PacBio + +#include "pbbam/internal/BamHeader.inl" + +#endif // BAMHEADER_H diff --git a/include/pbbam/BamReader.h b/include/pbbam/BamReader.h new file mode 100644 index 0000000..774a2ec --- /dev/null +++ b/include/pbbam/BamReader.h @@ -0,0 +1,191 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file BamReader.h +/// \brief Defines the BamReader class. +// +// Author: Derek Barnett + +#ifndef BAMREADER_H +#define BAMREADER_H + +#include "pbbam/BamFile.h" +#include "pbbam/BamHeader.h" +#include "pbbam/BamRecord.h" +#include "pbbam/Config.h" +#include "pbbam/GenomicInterval.h" + +#include +#include +#include + +namespace PacBio { +namespace BAM { + +namespace internal { struct BamReaderPrivate; } + +/// \brief The BamReader class provides basic read-access to a %BAM file. +/// +/// The base-class implementation provides a sequential read-through of BAM +/// records. 
Derived classes may implement other access schemes (e.g. genomic +/// region, PBI-enabled record filtering). +/// +class PBBAM_EXPORT BamReader +{ +public: + /// \name Constructors & Related Methods + /// \{ + + /// \brief Opens BAM file for reading. + /// + /// \param[in] fn %BAM filename + /// \throws std::runtime_error if failed to open + /// + explicit BamReader(const std::string& fn); + + /// \brief Opens BAM file for reading. + /// + /// \param[in] bamFile BamFile object + /// \throws std::runtime_error if failed to open + /// + explicit BamReader(const BamFile& bamFile); + + /// \brief Opens BAM file for reading. + /// + /// \param[in] bamFile BamFile object + /// \throws std::runtime_error if failed to open + /// + explicit BamReader(BamFile&& bamFile); + + virtual ~BamReader(void); + + /// \} + +public: + /// \name BAM File Attributes + /// \{ + + /// \returns the underlying BamFile + const BamFile& File(void) const; + + /// \returns %BAM filename + std::string Filename(void) const; + + /// \returns BamHeader object from %BAM header contents + const BamHeader& Header(void) const; + + /// \} + +public: + /// \name BAM File I/O + /// \{ + + /// \brief Fetches the "next" %BAM record. + /// + /// Default implementation will read records until EOF. Derived readers may + /// use additional criteria to decide which record is "next" and when + /// reading is done. + /// + /// \param[out] record next BamRecord object. Should not be used if method + /// returns false. + /// + /// \returns true if record was read successfully. Returns false if EOF (or + /// end of iterator in derived readers). False is not an error, + /// it indicates "end of data". + /// + /// \throws std::runtime_error if failed to read from file (e.g. possible + /// truncated or corrupted file). + /// + bool GetNext(BamRecord& record); + + /// \brief Seeks to virtual offset in %BAM. + /// + /// \note This is \b NOT a normal file offset, but the virtual offset used + /// in %BAM indexing. 
+ /// + /// \throws std::runtime_error if failed to seek + /// + void VirtualSeek(int64_t virtualOffset); + + /// \returns current (virtual) file position. + /// + /// \note This is \b NOT a normal file offset, but the virtual offset used + /// in %BAM indexing. + /// + int64_t VirtualTell(void) const; + + /// \} + +protected: + /// \name BAM File I/O + /// \{ + + /// \brief Helper method for access to underlying BGZF stream pointer. + /// + /// Useful for derived readers' contact points with htslib methods. + /// + /// \returns BGZF stream pointer + /// + BGZF* Bgzf(void) const; + + /// \brief Performs the actual raw read of the next record from the BAM + /// file. + /// + /// Default implementation will read records, sequentially, until EOF. + /// Derived readers may use additional criteria to decide which record is + /// "next" and when reading is done. + /// + /// Return value should be equivalent to htslib's bam_read1(): + /// >= 0 : normal + /// -1 : EOF (not an error) + /// < -1 : error + /// + /// \param[in] bgzf BGZF stream pointer + /// \param[out] b %BAM record pointer + /// \returns integer status code, see description + /// + virtual int ReadRawData(BGZF* bgzf, bam1_t* b); + + /// \} + +private: + std::unique_ptr d_; +}; + +} // namespace BAM +} // namespace PacBio + +#endif // BAMREADER_H diff --git a/include/pbbam/BamRecord.h b/include/pbbam/BamRecord.h new file mode 100644 index 0000000..a642a9f --- /dev/null +++ b/include/pbbam/BamRecord.h @@ -0,0 +1,1284 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file BamRecord.h +/// \brief Defines the BamRecord class. 
+// +// Author: Derek Barnett + +#ifndef BAMRECORD_H +#define BAMRECORD_H + +#include "pbbam/Accuracy.h" +#include "pbbam/Frames.h" +#include "pbbam/BamRecordImpl.h" +#include "pbbam/BamHeader.h" +#include "pbbam/ClipType.h" +#include "pbbam/FrameEncodingType.h" +#include "pbbam/LocalContextFlags.h" +#include "pbbam/Orientation.h" +#include "pbbam/PulseBehavior.h" +#include "pbbam/ReadGroupInfo.h" +#include "pbbam/RecordType.h" +#include "pbbam/Strand.h" +#include "pbbam/QualityValues.h" +#include "pbbam/virtual/VirtualRegionType.h" +#include "pbbam/ZmwType.h" +#include +#include +#include +#include + +namespace PacBio { +namespace BAM { + +namespace internal { + +class BamRecordMemory; +class Pulse2BaseCache; + +} // namespace internal + +/// \brief The BamRecord class represents a %PacBio %BAM record. +/// +/// %PacBio %BAM records are extensions of normal SAM/BAM records. Thus in +/// addition to normal fields like bases, qualities, mapping coordinates, etc., +/// tags are used extensively to annotate records with additional +/// PacBio-specific data. +/// +/// Mapping and clipping APIs are provided as well to ensure that such +/// operations "trickle down" to all data fields properly. +/// +/// \sa https://samtools.github.io/hts-specs/SAMv1.pdf +/// for more information on standard %BAM data, and +/// https://github.com/PacificBiosciences/PacBioFileFormats/blob/3.0/BAM.rst +/// for more information on %PacBio %BAM fields. 
+/// +class PBBAM_EXPORT BamRecord +{ +public: + /// \name Constructors & Related Methods + /// \{ + + BamRecord(void); + BamRecord(const BamHeader& header); + BamRecord(const BamRecordImpl& impl); + BamRecord(BamRecordImpl&& impl); + BamRecord(const BamRecord& other); + BamRecord(BamRecord&& other); + BamRecord& operator=(const BamRecord& other); + BamRecord& operator=(BamRecord&& other); + virtual ~BamRecord(void); + + /// \} + +public: + /// \name General Data + /// \{ + + /// \returns this record's full name + /// \sa BamRecordImpl::Name + /// + std::string FullName(void) const; + + /// \returns shared pointer to this record's associated BamHeader + BamHeader Header(void) const; + + /// \returns ZMW hole number + /// \throws if missing zm tag & record name does not contain hole number + /// + int32_t HoleNumber(void) const; + + /// \returns this record's LocalContextFlags + PacBio::BAM::LocalContextFlags LocalContextFlags(void) const; + + /// \returns this record's movie name + std::string MovieName(void) const; + + /// \returns "number of complete passes of the insert" + int32_t NumPasses(void) const; + + /// \returns the record's query end position, or Sequence().length() if not + /// stored + /// \note QueryEnd is in polymerase read coordinates, NOT genomic + /// coordinates. + /// + Position QueryEnd(void) const; + + /// \returns the record's query start position, or 0 if not stored + /// + /// \note QueryStart is in polymerase read coordinates, NOT genomic + /// coordinates. 
+ /// + Position QueryStart(void) const; + + /// \returns this record's expected read accuracy [0, 1000] + Accuracy ReadAccuracy(void) const; + + /// \returns ReadGroupInfo object for this record + ReadGroupInfo ReadGroup(void) const; + + /// \returns string ID of this record's read group + /// \sa ReadGroupInfo::Id + /// + std::string ReadGroupId(void) const; + + /// \returns integer value for this record's read group ID + int32_t ReadGroupNumericId(void) const; + + /// \returns this scrap record's scrap region type + VirtualRegionType ScrapRegionType(void) const; + + /// \returns this scrap record's scrap ZMW type + ZmwType ScrapZmwType(void) const; + + /// \returns this record's average signal-to-noise for each of A, C, G, + /// and T + /// + std::vector SignalToNoise(void) const; + + /// \returns this record's type + /// \sa RecordType + RecordType Type(void) const; + + /// \} + +public: + /// \name Mapping Data + /// \{ + + /// \returns the record's aligned end position + /// + /// \note AlignedEnd is in polymerase read coordinates, NOT genomic + /// coordinates. + /// + Position AlignedEnd(void) const; + + /// \returns the record's aligned start position + /// + /// \note AlignedStart is in polymerase read coordinates, NOT genomic + /// coordinates. + /// + Position AlignedStart(void) const; + + /// \returns the record's strand as a Strand enum value + Strand AlignedStrand(void) const; + + /// \returns the record's CIGAR data as a Cigar object + /// + /// \param[in] exciseAllClips if true, remove all clipping operations + /// (hard & soft) [default:false] + /// + Cigar CigarData(bool exciseAllClips = false) const; + + /// \returns true if this record was mapped by aligner + bool IsMapped(void) const; + + /// \returns this record's mapping quality. 
A value of 255 indicates + /// "unknown" + /// + uint8_t MapQuality(void) const; + + /// \returns the number of deleted bases (relative to reference) + size_t NumDeletedBases(void) const; + + /// \returns the number of inserted bases (relative to reference) + size_t NumInsertedBases(void) const; + + /// \returns the number of matching bases (sum of '=' CIGAR op lengths) + size_t NumMatches(void) const; + + /// \returns a tuple containing NumMatches (first) and NumMismatches + /// (second) + /// + std::pair NumMatchesAndMismatches(void) const; + + /// \returns the number of mismatching bases (sum of 'X' CIGAR op lengths) + size_t NumMismatches(void) const; + + /// \returns this record's reference ID, or -1 if unmapped. + /// + /// \note This is only a valid identifier within this %BAM file + /// + int32_t ReferenceId(void) const; + + /// \returns this record's reference name. + /// + /// \throws an exception if unmapped record. + /// + std::string ReferenceName(void) const; + + /// \returns the record's reference end position, or UnmappedPosition if + /// unmapped + /// + /// \note ReferenceEnd is in reference coordinates, NOT polymerase read + /// coordinates. + /// + Position ReferenceEnd(void) const; + + /// \returns the record's reference start position, or UnmappedPosition if + /// unmapped + /// + /// \note ReferenceStart is in reference coordinates, NOT polymerase read + /// coordinates. + /// + Position ReferenceStart(void) const; + + /// \} + +public: + /// \name Barcode Data + /// \{ + + /// \returns forward barcode id + /// + /// \throws std::runtime_error if barcode data is absent or malformed. + /// \sa HasBarcodes + /// + int16_t BarcodeForward(void) const; + + /// \returns barcode call confidence (Phred-scaled posterior probability + /// of correct barcode call) + /// + /// \sa HasBarcodeQuality + /// + uint8_t BarcodeQuality(void) const; + + /// \returns reverse barcode id + /// + /// \throws std::runtime_error if barcode data is absent or malformed. 
+ /// \sa HasBarcodes + /// + int16_t BarcodeReverse(void) const; + + /// \returns the forward and reverse barcode ids + /// + /// \throws std::runtime_error if barcode data is absent or malformed. + /// \sa HasBarcodes + /// + std::pair Barcodes(void) const; + + /// \} + +public: + /// \name Auxiliary Data Queries + /// \{ + + /// \returns true if this record has AltLabelQV data + bool HasAltLabelQV(void) const; + + /// \returns true if this record has AltLabelTag data + bool HasAltLabelTag(void) const; + + /// \returns true if this record has Barcode data + bool HasBarcodes(void) const; + + /// \returns true if this record has BarcodeQuality data + bool HasBarcodeQuality(void) const; + + /// \returns true if this record has DeletionQV data + bool HasDeletionQV(void) const; + + /// \returns true if this record has DeletionTag data + bool HasDeletionTag(void) const; + + /// \returns true if this record has a HoleNumber + bool HasHoleNumber(void) const; + + /// \returns true if this record has InsertionQV data + bool HasInsertionQV(void) const; + + /// \returns true if this record has IPD data + bool HasIPD(void) const; + + /// \returns true if this record has LabelQV data + bool HasLabelQV(void) const; + + /// \returns true if this record has LocalContextFlags (absent in CCS) + bool HasLocalContextFlags(void) const; + + /// \returns true if this record has MergeQV data + bool HasMergeQV(void) const; + + /// \returns true if this record has NumPasses data + bool HasNumPasses(void) const; + + /// \returns true if this record has Pkmean data + bool HasPkmean(void) const; + + /// \returns true if this record has Pkmid data + bool HasPkmid(void) const; + + /// \returns true if this record has Pkmean2 data + bool HasPkmean2(void) const; + + /// \returns true if this record has Pkmid2 data + bool HasPkmid2(void) const; + + /// \returns true if this record has PreBaseFrames aka IPD data + bool HasPreBaseFrames(void) const; + + /// \returns true if this record has
PrePulseFrames data + bool HasPrePulseFrames(void) const; + + /// \returns true if this record has PulseCall data + bool HasPulseCall(void) const; + + /// \returns true if this record has PulseCallWidth data + bool HasPulseCallWidth(void) const; + + /// \returns true if this record has PulseMergeQV data + bool HasPulseMergeQV(void) const; + + /// \returns true if this record has PulseWidth data + bool HasPulseWidth(void) const; + + /// \returns true if this record has ReadAccuracyTag data + bool HasReadAccuracy(void) const; + + /// \returns true if this record has QueryEnd data + bool HasQueryEnd(void) const; + + /// \returns true if this record has QueryStart data + bool HasQueryStart(void) const; + + /// \returns true if this record has ScrapRegionType data (only in SCRAP) + bool HasScrapRegionType(void) const; + + /// \returns true if this record has scrap ZMW type data (only in SCRAP) + bool HasScrapZmwType(void) const; + + /// \returns true if this record has signal-to-noise data (absent in + /// POLYMERASE) + /// + bool HasSignalToNoise(void) const; + + /// \returns true if this record has StartFrame data + bool HasStartFrame(void) const; + + /// \returns true if this record has SubstitutionQV data + bool HasSubstitutionQV(void) const; + + /// \returns true if this record has SubstitutionTag data + bool HasSubstitutionTag(void) const; + + /// \} + +public: + /// \name Sequence & Tag Data + /// \{ + + /// \brief Fetches this record's AltLabelTag values ("pt" tag). + /// + /// \note If \p aligned is true, and gaps/padding need to be inserted, the + /// new gap chars will be '-' and padding chars will be '*'. + /// + /// \param[in] orientation Orientation of output. 
+ /// + /// \returns AltLabelTags string + /// + std::string AltLabelTag(Orientation orientation = Orientation::NATIVE, + bool aligned = false, + bool exciseSoftClips = false, + PulseBehavior pulseBehavior = PulseBehavior::ALL) const; + + /// \brief Fetches this record's DeletionTag values ("dt" tag). + /// + /// \note If \p aligned is true, and gaps/padding need to be inserted, the + /// new gap chars will be '-' and padding chars will be '*'. + /// + /// \param[in] orientation Orientation of output. + /// \param[in] aligned if true, gaps/padding will be inserted, per + /// Cigar info. + /// \param[in] exciseSoftClips if true, any soft-clipped positions will be + /// removed from query ends + /// + /// \returns DeletionTag string + /// + std::string DeletionTag(Orientation orientation = Orientation::NATIVE, + bool aligned = false, + bool exciseSoftClips = false) const; + + /// \brief Fetches this record's DNA sequence (SEQ field). + /// + /// \note If \p aligned is true, and gaps/padding need to be inserted, the + /// new gap chars will be '-' and padding chars will be '*'. + /// + /// \param[in] orientation Orientation of output. + /// \param[in] aligned if true, gaps/padding will be inserted, per + /// Cigar info. + /// \param[in] exciseSoftClips if true, any soft-clipped positions will be + /// removed from query ends + /// + /// \returns sequence string + /// + std::string Sequence(const Orientation orientation = Orientation::NATIVE, + bool aligned = false, + bool exciseSoftClips = false) const; + + /// \brief Fetches this record's SubstitutionTag values ("st" tag). + /// + /// \note If \p aligned is true, and gaps/padding need to be inserted, the + /// new gap chars will be '-' and padding chars will be '*'. + /// + /// \param[in] orientation Orientation of output. + /// \param[in] aligned if true, gaps/padding will be inserted, per + /// Cigar info. 
+ /// \param[in] exciseSoftClips if true, any soft-clipped positions will be + /// removed from query ends + /// + /// \returns SubstitutionTags string + /// + std::string SubstitutionTag(Orientation orientation = Orientation::NATIVE, + bool aligned = false, + bool exciseSoftClips = false) const; + + /// \} + +public: + /// \name Quality Data + /// \{ + + /// \brief Fetches this record's AltLabelQV values ("pv" tag). + /// + /// \note If \p aligned is true, and gaps/padding need to be inserted, the + /// new QVs will have a value of 0. + /// + /// \param[in] orientation Orientation of output. + /// + /// \returns AltLabelQV as QualityValues object + /// + QualityValues AltLabelQV(Orientation orientation = Orientation::NATIVE, + bool aligned = false, + bool exciseSoftClips = false, + PulseBehavior pulseBehavior = PulseBehavior::ALL) const; + + /// \brief Fetches this record's DeletionQV values ("dq" tag). + /// + /// \note If \p aligned is true, and gaps/padding need to be inserted, the + /// new QVs will have a value of 0. + /// + /// \param[in] orientation Orientation of output. + /// \param[in] aligned if true, gaps/padding will be inserted, per + /// Cigar info. + /// \param[in] exciseSoftClips if true, any soft-clipped positions will be + /// removed from query ends + /// + /// \returns DeletionQV as QualityValues object + /// + QualityValues DeletionQV(Orientation orientation = Orientation::NATIVE, + bool aligned = false, + bool exciseSoftClips = false) const; + + /// \brief Fetches this record's InsertionQV values ("iq" tag). + /// + /// \note If \p aligned is true, and gaps/padding need to be inserted, the + /// new QVs will have a value of 0. + /// + /// \param[in] orientation Orientation of output. + /// \param[in] aligned if true, gaps/padding will be inserted, per + /// Cigar info. 
+ /// \param[in] exciseSoftClips if true, any soft-clipped positions will be + /// removed from query ends + /// + /// \returns InsertionQVs as QualityValues object + /// + QualityValues InsertionQV(Orientation orientation = Orientation::NATIVE, + bool aligned = false, + bool exciseSoftClips = false) const; + + /// \brief Fetches this record's LabelQV values ("pq" tag). + /// + /// \note If \p aligned is true, and gaps/padding need to be inserted, the + /// new QVs will have a value of 0. + /// + /// \param[in] orientation Orientation of output. + /// + /// \returns LabelQV as QualityValues object + /// + QualityValues LabelQV(Orientation orientation = Orientation::NATIVE, + bool aligned = false, + bool exciseSoftClips = false, + PulseBehavior pulseBehavior = PulseBehavior::ALL) const; + + /// \brief Fetches this record's MergeQV values ("mq" tag). + /// + /// \note If \p aligned is true, and gaps/padding need to be inserted, the + /// new QVs will have a value of 0. + /// + /// \param[in] orientation Orientation of output. + /// \param[in] aligned if true, gaps/padding will be inserted, per + /// Cigar info. + /// \param[in] exciseSoftClips if true, any soft-clipped positions will be + /// removed from query ends + /// + /// \returns MergeQV as QualityValues object + /// + QualityValues MergeQV(Orientation orientation = Orientation::NATIVE, + bool aligned = false, + bool exciseSoftClips = false) const; + + /// \brief Fetches this record's %BAM quality values (QUAL field). + /// + /// \note If \p aligned is true, and gaps/padding need to be inserted, the + /// new QVs will have a value of 0. + /// + /// \param[in] orientation Orientation of output. + /// \param[in] aligned if true, gaps/padding will be inserted, per + /// Cigar info. 
+ /// \param[in] exciseSoftClips if true, any soft-clipped positions will be + /// removed from query ends + /// + /// \returns %BAM qualities as QualityValues object + /// + QualityValues Qualities(Orientation orientation = Orientation::NATIVE, + bool aligned = false, + bool exciseSoftClips = false) const; + + /// \brief Fetches this record's SubstitutionQV values ("sq" tag). + /// + /// \note If \p aligned is true, and gaps/padding need to be inserted, the + /// new QVs will have a value of 0. + /// + /// \param[in] orientation Orientation of output. + /// \param[in] aligned if true, gaps/padding will be inserted, per + /// Cigar info. + /// \param[in] exciseSoftClips if true, any soft-clipped positions will be + /// removed from query ends + /// + /// \returns SubstitutionQV as QualityValues object + /// + QualityValues SubstitutionQV(Orientation orientation = Orientation::NATIVE, + bool aligned = false, + bool exciseSoftClips = false) const; + + /// \} + +public: + /// \name Pulse Data + /// \{ + + /// \brief Fetches this record's IPD values ("ip" tag). + /// + /// \note If \p aligned is true, and gaps/padding need to be inserted, the + /// new frames will have a value of 0; + /// + /// \param[in] orientation Orientation of output. + /// \param[in] aligned if true, gaps/padding will be inserted, per + /// Cigar info. + /// \param[in] exciseSoftClips if true, any soft-clipped positions will be + /// removed from query ends + /// + /// \returns IPD as Frames object + /// + Frames IPD(Orientation orientation = Orientation::NATIVE, + bool aligned = false, + bool exciseSoftClips = false) const; + + /// \brief Fetches this record's IPD values ("ip" tag), but does not upscale. + /// + /// \param[in] orientation Orientation of output. + /// \returns IPD as Frames object + /// + Frames IPDRaw(Orientation orientation = Orientation::NATIVE) const; + + /// \brief Fetches this record's Pkmean values ("pa" tag). + /// + /// \param[in] orientation Orientation of output. 
+ /// \returns Pkmean as vector object + /// + std::vector Pkmean(Orientation orientation = Orientation::NATIVE, + bool aligned = false, + bool exciseSoftClips = false, + PulseBehavior pulseBehavior = PulseBehavior::ALL) const; + + /// \brief Fetches this record's Pkmid values ("pm" tag). + /// + /// \param[in] orientation Orientation of output. + /// \returns Pkmid as vector object + /// + std::vector Pkmid(Orientation orientation = Orientation::NATIVE, + bool aligned = false, + bool exciseSoftClips = false, + PulseBehavior pulseBehavior = PulseBehavior::ALL) const; + + /// \brief Fetches this record's Pkmean2 values ("pi" tag). + /// + /// \param[in] orientation Orientation of output. + /// \returns Pkmean2 as vector object + /// + std::vector Pkmean2(Orientation orientation = Orientation::NATIVE, + bool aligned = false, + bool exciseSoftClips = false, + PulseBehavior pulseBehavior = PulseBehavior::ALL) const; + + /// \brief Fetches this record's Pkmid2 values ("ps" tag). + /// + /// \param[in] orientation Orientation of output. + /// \returns Pkmid2 as vector object + /// + std::vector Pkmid2(Orientation orientation = Orientation::NATIVE, + bool aligned = false, + bool exciseSoftClips = false, + PulseBehavior pulseBehavior = PulseBehavior::ALL) const; + + /// \brief Fetches this record's PreBaseFrames aka IPD values ("ip" tag). + /// + /// \note If \p aligned is true, and gaps/padding need to be inserted, the + /// new frames will have a value of 0; + /// + /// \param[in] orientation Orientation of output. + /// \param[in] aligned if true, gaps/padding will be inserted, per + /// Cigar info. + /// \param[in] exciseSoftClips if true, any soft-clipped positions will be + /// removed from query ends + /// + /// \returns IPD as Frames object + /// + Frames PreBaseFrames(Orientation orientation = Orientation::NATIVE, + bool aligned = false, + bool exciseSoftClips = false) const; + + /// \brief Fetches this record's PrePulseFrames values ("pd" tag).
+ /// + /// \param[in] orientation Orientation of output. + /// \returns PrePulseFrames as Frames object + /// + Frames PrePulseFrames(Orientation orientation = Orientation::NATIVE, + bool aligned = false, + bool exciseSoftClips = false, + PulseBehavior pulseBehavior = PulseBehavior::ALL) const; + + /// \brief Fetches this record's PulseCall values ("pc" tag). + /// + /// \param[in] orientation Orientation of output. + /// \returns PulseCalls string + /// + std::string PulseCall(Orientation orientation = Orientation::NATIVE, + bool aligned = false, + bool exciseSoftClips = false, + PulseBehavior pulseBehavior = PulseBehavior::ALL) const; + + /// \brief Fetches this record's PulseCallWidth values ("px" tag). + /// + /// \param[in] orientation Orientation of output. + /// \returns PulseCallWidth as Frames object + /// + Frames PulseCallWidth(Orientation orientation = Orientation::NATIVE, + bool aligned = false, + bool exciseSoftClips = false, + PulseBehavior pulseBehavior = PulseBehavior::ALL) const; + + /// \brief Fetch this record's PulseMergeQV values ("pg" tag). + /// + /// \param[in] orientation Orientation of output. + /// \returns PulseMergeQV as QualityValues object + /// + QualityValues PulseMergeQV(Orientation orientation = Orientation::NATIVE, + bool aligned = false, + bool exciseSoftClips = false, + PulseBehavior pulseBehavior = PulseBehavior::ALL) const; + + /// \brief Fetches this record's PulseWidth values ("pw" tag). + /// + /// \note If \p aligned is true, and gaps/padding need to be inserted, the + /// new frames will have a value of 0. + /// + /// \param[in] orientation Orientation of output. + /// \param[in] aligned if true, gaps/padding will be inserted, per + /// Cigar info. 
+ /// \param[in] exciseSoftClips if true, any soft-clipped positions will be + /// removed from query ends + /// + /// \returns PulseWidths as Frames object + /// + Frames PulseWidth(Orientation orientation = Orientation::NATIVE, + bool aligned = false, + bool exciseSoftClips = false) const; + + /// \brief Fetches this record's PulseWidth values ("pw" tag), but does not + /// upscale. + /// + /// \param[in] orientation Orientation of output. + /// \returns PulseWidth as Frames object + /// + Frames PulseWidthRaw(Orientation orientation = Orientation::NATIVE, + bool aligned = false, + bool exciseSoftClips = false) const; + + /// \brief Fetches this record's StartFrame values ("sf" tag). + /// + /// \param[in] orientation Orientation of output + /// + /// \returns StartFrame as uint32_t vector + /// + std::vector StartFrame(Orientation orientation = Orientation::NATIVE, + bool aligned = false, + bool exciseSoftClips = false, + PulseBehavior pulseBehavior = PulseBehavior::ALL) const; + + /// \} + +public: + /// \name Low-Level Access & Operations + /// \{ + + /// \warning This method should be considered temporary and avoided as much + /// as possible. Direct access to the internal object is likely to + /// disappear as BamRecord interface matures. + /// + /// \returns const reference to underlying BamRecordImpl object + /// + const BamRecordImpl& Impl(void) const; + + /// \warning This method should be considered temporary and avoided as much + /// as possible. Direct access to the internal object is likely to + /// disappear as BamRecord interface matures. + /// + /// \returns reference to underlying BamRecordImpl object + /// + BamRecordImpl& Impl(void); + + /// \} + +public: + /// \name General Data + /// \{ + + /// \brief Sets this record's ZMW hole number. 
+ /// + /// \param[in] holeNumber + /// \returns reference to this record + /// + BamRecord& HoleNumber(const int32_t holeNumber); + + /// \brief Sets this record's local context flags + /// + /// \param[in] flags + /// \returns reference to this record + /// + BamRecord& LocalContextFlags(const PacBio::BAM::LocalContextFlags flags); + + /// \brief Sets this record's "number of complete passes of the insert". + /// + /// \param[in] numPasses + /// \returns reference to this record + /// + BamRecord& NumPasses(const int32_t numPasses); + + /// \brief Sets this record's query end position. + /// + /// \note Changing this will modify the name of non-CCS records. + /// + /// \param[in] pos + /// \returns reference to this record + /// + BamRecord& QueryEnd(const PacBio::BAM::Position pos); + + /// \brief Sets this record's query start position. + /// + /// \note Changing this will modify the name of non-CCS records. + /// + /// \param[in] pos + /// \returns reference to this record + /// + BamRecord& QueryStart(const PacBio::BAM::Position pos); + + /// \brief Sets this record's expected read accuracy [0, 1000] + /// + /// \param[in] accuracy + /// \returns reference to this record + /// + BamRecord& ReadAccuracy(const Accuracy& accuracy); + + /// \brief Attaches this record to the provided read group, changing the + /// record name & 'RG' tag. + /// + /// \param[in] rg + /// \returns reference to this record + /// + BamRecord& ReadGroup(const ReadGroupInfo& rg); + + /// \brief Attaches this record to the provided read group, changing the + /// record name & 'RG' tag. 
+ /// + /// \param[in] id + /// \returns reference to this record + /// + BamRecord& ReadGroupId(const std::string& id); + + /// \brief Sets this scrap record's ScrapRegionType + /// + /// \param[in] type + /// \returns reference to this record + /// + BamRecord& ScrapRegionType(const VirtualRegionType type); + + /// \brief Sets this scrap record's ScrapRegionType + /// + /// \param[in] type character equivalent of VirtualRegionType + /// \returns reference to this record + /// + BamRecord& ScrapRegionType(const char type); + + /// \brief Sets this scrap record's ScrapZmwType + /// + /// \param[in] type + /// \returns reference to this record + /// + BamRecord& ScrapZmwType(const ZmwType type); + + /// \brief Sets this scrap record's ScrapZmwType + /// + /// \param[in] type character equivalent of ZmwType + /// \returns reference to this record + /// + BamRecord& ScrapZmwType(const char type); + + /// \brief Sets this record's average signal-to-noise in each of A, C, G, + /// and T + /// + /// \param[in] snr average signal-to-noise of A, C, G, and T (in this order) + /// \returns reference to this record + /// + BamRecord& SignalToNoise(const std::vector& snr); + + /// \} + +public: + /// \name Barcode Data + /// \{ + + /// \brief Sets this record's barcode IDs ('bc' tag) + /// + /// \param[in] barcodeIds + /// \returns reference to this record + /// + BamRecord& Barcodes(const std::pair& barcodeIds); + + /// \brief Sets this record's barcode quality ('bq' tag) + /// + /// \param[in] quality Phred-scaled confidence call + /// \returns reference to this record + /// + BamRecord& BarcodeQuality(const uint8_t quality); + + /// \} + +public: + /// \name Sequence & Tag Data + /// \{ + + /// \brief Sets this record's AltLabelTag values ("pt" tag). + /// + /// \param[in] tags + /// \returns reference to this record + /// + BamRecord& AltLabelTag(const std::string& tags); + + /// \brief Sets this record's DeletionTag values ("dt" tag).
+ /// + /// \param[in] tags + /// \returns reference to this record + /// + BamRecord& DeletionTag(const std::string& tags); + + /// \brief Sets this record's SubstitutionTag values ("st" tag). + /// + /// \param[in] tags + /// \returns reference to this record + /// + BamRecord& SubstitutionTag(const std::string& tags); + + /// \} + +public: + /// \name Quality Data + /// \{ + + /// \brief Sets this record's AltLabelQV values ("pv" tag). + /// + /// \param[in] altLabelQVs + /// \returns reference to this record + /// + BamRecord& AltLabelQV(const QualityValues& altLabelQVs); + + /// \brief Sets this record's DeletionQV values ("dq" tag). + /// + /// \param[in] deletionQVs + /// \returns reference to this record + /// + BamRecord& DeletionQV(const QualityValues& deletionQVs); + + /// \brief Sets this record's InsertionQV values ("iq" tag). + /// + /// \param[in] insertionQVs + /// \returns reference to this record + /// + BamRecord& InsertionQV(const QualityValues& insertionQVs); + + /// \brief Sets this record's LabelQV values ("pq" tag). + /// + /// \param[in] labelQVs + /// \returns reference to this record + /// + BamRecord& LabelQV(const QualityValues& labelQVs); + + /// \brief Sets this record's MergeQV values ("mq" tag). + /// + /// \param[in] mergeQVs + /// \returns reference to this record + /// + BamRecord& MergeQV(const QualityValues& mergeQVs); + + /// \brief Sets this record's SubstitutionQV values ("sq" tag). + /// + /// \param[in] substitutionQVs + /// \returns reference to this record + /// + BamRecord& SubstitutionQV(const QualityValues& substitutionQVs); + + /// \} + +public: + /// \name Pulse Data + /// \{ + + /// \brief Sets this record's IPD values ("ip" tag). 
+ /// + /// \param[in] frames + /// \param[in] encoding specify how to encode the data (8-bit lossy, or + /// 16-bit lossless) + /// \returns reference to this record + /// + BamRecord& IPD(const Frames& frames, + const FrameEncodingType encoding); + + /// \brief Sets this record's Pkmean values ("pm" tag). + /// + /// \param[in] photons + /// \returns reference to this record + /// + BamRecord& Pkmean(const std::vector& photons); + + /// \brief Sets this record's Pkmean values ("pm" tag). + /// + /// \param[in] encodedPhotons + /// \returns reference to this record + /// + BamRecord& Pkmean(const std::vector& encodedPhotons); + + /// \brief Sets this record's Pkmid values ("pa" tag). + /// + /// \param[in] photons + /// \returns reference to this record + /// + BamRecord& Pkmid(const std::vector& photons); + + /// \brief Sets this record's Pkmid values ("pa" tag). + /// + /// \param[in] encodedPhotons + /// \returns reference to this record + /// + BamRecord& Pkmid(const std::vector& encodedPhotons); + + /// \brief Sets this record's Pkmean2 values ("ps" tag). + /// + /// \param[in] photons + /// \returns reference to this record + /// + BamRecord& Pkmean2(const std::vector& photons); + + /// \brief Sets this record's Pkmean2 values ("ps" tag). + /// + /// \param[in] encodedPhotons + /// \returns reference to this record + /// + BamRecord& Pkmean2(const std::vector& encodedPhotons); + + /// \brief Sets this record's Pkmid2 values ("pi" tag). + /// + /// \param[in] photons + /// \returns reference to this record + /// + BamRecord& Pkmid2(const std::vector& photons); + + /// \brief Sets this record's Pkmid2 values ("pi" tag). + /// + /// \param[in] encodedPhotons + /// \returns reference to this record + /// + BamRecord& Pkmid2(const std::vector& encodedPhotons); + + /// \brief Sets this record's PreBaseFrames aka IPD values ("ip" tag). 
+ /// + /// \param[in] frames + /// \param[in] encoding specify how to encode the data (8-bit lossy, or + /// 16-bit lossless) + /// \returns reference to this record + /// + BamRecord& PreBaseFrames(const Frames& frames, + const FrameEncodingType encoding); + + /// \brief Sets this record's PrePulseFrames values ("pd" tag). + /// + /// \param[in] frames + /// \param[in] encoding specify how to encode the data (8-bit lossy, or + /// 16-bit lossless) + /// \returns reference to this record + /// + BamRecord& PrePulseFrames(const Frames& frames, + const FrameEncodingType encoding); + + /// \brief Sets this record's PulseCall values ("pc" tag). + /// + /// \param[in] tags + /// \returns reference to this record + /// + BamRecord& PulseCall(const std::string& tags); + + /// \brief Sets this record's PulseCallWidth values ("px" tag). + /// + /// \param[in] frames + /// \param[in] encoding specify how to encode the data (8-bit lossy, or + /// 16-bit lossless) + /// \returns reference to this record + /// + BamRecord& PulseCallWidth(const Frames& frames, + const FrameEncodingType encoding); + + /// \brief Sets this record's PulseMergeQV values ("pg" tag). + /// + /// \param[in] pulseMergeQVs + /// \returns reference to this record + /// + BamRecord& PulseMergeQV(const QualityValues& pulseMergeQVs); + + /// \brief Sets this record's PulseWidth values ("pw" tag). + /// + /// \param[in] frames + /// \param[in] encoding specify how to encode the data (8-bit lossy, or + /// 16-bit lossless) + /// \returns reference to this record + /// + BamRecord& PulseWidth(const Frames& frames, + const FrameEncodingType encoding); + + /// \brief Sets this record's StartFrame values ("sf" tag). + /// + /// \param[in] startFrame + /// \returns reference to this record + /// + BamRecord& StartFrame(const std::vector& startFrame); + + /// \} + +public: + /// \name Low-Level Access & Operations + /// \{ + + /// \brief Resets cached aligned start/end. 
+ /// + /// \note This method should not be needed in most client code. It exists + /// primarily as a hook for internal reading loops (queries, index + /// build, etc.) It's essentially a workaround and will likely be + /// removed from the API. + /// + void ResetCachedPositions(void) const; + + /// \brief Resets cached aligned start/end. + /// + /// \note This method should not be needed in most client code. It exists + /// primarily as a hook for internal reading loops (queries, index + /// build, etc.) It's essentially a workaround and will likely be + /// removed from the API. + /// + void ResetCachedPositions(void); + + /// \brief Updates the record's name (BamRecord::FullName) to reflect + /// modifications to name components (movie name, ZMW hole number, + /// etc.) + /// + void UpdateName(void); + + /// \} + +public: + /// \name Pulse Data + /// \{ + + static const float photonFactor; + + static std::vector EncodePhotons(const std::vector& data); + + /// \} + +public: + /// \name Clipping & Mapping + /// \{ + + /// Creates a copied record from input, with clipping applied + static BamRecord Clipped(const BamRecord& input, + const ClipType clipType, + const PacBio::BAM::Position start, + const PacBio::BAM::Position end); + + /// Creates a copied record from input, with mapping applied + static BamRecord Mapped(const BamRecord& input, + const int32_t referenceId, + const Position refStart, + const Strand strand, + const Cigar& cigar, + const uint8_t mappingQuality); + + /// Applies clipping to this record + BamRecord& Clip(const ClipType clipType, + const PacBio::BAM::Position start, + const PacBio::BAM::Position end); + + /// Creates a copied record from this one, with clipping applied + BamRecord Clipped(const ClipType clipType, + const PacBio::BAM::Position start, + const PacBio::BAM::Position end) const; + + /// Applies mapping to this record + BamRecord& Map(const int32_t referenceId, + const Position refStart, + const Strand strand, + const Cigar& 
cigar, + const uint8_t mappingQuality); + + /// Creates a copied record from this one, with mapping applied + BamRecord Mapped(const int32_t referenceId, + const Position refStart, + const Strand strand, + const Cigar& cigar, + const uint8_t mappingQuality) const; + /// \} + +private: + BamRecordImpl impl_; + +public: + /// public & mutable so that queries can directly set the header info, + /// even on a record that is const from client code's perspective + mutable BamHeader header_; + +private: + /// \internal + /// cached positions (mutable to allow lazy-calc in const methods) + mutable Position alignedStart_; + mutable Position alignedEnd_; + +private: + /// \internal + /// pulse to bam mapping cache + mutable std::unique_ptr p2bCache_; + +private: + ///\internal + /// clipping methods + + void ClipFields(const size_t clipPos, const size_t clipLength); + BamRecord& ClipToQuery(const PacBio::BAM::Position start, + const PacBio::BAM::Position end); + BamRecord& ClipToReference(const PacBio::BAM::Position start, + const PacBio::BAM::Position end); + BamRecord& ClipToReferenceForward(const PacBio::BAM::Position start, + const PacBio::BAM::Position end); + BamRecord& ClipToReferenceReverse(const PacBio::BAM::Position start, + const PacBio::BAM::Position end); + +private: + ///\internal + /// raw tag data fetching + + // sequence tags + std::string FetchBasesRaw(const BamRecordTag tag) const; + std::string FetchBases(const BamRecordTag tag, + const Orientation orientation = Orientation::NATIVE, + const bool aligned = false, + const bool exciseSoftClips = false, + const PulseBehavior pulseBehavior = PulseBehavior::ALL) const; + + // frame tags + Frames FetchFramesRaw(const BamRecordTag tag) const; + Frames FetchFrames(const BamRecordTag tag, + const Orientation orientation = Orientation::NATIVE, + const bool aligned = false, + const bool exciseSoftClips = false, + const PulseBehavior pulseBehavior = PulseBehavior::ALL) const; + + // pulse tags + std::vector 
FetchPhotonsRaw(const BamRecordTag tag) const; + std::vector FetchPhotons(const BamRecordTag tag, + const Orientation orientation = Orientation::NATIVE, + const bool aligned = false, + const bool exciseSoftClips = false, + const PulseBehavior pulseBehavior = PulseBehavior::ALL) const; + + // QV tags + QualityValues FetchQualitiesRaw(const BamRecordTag tag) const; + QualityValues FetchQualities(const BamRecordTag tag, + const Orientation orientation = Orientation::NATIVE, + const bool aligned = false, + const bool exciseSoftClips = false, + const PulseBehavior pulseBehavior = PulseBehavior::ALL) const; + + // UInt tags (e.g. start frame) + std::vector FetchUIntsRaw(const BamRecordTag tag) const; + std::vector FetchUInts(const BamRecordTag tag, + const Orientation orientation = Orientation::NATIVE, + const bool aligned = false, + const bool exciseSoftClips = false, + const PulseBehavior pulseBehavior = PulseBehavior::ALL) const; + +private: + ///\internal + /// marked const to allow calling from const methods + /// but updates our mutable cached values + void CalculateAlignedPositions(void) const; + void CalculatePulse2BaseCache(void) const; + + friend class internal::BamRecordMemory; +}; + +} // namespace BAM +} // namespace PacBio + +#include "pbbam/internal/BamRecord.inl" + +#endif // BAMRECORD_H diff --git a/include/pbbam/BamRecordBuilder.h b/include/pbbam/BamRecordBuilder.h new file mode 100644 index 0000000..c6ff877 --- /dev/null +++ b/include/pbbam/BamRecordBuilder.h @@ -0,0 +1,316 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file BamRecordBuilder.h +/// \brief Defines the BamRecordBuilder class. +// +// Author: Derek Barnett + +#ifndef BAMRECORDBUILDER_H +#define BAMRECORDBUILDER_H + +#include "pbbam/BamRecord.h" +#include "pbbam/BamHeader.h" +#include "pbbam/Config.h" +#include + +namespace PacBio { +namespace BAM { + +/// \brief The BamRecordBuilder class provides a helper utility for building +/// BamRecords. +/// +/// This class provides a mechanism for building up %BAM data and +/// lazy-encoding/constructing the actual BamRecord. 
Currently, the methods here +/// really only support filling in the low-level SAM/BAM-style fields, not so +/// much the PacBio-specific fields. +/// +class PBBAM_EXPORT BamRecordBuilder +{ +public: + /// \name Constructors & Related Methods + /// \{ + + /// \brief Creates an empty %BAM record builder. + BamRecordBuilder(void); + + /// \brief Creates an empty %BAM record builder, with header info to apply + /// to built records. + /// + /// \param[in] header BamHeader object + /// + explicit BamRecordBuilder(const BamHeader& header); + + /// \brief Creates record builder with initial record data. + /// + /// \param[in] prototype data from this record will be used to seed the + /// builder + /// + BamRecordBuilder(const BamRecord& prototype); + + BamRecordBuilder(const BamRecordBuilder& other); + BamRecordBuilder(BamRecordBuilder&& other); + BamRecordBuilder& operator=(const BamRecordBuilder& other); + BamRecordBuilder& operator=(BamRecordBuilder&& other); + ~BamRecordBuilder(void); + + /// \} + +public: + /// \name Record-Building + /// \{ + + /// \brief Builds a BamRecord from current builder attributes. + /// + /// \returns newly-built BamRecord object + /// + BamRecord Build(void) const; + + /// \brief Replaces an existing BamRecord's data with current builder + /// attributes. + /// + /// \param[out] record resulting record + /// \returns true if successful + /// + bool BuildInPlace(BamRecord& record) const; + + /// \brief Resets builder attributes to default values. + /// + void Reset(void); + + /// \brief Resets builder attributes with \p prototype's data. + /// + /// \param[in] prototype + /// + void Reset(const BamRecord& prototype); + + /// \brief Resets builder attributes with \p prototype's data. + /// + /// \param[in] prototype + /// + void Reset(BamRecord&& prototype); + + /// \} + +public: + + /// \name Core Attribute Setup + /// \{ + + /// \brief Sets the record's (BAI) index bin ID. + /// + /// \param[in] bin BAI index bin ID.
+ /// \returns reference to this builder + /// + BamRecordBuilder& Bin(const uint32_t bin); + + /// \brief Sets this record's alignment flag, using a raw integer. + /// + /// \param[in] flag raw alignment flag + /// \returns reference to this record + /// + BamRecordBuilder& Flag(const uint32_t flag); + + /// \brief Sets this record's insert size. + /// + /// \param[in] iSize insert size + /// \returns reference to this record + /// + BamRecordBuilder& InsertSize(const int32_t iSize); + + /// \brief Sets this record's map quality. + /// + /// \param[in] mapQual mapping quality - value of 255 indicates "unknown" + /// \returns reference to this record + /// + BamRecordBuilder& MapQuality(const uint8_t mapQual); + + /// \brief Sets this record's mate's mapped position. + /// + /// \param[in] pos mapped position. A value of -1 indicates unmapped. + /// \returns reference to this record + /// + BamRecordBuilder& MatePosition(const int32_t pos); + + /// \brief Sets this record's mate's mapped reference ID + /// + /// \param[in] id reference ID. A value of -1 indicates unmapped. + /// \returns reference to this record + /// + BamRecordBuilder& MateReferenceId(const int32_t id); + + /// \brief Sets this record's mapped position. + /// + /// \param[in] pos mapped position. A value of -1 indicates unmapped. + /// \returns reference to this record + /// + BamRecordBuilder& Position(const int32_t pos); + + /// \brief Sets this record's mapped reference ID + /// + /// \param[in] id reference ID. A value of -1 indicates unmapped. + /// \returns reference to this record + /// + BamRecordBuilder& ReferenceId(const int32_t id); + + /// \} + +public: + /// \name Alignment Flag Setup + /// \{ + + /// \brief Sets whether this record is a PCR/optical duplicate + BamRecordBuilder& SetDuplicate(bool ok); + + /// \brief Sets whether this record failed quality controls + BamRecordBuilder& SetFailedQC(bool ok); + + /// \brief Sets whether this record is the first mate of a pair. 
+ BamRecordBuilder& SetFirstMate(bool ok); + + /// \brief Sets whether this record was aligned. + BamRecordBuilder& SetMapped(bool ok); + + /// \brief Sets whether this record's mate was aligned. + BamRecordBuilder& SetMateMapped(bool ok); + + /// \brief Sets whether this record's mate mapped to reverse strand. + BamRecordBuilder& SetMateReverseStrand(bool ok); + + /// \brief Sets whether this record came from paired-end sequencing. + BamRecordBuilder& SetPaired(bool ok); + + /// \brief Sets whether this record is a read's primary alignment. + BamRecordBuilder& SetPrimaryAlignment(bool ok); + + /// \brief Sets whether this record & its mate were properly mapped, per the + /// aligner. + /// + BamRecordBuilder& SetProperPair(bool ok); + + /// \brief Sets whether this record mapped to reverse strand. + BamRecordBuilder& SetReverseStrand(bool ok); + + /// \brief Sets whether this record is the second mate of a pair. + BamRecordBuilder& SetSecondMate(bool ok); + + /// \brief Sets whether this record is a supplementary alignment. + BamRecordBuilder& SetSupplementaryAlignment(bool ok); + + /// \} + +public: + /// \name Variable-Length Data Setup + /// \{ + + /// \brief Sets the record's CIGAR data. + /// + /// \returns reference to this builder + /// + BamRecordBuilder& Cigar(const PacBio::BAM::Cigar& cigar); + + /// \brief Sets the record's CIGAR data. + /// + /// \returns reference to this builder + /// + BamRecordBuilder& Cigar(PacBio::BAM::Cigar&& cigar); + + /// \brief Sets the record's name. + /// + /// \returns reference to this builder + /// + BamRecordBuilder& Name(const std::string& name); + + /// \brief Sets the record's name. + /// + /// \returns reference to this builder + /// + BamRecordBuilder& Name(std::string&& name); + + /// \brief Sets the record's qualities. + /// + /// \returns reference to this builder + /// + BamRecordBuilder& Qualities(const std::string& qualities); + + /// \brief Sets the record's qualities. 
+ /// + /// \returns reference to this builder + /// + BamRecordBuilder& Qualities(std::string&& qualities); + + /// \brief Sets the record's sequence. + /// + /// \returns reference to this builder + /// + BamRecordBuilder& Sequence(const std::string& sequence); + + /// \brief Sets the record's sequence. + /// + /// \returns reference to this builder + /// + BamRecordBuilder& Sequence(std::string&& sequence); + + /// \brief Sets the record's tags. + /// + /// \returns reference to this builder + /// + BamRecordBuilder& Tags(const TagCollection& tags); + + /// \brief Sets the record's tags. + /// + /// \returns reference to this builder + /// + BamRecordBuilder& Tags(TagCollection&& tags); + + /// \} + +private: + BamHeader header_; + bam1_core_t core_; + std::string name_; + std::string sequence_; + std::string qualities_; + PacBio::BAM::Cigar cigar_; + TagCollection tags_; +}; + +} // namespace BAM +} // namespace PacBio + +#include "pbbam/internal/BamRecordBuilder.inl" + +#endif // BAMRECORDBUILDER_H diff --git a/include/pbbam/BamRecordImpl.h b/include/pbbam/BamRecordImpl.h new file mode 100644 index 0000000..2c72367 --- /dev/null +++ b/include/pbbam/BamRecordImpl.h @@ -0,0 +1,634 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. 
+// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file BamRecordImpl.h +/// \brief Defines the BamRecordImpl class. +// +// Author: Derek Barnett + +#ifndef BAMRECORDIMPL_H +#define BAMRECORDIMPL_H + +#include "pbbam/BamRecordTag.h" +#include "pbbam/Cigar.h" +#include "pbbam/Config.h" +#include "pbbam/Position.h" +#include "pbbam/QualityValues.h" +#include "pbbam/TagCollection.h" +#include +#include +#include + +namespace PacBio { +namespace BAM { + +namespace internal { class BamRecordMemory; } + +/// \brief The BamRecordImpl class holds all data necessary for creating, +/// querying or editing a generic %BAM record. +/// +/// For PacBio-specific extensions and convenience methods, see BamRecord. +/// +/// \note This class is mostly an internal implementation detail and will +/// likely be removed from the public API in the future. Please use +/// BamRecord as much as possible. 
+/// +class PBBAM_EXPORT BamRecordImpl +{ +public: + + /// These flags describe the alignment status of the record. + enum AlignmentFlag + { + PAIRED = 0x0001 ///< Record comes from paired-end sequencing + , PROPER_PAIR = 0x0002 ///< Each mate of a pair was properly aligned ("proper" as determined by aligner) + , UNMAPPED = 0x0004 ///< Record was not mapped by aligner + , MATE_UNMAPPED = 0x0008 ///< Record's mate was not mapped by aligner + , REVERSE_STRAND = 0x0010 ///< Record was aligned to reverse strand (Sequence() is reverse-complemented) + , MATE_REVERSE_STRAND = 0x0020 ///< Record's mate was aligned to reverse strand (mate's Sequence() is reverse-complemented) + , MATE_1 = 0x0040 ///< Record is first mate of pair + , MATE_2 = 0x0080 ///< Record is second mate of pair + , SECONDARY = 0x0100 ///< Record is a secondary alignment + , FAILED_QC = 0x0200 ///< Record failed quality controls + , DUPLICATE = 0x0400 ///< Record is a PCR/optical duplicate + , SUPPLEMENTARY = 0x0800 ///< Record is a supplementary alignment + }; + +public: + /// \name Constructors & Related Methods + /// \{ + + BamRecordImpl(void); + BamRecordImpl(const BamRecordImpl& other); + BamRecordImpl(BamRecordImpl&& other); + BamRecordImpl& operator=(const BamRecordImpl& other); + BamRecordImpl& operator=(BamRecordImpl&& other); + virtual ~BamRecordImpl(void); + + /// \} + +public: + /// \name Core Data + /// \{ + + /// \returns this record's assigned (BAI) index bin ID. + uint32_t Bin(void) const; + + /// \returns this record's alignment flag, in raw integer form. + uint32_t Flag(void) const; + + /// \returns this record's insert size + int32_t InsertSize(void) const; + + /// \returns this record's mapping quality.
A value of 255 indicates "unknown" + uint8_t MapQuality(void) const; + + /// \returns this record's mate's mapped position, or -1 if unmapped + PacBio::BAM::Position MatePosition(void) const; + + /// \returns this record's mate's mapped reference ID, or -1 if unmapped + int32_t MateReferenceId(void) const; + + /// \returns this record's mapped position, or -1 if unmapped + PacBio::BAM::Position Position(void) const; + + /// \returns this record's mapped reference ID, or -1 if unmapped + int32_t ReferenceId(void) const; + + /// Sets the record's (BAI) index bin ID. + /// + /// \param[in] bin BAI index bin ID. + /// \returns reference to this record + /// + BamRecordImpl& Bin(uint32_t bin); + + /// Sets this record's alignment flag, using a raw integer. + /// + /// \param[in] flag raw alignment flag + /// \returns reference to this record + /// + BamRecordImpl& Flag(uint32_t flag); + + /// Sets this record's insert size. + /// + /// \param[in] iSize insert size + /// \returns reference to this record + /// + BamRecordImpl& InsertSize(int32_t iSize); + + /// Sets this record's map quality. + /// + /// \param[in] mapQual mapping quality - value of 255 indicates "unknown" + /// \returns reference to this record + /// + BamRecordImpl& MapQuality(uint8_t mapQual); + + /// Sets this record's mate's mapped position. + /// + /// \param[in] pos mapped position. A value of -1 indicates unmapped. + /// \returns reference to this record + /// + BamRecordImpl& MatePosition(PacBio::BAM::Position pos); + + /// Sets this record's mate's mapped reference ID + /// + /// \param[in] id reference ID. A value of -1 indicates unmapped. + /// \returns reference to this record + /// + BamRecordImpl& MateReferenceId(int32_t id); + + /// Sets this record's mapped position. + /// + /// \param[in] pos mapped position. A value of -1 indicates unmapped.
+ /// \returns reference to this record + /// + BamRecordImpl& Position(PacBio::BAM::Position pos); + + /// Sets this record's mapped reference ID + /// + /// \param[in] id reference ID. A value of -1 indicates unmapped. + /// \returns reference to this record + /// + BamRecordImpl& ReferenceId(int32_t id); + + /// \} + +public: + /// \name Alignment Flags + /// \{ + + /// \returns true if this record is a PCR/optical duplicate + bool IsDuplicate(void) const; + + /// \returns true if this record failed quality controls + bool IsFailedQC(void) const; + + /// \returns true if this record is the first mate of a pair + bool IsFirstMate(void) const; + + /// \returns true if this record was mapped by aligner + bool IsMapped(void) const; + + /// \returns true if this record's mate was mapped by aligner + bool IsMateMapped(void) const; + + /// \returns true if this record's mate was mapped to the reverse strand + bool IsMateReverseStrand(void) const; + + /// \returns true if this record comes from paired-end sequencing + bool IsPaired(void) const; + + /// \returns true if this record is a read's primary alignment + bool IsPrimaryAlignment(void) const; + + /// \returns true if this record & its mate were properly aligned + bool IsProperPair(void) const; + + /// \returns true if this record was mapped to the reverse strand + bool IsReverseStrand(void) const; + + /// \returns true if this record is the second mate of a pair + bool IsSecondMate(void) const; + + /// \returns true if this record is a supplementary alignment + bool IsSupplementaryAlignment(void) const; + + /// Sets whether this record is a PCR/optical duplicate + BamRecordImpl& SetDuplicate(bool ok); + + /// Sets whether this record failed quality controls + BamRecordImpl& SetFailedQC(bool ok); + + /// Sets whether this record is the first mate of a pair. + BamRecordImpl& SetFirstMate(bool ok); + + /// Sets whether this record was aligned.
+ BamRecordImpl& SetMapped(bool ok); + + /// Sets whether this record's mate was aligned. + BamRecordImpl& SetMateMapped(bool ok); + + /// Sets whether this record's mate mapped to reverse strand. + BamRecordImpl& SetMateReverseStrand(bool ok); + + /// Sets whether this record came from paired-end sequencing. + BamRecordImpl& SetPaired(bool ok); + + /// Sets whether this record is a read's primary alignment. + BamRecordImpl& SetPrimaryAlignment(bool ok); + + /// Sets whether this record & its mate were properly mapped, per the aligner. + BamRecordImpl& SetProperPair(bool ok); + + /// Sets whether this record mapped to reverse strand. + BamRecordImpl& SetReverseStrand(bool ok); + + /// Sets whether this record is the second mate of a pair. + BamRecordImpl& SetSecondMate(bool ok); + + /// Sets whether this record is a supplementary alignment. + BamRecordImpl& SetSupplementaryAlignment(bool ok); + + /// \} + +public: + /// \name Variable-length Data (sequence, qualities, etc.) + /// \{ + + /// \returns the record's CIGAR data as a Cigar object + Cigar CigarData(void) const; + + /// Sets the record's CIGAR data using a Cigar object + /// + /// \param[in] cigar PacBio::BAM::Cigar object + /// \returns reference to this record + /// + BamRecordImpl& CigarData(const Cigar& cigar); + + /// Sets the record's CIGAR data using a CIGAR-formatted string. + /// + /// \param[in] cigarString CIGAR-formatted string + /// \returns reference to this record + /// + BamRecordImpl& CigarData(const std::string& cigarString); + + // TODO: CIGAR iterator - Cigar only or here as well ?? + + /// \returns the record's query name + std::string Name(void) const; + + /// Sets the record's "query name". + /// + /// \param name new name + /// \returns reference to this record + /// + BamRecordImpl& Name(const std::string& name); + + /// \returns the record's quality values (phred-style ASCII) + /// + /// \note Usually Qualities().size() == Sequence().size().
However, in + /// some data sets, the quality values are not provided. In that + /// case, this method will return an empty container. + /// + QualityValues Qualities(void) const; + + /// \returns the record's DNA sequence. + std::string Sequence(void) const; + + size_t SequenceLength(void) const; + + /// \brief Sets the record's DNA sequence and quality values + /// + /// This is an overloaded function. Sets the DNA sequence and quality + /// values, using the length of \p sequence. + /// + /// \note When using this overload (and \p qualities is non-empty), the + /// lengths of \p sequence and \p qualities \b must be equal. + /// + /// \todo How to handle mismatched lengths? + /// + /// \param[in] sequence std::string containing DNA sequence + /// \param[in] qualities std::string containing ASCII quality values + /// + /// \returns reference to this record. + /// + /// \sa SetSequenceAndQualities(const char* sequence, + /// const size_t sequenceLength, const char* qualities) + /// + BamRecordImpl& SetSequenceAndQualities(const std::string& sequence, + const std::string& qualities = std::string()); + + /// \brief Sets the record's DNA sequence and quality values. + /// + /// The \p sequence must consist of IUPAC nucleotide codes {=ACMGRSVTWYHKDBN}. + /// The \p qualities, if not empty, must consist of 'phred'-style ASCII + /// quality values. \p qualities may be an empty string or NULL pointer in + /// cases where there are no such data available. + /// + /// \param[in] sequence C-string containing DNA sequence + /// \param[in] sequenceLength length of DNA sequence + /// \param[in] qualities C-string containing 'phred-style' ASCII + /// quality values + /// + /// \note \p sequence does \b NOT have to be NULL-terminated. Length is + /// explicitly determined by the value of \p sequenceLength provided. + /// + /// \returns reference to this record.
+ /// + BamRecordImpl& SetSequenceAndQualities(const char* sequence, + const size_t sequenceLength, + const char* qualities = 0); + + /// \brief Sets the record's DNA sequence and quality values. + /// + /// The \p encodedSequence should be preencoded/packed into the BAM binary + /// format. The \p qualities, if not empty, must consist of 'phred'-style + /// ASCII quality values. \p qualities may be an empty string or NULL + /// pointer in cases where there are no such data available. + /// + /// \param[in] encodedSequence C-string containing BAM-format-encoded + /// DNA sequence + /// \param[in] rawSequenceLength length of DNA sequence (not the encoded + /// length) + /// \param[in] qualities C-string containing 'phred-style' ASCII + /// quality values + /// + /// \note \p encodedSequence does \b NOT have to be NULL-terminated. Length + /// is explicitly determined by the value of \p rawSequenceLength + /// provided. + /// + /// \returns reference to this record. + /// + /// \sa SetSequenceAndQualities(const char* sequence, + /// const size_t sequenceLength, const char* qualities) + /// + BamRecordImpl& SetPreencodedSequenceAndQualities(const char* encodedSequence, + const size_t rawSequenceLength, + const char* qualities = 0); + + /// \} + +public: + /// \name Tag Data + /// \{ + + /// \returns record's full tag data as a TagCollection object + TagCollection Tags(void) const; + + /// \brief Sets the record's full tag data via a TagCollection object + /// + BamRecordImpl& Tags(const TagCollection& tags); + + /// \brief Adds a new tag to this record. + /// + /// \param[in] tagName 2-character tag name. + /// \param[in] value Tag object that describes the type & value of data + /// to be added + /// + /// \note Any value that can be used to implicitly construct a Tag is valid.
+ /// \code + /// string s; + /// vector<uint32_t> v; + /// record.AddTag("XX", s); // will add a string-type tag + /// record.AddTag("YY", v); // will add a uint32-array-type tag + /// \endcode + /// + /// \returns true if tag was successfully added. + /// + bool AddTag(const std::string& tagName, + const Tag& value); + + /// \brief Adds a new tag to this record. + /// + /// This is an overloaded method. + /// + /// \param[in] tag BamRecordTag enum + /// \param[in] value Tag object that describes the type & value of data + /// to be added + /// \returns true if tag was successfully added. + /// + bool AddTag(const BamRecordTag tag, + const Tag& value); + + /// \brief Adds a new tag to this record, with an optional modifier. + /// + /// \param[in] tagName 2-character tag name. + /// \param[in] value Tag object that describes the type & + /// value of data to be added + /// \param[in] additionalModifier optional extra modifier (for explicit + /// modification of an otherwise const Tag) + /// + /// \note Any value that can be used to implicitly construct a Tag is valid. + /// \code + /// char c; + /// string h; + /// record.AddTag("XX", c, TagModifier::ASCII_CHAR); // will add a char-type tag + /// record.AddTag("YY", h, TagModifier::HEX_STRING); // will add a hex string-type tag + /// \endcode + /// + /// \returns true if tag was successfully added. + /// + bool AddTag(const std::string& tagName, + const Tag& value, + const TagModifier additionalModifier); + + /// \brief Adds a new tag to this record, with an optional modifier. + /// + /// This is an overloaded method. + /// + /// \param[in] tag BamRecordTag enum. + /// \param[in] value Tag object that describes the type & + /// value of data to be added + /// \param[in] additionalModifier optional extra modifier (for explicit + /// modification of an otherwise const Tag) + /// + /// \returns true if tag was successfully added.
+ /// + bool AddTag(const BamRecordTag tag, + const Tag& value, + const TagModifier additionalModifier); + + /// \brief Edits an existing tag on this record. + /// + /// \param[in] tagName 2-character tag name. Name must be present + /// (see HasTag) + /// \param[in] newValue Tag object that describes the type & value of + /// new data to be added + /// + /// \note Any value that can be used to implicitly construct a Tag is valid. + /// \code + /// string s; + /// vector<uint32_t> v; + /// record.EditTag("XX", s); // will overwrite tag XX with a string-type tag + /// record.EditTag("YY", v); // will overwrite tag YY with a uint32-array-type tag + /// \endcode + /// + /// \returns true if tag was successfully edited. + /// + bool EditTag(const std::string& tagName, + const Tag& newValue); + + /// \brief Edits an existing tag on this record. + /// + /// This is an overloaded method. + /// + /// \param[in] tag BamRecordTag enum + /// \param[in] newValue Tag object that describes the type & value of + /// new data to be added + /// + /// \returns true if tag was successfully edited. + /// + bool EditTag(const BamRecordTag tag, + const Tag& newValue); + + /// \brief Edits an existing tag on this record. + /// + /// \param[in] tagName 2-character tag name. Name must be + /// present (see HasTag) + /// \param[in] value Tag object that describes the type & + /// value of new data to be added + /// \param[in] additionalModifier optional extra modifier (for explicit + /// modification of an otherwise const Tag) + /// + /// \note Any value that can be used to implicitly construct a Tag is valid. + /// \code + /// char c; + /// string h; + /// record.EditTag("XX", c, TagModifier::ASCII_CHAR); // will overwrite tag XX with a char-type tag + /// record.EditTag("YY", h, TagModifier::HEX_STRING); // will overwrite tag YY with a hex string-type tag + /// \endcode + /// + /// \returns true if tag was successfully edited.
+ /// + bool EditTag(const std::string& tagName, + const Tag& value, + const TagModifier additionalModifier); + + /// \brief Edits an existing tag on this record. + /// + /// This is an overloaded method. + /// + /// \param[in] tag BamRecordTag enum + /// \param[in] value Tag object that describes the type & + /// value of new data to be added + /// \param[in] additionalModifier optional extra modifier (for explicit + /// modification of an otherwise const Tag) + /// + /// \returns true if tag was successfully edited. + /// + bool EditTag(const BamRecordTag tag, + const Tag& value, + const TagModifier additionalModifier); + + + /// \returns true if a tag with this name is present in this record. + bool HasTag(const std::string& tagName) const; + + /// \returns true if this tag is present in this record. + /// + /// This is an overloaded method. + /// + bool HasTag(const BamRecordTag tag) const; + + /// \brief Removes an existing tag from this record. + /// + /// \param[in] tagName 2-character tag name. + /// + /// \returns true if tag was actually removed (i.e. false if tagName + /// previously unknown) + /// \sa HasTag + /// + bool RemoveTag(const std::string& tagName); + + /// \brief Removes an existing tag from this record. + /// + /// This is an overloaded method. + /// + /// \param[in] tag BamRecordTag enum + /// + /// \returns true if tag was actually removed (i.e. false if tag + /// previously unknown) + /// \sa HasTag + /// + bool RemoveTag(const BamRecordTag tag); + + /// \brief Fetches a tag from this record. + /// + /// \param[in] tagName 2-character tag name. + /// + /// \returns Tag object for the requested name. If name is unknown, a + /// default constructed Tag is returned (Tag::IsNull() is true). + /// + Tag TagValue(const std::string& tagName) const; + + /// \brief Fetches a tag from this record. + /// + /// This is an overloaded method + /// + /// \param[in] tag BamRecordTag enum + /// + /// \returns Tag object for the requested name.
If name is unknown, a + /// default constructed Tag is returned (Tag::IsNull() is true). + /// + Tag TagValue(const BamRecordTag tag) const; + + // change above to Tag(); + +// template +// T TagValue(const std::string& tagName) const; + + + /// \} + +private: + // returns a BamRecordImpl object, with a deep copy of @rawData contents + static BamRecordImpl FromRawData(const PBBAM_SHARED_PTR& rawData); + + // internal memory setup/expand methods + void InitializeData(void); + void MaybeReallocData(void); + void UpdateTagMap(void) const; // allowed to be called from const methods + // (lazy update on request) + + // internal tag helper methods + bool AddTagImpl(const std::string& tagName, + const Tag& value, + const TagModifier additionalModifier); + bool RemoveTagImpl(const std::string& tagName); + int TagOffset(const std::string& tagName) const; + + // core seq/qual logic shared by the public API + BamRecordImpl& SetSequenceAndQualitiesInternal(const char* sequence, + const size_t sequenceLength, + const char* qualities, + bool isPreencoded); + +private: + + // data members + PBBAM_SHARED_PTR d_; + mutable std::map tagOffsets_; + + // friends + friend class internal::BamRecordMemory; +}; + +} // namespace BAM +} // namespace PacBio + +#include "pbbam/internal/BamRecordImpl.inl" + +#endif // BAMRECORDIMPL_H diff --git a/include/pbbam/BamRecordTag.h b/include/pbbam/BamRecordTag.h new file mode 100644 index 0000000..93768ca --- /dev/null +++ b/include/pbbam/BamRecordTag.h @@ -0,0 +1,93 @@ +// Copyright (c) 2016, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer.
+// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file BamRecordTag.h +/// \brief Defines the BamRecordTag enum. 
+// +// Author: Derek Barnett + +#ifndef BAMRECORDTAG_H +#define BAMRECORDTAG_H + +namespace PacBio { +namespace BAM { +/// \brief Names the record fields that can be passed to the BamRecordTag overloads of the tag-access methods (e.g. BamRecordImpl::AddTag). +enum class BamRecordTag +{ + ALT_LABEL_QV + , ALT_LABEL_TAG + , BARCODE_QUALITY + , BARCODES + , CONTEXT_FLAGS + , DELETION_QV + , DELETION_TAG + , HOLE_NUMBER + , INSERTION_QV + , IPD + , LABEL_QV + , MERGE_QV + , NUM_PASSES + , PKMEAN + , PKMEAN_2 + , PKMID + , PKMID_2 + , PRE_PULSE_FRAMES + , PULSE_CALL + , PULSE_CALL_WIDTH + , PULSE_MERGE_QV + , PULSE_WIDTH + , QUERY_END + , QUERY_START + , READ_ACCURACY + , READ_GROUP + , SCRAP_REGION_TYPE + , SCRAP_ZMW_TYPE + , SNR + , START_FRAME + , SUBSTITUTION_QV + , SUBSTITUTION_TAG + + // + // QUAL & SEQ are not tags per se, but faking these here to simplify data fetching + // + , QUAL + , SEQ +}; + +} // namespace BAM +} // namespace PacBio + +#endif // BAMRECORDTAG_H diff --git a/include/pbbam/BamRecordView.h b/include/pbbam/BamRecordView.h new file mode 100644 index 0000000..afbebad --- /dev/null +++ b/include/pbbam/BamRecordView.h @@ -0,0 +1,168 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE.
THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file BamRecordView.h +/// \brief Defines the BamRecordView class. +// +// Author: Derek Barnett + +#ifndef BAMRECORDVIEW_H +#define BAMRECORDVIEW_H + +#include "pbbam/BamRecord.h" + +namespace PacBio { +namespace BAM { + +/// \brief Provides a re-usable "view" onto a BamRecord +/// +/// This class acts as a convenience wrapper for working with per-base BamRecord +/// data. Most of these BamRecord methods take a list of parameters, to adjust +/// how the underlying data are presented to client code. Often these parameters +/// will be re-used for each BamRecord method call. Thus, to simplify such +/// client code, a BamRecordView can be used to state those parameters once, and +/// then simply request the desired fields. +/// +/// \internal +/// \todo Sync up method names with BamRecord +/// \endinternal +/// +class PBBAM_EXPORT BamRecordView +{ +public: + /// \brief Constructs a view onto \p record using the supplied parameters. + /// + /// For frame or QV data, if \p aligned is true, a value of 0 (Accuracy or + /// QualityValue) will be used at each inserted or padded base location. + /// + /// \param[in] record BamRecord data source.
+ /// \param[in] orientation Orientation of output. + /// \param[in] aligned if true, gaps/padding will be inserted, per + /// Cigar info. + /// \param[in] exciseSoftClips if true, any soft-clipped positions will be + /// removed from query ends + /// + BamRecordView(const BamRecord& record, + const Orientation orientation, + const bool aligned, + const bool exciseSoftClips, + const PulseBehavior pulseBehavior = PulseBehavior::ALL); + +public: + + /// \returns BamRecord::AltLabelQV with this view's parameters applied + QualityValues AltLabelQVs(void) const; + + /// \returns BamRecord::AltLabelTag with this view's parameters applied + std::string AltLabelTags(void) const; + + /// \returns BamRecord::DeletionQV with this view's parameters applied + QualityValues DeletionQVs(void) const; + + /// \returns BamRecord::DeletionTag with this view's parameters applied + std::string DeletionTags(void) const; + + /// \returns BamRecord::InsertionQV with this view's parameters applied + QualityValues InsertionQVs(void) const; + + /// \returns BamRecord::IPD with this view's parameters applied + Frames IPD(void) const; + + /// \returns BamRecord::LabelQV with this view's parameters applied + QualityValues LabelQVs(void) const; + + /// \returns BamRecord::MergeQV with this view's parameters applied + QualityValues MergeQVs(void) const; + + /// \returns BamRecord::PulseMergeQV with this view's parameters applied + QualityValues PulseMergeQVs(void) const; + + /// \returns BamRecord::Pkmean with this view's parameters applied + std::vector Pkmean(void) const; + + /// \returns BamRecord::Pkmid with this view's parameters applied + std::vector Pkmid(void) const; + + /// \returns BamRecord::Pkmean2 with this view's parameters applied + std::vector Pkmean2(void) const; + + /// \returns BamRecord::Pkmid2 with this view's parameters applied + std::vector Pkmid2(void) const; + + /// \returns BamRecord::PreBaseFrames with this view's parameters applied + Frames PrebaseFrames(void) const; + 
+ /// \returns BamRecord::PrePulseFrames with this view's parameters applied + Frames PrePulseFrames(void) const; + + /// \returns BamRecord::PulseCalls with this view's parameters applied + std::string PulseCalls(void) const; + + /// \returns BamRecord::PulseCallWidth with this view's parameters applied + Frames PulseCallWidth(void) const; + + /// \returns BamRecord::PulseWidths with this view's parameters applied + Frames PulseWidths(void) const; + + /// \returns BamRecord::Qualities with this view's parameters applied + QualityValues Qualities(void) const; + + /// \returns BamRecord::Sequence with this view's parameters applied + std::string Sequence(void) const; + + /// \returns BamRecord::StartFrame with this view's parameters applied + std::vector StartFrames(void) const; + + /// \returns BamRecord::SubstitutionQV with this view's parameters applied + QualityValues SubstitutionQVs(void) const; + + /// \returns BamRecord::SubstitutionTag with this view's parameters applied + std::string SubstitutionTags(void) const; + +private: + const BamRecord& record_; + Orientation orientation_; + bool aligned_; + bool exciseSoftClips_; + PulseBehavior pulseBehavior_; +}; + +} // namespace BAM +} // namespace PacBio + +#include "pbbam/internal/BamRecordView.inl" + +#endif // BAMRECORDVIEW_H diff --git a/include/pbbam/BamTagCodec.h b/include/pbbam/BamTagCodec.h new file mode 100644 index 0000000..9126900 --- /dev/null +++ b/include/pbbam/BamTagCodec.h @@ -0,0 +1,124 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file BamTagCodec.h +/// \brief Defines the BamTagCodec class. +// +// Author: Derek Barnett + +#ifndef BAMTAGCODEC_H +#define BAMTAGCODEC_H + +#include "pbbam/Config.h" +#include "pbbam/TagCollection.h" +#include + +namespace PacBio { +namespace BAM { + +/// \brief The BamTagCodec class provides binary encoding/decoding of %BAM tag +/// data. +/// +/// \note BamTagCodec is mostly an implementation and/or testing detail, and may +/// be removed from the public API. +/// +class PBBAM_EXPORT BamTagCodec +{ +public: + /// \name Tag Collection Methods + /// \{ + + /// \brief Creates a TagCollection from raw BAM data. 
+ /// + /// \param[in] data BAM-formatted (binary) tag data + /// \returns TagCollection containing tag data + /// + static TagCollection Decode(const std::vector& data); + + /// \brief Creates binary BAM data from a TagCollection. + /// + /// \param[in] tags TagCollection containing tag data + /// \returns vector of bytes (encoded BAM data) + /// + static std::vector Encode(const PacBio::BAM::TagCollection& tags); + + /// \} + +public: + /// \name Per-Tag Methods + /// \{ + + /// \brief Determines the SAM/BAM tag code for a Tag. + /// + /// \param[in] tag Tag object to check + /// \param[in] additionalModifier optional extra modifier (allows explicit + /// modification of an otherwise const Tag) + /// + /// \returns the SAM/BAM single char code for tag type + /// + static uint8_t TagTypeCode(const PacBio::BAM::Tag& tag, + const TagModifier& additionalModifier = TagModifier::NONE); + + /// \brief Encodes a single Tag's contents in %BAM binary + /// + /// \note This method does \b NOT encode the tag name & tag type. It does + /// include the element type for array-type tags. + /// + /// \param[in] tag Tag object containing data to encode + /// \param[in] additionalModifier optional extra modifier (allows explicit + /// modification of an otherwise const Tag) + /// + /// \returns vector of bytes (encoded BAM data) + /// + static std::vector ToRawData(const PacBio::BAM::Tag& tag, + const TagModifier& additionalModifier = TagModifier::NONE); + + /// \brief Creates a Tag object from binary BAM data. 
+ /// + /// \param[in] rawData raw BAM bytes (assumed to be the result of + /// htslib's bam_aux_get()) + /// + /// \returns resulting Tag object + /// + static PacBio::BAM::Tag FromRawData(uint8_t* rawData); + + /// \} +}; + +} // namespace BAM +} // namespace PacBio + +#endif // BAMTAGCODEC_H diff --git a/include/pbbam/BamWriter.h b/include/pbbam/BamWriter.h new file mode 100644 index 0000000..b12df9b --- /dev/null +++ b/include/pbbam/BamWriter.h @@ -0,0 +1,210 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file BamWriter.h +/// \brief Defines the BamWriter class. +// +// Author: Derek Barnett + +#ifndef BAMWRITER_H +#define BAMWRITER_H + +#include "pbbam/BamHeader.h" +#include "pbbam/BamRecord.h" +#include "pbbam/Config.h" +#include "pbbam/IRecordWriter.h" +#include +#include + +namespace PacBio { +namespace BAM { + +class BamFile; + +namespace internal { class BamWriterPrivate; } + +/// \brief The BamWriter class provides a writing interface for creating +/// new %BAM files. +/// +/// \note The underlying buffered data may not be flushed to the file until the +/// destructor is called. Trying to access the file (reading, stat-ing, +/// indexing, etc.) before the BamWriter is destroyed yields undefined +/// behavior. Enclose the BamWriter in some form of local scope (curly +/// braces, a separate function, etc.) to ensure that its destructor is +/// called before proceeding to read-based operations. +/// +/// \code{.cpp} +/// { +/// BamWriter w(...); +/// // write data +/// } +/// // now safe to access the new file +/// \endcode +/// +/// +class PBBAM_EXPORT BamWriter : public IRecordWriter +{ +public: + /// \brief This enum allows you to control the compression level of the + /// output %BAM file. + /// + /// Values are equivalent to zlib compression levels. 
See its documentation + /// for more details: http://www.zlib.net/manual.html + /// + enum CompressionLevel + { + CompressionLevel_0 = 0 + , CompressionLevel_1 = 1 + , CompressionLevel_2 = 2 + , CompressionLevel_3 = 3 + , CompressionLevel_4 = 4 + , CompressionLevel_5 = 5 + , CompressionLevel_6 = 6 + , CompressionLevel_7 = 7 + , CompressionLevel_8 = 8 + , CompressionLevel_9 = 9 + + , DefaultCompression = -1 + , NoCompression = CompressionLevel_0 + , FastCompression = CompressionLevel_1 + , BestCompression = CompressionLevel_9 + }; + + /// \brief This enum allows you to control whether BAI bin numbers are + /// calculated for output records. + /// + /// For most cases, the default behavior (ON) should be retained for maximum + /// compatibility with downstream tools (e.g. samtools index). Disabling bin + /// calculation should only be used if all records are known to never be + /// mapped, and even then only if profiling reveals the calculation to + /// affect extremely performance-sensitive, "critical paths". + /// + enum BinCalculationMode + { + BinCalculation_ON = 0 + , BinCalculation_OFF + }; + +public: + + /// \name Constructors & Related Methods + /// \{ + + /// \brief Opens a %BAM file for writing & writes the header information. + /// + /// The error status will be set if either operation fails. + /// + /// \note Set \p filename to "-" for stdout. + /// + /// \param[in] filename path to output %BAM file + /// \param[in] header BamHeader object + /// \param[in] compressionLevel zlib compression level + /// \param[in] numThreads number of threads for compression. If set to + /// 0, BamWriter will attempt to determine a + /// reasonable estimate. If set to 1, this will + /// force single-threaded execution. No checks + /// are made against an upper limit. + /// + /// \param[in] binCalculationMode BAI bin calculation mode. The default + /// behavior will ensure proper bin numbers are provided for all + /// records written.
This extra step may be turned off when bin + /// numbers are not needed. Though if in doubt, keep the default. + /// + /// \throws std::runtime_error if there was a problem opening the file for + /// writing or if an error occurred while writing the header + /// + BamWriter(const std::string& filename, + const BamHeader& header, + const BamWriter::CompressionLevel compressionLevel = BamWriter::DefaultCompression, + const size_t numThreads = 4, + const BinCalculationMode binCalculationMode = BamWriter::BinCalculation_ON); + + /// Fully flushes all buffered data & closes file. + ~BamWriter(void); + + /// \} + +public: + + /// \name Data Writing & Resource Management + /// \{ + + /// \brief Try to flush any buffered data to file. + /// + /// \note The underlying implementation doesn't necessarily flush buffered + /// data immediately, especially in a multithreaded writer situation. + /// Let the BamWriter go out of scope to fully ensure flushing. + /// + /// \throws std::runtime_error if flush fails + /// + void TryFlush(void); + + /// \brief Write a record to the output %BAM file. + /// + /// \param[in] record BamRecord object + /// + /// \throws std::runtime_error on failure to write + /// + void Write(const BamRecord& record); + + /// \brief Write a record to the output %BAM file. + /// + /// \param[in] record BamRecord object + /// \param[out] vOffset BGZF virtual offset to start of \p record + /// + /// \throws std::runtime_error on failure to write + /// + void Write(const BamRecord& record, int64_t* vOffset); + + /// \brief Write a record to the output %BAM file.
+ /// + /// \param[in] recordImpl BamRecordImpl object + /// + /// \throws std::runtime_error on failure to write + /// + void Write(const BamRecordImpl& recordImpl); + + /// \} + +private: + std::unique_ptr d_; + DISABLE_MOVE_AND_COPY(BamWriter); +}; + +} // namespace BAM +} // namespace PacBio + +#endif // BAMWRITER_H diff --git a/include/pbbam/BarcodeQuery.h b/include/pbbam/BarcodeQuery.h new file mode 100644 index 0000000..9b55167 --- /dev/null +++ b/include/pbbam/BarcodeQuery.h @@ -0,0 +1,97 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file BarcodeQuery.h +/// \brief Defines the BarcodeQuery class. +// +// Author: Derek Barnett + +#ifndef BARCODEQUERY_H +#define BARCODEQUERY_H + +#include "pbbam/Config.h" +#include "pbbam/internal/QueryBase.h" +#include + +namespace PacBio { +namespace BAM { + +/// \brief The BarcodeQuery class provides iterable access to a DataSet's %BAM +/// records, limiting results to those matching a particular barcode. +/// +/// Example: +/// \include code/BarcodeQuery.txt +/// +/// \note Currently, all %BAM files must have a corresponding ".pbi" index file. +/// Use BamFile::EnsurePacBioIndexExists before creating the query if one +/// may not be present. +/// +class PBBAM_EXPORT BarcodeQuery : public internal::IQuery +{ +public: + /// \brief Creates a new BarcodeQuery, limiting record results to only those + /// annotated with a particular barcode ID. + /// + /// \param[in] barcode filtering criteria + /// \param[in] dataset input data source(s) + /// + /// \sa BamRecord::Barcodes + /// + /// \throws std::runtime_error on failure to open/read underlying %BAM or PBI + /// files. + /// + BarcodeQuery(const int16_t barcode, const DataSet& dataset); + + ~BarcodeQuery(void); + +public: + + /// \brief Main iteration point for record access. + /// + /// Most client code should not need to use this method directly. Use + /// iterators instead. 
+ /// + bool GetNext(BamRecord& r); + +private: + struct BarcodeQueryPrivate; + std::unique_ptr d_; +}; + +} // namespace BAM +} // namespace PacBio + +#endif // BARCODEQUERY_H diff --git a/include/pbbam/Cigar.h b/include/pbbam/Cigar.h new file mode 100644 index 0000000..c391057 --- /dev/null +++ b/include/pbbam/Cigar.h @@ -0,0 +1,112 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file Cigar.h +/// \brief Defines the Cigar class. +// +// Author: Derek Barnett + +#ifndef CIGAR_H +#define CIGAR_H + +#include "pbbam/CigarOperation.h" +#include "pbbam/Config.h" +#include +#include + +namespace PacBio { +namespace BAM { + +/// \brief The Cigar class represents the CIGAR string used to report alignment +/// characteristics in SAM/BAM. +/// +/// \note Use of the 'M' operator is forbidden in PacBio BAMs. See +/// CigarOperationType description for more information. +/// +/// \sa https://samtools.github.io/hts-specs/SAMv1.pdf for more information on CIGAR in general. +/// +class PBBAM_EXPORT Cigar : public std::vector +{ +public: + /// \name Constructors & Related Methods + /// \{ + + /// \brief Creates a Cigar object from SAM/BAM string input + /// + /// \param [in] stdString SAM/BAM formatted CIGAR data + /// \returns a Cigar object representing the input data + /// + /// \note This method may be removed from the public API in the future, + /// as the constructor taking a std::string accomplishes the same end. + /// + static Cigar FromStdString(const std::string& stdString); + + /// \brief Creates an empty Cigar.
+ Cigar(void); + + /// \brief Creates a Cigar object from SAM/BAM string input + /// + /// \param [in] cigarString SAM/BAM formatted CIGAR data + /// + Cigar(const std::string& cigarString); + + Cigar(const Cigar& other); + Cigar(Cigar&& other); + Cigar& operator=(const Cigar& other); + Cigar& operator=(Cigar&& other); + ~Cigar(void); + + /// \} + +public: + /// \name Conversion Methods + /// \{ + + /// Converts Cigar object data to SAM/BAM formatted string + /// + /// \returns SAM/BAM formatted std::string + /// + std::string ToStdString(void) const; + + /// \} +}; + +} // namespace BAM +} // namespace PacBio + +#include "pbbam/internal/Cigar.inl" + +#endif // CIGAR_H diff --git a/include/pbbam/CigarOperation.h b/include/pbbam/CigarOperation.h new file mode 100644 index 0000000..9b936ef --- /dev/null +++ b/include/pbbam/CigarOperation.h @@ -0,0 +1,174 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. 
THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file CigarOperation.h +/// \brief Defines the CigarOperationType enum & CigarOperation class. +// +// Author: Derek Barnett + +#ifndef CIGAROPERATION_H +#define CIGAROPERATION_H + +#include "pbbam/Config.h" +#include + +namespace PacBio { +namespace BAM { + +/// \brief Describes a CIGAR operation. +/// +/// Bracketed character is the corresponding SAM/BAM character code. +/// +/// \warning ALIGNMENT_MATCH ('M') is included in this enum to maintain +/// consistency with htslib. However, as of PacBio BAM spec version +/// 3.0b7, this CIGAR operation is \b forbidden. Any attempt to read or +/// write a record containing this operation will trigger a +/// std::runtime_error. SEQUENCE_MATCH('=') or SEQUENCE_MISMATCH('X') +/// should be used instead.
+/// +enum class CigarOperationType +{ + UNKNOWN_OP = -1 ///< unknown/invalid CIGAR operator + , ALIGNMENT_MATCH = 0 ///< alignment match (can be a sequence match or mismatch) [M] + , INSERTION ///< insertion to the reference [I] + , DELETION ///< deletion from the reference [D] + , REFERENCE_SKIP ///< skipped region from the reference [N] + , SOFT_CLIP ///< soft clipping (clipped sequences present in SEQ) [S] + , HARD_CLIP = 5 ///< hard clipping (clipped sequences NOT present in SEQ) [H] + , PADDING ///< padding (silent deletion from padded reference) [P] + , SEQUENCE_MATCH ///< sequence match [=] + , SEQUENCE_MISMATCH ///< sequence mismatch [X] +}; + +/// \brief The CigarOperation class represents a single CIGAR operation +/// (consisting of a type & length). +/// +class PBBAM_EXPORT CigarOperation +{ +public: + + /// \name Operation Type Conversion Methods + /// \{ + + /// Convert between CigarOperationType enum & SAM/BAM character code. + /// + /// \param[in] type CigarOperationType value + /// \returns SAM/BAM character code + static char TypeToChar(const CigarOperationType type); + + /// Convert between CigarOperationType enum & SAM/BAM character code. 
+ /// + /// \param[in] c SAM/BAM character code + /// \returns CigarOperationType value + static CigarOperationType CharToType(const char c); + + /// \} + +public: + /// \name Constructors & Related Methods + /// \{ + + CigarOperation(void); + CigarOperation(char c, uint32_t length); + CigarOperation(CigarOperationType op, uint32_t length); + CigarOperation(const CigarOperation& other); + CigarOperation(CigarOperation&& other); + CigarOperation& operator=(const CigarOperation& other); + CigarOperation& operator=(CigarOperation&& other); + ~CigarOperation(void); + + /// \} + +public: + + /// \returns operation type as SAM/BAM char code + inline char Char(void) const; + + /// \returns operation length + inline uint32_t Length(void) const; + + /// \returns operation type as CigarOperationType enum value + inline CigarOperationType Type(void) const; + + /// \} + +public: + /// \name Attributes + /// \{ + + /// Sets this operation type. + /// + /// \param[in] opChar SAM/BAM character code + /// \returns reference to this operation + inline CigarOperation& Char(const char opChar); + + /// Sets this operation length. + /// + /// \param[in] length + /// \returns reference to this operation + inline CigarOperation& Length(const uint32_t length); + + /// Sets this operation type. 
+ /// + /// \param[in] opType CigarOperationType value + /// \returns reference to this operation + inline CigarOperation& Type(const CigarOperationType opType); + + /// \} + +public: + /// \name Comparison Operators + /// \{ + + /// \returns true if both CIGAR operation type & length match + inline bool operator==(const CigarOperation& other) const; + + /// \returns true if either CIGAR operation type or length differ + inline bool operator!=(const CigarOperation& other) const; + + /// \} + +private: + CigarOperationType type_; + uint32_t length_; +}; + +} // namespace BAM +} // namespace PacBio + +#include "pbbam/internal/CigarOperation.inl" + +#endif // CIGAROPERATION_H diff --git a/include/pbbam/ClipType.h b/include/pbbam/ClipType.h new file mode 100644 index 0000000..eb97167 --- /dev/null +++ b/include/pbbam/ClipType.h @@ -0,0 +1,65 @@ +// Copyright (c) 2016, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. 
THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file ClipType.h +/// \brief Defines the ClipType enum. +// +// Author: Derek Barnett + +#ifndef CLIPTYPE_H +#define CLIPTYPE_H + +namespace PacBio { +namespace BAM { + +/// \brief This enum defines the modes supported by BamRecord clipping +/// operations. +/// +/// Methods like BamRecord::Clip accept Position parameters - which may be in +/// either polymerase or reference coordinates. Using this enum as a flag +/// indicates how the positions should be interpreted. +/// +enum class ClipType +{ + CLIP_NONE ///< No clipping will be performed. + , CLIP_TO_QUERY ///< Clipping positions are in polymerase coordinates. + , CLIP_TO_REFERENCE ///< Clipping positions are in genomic coordinates. +}; + +} // namespace BAM +} // namespace PacBio + +#endif // CLIPTYPE_H diff --git a/include/pbbam/Compare.h b/include/pbbam/Compare.h new file mode 100644 index 0000000..8570e02 --- /dev/null +++ b/include/pbbam/Compare.h @@ -0,0 +1,430 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved.
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file Compare.h +/// \brief Defines the Compare class & a number of function objects for +/// comparing BamRecords. 
+// +// Author: Derek Barnett + +#ifndef COMPARE_H +#define COMPARE_H + +#include "pbbam/BamRecord.h" +#include +#include +#include + +namespace PacBio { +namespace BAM { + +/// \brief The Compare class provides utilities for sorting collections of +/// BamRecords. +/// +/// \note The functors provided here currently only support std::less +/// comparisons (i.e. sorting by ascending value). +/// +/// \include code/Compare.txt +/// +struct PBBAM_EXPORT Compare +{ +public: + + /// \name Comparison Type + /// \{ + + /// \brief This enum defines the supported comparison types + /// { ==, !=, <, <=, >, >=, & (contains), ~ (not contains) }. + /// + enum Type { + EQUAL = 0 + , NOT_EQUAL + , LESS_THAN + , LESS_THAN_EQUAL + , GREATER_THAN + , GREATER_THAN_EQUAL + , CONTAINS + , NOT_CONTAINS + }; + + /// \brief Convert operator string to Compare::Type. + /// + /// \include code/Compare_TypeFromOperator.txt + /// + /// \param[in] opString operator string. Can be C++-style operators + /// ("==", "!=", "<=", etc) or alpha equivalents + /// ("eq", "ne", "lte", etc). + /// + /// \returns comparison type from an operator string + /// \throws std::runtime_error if cannot convert opString to Compare::Type + /// \sa Compare::TypeToOperator + /// + static Compare::Type TypeFromOperator(const std::string& opString); + + /// \brief Convert a Compare::Type to printable enum name. + /// + /// \include code/Compare_TypeToName.txt + /// + /// \param[in] type Compare::Type to convert + /// \returns the printable name for a Compare::Type enum value.are::Type + /// \throws std::runtime_error on unknown Compare::Type + /// + static std::string TypeToName(const Compare::Type& type); + + /// \brief Convert a Compare::Type to printable operator. + /// + /// \param[in] type Compare::Type to convert + /// \param[in] asAlpha (optional) flag to print using alpha equivalents + /// e.g. 
"lte" rather than "<=" + /// \returns the printable operator string + /// \throws std::runtime_error on unknown Compare::Type + /// + static std::string TypeToOperator(const Compare::Type& type, + bool asAlpha = false); + + /// \} + +public: + + /// \name Comparison Function Objects + /// \{ + + /// %Base class for all BamRecord compare functors. + /// + /// Mostly used for method signatures that can accept any comparator. + /// + /// Custom comparators may be used by inheriting from this class. + /// + struct Base : public std::function { }; + +private: + /// \internal + /// + /// Exists to provide the typedef we'll use in the actual + /// MemberFunctionBase, since we need to use it in the template signature. + /// This keeps that a lot easier to read. + /// + template + struct MemberFunctionBaseHelper : public Compare::Base + { + typedef ValueType (BamRecord::*MemberFnType)(void) const; + }; + +public: + /// \brief %Base class for all BamRecord compare functors that take a + /// BamRecord function pointer and compare on its return type. + /// + /// Derived comparators usually need only declare the return value & + /// function pointer in the template signature. This class implements the + /// basic method-calling machinery. + /// + /// Custom comparators will work for any BamRecord member function that does + /// not take any input parameters. + /// + template::MemberFnType fn, + typename CompareType = std::less > + struct MemberFunctionBase : public Compare::MemberFunctionBaseHelper + { + bool operator()(const BamRecord& lhs, const BamRecord& rhs) const; + }; + +public: + + /// \brief Compares on BamRecord::AlignedEnd. + /// + /// Example: + /// \include code/Compare_AlignedEnd.txt + /// + /// \note Currently only supports std::less comparisons (i.e. sorting by + /// ascending value). + /// + struct AlignedEnd : public MemberFunctionBase { }; + + /// \brief Compares on BamRecord::AlignedStart. 
+ /// + /// Example: + /// \include code/Compare_AlignedStart.txt + /// + /// \note Currently only supports std::less comparisons (i.e. sorting by + /// ascending value). + /// + struct AlignedStart : public MemberFunctionBase { }; + + /// \brief Compares on BamRecord::AlignedStrand + /// + /// Example: + /// \include code/Compare_AlignedStrand.txt + /// + /// \note Currently only supports std::less comparisons (i.e. sorting by + /// ascending value). + /// + struct AlignedStrand : public MemberFunctionBase { }; + + /// \brief Compares on BamRecord::BarcodeForward. + /// + /// Example: + /// \include code/Compare_BarcodeForward.txt + /// + /// \note Currently only supports std::less comparisons (i.e. sorting by + /// ascending value). + /// + struct BarcodeForward : public MemberFunctionBase { }; + + /// \brief Compares on BamRecord::BarcodeQuality. + /// + /// Example: + /// \include code/Compare_BarcodeQuality.txt + /// + /// \note Currently only supports std::less comparisons (i.e. sorting by + /// ascending value). + /// + struct BarcodeQuality : public MemberFunctionBase { }; + + /// \brief Compares on BamRecord::BarcodeReverse. + /// + /// Example: + /// \include code/Compare_BarcodeReverse.txt + /// + /// \note Currently only supports std::less comparisons (i.e. sorting by + /// ascending value). + /// + struct BarcodeReverse: public MemberFunctionBase { }; + + /// \brief Compares on BamRecord::FullName. + /// + /// Example: + /// \include code/Compare_FullName.txt + /// + /// \note Currently only supports std::less comparisons (i.e. sorting by + /// ascending value). + /// + struct FullName : public MemberFunctionBase { }; + + /// \brief Compares on BamRecord::LocalContextFlags. + /// + /// Example: + /// \include code/Compare_LocalContextFlag.txt + /// + /// \note Currently only supports std::less comparisons (i.e. sorting by + /// ascending value). 
+ /// + struct LocalContextFlag : public MemberFunctionBase { }; + + /// \brief Compares on BamRecord::MapQuality. + /// + /// Example: + /// \include code/Compare_MapQuality.txt + /// + /// \note Currently only supports std::less comparisons (i.e. sorting by + /// ascending value). + /// + struct MapQuality : public MemberFunctionBase { }; + + /// \brief Compares on BamRecord::MovieName. + /// + /// Example: + /// \include code/Compare_MovieName.txt + /// + /// \note Currently only supports std::less comparisons (i.e. sorting by + /// ascending value). + /// + struct MovieName : public MemberFunctionBase { }; + + /// \brief Provides an operator() is essentially a no-op for + /// comparing/sorting. + /// + /// If used in a sorting operation, then no change will occur. + /// + struct None : public Compare::Base + { + bool operator()(const BamRecord&, const BamRecord&) const; + }; + + ///\brief Compares on BamRecord::NumDeletedBases. + /// + /// Example: + /// \include code/Compare_NumDeletedBases.txt + /// + /// \note Currently only supports std::less comparisons (i.e. sorting by + /// ascending value). + /// + struct NumDeletedBases : public MemberFunctionBase { }; + + /// \brief Compares on BamRecord::NumInsertedBases. + /// + /// Example: + /// \include code/Compare_NumInsertedBases.txt + /// + /// \note Currently only supports std::less comparisons (i.e. sorting by + /// ascending value). + /// + struct NumInsertedBases : public MemberFunctionBase { }; + + /// \brief Compares on BamRecord::NumMatches. + /// + /// Example: + /// \include code/Compare_NumMatches.txt + /// + /// \note Currently only supports std::less comparisons (i.e. sorting by + /// ascending value). + /// + struct NumMatches : public MemberFunctionBase { }; + + /// \brief Compares on BamRecord::NumMismatches. + /// + /// Example: + /// \include code/Compare_NumMismatches.txt + /// + /// \note Currently only supports std::less comparisons (i.e. sorting by + /// ascending value). 
+ /// + struct NumMismatches : public MemberFunctionBase { }; + + /// \brief Compares on BamRecord::QueryEnd. + /// + /// Example: + /// \include code/Compare_QueryEnd.txt + /// + /// \note Currently only supports std::less comparisons (i.e. sorting by + /// ascending value). + /// + struct QueryEnd : public MemberFunctionBase { }; + + /// \brief Compares on BamRecord::QueryStart. + /// + /// Example: + /// \include code/Compare_QueryStart.txt + /// + /// \note Currently only supports std::less comparisons (i.e. sorting by + /// ascending value). + /// + struct QueryStart : public MemberFunctionBase { }; + + /// \brief Compares on BamRecord::ReadAccuracy. + /// + /// Example: + /// \include code/Compare_ReadAccuracy.txt + /// + /// \note Currently only supports std::less comparisons (i.e. sorting by + /// ascending value). + /// + struct ReadAccuracy : public MemberFunctionBase { }; + + /// \brief Compares on BamRecord::ReadGroupId. + /// + /// \note Even though the ReadGroupId string contains hex values, it is + /// still just a std::string. Comparisons will use lexical, not + /// numeric ordering. If numeric ordering is desired, use + /// Compare::ReadGroupNumericId instead. + /// + /// Example: + /// \include code/Compare_ReadGroupId.txt + /// + /// \note Currently only supports std::less comparisons (i.e. sorting by + /// ascending value). + /// + struct ReadGroupId : public MemberFunctionBase { }; + + /// \brief Compares on BamRecord::ReadGroupNumericId. + /// + /// Example: + /// \include code/Compare_ReadGroupNumericId.txt + /// + /// \note Currently only supports std::less comparisons (i.e. sorting by + /// ascending value). + /// + struct ReadGroupNumericId : public MemberFunctionBase { }; + + /// \brief Compares on BamRecord::ReferenceEnd. + /// + /// Example: + /// \include code/Compare_ReferenceEnd.txt + /// + /// \note Currently only supports std::less comparisons (i.e. sorting by + /// ascending value). 
+ /// + struct ReferenceEnd : public MemberFunctionBase { }; + + /// \brief Compares on BamRecord::ReferenceId. + /// + /// Example: + /// \include code/Compare_ReferenceId.txt + /// + /// \note Currently only supports std::less comparisons (i.e. sorting by + /// ascending value). + /// + struct ReferenceId : public MemberFunctionBase { }; + + /// \brief Compares on BamRecord::ReferenceName. + /// + /// Example: + /// \include code/Compare_ReferenceName.txt + /// + /// \note Currently only supports std::less comparisons (i.e. sorting by + /// ascending value). + /// + struct ReferenceName : public MemberFunctionBase { }; + + /// \brief Compares on BamRecord::ReferenceStart. + /// + /// Example: + /// \include code/Compare_ReferenceStart.txt + /// + /// \note Currently only supports std::less comparisons (i.e. sorting by + /// ascending value). + /// + struct ReferenceStart : public MemberFunctionBase { }; + + /// \brief Compares on BamRecord::HoleNumber. + /// + /// Example: + /// \include code/Compare_Zmw.txt + /// + /// \note Currently only supports std::less comparisons (i.e. sorting by + /// ascending value). + /// + struct Zmw : public MemberFunctionBase { }; + + /// \} +}; + +} // namespace BAM +} // namespace PacBio + +#include "pbbam/internal/Compare.inl" + +#endif // COMPARE_H diff --git a/include/pbbam/CompositeBamReader.h b/include/pbbam/CompositeBamReader.h new file mode 100644 index 0000000..f0de942 --- /dev/null +++ b/include/pbbam/CompositeBamReader.h @@ -0,0 +1,269 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file CompositeBamReader.h +/// \brief Defines the composite BAM readers, for working with multiple input +/// files. 
+// +// Author: Derek Barnett + +#ifndef COMPOSITEBAMREADER_H +#define COMPOSITEBAMREADER_H + +#include "pbbam/BaiIndexedBamReader.h" +#include "pbbam/BamFile.h" +#include "pbbam/BamHeader.h" +#include "pbbam/BamReader.h" +#include "pbbam/BamRecord.h" +#include "pbbam/Config.h" +#include "pbbam/DataSet.h" +#include "pbbam/GenomicInterval.h" +#include "pbbam/PbiIndexedBamReader.h" +#include +#include +#include +#include +#include + +namespace PacBio { +namespace BAM { + +namespace internal { + +/// \internal +/// \brief The CompositeMergeItem class provides a helper struct for composite +/// readers, containing a single-file reader and its "next" record. +/// +struct CompositeMergeItem +{ +public: + std::unique_ptr reader; + BamRecord record; + +public: + CompositeMergeItem(std::unique_ptr&& rdr); + CompositeMergeItem(std::unique_ptr&& rdr, BamRecord&& rec); + CompositeMergeItem(CompositeMergeItem&& other); + CompositeMergeItem& operator=(CompositeMergeItem&& other); + ~CompositeMergeItem(void); +}; + +/// \internal +/// \brief The CompositeMergeItemSorter class provides a helper function object +/// for ordering composite reader results. +/// +/// Essentially just exracts a BamRecord from its parent CompositeMergeItem for +/// further checks. +/// +template +struct CompositeMergeItemSorter : public std::function +{ + bool operator()(const CompositeMergeItem& lhs, + const CompositeMergeItem& rhs); +}; + +} // namespace internal + +/// \brief The GenomicIntervalCompositeBamReader class provides read access to +/// multipe %BAM files, limiting results to a genomic region. +/// +/// Requires a ".bai" file for each input %BAM file. +/// +/// Results will be returned in order of genomic coordinate (first by reference +/// ID, then by position). 
+/// +class PBBAM_EXPORT GenomicIntervalCompositeBamReader +{ +public: + /// \name Contstructors & Related Methods + /// \{ + + GenomicIntervalCompositeBamReader(const GenomicInterval& interval, + const std::vector& bamFiles); + GenomicIntervalCompositeBamReader(const GenomicInterval& interval, + std::vector&& bamFiles); + GenomicIntervalCompositeBamReader(const GenomicInterval& interval, + const DataSet& dataset); + + /// \} + +public: + /// \name Data Access + /// \{ + + /// Fetches next BAM record in the interval specified, storing in \p record + /// + /// \param[out] record + /// \returns true on success, false if no more data available. + /// + bool GetNext(BamRecord& record); + + /// Sets a new genomic interval of interest. + /// + /// \returns reference to this reader + /// + GenomicIntervalCompositeBamReader& Interval(const GenomicInterval& interval); + + /// \returns the current specified interval + /// + const GenomicInterval& Interval(void) const; + + /// \} + +private: + void UpdateSort(void); + +private: + GenomicInterval interval_; + std::deque mergeItems_; + std::vector filenames_; +}; + +/// \brief Provides read access to multipe %BAM files, limiting results to those +/// passing a PbiFilter. +/// +/// Requires a ".pbi" file for each input %BAM file. +/// +/// \note The template parameter OrderByType is not fully implemented at this +/// time. Use of comparison functor (e.g. Compare::Zmw) for this will +/// currently result in the proper "next" value at each iteration +/// step, independently, but not over the full data set. If all +/// files' "order-by" data values are accessible in increasing order +/// within each file, then the expected ordering will be observed, +/// However, if these data are not sorted within a file, the final results +/// will appear unordered. 
\n +/// \n +/// Example:\n +/// file 1: { 1, 5, 2, 6 } \n +/// file 2: { 3, 8, 4, 7 } \n +/// results: { 1, 3, 5, 2, 6, 8, 4, 7 } \n +/// \n +/// This a known issue and will be addressed in a future update. But in +/// the meantime, use of Compare::None as the OrderByType is recommended, +/// to explicitly indicate that no particular ordering is expected. +/// +template +class PBBAM_EXPORT PbiFilterCompositeBamReader +{ +public: + typedef internal::CompositeMergeItem value_type; + typedef internal::CompositeMergeItemSorter merge_sorter_type; + typedef std::deque container_type; + typedef typename container_type::iterator iterator; + typedef typename container_type::const_iterator const_iterator; + +public: + /// \name Contstructors & Related Methods + /// \{ + + PbiFilterCompositeBamReader(const PbiFilter& filter, + const std::vector& bamFiles); + PbiFilterCompositeBamReader(const PbiFilter& filter, + std::vector&& bamFiles); + PbiFilterCompositeBamReader(const PbiFilter& filter, + const DataSet& dataset); + + /// \} + +public: + /// \name Data Access + /// \{ + + /// Fetches next BAM record in the interval specified. + /// + /// \returns true on success, false if no more data available. + /// + bool GetNext(BamRecord& record); + + /// Sets a new PBI filter + /// + /// \returns reference to this reader + /// + PbiFilterCompositeBamReader& Filter(const PbiFilter& filter); + + /// \} + +private: + void UpdateSort(void); + +private: + container_type mergeQueue_; + std::vector filenames_; +}; + +/// \brief The SequentialCompositeBamReader class provides read access to +/// multiple %BAM files, reading through the entire contents of each +/// file. +/// +/// Input files will be accessed in the order provided to the constructor. Each +/// file's contents will be exhausted before moving on to the next one (as +/// opposed to a "round-robin" scheme). 
+/// +class PBBAM_EXPORT SequentialCompositeBamReader +{ +public: + /// \name Contstructors & Related Methods + /// \{ + + SequentialCompositeBamReader(const std::vector& bamFiles); + SequentialCompositeBamReader(std::vector&& bamFiles); + SequentialCompositeBamReader(const DataSet& dataset); + + /// \} + +public: + /// \name Data Access + /// \{ + + /// Fetches next BAM record from the . + /// + /// \returns true on success, false if no more data available. + /// + bool GetNext(BamRecord& record); + + /// \} + +private: + std::deque > readers_; +}; + +} // namespace BAM +} // namespace PacBio + +#include "pbbam/internal/CompositeBamReader.inl" + +#endif // COMPOSITEBAMREADER_H diff --git a/include/pbbam/Config.h b/include/pbbam/Config.h new file mode 100644 index 0000000..2521288 --- /dev/null +++ b/include/pbbam/Config.h @@ -0,0 +1,227 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. 
THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file Config.h +/// \brief Defines library-wide macros & global variables. +// +// Author: Derek Barnett + +#ifndef PBBAM_CONFIG_H +#define PBBAM_CONFIG_H + +#include + +#ifndef INT8_MAX +#define INT8_MAX 127 +#endif +#ifndef INT16_MAX +#define INT16_MAX 32767 +#endif +#ifndef INT32_MAX +#define INT32_MAX 2147483647 +#endif +#ifndef INT64_MAX +#define INT64_MAX 9223372036854775807LL +#endif +#ifndef INT8_MIN +#define INT8_MIN -128 +#endif +#ifndef INT16_MIN +#define INT16_MIN -32768 +#endif +#ifndef INT32_MIN +#define INT32_MIN (-INT32_MAX-1) +#endif +#ifndef INT64_MIN +#define INT64_MIN (-INT64_MAX-1) +#endif +#ifndef UINT8_MAX +#define UINT8_MAX 255 +#endif +#ifndef UINT16_MAX +#define UINT16_MAX 65535 +#endif +#ifndef UINT32_MAX +#define UINT32_MAX 4294967295U +#endif +#ifndef UINT64_MAX +#define UINT64_MAX 18446744073709551615ULL +#endif + +/// \name Library Import/Export +/// \{ + +#ifndef PBBAM_LIBRARY_EXPORT +# if defined(WIN32) +# define PBBAM_LIBRARY_EXPORT __declspec(dllexport) +# else +# define PBBAM_LIBRARY_EXPORT __attribute__((visibility("default"))) +# endif +#endif + +#ifndef PBBAM_LIBRARY_IMPORT +# if defined(WIN32) +# define 
PBBAM_LIBRARY_IMPORT __declspec(dllimport) +# else +# define PBBAM_LIBRARY_IMPORT +# endif +#endif + +#ifndef PBBAM_EXPORT +# if defined(PBBAM_LIBRARY) +# define PBBAM_EXPORT PBBAM_LIBRARY_EXPORT +# else +# define PBBAM_EXPORT PBBAM_LIBRARY_IMPORT +# endif +#endif + +/// \} + +/// \name Shared Pointer Settings +/// \{ + +// uncomment this define, or pass via command-line (-DPBBAM_USE_BOOST_SHARED_PTR), +// to use boost::shared_ptr instead of std::shared_ptr +// +//#define PBBAM_USE_BOOST_SHARED_PTR + +#ifdef PBBAM_USE_BOOST_SHARED_PTR +# include +# define PBBAM_SHARED_PTR boost::shared_ptr +#else +# include +# define PBBAM_SHARED_PTR std::shared_ptr +#endif + +/// \} + +/// \name Class Definition Helpers +/// \{ + +/// \brief Disables the use of copy constructors and assignment operators for a +/// class. +/// +/// To use, place the macro in a class's private section: +/// \code{.cpp} +/// struct Foo { +/// private: +/// DISABLE_COPY(Foo); +/// }; +/// \endcode +/// +#ifndef DISABLE_COPY +#define DISABLE_COPY(Class) \ + Class(const Class&); \ + Class& operator=(const Class&) +#endif + +/// \brief Disables the use of move constructors and assignment operators for a +/// class. +/// +/// To use, place the macro in a class's private section: +/// \code{.cpp} +/// struct Foo { +/// private: +/// DISABLE_MOVE(Foo); +/// }; +/// \endcode +/// +#ifndef DISABLE_MOVE +#define DISABLE_MOVE(Class) \ + Class(Class&&); \ + Class& operator=(Class&&); +#endif + +/// \brief Disables the use of copy & move constructors and assignment operators f +/// or a class. +/// +/// To use, place the macro in a class's private section: +/// \code{.cpp} +/// struct Foo { +/// private: +/// DISABLE_MOVE_AND_COPY(Foo); +/// }; +/// \endcode +/// +#ifndef DISABLE_MOVE_AND_COPY +#define DISABLE_MOVE_AND_COPY(Class) \ + DISABLE_MOVE(Class) \ + DISABLE_COPY(Class) +#endif + +/// \} + +// \brief Auto-validation +// +// To validate BAM components (header, records, etc.) 
you can either use the +// Validator API provided, or enable auto-validation. To compile pbbam for +// auto-validation, add the -DPacBioBAM_auto_validate=ON option to your cmake +// invocation. +// +// +#ifndef PBBAM_AUTOVALIDATE +# define PBBAM_AUTOVALIDATE 0 +#endif + +/// \} + +namespace PacBio { +namespace BAM { + +/// \name Verbosity Settings +/// \{ + +/// \brief Sets the desired verbosity level of htslib warnings. +/// +/// Change this value to allow debug/warning statements from htslib itself. +/// The valid range seems to be [0-3], where 0 indicates OFF, and 3 is the +/// most verbose. +/// +/// By default, pbbam disables htslib statements to keep output channels clean. +/// We rely on exceptions & their associated messages instead. +/// +/// This global variable is obviously not thread-safe by any means. But as a +/// debug flag, it is unlikely to cause any real issues. The worst case would be +/// unexpected presence/absence of output statements. +/// +extern int HtslibVerbosity; + +/// \} + +} // namespace BAM +} // namespace PacBio + +#endif // PBBAM_CONFIG_H diff --git a/include/pbbam/DataSet.h b/include/pbbam/DataSet.h new file mode 100644 index 0000000..af1b14f --- /dev/null +++ b/include/pbbam/DataSet.h @@ -0,0 +1,820 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. 
+// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file DataSet.h +/// \brief Defines the DataSet class. +// +// Author: Derek Barnett + +#ifndef DATASET_H +#define DATASET_H + +#include "pbbam/BamFile.h" +#include "pbbam/Config.h" +#include "pbbam/DataSetTypes.h" +#include +#include +#include +#include +#include + +namespace PacBio { +namespace BAM { + +/// \brief The DataSet class represents a %PacBio analyis dataset (e.g. from +/// XML). +/// +/// \nosubgrouping +/// +/// It provides resource paths, filters, and metadata associated with a dataset +/// under analysis. +/// +class PBBAM_EXPORT DataSet +{ +public: + /// \name DataSet Type + /// \{ + + /// \brief This enum defines the currently-supported DataSet types. 
+ /// + enum TypeEnum { + GENERIC = 0 + , ALIGNMENT + , BARCODE + , CONSENSUS_ALIGNMENT + , CONSENSUS_READ + , CONTIG + , HDF_SUBREAD + , REFERENCE + , SUBREAD + }; + + /// \brief Converts printable dataset type to type enum. + /// + /// \param[in] typeName printable dataset type + /// \returns dataset type enum + /// \throws std::runtime_error if \p typeName is unknown + /// + static DataSet::TypeEnum NameToType(const std::string& typeName); + + /// \brief Converts dataset type enum to printable name. + /// + /// \param[in] type dataset type enum + /// \returns printable dataset type + /// \throws std::runtime_error if \p type is unknown + /// + static std::string TypeToName(const DataSet::TypeEnum& type); + + /// \} + +public: + + /// \name Constructors & Related Methods + /// \{ + + /// \brief Constructs an empty, generic DataSet. + /// + DataSet(void); + + /// \brief Constructs an empty DataSet of the type specified. + /// + /// \param[in] type dataset type + /// \throws std::runtime_error if \p type is unknown + /// + DataSet(const DataSet::TypeEnum type); + + /// \brief Constructs a DataSet from a %BAM file. + /// + /// This currently defaults to a SubreadSet, with an ExternalResource + /// pointing to BamFile::Filename. + /// + /// \param[in] bamFile BamFile object + /// + DataSet(const BamFile& bamFile); + + /// \brief Loads a DataSet from a file. + /// + /// \p filename may be one of three types, indicated by its extension:\n + /// - %BAM ("*.bam") \n + /// - FOFN ("*.fofn") \n + /// - DataSetXML ("*.xml") \n + /// + /// \param[in] filename input filename + /// \throws std::runtime_error if \p filename has an unsupported extension, + /// or if a valid DataSet could not be created from its contents + /// + DataSet(const std::string& filename); + + /// \brief Constructs a DataSet from a list of files. 
+ /// + /// \param[in] filenames input filenames + /// \throws std::runtime_error if DataSet could not be created from + /// \p filenames + /// + DataSet(const std::vector& filenames); + + DataSet(const DataSet& other); + DataSet(DataSet&& other); + DataSet& operator=(const DataSet& other); + DataSet& operator=(DataSet&& other); + ~DataSet(void); + + /// \brief Creates a DataSet from "raw" XML data. + /// + /// \param[in] xml DataSetXML text + /// + static DataSet FromXml(const std::string& xml); + + /// \} + +public: + /// \name Operators + /// \{ + + /// \brief Merges DataSet contents. + /// + /// Adds contents of \p other to this dataset object + /// + /// \param[in] other some other dataset to add to this one + /// \returns reference to this dataset object + /// + DataSet& operator+=(const DataSet& other); + + /// \} + +public: + /// \name Serialization + /// \{ + + /// \brief Saves dataset XML to file. + /// + /// \param[in] outputFilename destination for XML contents + /// + /// \throws std::runtime_error if file could be opened or if DataSet + /// elements could not be converted to XML + /// + void Save(const std::string& outputFilename); + + /// \brief Saves dataset XML to output stream, e.g. std::cout, + /// std::stringstream. + /// + /// \param[out] out destination for XML contents + /// + /// \throws std::runtime_error if DataSet elements could not be converted to + /// XML + /// + void SaveToStream(std::ostream& out); + + /// \} + +public: + + /// \name Attributes + /// \{ + /// + + /// \brief Fetches the value of a DataSet root element's attribute. + /// + /// These are the attributes attached to the root dataset element: \n + /// \verbatim \endverbatim + /// + /// Built-in accessors exist for the standard attributes (e.g. CreatedAt) + /// but additional attributes can be used as well via these generic + /// Attribute methods. 
+ /// + /// \param[in] name root element's attribute name + /// \returns const reference to attribute's value (empty string if not + /// present) + /// + const std::string& Attribute(const std::string& name) const; + + /// \brief Fetches the value of dataset's CreatedAt attribute. + /// + /// \returns const reference to attribute's value (empty string if not + /// present) + /// + const std::string& CreatedAt(void) const; + + /// \brief Fetches the value of dataset's Format attribute. + /// + /// \returns const reference to attribute's value (empty string if not + /// present) + /// + const std::string& Format(void) const; + + /// \brief Fetches the value of dataset's MetaType attribute. + /// + /// \returns const reference to attribute's value (empty string if not + /// present) + /// + const std::string& MetaType(void) const; + + /// \brief Fetches the value of dataset's ModifiedAt attribute. + /// + /// \returns const reference to attribute's value (empty string if not + /// present) + /// + const std::string& ModifiedAt(void) const; + + /// \brief Fetches the value of dataset's Name attribute. + /// + /// \returns const reference to attribute's value (empty string if not + /// present) + /// + const std::string& Name(void) const; + + /// \brief Fetches the value of dataset's ResourceId attribute. + /// + /// \returns const reference to attribute's value (empty string if not + /// present) + /// + const std::string& ResourceId(void) const; + + /// \brief Fetches the value of dataset's Tags attribute. + /// + /// \returns const reference to attribute's value (empty string if not + /// present) + /// + const std::string& Tags(void) const; + + /// \brief Fetches the value of dataset's TimeStampedName attribute. + /// + /// \returns const reference to attribute's value (empty string if not + /// present) + /// + const std::string& TimeStampedName(void) const; + + /// \brief Fetches the value of dataset's UniqueId attribute. 
+ /// + /// \returns const reference to attribute's value (empty string if not + /// present) + /// + const std::string& UniqueId(void) const; + + /// \brief Fetches the value of dataset's Version attribute. + /// + /// \returns const reference to attribute's value (empty string if not + /// present) + /// + const std::string& Version(void) const; + + /// \} + +public: + /// \name DataSet Type + /// \{ + + /// \brief Fetches the dataset's type. + /// + /// \returns dataset type enum + /// + PacBio::BAM::DataSet::TypeEnum Type(void) const; + + /// \brief Fetches the dataset's type, as a printable name. + /// + /// \returns printable dataset type + /// + std::string TypeName(void) const; + + /// \} + +public: + /// \name Child Elements + /// \{ + + /// \brief Fetches the dataset's Extensions element. + /// + /// \returns const reference to child element + /// \throws std::runtime_error if element does not exist + /// + const PacBio::BAM::Extensions& Extensions(void) const; + + /// \brief Fetches the dataset's ExternalResources element. + /// + /// \returns const reference to child element + /// \throws std::runtime_error if element does not exist + /// + const PacBio::BAM::ExternalResources& ExternalResources(void) const; + + /// \brief Fetches the dataset's Filters element. + /// + /// \returns const reference to child element + /// + const PacBio::BAM::Filters& Filters(void) const; + + /// \brief Fetches the dataset's DataSetMetadata element. + /// + /// \returns const reference to child element + /// + const PacBio::BAM::DataSetMetadata& Metadata(void) const; + + /// \brief Fetches the dataset's DataSets element. + /// + /// \returns const reference to child element + /// + const PacBio::BAM::SubDataSets& SubDataSets(void) const; + + /// \} + +public: + /// \name Resource Handling + /// \{ + + /// \brief Returns this dataset's primary %BAM resources, with relative + /// filepaths already resolved.
+ /// + /// Primary resources are those listed as top-level %ExternalResources, not + /// associated files (indices, references, scraps %BAMs, etc.). + /// + /// \returns vector of BamFiles + /// + /// \sa DataSet::ResolvedResourceIds + /// + std::vector BamFiles(void) const; + + /// \brief Returns all primary external resource filepaths, with relative + /// paths resolved. + /// + /// Primary resources are those listed as top-level %ExternalResources, not + /// associated files (indices, references, scraps %BAMs, etc.). + /// + /// \sa ResolvePath + /// + /// \returns resourceIds + /// + std::vector ResolvedResourceIds(void) const; + + /// \brief Resolves a filepath (that may be relative to the dataset). + /// + /// A DataSet's resources may be described using absolute filepaths or with + /// relative paths. For absolute paths, nothing is changed from the input. + /// For relative paths, these are resolved using the DataSet's own path + /// as a starting point. A DataSet's own path will be one of:\n + /// 1 - the location of its XML or %BAM input file, e.g. created using + /// DataSet("foo.xml") or DataSet("foo.bam")\n + /// 2 - application's current working directory for all other DataSet + /// construction methods { DataSet(), DataSet(type), + /// DataSet("foo.fofn") }\n + /// + /// \param[in] originalPath input file path (absolute or relative) + /// \returns resolved path + /// + std::string ResolvePath(const std::string& originalPath) const; + + /// \returns sequence chemistry info for all read groups in this dataset + /// + /// \sa ReadGroupInfo::SequencingChemistry + /// + std::set SequencingChemistries(void) const; + + /// \} + +public: + /// \name XML Namespace Handling + /// \{ + + /// \brief Access this dataset's namespace info. 
+ /// + /// \returns const reference to dataset's NamespaceRegistry + /// + const NamespaceRegistry& Namespaces(void) const; + + /// \} + +public: + /// \name Attributes + /// \{ + + /// \brief Fetches the value of a DataSet root element's attribute. + /// + /// These are the attributes attached to the root dataset element: \n + /// \verbatim \endverbatim + /// + /// Built-in accessors exist for the standard attributes (e.g. CreatedAt) + /// but additional attributes can be used as well via these generic methods. + /// + /// A new attribute will be created if it does not yet exist. + /// + /// \param[in] name root element's attribute name + /// \returns non-const reference to attribute's value (empty string if this + /// is a new attribute) + /// + std::string& Attribute(const std::string& name); + + /// \brief Fetches the value of dataset's CreatedAt attribute. + /// + /// This attribute will be created if it does not yet exist. + /// + /// \returns non-const reference to attribute's value (empty string if this + /// is a new attribute) + /// + std::string& CreatedAt(void); + + /// \brief Fetches the value of dataset's Format attribute. + /// + /// This attribute will be created if it does not yet exist. + /// + /// \returns non-const reference to attribute's value (empty string if this + /// is a new attribute) + /// + std::string& Format(void); + + /// \brief Fetches the value of dataset's MetaType attribute. + /// + /// This attribute will be created if it does not yet exist. + /// + /// \returns non-const reference to attribute's value (empty string if this + /// is a new attribute) + /// + std::string& MetaType(void); + + /// \brief Fetches the value of dataset's ModifiedAt attribute. + /// + /// This attribute will be created if it does not yet exist. + /// + /// \returns non-const reference to attribute's value (empty string if this + /// is a new attribute) + /// + std::string& ModifiedAt(void); + + /// \brief Fetches the value of dataset's Name attribute. 
+ /// + /// This attribute will be created if it does not yet exist. + /// + /// \returns non-const reference to attribute's value (empty string if this + /// is a new attribute) + /// + std::string& Name(void); + + /// \brief Fetches the value of dataset's ResourceId attribute. + /// + /// This attribute will be created if it does not yet exist. + /// + /// \returns non-const reference to attribute's value (empty string if this + /// is a new attribute) + /// + std::string& ResourceId(void); + + /// \brief Fetches the value of dataset's Tags attribute. + /// + /// This attribute will be created if it does not yet exist. + /// + /// \returns non-const reference to attribute's value (empty string if this + /// is a new attribute) + /// + std::string& Tags(void); + + /// \brief Fetches the value of dataset's TimeStampedName attribute. + /// + /// This attribute will be created if it does not yet exist. + /// + /// \returns non-const reference to attribute's value (empty string if this + /// is a new attribute) + /// + std::string& TimeStampedName(void); + + /// \brief Fetches the value of dataset's UniqueId attribute. + /// + /// This attribute will be created if it does not yet exist. + /// + /// \returns non-const reference to attribute's value (empty string if this + /// is a new attribute) + /// + std::string& UniqueId(void); + + /// \brief Fetches the value of dataset's Version attribute. + /// + /// This attribute will be created if it does not yet exist. + /// + /// \returns non-const reference to attribute's value (empty string if this + /// is a new attribute) + /// + std::string& Version(void); + + /// \} + +public: + /// \name Attributes + /// \{ + + /// \brief Sets this dataset's XML attribute \p name, with \p value + /// + /// These are the attributes attached to the root dataset element: \n + /// \verbatim \endverbatim + /// + /// Built-in accessors exist for the standard attributes (e.g. 
CreatedAt) + /// but additional attributes can be used as well via these generic methods. + /// + /// The attribute will be created if it does not yet exist. + /// + /// \param[in] name root element's attribute name + /// \param[in] value new value for the attribute + /// \returns reference to this dataset object + /// + DataSet& Attribute(const std::string& name, const std::string& value); + + /// \brief Sets this dataset's CreatedAt attribute. + /// + /// This attribute will be created if it does not yet exist. + /// + /// \param[in] createdAt new value for the attribute + /// \returns reference to this dataset object + /// + DataSet& CreatedAt(const std::string& createdAt); + + /// \brief Sets this dataset's Format attribute. + /// + /// This attribute will be created if it does not yet exist. + /// + /// \param[in] format new value for the attribute + /// \returns reference to this dataset object + /// + DataSet& Format(const std::string& format); + + /// \brief Sets this dataset's MetaType attribute. + /// + /// This attribute will be created if it does not yet exist. + /// + /// \param[in] metatype new value for the attribute + /// \returns reference to this dataset object + /// + DataSet& MetaType(const std::string& metatype); + + /// \brief Sets this dataset's ModifiedAt attribute. + /// + /// This attribute will be created if it does not yet exist. + /// + /// \param[in] modifiedAt new value for the attribute + /// \returns reference to this dataset object + /// + DataSet& ModifiedAt(const std::string& modifiedAt); + + /// \brief Sets this dataset's Name attribute. + /// + /// This attribute will be created if it does not yet exist. + /// + /// \param[in] name new value for the attribute + /// \returns reference to this dataset object + /// + DataSet& Name(const std::string& name); + + /// \brief Sets this dataset's ResourceId attribute. + /// + /// This attribute will be created if it does not yet exist. 
+ /// + /// \param[in] resourceId new value for the attribute + /// \returns reference to this dataset object + /// + DataSet& ResourceId(const std::string& resourceId); + + /// \brief Sets this dataset's Tags attribute. + /// + /// This attribute will be created if it does not yet exist. + /// + /// \param[in] tags new value for the attribute + /// \returns reference to this dataset object + /// + DataSet& Tags(const std::string& tags); + + /// \brief Sets this dataset's TimeStampedName attribute. + /// + /// This attribute will be created if it does not yet exist. + /// + /// \param[in] timeStampedName new value for the attribute + /// \returns reference to this dataset object + /// + DataSet& TimeStampedName(const std::string& timeStampedName); + + /// \brief Sets this dataset's UniqueId attribute. + /// + /// This attribute will be created if it does not yet exist. + /// + /// \param[in] uuid new value for the attribute + /// \returns reference to this dataset object + /// + DataSet& UniqueId(const std::string& uuid); + + /// \brief Sets this dataset's Version attribute. + /// + /// This attribute will be created if it does not yet exist. + /// + /// \param[in] version new value for the attribute + /// \returns reference to this dataset object + /// + DataSet& Version(const std::string& version); + + /// \} + +public: + /// \name DataSet Type + /// \{ + + /// \brief Edits dataset type. + /// + /// \param[in] type new dataset type + /// \returns reference to this dataset object + /// + DataSet& Type(const PacBio::BAM::DataSet::TypeEnum type); + + /// \} + +public: + /// \name Child Elements + /// \{ + + /// \brief Fetches the dataset's Extensions element. + /// + /// This element will be created if it does not yet exist. + /// + /// \returns non-const reference to child element + /// + PacBio::BAM::Extensions& Extensions(void); + + /// \brief Fetches the dataset's ExternalResources element. + /// + /// This element will be created if it does not yet exist. 
+ /// + /// \returns non-const reference to child element + /// + PacBio::BAM::ExternalResources& ExternalResources(void); + + /// \brief Fetches the dataset's Filters element. + /// + /// This element will be created if it does not yet exist. + /// + /// \returns non-const reference to child element + /// + PacBio::BAM::Filters& Filters(void); + + /// \brief Fetches the dataset's DataSetMetadata element. + /// + /// This element will be created if it does not yet exist. + /// + /// \returns non-const reference to child element + /// + PacBio::BAM::DataSetMetadata& Metadata(void); + + /// \brief Fetches the dataset's DataSets element. + /// + /// This element will be created if it does not yet exist. + /// + /// \returns non-const reference to child element + /// + PacBio::BAM::SubDataSets& SubDataSets(void); + + /// \} + +public: + /// \name Child Elements + /// \{ + + /// \brief Sets this dataset's Extensions element. + /// + /// This element will be created if it does not yet exist. + /// + /// \param[in] extensions new value for the element + /// \returns reference to this dataset object + /// + DataSet& Extensions(const PacBio::BAM::Extensions& extensions); + + /// \brief Sets this dataset's ExternalResources element. + /// + /// This element will be created if it does not yet exist. + /// + /// \param[in] resources new value for the element + /// \returns reference to this dataset object + /// + DataSet& ExternalResources(const PacBio::BAM::ExternalResources& resources); + + /// \brief Sets this dataset's Filters element. + /// + /// This element will be created if it does not yet exist. + /// + /// \param[in] filters new value for the element + /// \returns reference to this dataset object + /// + DataSet& Filters(const PacBio::BAM::Filters& filters); + + /// \brief Sets this dataset's DataSetMetadata element. + /// + /// This element will be created if it does not yet exist. 
+ /// + /// \param[in] metadata new value for the element + /// \returns reference to this dataset object + /// + DataSet& Metadata(const PacBio::BAM::DataSetMetadata& metadata); + + /// \brief Sets this dataset's DataSets element. + /// + /// This element will be created if it does not yet exist. + /// + /// \param[in] subdatasets new value for the element + /// \returns reference to this dataset object + /// + DataSet& SubDataSets(const PacBio::BAM::SubDataSets& subdatasets); + + /// \} + +public: + /// \name XML Namespace Handling + /// \{ + + /// \brief Access this dataset's namespace info. + /// + /// \returns non-const reference to dataset's NamespaceRegistry + /// + NamespaceRegistry& Namespaces(void); + + /// \} + +private: + std::unique_ptr d_; + std::string path_; +}; + +/// \name DataSet Timestamp Utilities +/// \{ + +/// \brief Fetches current time, in "DataSetXML format". +/// +/// \returns DataSetXML formatted timestamp +/// +/// \sa ToDataSetFormat +/// +PBBAM_EXPORT std::string CurrentTimestamp(void); + +/// \brief Converts a time_point to "DataSetXML-formatted" timestamp. +/// +/// This is the format used as a component of the DataSet::TimeStampedName +/// (yymmdd_HHmmssttt). +/// +/// \returns "DataSetXML-formatted" timestamp +/// +PBBAM_EXPORT std::string ToDataSetFormat(const std::chrono::system_clock::time_point& tp); + +/// \brief Converts a time_t to "DataSetXML-formatted" timestamp. +/// +/// This is the format used as a component of the DataSet::TimeStampedName +/// (yymmdd_HHmmssttt). +/// +/// \returns "DataSetXML-formatted" timestamp +/// +PBBAM_EXPORT std::string ToDataSetFormat(const time_t& tp); + +/// \brief Converts a time_point to ISO-8601 formatted timestamp. +/// +/// This is the format used in DataSet::CreatedAt and DataSet::ModifiedAt.
+/// +/// \returns ISO-8601 formatted timestamp +/// +PBBAM_EXPORT std::string ToIso8601(const std::chrono::system_clock::time_point& tp); + +/// \brief Converts a time_t to ISO-8601 formatted timestamp. +/// +/// This is the format used in DataSet::CreatedAt and DataSet::ModifiedAt. +/// +/// \returns ISO-8601 formatted timestamp +/// +PBBAM_EXPORT std::string ToIso8601(const time_t& t); + +/// \} + +} // namespace BAM +} // namespace PacBio + +#include "pbbam/internal/DataSet.inl" + +#endif // DATASET_H diff --git a/include/pbbam/DataSetTypes.h b/include/pbbam/DataSetTypes.h new file mode 100644 index 0000000..23df643 --- /dev/null +++ b/include/pbbam/DataSetTypes.h @@ -0,0 +1,904 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file DataSetTypes.h +/// \brief Defines the public DataSet component classes. +// +// Author: Derek Barnett + +#ifndef DATASETTYPES_H +#define DATASETTYPES_H + +#include "pbbam/BamFile.h" +#include "pbbam/Config.h" +#include "pbbam/DataSetXsd.h" +#include "pbbam/internal/DataSetBaseTypes.h" +#include + +namespace PacBio { +namespace BAM { + +/// \brief The DataSetMetadata class represents the %DataSetMetadata child +/// element in DataSetXML. +/// +/// A few top-level elements are built-in, but as pbbam is not primarily a +/// DataSetXML API, most of the metadata hierarchy needs to be manually managed. +/// +class PBBAM_EXPORT DataSetMetadata : public internal::DataSetElement +{ +public: + /// \name Constructors & Related Methods + /// \{ + + /// \brief Constructs a DataSetMetadata with required fields. + DataSetMetadata(const std::string& numRecords, + const std::string& totalLength); + + /// \} + +public: + /// \name Operators + /// \{ + + /// \brief Merges DataSetMetadata contents. + /// + /// Adds contents of \p other to this metadata object + /// + /// \param[in] other some other metadata to add to this one + /// \returns reference to this object + /// + DataSetMetadata& operator+=(const DataSetMetadata& other); + + /// \} + +public: + /// \name Child Elements + /// \{ + + /// \brief Fetches the text of the NumRecords element. 
+ /// + /// \returns const reference to element text (empty string if not present) + /// + const std::string& NumRecords(void) const; + + /// \brief Fetches the text of the TotalLength element. + /// + /// \returns const reference to element text (empty string if not present) + /// + const std::string& TotalLength(void) const; + + /// \brief Fetches the Provenance element. + /// + /// \returns const reference to child element + /// \throws std::runtime_error if element does not exist + /// + const PacBio::BAM::Provenance& Provenance(void) const; + + /// \} + +public: + /// \name Child Elements + /// \{ + + /// \brief Fetches the text of the NumRecords element. + /// + /// This element will be created if it does not yet exist. + /// + /// \returns non-const reference to element text + /// + std::string& NumRecords(void); + + /// \brief Fetches the text of the TotalLength element. + /// + /// This element will be created if it does not yet exist. + /// + /// \returns non-const reference to element text + /// + std::string& TotalLength(void); + + /// \brief Fetches Provenance element. + /// + /// This element will be created if it does not yet exist. + /// + /// \returns non-const reference to child element + /// + PacBio::BAM::Provenance& Provenance(void); + + /// \} + +public: + /// \name Child Elements + /// \{ + + /// \brief Sets the text of the NumRecords element. + /// + /// This element will be created if it does not yet exist. + /// + /// \returns reference to this metadata object + /// + DataSetMetadata& NumRecords(const std::string& numRecords); + + /// \brief Sets the text of the TotalLength element. + /// + /// This element will be created if it does not yet exist. + /// + /// \returns reference to this metadata object + /// + DataSetMetadata& TotalLength(const std::string& totalLength); + + /// \brief Sets the Provenance child element. + /// + /// This element will be created if it does not yet exist. 
+ /// + /// \returns reference to this metadata object + /// + DataSetMetadata& Provenance(const PacBio::BAM::Provenance& provenance); + + /// \} +}; + +/// \brief The ExtensionElement class represents an %ExtensionElement element in +/// DataSetXML. +/// +class PBBAM_EXPORT ExtensionElement : public internal::DataSetElement { +public: + ExtensionElement(void); +}; + +/// \brief The Extensions class represents an %Extensions element in DataSetXML. +/// +/// The Extensions element is essentially just a list of ExtensionElement +/// objects. +/// +class PBBAM_EXPORT Extensions : public internal::DataSetListElement +{ +public: + /// \brief Creates an empty extensions list. + Extensions(void); +}; + +class ExternalResources; + +/// \brief The ExternalResource class represents an %ExternalResource element in +/// DataSetXML. +/// +/// An ExternalResource can itself have a child element, ExternalResources, that +/// lists related files (e.g. index files). +/// +class PBBAM_EXPORT ExternalResource : public internal::IndexedDataType +{ +public: + /// \brief Creates an ExternalResource from a BamFile object. + /// + /// The metatype & resourceId are automatically set. + /// + ExternalResource(const BamFile& bamFile); + + /// \brief Creates an ExternalResource with provided \p metatype and + /// \p filename as resource ID. + /// + ExternalResource(const std::string& metatype, + const std::string& filename); + +public: + /// \brief Fetches the resource's ExternalResources child element. + /// + /// \returns const reference to child element + /// \throws std::runtime_error if element does not exist + /// + const PacBio::BAM::ExternalResources& ExternalResources(void) const; + +public: + /// \brief Fetches the resource's ExternalResources child element. + /// + /// This element will be created if it does not yet exist. 
+ /// + /// \returns non-const reference to child element + /// + PacBio::BAM::ExternalResources& ExternalResources(void); + + /// \brief Sets this resource's ExternalResources child element. + /// + /// This element will be created if it does not yet exist. + /// + /// \param[in] resources new value for the element + /// \returns reference to this resource object + /// + ExternalResource& ExternalResources(const PacBio::BAM::ExternalResources& resources); + +public: + /// \brief Converts an ExternalResource to a BamFile object + /// + /// \returns corresponding BamFile object for this ExternalResource + /// \throws std::runtime_error if fails to open %BAM file (e.g. does not + /// exist, not a %BAM file, etc.) + /// + /// \deprecated Use the results from DataSet::BamFiles instead. This method + /// cannot resolve relative filepaths and will be removed in the + /// near future. + /// + BamFile ToBamFile(void) const; +}; + +/// \brief The ExternalResources class represents an %ExternalResources element +/// in DataSetXML. +/// +/// The ExternalResources element is essentially just a list of ExternalResource +/// elements. +/// +class PBBAM_EXPORT ExternalResources : public internal::DataSetListElement +{ +public: + /// \brief Creates an empty resource list. + ExternalResources(void); + + /// \brief Merges \p other resource list with this one. + ExternalResources& operator+=(const ExternalResources& other); + +public: + /// \brief Adds an ExternalResource to this list. + void Add(const ExternalResource& ext); + + /// \brief Removes an ExternalResource from this list. + void Remove(const ExternalResource& ext); + +public: + /// \brief Converts resource list to BamFile objects. + /// + /// \deprecated Use DataSet::BamFiles instead. This method cannot resolve + /// relative filepaths and will be removed in the near future. + /// + std::vector BamFiles(void) const; +}; + +/// \brief The FileIndex class represents a %FileIndex element in DataSetXML. 
+/// +/// A FileIndex is used as an auxiliary to an ExternalResource, providing +/// information about a data file's index file (e.g. for %BAM files, *.bai or +/// *.pbi). +/// +class PBBAM_EXPORT FileIndex : public internal::InputOutputDataType +{ +public: + /// \brief Creates a FileIndex with provided \p metatype and \p filename as + /// resource ID. + /// + FileIndex(const std::string& metatype, + const std::string& filename); +}; + +/// \brief The FileIndices class represents a %FileIndices element in DataSetXML. +/// +/// The FileIndices element is essentially just a list of FileIndex elements, +/// providing information about a data file's index files (e.g. for %BAM files +/// this will usually be *.bai and/or *.pbi). +/// +class PBBAM_EXPORT FileIndices : public internal::DataSetListElement +{ +public: + /// \brief Creates an empty index list. + FileIndices(void); + +public: + /// \brief Adds a FileIndex to this list. + void Add(const FileIndex& index); + + /// \brief Removes a FileIndex from this list. + void Remove(const FileIndex& index); +}; + +/// \brief The Filter class represents a %Filter element in DataSetXML. +/// +/// The Filter element allows analysis pipelines to describe filters on data +/// that should be respected downstream, without needing to create filtered +/// intermediate files. +/// +/// A filter consists of a list of Property elements, each of which must be +/// passed (logical AND) to pass the filter, e.g. property1 && property2 && +/// property3. +/// +class PBBAM_EXPORT Filter : public internal::DataSetElement +{ +public: + /// \brief Creates an empty filter. + Filter(void); + +public: + /// \brief Fetches the filter's property list element. + /// + /// \returns const reference to child element + /// \throws std::runtime_error if element does not exist + /// + const PacBio::BAM::Properties& Properties(void) const; + +public: + /// \brief Fetches the filter's property list child element. 
+ /// + /// This element will be created if it does not yet exist. + /// + /// \returns non-const reference to child element + /// + PacBio::BAM::Properties& Properties(void); + + /// \brief Sets this filter's Properties child element. + /// + /// This element will be created if it does not yet exist. + /// + /// \param[in] properties new value for the element + /// \returns reference to this filter object + /// + Filter& Properties(const PacBio::BAM::Properties& properties); +}; + +/// \brief The Filters class represents a %Filters list element in DataSetXML. +/// +/// The Filters element is essentially a list of Filter elements. For analysis +/// purposes, each filter is evaluated separately (logical OR) to determine which +/// data passes, e.g. filter1 || filter2 || filter3. +/// +class PBBAM_EXPORT Filters : public internal::DataSetListElement +{ +public: + /// \brief Creates an empty filter list. + Filters(void); + + /// \brief Merges \p other filter list with this one. + Filters& operator+=(const Filters& other); + +public: + /// \brief Adds a filter to this list. + void Add(const Filter& filter); + + /// \brief Removes a filter from this list. + void Remove(const Filter& filter); +}; + +/// \brief The ParentTool class represents a %ParentTool element in DataSetXML. +/// +class PBBAM_EXPORT ParentTool : public internal::BaseEntityType { +public: + /// \brief Creates an empty %ParentTool element. + ParentTool(void); +}; + +/// \brief The Property class represents a %Property element in DataSetXML. +/// +/// A Property is the primary building block of %DataSetXML filtering. The +/// %Property element describes a data record's property (or field), some value, +/// and a comparison operator. +/// +/// For example, one could filter all %BAM records with a read accuracy at or +/// above 0.9.
In C++ this could be constructed like: +/// \code{.cpp} +/// Property p("accuracy", "0.9", ">="); +/// \endcode +/// +class PBBAM_EXPORT Property : public internal::DataSetElement +{ +public: + /// \brief Constructs a filter property. + Property(const std::string& name, + const std::string& value, + const std::string& op); + +public: + + /// \brief Fetches the value of property's Name attribute. + /// + /// \returns const reference to attribute value + /// + const std::string& Name(void) const; + + /// \brief Fetches the value of property's Operator attribute. + /// + /// \returns const reference to attribute value + /// + const std::string& Operator(void) const; + + /// \brief Fetches the value of property's Value attribute. + /// + /// \returns const reference to attribute value + /// + const std::string& Value(void) const; + +public: + + /// \brief Fetches the value of property's Name attribute. + /// + /// \returns non-const reference to attribute value + /// + std::string& Name(void); + + /// \brief Fetches the value of property's Operator attribute. + /// + /// \returns non-const reference to attribute value + /// + std::string& Operator(void); + + /// \brief Fetches the value of property's Value attribute. + /// + /// \returns non-const reference to attribute value + /// + std::string& Value(void); + +public: + /// \brief Sets this property's Name attribute. + /// + /// \param[in] name new value for the attribute + /// \returns reference to this property object + /// + Property& Name(const std::string& name); + + /// \brief Sets this property's Operator attribute. + /// + /// \param[in] op new value for the attribute + /// \returns reference to this property object + /// + Property& Operator(const std::string& op); + + /// \brief Sets this property's Value attribute.
+ /// + /// \param[in] value new value for the attribute + /// \returns reference to this property object + /// + Property& Value(const std::string& value); +}; + +/// \brief The Properties class represents a %Properties list element in +/// DataSetXML. +/// +/// The Properties element is essentially a list of Property elements. +/// +class PBBAM_EXPORT Properties : public internal::DataSetListElement +{ +public: + /// \brief Creates an empty property list. + Properties(void); + +public: + /// \brief Adds a property to this list. + void Add(const Property& property); + + /// \brief Removes a property from this list. + void Remove(const Property& property); +}; + +/// \brief The Provenance class represents a %Provenance element in DataSetXML. +/// +class PBBAM_EXPORT Provenance : public internal::DataSetElement +{ +public: + /// \brief Creates an empty provenance element. + Provenance(void); + +public: + /// \brief Fetches the value of CreatedBy attribute. + /// + /// \returns const reference to attribute value (empty string if not + /// present) + /// + const std::string& CreatedBy(void) const; + + /// \brief Fetches the value of CommonServicesInstanceId attribute. + /// + /// \returns const reference to attribute value (empty string if not + /// present) + /// + const std::string& CommonServicesInstanceId(void) const; + + /// \brief Fetches the value of CreatorUserId attribute. + /// + /// \returns const reference to attribute value (empty string if not + /// present) + /// + const std::string& CreatorUserId(void) const; + + /// \brief Fetches the value of ParentJobId attribute. + /// + /// \returns const reference to attribute value (empty string if not + /// present) + /// + const std::string& ParentJobId(void) const; + + /// \brief Fetches the ParentTool child element.
+ /// + /// \returns const reference to child element + /// \throws std::runtime_error if element does not exist + /// + const PacBio::BAM::ParentTool& ParentTool(void) const; + +public: + + /// \brief Fetches the value of CreatedBy attribute. + /// + /// This attribute will be created if it does not yet exist. + /// + /// \returns non-const reference to attribute value (empty string if this is + /// a new attribute) + /// + std::string& CreatedBy(void); + + /// \brief Fetches the value of CommonServicesInstanceId attribute. + /// + /// This attribute will be created if it does not yet exist. + /// + /// \returns non-const reference to attribute value (empty string if this is + /// a new attribute) + /// + std::string& CommonServicesInstanceId(void); + + /// \brief Fetches the value of CreatorUserId attribute. + /// + /// This attribute will be created if it does not yet exist. + /// + /// \returns non-const reference to attribute value (empty string if this is + /// a new attribute) + /// + std::string& CreatorUserId(void); + + /// \brief Fetches the value of ParentJobId attribute. + /// + /// This attribute will be created if it does not yet exist. + /// + /// \returns non-const reference to attribute value (empty string if this is + /// a new attribute) + /// + std::string& ParentJobId(void); + + /// \brief Fetches the ParentTool child element. + /// + /// This element will be created if it does not yet exist. + /// + /// \returns non-const reference to child element + /// + PacBio::BAM::ParentTool& ParentTool(void); + +public: + + /// \brief Sets the CreatedBy attribute. + /// + /// This attribute will be created if it does not yet exist. + /// + /// \param[in] createdBy new value for the attribute + /// \returns reference to this object + /// + Provenance& CreatedBy(const std::string& createdBy); + + /// \brief Sets the CommonServicesInstanceId attribute. + /// + /// This attribute will be created if it does not yet exist. 
+ /// + /// \param[in] id new value for the attribute + /// \returns reference to this object + /// + Provenance& CommonServicesInstanceId(const std::string& id); + + /// \brief Sets the CreatorUserId attribute. + /// + /// This attribute will be created if it does not yet exist. + /// + /// \param[in] id new value for the attribute + /// \returns reference to this object + /// + Provenance& CreatorUserId(const std::string& id); + + /// \brief Sets the ParentJobId attribute. + /// + /// This attribute will be created if it does not yet exist. + /// + /// \param[in] id new value for the attribute + /// \returns reference to this object + /// + Provenance& ParentJobId(const std::string& id); + + /// \brief Sets the ParentTool child element. + /// + /// This element will be created if it does not yet exist. + /// + /// \param[in] tool new value for the element + /// \returns reference to this object + /// + Provenance& ParentTool(const PacBio::BAM::ParentTool& tool); +}; + +class SubDataSets; + +/// \brief The DataSetBase class provides the attributes & child elements shared +/// by all dataset types. +/// +/// Client code should not need to use this class directly. It should be +/// considered as more of an implementation detail and may in fact be removed +/// from public API in the future. The top-level DataSet is the recommended +/// entry point. +/// +class PBBAM_EXPORT DataSetBase : public internal::StrictEntityType +{ +public: + + /// \brief Creates a DataSetBase object, or one of its subclasses, from an + /// XML element name (e.g. SubreadSet) + /// + static std::shared_ptr<DataSetBase> Create(const std::string& typeName); + +public: + /// \brief Creates an empty, generic DataSetBase. + DataSetBase(void); + +protected: + /// \brief Creates a DataSetBase with key values initialized. 
+ DataSetBase(const std::string& metatype, + const std::string& label, + const XsdType& xsd); + + /// \brief Returns a new DataSetBase containing a deep copy of contents + DataSetBase* DeepCopy(void) const; + +public: + /// \brief Merges dataset contents. + /// + /// Adds contents of \p other to this dataset object + /// + /// \param[in] other some other dataset to add to this one + /// \returns reference to this dataset object + /// + DataSetBase& operator+=(const DataSetBase& other); + +public: + /// \brief Fetches the dataset's ExternalResources element. + /// + /// \returns const reference to child element + /// \throws std::runtime_error if element does not exist + /// + const PacBio::BAM::ExternalResources& ExternalResources(void) const; + + /// \brief Fetches the dataset's Filters element. + /// + /// \returns const reference to child element + /// + const PacBio::BAM::Filters& Filters(void) const; + + /// \brief Fetches the dataset's DataSetMetadata element. + /// + /// \returns const reference to child element + /// + const PacBio::BAM::DataSetMetadata& Metadata(void) const; + + /// \brief Fetches the dataset's DataSets element. + /// + /// \returns const reference to child element + /// + const PacBio::BAM::SubDataSets& SubDataSets(void) const; + +public: + /// \brief Access this dataset's namespace info. + /// + /// \returns const reference to dataset's NamespaceRegistry + /// + const NamespaceRegistry& Namespaces(void) const; + +public: + /// \brief Fetches the dataset's ExternalResources element. + /// + /// This element will be created if it does not yet exist. + /// + /// \returns non-const reference to child element + /// + PacBio::BAM::ExternalResources& ExternalResources(void); + + /// \brief Fetches the dataset's Filters element. + /// + /// This element will be created if it does not yet exist. 
+ /// + /// \returns non-const reference to child element + /// + PacBio::BAM::Filters& Filters(void); + + /// \brief Fetches the dataset's DataSetMetadata element. + /// + /// This element will be created if it does not yet exist. + /// + /// \returns non-const reference to child element + /// + PacBio::BAM::DataSetMetadata& Metadata(void); + + /// \brief Fetches the dataset's DataSets element. + /// + /// This element will be created if it does not yet exist. + /// + /// \returns non-const reference to child element + /// + PacBio::BAM::SubDataSets& SubDataSets(void); + +public: + /// \brief Sets this dataset's ExternalResources element. + /// + /// This element will be created if it does not yet exist. + /// + /// \param[in] resources new value for the element + /// \returns reference to this dataset object + /// + DataSetBase& ExternalResources(const PacBio::BAM::ExternalResources& resources); + + /// \brief Sets this dataset's Filters element. + /// + /// This element will be created if it does not yet exist. + /// + /// \param[in] filters new value for the element + /// \returns reference to this dataset object + /// + DataSetBase& Filters(const PacBio::BAM::Filters& filters); + + /// \brief Sets this dataset's DataSetMetadata element. + /// + /// This element will be created if it does not yet exist. + /// + /// \param[in] metadata new value for the element + /// \returns reference to this dataset object + /// + DataSetBase& Metadata(const PacBio::BAM::DataSetMetadata& metadata); + + /// \brief Sets this dataset's DataSets element. + /// + /// This element will be created if it does not yet exist. + /// + /// \param[in] subdatasets new value for the element + /// \returns reference to this dataset object + /// + DataSetBase& SubDataSets(const PacBio::BAM::SubDataSets& subdatasets); + +public: + /// \brief Access this dataset's namespace info. 
+ /// + /// \returns non-const reference to dataset's NamespaceRegistry + /// + NamespaceRegistry& Namespaces(void); + +private: + NamespaceRegistry registry_; +}; + +/// \brief The AlignmentSet class represents an %AlignmentSet root element in +/// DataSetXML. +/// +class PBBAM_EXPORT AlignmentSet : public DataSetBase +{ +public: + /// \brief Creates an empty AlignmentSet dataset. + AlignmentSet(void); +}; + +/// \brief The BarcodeSet class represents a %BarcodeSet root element in +/// DataSetXML. +/// +class PBBAM_EXPORT BarcodeSet : public DataSetBase +{ +public: + /// \brief Creates an empty BarcodeSet dataset. + BarcodeSet(void); +}; + +/// \brief The ConsensusAlignmentSet class represents a %ConsensusAlignmentSet +/// root element in DataSetXML. +/// +class PBBAM_EXPORT ConsensusAlignmentSet : public DataSetBase +{ +public: + /// \brief Creates an empty ConsensusAlignmentSet dataset. + ConsensusAlignmentSet(void); +}; + +/// \brief The ConsensusReadSet class represents a %ConsensusReadSet root +/// element in DataSetXML. +/// +class PBBAM_EXPORT ConsensusReadSet : public DataSetBase +{ +public: + /// \brief Creates an empty ConsensusReadSet dataset. + ConsensusReadSet(void); +}; + +/// \brief The ContigSet class represents a %ContigSet root element in +/// DataSetXML. +/// +class PBBAM_EXPORT ContigSet : public DataSetBase +{ +public: + /// \brief Creates an empty ContigSet dataset. + ContigSet(void); +}; + +/// \brief The HdfSubreadSet class represents a %HdfSubreadSet root element in +/// DataSetXML. +/// +class PBBAM_EXPORT HdfSubreadSet : public DataSetBase +{ +public: + /// \brief Creates an empty HdfSubreadSet dataset. + HdfSubreadSet(void); +}; + +/// \brief The ReferenceSet class represents a %ReferenceSet root element in +/// DataSetXML. +/// +class PBBAM_EXPORT ReferenceSet : public DataSetBase +{ +public: + /// \brief Creates an empty ReferenceSet dataset. 
+ ReferenceSet(void); +}; + +/// \brief The SubDataSets class represents a %DataSets list element in +/// DataSetXML. +/// +/// The SubDataSets element is essentially a list of DataSets. +/// +class PBBAM_EXPORT SubDataSets : public internal::DataSetListElement<DataSetBase> +{ +public: + /// \brief Creates an empty list of sub-datasets. + SubDataSets(void); + +public: + /// \brief Adds \p other sub-dataset to this list. + SubDataSets& operator+=(const DataSetBase& other); // single + + /// \brief Adds \p other sub-dataset list to this list. + SubDataSets& operator+=(const SubDataSets& other); // list + +public: + /// \brief Adds a sub-dataset to this list. + void Add(const DataSetBase& subdataset); + + /// \brief Removes a sub-dataset from this list. + void Remove(const DataSetBase& subdataset); +}; + +/// \brief The SubreadSet class represents a %SubreadSet root element in +/// DataSetXML. +/// +class PBBAM_EXPORT SubreadSet : public DataSetBase +{ +public: + /// \brief Creates an empty SubreadSet dataset. + SubreadSet(void); +}; + +} // namespace BAM +} // namespace PacBio + +#include "internal/DataSetTypes.inl" + +#endif // DATASETTYPES_H diff --git a/include/pbbam/DataSetXsd.h b/include/pbbam/DataSetXsd.h new file mode 100644 index 0000000..8d0ec38 --- /dev/null +++ b/include/pbbam/DataSetXsd.h @@ -0,0 +1,157 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. 
+// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file DataSetXsd.h +/// \brief Defines the XSD- and namespace-related classes for DataSetXML. +// +// Author: Derek Barnett + +#ifndef DATASETXSD_H +#define DATASETXSD_H + +#include "pbbam/Config.h" +#include <map> +#include <string> + +namespace PacBio { +namespace BAM { + +/// \brief The XsdType enum defines the supported XSD namespaces. +/// +enum class XsdType +{ + NONE + + , AUTOMATION_CONSTRAINTS + , BASE_DATA_MODEL + , COLLECTION_METADATA + , COMMON_MESSAGES + , DATA_MODEL + , DATA_STORE + , DATASETS + , DECL_DATA + , PART_NUMBERS + , PRIMARY_METRICS + , REAGENT_KIT + , RIGHTS_AND_ROLES + , SAMPLE_INFO + , SEEDING_DATA +}; + +/// \brief The NamespaceInfo class provides XML namespace info (prefix & URI). +/// +class PBBAM_EXPORT NamespaceInfo +{ +public: + /// \brief Creates an empty entry. 
+ /// + /// This constructor only exists for STL container compatibility. + /// + NamespaceInfo(void); + + /// \brief Creates a valid info entry. + NamespaceInfo(const std::string& name, + const std::string& uri); + +public: + /// \brief Fetches namespace name (i.e. prefix) + const std::string& Name(void) const { return name_; } + + /// \brief Fetches namespace URI. + const std::string& Uri(void) const { return uri_; } + +private: + std::string name_; + std::string uri_; +}; + +/// \brief The NamespaceRegistry class provides a per-dataset registry of XML +/// namespace information. +/// +/// This is used to format XML output - properly prefixing element labels with +/// namespace as appropriate. +/// +class PBBAM_EXPORT NamespaceRegistry +{ +public: + /// \name Constructors & Related Methods + /// \{ + + NamespaceRegistry(void); + NamespaceRegistry(const NamespaceRegistry& other); + NamespaceRegistry(NamespaceRegistry&& other); + NamespaceRegistry& operator=(const NamespaceRegistry& other); + NamespaceRegistry& operator=(NamespaceRegistry&& other); + ~NamespaceRegistry(void); + + /// \} + +public: + /// \name Registry Access + /// \{ + + /// \brief Fetches namespace info for the dataset's default XSD type. + const NamespaceInfo& DefaultNamespace(void) const; + + /// \brief Fetches dataset's default XSD type. + XsdType DefaultXsd(void) const; + + /// \brief Fetches namespace info for the requested XSD type. + const NamespaceInfo& Namespace(const XsdType& xsd) const; + + /// \brief Registers namespace info for a particular XSD type. + void Register(const XsdType& xsd, const NamespaceInfo& namespaceInfo); + + /// \brief Updates dataset's default XSD type. + void SetDefaultXsd(const XsdType& xsd); + + /// \brief Fetches the XSD type for \p elementLabel. + XsdType XsdForElement(const std::string& elementLabel) const; + + /// \brief Fetches the XSD type for a particular URI. 
+ XsdType XsdForUri(const std::string& uri) const; + + /// \} + +private: + std::map<XsdType, NamespaceInfo> data_; + XsdType defaultXsdType_; +}; + +} // namespace BAM +} // namespace PacBio + +#endif // DATASETXSD_H diff --git a/include/pbbam/EntireFileQuery.h b/include/pbbam/EntireFileQuery.h new file mode 100644 index 0000000..10c06ff --- /dev/null +++ b/include/pbbam/EntireFileQuery.h @@ -0,0 +1,97 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file EntireFileQuery.h +/// \brief Defines the EntireFileQuery class. +// +// Author: Derek Barnett + +#ifndef ENTIREFILEQUERY_H +#define ENTIREFILEQUERY_H + +#include "pbbam/internal/QueryBase.h" +#include <memory> + +namespace PacBio { +namespace BAM { + +/// \brief The EntireFileQuery class provides iterable access to a DataSet's +/// %BAM records, reading through the entire contents of each file. +/// +/// Input files will be accessed in the order listed in the DataSet. +/// +/// \include code/EntireFileQuery.txt +/// +/// Iteration is not limited to only 'const' records. The files themselves will +/// not be affected, but individual records may be modified if needed. +/// +/// \include code/EntireFileQuery_NonConst.txt +/// +/// \note DataSets can be implicitly constructed from %BAM filenames as well. +/// Thus a single %BAM file can be read through using the following: +/// +/// \include code/EntireFileQuery_BamFilename.txt +/// +class PBBAM_EXPORT EntireFileQuery : public internal::IQuery +{ +public: + /// \brief Creates a new EntireFileQuery, reading through the entire + /// contents of a dataset. + /// + /// \param[in] dataset input data source(s) + /// \throws std::runtime_error on failure to open/read underlying %BAM + /// files. + /// + EntireFileQuery(const PacBio::BAM::DataSet& dataset); + ~EntireFileQuery(void); + +public: + /// \brief Main iteration point for record access. 
+ /// + /// Most client code should not need to use this method directly. Use + /// iterators instead. + /// + bool GetNext(BamRecord& r); + +private: + struct EntireFileQueryPrivate; + std::unique_ptr<EntireFileQueryPrivate> d_; +}; + +} // namespace BAM +} // namespace PacBio + +#endif // ENTIREFILEQUERY_H diff --git a/include/pbbam/FastaReader.h b/include/pbbam/FastaReader.h new file mode 100644 index 0000000..dc19e53 --- /dev/null +++ b/include/pbbam/FastaReader.h @@ -0,0 +1,113 @@ +// Copyright (c) 2016, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file FastaReader.h +/// \brief Defines the FastaReader class. +// +// Author: Derek Barnett + +#ifndef FASTAREADER_H +#define FASTAREADER_H + +#include "pbbam/FastaSequence.h" +#include <memory> +#include <vector> + +namespace PacBio { +namespace BAM { + +namespace internal { struct FastaReaderPrivate; } + +/// +/// \brief The FastaReader provides sequential access to FASTA records. +/// +class FastaReader +{ +public: + /// + /// \brief Reads all FASTA sequences from a file + /// + /// \param fn FASTA filename + /// \return vector of FastaSequence results + /// + static std::vector<FastaSequence> ReadAll(const std::string& fn); + +public: + /// \name Constructors & Related Methods + /// \{ + + explicit FastaReader(const std::string& fn); + FastaReader(FastaReader&& other); + FastaReader& operator=(FastaReader&& other); + ~FastaReader(void); + + // copy is disabled + FastaReader(const FastaReader&) = delete; + FastaReader& operator=(const FastaReader&) = delete; + + /// \} + +public: + /// \name Sequence Access + /// \{ + + /// + /// \brief GetNext + /// + /// \code{.cpp} + /// + /// FastaReader reader{ fn }; + /// FastaSequence f; + /// while (reader.GetNext(f)) { + /// // do stuff with f + /// } + /// \endcode + /// + /// \param[out] record + /// \return success/failure + /// + bool GetNext(FastaSequence& record); + + /// \} + +private: + std::unique_ptr<internal::FastaReaderPrivate> d_; +}; + +} // namespace BAM +} // namespace PacBio + +#endif // 
FASTAREADER_H diff --git a/include/pbbam/FastaSequence.h b/include/pbbam/FastaSequence.h new file mode 100644 index 0000000..7748506 --- /dev/null +++ b/include/pbbam/FastaSequence.h @@ -0,0 +1,103 @@ +// Copyright (c) 2016, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. 
+// +// File Description +/// \file FastaSequence.h +/// \brief Defines the FastaSequence class. +// +// Author: Derek Barnett + +#ifndef FASTASEQUENCE_H +#define FASTASEQUENCE_H + +#include <string> + +namespace PacBio { +namespace BAM { + +/// +/// \brief The FastaSequence class represents a FASTA record (name & bases) +/// +class FastaSequence +{ +public: + /// \name Constructors & Related Methods + /// \{ + + /// + /// \brief FastaSequence + /// \param name + /// \param bases + /// + explicit FastaSequence(std::string name, std::string bases); + + FastaSequence(void) = default; + FastaSequence(const FastaSequence&) = default; + FastaSequence(FastaSequence&&) = default; + FastaSequence& operator=(const FastaSequence&) = default; + FastaSequence& operator=(FastaSequence&&) = default; + ~FastaSequence(void) = default; + + /// \} + +public: + /// \name Attributes + /// \{ + + /// + /// \brief Name + /// \return + /// + std::string Name(void) const; + + /// + /// \brief Bases + /// \return + /// + std::string Bases(void) const; + + /// \} + +private: + std::string name_; + std::string bases_; +}; + +} // namespace BAM +} // namespace PacBio + +#include "internal/FastaSequence.inl" + +#endif // FASTASEQUENCE_H diff --git a/include/pbbam/FrameEncodingType.h b/include/pbbam/FrameEncodingType.h new file mode 100644 index 0000000..3b5a52b --- /dev/null +++ b/include/pbbam/FrameEncodingType.h @@ -0,0 +1,66 @@ +// Copyright (c) 2016, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file FrameEncodingType.h +/// \brief Defines the FrameEncodingType enum. +// +// Author: Derek Barnett + +#ifndef FRAMEENCODINGTYPE_H +#define FRAMEENCODINGTYPE_H + +namespace PacBio { +namespace BAM { + +/// \brief This enum defines the possible encoding modes used in Frames data +/// (e.g. BamRecord::IPD or BamRecord::PulseWidth). +/// +/// The LOSSY mode is the default in production output; LOSSLESS mode +/// being used primarily for internal applications. +/// +/// \sa https://github.com/PacificBiosciences/PacBioFileFormats/blob/3.0/BAM.rst +/// for more information on pulse frame encoding schemes. 
+/// +enum class FrameEncodingType +{ + LOSSY ///< 8-bit compression (using CodecV1) of frame data + , LOSSLESS ///< 16-bit native frame data +}; + +} // namespace BAM +} // namespace PacBio + +#endif // FRAMEENCODINGTYPE_H diff --git a/include/pbbam/Frames.h b/include/pbbam/Frames.h new file mode 100644 index 0000000..326701b --- /dev/null +++ b/include/pbbam/Frames.h @@ -0,0 +1,187 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file Frames.h +/// \brief Defines the Frames class. +// +// Author: Derek Barnett + +#ifndef FRAMES_H +#define FRAMES_H + +#include "pbbam/Config.h" +#include <vector> + +namespace PacBio { +namespace BAM { + +/// \brief The Frames class represents pulse frame data. +/// +/// Frame data may be stored in either their raw, 16-bit values or +/// using a lossy, 8-bit compression scheme. +/// +/// This class is used to store the data and convert between the 2 storage types. +/// +class PBBAM_EXPORT Frames +{ +public: + /// \name Conversion Methods + /// \{ + + /// \brief Constructs a Frames object from encoded (lossy, 8-bit) data. + /// + /// \note This method should probably not be needed often by client code + /// working with frame data. It exists primarily for (internal) + /// parsing & interpretation of the %BAM file contents. The method is + /// available, though, should the conversion operation be needed. + /// + /// \param[in] codedData encoded data + /// \returns Frames object + /// + static Frames Decode(const std::vector<uint8_t>& codedData); + + /// \brief Creates encoded, compressed frame data from raw input data. 
+ /// + /// \param[in] frames raw frame data + /// \returns lossy, 8-bit encoded frame data + /// + static std::vector<uint8_t> Encode(const std::vector<uint16_t>& frames); + + /// \} + +public: + /// \name Constructors & Related Methods + /// \{ + + Frames(void); + Frames(const std::vector<uint16_t>& frames); + Frames(std::vector<uint16_t>&& frames); + Frames(const Frames& other); + Frames(Frames&& other); + Frames& operator=(const Frames& other); + Frames& operator=(Frames&& other); + ~Frames(void); + + /// \} + +public: + /// \name Access Data + /// \{ + + /// \returns Frame data in expanded (not encoded) form + std::vector<uint16_t>& DataRaw(void); + const std::vector<uint16_t>& Data(void) const; + + /// \} + +public: + /// \name Conversion Methods + /// \{ + + /// \returns Frame data in (lossy, 8-bit) encoded form. + std::vector<uint8_t> Encode(void) const; + + /// \} + +public: + /// \name Comparison Operators + /// \{ + + bool operator==(const Frames& other) const; + bool operator!=(const Frames& other) const; + + /// \} + +public: + /// \name STL Compatibility + /// \{ + + /// \returns A const_iterator to the beginning of the sequence. + std::vector<uint16_t>::const_iterator cbegin(void) const; + + /// \returns A const_iterator to the element past the end of the sequence. + std::vector<uint16_t>::const_iterator cend(void) const; + + /// \returns A const_iterator to the beginning of the sequence. + std::vector<uint16_t>::const_iterator begin(void) const; + + /// \returns A const_iterator to the element past the end of the sequence. + std::vector<uint16_t>::const_iterator end(void) const; + + /// \returns An iterator to the beginning of the sequence. + std::vector<uint16_t>::iterator begin(void); + + /// \returns An iterator to the element past the end of the sequence. + std::vector<uint16_t>::iterator end(void); + + /// \returns The number of frame data points. + size_t size(void) const; + + /// \returns True if the container is empty, false otherwise. + bool empty(void) const; + + /// \} + +public: + /// \name Access Data + /// \{ + + /// Sets this record's data. 
+ /// + /// \param[in] frames data in expanded (not encoded) form + /// \returns reference to this object + /// + Frames& Data(const std::vector<uint16_t>& frames); + + /// Sets this record's data. + /// + /// \param[in] frames data in expanded (not encoded) form + /// \returns reference to this object + /// + Frames& Data(std::vector<uint16_t>&& frames); + + /// \} + +private: + std::vector<uint16_t> data_; +}; + +} // namespace BAM +} // namespace PacBio + +#include "pbbam/internal/Frames.inl" + +#endif // FRAMES_H diff --git a/include/pbbam/GenomicInterval.h b/include/pbbam/GenomicInterval.h new file mode 100644 index 0000000..a7d4986 --- /dev/null +++ b/include/pbbam/GenomicInterval.h @@ -0,0 +1,188 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file GenomicInterval.h +/// \brief Defines the GenomicInterval class. +// +// Author: Derek Barnett + +#ifndef GENOMICINTERVAL_H +#define GENOMICINTERVAL_H + +#include "pbbam/Config.h" +#include "pbbam/Interval.h" +#include "pbbam/Position.h" +#include + +namespace PacBio { +namespace BAM { + +/// \brief The GenomicInterval class represents a genomic interval (reference +/// name and 0-based coordinates). +/// +class PBBAM_EXPORT GenomicInterval +{ +public: + /// \name Constructors & Related Methods + /// \{ + + /// \brief Creates an empty genomic interval + GenomicInterval(void); + + /// \brief Creates a genomic interval on sequence with \p name, using range: + /// [\p start, \p stop) + GenomicInterval(const std::string& name, + const Position& start, + const Position& stop); + + /// \brief Creates a genomic interval, using REGION string + /// + /// ":-" ("chr8:200-600") + /// + /// \note The htslib/samtools REGION string expects start positions to be + /// 1-based. However, throughout pbbam (including the rest of this + /// class), we stick to 0-based start coordinates. Thus, while the + /// syntax matches that of samtools, we are using a 0-based start + /// coordinate here. 
+ /// + GenomicInterval(const std::string& zeroBasedRegionString); + + GenomicInterval(const GenomicInterval& other); + GenomicInterval& operator=(const GenomicInterval& other); + + ~GenomicInterval(void); + + /// \} + +public: + /// \name Comparison Operators + /// \{ + + /// \returns true if same id & underlying interval + bool operator==(const GenomicInterval& other) const; + + /// \returns true if either ids or underlying intervals differ + bool operator!=(const GenomicInterval& other) const; + + /// \} + +public: + /// \name Interval Operations + /// \{ + + /// \returns true if same id and underlying Interval::CoveredBy() other. + bool CoveredBy(const GenomicInterval& other) const; + + /// \returns true if same id and underlying Interval::Covers() other. + bool Covers(const GenomicInterval& other) const; + + /// \returns true if same id and underlying Interval::Intersects() other. + bool Intersects(const GenomicInterval& other) const; + + /// \returns true if underlying Interval::IsValid(), and id/endpoints are + /// non-negative. + /// + bool IsValid(void) const; + + /// \returns length of underlying + size_t Length(void) const; + + /// \} + + +public: + /// \name Attributes + /// \{ + + /// \returns interval reference name + std::string Name(void) const; + + /// \returns underlying Interval object + PacBio::BAM::Interval Interval(void) const; + + /// \returns interval start coordinate + Position Start(void) const; + + /// \returns interval stop coordinate + Position Stop(void) const; + + /// \} + +public: + /// \name Attributes + /// \{ + + /// Sets this interval's reference name. + /// + /// \param[in] name + /// \returns reference to this interval + /// + GenomicInterval& Name(const std::string& name); + + /// Sets this underlying Interval + /// + /// \param[in] interval + /// \returns reference to this interval + /// + GenomicInterval& Interval(const PacBio::BAM::Interval& interval); + + /// Sets this interval's start coordinate. 
+ /// + /// \param[in] start + /// \returns reference to this interval + /// + GenomicInterval& Start(const Position start); + + /// Sets this interval's stop coordinate. + /// + /// \param[in] stop + /// \returns reference to this interval + /// + GenomicInterval& Stop(const Position stop); + + /// \} + +private: + std::string name_; + PacBio::BAM::Interval interval_; +}; + +} // namespace BAM +} // namespace PacBio + +#include "pbbam/internal/GenomicInterval.inl" + +#endif // GENOMICINTERVAL_H diff --git a/include/pbbam/GenomicIntervalQuery.h b/include/pbbam/GenomicIntervalQuery.h new file mode 100644 index 0000000..7df7721 --- /dev/null +++ b/include/pbbam/GenomicIntervalQuery.h @@ -0,0 +1,112 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED.
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file GenomicIntervalQuery.h +/// \brief Defines the GenomicIntervalQuery class. +// +// Author: Derek Barnett + +#ifndef GENOMICINTERVALQUERY_H +#define GENOMICINTERVALQUERY_H + +#include "pbbam/GenomicInterval.h" +#include "pbbam/internal/QueryBase.h" +#include + +namespace PacBio { +namespace BAM { + +/// \brief The GenomicIntervalQuery class provides iterable access to a +/// DataSet's %BAM records, limiting results to those overlapping a +/// GenomicInterval. +/// +/// Example: +/// \include code/GenomicIntervalQuery.txt +/// +/// \note Currently, all %BAM files must have a corresponding ".bai" index file. +/// Use BamFile::EnsureStandardIndexExists before creating the query if +/// one may not be present. +/// +class PBBAM_EXPORT GenomicIntervalQuery : public internal::IQuery +{ +public: + + /// \brief Constructs a new GenomicIntervalQuery, limiting record results to + /// only those overlapping a GenomicInterval. + /// + /// \param[in] interval genomic interval of interest + /// \param[in] dataset input data source(s) + /// + /// \throws std::runtime_error on failure to open/read underlying %BAM or + /// BAI files. + /// + GenomicIntervalQuery(const GenomicInterval& interval, + const PacBio::BAM::DataSet& dataset); + ~GenomicIntervalQuery(void); + +public: + /// \brief Main iteration point for record access. + /// + /// Most client code should not need to use this method directly.
Use + /// iterators instead. + /// + bool GetNext(BamRecord& r); + +public: + /// \brief Sets a new genomic interval. + /// + /// This allows the same dataset/query to be re-used over multiple regions of + /// interest: + /// + /// \include code/GenomicIntervalQuery_Reuse.txt + /// + /// \param[in] interval new genomic interval + /// \returns reference to this query + /// + GenomicIntervalQuery& Interval(const GenomicInterval& interval); + + /// \returns Current genomic interval active on this query. + const GenomicInterval& Interval(void) const; + +private: + struct GenomicIntervalQueryPrivate; + std::unique_ptr d_; +}; + +} // namespace BAM +} // namespace PacBio + +#endif // GENOMICINTERVALQUERY_H diff --git a/include/pbbam/IRecordWriter.h b/include/pbbam/IRecordWriter.h new file mode 100644 index 0000000..9acf1db --- /dev/null +++ b/include/pbbam/IRecordWriter.h @@ -0,0 +1,92 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE.
THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file IRecordWriter.h +/// \brief Defines the IRecordWriter interface. +// +// Author: Derek Barnett + +#ifndef IRECORDWRITER_H +#define IRECORDWRITER_H + +namespace PacBio { +namespace BAM { + +class BamRecord; +class BamRecordImpl; + +class IRecordWriter +{ +public: + virtual ~IRecordWriter(void); + +public: + + /// \brief Try to flush any buffered data to file. + /// + /// \note The underlying implementation may not necessarily flush buffered + /// data immediately, especially in a multithreaded writer situation. + /// Let the writer go out of scope to fully ensure flushing. + /// + /// \throws std::runtime_error if flush fails + /// + virtual void TryFlush(void) =0; + + + /// \brief Write a record to the output %BAM file. + /// + /// \param[in] record BamRecord object + /// + /// \throws std::runtime_error on failure to write + /// + virtual void Write(const BamRecord& record) =0; + + /// \brief Write a record to the output %BAM file. 
+ /// + /// \param[in] recordImpl BamRecordImpl object + /// + /// \throws std::runtime_error on failure to write + /// + virtual void Write(const BamRecordImpl& recordImpl) =0; + +protected: + IRecordWriter(void); +}; + +} // namespace BAM +} // namespace PacBio + +#endif // IRECORDWRITER_H diff --git a/include/pbbam/IndexedFastaReader.h b/include/pbbam/IndexedFastaReader.h new file mode 100644 index 0000000..b382d96 --- /dev/null +++ b/include/pbbam/IndexedFastaReader.h @@ -0,0 +1,170 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file IndexedFastaReader.h +/// \brief Defines the IndexedFastaReader class. +// +// Author: David Alexander + +#ifndef INDEXEDFASTAREADER_H +#define INDEXEDFASTAREADER_H + +#include "pbbam/Orientation.h" +#include "pbbam/Position.h" +#include +#include +#include +#include + +namespace PacBio { +namespace BAM { + +class GenomicInterval; +class BamRecord; + +/// \brief The IndexedFastaReader class provides random-access to FASTA file +/// data. +/// +class IndexedFastaReader { + +public: + /// \name Constructors & Related Methods + /// \{ + + IndexedFastaReader(void) = delete; + IndexedFastaReader(const std::string& filename); + IndexedFastaReader(const IndexedFastaReader& src); + IndexedFastaReader& operator=(const IndexedFastaReader& rhs); + ~IndexedFastaReader(void); + + /// \} + +public: + /// \name Sequence Access + /// \{ + + /// \brief Fetches FASTA sequence for desired interval. + /// + /// \param[in] id reference sequence name + /// \param[in] begin start position + /// \param[in] end end position + /// + /// \returns sequence string at desired interval + /// + /// \throws std::runtime_error on failure to fetch sequence + /// + std::string Subsequence(const std::string& id, + Position begin, + Position end) const; + + /// \brief Fetches FASTA sequence for desired interval.
+ /// + /// \param[in] interval desired interval + /// + /// \returns sequence string at desired interval + /// + /// \throws std::runtime_error on failure to fetch sequence + /// + std::string Subsequence(const GenomicInterval& interval) const; + + /// \brief Fetches FASTA sequence for desired interval. + /// + /// \param[in] htslibRegion htslib/samtools-formatted REGION string + /// representing the desired interval + /// + /// \returns sequence string at desired interval + /// + /// \throws std::runtime_error on failure to fetch sequence + /// + std::string Subsequence(const char* htslibRegion) const; + + /// \brief Fetches FASTA sequence corresponding to a BamRecord, oriented and + /// gapped as requested. + /// + /// For example, "native" orientation and "gapped" will return the reference + /// sequence with gaps inserted, as would align against the read in "native" + /// orientation. + /// + /// \param[in] bamRecord input BamRecord to derive interval/CIGAR + /// data + /// \param[in] orientation orientation of output + /// \param[in] gapped if true, gaps/padding will be inserted, per + /// record's CIGAR info. 
+ /// \param[in] exciseSoftClips if true, any soft-clipped positions will be + /// removed from query ends + /// + /// \returns sequence string over the record's interval + /// + /// \throws std::runtime_error on failure to fetch sequence + /// + std::string ReferenceSubsequence(const BamRecord& bamRecord, + const Orientation orientation=Orientation::GENOMIC, + const bool gapped=false, + const bool exciseSoftClips=false) const; + + /// \} + +public: + /// \name File Attributes + /// \{ + + /// \returns true if FASTA file contains a sequence matching \p name + bool HasSequence(const std::string& name) const; + + /// \returns number of sequences stored in FASTA file + int NumSequences(void) const; + + /// \returns length of FASTA sequence + /// + /// \throws std::runtime_error if length could not be determined + /// + int SequenceLength(const std::string& name) const; + + /// \} + +private: + std::string filename_; + faidx_t* handle_; + +private: + void Close(void); + bool Open(const std::string& filename); +}; + +} // namespace BAM +} // namespace PacBio + +#endif // INDEXEDFASTAREADER_H diff --git a/include/pbbam/Interval.h b/include/pbbam/Interval.h new file mode 100644 index 0000000..3f5a40e --- /dev/null +++ b/include/pbbam/Interval.h @@ -0,0 +1,151 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. 
+// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file Interval.h +/// \brief Defines the Interval class. +// +// Author: Derek Barnett + +#ifndef INTERVAL_H +#define INTERVAL_H + +#include "pbbam/Config.h" +#include + +#define BOOST_ICL_USE_STATIC_BOUNDED_INTERVALS +#include +#include + +namespace PacBio { +namespace BAM { + +/// \brief Represents a half-open (right-open) interval [start, stop) +/// +/// \note This class is agnostic whether the values are 0-based or 1-based. +/// Client code should primarily work with GenomicInterval, which does +/// enforce this distinction. 
+/// +template<typename T> +class Interval +{ +public: + typedef boost::icl::discrete_interval<T> interval_type; + +public: + /// \name Constructors & Related Methods + /// \{ + + /// \brief Creates an empty interval [0,0) + Interval(void); + + /// \brief Creates a 'singleton' interval [val,val+1) + Interval(const T val); + + /// \brief Creates an interval from [start, stop) + Interval(const T start, const T stop); + + Interval(const Interval& other); + + /// \} + +public: + /// \name Comparison Operators + /// \{ + + /// \returns true if both intervals share the same endpoints + bool operator==(const Interval& other) const; + + /// \returns true if either interval's endpoints differ + bool operator!=(const Interval& other) const; + + /// \} + +public: + /// \name Attributes + /// \{ + + /// \returns interval's start coordinate + T Start(void) const; + + /// Sets this interval's start coordinate. + /// + /// \param[in] start + /// \returns reference to this interval + /// + Interval& Start(const T& start); + + /// \returns interval's stop coordinate + T Stop(void) const; + + /// Sets this interval's stop coordinate. + /// + /// \param[in] stop + /// \returns reference to this interval + /// + Interval& Stop(const T& stop); + + /// \} + +public: + /// \name Interval Operations + + /// \returns true if this interval is fully covered by (or contained in) \p other + bool CoveredBy(const Interval& other) const; + + /// \returns true if this interval covers (or contains) \p other + bool Covers(const Interval& other) const; + + /// \returns true if intervals intersect + bool Intersects(const Interval& other) const; + + /// \returns true if interval is valid (e.g.
start < stop) + bool IsValid(void) const; + + /// \returns interval length + size_t Length(void) const; + + /// \} + +private: + interval_type data_; +}; + +} // namespace BAM +} // namespace PacBio + +#include "pbbam/internal/Interval.inl" + +#endif // INTERVAL_H diff --git a/include/pbbam/LocalContextFlags.h b/include/pbbam/LocalContextFlags.h new file mode 100644 index 0000000..0c59707 --- /dev/null +++ b/include/pbbam/LocalContextFlags.h @@ -0,0 +1,77 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED.
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file LocalContextFlags.h +/// \brief Defines the LocalContextFlags enum & helper method(s). +// +// Author: Lance Hepler + +#ifndef LOCALCONTEXTFLAGS_H +#define LOCALCONTEXTFLAGS_H + +#include "pbbam/Config.h" + +namespace PacBio { +namespace BAM { + +/// \brief The LocalContextFlags enum defines the flags that can be used +/// to describe a subread's "local context", i.e. whether it is +/// flanked by barcodes/adapters or its pass orientation. +/// +enum LocalContextFlags : uint8_t +{ + NO_LOCAL_CONTEXT = 0, ///< No context information available + ADAPTER_BEFORE = 1, ///< Adapter precedes subread + ADAPTER_AFTER = 2, ///< Adapter follows subread + BARCODE_BEFORE = 4, ///< Barcode precedes subread + BARCODE_AFTER = 8, ///< Barcode follows subread + FORWARD_PASS = 16, ///< Subread's orientation is 'forward pass' + REVERSE_PASS = 32 ///< Subread's orientation is 'reverse pass' +}; + + +/// \returns a LocalContextFlags value containing the result of the bitwise-OR +/// operation of \p lhs and \p rhs. 
+// constexpr is implicitly inline +constexpr LocalContextFlags operator|(const LocalContextFlags lhs, const LocalContextFlags rhs) +{ + return static_cast(static_cast(lhs) | static_cast(rhs)); +} + +} // namespace BAM +} // namespace PacBio + +#endif // LOCALCONTEXTFLAGS_H diff --git a/include/pbbam/MD5.h b/include/pbbam/MD5.h new file mode 100644 index 0000000..03a1979 --- /dev/null +++ b/include/pbbam/MD5.h @@ -0,0 +1,57 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file MD5.h +/// \brief Defines basic MD5 hash utilities +// +// Author: Brett Bowman + +#ifndef MD5_H +#define MD5_H + +#include + +namespace PacBio { +namespace BAM { + +/// \brief MD5 hash of a string as a 32-digit hexadecimal string +/// +std::string MD5Hash(const std::string& str); + +} // namespace BAM +} // namespace PacBio + +#endif // MD5_H diff --git a/include/pbbam/Orientation.h b/include/pbbam/Orientation.h new file mode 100644 index 0000000..c354822 --- /dev/null +++ b/include/pbbam/Orientation.h @@ -0,0 +1,69 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. 
+// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file Orientation.h +/// \brief Defines the Orientation enum. +// +// Author: Derek Barnett + +#ifndef ORIENTATION_H +#define ORIENTATION_H + +#include "pbbam/Config.h" + +namespace PacBio { +namespace BAM { + +/// \brief This enum defines the orientations recognized by BamRecord, for +/// presenting "per-base" data. +/// +/// Orientation::NATIVE indicates that data should be presented in the subread's +/// original form. +/// +/// Orientation::GENOMIC indicates that data should be presented relative to +/// genomic forward strand. This means that data will be reversed (or +/// reverse-complemented) if the subread was aligned to the reverse strand. +/// +enum class Orientation +{ + NATIVE ///< Present data in 'raw' original orientation, regardless of aligned Strand + , GENOMIC ///< Present data in aligned orientation, always relative to Strand::FORWARD. 
+}; + +} // namespace BAM +} // namespace PacBio + +#endif // ORIENTATION_H diff --git a/include/pbbam/PbiBasicTypes.h b/include/pbbam/PbiBasicTypes.h new file mode 100644 index 0000000..4006ed4 --- /dev/null +++ b/include/pbbam/PbiBasicTypes.h @@ -0,0 +1,108 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file PbiBasicTypes.h +/// \brief Defines the basic data structures used in PBI lookups. +// +// Author: Derek Barnett + +#ifndef PBIBASICTYPES_H +#define PBIBASICTYPES_H + +#include "pbbam/Compare.h" +#include "pbbam/Config.h" +#include +#include +#include + +namespace PacBio { +namespace BAM { + +/// \brief The IndexResultBlock class represents a contiguous group of records +/// returned from a PBI lookup. +/// +/// Contiguous reads that satisfy a PBI lookup query will be merged down into a +/// single block. This helps to minimize the number of seeks in subsequent read +/// operations. +/// +/// A PBI-enabled reader or query can iterate over a list of IndexResultBlocks; +/// for each block, seeking to the first record and then sequentially reading +/// 'numReads' consecutive records before needing to seek again. +/// +struct PBBAM_EXPORT IndexResultBlock +{ +public: + IndexResultBlock(void); + IndexResultBlock(size_t idx, size_t numReads); + +public: + bool operator==(const IndexResultBlock& other) const; + bool operator!=(const IndexResultBlock& other) const; + +public: + size_t firstIndex_; ///< index of block's first record in BAM/PBI files (e.g.
i-th record) + size_t numReads_; ///< number of reads in this block + int64_t virtualOffset_; ///< virtual offset of first record in this block +}; + +/// \brief container of PBI result blocks +/// +typedef std::deque IndexResultBlocks; + +/// \brief container of raw PBI indices +/// +/// This is the primary result of PbiFilter -associated classes. This raw list +/// can participate in set operations (union, intersect) for compound filters, +/// and then be merged down into IndexResultBlocks for actual data file +/// random-access. +/// +typedef std::vector IndexList; + +/// \brief pair representing a range of PBI indices: where interval +/// is [first, second) +/// +/// Used primarily by the PBI's CoordinateSortedData components. +/// +/// \sa PbiReferenceEntry, PbiRawReferenceData, & ReferenceLookupData +/// +typedef std::pair IndexRange; + +} // namespace BAM +} // namespace PacBio + +#include "pbbam/internal/PbiBasicTypes.inl" + +#endif // PBIBASICTYPES_H diff --git a/include/pbbam/PbiBuilder.h b/include/pbbam/PbiBuilder.h new file mode 100644 index 0000000..d1d83bc --- /dev/null +++ b/include/pbbam/PbiBuilder.h @@ -0,0 +1,210 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. 
+// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file PbiBuilder.h +/// \brief Defines the PbiBuilder class. +// +// Author: Derek Barnett + +#ifndef PBIBUILDER_H +#define PBIBUILDER_H + +#include "pbbam/Config.h" +#include +#include + +namespace PacBio { +namespace BAM { + +class BamRecord; +class PbiRawData; + +namespace internal { class PbiBuilderPrivate; } + +/// \brief The PbiBuilder class constructs PBI index data from %BAM record data. +/// +/// Records are added one-by-one. This allows for either whole-file indexing of +/// existing %BAM files or for indexing "on-the-fly" alongside a %BAM file as it +/// is generated. +/// +/// For simple PBI creation from existing %BAM files, see PbiFile::CreateFrom. +/// This is the recommended approach, unless finer control or additional +/// processing is needed.
+/// +class PBBAM_EXPORT PbiBuilder +{ +public: + /// \brief This enum allows you to control the compression level of the + /// output PBI file. + /// + /// Values are equivalent to zlib compression levels. See its documentation + /// for more details: http://www.zlib.net/manual.html + /// + enum CompressionLevel + { + CompressionLevel_0 = 0 + , CompressionLevel_1 = 1 + , CompressionLevel_2 = 2 + , CompressionLevel_3 = 3 + , CompressionLevel_4 = 4 + , CompressionLevel_5 = 5 + , CompressionLevel_6 = 6 + , CompressionLevel_7 = 7 + , CompressionLevel_8 = 8 + , CompressionLevel_9 = 9 + + , DefaultCompression = -1 + , NoCompression = CompressionLevel_0 + , FastCompression = CompressionLevel_1 + , BestCompression = CompressionLevel_9 + }; + +public: + /// \name Constructors & Related Methods + /// \{ + + /// \brief Initializes builder to write data to \p pbiFilename. + /// + /// \param[in] pbiFilename output filename + /// \param[in] compressionLevel zlib compression level + /// \param[in] numThreads number of threads for compression. If set to + /// 0, PbiBuilder will attempt to determine a + /// reasonable estimate. If set to 1, this will + /// force single-threaded execution. No checks + /// are made against an upper limit. + /// + /// \throws std::runtime_error if PBI file cannot be opened for writing + /// + PbiBuilder(const std::string& pbiFilename, + const PbiBuilder::CompressionLevel compressionLevel = PbiBuilder::DefaultCompression, + const size_t numThreads = 4); + + /// \brief Initializes builder to write data to \p pbiFilename. + /// + /// Reference data-tracking structures will be initialized to expect + /// \p numReferenceSequences. (This is useful so that we can mark any + /// references that lack observed data appropriately). + /// + /// \param[in] pbiFilename output filename + /// \param[in] numReferenceSequences number of possible reference + /// sequences, e.g. 
BamHeader::NumSequences + /// \param[in] compressionLevel zlib compression level + /// \param[in] numThreads number of threads for compression. If set to + /// 0, PbiBuilder will attempt to determine a + /// reasonable estimate. If set to 1, this will + /// force single-threaded execution. No checks + /// are made against an upper limit. + /// + /// \throws std::runtime_error if PBI file cannot be opened for writing + /// + PbiBuilder(const std::string& pbiFilename, + const size_t numReferenceSequences, + const PbiBuilder::CompressionLevel compressionLevel = PbiBuilder::DefaultCompression, + const size_t numThreads = 4); + + /// \brief Initializes builder to write data to \p pbiFilename. + /// + /// Reference data-tracking structures will be initialized to expect + /// \p numReferenceSequences, but only if \p isCoordinateSorted is true. + /// + /// \param[in] pbiFilename output filename + /// \param[in] numReferenceSequences number of possible reference + /// sequences, e.g. BamHeader::NumSequences + /// \param[in] isCoordinateSorted if false, disables reference + /// sequence tracking + /// (BamHeader::SortOrder != "coordinate") + /// \param[in] compressionLevel zlib compression level + /// \param[in] numThreads number of threads for compression. If set to + /// 0, PbiBuilder will attempt to determine a + /// reasonable estimate. If set to 1, this will + /// force single-threaded execution. No checks + /// are made against an upper limit. + /// + /// \throws std::runtime_error if PBI file cannot be opened for writing + /// + PbiBuilder(const std::string& pbiFilename, + const size_t numReferenceSequences, + const bool isCoordinateSorted, + const PbiBuilder::CompressionLevel compressionLevel = PbiBuilder::DefaultCompression, + const size_t numThreads = 4); + + /// \brief Destroys builder, writing its data out to PBI file. + /// + /// On destruction, data summaries are calculated, raw data is written to + /// file, and file handle closed. 
+ /// + ~PbiBuilder(void); + + /// \} + +public: + /// \name Index Building + /// \{ + + /// \brief Adds \p record's data to underlying raw data structure. + /// + /// \note \p vOffset is a BGZF \b virtual offset into the %BAM file. To get + /// this value, you should use one of the following: \n + /// - while reading existing %BAM: BamReader::VirtualTell \n + /// - while writing new %BAM: BamWriter::Write(const BamRecord& record, int64_t* vOffset) \n + /// + /// + /// To build a PBI index while generating a %BAM file: + /// \include code/PbiBuilder_WithWriter.txt + /// + /// To build a PBI index from an existing %BAM file: + /// \include code/PbiBuilder_WithReader.txt + /// + /// \param[in] record input BamRecord to pull index data from + /// \param[in] vOffset \b virtual offset into %BAM file where record begins + /// + void AddRecord(const BamRecord& record, const int64_t vOffset); + + /// \returns const reference to current raw index data. Mostly only used for + /// testing; shouldn't be needed by most client code. + /// + const PbiRawData& Index(void) const; + + /// \} + +private: + std::unique_ptr d_; +}; + +} // namespace BAM +} // namespace PacBio + +#endif // PBIBUILDER_H diff --git a/include/pbbam/PbiFile.h b/include/pbbam/PbiFile.h new file mode 100644 index 0000000..89bffa3 --- /dev/null +++ b/include/pbbam/PbiFile.h @@ -0,0 +1,96 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file PbiFile.h +/// \brief Defines the PbiFile enums, typedefs, and methods. 
+// +// Author: Derek Barnett + +#ifndef PBIFILE_H +#define PBIFILE_H + +#include "pbbam/Config.h" +#include "pbbam/PbiBuilder.h" +#include + +namespace PacBio { +namespace BAM { + +class BamFile; + +namespace PbiFile +{ + /// \brief This enum describes the PBI file sections + /// + enum Section + { + BASIC = 0x0000 ///< BasicData (required) + , MAPPED = 0x0001 ///< MappedData (always optional) + , REFERENCE = 0x0002 ///< ReferenceData (always optional) + , BARCODE = 0x0004 ///< BarcodeData (always optional) + + , ALL = BASIC | MAPPED | REFERENCE | BARCODE ///< Synonym for 'all sections' + }; + + /// \brief Helper typedef for storing multiple Section flags. + /// + typedef uint16_t Sections; + + /// \brief This enum describes the PBI file version. + enum VersionEnum + { + Version_3_0_0 = 0x030000 ///< v3.0.0 + , Version_3_0_1 = 0x030001 ///< v3.0.1 + + , CurrentVersion = Version_3_0_1 ///< Synonym for the current PBI version. + }; + + /// \brief Builds PBI index data from the supplied %BAM file and writes a + /// ".pbi" file. + /// + /// \param[in] bamFile source %BAM file + /// + /// \throws std::runtime_error if index file could not be created + /// + PBBAM_EXPORT void CreateFrom(const BamFile& bamFile, + const PbiBuilder::CompressionLevel compressionLevel = PbiBuilder::DefaultCompression, + const size_t numThreads = 4); + +} // namespace PbiFile +} // namespace BAM +} // namespace PacBio + +#endif // PBIFILE_H diff --git a/include/pbbam/PbiFilter.h b/include/pbbam/PbiFilter.h new file mode 100644 index 0000000..65ef7ef --- /dev/null +++ b/include/pbbam/PbiFilter.h @@ -0,0 +1,343 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file PbiFilter.h +/// \brief Defines the PbiFilter class & helper 'concept'. 
+// +// Author: Derek Barnett + +#ifndef PBIFILTER_H +#define PBIFILTER_H + +#include "pbbam/DataSet.h" +#include "pbbam/PbiBasicTypes.h" +#include "pbbam/PbiIndex.h" +#include +#include +#include + +namespace PacBio { +namespace BAM { + +namespace internal { struct PbiFilterPrivate; } + +/// \brief The PbiFilterConcept class provides compile-time enforcement of the +/// required interface for PbiFilter's child filters. +/// +template +struct PbiFilterConcept +{ + BOOST_CONCEPT_USAGE(PbiFilterConcept) + { + // All PBI filters (built-in or client-defined) need only provide this + // interface: + // + // bool Accepts(const PbiRawData& index, const size_t row) const; + // + const PbiRawData index; + bool result = filter.Accepts(index, 0); + (void)result; + } + +private: + T filter; +// PbiRawData index; +}; + +/// \brief The PbiFilter class provides a mechanism for performing PBI-enabled +/// lookups. +/// +/// The PbiFilter API is designed to be flexible, both built-in and for +/// client-side customization. Built-in filters are provided for common queries, +/// and client code can define and use custom filters as well. More complex +/// filtering rules can be constructed via composition of simpler child filters. +/// +/// Filter objects used as children of PbiFilter need only provide a method that +/// matches this signature: +/// +/// \include code/PbiFilter_Interface.txt +/// +/// This requirement is enforced internally, using the PbiFilterConcept to +/// require a compatible interface without requiring inheritance. This approach +/// allows composition of heterogeneous filter types without worrying about a +/// class hierarchy, pointer ownership across library/client boundaries, etc. +/// +/// Thus a client application can define a custom filter if the built-in filters +/// do not quite meet requirements.
This filter may then be used in further +/// PbiFilter composition, or passed directly to PbiFilterQuery. +/// +/// \include code/PbiFilter_CustomFilter.txt +/// +/// As mentioned above, complex filters can be built up using multiple "child" +/// filters. These complex filters are constructed by using either +/// PbiFilter::Union (logical-OR over all direct children) or +/// PbiFilter::Intersection (logical-AND over direct children). +/// +/// \include code/PbiFilter_Composition.txt +/// +class PBBAM_EXPORT PbiFilter +{ +public: + enum CompositionType + { + INTERSECT + , UNION + }; + +public: + /// \name Set Operations + /// \{ + + /// \brief Creates a PbiFilter that acts as an intersection of the input + /// filters. + /// + /// A record must satisfy \b all of this filter's direct "child" filters. + /// + /// Equivalent to: + /// \include code/PbiFilter_Intersection_Copy.txt + /// + /// \param[in] filters vector of child filters + /// \returns composite filter + /// + static PbiFilter Intersection(const std::vector& filters); + + /// \brief Creates a PbiFilter that acts as an intersection of the input + /// filters. + /// + /// A record must satisfy \b all of this filter's direct "child" filters. + /// + /// Equivalent to: + /// \include code/PbiFilter_Intersection_Move.txt + /// + /// \param[in] filters vector of child filters + /// \returns composite filter + /// + static PbiFilter Intersection(std::vector&& filters); + + /// \brief Creates a PbiFilter that acts as a union of the input filters. + /// + /// A record must satisfy \b any of this filter's direct "child" filters. + /// + /// Equivalent to: + /// \include code/PbiFilter_Union_Copy.txt + /// + /// \param[in] filters vector of child filters + /// \returns composite filter + /// + static PbiFilter Union(const std::vector& filters); + + /// \brief Creates a PbiFilter that acts as a union of the input filters. + /// + /// A record must satisfy \b any of this filter's direct "child" filters.
+ /// + /// Equivalent to: + /// \include code/PbiFilter_Union_Move.txt + /// + /// \param[in] filters vector of child filters + /// \returns composite filter + /// + static PbiFilter Union(std::vector&& filters); + + /// \} + +public: + /// \name Constructors & Related Methods + /// \{ + + /// \brief Creates a PbiFilter from a %DataSet's described filters. + /// + /// A DataSet may contain a Filters element, itself a list of Filter + /// elements. Each Filter element will contain a Properties element, itself + /// a list of Property elements. + /// + /// The Filters hierarchy looks like this (in its XML output): + /// \verbinclude examples/plaintext/PbiFilter_DataSetXmlFilters.txt + /// + /// The resulting PbiFilter represents a union over all Filter elements, + /// with each Filter element requiring an intersection of all of its + /// Property criteria. These Property elements are mapped to built-in PBI + /// filter types. To use the labels in the example XML above, the filter + /// created here is equivalent to: + /// + /// (A && B) || (C && D) + /// + /// If a DataSet lacks any Filters, then an empty PbiFilter will be created + /// - corresponding to the dataset's entire contents. + /// + /// \param[in] dataset maybe containing filters + /// \returns composite filter + /// + static PbiFilter FromDataSet(const DataSet& dataset); + +public: + + /// \brief Creates an empty filter. + /// + /// \note An empty filter will result in all records being returned, e.g. + /// for query iteration. + /// + /// \param[in] type composition type. Any additional child filters added to + /// this composite will be treated according to this type. + /// If INTERSECT, a record must match all child filters. If + /// UNION, a record must match any child filter. + /// + PbiFilter(const CompositionType type = INTERSECT); + + /// \brief Creates a composite filter (of INTERSECT type) with an initial + /// child filter. 
+ /// + /// \note T must satisfy PbiFilterConcept + /// + /// \param[in] filter initial child filter + /// + template + PbiFilter(const T& filter); + + /// \brief Creates a composite filter (of INTERSECT type) with an initial + /// child filter. + /// + /// \note T must satisfy PbiFilterConcept + /// + /// \param[in] filter initial child filter + /// + template + PbiFilter(T&& filter); + + /// \brief Creates a composite filter (of INTERSECT type) with a list of + /// initial child filters. + /// + /// \param[in] filters initial child filters + /// + PbiFilter(const std::vector& filters); + + /// \brief Creates composite filter (of INTERSECT type) with a list of + /// initial child filters. + /// + /// \param[in] filters initial child filters + /// + PbiFilter(std::vector&& filters); + + PbiFilter(const PbiFilter& other); + PbiFilter(PbiFilter&& other) noexcept; + PbiFilter& operator=(const PbiFilter& other); + PbiFilter& operator=(PbiFilter&& other) noexcept; + ~PbiFilter(void); + + /// \} + +public: + /// \name Composition + /// \{ + + /// \brief Adds a new child filter of type T. + /// + /// \param[in] filter additional child filter. Type T must satisfy + /// PbiFilterConcept. + /// \returns reference to this filter + /// + template + PbiFilter& Add(const T& filter); + + /// \brief Adds a new child filter of type T. + /// + /// \param[in] filter additional child filter. Type T must satisfy + /// PbiFilterConcept. + /// \returns reference to this filter + /// + template + PbiFilter& Add(T&& filter); + + /// \brief Adds a new child filter. + /// + /// \param[in] filter additional child filter + /// \returns reference to this filter + /// + PbiFilter& Add(const PbiFilter& filter); + + /// \brief Adds a new child filter. + /// + /// \param[in] filter additional child filter + /// \returns reference to this filter + /// + PbiFilter& Add(PbiFilter&& filter); + + /// \brief Add child filters. 
+ /// + /// \param[in] filters additional child filters + /// \returns reference to this filter + /// + PbiFilter& Add(const std::vector& filters); + + /// \brief Add child filters. + /// + /// \param[in] filters additional child filters + /// \returns reference to this filter + /// + PbiFilter& Add(std::vector&& filters); + + /// \returns true if this filter has no child filters. + bool IsEmpty(void) const; + + /// \} + +public: + /// \name Lookup + /// \{ + + /// \brief Performs the PBI index lookup, combining child results for a + /// composite filter. + /// + /// \param[in] idx PBI (raw) index object + /// \param[in] row record number in %BAM/PBI files + /// + /// \returns true if record at \p row passes this filter's criteria, + /// including children (if any) + /// + bool Accepts(const BAM::PbiRawData& idx, const size_t row) const; + + /// \} + +private: + std::unique_ptr d_; +}; + +} // namespace BAM +} // namespace PacBio + +#include "pbbam/internal/PbiFilter.inl" +#include "pbbam/PbiFilterTypes.h" + +#endif // PBIFILTER_H diff --git a/include/pbbam/PbiFilterQuery.h b/include/pbbam/PbiFilterQuery.h new file mode 100644 index 0000000..120a30d --- /dev/null +++ b/include/pbbam/PbiFilterQuery.h @@ -0,0 +1,96 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution.
+// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file PbiFilterQuery.h +/// \brief Defines the PbiFilterQuery class. +// +// Author: Derek Barnett + +#ifndef PBIFILTERQUERY_H +#define PBIFILTERQUERY_H + +#include "pbbam/Config.h" +#include "pbbam/PbiFilter.h" +#include "pbbam/internal/QueryBase.h" +#include + +namespace PacBio { +namespace BAM { + +/// \brief The PbiFilterQuery class provides iterable access to a DataSet's %BAM +/// records, limiting results to those matching filter criteria. +/// +/// Example: +/// \include code/PbiFilterQuery.txt +/// +/// \note Currently, all %BAM files must have a corresponding ".pbi" index file. +/// Use BamFile::EnsurePacBioIndexExists before creating the query if one +/// may not be present.
+/// +class PBBAM_EXPORT PbiFilterQuery : public internal::IQuery +{ +public: + /// \brief Creates a new PbiFilterQuery, limiting record results to only + /// those matching filter criteria + /// + /// \param[in] filter filtering criteria + /// \param[in] dataset input data source(s) + /// + /// \throws std::runtime_error on failure to open/read underlying %BAM or + /// PBI files. + /// + PbiFilterQuery(const PbiFilter& filter, const DataSet& dataset); + + ~PbiFilterQuery(void); + +public: + + /// \brief Main iteration point for record access. + /// + /// Most client code should not need to use this method directly. Use + /// iterators instead. + /// + bool GetNext(BamRecord& r); + +private: + struct PbiFilterQueryPrivate; + std::unique_ptr d_; +}; + +} // namespace BAM +} // namespace PacBio + +#endif // PBIFILTERQUERY_H diff --git a/include/pbbam/PbiFilterTypes.h b/include/pbbam/PbiFilterTypes.h new file mode 100644 index 0000000..52524ce --- /dev/null +++ b/include/pbbam/PbiFilterTypes.h @@ -0,0 +1,1023 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. 
+// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file PbiFilterTypes.h +/// \brief Defines the built-in PBI filters. 
+// +// Author: Derek Barnett + +#ifndef PBIFILTERTYPES_H +#define PBIFILTERTYPES_H + +#include "pbbam/Compare.h" +#include "pbbam/PbiFilter.h" +#include "pbbam/PbiIndex.h" +#include +#include +#include + +namespace PacBio { +namespace BAM { + +namespace internal { + +/// \internal +/// +/// Provides basic container for value/compare-type pair +/// +template +struct FilterBase +{ +public: + T value_; + boost::optional > multiValue_; + Compare::Type cmp_; +protected: + FilterBase(const T& value, const Compare::Type cmp); + FilterBase(T&& value, const Compare::Type cmp); + FilterBase(const std::vector& values); + FilterBase(std::vector&& values); +protected: + bool CompareHelper(const T& lhs) const; +private: + bool CompareSingleHelper(const T& lhs) const; + bool CompareMultiHelper(const T& lhs) const; +}; + +/// \internal +/// +/// Dispatches the lookup to BarcodeLookupData +/// +template +struct BarcodeDataFilterBase : public FilterBase +{ +protected: + BarcodeDataFilterBase(const T& value, const Compare::Type cmp); + BarcodeDataFilterBase(T&& value, const Compare::Type cmp); + BarcodeDataFilterBase(const std::vector& values); + BarcodeDataFilterBase(std::vector&& values); +public: + bool Accepts(const PbiRawData& idx, const size_t row) const; +}; + +/// \internal +/// +/// Dispatches the lookup to BasicLookupData +/// +template +struct BasicDataFilterBase : public FilterBase +{ +protected: + BasicDataFilterBase(const T& value, const Compare::Type cmp); + BasicDataFilterBase(T&& value, const Compare::Type cmp); + BasicDataFilterBase(const std::vector& values); + BasicDataFilterBase(std::vector&& values); +public: + bool Accepts(const PbiRawData& idx, const size_t row) const; +}; + +/// \internal +/// +/// Dispatches the lookup to MappedLookupData +/// +template +struct MappedDataFilterBase : public FilterBase +{ +protected: + MappedDataFilterBase(const T& value, const Compare::Type cmp); + MappedDataFilterBase(T&& value, const Compare::Type cmp); + 
MappedDataFilterBase(const std::vector& values); + MappedDataFilterBase(std::vector&& values); +public: + bool Accepts(const PbiRawData& idx, const size_t row) const; +}; + +} // namespace internal + +/// \brief The PbiAlignedEndFilter class provides a PbiFilter-compatible filter +/// on aligned end. +/// +/// Example: \include code/PbiAlignedEndFilter.txt +/// +/// \sa BamRecord::AlignedEnd +/// +struct PbiAlignedEndFilter + : public internal::MappedDataFilterBase +{ +public: + /// \brief Creates a filter on aligned end. + /// + /// \param[in] position value to compare on + /// \param[in] cmp compare type + /// + PbiAlignedEndFilter(const uint32_t position, + const Compare::Type cmp = Compare::EQUAL); +}; + +/// \brief The PbiAlignedLengthFilter class provides a PbiFilter-compatible +/// filter on aligned length. +/// +/// Example: \include code/PbiAlignedLengthFilter.txt +/// +/// \sa BamRecord::AlignedEnd, BamRecord::AlignedStart +/// +struct PbiAlignedLengthFilter : public internal::FilterBase +{ +public: + /// \brief Creates a filter on aligned length. + /// + /// \param[in] length value to compare on + /// \param[in] cmp compare type + /// + PbiAlignedLengthFilter(const uint32_t length, + const Compare::Type cmp = Compare::EQUAL); + +public: + /// \brief Performs the actual index lookup. + /// + /// Most client code should not need to use this method directly. + /// + bool Accepts(const PbiRawData& idx, const size_t row) const; +}; + +/// \brief The PbiAlignedStartFilter class provides a PbiFilter-compatible +/// filter on aligned start. +/// +/// Example: \include code/PbiAlignedStartFilter.txt +/// +/// \sa BamRecord::AlignedStart +/// +struct PbiAlignedStartFilter + : public internal::MappedDataFilterBase +{ +public: + /// \brief Creates a filter on aligned start. 
+ /// + /// \param[in] position value to compare on + /// \param[in] cmp compare type + /// + PbiAlignedStartFilter(const uint32_t position, + const Compare::Type cmp = Compare::EQUAL); +}; + +/// \brief The PbiAlignedStrandFilter class provides a PbiFilter-compatible +/// filter on aligned strand. +/// +/// Example: \include code/PbiAlignedStrandFilter.txt +/// +/// \sa BamRecord::AlignedStrand +/// +struct PbiAlignedStrandFilter + : public internal::MappedDataFilterBase +{ +public: + /// \brief Creates a strand filter. + /// + /// \param[in] strand strand value to compare on + /// \param[in] cmp compare type + /// + PbiAlignedStrandFilter(const Strand strand, + const Compare::Type cmp = Compare::EQUAL); +}; + +/// \brief The PbiBarcodeFilter class provides a PbiFilter-compatible filter on +/// barcode ID. +/// +/// Any record with this barcode ID (forward or reverse) will pass this filter. +/// +/// Example: \include code/PbiBarcodeFilter.txt +/// +/// \sa BamRecord::BarcodeForward, BamRecord::BarcodeReverse +/// +struct PbiBarcodeFilter +{ +public: + /// \brief Creates a single-value barcode filter. + /// + /// \param[in] barcode barcode ID to compare on + /// \param[in] cmp compare type + /// + PbiBarcodeFilter(const int16_t barcode, + const Compare::Type cmp = Compare::EQUAL); + + /// \brief Creates a 'whitelisted' barcode filter. + /// + /// \note There is no compare type parameter here, it is always + /// Compare::EQUAL. Records will match at least one value from the + /// whitelist, exactly, in either bc_forward or bc_reverse. + /// + /// \param[in] whitelist barcode IDs to compare on + /// + PbiBarcodeFilter(const std::vector& whitelist); + + /// \brief Creates a 'whitelisted' barcode filter. + /// + /// \note There is no compare type parameter here, it is always + /// Compare::EQUAL. Records will match at least one value from the + /// whitelist, exactly, in either bc_forward or bc_reverse. 
+ /// + /// \param[in] whitelist barcode IDs to compare on + /// + PbiBarcodeFilter(std::vector&& whitelist); + +public: + /// \brief Performs the actual index lookup. + /// + /// Most client code should not need to use this method directly. + /// + bool Accepts(const PbiRawData& idx, const size_t row) const; + +private: + PbiFilter compositeFilter_; +}; + +/// \brief The PbiBarcodeForwardFilter class provides a PbiFilter-compatible +/// filter on forward barcode ID. +/// +/// Example: \include code/PbiBarcodeForwardFilter.txt +/// +/// \sa BamRecord::BarcodeForward +/// +struct PbiBarcodeForwardFilter + : public internal::BarcodeDataFilterBase +{ +public: + /// \brief Creates a single-value forward barcode filter. + /// + /// \param[in] bcFwdId (forward) barcode ID to compare on + /// \param[in] cmp compare type + /// + PbiBarcodeForwardFilter(const int16_t bcFwdId, + const Compare::Type cmp = Compare::EQUAL); + + /// \brief Creates a 'whitelisted' forward barcode filter. + /// + /// \note There is no compare type parameter here, it is always + /// Compare::EQUAL. Records will match at least one value from the + /// whitelist, exactly, in bc_forward. + /// + /// \param[in] whitelist barcode IDs to compare on + /// + PbiBarcodeForwardFilter(const std::vector& whitelist); + + /// \brief Creates a 'whitelisted' forward barcode filter. + /// + /// \note There is no compare type parameter here, it is always + /// Compare::EQUAL. Records will match at least one value from the + /// whitelist, exactly, in bc_forward. + /// + /// \param[in] whitelist barcode IDs to compare on + /// + PbiBarcodeForwardFilter(std::vector&& whitelist); +}; + +/// \brief The PbiBarcodeQualityFilter class provides a PbiFilter-compatible +/// filter on barcode quality. 
+/// +/// Example: \include code/PbiBarcodeQualityFilter.txt +/// +/// \sa BamRecord::BarcodeQuality +/// +struct PbiBarcodeQualityFilter + : public internal::BarcodeDataFilterBase +{ +public: + /// \brief Creates a single-value barcode quality filter. + /// + /// \param[in] bcQuality barcode quality to compare on + /// \param[in] cmp compare type + /// + PbiBarcodeQualityFilter(const uint8_t bcQuality, + const Compare::Type cmp = Compare::EQUAL); +}; + +/// \brief The PbiBarcodeReverseFilter class provides a PbiFilter-compatible +/// filter on reverse barcode ID. +/// +/// Example: \include code/PbiBarcodeReverseFilter.txt +/// +/// \sa BamRecord::BarcodeReverse +/// +struct PbiBarcodeReverseFilter + : public internal::BarcodeDataFilterBase +{ +public: + /// \brief Creates a single-value reverse barcode filter. + /// + /// \param[in] bcRevId (reverse) barcode ID to compare on + /// \param[in] cmp compare type + /// + PbiBarcodeReverseFilter(const int16_t bcRevId, + const Compare::Type cmp = Compare::EQUAL); + + /// \brief Creates a 'whitelisted' reverse barcode filter. + /// + /// \note There is no compare type parameter here, it is always + /// Compare::EQUAL. Records will match at least one value from the + /// whitelist, exactly, in bc_reverse. + /// + /// \param[in] whitelist barcode IDs to compare on + /// + PbiBarcodeReverseFilter(const std::vector& whitelist); + + /// \brief Creates a 'whitelisted' reverse barcode filter. + /// + /// \note There is no compare type parameter here, it is always + /// Compare::EQUAL. Records will match at least one value from the + /// whitelist, exactly, in bc_reverse. + /// + /// \param[in] whitelist barcode IDs to compare on + /// + PbiBarcodeReverseFilter(std::vector&& whitelist); +}; + +/// \brief The PbiBarcodesFilter class provides a PbiFilter-compatible filter on +/// both forward & reverse barcode IDs. +/// +/// A record must match both IDs to pass the filter. 
+/// +/// Example: \include code/PbiBarcodesFilter.txt +/// +/// \sa BamRecord::Barcodes +/// +struct PbiBarcodesFilter +{ +public: + /// \brief Creates a barcodes filter from a std::pair of IDs. + /// + /// pair.first -> BarcodeForward\n + /// pair.second -> BarcodeReverse + /// + /// \param[in] barcodes barcode IDs to compare on + /// \param[in] cmp compare type + /// + PbiBarcodesFilter(const std::pair barcodes, + const Compare::Type cmp = Compare::EQUAL); + + /// \brief Creates a barcodes filter from forward & reverse IDs. + /// + /// \param[in] bcForward forward barcode ID to compare on + /// \param[in] bcReverse reverse barcode ID to compare on + /// \param[in] cmp compare type + /// + PbiBarcodesFilter(const int16_t bcForward, + const int16_t bcReverse, + const Compare::Type cmp = Compare::EQUAL); +public: + /// \brief Performs the actual index lookup. + /// + /// Most client code should not need to use this method directly. + /// + bool Accepts(const PbiRawData& idx, const size_t row) const; + +private: + PbiFilter compositeFilter_; +}; + +/// \brief The PbiIdentityFilter class provides a PbiFilter-compatible filter on +/// read identity (% aligned match). +/// +/// Read identity is equivalent to: 1.0 - (nMM + nDel + nIns)/readLength. +/// +/// Example: \include code/PbiIdentityFilter.txt +/// +struct PbiIdentityFilter : public internal::FilterBase +{ +public: + /// \brief Creates a read identity filter. + /// + /// \param[in] identity value to compare on + /// \param[in] cmp compare type + /// + PbiIdentityFilter(const float identity, + const Compare::Type cmp = Compare::EQUAL); + +public: + /// \brief Performs the actual index lookup. + /// + /// Most client code should not need to use this method directly. + /// + bool Accepts(const PbiRawData& idx, const size_t row) const; +}; + +/// \brief The PbiLocalContextFilter class provides a PbiFilter-compatible +/// filter on local context (adapter, barcode, etc.). 
+/// +/// The primary Compare::Type operators intended for this filter are: +/// Compare::EQUAL, Compare::NOT_EQUAL, Compare::CONTAINS, and +/// Compare::NOT_CONTAINS. +/// +/// Example: \include code/PbiLocalContextFilter.txt +/// +struct PbiLocalContextFilter + : public internal::BasicDataFilterBase +{ +public: + PbiLocalContextFilter(const LocalContextFlags& flags, + const Compare::Type cmp = Compare::EQUAL); +}; + +/// \brief The PbiMapQualityFilter class provides a PbiFilter-compatible filter on +/// mapping quality. +/// +/// Example: \include code/PbiMapQualityFilter.txt +/// +/// \sa BamRecord::MapQuality +/// +struct PbiMapQualityFilter + : public internal::MappedDataFilterBase +{ +public: + /// \brief Creates a map quality filter. + /// + /// \param[in] mapQual value to compare on + /// \param[in] cmp compare type + /// + PbiMapQualityFilter(const uint8_t mapQual, + const Compare::Type cmp = Compare::EQUAL); +}; + +/// \brief The PbiMovieNameFilter class provides a PbiFilter-compatible filter +/// on movie name. +/// +/// Example: \include code/PbiMovieNameFilter.txt +/// +/// \sa BamRecord::MovieName +/// +struct PbiMovieNameFilter +{ +public: + /// \brief Creates a single-value movie name filter. + /// + /// \param[in] movieName movie name to compare on + /// + /// \note There is no compare type parameter here, it is always + /// Compare::EQUAL. Records will match movie name, exactly. + /// + PbiMovieNameFilter(const std::string& movieName); + + /// \brief Creates a 'whitelisted' movie name filter. + /// + /// \note There is no compare type parameter here, it is always + /// Compare::EQUAL. Records will match at least one value from the + /// whitelist, exactly. + /// + /// \param[in] whitelist movie names to compare on + /// + PbiMovieNameFilter(const std::vector& whitelist); + + /// \brief Creates a 'whitelisted' movie name filter. + /// + /// \note There is no compare type parameter here, it is always + /// Compare::EQUAL. 
Records will match at least one value from the + /// whitelist, exactly. + /// + /// \param[in] whitelist movie names to compare on + /// + PbiMovieNameFilter(std::vector&& whitelist); + +public: + /// \brief Performs the actual index lookup. + /// + /// Most client code should not need to use this method directly. + /// + bool Accepts(const PbiRawData& idx, const size_t row) const; + +private: + PbiFilter compositeFilter_; +}; + +/// \brief The PbiNumDeletedBasesFilter class provides a PbiFilter-compatible +/// filter on the number of deleted bases. +/// +/// Example: \include code/PbiNumDeletedBasesFilter.txt +/// +/// \sa BamRecord::NumDeletedBases +/// +struct PbiNumDeletedBasesFilter + : public internal::MappedDataFilterBase +{ +public: + /// \brief Creates a filter on the number of deleted bases. + /// + /// \param[in] numDeletions value to compare on + /// \param[in] cmp compare type + /// + PbiNumDeletedBasesFilter(const size_t numDeletions, + const Compare::Type cmp = Compare::EQUAL); +}; + +/// \brief The PbiNumInsertedBasesFilter class provides a PbiFilter-compatible +/// filter on the number of inserted bases. +/// +/// Example: \include code/PbiNumInsertedBasesFilter.txt +/// +/// \sa BamRecord::NumInsertedBases +/// +struct PbiNumInsertedBasesFilter + : public internal::MappedDataFilterBase +{ +public: + /// \brief Creates a filter on the number of inserted bases. + /// + /// \param[in] numInsertions value to compare on + /// \param[in] cmp compare type + /// + PbiNumInsertedBasesFilter(const size_t numInsertions, + const Compare::Type cmp = Compare::EQUAL); +}; + +/// \brief The PbiNumMatchesFilter class provides a PbiFilter-compatible filter +/// on the number of matched bases. +/// +/// Example: \include code/PbiNumMatchesFilter.txt +/// +/// \sa BamRecord::NumMatches +/// +struct PbiNumMatchesFilter + : public internal::MappedDataFilterBase +{ +public: + /// \brief Creates a filter on the number of matched bases. 
+ /// + /// \param[in] numMatchedBases value to compare on + /// \param[in] cmp compare type + /// + PbiNumMatchesFilter(const size_t numMatchedBases, + const Compare::Type cmp = Compare::EQUAL); +}; + +/// \brief The PbiNumMismatchesFilter class provides a PbiFilter-compatible +/// filter on the number of mismatched bases. +/// +/// Example: \include code/PbiNumMismatchesFilter.txt +/// +/// \sa BamRecord::NumMismatches +/// +struct PbiNumMismatchesFilter + : public internal::MappedDataFilterBase +{ +public: + /// \brief Creates a filter on the number of mismatched bases. + /// + /// \param[in] numMismatchedBases value to compare on + /// \param[in] cmp compare type + /// + PbiNumMismatchesFilter(const size_t numMismatchedBases, + const Compare::Type cmp = Compare::EQUAL); +}; + +/// \brief The PbiQueryEndFilter class provides a PbiFilter-compatible filter +/// on query end. +/// +/// Example: \include code/PbiQueryEndFilter.txt +/// +/// \sa BamRecord::QueryEnd +/// +struct PbiQueryEndFilter + : public internal::BasicDataFilterBase +{ +public: + /// \brief Creates a filter on query end position. + /// + /// \param[in] position value to compare on + /// \param[in] cmp compare type + /// + PbiQueryEndFilter(const int32_t position, + const Compare::Type cmp = Compare::EQUAL); +}; + +/// \brief The PbiQueryLengthFilter class provides a PbiFilter-compatible filter +/// on query length. +/// +/// queryLength = (queryEnd - queryStart) +/// +/// Example: \include code/PbiQueryLengthFilter.txt +/// +/// \sa BamRecord::QueryEnd, BamRecord::QueryStart +/// +struct PbiQueryLengthFilter : public internal::FilterBase +{ +public: + /// \brief Creates a filter on query length + /// + /// \param[in] length value to compare on + /// \param[in] cmp compare type + /// + PbiQueryLengthFilter(const int32_t length, + const Compare::Type cmp = Compare::EQUAL); + +public: + /// \brief Performs the actual index lookup. 
+ /// + /// Most client code should not need to use this method directly. + /// + bool Accepts(const PbiRawData& idx, const size_t row) const; +}; + +/// \brief The PbiQueryNameFilter class provides a PbiFilter-compatible filter +/// on query name. +/// +/// Example: \include code/PbiQueryNameFilter.txt +/// +/// \sa BamRecord::FullName +/// +struct PbiQueryNameFilter +{ +public: + /// \brief Creates a single-value query name filter. + /// + /// \param[in] qname query name to compare on + /// + /// \note There is no compare type parameter here, it is always + /// Compare::EQUAL. Records will match query name, exactly. + /// + PbiQueryNameFilter(const std::string& qname); + + /// \brief Creates a 'whitelisted' query name filter. + /// + /// \note There is no compare type parameter here, it is always + /// Compare::EQUAL. Records will match at least one value from the + /// whitelist, exactly. + /// + /// \param[in] whitelist query names to compare on + /// + PbiQueryNameFilter(const std::vector& whitelist); + + PbiQueryNameFilter(const PbiQueryNameFilter& other); + ~PbiQueryNameFilter(void); + +public: + /// \brief Performs the actual index lookup. + /// + /// Most client code should not need to use this method directly. + /// + bool Accepts(const PbiRawData& idx, const size_t row) const; + +private: + struct PbiQueryNameFilterPrivate; + std::unique_ptr d_; +}; + +/// \brief The PbiQueryStartFilter class provides a PbiFilter-compatible filter +/// on query start. +/// +/// Example: \include code/PbiQueryStartFilter.txt +/// +/// \sa BamRecord::QueryStart +/// +struct PbiQueryStartFilter + : public internal::BasicDataFilterBase +{ +public: + /// \brief Creates a filter on query start position. 
+ /// + /// \param[in] position value to compare on + /// \param[in] cmp compare type + /// + PbiQueryStartFilter(const int32_t position, + const Compare::Type cmp = Compare::EQUAL); +}; + +/// \brief The PbiReadAccuracyFilter class provides a PbiFilter-compatible filter +/// on read accuracy. +/// +/// Example: \include code/PbiReadAccuracyFilter.txt +/// +/// \sa BamRecord::ReadAccuracy +/// +struct PbiReadAccuracyFilter + : public internal::BasicDataFilterBase +{ +public: + /// \brief Creates a filter on read accuracy. + /// + /// \param[in] accuracy value to compare on + /// \param[in] cmp compare type + /// + PbiReadAccuracyFilter(const Accuracy accuracy, + const Compare::Type cmp = Compare::EQUAL); +}; + +/// \brief The PbiReadGroupFilter class provides a PbiFilter-compatible filter +/// on read group. +/// +/// Example: \include code/PbiReadGroupFilter.txt +/// +/// \sa BamRecord::ReadGroup, +/// BamRecord::ReadGroupId, +/// BamRecord::ReadGroupNumericId +/// +struct PbiReadGroupFilter + : public internal::BasicDataFilterBase +{ +public: + /// \brief Creates a filter on read group (numeric) ID value + /// + /// \param[in] rgId numeric read group ID + /// \param[in] cmp compare type + /// + /// \sa BamRecord::ReadGroupNumericId + /// + PbiReadGroupFilter(const int32_t rgId, + const Compare::Type cmp = Compare::EQUAL); + + /// \brief Creates a filter on printable read group ID value + /// + /// \param[in] rgId read group ID string + /// \param[in] cmp compare type + /// + /// \sa BamRecord::ReadGroupId + /// + PbiReadGroupFilter(const std::string rgId, + const Compare::Type cmp = Compare::EQUAL); + + /// \brief Creates a filter on read group (object). + /// + /// \param[in] rg read group object + /// \param[in] cmp compare type + /// + /// \sa BamRecord::ReadGroup + /// + PbiReadGroupFilter(const ReadGroupInfo& rg, + const Compare::Type cmp = Compare::EQUAL); + + /// \brief Creates a 'whitelisted' filter on read group numeric IDs. 
+ /// + /// \note There is no compare type parameter here, it is always + /// Compare::EQUAL. Records will match at least one value from the + /// whitelist, exactly. + /// + /// \param[in] whitelist read group IDs to compare on + /// + PbiReadGroupFilter(const std::vector& whitelist); + + /// \brief Creates a 'whitelisted' filter on read group numeric IDs. + /// + /// \note There is no compare type parameter here, it is always + /// Compare::EQUAL. Records will match at least one value from the + /// whitelist, exactly. + /// + /// \param[in] whitelist read group IDs to compare on + /// + PbiReadGroupFilter(std::vector&& whitelist); + + /// \brief Creates a 'whitelisted' filter on read group printable IDs. + /// + /// \note There is no compare type parameter here, it is always + /// Compare::EQUAL. Records will match at least one value from the + /// whitelist, exactly. + /// + /// \param[in] whitelist read group ID strings to compare on + /// + PbiReadGroupFilter(const std::vector& whitelist); + + /// \brief Creates a 'whitelisted' filter on read group printable IDs. + /// + /// \note There is no compare type parameter here, it is always + /// Compare::EQUAL. Records will match at least one value from the + /// whitelist, exactly. + /// + /// \param[in] whitelist read group ID strings to compare on + /// + PbiReadGroupFilter(std::vector&& whitelist); + + /// \brief Creates a 'whitelisted' filter using read group objects. + /// + /// \note There is no compare type parameter here, it is always + /// Compare::EQUAL. Records will match at least one value from the + /// whitelist, exactly. + /// + /// \param[in] whitelist read group objects to compare on + /// + PbiReadGroupFilter(const std::vector& whitelist); + + /// \brief Creates a 'whitelisted' filter using read group objects. + /// + /// \note There is no compare type parameter here, it is always + /// Compare::EQUAL. Records will match at least one value from the + /// whitelist, exactly. 
+ /// + /// \param[in] whitelist read group objects to compare on + /// + PbiReadGroupFilter(std::vector&& whitelist); +}; + +/// \brief The PbiReferenceEndFilter class provides a PbiFilter-compatible +/// filter on reference end. +/// +/// Example: \include code/PbiReferenceEndFilter.txt +/// +/// \sa BamRecord::ReferenceEnd +/// +struct PbiReferenceEndFilter + : public internal::MappedDataFilterBase +{ +public: + /// \brief Creates a filter on reference end. + /// + /// \param[in] tEnd value to compare on + /// \param[in] cmp compare type + /// + PbiReferenceEndFilter(const uint32_t tEnd, + const Compare::Type cmp = Compare::EQUAL); +}; + +/// \brief The PbiReferenceIdFilter class provides a PbiFilter-compatible +/// filter on reference ID. +/// +/// Example: \include code/PbiReferenceIdFilter.txt +/// +/// \sa BamRecord::ReferenceId +/// +struct PbiReferenceIdFilter + : public internal::MappedDataFilterBase +{ +public: + /// \brief Creates a single-value reference ID filter. + /// + /// \param[in] tId reference ID to compare on + /// \param[in] cmp compare type + /// + PbiReferenceIdFilter(const int32_t tId, + const Compare::Type cmp = Compare::EQUAL); + + /// \brief Creates a 'whitelisted' reference ID filter. + /// + /// \note There is no compare type parameter here, it is always + /// Compare::EQUAL. Records will match at least one value from the + /// whitelist, exactly. + /// + /// \param[in] whitelist reference IDs to compare on + /// + PbiReferenceIdFilter(const std::vector& whitelist); + + /// \brief Creates a 'whitelisted' reference ID filter. + /// + /// \note There is no compare type parameter here, it is always + /// Compare::EQUAL. Records will match at least one value from the + /// whitelist, exactly. + /// + /// \param[in] whitelist reference IDs to compare on + /// + PbiReferenceIdFilter(std::vector&& whitelist); +}; + +/// \brief The PbiReferenceNameFilter class provides a PbiFilter-compatible +/// filter on reference name. 
+/// +/// Example: \include code/PbiReferenceNameFilter.txt +/// +/// \sa BamRecord::ReferenceName +/// +struct PbiReferenceNameFilter +{ +public: + /// \brief Creates a single-value reference name filter. + /// + /// \param[in] rname reference name to compare on + /// \param[in] cmp compare type + /// + PbiReferenceNameFilter(const std::string& rname, + const Compare::Type cmp = Compare::EQUAL); + + /// \brief Creates a 'whitelisted' reference name filter. + /// + /// \note There is no compare type parameter here, it is always + /// Compare::EQUAL. Records will match at least one value from the + /// whitelist, exactly. + /// + /// \param[in] whitelist reference names to compare on + /// + PbiReferenceNameFilter(const std::vector& whitelist); + + /// \brief Creates a 'whitelisted' reference name filter. + /// + /// \note There is no compare type parameter here, it is always + /// Compare::EQUAL. Records will match at least one value from the + /// whitelist, exactly. + /// + /// \param[in] whitelist reference names to compare on + /// + PbiReferenceNameFilter(std::vector&& whitelist); + +public: + /// \brief Performs the actual index lookup. + /// + /// Most client code should not need to use this method directly. + /// + bool Accepts(const PbiRawData& idx, const size_t row) const; + +private: + mutable bool initialized_; + mutable PbiFilter subFilter_; + std::string rname_; + boost::optional > rnameWhitelist_; + Compare::Type cmp_; + +private: + // marked const so we can delay setup of filter in Accepts(), once we have + // access to PBI/BAM input. modified values marked mutable accordingly + void Initialize(const PbiRawData& idx) const; +}; + +/// \brief The PbiReferenceStartFilter class provides a PbiFilter-compatible +/// filter on reference start. 
+/// +/// Example: \include code/PbiReferenceStartFilter.txt +/// +/// \sa BamRecord::ReferenceStart +/// +struct PbiReferenceStartFilter + : public internal::MappedDataFilterBase +{ +public: + /// \brief Creates a filter on reference start. + /// + /// \param[in] tStart value to compare on + /// \param[in] cmp compare type + /// + PbiReferenceStartFilter(const uint32_t tStart, + const Compare::Type cmp = Compare::EQUAL); +}; + +/// \brief The PbiZmwFilter class provides a PbiFilter-compatible filter on +/// ZMW hole number. +/// +/// Example: \include code/PbiZmwFilter.txt +/// +/// \sa BamRecord::HoleNumber +/// +struct PbiZmwFilter : public internal::BasicDataFilterBase +{ +public: + /// \brief Creates a single-value ZMW hole number filter. + /// + /// \param[in] zmw value to compare on + /// \param[in] cmp compare type + /// + PbiZmwFilter(const int32_t zmw, + const Compare::Type cmp = Compare::EQUAL); + + /// \brief Creates a 'whitelisted' ZMW hole number filter. + /// + /// \note There is no compare type parameter here, it is always + /// Compare::EQUAL. Records will match at least one value from the + /// whitelist, exactly. + /// + /// \param[in] whitelist ZMW hole numbers to compare on + /// + PbiZmwFilter(const std::vector& whitelist); + + /// \brief Creates a 'whitelisted' ZMW hole number filter. + /// + /// \note There is no compare type parameter here, it is always + /// Compare::EQUAL. Records will match at least one value from the + /// whitelist, exactly. + /// + /// \param[in] whitelist ZMW hole numbers to compare on + /// + PbiZmwFilter(std::vector&& whitelist); +}; + +} // namespace BAM +} // namespace PacBio + +#include "pbbam/internal/PbiFilterTypes.inl" + +#endif // PBIFILTERTYPES_H diff --git a/include/pbbam/PbiIndex.h b/include/pbbam/PbiIndex.h new file mode 100644 index 0000000..09b61b8 --- /dev/null +++ b/include/pbbam/PbiIndex.h @@ -0,0 +1,162 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. 
+// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file PbiIndex.h +/// \brief Defines the PbiIndex class. 
+// +// Author: Derek Barnett + +#ifndef PBIINDEX_H +#define PBIINDEX_H + +#include "pbbam/Config.h" +#include "pbbam/PbiFile.h" +#include "pbbam/PbiLookupData.h" +#include +#include + +namespace PacBio { +namespace BAM { + +namespace internal { class PbiIndexPrivate; } + +/// \brief The PbiIndex class provides a representation of PBI index data that +/// is rearranged for quick lookups. +/// +/// The PbiIndex class itself provides access to a few high-level attributes +/// (e.g. version, number of records, etc.). The actual lookup data is stored +/// in its member components: +/// BasicLookupData, +/// MappedLookupData, +/// ReferenceLookupData, & +/// BarcodeLookupData . +/// +class PBBAM_EXPORT PbiIndex +{ +public: + /// \name Constructors & Related Methods + /// \{ + + /// \brief Creates a PbiIndex lookup structure from a PBI file. + /// + /// \param[in] pbiFilename filename + /// + /// \throws std::runtime_error if failed to load data from file + /// + PbiIndex(const std::string& pbiFilename); + + PbiIndex(const PbiIndex& other); + PbiIndex(PbiIndex&& other); + PbiIndex& operator=(const PbiIndex& other); + PbiIndex& operator=(PbiIndex&& other); + ~PbiIndex(void); + + /// \} + +public: + /// \name PBI General Attributes + /// \{ + + /// \returns true if index has BarcodeData section + bool HasBarcodeData(void) const; + + /// \returns true if index has MappedData section + bool HasMappedData(void) const; + + /// \returns true if index has ReferenceData section + bool HasReferenceData(void) const; + + /// \returns true if index has \b section + /// \param[in] section PbiFile::Section identifier + /// + bool HasSection(const PbiFile::Section section) const; + + /// \returns index filename ("*.pbi") + /// + /// \note Returns an empty string if the underlying data was generated, not + /// loaded from file. 
+ /// + std::string Filename(void) const; + + /// \returns enum flags representing the file sections present + PbiFile::Sections FileSections(void) const; + + /// \returns the number of records in the PBI (& associated %BAM) + uint32_t NumReads(void) const; + + /// \returns the PBI file's version + PbiFile::VersionEnum Version(void) const; + + /// \} + +public: + /// \name Lookup Data Components + /// \{ + + /// \returns const reference to BarcodeData lookup structure + /// + /// May be empty, check result of HasBarcodeData. + /// + const BarcodeLookupData& BarcodeData(void) const; + + /// \returns const reference to BasicData lookup structure + const BasicLookupData& BasicData(void) const; + + /// \returns const reference to MappedData lookup structure + /// + /// May be empty, check result of HasMappedData. + /// + const MappedLookupData& MappedData(void) const; + + /// \returns const reference to reference data lookup structure + /// + /// May be empty, check result of HasReferenceData. + /// + const ReferenceLookupData& ReferenceData(void) const; + + /// \} + +private: + PbiIndex(void); + std::unique_ptr d_; +}; + +} // namespace BAM +} // namespace PacBio + +#include "internal/PbiIndex.inl" + +#endif // PBIINDEX_H diff --git a/include/pbbam/PbiIndexedBamReader.h b/include/pbbam/PbiIndexedBamReader.h new file mode 100644 index 0000000..17c46b5 --- /dev/null +++ b/include/pbbam/PbiIndexedBamReader.h @@ -0,0 +1,174 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file PbiIndexedBamReader.h +/// \brief Defines the PbiIndexedBamReader class. +// +// Author: Derek Barnett + +#ifndef PBIINDEXEDBAMREADER_H +#define PBIINDEXEDBAMREADER_H + +#include "pbbam/BamFile.h" +#include "pbbam/BamReader.h" +#include "pbbam/PbiBasicTypes.h" +#include "pbbam/PbiFilter.h" +#include "pbbam/PbiIndex.h" +#include + +namespace PacBio { +namespace BAM { + +namespace internal { struct PbiIndexedBamReaderPrivate; } + +/// \brief The PbiIndexedBamReader class provides read-only iteration over %BAM +/// records, limited to some filtering criteria. 
+/// +/// The PacBio BAM index (*.pbi) is used to allow random-access operations. +/// +class PBBAM_EXPORT PbiIndexedBamReader : public BamReader +{ +public: + /// \name Constructors & Related Methods + /// \{ + + /// \brief Constructs %BAM reader, with an initial filter. + /// + /// All reads that satisfy the filter will be available. + /// + /// \param[in] filter PbiFilter or compatible object + /// \param[in] bamFilename input %BAM filename + /// + /// \throws std::runtime_error if either file (*.bam or *.pbi) cannot be + /// read + /// + PbiIndexedBamReader(const PbiFilter& filter, const std::string& bamFilename); + + /// \brief Constructs %BAM reader, with an initial filter. + /// + /// All reads that satisfy the filter will be available. + /// + /// \param[in] filter PbiFilter or compatible object + /// \param[in] bamFile input BamFile object + /// + /// \throws std::runtime_error if either file (*.bam or *.pbi) cannot be + /// read + /// + PbiIndexedBamReader(const PbiFilter& filter, const BamFile& bamFile); + + /// \brief Constructs %BAM reader, with an initial filter. + /// + /// All reads that satisfy the filter will be available. + /// + /// \param[in] filter PbiFilter or compatible object + /// \param[in] bamFile input BamFile object + /// + /// \throws std::runtime_error if either file (*.bam or *.pbi) cannot be + /// read + /// + PbiIndexedBamReader(const PbiFilter& filter, BamFile&& bamFile); + + /// \brief Constructs %BAM reader, with no initial filter. + /// + /// Useful for delaying either specifying the filtering criteria or + /// performing the PBI lookups. + /// + /// \param[in] bamFilename input %BAM filename + /// + /// \throws std::runtime_error if either file (*.bam or *.pbi) cannot be + /// read + /// + PbiIndexedBamReader(const std::string& bamFilename); + + /// \brief Constructs %BAM reader, with no initial filter. + /// + /// Useful for delaying either specifying the filtering criteria or + /// performing the PBI lookups.
+ /// + /// \param[in] bamFile input BamFile object + /// + /// \throws std::runtime_error if either file (*.bam or *.pbi) cannot be + /// read + /// + PbiIndexedBamReader(const BamFile& bamFile); + + /// \brief Constructs %BAM reader, with no initial filter. + /// + /// Useful for delaying either specifying the filtering criteria or + /// performing the PBI lookups. + /// + /// \param[in] bamFile input BamFile object + /// + /// \throws std::runtime_error if either file (*.bam or *.pbi) cannot be + /// read + /// + PbiIndexedBamReader(BamFile&& bamFile); + + ~PbiIndexedBamReader(void); + + /// \} + +public: + /// \name Filtering & Index Data + /// \{ + + /// \returns the current filter active on this reader + const PbiFilter& Filter(void) const; + +// /// \returns the reader's underlying index data +// const PbiIndex& Index(void) const; + +public: + /// \brief Sets a new filter on the reader. + /// + /// \param[in] filter + /// \returns reference to this reader + /// + PbiIndexedBamReader& Filter(const PbiFilter& filter); + + /// \} + +protected: + int ReadRawData(BGZF* bgzf, bam1_t* b); + +private: + std::unique_ptr d_; +}; + +} // namespace BAM +} // namespace PacBio + +#endif // PBIINDEXEDBAMREADER_H diff --git a/include/pbbam/PbiLookupData.h b/include/pbbam/PbiLookupData.h new file mode 100644 index 0000000..398c349 --- /dev/null +++ b/include/pbbam/PbiLookupData.h @@ -0,0 +1,718 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer.
+// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file PbiLookupData.h +/// \brief Defines the classes used for PBI data lookup. +// +// Author: Derek Barnett + +#ifndef PBILOOKUPDATA_H +#define PBILOOKUPDATA_H + +#include "pbbam/Config.h" +#include "pbbam/Compare.h" +#include "pbbam/PbiBasicTypes.h" +#include +#include +#include +#include + +namespace PacBio { +namespace BAM { + +class PbiRawBarcodeData; +class PbiRawBasicData; +class PbiRawMappedData; +class PbiRawReferenceData; + +/// \brief The OrderedLookup class provides a quick lookup structure for +/// PBI index data, where key values are sorted. 
+/// +/// The main, underlying lookup structure is essentially a std::map, where the +/// key is some value (e.g. readAccuracy) and the value is the list of indices +/// (i-th record) in the %BAM file. +/// +/// This lookup class is one of the main building blocks for the PBI index +/// lookup components. +/// +/// \param T type of key stored (Accuracy for readAccuracy, int32_t for ZMW, +/// etc.) +/// +template +class OrderedLookup +{ +public: + typedef T key_type; + typedef IndexList value_type; + typedef std::map container_type; + typedef typename container_type::iterator iterator; + typedef typename container_type::const_iterator const_iterator; + +public: + /// \name Constructors & Related Methods + /// \{ + + /// \brief Creates an empty OrderedLookup structure. + /// + OrderedLookup(void); + + /// \brief Creates an OrderedLookup structure, from another's underlying + /// lookup container. + /// + /// \param[in] data lookup data container + /// + OrderedLookup(const container_type& data); + + /// \brief Creates an OrderedLookup structure, from another's underlying + /// lookup container. + /// + /// \param[in] data lookup data container + /// + OrderedLookup(container_type&& data); + + /// \brief Creates an OrderedLookup structure, from raw data. + /// + /// \param[in] rawData raw data values, where i is the index into the %BAM + /// file, and rawData[i] is the key value + /// + OrderedLookup(const std::vector& rawData); + + /// \brief Creates an OrderedLookup structure, from raw data.
+ /// + /// \param[in] rawData raw data values, where i is the index into the %BAM + /// file, and rawData[i] is the key value + /// + OrderedLookup(std::vector&& rawData); + + /// \} + +public: + /// \name Operators + /// \{ + + /// \returns true if this lookup is same as \p other + bool operator==(const OrderedLookup& other) const; + + /// \returns true if this lookup is not the same as \p other + bool operator!=(const OrderedLookup& other) const; + + /// \} + +public: + /// \name STL-Compatibility Methods + /// \{ + + /// \returns an iterator to the first element in the underlying container + iterator begin(void); + + /// \returns a const iterator to the first element in the underlying + /// container + const_iterator begin(void) const; + + /// \returns a const iterator to the first element in the underlying + /// container + const_iterator cbegin(void) const; + + /// \returns an iterator after the last element in the underlying container + iterator end(void); + + /// \returns a const iterator after the last element in the underlying + /// container + const_iterator end(void) const; + + /// \returns a const iterator after the last element in the underlying + /// container + const_iterator cend(void) const; + + /// \returns true if underlying container is empty + bool empty(void) const; + + /// \returns number of keys in the container + size_t size(void) const; + + /// \} + +public: + /// \name Lookup Data + /// \{ + + /// \brief Performs a lookup into the underlying data. + /// + /// \param[in] key key value to lookup + /// \param[in] compare compare type + /// + /// \returns sorted list of unique indices that satisfy the lookup key & + /// compare type + /// + IndexList LookupIndices(const key_type& key, + const Compare::Type& compare) const; + + /// \brief Converts the lookup structure back into its raw data.
+ /// + /// \returns raw data values, where i is the index into the %BAM file, and + /// rawData[i] is the key value + /// + std::vector Unpack(void) const; + + /// \} + +private: + IndexList LookupInclusiveRange(const const_iterator& begin, + const const_iterator& end) const; + + IndexList LookupExclusiveRange(const const_iterator& begin, + const const_iterator& end, + const key_type& key) const; + +private: + container_type data_; +}; + +/// \brief The UnorderedLookup class provides a quick lookup structure for +/// PBI index data, where key values are not sorted. +/// +/// The main, underlying lookup structure is essentially a std::unordered_map, +/// where the key is some value (e.g. read group ID) and the value is the list +/// of indices (i-th record) in the %BAM file. +/// +/// This lookup class is one of the main building blocks for the PBI index +/// lookup components. +/// +/// \param T type of key stored (Accuracy for readAccuracy, int32_t for ZMW, +/// etc.) +/// +template +class UnorderedLookup +{ +public: + typedef T key_type; + typedef IndexList value_type; + typedef std::unordered_map container_type; + typedef typename container_type::iterator iterator; + typedef typename container_type::const_iterator const_iterator; + +public: + /// \name Constructors & Related Methods + /// \{ + + /// \brief Creates an empty UnorderedLookup structure. + /// + UnorderedLookup(void); + + /// \brief Creates an UnorderedLookup structure, from another's underlying + /// lookup container. + /// + /// \param[in] data lookup data container + /// + UnorderedLookup(const container_type& data); + + /// \brief Creates an UnorderedLookup structure, from another's underlying + /// lookup container. + /// + /// \param[in] data lookup data container + /// + UnorderedLookup(container_type&& data); + + /// \brief Creates an UnorderedLookup structure, from raw data.
+ /// + /// \param[in] rawData raw data values, where i is the index into the %BAM + /// file, and rawData[i] is the key value + /// + UnorderedLookup(const std::vector& rawData); + + /// \brief Creates an UnorderedLookup structure, from raw data. + /// + /// \param[in] rawData raw data values, where i is the index into the %BAM + /// file, and rawData[i] is the key value + /// + UnorderedLookup(std::vector&& rawData); + + /// \} + +public: + /// \name Operators + /// \{ + + /// \returns true if this lookup is same as \p other + bool operator==(const UnorderedLookup& other) const; + + /// \returns true if this lookup is not the same as \p other + bool operator!=(const UnorderedLookup& other) const; + + /// \} + +public: + /// \name STL-Compatibility Methods + /// \{ + + /// \returns an iterator to the first element in the underlying container + iterator begin(void); + + /// \returns a const iterator to the first element in the underlying + /// container + const_iterator begin(void) const; + + /// \returns a const iterator to the first element in the underlying + /// container + const_iterator cbegin(void) const; + + /// \returns an iterator after the last element in the underlying container + iterator end(void); + + /// \returns a const iterator after the last element in the underlying + /// container + const_iterator end(void) const; + + /// \returns a const iterator after the last element in the underlying + /// container + const_iterator cend(void) const; + + /// \returns true if underlying container is empty + bool empty(void) const; + + /// \returns number of keys in the container + size_t size(void) const; + + /// \} + +public: + /// \name Lookup Data + /// \{ + + /// \brief Performs a lookup into the underlying data.
+ /// + /// \param[in] key key value to lookup + /// \param[in] compare compare type + /// + /// \returns sorted list of unique indices that satisfy the lookup key & + /// compare type + /// + IndexList LookupIndices(const key_type& key, + const Compare::Type& compare) const; + + /// \brief Converts the lookup structure back into its raw data. + /// + /// \returns raw data values, where i is the index into the %BAM file, and + /// rawData[i] is the key value + /// + std::vector Unpack(void) const; + + /// \} + +private: + template + IndexList LookupHelper(const key_type& key, + const Compare& cmp) const; + +private: + container_type data_; +}; + +/// \brief The BasicLookupData class provides quick lookup access to the +/// "BasicData" section of the PBI index. +/// +class PBBAM_EXPORT BasicLookupData +{ +public: + /// \brief This enum describes the component fields of the BasicData + /// section. + enum Field + { + RG_ID + , Q_START + , Q_END + , ZMW + , READ_QUALITY + , CONTEXT_FLAG + , VIRTUAL_OFFSET + }; + +public: + /// \name Constructors & Related Methods + /// \{ + + /// \brief Creates an empty lookup data object. + BasicLookupData(void); + + /// \brief Creates a lookup data object from the corresponding raw data. + /// + /// \param[in] rawData raw data loaded from a PBI file + /// + BasicLookupData(const PbiRawBasicData& rawData); + + /// \} + +public: + /// \name Lookup Data Methods + /// \{ + + /// \brief Adds \b virtual file offset data to the index lookup result + /// blocks. + /// + /// A PBI lookup will result in a number of index lists, depending on the + /// complexity of the PbiFilter involved. These index lists are then merged + /// down into blocks of contiguous values, where each block describes a + /// particular record index and the number of subsequent, contiguous reads + /// that immediately follow it. In this manner, we need only perform seeks + /// to the first record of each block.
+ /// + /// This method takes such blocks and annotates them with the corresponding + /// \b virtual file offset. Subsequent %BAM readers can use this information + /// to control file seeks. + /// + /// \param[in,out] blocks + /// + /// \throws std::out_of_range if a block has an invalid index value + /// + void ApplyOffsets(IndexResultBlocks& blocks) const; + + /// \brief This method dispatches a single-value lookup query to the proper + /// data member. + /// + /// Client code, such as custom filters, should use this when possible, only + /// touching the raw fields for more complex operations (e.g. when unpacking + /// is necessary). + /// + /// \param[in] field section field to lookup + /// \param[in] value value to lookup + /// \param[in] compareType compare type + /// + /// \returns sorted list of unique indices that satisfy the lookup + /// + template + IndexList Indices(const BasicLookupData::Field& field, + const T& value, + const Compare::Type& compareType = Compare::EQUAL) const; + + /// \brief This method dispatches a multi-value lookup query to the proper + /// data member. + /// + /// Client code, such as custom filters, should use this when possible, only + /// touching the raw fields for more complex operations (e.g. when unpacking + /// is necessary). + /// + /// \note There is no compare type parameter here, it is always + /// Compare::EQUAL. Results will correspond to an exact match on at + /// least one value in the list.
+ /// + /// \param[in] field section field to lookup + /// \param[in] values values to lookup + /// + /// \returns sorted list of unique indices that satisfy the lookup + /// + template + IndexList IndicesMulti(const BasicLookupData::Field& field, + const std::vector& values) const; + + /// \returns the \b virtual file offsets for all records + /// + const std::vector& VirtualFileOffsets(void) const; + + /// \} + +public: + /// \name Lookup Data Members + /// \{ + + // map ordering doesn't make sense, optimize for direct lookup + UnorderedLookup rgId_; + + // numeric comparisons make sense, keep key ordering preserved + OrderedLookup qStart_; + OrderedLookup qEnd_; + OrderedLookup holeNumber_; + OrderedLookup readQual_; + + // see if this works, or if can use unordered, 'direct' query + OrderedLookup ctxtFlag_; + + // offsets + std::vector fileOffset_; + + /// \} +}; + +/// \brief The MappedLookupData class provides quick lookup access to the +/// "MappedData" section of the PBI index. +/// +class PBBAM_EXPORT MappedLookupData +{ +public: + /// \brief This enum describes the component fields of the MappedData + /// section. + enum Field + { + T_ID + , T_START + , T_END + , A_START + , A_END + , N_M + , N_MM + , N_INS + , N_DEL + , MAP_QUALITY + , STRAND + }; + +public: + /// \name Constructors & Related Methods + /// \{ + + /// \brief Creates an empty lookup data object. + MappedLookupData(void); + + /// \brief Creates a lookup data object from the corresponding raw data. + /// + /// \param[in] rawData raw data loaded from a PBI file + /// + MappedLookupData(const PbiRawMappedData& rawData); + + /// \} + +public: + /// \name Lookup Data Methods + /// \{ + + /// \brief This method dispatches a single-value lookup query to the proper + /// data member. + /// + /// Client code, such as custom filters, should use this when possible, only + /// touching the raw fields for more complex operations (e.g. when unpacking + /// is necessary).
+ /// + /// \param[in] field section field to lookup + /// \param[in] value value to lookup + /// \param[in] compareType compare type + /// + /// \returns sorted list of unique indices that satisfy the lookup + /// + template + IndexList Indices(const MappedLookupData::Field& field, + const T& value, + const Compare::Type& compareType = Compare::EQUAL) const; + + /// \brief This method dispatches a multi-value lookup query to the proper + /// data member. + /// + /// Client code, such as custom filters, should use this when possible, only + /// touching the raw fields for more complex operations (e.g. when unpacking + /// is necessary). + /// + /// \note There is no compare type parameter here, it is always + /// Compare::EQUAL. Results will correspond to an exact match on at + /// least one value in the list. + /// + /// \param[in] field section field to lookup + /// \param[in] values values to lookup + /// + /// \returns sorted list of unique indices that satisfy the lookup + /// + template + IndexList IndicesMulti(const MappedLookupData::Field& field, + const std::vector& values) const; + + /// \} + +public: + /// \name Lookup Data Members + /// \{ + + // numeric comparisons make sense, keep key ordering preserved + OrderedLookup tId_; + OrderedLookup tStart_; + OrderedLookup tEnd_; + OrderedLookup aStart_; + OrderedLookup aEnd_; + OrderedLookup nM_; + OrderedLookup nMM_; + OrderedLookup mapQV_; + + // generated values, not stored directly in PBI file + OrderedLookup nIns_; + OrderedLookup nDel_; + + // no need for map overhead, just store direct indices + IndexList reverseStrand_; + IndexList forwardStrand_; + + /// \} +}; + +/// \brief The ReferenceLookupData class provides quick lookup access to the +/// "CoordinateSortedData" section of the PBI index. +/// +class PBBAM_EXPORT ReferenceLookupData +{ +public: + /// \name Constructors & Related Methods + /// \{ + + /// \brief Creates an empty lookup data object.
+ /// + ReferenceLookupData(void); + + /// \brief Creates a lookup data object from the corresponding raw data. + /// + /// \param[in] rawData raw data loaded from a PBI file + /// + ReferenceLookupData(const PbiRawReferenceData& rawData); + + /// \} + +public: + /// \name Lookup Data Methods + /// \{ + + /// \brief Retrieves the index range for all records that map to a + /// particular reference. + /// + /// Client code, such as custom filters, should use this when possible, only + /// touching the raw fields for more complex operations (e.g. when unpacking + /// is necessary). + /// + /// \param[in] tId reference ID to lookup + /// + /// \returns resulting index range [begin, end). If \p tId is unknown, + /// will return IndexRange(-1,-1) . + /// + IndexRange Indices(const int32_t tId) const; + + /// \} + +public: + /// \name Lookup Data Members + /// \{ + + // references_[tId] = [begin, end) indices + std::unordered_map references_; + + /// \} +}; + +/// \brief The BarcodeLookupData class provides quick lookup access to the +/// "BarcodeData" section of the PBI index. +/// +class PBBAM_EXPORT BarcodeLookupData +{ +public: + /// \brief This enum describes the component fields of the BarcodeData + /// section. + enum Field + { + BC_FORWARD + , BC_REVERSE + , BC_QUALITY + }; + +public: + /// \name Constructors & Related Methods + /// \{ + + /// \brief Creates an empty lookup data object. + /// + BarcodeLookupData(void); + + /// \brief Creates a lookup data object from the corresponding raw data. + /// + /// \param[in] rawData raw data loaded from a PBI file + /// + BarcodeLookupData(const PbiRawBarcodeData& rawData); + + /// \} + +public: + /// \name Lookup Data Methods + /// \{ + + /// \brief This method dispatches a single-value lookup query to the proper + /// data member. + /// + /// Client code, such as custom filters, should use this when possible, only + /// touching the raw fields for more complex operations (e.g. when unpacking + /// is necessary).
+ /// + /// \param[in] field section field to lookup + /// \param[in] value value to lookup + /// \param[in] compareType compare type + /// + /// \returns sorted list of unique indices that satisfy the lookup + /// + template + IndexList Indices(const BarcodeLookupData::Field& field, + const T& value, + const Compare::Type& compareType = Compare::EQUAL) const; + + /// \brief This method dispatches a multi-value lookup query to the proper + /// data member. + /// + /// Client code, such as custom filters, should use this when possible, only + /// touching the raw fields for more complex operations (e.g. when unpacking + /// is necessary). + /// + /// \note There is no compare type parameter here, it is always + /// Compare::EQUAL. Results will correspond to an exact match on at + /// least one value in the list. + /// + /// \param[in] field section field to lookup + /// \param[in] values values to lookup + /// + /// \returns sorted list of unique indices that satisfy the lookup + /// + template + IndexList IndicesMulti(const BarcodeLookupData::Field& field, + const std::vector& values) const; + + /// \} + +public: + /// \name Lookup Data Members + /// \{ + + // numeric comparisons make sense, keep key ordering preserved + OrderedLookup bcForward_; + OrderedLookup bcReverse_; + OrderedLookup bcQual_; + + /// \} +}; + +} // namespace BAM +} // namespace PacBio + +#include "internal/PbiLookupData.inl" + +#endif // PBILOOKUPDATA_H diff --git a/include/pbbam/PbiRawData.h b/include/pbbam/PbiRawData.h new file mode 100644 index 0000000..6e1e974 --- /dev/null +++ b/include/pbbam/PbiRawData.h @@ -0,0 +1,531 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved.
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file PbiRawData.h +/// \brief Defines the classes used for working with raw PBI data. 
+// +// Author: Derek Barnett + +#ifndef PBIRAWDATA_H +#define PBIRAWDATA_H + +#include "pbbam/Config.h" +#include "pbbam/PbiFile.h" +#include +#include + +namespace PacBio { +namespace BAM { + +class BamRecord; +class DataSet; + +/// \brief The PbiRawBarcodeData class represents the raw data stored in the +/// "BarcodeData" section of the PBI index. +/// +class PBBAM_EXPORT PbiRawBarcodeData +{ +public: + /// \name Constructors & Related Methods + /// \{ + + /// \brief Creates an empty data structure. + PbiRawBarcodeData(void); + + /// \brief Creates an empty data structure, preallocating space for a known + /// number of records. + PbiRawBarcodeData(uint32_t numReads); + + PbiRawBarcodeData(const PbiRawBarcodeData& other); + PbiRawBarcodeData(PbiRawBarcodeData&& other); + PbiRawBarcodeData& operator=(const PbiRawBarcodeData& other); + PbiRawBarcodeData& operator=(PbiRawBarcodeData&& other); + + /// \} + +public: + /// \name Index Construction + /// \{ + + /// \brief Adds a record's barcode data. + /// + /// \param[in] b %BAM record + /// + void AddRecord(const BamRecord& b); + + /// \} + +public: + /// \name Raw Data Containers + /// \{ + + std::vector bcForward_; + std::vector bcReverse_; + std::vector bcQual_; + + /// \} +}; + +/// \brief The PbiRawMappedData class represents the raw data stored in the +/// "MappedData" section of the PBI index. +/// +class PBBAM_EXPORT PbiRawMappedData +{ +public: + /// \name Constructors & Related Methods + /// \{ + + /// \brief Creates an empty data structure. + PbiRawMappedData(void); + + /// \brief Creates an empty data structure, preallocating space for a known + /// number of records. 
+ PbiRawMappedData(uint32_t numReads); + + PbiRawMappedData(const PbiRawMappedData& other); + PbiRawMappedData(PbiRawMappedData&& other); + PbiRawMappedData& operator=(const PbiRawMappedData& other); + PbiRawMappedData& operator=(PbiRawMappedData&& other); + + /// \} + +public: + /// \name Index Construction + /// \{ + + /// \brief Adds a record's mapping data. + /// + /// \param[in] b %BAM record + /// + void AddRecord(const BamRecord& b); + + /// \} + +public: + /// \name Index Data Query + /// \{ + + /// \brief Calculates the number of deleted bases for a particular record. + /// + /// Convenvience method. Equivalent to: + /// \code{.cpp} + /// NumDeletedAndInsertedBasesAt(i).first; + /// \endcode + /// + /// \param[in] recordIndex i-th record + /// \returns number of deleted bases + /// + uint32_t NumDeletedBasesAt(size_t recordIndex) const; + + /// \brief Calculates the number of inserted bases for a particular record. + /// + /// Convenvience method. Equivalent to: + /// \code{.cpp} + /// NumDeletedAndInsertedBasesAt(i).second; + /// \endcode + /// + /// \param[in] recordIndex i-th record + /// \returns number of inserted bases + /// + uint32_t NumInsertedBasesAt(size_t recordIndex) const; + + /// \brief Calculates the number of deleted & inserted bases for a + /// particular record. + /// + /// \param[in] recordIndex i-th record in the data set + /// \returns a pair consisting of (numDeletions,numInsertions) + /// + std::pair + NumDeletedAndInsertedBasesAt(size_t recordIndex) const; + + /// \} + +public: + /// \name Raw Data Containers + /// \{ + + std::vector tId_; + std::vector tStart_; + std::vector tEnd_; + std::vector aStart_; + std::vector aEnd_; + std::vector revStrand_; + std::vector nM_; + std::vector nMM_; + std::vector mapQV_; + + /// \} +}; + +/// \brief The PbiReferenceEntryClass represents a single reference in the PBI +/// CoordinateSorted section. 
+/// +/// A reference entry consists of an associated reference ID (tId), as well as +/// start and end indices into the %BAM or PBI. +/// +/// \note Rows are given in the interval [start, end). +/// +class PBBAM_EXPORT PbiReferenceEntry +{ +public: + typedef uint32_t ID; + typedef uint32_t Row; + +public: + static const ID UNMAPPED_ID; + static const Row UNSET_ROW; + +public: + /// \name Constructors & Related Methods + /// \{ + + /// \brief Creates a default entry. + /// + /// - default ID: PbiReferenceEntry::UNMAPPED_ID \n + /// - default rows: PbiReferenceEntry::UNSET_ROW + /// + PbiReferenceEntry(void); + + /// \brief Creates a reference entry, with no rows set. + /// + /// - default rows: PbiReferenceEntry::UNSET_ROW + /// + PbiReferenceEntry(ID id); + + /// \brief Creates a reference entry, with rows set. + /// + PbiReferenceEntry(ID id, Row beginRow, Row endRow); + + PbiReferenceEntry(const PbiReferenceEntry& other); + PbiReferenceEntry(PbiReferenceEntry&& other); + PbiReferenceEntry& operator=(const PbiReferenceEntry& other); + PbiReferenceEntry& operator=(PbiReferenceEntry&& other); + + bool operator==(const PbiReferenceEntry& other) const; + + /// \} + +public: + /// \name Reference Data Members + /// \{ + + ID tId_; + Row beginRow_; + Row endRow_; + + /// \} +}; + +/// \brief The PbiRawReferenceData class represents the raw data stored in the +/// "CoordinateSortedData" section of the PBI index. +/// +class PBBAM_EXPORT PbiRawReferenceData +{ +public: + /// \name Constructors & Related Methods + /// \{ + + /// \brief Creates an empty data structure. + PbiRawReferenceData(void); + + /// \brief Creates an empty data structure, preallocating space for a + /// number of references. + /// + /// This constructor is recommended as this is the safest way to ensure that + /// references without observed mappings are included in the final output. 
+ /// + PbiRawReferenceData(uint32_t numRefs); + + PbiRawReferenceData(const PbiRawReferenceData& other); + PbiRawReferenceData(PbiRawReferenceData&& other); + PbiRawReferenceData& operator=(const PbiRawReferenceData& other); + PbiRawReferenceData& operator=(PbiRawReferenceData&& other); + + /// \} + +public: + /// \name Raw Data Containers + /// \{ + + std::vector entries_; + + /// \} +}; + +/// \brief The PbiRawBasicData class represents the raw data stored in the +/// "BasicData" section of the PBI index. +/// +class PBBAM_EXPORT PbiRawBasicData +{ +public: + /// \name Constructors & Related Methods + /// \{ + + /// \brief Creates an empty data structure. + PbiRawBasicData(void); + + /// \brief Creates an empty data structure, preallocating space for a known + /// number of records. + PbiRawBasicData(uint32_t numReads); + + PbiRawBasicData(const PbiRawBasicData& other); + PbiRawBasicData(PbiRawBasicData&& other); + PbiRawBasicData& operator=(const PbiRawBasicData& other); + PbiRawBasicData& operator=(PbiRawBasicData&& other); + + /// \} + +public: + /// \name Index Construction + /// \{ + + /// \brief Adds a record's basic data. + /// + /// \param[in] b %BAM record + /// \param[in] offset \b virtual file offset where record begins + /// + void AddRecord(const BamRecord& b, int64_t offset); + + /// \} + +public: + /// \name Raw Data Containers + /// \{ + + std::vector rgId_; + std::vector qStart_; + std::vector qEnd_; + std::vector holeNumber_; + std::vector readQual_; + std::vector ctxtFlag_; + std::vector fileOffset_; + std::vector fileNumber_; + + /// \} +}; + +/// \deprecated For legacy-code support only, and will be removed soon. +/// Use PbiRawBasicData instead. +/// +typedef PbiRawBasicData PbiRawSubreadData; + +/// \brief The PbiRawData class provides a representation of raw PBI index +/// data, used mostly for construction or I/O. +/// +/// The PbiRawData class itself provides access to a few high-level attributes +/// (e.g.
version, number of records, etc.). The actual index data is stored +/// in its member components: +/// PbiRawBasicData, +/// PbiRawMappedData, +/// PbiRawReferenceData, & +/// PbiRawBarcodeData . +/// +class PBBAM_EXPORT PbiRawData +{ +public: + /// \name Constructors & Related Methods + /// \{ + + /// \brief Creates an empty raw data structure, ready for building. + /// + PbiRawData(void); + + /// \brief Loads raw PBI data from a file. + /// + /// \param[in] pbiFilename ".pbi" filename + /// + /// \throws std::runtime_error if file contents cannot be loaded properly + /// + PbiRawData(const std::string& pbiFilename); + + /// \brief Loads raw, aggregate PBI data from a dataset + /// + /// This constructor creates a raw index object that contains an aggregation + /// of index data across the dataset. + /// + /// \note ReferenceData (the per-reference table for coordinate-sorted data) + /// is not currently available for the index aggregate. All other + /// per-record data sections will be present.
+ /// + /// \param[in] dataset DataSet object + /// + /// \throws std::runtime_error if file(s) contents cannot be loaded properly + /// + explicit PbiRawData(const DataSet& dataset); + + PbiRawData(const PbiRawData& other); + PbiRawData(PbiRawData&& other); + PbiRawData& operator=(const PbiRawData& other); + PbiRawData& operator=(PbiRawData&& other); + ~PbiRawData(void); + + /// \} + +public: + /// \name PBI General Attributes + /// \{ + + /// \returns true if index has BarcodeData section + bool HasBarcodeData(void) const; + + /// \returns true if index has MappedData section + bool HasMappedData(void) const; + + /// \returns true if index has ReferenceData section + bool HasReferenceData(void) const; + + /// \returns true if index has \b section + /// \param[in] section PbiFile::Section identifier + /// + bool HasSection(const PbiFile::Section section) const; + + /// \returns index filename ("*.pbi") + /// + /// \note Returns an empty string if the underlying data was calculated in + /// code or aggregated from a DataSet, rather than loaded from a + /// single PBI file. + /// + std::string Filename(void) const; + + /// \returns enum flags representing the file sections present + PbiFile::Sections FileSections(void) const; + + /// \returns the number of records in the PBI(s) + uint32_t NumReads(void) const; + + /// \returns the PBI file's version + PbiFile::VersionEnum Version(void) const; + + /// \} + +public: + /// \name Raw Data Components + /// \{ + + /// \returns const reference to BarcodeData lookup structure + /// + /// May be empty, check result of HasBarcodeData. + /// + const PbiRawBarcodeData& BarcodeData(void) const; + + /// \returns const reference to BasicData lookup structure + const PbiRawBasicData& BasicData(void) const; + + /// \returns const reference to MappedData lookup structure + /// + /// May be empty, check result of HasMappedData. 
+ /// + const PbiRawMappedData& MappedData(void) const; + + /// \returns const reference to reference data lookup structure + /// + /// May be empty, check result of HasReferenceData. + /// + const PbiRawReferenceData& ReferenceData(void) const; + + /// \} + +public: + /// \name PBI General Attributes + /// \{ + + /// \brief Sets the file section flags. + /// + /// \param[in] sections section flags + /// \returns reference to this index + /// + PbiRawData& FileSections(PbiFile::Sections sections); + + /// \brief Sets the number of indexed records. + /// + /// \param[in] num number of records + /// \returns reference to this index + /// + PbiRawData& NumReads(uint32_t num); + + /// \brief Sets PBI file version. + /// + /// \param[in] version file version + /// \returns reference to this index + /// + PbiRawData& Version(PbiFile::VersionEnum version); + + /// \} + +public: + /// \name Raw Data Components + /// \{ + + /// \returns reference to BarcodeData lookup structure + /// + /// May be empty, check result of HasBarcodeData. + /// + PbiRawBarcodeData& BarcodeData(void); + + /// \returns reference to BasicData lookup structure + PbiRawBasicData& BasicData(void); + + /// \returns reference to MappedData lookup structure + /// + /// May be empty, check result of HasMappedData. + /// + PbiRawMappedData& MappedData(void); + + /// \returns reference to reference data lookup structure + /// + /// May be empty, check result of HasReferenceData. 
+ /// + PbiRawReferenceData& ReferenceData(void); + + /// \} + +private: + std::string filename_; + PbiFile::VersionEnum version_; + PbiFile::Sections sections_; + uint32_t numReads_; + PbiRawBarcodeData barcodeData_; + PbiRawMappedData mappedData_; + PbiRawReferenceData referenceData_; + PbiRawBasicData basicData_; +}; + +} // namespace BAM +} // namespace PacBio + +#include "pbbam/internal/PbiRawData.inl" + +#endif // PBIRAWDATA_H diff --git a/include/pbbam/Position.h b/include/pbbam/Position.h new file mode 100644 index 0000000..aece8c2 --- /dev/null +++ b/include/pbbam/Position.h @@ -0,0 +1,66 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file Position.h +/// \brief Defines the Position typedef. +// +// Author: Derek Barnett + +#ifndef POSITION_H +#define POSITION_H + +#include "pbbam/Config.h" + +namespace PacBio { +namespace BAM { + +/// \brief This type is used to refer to genomic positions. +/// \typedef typedef int32_t PacBio::BAM::Position +/// +/// We use a signed integer because SAM/BAM uses the -1 value to indicate +/// unknown or unmapped positions. +/// +typedef int32_t Position; + +/// \brief This constant is widely used as a "missing" or "invalid" position +/// marker. +/// +static const Position UnmappedPosition = Position(-1); + +} // namespace BAM +} // namespace PacBio + +#endif // POSITION_H diff --git a/include/pbbam/ProgramInfo.h b/include/pbbam/ProgramInfo.h new file mode 100644 index 0000000..e137707 --- /dev/null +++ b/include/pbbam/ProgramInfo.h @@ -0,0 +1,222 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file ProgramInfo.h +/// \brief Defines the ProgramInfo class. +// +// Author: Derek Barnett + +#ifndef PROGRAMINFO_H +#define PROGRAMINFO_H + +#include "pbbam/Config.h" +#include +#include + +namespace PacBio { +namespace BAM { + +/// \brief The ProgramInfo class represents a program entry (\@PG) in the SAM +/// header. +/// +class PBBAM_EXPORT ProgramInfo +{ +public: + /// \name Conversion & Validation + /// \{ + + /// \brief Creates a ProgramInfo object from SAM-formatted text.
+ /// + /// \param[in] sam SAM-formatted text + /// \returns program info object + /// + static ProgramInfo FromSam(const std::string& sam); + + /// \brief Converts a ProgramInfo object to its SAM-formatted text. + /// + /// \param[in] prog input ProgramInfo object + /// \returns SAM-formatted text (no trailing newline) + /// + static std::string ToSam(const ProgramInfo& prog); + + /// \} + +public: + /// \name Constructors & Related Methods + /// \{ + + /// \brief Creates an empty program info object. + ProgramInfo(void); + + /// \brief Creates a program info object with an ID. + /// + /// \param[in] id program ID (\@PG:ID) + /// + ProgramInfo(const std::string& id); + + + ProgramInfo(const ProgramInfo& other); + ProgramInfo(ProgramInfo&& other); + ProgramInfo& operator=(const ProgramInfo& other); + ProgramInfo& operator=(ProgramInfo&& other); + ~ProgramInfo(void); + + /// \} + +public: + /// \name Conversion & Validation + /// \{ + + /// \returns true if program info is valid + /// + /// Currently this checks to see that ProgramInfo::Id is not an + /// empty string. + /// + bool IsValid(void) const; + + /// \brief Converts this object to its SAM-formatted text. + /// + /// \returns SAM-formatted text (no trailing newline) + /// + std::string ToSam(void) const; + + /// \} + +public: + /// \name Attributes + /// \{ + + /// \returns string value of \@PG:CL + std::string CommandLine(void) const; + + /// \returns any non-standard tags added to the \@PG entry + /// + /// Result map consists of {tagName => value}.
+ /// + std::map CustomTags(void) const; + + /// \returns string value of \@PG:DS + std::string Description(void) const; + + /// \returns string value of \@PG:ID + std::string Id(void) const; + + /// \returns string value of \@PG:PN + std::string Name(void) const; + + /// \returns string value of \@PG:PP + std::string PreviousProgramId(void) const; + + /// \returns string value of \@PG:VN + std::string Version(void) const; + + /// \} + +public: + /// \name Attributes + /// \{ + + /// \brief Sets the value for \@PG:CL + /// + /// \param[in] cmd new value + /// \returns reference to this object + /// + ProgramInfo& CommandLine(const std::string& cmd); + + /// \brief Sets a new collection of non-standard tags. + /// + /// Custom tag map entries should consist of {tagName => value}. + /// + /// \param[in] custom new tags + /// \returns reference to this object + /// + ProgramInfo& CustomTags(const std::map& custom); + + /// \brief Sets the value for \@PG:DS + /// + /// \param[in] description new value + /// \returns reference to this object + /// + ProgramInfo& Description(const std::string& description); + + /// \brief Sets the value for \@PG:ID + /// + /// \param[in] id new value + /// \returns reference to this object + /// + ProgramInfo& Id(const std::string& id); + + /// \brief Sets the value for \@PG:PN + /// + /// \param[in] name new value + /// \returns reference to this object + /// + ProgramInfo& Name(const std::string& name); + + /// \brief Sets the value for \@PG:PP + /// + /// \param[in] id new value + /// \returns reference to this object + /// + ProgramInfo& PreviousProgramId(const std::string& id); + + /// \brief Sets the value for \@PG:VN + /// + /// \param[in] version new value + /// \returns reference to this object + /// + ProgramInfo& Version(const std::string& version); + + /// \} + +private: + std::string commandLine_; // CL: + std::string description_; // DS: + std::string id_; // ID: * must be unique for valid SAM * + std::string name_; // PN: 
+ std::string previousProgramId_; // PP: + std::string version_; // VN: + + // custom attributes + std::map custom_; // tag => value +}; + +} // namespace BAM +} // namespace PacBio + +#include "pbbam/internal/ProgramInfo.inl" + +#endif // PROGRAMINFO_H diff --git a/include/pbbam/PulseBehavior.h b/include/pbbam/PulseBehavior.h new file mode 100644 index 0000000..79ec0da --- /dev/null +++ b/include/pbbam/PulseBehavior.h @@ -0,0 +1,60 @@ +// Copyright (c) 2016, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file PulseBehavior.h +/// \brief Defines the PulseBehavior enum. +// +// Author: Derek Barnett + +#ifndef PULSEBEHAVIOR_H +#define PULSEBEHAVIOR_H + +namespace PacBio { +namespace BAM { + +/// \brief This enum defines the pulsecall modes supported by BamRecord tag +/// accessors. +/// +enum class PulseBehavior +{ + BASECALLS_ONLY ///< "Squashed" pulses not included, only basecalls. + , ALL ///< All pulses included. +}; + +} // namespace BAM +} // namespace PacBio + +#endif // PULSEBEHAVIOR_H diff --git a/include/pbbam/QNameQuery.h b/include/pbbam/QNameQuery.h new file mode 100644 index 0000000..ad93d03 --- /dev/null +++ b/include/pbbam/QNameQuery.h @@ -0,0 +1,94 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. 
+// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file QNameQuery.h +/// \brief Defines the QNameQuery class. +// +// Author: Derek Barnett + +#ifndef QNAMEQUERY_H +#define QNAMEQUERY_H + +#include "pbbam/internal/QueryBase.h" +#include + +namespace PacBio { +namespace BAM { + +/// \brief The QNameQuery class provides iterable access to a DataSet's records, +/// with each iteration of the query returning a contiguous block of +/// records that share a name. +/// +/// There is no random-access here. It is simply a sequential read-through, +/// grouping contiguous results that share a BamRecord::FullName. +/// +/// \note The name is not ideal - but for legacy reasons, it will remain as-is +/// for now. It will likely become something more explicit, like +/// "SequentialQNameGroupQuery", so that the name "QNameQuery" will be +/// available for a built-in query on a QNAME filter (or whitelist). 
This +/// will make it more consistent with other queries (ReadAccuracyQuery, +/// SubreadLengthQuery, ZmwQuery, etc). +/// +class PBBAM_EXPORT QNameQuery : public internal::IGroupQuery +{ +public: + + /// \brief Creates a new QNameQuery. + /// + /// \param[in] dataset input data source(s) + /// + /// \throws std::runtime_error on failure to open/read underlying %BAM files + /// + QNameQuery(const DataSet& dataset); + ~QNameQuery(void); + +public: + /// \brief Main iteration point for record access. + /// + /// Most client code should not need to use this method directly. Use + /// iterators instead. + /// + bool GetNext(std::vector& records); + +private: + struct QNameQueryPrivate; + std::unique_ptr d_; +}; + +} // namespace BAM +} // namespace PacBio + +#endif // QNAMEQUERY_H diff --git a/include/pbbam/QualityValue.h b/include/pbbam/QualityValue.h new file mode 100644 index 0000000..ab108d0 --- /dev/null +++ b/include/pbbam/QualityValue.h @@ -0,0 +1,115 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. 
THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file QualityValue.h +/// \brief Defines the QualityValue class. +// +// Author: Derek Barnett + +#ifndef QUALITYVALUE_H +#define QUALITYVALUE_H + +#include "pbbam/Config.h" +#include +#include + +namespace PacBio { +namespace BAM { + +/// \brief The QualityValue class represents a FASTQ-compatible quality value. +/// +/// Integers are clamped to [0, 93] (corresponding to ASCII printable chars +/// [!-~]). +/// +/// Use QualityValue::FromFastq for constructing entries from FASTQ encoding +/// characters. Otherwise, the resulting QualityValue will be interpreted using +/// the character's numeric value (ignoring the FASTQ offset of 33). +/// +class PBBAM_EXPORT QualityValue +{ +public: + static const uint8_t MAX; + +public: + /// \name Conversion Methods + /// \{ + + /// \brief Creates a QualityValue from a FASTQ-encoding character. + /// + /// \param[in] c FASTQ character + /// \returns quality value representing (c - 33) + /// + static QualityValue FromFastq(const char c); + + /// \} + +public: + /// \name Constructors & Related Methods + /// \{ + + /// \brief Creates a QualityValue with specified value. 
+ /// + /// \param[in] value quality value + /// + QualityValue(const uint8_t value = 0); + + QualityValue(const QualityValue& other); + ~QualityValue(void); + + /// \} + +public: + /// \name Conversion Methods + /// \{ + + /// \returns the FASTQ-encoding char for this QualityValue + char Fastq(void) const; + + /// \returns the integer value of this QualityValue + operator uint8_t(void) const; + + /// \} + +private: + uint8_t value_; +}; + +} // namespace BAM +} // namespace PacBio + +#include "pbbam/internal/QualityValue.inl" + +#endif // QUALITYVALUE_H diff --git a/include/pbbam/QualityValues.h b/include/pbbam/QualityValues.h new file mode 100644 index 0000000..af054f6 --- /dev/null +++ b/include/pbbam/QualityValues.h @@ -0,0 +1,200 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file QualityValues.h +/// \brief Defines the QualityValues class. +// +// Author: Derek Barnett + +#ifndef QUALITYVALUES_H +#define QUALITYVALUES_H + +#include "pbbam/QualityValue.h" +#include +#include + +namespace PacBio { +namespace BAM { + +/// \brief The QualityValues class represents a sequence of FASTQ-compatible +/// quality values. See QualityValue documentation for more details. +/// +class PBBAM_EXPORT QualityValues : public std::vector +{ +public: + /// \brief Creates a QualityValues object from a FASTQ-encoded string. + /// + /// \param[in] fastq FASTQ-encoded string + /// \returns corresponding QualityValues object + /// + static QualityValues FromFastq(const std::string& fastq); + +public: + /// \name Constructors & Related Methods + /// \{ + + /// \brief Default constructor - creates an empty QualityValues object. + QualityValues(void); + + /// \brief Creates a QualityValues object from a FASTQ-encoded string. + /// + /// \param[in] fastqString FASTQ-encoded string + /// + explicit QualityValues(const std::string& fastqString); + + /// \brief Creates a QualityValues object from a vector of QualityValue + /// elements. + /// + /// \param[in] quals vector of QualityValue elements + /// + explicit QualityValues(const std::vector& quals); + + /// \brief Creates a QualityValues object from a vector of QualityValue + /// elements. 
+ /// + /// \param[in] quals vector of QualityValue elements + /// + QualityValues(std::vector&& quals); + + /// \brief Creates a QualityValues object from a vector of (numeric) quality + /// values. + /// + /// \param[in] quals vector of quality value numbers + /// + explicit QualityValues(const std::vector& quals); + + /// \brief Creates a QualityValues object from the contents of the range: + /// [first, last) + /// + /// \param[in] first input iterator, whose element is a numeric quality + /// \param[in] last input iterator, whose element is a numeric quality + /// + QualityValues(const std::vector::const_iterator first, + const std::vector::const_iterator last); + + /// \brief Creates a QualityValues object from the contents of the range: + /// [first, last) + /// + /// \param[in] first input iterator, whose element is a QualityValue + /// \param[in] last input iterator, whose element is a QualityValue + /// + QualityValues(const QualityValues::const_iterator first, + const QualityValues::const_iterator last); + + /// \brief Copy constructor + QualityValues(const QualityValues& other); + + /// \brief Move constructor + QualityValues(QualityValues&& other); + + /// \brief Copy assignment operator + /// + /// \param[in] other QualityValues object + /// + QualityValues& operator=(const QualityValues& other); + + /// \brief Move assignment operator + /// + /// \param[in] other QualityValues object + /// + QualityValues& operator=(QualityValues&& other); + + /// \brief Copy assignment operator + /// + /// \param[in] quals vector of QualityValue elements + /// + QualityValues& operator=(const std::vector& quals); + + /// \brief Move assignment operator + /// + /// \param[in] quals vector of QualityValue elements + /// + QualityValues& operator=(std::vector&& quals); + + /// \brief Destructor + ~QualityValues(void); + + /// \} + +public: + /// \name Comparison Operators + /// \{ + + bool operator==(const std::string& other) const; + bool operator!=(const std::string& 
other) const; + + /// \} + +public: + /// \name Iterators + /// \{ + + /// \returns a const_iterator to the beginning of the sequence + std::vector::const_iterator cbegin(void) const; + + /// \returns a const_iterator to the element following the last element + std::vector::const_iterator cend(void) const; + + /// \returns a const_iterator to the beginning of the sequence + std::vector::const_iterator begin(void) const; + + /// \returns a const_iterator to the element following the last element + std::vector::const_iterator end(void) const; + + /// \returns an iterator to the beginning of the sequence + std::vector::iterator begin(void); + + /// \returns an iterator to the element following the last element + std::vector::iterator end(void); + + /// \} + +public: + /// \name Conversion Methods + /// \{ + + /// \returns the FASTQ-encoded string for this sequence of quality values + std::string Fastq(void) const; + + /// \} +}; + +} // namespace BAM +} // namespace PacBio + +#include "pbbam/internal/QualityValues.inl" + +#endif // QUALITYVALUES_H diff --git a/include/pbbam/ReadAccuracyQuery.h b/include/pbbam/ReadAccuracyQuery.h new file mode 100644 index 0000000..1eecb6c --- /dev/null +++ b/include/pbbam/ReadAccuracyQuery.h @@ -0,0 +1,104 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. 
+// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file ReadAccuracyQuery.h +/// \brief Defines the ReadAccuracyQuery class. +// +// Author: Derek Barnett + +#ifndef READACCURACYQUERY_H +#define READACCURACYQUERY_H + +#include "pbbam/Accuracy.h" +#include "pbbam/Compare.h" +#include "pbbam/Config.h" +#include "pbbam/internal/QueryBase.h" +#include + +namespace PacBio { +namespace BAM { + +/// \brief The ReadAccuracyQuery class provides iterable access to a DataSet's +/// %BAM records, limiting results to those matching a read accuracy +/// criterion. +/// +/// Example: +/// \include code/ReadAccuracyQuery.txt +/// +/// \note Currently, all %BAM files must have a corresponding ".pbi" index file. +/// Use BamFile::EnsurePacBioIndexExists before creating the query if one +/// may not be present. 
+/// +class PBBAM_EXPORT ReadAccuracyQuery : public internal::IQuery +{ +public: + + /// \brief Creates a new ReadAccuracyQuery, limiting record results to only + /// those matching a read accuracy criterion. + /// + /// \param[in] accuracy read accuracy value + /// \param[in] compareType compare operator + /// \param[in] dataset input data source(s) + /// + /// \sa BamRecord::ReadAccuracy + /// + /// \throws std::runtime_error on failure to open/read underlying %BAM or PBI + /// files. + /// + ReadAccuracyQuery(const Accuracy accuracy, + const Compare::Type compareType, + const DataSet& dataset); + + ~ReadAccuracyQuery(void); + +public: + + /// \brief Main iteration point for record access. + /// + /// Most client code should not need to use this method directly. Use + /// iterators instead. + /// + bool GetNext(BamRecord& r); + +private: + struct ReadAccuracyQueryPrivate; + std::unique_ptr d_; +}; + +} // namespace BAM +} // namespace PacBio + +#endif // READACCURACYQUERY_H diff --git a/include/pbbam/ReadGroupInfo.h b/include/pbbam/ReadGroupInfo.h new file mode 100644 index 0000000..c6c8a5f --- /dev/null +++ b/include/pbbam/ReadGroupInfo.h @@ -0,0 +1,636 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. 
+// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file ReadGroupInfo.h +/// \brief Defines the ReadGroupInfo class. +// +// Author: Derek Barnett + +#ifndef READGROUPINFO_H +#define READGROUPINFO_H + +#include "pbbam/Config.h" +#include "pbbam/exception/InvalidSequencingChemistryException.h" +#include +#include + +namespace PacBio { +namespace BAM { + +/// \brief This enum describes the base features that may be present in a read +/// group's records. +/// +/// This information is stored in its description (\@RG:DS). 
+/// +enum class BaseFeature +{ + DELETION_QV + , DELETION_TAG + , INSERTION_QV + , MERGE_QV + , SUBSTITUTION_QV + , SUBSTITUTION_TAG + , IPD + , PULSE_WIDTH + , PKMID + , PKMEAN + , PKMID2 + , PKMEAN2 + , LABEL + , LABEL_QV + , ALT_LABEL + , ALT_LABEL_QV + , PULSE_MERGE_QV + , PULSE_CALL + , PRE_PULSE_FRAMES + , PULSE_CALL_WIDTH + , START_FRAME +}; + +/// \brief This enum describes the encoding types used for frame data within a +/// read group's records. +/// +/// This information is stored in its description (\@RG:DS). +/// +enum class FrameCodec +{ + RAW + , V1 +}; + +/// \brief This enum describes the experimental design of the barcodes within a +/// read group's records. +/// +/// This information is stored in its description (\@RG:DS). +/// +enum class BarcodeModeType +{ + NONE + , SYMMETRIC + , ASYMMETRIC +}; + +/// \brief This enum describes the type of value encoded by barcode quality, +/// within a read group's records. +/// +/// This information is stored in its description (\@RG:DS). +/// +enum class BarcodeQualityType +{ + NONE + , SCORE + , PROBABILITY +}; + +/// \brief This enum describes the instrument type / platform model, +/// within a read group's records. +/// +/// This information is stored in its description (\@RG:PM). +/// +enum class PlatformModelType +{ + ASTRO + , RS + , SEQUEL +}; + +/// \brief The ReadGroupInfo class represents a read group entry (\@RG) in the +/// SAM header. +/// +class PBBAM_EXPORT ReadGroupInfo +{ +public: + /// \name Conversion & Validation + /// + + /// \brief Creates a ReadGroupInfo object from SAM-formatted text. + /// + /// \param[in] sam SAM-formatted text + /// \returns read group info object + /// + static ReadGroupInfo FromSam(const std::string& sam); + + /// \brief Converts a ReadGroupInfo object to its SAM-formatted text. 
+ /// + /// \param[in] rg input ReadGroupInfo object + /// \returns SAM-formatted text (no trailing newline) + /// + static std::string ToSam(const ReadGroupInfo& rg); + + /// \brief Converts a read group ID (string) to its numeric value. + /// + /// \param[in] rgId read group ID string + /// \returns numeric value of ID + /// + static int32_t IdToInt(const std::string& rgId); + + /// \brief Converts a read group ID number to its string representation. + /// + /// \param[in] id read group ID number + /// \returns hexadecimal string representation of ID + /// + static std::string IntToId(const int32_t id); + + /// \returns sequencing chemistry from (bindingKit, sequencingKit, + /// basecallerVersion) + /// + static std::string SequencingChemistryFromTriple(const std::string& bindingKit, + const std::string& sequencingKit, + const std::string& basecallerVersion); + + /// \} + +public: + /// \name Constructors & Related Methods + /// \{ + + /// \brief Creates an empty read group info object. + ReadGroupInfo(void); + + /// \brief Creates a read group info object with an ID. + /// + /// \param[in] id string representation of read group ID + /// + ReadGroupInfo(const std::string& id); + + /// \brief Creates a read group info object from a movie name & read type. + /// + /// \param[in] movieName sequencing movie name + /// \param[in] readType string version of record type + /// + /// \sa RecordType + /// + ReadGroupInfo(const std::string& movieName, + const std::string& readType); + + /// \brief Creates a read group info object from a movie name, read type, + /// and platform model. 
+ /// + /// \param[in] movieName sequencing movie name + /// \param[in] readType string version of record type + /// \param[in] platform platform model type + /// + /// \sa RecordType + /// + ReadGroupInfo(const std::string& movieName, + const std::string& readType, + const PlatformModelType platform); + + ReadGroupInfo(const ReadGroupInfo& other); + ReadGroupInfo(ReadGroupInfo&& other); + ReadGroupInfo& operator=(const ReadGroupInfo& other); + ReadGroupInfo& operator=(ReadGroupInfo&& other); + ~ReadGroupInfo(void); + + /// \} + +public: + /// \name Comparison Operators + /// \{ + + bool operator==(const ReadGroupInfo& other) const; + + /// \} + +public: + /// \name Conversion & Validation + /// \{ + + /// \returns true if read group info is valid + /// + /// Currently this checks to see that ReadGroupInfo::Id does not contain an + /// empty string. + /// + bool IsValid(void) const; + + /// \brief Converts this object to its SAM-formatted text. + /// + /// \returns SAM-formatted text (no trailing newline) + /// + std::string ToSam(void) const; + + /// \} + +public: + /// \name Attributes + /// \{ + + /// \returns the number of barcode sequences in BarcodeFile + /// + /// \throws std::runtime_error if barcode data not set. + /// Check HasBarcodeData if this data may be absent. + /// + size_t BarcodeCount(void) const; + + /// \returns name of FASTA file containing barcode sequences + /// + /// \throws std::runtime_error if barcode data not set. + /// Check HasBarcodeData if this data may be absent. + /// + std::string BarcodeFile(void) const; + + /// \returns MD5 hash of the contents of BarcodeFile + /// + /// \throws std::runtime_error if barcode data not set. + /// Check HasBarcodeData if this data may be absent. + /// + std::string BarcodeHash(void) const; + + /// \returns experimental design type of barcodes + /// + /// \throws std::runtime_error if barcode data not set. + /// Check HasBarcodeData if this data may be absent. 
+ /// + BarcodeModeType BarcodeMode(void) const; + + /// \returns type of value encoded in the 'bq' tag + /// + /// \throws std::runtime_error if barcode data is not set. + /// Check HasBarcodeData if this data may be absent. + /// + BarcodeQualityType BarcodeQuality(void) const; + + /// \returns basecaller version number (e.g. "2.1") + std::string BasecallerVersion(void) const; + + /// \returns tag name in use for the specified base feature + std::string BaseFeatureTag(const BaseFeature& feature) const; + + /// \returns binding kit part number (e.g. "100236500") + std::string BindingKit(void) const; + + /// \returns true if reads are classified as spike-in controls + bool Control(void) const; + + /// \returns any non-standard tags added to the \@RG entry + /// + /// Result map consists of {tagName => value}. + /// + std::map CustomTags(void) const; + + /// \returns string value of \@RG:DT + std::string Date(void) const; + + /// \returns string value of \@RG:FO + std::string FlowOrder(void) const; + + /// \returns frame rate in Hz + std::string FrameRateHz(void) const; + + /// \returns true if read group has barcode data + bool HasBarcodeData(void) const; + + /// \returns true if read group has an entry for the specified base feature + bool HasBaseFeature(const BaseFeature& feature) const; + + /// \returns string value of \@RG:ID + std::string Id(void) const; + + /// \returns codec type in use for IPD + FrameCodec IpdCodec(void) const; + + /// \returns string value of \@RG:KS + std::string KeySequence(void) const; + + /// \returns string value of \@RG:LB + std::string Library(void) const; + + /// \returns movie name (stored in \@RG:PU) + std::string MovieName(void) const; + + /// \returns string value of \@RG:PL + std::string Platform(void) const; + + /// \returns platform model type (stored in \@RG:PM) + PlatformModelType PlatformModel(void) const; + + /// \returns string value of \@RG:PI + std::string PredictedInsertSize(void) const; + + /// \returns string value of \@RG:PG 
+ std::string Programs(void) const; + + /// \returns codec type in use for PulseWidth + FrameCodec PulseWidthCodec(void) const; + + /// \returns string value of read type + std::string ReadType(void) const; + + /// \returns string value of \@RG:SM + std::string Sample(void) const; + + /// \returns string value of \@RG:CN + std::string SequencingCenter(void) const; + + /// \returns sequencing chemistry name + std::string SequencingChemistry(void) const; + + /// \returns sequencing kit part number + std::string SequencingKit(void) const; + + /// \} + +public: + /// \name Attributes + /// \{ + + /// \brief Sets read group's barcode data. + /// + /// Barcode fields are either absent or all must be present. + /// + /// \param[in] barcodeFile barcode filename + /// \param[in] barcodeHash MD5 hash of barcode file + /// \param[in] barcodeCount number of records in barcode file + /// \param[in] barcodeMode experimental design of barcodes + /// \param[in] barcodeQuality type of barcode quality value + /// + /// \sa BarcodeFile \n + /// BarcodeHash \n + /// BarcodeCount \n + /// BarcodeMode \n + /// BarcodeQuality \n + /// ReadGroupInfo::ClearBarcodeData + /// + /// \returns reference to this object + /// + ReadGroupInfo& BarcodeData(const std::string& barcodeFile, + const std::string& barcodeHash, + size_t barcodeCount, + BarcodeModeType barcodeMode, + BarcodeQualityType barcodeQuality); + + /// \brief Sets the basecaller version number. + /// + /// \param[in] versionNumber new value + /// \returns reference to this object + /// + ReadGroupInfo& BasecallerVersion(const std::string& versionNumber); + + /// \brief Sets the tag to be used for a particular base feature. + /// + /// \param[in] feature feature type being updated + /// \param[in] tag new value + /// \returns reference to this object + /// + ReadGroupInfo& BaseFeatureTag(const BaseFeature& feature, + const std::string& tag); + + /// \brief Sets the binding kit part number. 
+ /// + /// \param[in] kitNumber new value + /// \returns reference to this object + /// + ReadGroupInfo& BindingKit(const std::string& kitNumber); + + /// \brief Removes all barcode data from this read group. + /// + /// \returns reference to this read group + /// + ReadGroupInfo& ClearBarcodeData(void); + + /// \brief Removes all base features from this read group. + /// + /// \returns reference to this read group + /// + ReadGroupInfo& ClearBaseFeatures(void); + + /// \brief Sets whether read group's records are classified as spike-in + /// controls. + /// + /// \param[in] ctrl true if records are spike-in controls + /// \returns reference to this object + /// + ReadGroupInfo& Control(const bool ctrl); + + /// \brief Sets a new collection of non-standard tags. + /// + /// Custom tag map entries should consist of {tagName => value}. + /// + /// \param[in] custom new tags + /// \returns reference to this object + /// + ReadGroupInfo& CustomTags(const std::map& custom); + + /// \brief Sets the value for \@RG:DT + /// + /// \param[in] date new value + /// \returns reference to this object + /// + ReadGroupInfo& Date(const std::string& date); + + /// \brief Sets the value for \@RG:FO + /// + /// \param[in] order new value + /// \returns reference to this object + /// + ReadGroupInfo& FlowOrder(const std::string& order); + + /// \brief Sets the frame rate. + /// + /// \param[in] frameRateHz string value of frame rate in Hz + /// \returns reference to this object + /// + ReadGroupInfo& FrameRateHz(const std::string& frameRateHz); + + /// \brief Sets the read group's ID. 
+ /// + /// \param[in] id string value of ID + /// \returns reference to this object + /// + ReadGroupInfo& Id(const std::string& id); + + /// \brief Sets the read group's ID, from movie name & read type + /// + /// \param[in] movieName sequencing movie name + /// \param[in] readType string version of read type + /// \returns reference to this object + /// + ReadGroupInfo& Id(const std::string& movieName, + const std::string& readType); + + /// \brief Sets the codec type used for IPD + /// + /// \param[in] codec codec type + /// \param[in] tag IPD tag + /// \returns reference to this object + /// + ReadGroupInfo& IpdCodec(const FrameCodec& codec, + const std::string& tag = std::string()); + + /// \brief Sets the value for \@RG:KS + /// + /// \param[in] sequence new value + /// \returns reference to this object + /// + ReadGroupInfo& KeySequence(const std::string& sequence); + + /// \brief Sets the value for \@RG:LB + /// + /// \param[in] library new value + /// \returns reference to this object + /// + ReadGroupInfo& Library(const std::string& library); + + /// \brief Sets the value for movie name (stored in \@RG:PU). 
+ /// + /// \param[in] movieName new value + /// \returns reference to this object + /// + ReadGroupInfo& MovieName(const std::string& movieName); + + /// \brief Sets the value for \@RG:PI + /// + /// \param[in] size new value + /// \returns reference to this object + /// + ReadGroupInfo& PredictedInsertSize(const std::string& size); + + /// \brief Sets the value for \@RG:PG + /// + /// \param[in] programs new value + /// \returns reference to this object + /// + ReadGroupInfo& Programs(const std::string& programs); + + /// \brief Sets the value for \@RG:PM + /// + /// \param[in] platform new value + /// \returns reference to this object + /// + ReadGroupInfo& PlatformModel(const PlatformModelType& platform); + + /// \brief Sets the codec type used for PulseWidth + /// + /// \param[in] codec codec type + /// \param[in] tag pulse width tag + /// \returns reference to this object + /// + ReadGroupInfo& PulseWidthCodec(const FrameCodec& codec, + const std::string& tag = std::string()); + + /// \brief Sets the read type. + /// + /// \param[in] type new value + /// \returns reference to this object + /// + ReadGroupInfo& ReadType(const std::string& type); + + /// \brief Removes a particular base feature from this read group. + /// + /// \param[in] feature feature to remove + /// \returns reference to this object + /// + ReadGroupInfo& RemoveBaseFeature(const BaseFeature& feature); + + /// \brief Sets the value for \@RG:SM + /// + /// \param[in] sample new value + /// \returns reference to this object + /// + ReadGroupInfo& Sample(const std::string& sample); + + /// \brief Sets the value for \@RG:CN + /// + /// \param[in] center new value + /// \returns reference to this object + /// + ReadGroupInfo& SequencingCenter(const std::string& center); + + /// \brief Sets the sequencing kit part number. 
+ /// + /// \param[in] kitNumber new value + /// \returns reference to this object + /// + ReadGroupInfo& SequencingKit(const std::string& kitNumber); + + /// \} + +private: + std::string id_; // ID * must be unique for valid SAM * + std::string sequencingCenter_; // CN + std::string date_; // DT * (ISO-8601) * + std::string flowOrder_; // FO + std::string keySequence_; // KS + std::string library_; // LB + std::string programs_; // PG + std::string predictedInsertSize_; // PI + std::string movieName_; // PU + std::string sample_; // SM + + PlatformModelType platformModel_; // PM + + // DS: components + std::string readType_; + std::string bindingKit_; + std::string sequencingKit_; + std::string basecallerVersion_; + std::string frameRateHz_; + bool control_ = false; + FrameCodec ipdCodec_; + FrameCodec pulseWidthCodec_; + bool hasBarcodeData_ = false; + std::string barcodeFile_; + std::string barcodeHash_; + size_t barcodeCount_ = 0; + BarcodeModeType barcodeMode_ = BarcodeModeType::NONE; + BarcodeQualityType barcodeQuality_ = BarcodeQualityType::NONE; + std::map features_; + + // custom attributes + std::map custom_; // tag => value + +private: + std::string EncodeSamDescription(void) const; + void DecodeSamDescription(const std::string& description); +}; + +/// \brief Creates a read group ID from a movie name & read type. +/// +/// \param[in] movieName sequencing movie name +/// \param[in] readType string version of read type +/// +/// \returns hexadecimal string read group ID +/// +PBBAM_EXPORT +std::string MakeReadGroupId(const std::string& movieName, + const std::string& readType); + +} // namespace BAM +} // namespace PacBio + +#include "pbbam/internal/ReadGroupInfo.inl" + +#endif // READGROUPINFO_H diff --git a/include/pbbam/RecordType.h b/include/pbbam/RecordType.h new file mode 100644 index 0000000..9688211 --- /dev/null +++ b/include/pbbam/RecordType.h @@ -0,0 +1,67 @@ +// Copyright (c) 2016, Pacific Biosciences of California, Inc. 
+// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file RecordType.h +/// \brief Defines the RecordType enum. +// +// Author: Derek Barnett + +#ifndef RECORDTYPE_H +#define RECORDTYPE_H + +namespace PacBio { +namespace BAM { + +/// \brief This enum defines the possible PacBio BAM record types. 
+/// +/// \sa ReadGroupInfo::ReadType +/// +enum class RecordType +{ + ZMW ///< Polymerase read + , HQREGION ///< High-quality region + , SUBREAD ///< Subread + , CCS ///< Circular consensus sequence + , SCRAP ///< Additional sequence (barcodes, adapters, etc.) + , UNKNOWN ///< Unknown read type + + , POLYMERASE = ZMW ///< \deprecated as of PacBio BAM spec v 3.0.4 (use RecordType::ZMW instead) +}; + +} // namespace BAM +} // namespace PacBio + +#endif // RECORDTYPE_H diff --git a/include/pbbam/SamTagCodec.h b/include/pbbam/SamTagCodec.h new file mode 100644 index 0000000..cc4def4 --- /dev/null +++ b/include/pbbam/SamTagCodec.h @@ -0,0 +1,82 @@ +// Copyright (c) 2014, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED.
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file SamTagCodec.h +/// \brief Defines the SamTagCodec class. +// +// Author: Derek Barnett + +#ifndef SAMTAGCODEC_H +#define SAMTAGCODEC_H + +#include "pbbam/Config.h" +#include "pbbam/TagCollection.h" +#include + +namespace PacBio { +namespace BAM { + +/// \brief The SamTagCodec class provides text-based encoding/decoding of %BAM +/// tag data. +/// +/// \note SamTagCodec is mostly an implementation and/or testing detail, and may +/// be removed from the public API. +/// +class PBBAM_EXPORT SamTagCodec +{ +public: + /// \name Tag Collection Methods + /// \{ + + /// \brief Creates a TagCollection from SAM-formatted tag data. + /// + /// \param[in] tagString SAM-formatted string + /// \returns resulting tag collection + /// + static TagCollection Decode(const std::string& tagString); + + /// \brief Creates SAM-formatted string from a TagCollection. + /// + /// \param[in] tags TagCollection containing tag data + /// \returns SAM-formatted string + /// + static std::string Encode(const PacBio::BAM::TagCollection& tags); +}; + +} // namespace BAM +} // namespace PacBio + +#endif // SAMTAGCODEC_H diff --git a/include/pbbam/SamWriter.h b/include/pbbam/SamWriter.h new file mode 100644 index 0000000..b407d7e --- /dev/null +++ b/include/pbbam/SamWriter.h @@ -0,0 +1,130 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file SamWriter.h +/// \brief Defines the SamWriter class. 
+// +// Author: Derek Barnett + +#ifndef SAMWRITER_H +#define SAMWRITER_H + +#include "pbbam/BamHeader.h" +#include "pbbam/BamRecord.h" +#include "pbbam/IRecordWriter.h" +#include +#include + +namespace PacBio { +namespace BAM { + +namespace internal { class SamWriterPrivate; } + +/// \brief The SamWriter class provides a writing interface for creating +/// new SAM files. +/// +/// \note The underlying buffered data may not be flushed to the file until the +/// destructor is called. Trying to access the file (reading, stat-ing, +/// indexing, etc.) before the SamWriter is destroyed yields undefined +/// behavior. Enclose the SamWriter in some form of local scope (curly +/// braces, a separate function, etc.) to ensure that its destructor is +/// called before proceeding to read-based operations. +/// +/// \code{.cpp} +/// { +/// SamWriter w(...); +/// // write data +/// } +/// // now safe to access the new file +/// \endcode +/// +/// +class SamWriter : public IRecordWriter +{ +public: + /// \brief Opens a SAM file for writing & writes the header information. + /// + /// \note Set \p filename to "-" for stdout. + /// + /// \param[in] filename path to output SAM file + /// \param[in] header BamHeader object + /// + /// \throws std::runtime_error if there was a problem opening the file for + /// writing or if an error occurred while writing the header + /// + SamWriter(const std::string& filename, const BamHeader& header); + + /// Fully flushes all buffered data & closes file. + /// + ~SamWriter(void); + +public: + + /// \brief Try to flush any buffered data to file. + /// + /// \note The underlying implementation may not necessarily flush buffered + /// data immediately, especially in a multithreaded writer situation. + /// Let the SamWriter go out of scope to fully ensure flushing. + /// + /// \throws std::runtime_error if flush fails + /// + void TryFlush(void) override; + + /// \brief Write a record to the output SAM file. 
+ /// + /// \param[in] record BamRecord object + /// + /// \throws std::runtime_error on failure to write + /// + void Write(const BamRecord& record) override; + + /// \brief Write a record to the output SAM file. + /// + /// \param[in] recordImpl BamRecordImpl object + /// + /// \throws std::runtime_error on failure to write + /// + void Write(const BamRecordImpl& recordImpl) override; + +private: + std::unique_ptr d_; + DISABLE_MOVE_AND_COPY(SamWriter); +}; + +} // namespace BAM +} // namespace PacBio + +#endif // SAMWRITER_H diff --git a/include/pbbam/SequenceInfo.h b/include/pbbam/SequenceInfo.h new file mode 100644 index 0000000..88b8dd1 --- /dev/null +++ b/include/pbbam/SequenceInfo.h @@ -0,0 +1,232 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED.
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file SequenceInfo.h +/// \brief Defines the SequenceInfo class. +// +// Author: Derek Barnett + +#ifndef SEQUENCEINFO_H +#define SEQUENCEINFO_H + +#include "pbbam/Config.h" +#include +#include + +namespace PacBio { +namespace BAM { + +/// \brief The SequenceInfo class represents a sequence entry (\@SQ) in the SAM +/// header. +/// +class PBBAM_EXPORT SequenceInfo +{ +public: + /// \name Conversion & Validation + /// \{ + + /// \brief Creates a SequenceInfo object from SAM-formatted text. + /// + /// \param[in] sam SAM-formatted text + /// \returns sequence info object + /// + static SequenceInfo FromSam(const std::string& sam); + + /// \brief Converts a SequenceInfo object to its SAM-formatted text. + /// + /// \param[in] seq input SequenceInfo object + /// \returns SAM-formatted text (no trailing newline) + /// + static std::string ToSam(const SequenceInfo& seq); + + /// \} + +public: + /// \name Constructors & Related Methods + /// \{ + + /// \brief Creates an empty sequence info object. + SequenceInfo(void); + + /// \brief Creates a sequence info object with name & (optional) length.
+ /// + /// \param[in] name sequence name (\@SQ:SN) + /// \param[in] length sequence length (\@SQ:LN) + /// + SequenceInfo(const std::string& name, + const std::string& length = "0"); + + SequenceInfo(const SequenceInfo& other); + SequenceInfo(SequenceInfo&& other); + SequenceInfo& operator=(const SequenceInfo& other); + SequenceInfo& operator=(SequenceInfo&& other); + ~SequenceInfo(void); + + /// \} + +public: + /// \name Operators + /// \{ + + bool operator==(const SequenceInfo& other) const; + bool operator!=(const SequenceInfo& other) const; + + /// \} + +public: + /// \name Conversion & Validation + /// \{ + + /// \returns true if sequence info is valid + /// + /// Currently this checks to see that Name is non-empty and Length is within + /// the accepted range. + /// + bool IsValid(void) const; + + /// \brief Converts this object to its SAM-formatted text. + /// + /// \returns SAM-formatted text (no trailing newline) + /// + std::string ToSam(void) const; + + /// \} + +public: + /// \name Attributes + /// \{ + + /// \returns string value of \@SQ:AS + std::string AssemblyId(void) const; + + /// \returns string value of \@SQ:M5 + std::string Checksum(void) const; + + /// \returns any non-standard tags added to the \@SQ entry + /// + /// Result map consists of {tagName => value}.
+ /// + std::map CustomTags(void) const; + + /// \returns string value of \@SQ:LN + std::string Length(void) const; + + /// \returns string value of \@SQ:SN + std::string Name(void) const; + + /// \returns string value of \@SQ:SP + std::string Species(void) const; + + /// \returns string value of \@SQ:UR + std::string Uri(void) const; + + /// \} + +public: + /// \name Attributes + /// \{ + + /// \brief Sets the value for \@SQ:AS + /// + /// \param[in] id new value + /// \returns reference to this object + /// + SequenceInfo& AssemblyId(const std::string& id); + + /// \brief Sets the value for \@SQ:M5 + /// + /// \param[in] checksum new value + /// \returns reference to this object + /// + SequenceInfo& Checksum(const std::string& checksum); + + /// \brief Sets a new collection of non-standard tags. + /// + /// Custom tag map entries should consist of {tagName => value}. + /// + /// \param[in] custom new tags + /// \returns reference to this object + /// + SequenceInfo& CustomTags(const std::map& custom); + + /// \brief Sets the value for \@SQ:LN + /// + /// \param[in] length new value + /// \returns reference to this object + /// + SequenceInfo& Length(const std::string& length); + + /// \brief Sets the value for \@SQ:SN + /// + /// \param[in] name new value + /// \returns reference to this object + /// + SequenceInfo& Name(const std::string& name); + + /// \brief Sets the value for \@SQ:SP + /// + /// \param[in] species new value + /// \returns reference to this object + /// + SequenceInfo& Species(const std::string& species); + + /// \brief Sets the value for \@SQ:UR + /// + /// \param[in] uri new value + /// \returns reference to this object + /// + SequenceInfo& Uri(const std::string& uri); + + /// \} + +private: + std::string name_; // SN: * must be unique for valid SAM * + std::string length_; // LN: * must be within [0 - 2^31-1] * + std::string assemblyId_; // AS: + std::string checksum_; // M5: + std::string species_; // SP: + std::string uri_; // UR: + + 
// custom attributes + std::map custom_; // tag => value +}; + +} // namespace BAM +} // namespace PacBio + +#include "pbbam/internal/SequenceInfo.inl" + +#endif // SEQUENCEINFO_H diff --git a/include/pbbam/Strand.h b/include/pbbam/Strand.h new file mode 100644 index 0000000..6fa5043 --- /dev/null +++ b/include/pbbam/Strand.h @@ -0,0 +1,62 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file Strand.h +/// \brief Defines the Strand enum. +// +// Author: Derek Barnett + +#ifndef STRAND_H +#define STRAND_H + +#include "pbbam/Config.h" + +namespace PacBio { +namespace BAM { + +/// \brief This enum defines the strand orientations used for reporting +/// alignment-related information. +/// +enum class Strand +{ + FORWARD ///< Forward strand + , REVERSE ///< Reverse strand +}; + +} // namespace BAM +} // namespace PacBio + +#endif // STRAND_H diff --git a/include/pbbam/SubreadLengthQuery.h b/include/pbbam/SubreadLengthQuery.h new file mode 100644 index 0000000..e8839fe --- /dev/null +++ b/include/pbbam/SubreadLengthQuery.h @@ -0,0 +1,99 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. 
+// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file SubreadLengthQuery.h +/// \brief Defines the SubreadLengthQuery class. +// +// Author: Derek Barnett + +#ifndef SUBREADLENGTHQUERY_H +#define SUBREADLENGTHQUERY_H + +#include "pbbam/Compare.h" +#include "pbbam/Config.h" +#include "pbbam/internal/QueryBase.h" +#include + +namespace PacBio { +namespace BAM { + +/// \brief The SubreadLengthQuery class provides iterable access to a DataSet's +/// %BAM records, limiting results to those matching a subread length +/// criterion. +/// +/// Example: +/// \include code/SubreadLengthQuery.txt +/// +/// \note Currently, all %BAM files must have a corresponding ".pbi" index file. +/// Use BamFile::EnsurePacBioIndexExists before creating the query if one +/// may not be present. 
+/// +class PBBAM_EXPORT SubreadLengthQuery : public internal::IQuery +{ +public: + /// \brief Creates a new SubreadLengthQuery, limiting record results to only + /// those matching a subread length criterion. + /// + /// \param[in] length subread length value + /// \param[in] compareType compare operator + /// \param[in] dataset input data source(s) + /// + /// \throws std::runtime_error on failure to open/read underlying %BAM or PBI + /// files. + /// + SubreadLengthQuery(const int32_t length, + const Compare::Type compareType, + const DataSet& dataset); + + ~SubreadLengthQuery(void); + +public: + /// \brief Main iteration point for record access. + /// + /// Most client code should not need to use this method directly. Use + /// iterators instead. + /// + bool GetNext(BamRecord& r); + +private: + struct SubreadLengthQueryPrivate; + std::unique_ptr d_; +}; + +} // namespace BAM +} // namespace PacBio + +#endif // SUBREADLENGTHQUERY_H diff --git a/include/pbbam/Tag.h b/include/pbbam/Tag.h new file mode 100644 index 0000000..0520e38 --- /dev/null +++ b/include/pbbam/Tag.h @@ -0,0 +1,453 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. 
+// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file Tag.h +/// \brief Defines the Tag class. +// +// Author: Derek Barnett + +#ifndef TAG_H +#define TAG_H + +#include "pbbam/Config.h" +#include +#include +#include + +namespace PacBio { +namespace BAM { + +/// \brief This enum is used to describe the exact (C++) data type held by a +/// Tag. +/// +enum class TagDataType +{ + INVALID = 0 ///< boost::blank + , INT8 ///< int8_t + , UINT8 ///< uint8_t + , INT16 ///< int16_t + , UINT16 ///< uint16_t + , INT32 = 5 ///< int32_t + , UINT32 ///< uint32_t + , FLOAT ///< float + , STRING ///< std::string + , INT8_ARRAY ///< std::vector + , UINT8_ARRAY = 10 ///< std::vector + , INT16_ARRAY ///< std::vector + , UINT16_ARRAY ///< std::vector + , INT32_ARRAY ///< std::vector + , UINT32_ARRAY ///< std::vector + , FLOAT_ARRAY = 15 ///< std::vector +}; + +/// \brief This enum provides additional instructions on interpreting the tag's +/// value. +/// +/// Some C++ data types (e.g. std::string) may represent more than one BAM tag +/// type ('H' vs 'Z'). 
Thus a TagModifier may be used to indicate how to +/// properly distinguish between these shared data types. +/// +enum class TagModifier +{ + /// \brief This value indicates that the tag has no modifiers set. + /// + NONE = 0, + + /// \brief This modifier marks an integer as ASCII. + /// + /// SAM/BAM has the concept of an ASCII character that is distinct from an + /// 8-bit integer. However, there is no such pure separation in C++ - as + /// int8_t/uint8_t are likely implemented as typedefs around char/unsigned + /// char. Thus this modifier can be used to indicate a tag's value should be + /// interpreted as a printable, ASCII character. + /// + ASCII_CHAR, + + /// \brief This modifier marks std::string data as "hex string", rather than + /// a regular string. + /// + /// SAM/BAM has a distinction between regular strings and "Hex format" + /// strings. However, they are both manipulated in C++ via std::string. Thus + /// this modifier can be used to indicate that a tag's string data should be + /// interpreted as "Hex format" rather than a regular, literal string. + /// + HEX_STRING +}; + +/// \brief The Tag class represents a SAM/BAM record tag value. +/// +/// SAM/BAM tags may store values from a variety of types: varying fixed-width +/// integers, strings, arrays of data, etc. +/// +/// The Tag class allows tags to be handled in a generic fashion, while +/// maintaining a high level of type-safety. Only those types recognized by the +/// SAM/BAM format are allowed, and extracting the value from a tag is subject +/// to allowed conversion rules, as well. +/// +// Inspired by (but greatly simplified & modified from) the boost::variant +// wrapper approach taken by DynamicCpp (https://code.google.com/p/dynamic-cpp) +// +class PBBAM_EXPORT Tag +{ +public: + /// \name Constructors & Related Methods + /// \{ + + /// \brief Creates an empty, null tag + Tag(void); + + /// \brief Creates a Tag from a signed 8-bit integer or character.
+ /// + /// Without a TagModifier, the resulting Tag will be annotated as containing + /// an 8-bit integer, whether the input \p value was an integer or a char. + /// For ASCII tags, use one of these methods: + /// \include code/Tag_AsciiCtor.txt + /// + Tag(int8_t value); + + /// \brief Creates a Tag from a signed 8-bit integer or character, + /// applying the provided modifier. + /// + /// This method allows direct construction of an ASCII character, rather + /// than an 8-bit integer (e.g. Tag('A', TagModifier::ASCII_CHAR) ). + /// + /// \throws runtime_error if \p mod is not valid for int8_t data + /// + Tag(int8_t value, const TagModifier mod); + + /// \brief Creates a Tag from an unsigned 8-bit integer or character. + /// + /// Without a TagModifier, the resulting Tag will be annotated as containing + /// an 8-bit unsigned integer, whether the input \p value was an integer or + /// a char. For ASCII tags, use one of these methods: + /// \include code/Tag_AsciiCtor.txt + /// + Tag(uint8_t value); + + /// \brief Creates a Tag from 16-bit integer. + Tag(int16_t value); + + /// \brief Creates a Tag from 16-bit unsigned integer. + Tag(uint16_t value); + + /// \brief Creates a Tag from 32-bit signed integer. + Tag(int32_t value); + + /// \brief Creates a Tag from 32-bit unsigned integer. + Tag(uint32_t value); + + /// \brief Creates a Tag from floating-point value. + Tag(float value); + + /// \brief Creates a Tag from string data. + Tag(const std::string& value); + + /// \brief Creates a Tag from string data, adding modifier. + /// + /// \throws runtime_error if \p mod is not valid for string data + /// + Tag(const std::string& value, const TagModifier mod); + + /// \brief Creates a Tag from a vector of 8-bit integers. + Tag(const std::vector& value); + + /// \brief Creates a Tag from a vector of 8-bit unsigned integers. + Tag(const std::vector& value); + + /// \brief Creates a Tag from a vector of 16-bit integers.
+ Tag(const std::vector& value); + + /// \brief Creates a Tag from a vector of 16-bit unsigned integers. + Tag(const std::vector& value); + + /// \brief Creates a Tag from a vector of 32-bit integers. + Tag(const std::vector& value); + + /// \brief Creates a Tag from a vector of 32-bit unsigned integers. + Tag(const std::vector& value); + + /// \brief Creates a Tag from a vector of floating-point values. + Tag(const std::vector& value); + + Tag(const Tag& other); + Tag(Tag&& other); + ~Tag(void); + + Tag& operator=(boost::blank value); + Tag& operator=(int8_t value); + Tag& operator=(uint8_t value); + Tag& operator=(int16_t value); + Tag& operator=(uint16_t value); + Tag& operator=(int32_t value); + Tag& operator=(uint32_t value); + Tag& operator=(float value); + Tag& operator=(const std::string& value); + Tag& operator=(const std::vector& value); + Tag& operator=(const std::vector& value); + Tag& operator=(const std::vector& value); + Tag& operator=(const std::vector& value); + Tag& operator=(const std::vector& value); + Tag& operator=(const std::vector& value); + Tag& operator=(const std::vector& value); + Tag& operator=(const Tag& other); + Tag& operator=(Tag&& other); + + bool operator== (const Tag& other) const; + bool operator!= (const Tag& other) const; + + /// \} + +public: + /// \name Data Conversion & Validation + /// \{ + + /// \brief Converts the tag value to an ASCII character. + /// + /// Tag must hold an integral type, within the valid ASCII range [33-127].
+ /// + /// \returns ASCII character value + /// \throws std::runtime_error if not ASCII-compatible + /// + char ToAscii(void) const; + + /// \returns tag data as signed 8-bit (casting if needed) + /// \throws std::runtime_error if not integral data, or out of valid range + int8_t ToInt8(void) const; + + /// \returns tag data as unsigned 8-bit (casting if needed) + /// \throws std::runtime_error if not integral data, or out of valid range + uint8_t ToUInt8(void) const; + + /// \returns tag data as signed 16-bit (casting if needed) + /// \throws std::runtime_error if not integral data, or out of valid range + int16_t ToInt16(void) const; + + /// \returns tag data as unsigned 16-bit (casting if needed) + /// \throws std::runtime_error if not integral data, or out of valid range + uint16_t ToUInt16(void) const; + + /// \returns tag data as signed 32-bit (casting if needed) + /// \throws std::runtime_error if not integral data, or out of valid range + int32_t ToInt32(void) const; + + /// \returns tag data as unsigned 32-bit (casting if needed) + /// \throws std::runtime_error if not integral data, or out of valid range + uint32_t ToUInt32(void) const; + + /// \returns tag data as float + /// \throws std::runtime_error if tag does not contain a value of + /// explicit type: float + float ToFloat(void) const; + + /// \returns tag data as std::string + /// \throws std::runtime_error if tag does not contain a value of explicit + /// type: std::string + std::string ToString(void) const; + + /// \returns tag data as std::vector + /// \throws std::runtime_error if tag does not contain a value of explicit + /// type: std::vector + std::vector ToInt8Array(void) const; + + /// \returns tag data as std::vector + /// \throws std::runtime_error if tag does not contain a value of explicit + /// type: std::vector + std::vector ToUInt8Array(void) const; + + /// \returns tag data as std::vector + /// \throws std::runtime_error if tag does not contain a value of explicit + /// type: 
std::vector + std::vector ToInt16Array(void) const; + + /// \returns tag data as std::vector + /// \throws std::runtime_error if tag does not contain a value of explicit + /// type: std::vector + std::vector ToUInt16Array(void) const; + + /// \returns tag data as std::vector + /// \throws std::runtime_error if tag does not contain a value of explicit + /// type: std::vector + std::vector ToInt32Array(void) const; + + /// \returns tag data as std::vector + /// \throws std::runtime_error if tag does not contain a value of explicit + /// type: std::vector + std::vector ToUInt32Array(void) const; + + /// \returns tag data as std::vector + /// \throws std::runtime_error if tag does not contain a value of explicit + /// type: std::vector + std::vector ToFloatArray(void) const; + + /// \} + +public: + + /// \name Data Conversion & Validation + /// \{ + + /// \returns true if tag is null (e.g. default-constructed) + bool IsNull(void) const; + + /// \returns true if tag contains a value of type: int8_t + bool IsInt8(void) const; + + /// \returns true if tag contains a value of type: uint8_t + bool IsUInt8(void) const; + + /// \returns true if tag contains a value of type: int16_t + bool IsInt16(void) const; + + /// \returns true if tag contains a value of type: uint16_t + bool IsUInt16(void) const; + + /// \returns true if tag contains a value of type: int32_t + bool IsInt32(void) const; + + /// \returns true if tag contains a value of type: uint32_t + bool IsUInt32(void) const; + + /// \returns true if tag contains a value of type: float + bool IsFloat(void) const; + + /// \returns true if tag contains a value of type: std::string + bool IsString(void) const; + + /// \returns true if tag contains a value of type: std::string \b AND has a + /// TagModifier of TagModifier::HEX_STRING + bool IsHexString(void) const; + + /// \returns true if tag contains a value of type: std::vector + bool IsInt8Array(void) const; + + /// \returns true if tag contains a value of type: std::vector
+ bool IsUInt8Array(void) const; + + /// \returns true if tag contains a value of type: std::vector + bool IsInt16Array(void) const; + + /// \returns true if tag contains a value of type: std::vector + bool IsUInt16Array(void) const; + + /// \returns true if tag contains a value of type: std::vector + bool IsInt32Array(void) const; + + /// \returns true if tag contains a value of type: std::vector + bool IsUInt32Array(void) const; + + /// \returns true if tag contains a value of type: std::vector + bool IsFloatArray(void) const; + + /// \returns true if tag contains a value with any signed integer type + bool IsSignedInt(void) const; + + /// \returns true if tag contains a value with any unsigned integer type + bool IsUnsignedInt(void) const; + + /// \returns true if tag contains a value with any integer type + bool IsIntegral(void) const; + + /// \returns true if tag contains a value with any integer or float type + bool IsNumeric(void) const; + + /// \returns true if tag contains a vector containing signed integers + bool IsSignedArray(void) const; + + /// \returns true if tag contains a vector containing unsigned integers + bool IsUnsignedArray(void) const; + + /// \returns true if tag contains a vector containing integers + bool IsIntegralArray(void) const; + + /// \returns true if tag contains a vector (integers or floats) + bool IsArray(void) const; + + /// \} + +public: + /// \name Type & Modifier Attributes + /// \{ + + /// \returns enum value for current tag data + TagDataType Type(void) const; + + /// \returns printable type name for current tag data + std::string Typename(void) const; + + /// \returns true if tag data modifier \p m is set + bool HasModifier(const TagModifier m) const; + + /// \returns current tag data modifier + TagModifier Modifier(void) const; + + /// \brief Sets tag data modifier. 
+ /// + /// \param[in] m new modifier value + /// + /// \returns reference to this tag + Tag& Modifier(const TagModifier m); + + /// \} + +private : + // NOTE - keep this synced with TagDataType enum ordering + typedef boost::variant, + std::vector, + std::vector, + std::vector, + std::vector, + std::vector, + std::vector > var_t; + + var_t data_; + TagModifier modifier_; +}; + +} // namespace BAM +} // namespace PacBio + +#include "pbbam/internal/Tag.inl" + +#endif // TAG_H diff --git a/include/pbbam/TagCollection.h b/include/pbbam/TagCollection.h new file mode 100644 index 0000000..11c80ff --- /dev/null +++ b/include/pbbam/TagCollection.h @@ -0,0 +1,68 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file TagCollection.h +/// \brief Defines the TagCollection class. +// +// Author: Derek Barnett + +#ifndef TAGCOLLECTION_H +#define TAGCOLLECTION_H + +#include "pbbam/Config.h" +#include "pbbam/Tag.h" +#include +#include + +namespace PacBio { +namespace BAM { + +/// \brief The TagCollection class represents a collection (or "dictionary") of +/// tags. +/// +/// Tags are mapped to their tag name, a 2-character string. +/// +class PBBAM_EXPORT TagCollection : public std::map +{ +public: + /// \returns true if the collection contains a tag with \p name + bool Contains(const std::string& name) const; +}; + +} // namespace BAM +} // namespace PacBio + +#endif // TAGCOLLECTION_H diff --git a/include/pbbam/UnmappedReadsQuery.h b/include/pbbam/UnmappedReadsQuery.h new file mode 100644 index 0000000..1623dc9 --- /dev/null +++ b/include/pbbam/UnmappedReadsQuery.h @@ -0,0 +1,70 @@ +// Copyright (c) 2014, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. + +// Author: Derek Barnett + +#ifndef UNMAPPEDREADSQUERY_H +#define UNMAPPEDREADSQUERY_H + +// TODO: Not yet implemented. +// Can't get bam_itr_queryi(idx, HTS_IDX_NOCOOR, -1, -1) to work reliably at the moment. 
+ +//#include "pbbam/QueryBase.h" +//#include + +//namespace PacBio { +//namespace BAM { + +//class BamFile; + +//class PBBAM_EXPORT UnmappedReadsQuery : public QueryBase +//{ +//public: +// UnmappedReadsQuery(const BamFile& file); + +//protected: +// bool GetNext(BamRecord& record); + +//private: +// std::shared_ptr file_; +// std::shared_ptr header_; +// std::shared_ptr index_; +// std::shared_ptr iterator_; +//}; + +//} // namespace BAM +//} // namespace PacBio + +#endif // UNMAPPEDREADSQUERY_H diff --git a/include/pbbam/Validator.h b/include/pbbam/Validator.h new file mode 100644 index 0000000..03f6c6c --- /dev/null +++ b/include/pbbam/Validator.h @@ -0,0 +1,192 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED.
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file Validator.h +/// \brief Defines the Validator class. +// +// Author: Derek Barnett + +#ifndef VALIDATOR_H +#define VALIDATOR_H + +#include "pbbam/Config.h" +#include "pbbam/exception/ValidationException.h" +#include + +namespace PacBio { +namespace BAM { + +class BamFile; +class BamHeader; +class BamRecord; +class ReadGroupInfo; + +/// \brief The Validator class provides validation for %BAM data. +/// +/// There are 2 ways to use this class. If you are only concerned with a quick & +/// dirty, yes/no validation, then you can use the IsValid() methods. This will +/// swallow the specific cause of the failure, but you don't have to catch an +/// exception and handle it in your client code. If you want to know, +/// specifically, what failed, then you can use the Validate*() methods that +/// will throw a ValidationException if the object is invalid. This exception +/// will provide more details as to what failed and why. +/// +/// See documentation for Config.h for details on building pbbam with +/// auto-validation enabled. +/// +class PBBAM_EXPORT Validator +{ +public: + /// \brief Checks that a %BAM file conforms to the %PacBio specification. + /// + /// When \p entireFile is false, this method only checks file metadata. If + /// \p entireFile is true, all records are checked as well.
+ /// + /// \param[in] file %BAM file to validate + /// \param[in] entireFile check records in addition to metadata + /// \returns true if \p file passes validation checks + /// + /// \sa Validator::ValidateFileMetadata, Validator::ValidateEntireFile + /// + static bool IsValid(const BamFile& file, const bool entireFile); + + /// \brief Checks that a %BAM header conforms to the %PacBio specification. + /// + /// \returns true if \p header passes validation checks + /// + /// \sa Validator::Validate(const BamHeader& header) + /// + static bool IsValid(const BamHeader& header); + + /// \brief Checks that a %BAM read group conforms to the %PacBio + /// specification. + /// + /// \returns true if \p rg passes validation checks + /// + /// \sa Validator::Validate(const ReadGroupInfo& rg) + /// + static bool IsValid(const ReadGroupInfo& rg); + + /// \brief Checks that a %BAM record conforms to the %PacBio specification. + /// + /// \returns true if \p record passes validation checks + /// + /// \sa Validator::Validate(const BamRecord& record) + /// + static bool IsValid(const BamRecord& record); + +public: + /// \brief Checks that a %BAM file's header conforms to the + /// %PacBio specification. + /// + /// This validation step checks the SAM/%BAM version number, sort order, + /// PacBioBAM version number, and calls Validate(readGroup) internally for + /// all read groups. + /// + /// \param[in] header %BAM header to validate + /// \param[in] maxErrors maximum number of errors to allow before throwing + /// + /// \throws ValidationException if \p header fails validation checks + /// + static void Validate(const BamHeader& header, + const size_t maxErrors = std::numeric_limits::max()); + + /// \brief Checks that a %BAM read group conforms to the %PacBio + /// specification.
+ /// + /// \param[in] rg %BAM read group to validate + /// \param[in] maxErrors maximum number of errors to allow before throwing + /// + /// \throws ValidationException if \p rg fails validation checks + /// + static void Validate(const ReadGroupInfo& rg, + const size_t maxErrors = std::numeric_limits::max()); + + /// \brief Checks that a %BAM record conforms to the %PacBio specification. + /// + /// \param[in] record %BAM record to validate + /// \param[in] maxErrors maximum number of errors to allow before throwing + /// + /// \throws ValidationException if \p record fails validation checks + /// + static void Validate(const BamRecord& record, + const size_t maxErrors = std::numeric_limits::max()); + + /// \brief Checks that a %BAM file's (entire) contents conform to the + /// %PacBio specification. + /// + /// This is equivalent to: + /// + /// \code + /// Validator::ValidateFileMetadata(file); + /// EntireFileQuery query(file); + /// for (const BamRecord& record : query) + /// Validator::Validate(record); + /// \endcode + /// + /// \param[in] file %BAM file to validate + /// \param[in] maxErrors maximum number of errors to allow before throwing + /// + /// \throws ValidationException if \p file fails validation checks + /// + static void ValidateEntireFile(const BamFile& file, + const size_t maxErrors = std::numeric_limits::max()); + + /// \brief Checks that a %BAM file's metadata conforms to the + /// %PacBio specification. + /// + /// This validation step checks the filename, ensures EOF marker, and + /// presence of PBI. It also calls Validate(file.Header()) internally.
+ /// + /// \param[in] file %BAM file to validate + /// \param[in] maxErrors maximum number of errors to allow before throwing + /// + /// \throws ValidationException if \p file fails validation checks + /// + static void ValidateFileMetadata(const BamFile& file, + const size_t maxErrors = std::numeric_limits::max()); + +private: + // hidden constructor + Validator(void) = delete; +}; + +} // namespace BAM +} // namespace PacBio + +#include "internal/Validator.inl" + +#endif // VALIDATOR_H diff --git a/include/pbbam/ZmwGroupQuery.h b/include/pbbam/ZmwGroupQuery.h new file mode 100644 index 0000000..290d3ad --- /dev/null +++ b/include/pbbam/ZmwGroupQuery.h @@ -0,0 +1,94 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED.
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file ZmwGroupQuery.h +/// \brief Defines the ZmwGroupQuery class. +// +// Author: Derek Barnett + +#ifndef ZMWGROUPQUERY_H +#define ZMWGROUPQUERY_H + +#include "pbbam/internal/QueryBase.h" +#include + +namespace PacBio { +namespace BAM { + +/// \brief The ZmwGroupQuery class provides iterable access to a DataSet's +/// %BAM records, limiting results to those matching a ZMW hole number +/// whitelist, and grouping those results by hole number. +/// +/// Example: +/// \include code/ZmwGroupQuery.txt +/// +/// \note Currently, all %BAM files must have a corresponding ".pbi" index file. +/// Use BamFile::EnsurePacBioIndexExists before creating the query if one +/// may not be present. +/// +class PBBAM_EXPORT ZmwGroupQuery : public internal::IGroupQuery +{ +public: + /// \brief Creates a new ZmwGroupQuery, limiting record results to only + /// those matching a ZMW hole number criterion. + /// + /// \param[in] zmwWhitelist vector of allowed ZMW hole numbers + /// \param[in] dataset input data source(s) + /// + /// \throws std::runtime_error on failure to open/read underlying %BAM or + /// PBI files. + /// + ZmwGroupQuery(const std::vector& zmwWhitelist, + const DataSet& dataset); + ~ZmwGroupQuery(void); + +public: + /// \brief Main iteration point for record access. + /// + /// Most client code should not need to use this method directly. Use + /// iterators instead.
+ /// + bool GetNext(std::vector& records); + +private: + struct ZmwGroupQueryPrivate; // only forward-declared in this header + std::unique_ptr d_; +}; + +} // namespace BAM +} // namespace PacBio + +#endif // ZMWGROUPQUERY_H diff --git a/include/pbbam/ZmwQuery.h b/include/pbbam/ZmwQuery.h new file mode 100644 index 0000000..0d6e166 --- /dev/null +++ b/include/pbbam/ZmwQuery.h @@ -0,0 +1,96 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED.
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file ZmwQuery.h +/// \brief Defines the ZmwQuery class. +// +// Author: Derek Barnett + +#ifndef ZMWQUERY_H +#define ZMWQUERY_H + +#include "pbbam/Config.h" +#include "pbbam/internal/QueryBase.h" +#include + +namespace PacBio { +namespace BAM { + +/// \brief The ZmwQuery class provides iterable access to a DataSet's +/// %BAM records, limiting results to those matching a ZMW hole number +/// whitelist. +/// +/// Example: +/// \include code/ZmwQuery.txt +/// +/// \note Currently, all %BAM files must have a corresponding ".pbi" index file. +/// Use BamFile::EnsurePacBioIndexExists before creating the query if one +/// may not be present. +/// +class PBBAM_EXPORT ZmwQuery : public internal::IQuery +{ +public: + /// \brief Creates a new ZmwQuery, limiting record results to only + /// those matching a ZMW hole number criterion. + /// + /// \param[in] zmwWhitelist vector of allowed ZMW hole numbers + /// \param[in] dataset input data source(s) + /// + /// \throws std::runtime_error on failure to open/read underlying %BAM or + /// PBI files. + /// + ZmwQuery(const std::vector& zmwWhitelist, + const DataSet& dataset); + + ~ZmwQuery(void); + +public: + /// \brief Main iteration point for record access. + /// + /// Most client code should not need to use this method directly. Use + /// iterators instead.
+ /// + bool GetNext(BamRecord& r); + +private: + struct ZmwQueryPrivate; // only forward-declared in this header + std::unique_ptr d_; +}; + +} // namespace BAM +} // namespace PacBio + +#endif // ZMWQUERY_H diff --git a/include/pbbam/ZmwType.h b/include/pbbam/ZmwType.h new file mode 100644 index 0000000..a93e295 --- /dev/null +++ b/include/pbbam/ZmwType.h @@ -0,0 +1,63 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED.
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file ZmwType.h +/// \brief Defines the ZmwType enum. +// +// Author: Armin Töpfer + +#ifndef ZMWTYPE_H +#define ZMWTYPE_H + +#include "pbbam/Config.h" + +namespace PacBio { +namespace BAM { + +/// \brief This enum defines the different ZMW categories of scraps. +/// Each enumerator's underlying char value is its one-letter code. +enum class ZmwType : char +{ + CONTROL = 'C', + MALFORMED = 'M', + NORMAL = 'N', + SENTINEL = 'S' +}; + +} // namespace BAM +} // namespace PacBio + +#endif // ZMWTYPE_H diff --git a/include/pbbam/ZmwTypeMap.h b/include/pbbam/ZmwTypeMap.h new file mode 100644 index 0000000..4dc781c --- /dev/null +++ b/include/pbbam/ZmwTypeMap.h @@ -0,0 +1,65 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution.
+// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file ZmwTypeMap.h +/// \brief Defines the ZmwTypeMap class. +// +// Author: Armin Töpfer + +#ifndef ZMWTYPEMAP_H +#define ZMWTYPEMAP_H + +#include + +#include "pbbam/Config.h" +#include "pbbam/ZmwType.h" + +namespace PacBio { +namespace BAM { + +/// \brief The ZmwTypeMap class provides mapping between char codes and +/// ZmwType enum keys. +/// ParseChar is a static data member; this header only declares it. +class ZmwTypeMap +{ +public: + static std::map ParseChar; +}; + +} // namespace BAM +} // namespace PacBio + +#endif // ZMWTYPEMAP_H diff --git a/include/pbbam/exception/InvalidSequencingChemistryException.h b/include/pbbam/exception/InvalidSequencingChemistryException.h new file mode 100644 index 0000000..a670bc3 --- /dev/null +++ b/include/pbbam/exception/InvalidSequencingChemistryException.h @@ -0,0 +1,103 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file InvalidSequencingChemistryException.h +/// \brief Defines the InvalidSequencingChemistryException class. 
+// +// Author: Derek Barnett + +#ifndef INVALIDSEQUENCINGCHEMISTRYEXCEPTION_H +#define INVALIDSEQUENCINGCHEMISTRYEXCEPTION_H + +#include +#include +#include + +namespace PacBio { +namespace BAM { + +/// \brief The InvalidSequencingChemistryException class represents an exception +/// that will be thrown when an invalid sequencing chemistry combination +/// is encountered. +/// +class InvalidSequencingChemistryException : public std::exception +{ +public: + InvalidSequencingChemistryException(const std::string& bindingKit, + const std::string& sequencingKit, + const std::string& basecallerVersion) + : bindingKit_(bindingKit) + , sequencingKit_(sequencingKit) + , basecallerVersion_(basecallerVersion) + { + std::stringstream s; + s << "unsupported sequencing chemistry combination: " << std::endl + << " binding kit: " << bindingKit_ << std::endl + << " sequencing kit: " << sequencingKit_ << std::endl + << " basecaller version: " << basecallerVersion_ << std::endl; + what_ = s.str(); // message is built once here; what() returns this stored text + } + + // This is a work around for the Intel PHI compiler (icpc) + ~InvalidSequencingChemistryException() throw() + { + + } +public: + const std::string& BindingKit(void) const + { return bindingKit_; } + + const std::string& SequencingKit(void) const + { return sequencingKit_; } + + const std::string& BasecallerVersion(void) const + { return basecallerVersion_; } + +public: + const char* what(void) const noexcept override + { return what_.c_str(); } + +protected: + std::string bindingKit_; + std::string sequencingKit_; + std::string basecallerVersion_; + std::string what_; +}; + +} // namespace BAM +} // namespace PacBio + +#endif // INVALIDSEQUENCINGCHEMISTRYEXCEPTION_H diff --git a/include/pbbam/exception/ValidationException.h b/include/pbbam/exception/ValidationException.h new file mode 100644 index 0000000..92d8f17 --- /dev/null +++ b/include/pbbam/exception/ValidationException.h @@ -0,0 +1,99 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file ValidationException.h +/// \brief Defines the ValidationException class. 
+// +// Author: Derek Barnett + +#ifndef VALIDATIONEXCEPTION_H +#define VALIDATIONEXCEPTION_H + +#include +#include +#include +#include +#include + +namespace PacBio { +namespace BAM { + +/// \brief The ValidationException represents an exception that will be thrown +/// when any error is encountered using the Validator API. In addition to +/// a default display message, it provides programmatic access to all +/// reported error messages. +/// +/// \sa Validator::Validate(const BamRecord& record) +/// +class ValidationException : public std::runtime_error +{ +public: + typedef std::vector ErrorList; + typedef std::map ErrorMap; + +public: + ValidationException(const ErrorMap& fileErrors, + const ErrorMap& readGroupErrors, + const ErrorMap& recordErrors); + ValidationException(ErrorMap&& fileErrors, + ErrorMap&& readGroupErrors, + ErrorMap&& recordErrors); + // This is a work around for the Intel PHI compiler (icpc) + ~ValidationException() throw() + { + + } +public: + const ErrorMap& FileErrors(void) const; + const ErrorMap& ReadGroupErrors(void) const; + const ErrorMap& RecordErrors(void) const; + + const char* what(void) const noexcept override; + +private: + ErrorMap fileErrors_; + ErrorMap readGroupErrors_; + ErrorMap recordErrors_; + std::string msg_; + +private: + void FormatMessage(void); +}; + +} // namespace BAM +} // namespace PacBio + +#endif // VALIDATIONEXCEPTION_H diff --git a/include/pbbam/internal/Accuracy.inl b/include/pbbam/internal/Accuracy.inl new file mode 100644 index 0000000..f859662 --- /dev/null +++ b/include/pbbam/internal/Accuracy.inl @@ -0,0 +1,66 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved.
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file Accuracy.inl +/// \brief Inline implementations for the Accuracy class. 
+// +// Author: Derek Barnett + +#include "pbbam/Accuracy.h" + +namespace PacBio { +namespace BAM { + +inline Accuracy::Accuracy(float accuracy) +{ + if (accuracy < Accuracy::MIN) + accuracy = Accuracy::MIN; + else if (accuracy > Accuracy::MAX) + accuracy = Accuracy::MAX; + accuracy_ = accuracy; // out-of-range inputs were replaced by Accuracy::MIN / Accuracy::MAX above +} + +inline Accuracy::Accuracy(const Accuracy &other) + : accuracy_(other.accuracy_) +{ } + +inline Accuracy::~Accuracy(void) { } + +inline Accuracy::operator float(void) const +{ return accuracy_; } + +} // namespace BAM +} // namespace PacBio diff --git a/include/pbbam/internal/BamHeader.inl b/include/pbbam/internal/BamHeader.inl new file mode 100644 index 0000000..2445a25 --- /dev/null +++ b/include/pbbam/internal/BamHeader.inl @@ -0,0 +1,154 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED.
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file BamHeader.inl +/// \brief Inline implementations for the BamHeader class. +// +// Author: Derek Barnett + +#include "pbbam/BamHeader.h" + +namespace PacBio { +namespace BAM { +namespace internal { + +class BamHeaderPrivate +{ +public: + std::string version_; + std::string pacbioBamVersion_; + std::string sortOrder_; + std::map headerLineCustom_; + + std::map readGroups_; // id => read group info + std::map programs_; // id => program info + std::vector comments_; + + // we need to preserve insertion order, use lookup for access by name + std::vector sequences_; + std::map sequenceIdLookup_; +}; + +} // namespace internal + +inline BamHeader::BamHeader(void) + : d_(new internal::BamHeaderPrivate) +{ } + +inline BamHeader::BamHeader(const BamHeader& other) + : d_(other.d_) // NOTE(review): copies d_ itself; operator+ below goes through DeepCopy() instead -- presumably this copy shares state, confirm +{ } + +inline BamHeader::BamHeader(BamHeader&& other) + : d_(std::move(other.d_)) +{ } + +inline BamHeader& BamHeader::operator=(const BamHeader& other) +{ d_ = other.d_; return *this; } + +inline BamHeader& BamHeader::operator=(BamHeader&& other) +{ d_ = std::move(other.d_); return *this; } + +inline BamHeader::~BamHeader(void) { } + +inline BamHeader BamHeader::operator+(const BamHeader& other) const +{ return DeepCopy() += other; } + +inline BamHeader& BamHeader::AddComment(const std::string& comment) +{ d_->comments_.push_back(comment); return *this; } + +inline BamHeader& BamHeader::AddProgram(const ProgramInfo& pg)
+{ d_->programs_[pg.Id()] = pg; return *this; } + +inline BamHeader& BamHeader::AddReadGroup(const ReadGroupInfo& readGroup) +{ d_->readGroups_[readGroup.Id()] = readGroup; return *this; } + +inline BamHeader& BamHeader::ClearComments(void) +{ d_->comments_.clear(); return* this; } + +inline BamHeader& BamHeader::ClearPrograms(void) +{ d_->programs_.clear(); return *this; } + +inline BamHeader& BamHeader::ClearReadGroups(void) +{ d_->readGroups_.clear(); return *this; } + +inline std::vector BamHeader::Comments(void) const +{ return d_->comments_; } + +inline BamHeader& BamHeader::Comments(const std::vector& comments) +{ d_->comments_ = comments; return *this; } + +inline bool BamHeader::HasProgram(const std::string& id) const +{ return d_->programs_.find(id) != d_->programs_.cend(); } + +inline bool BamHeader::HasReadGroup(const std::string& id) const +{ return d_->readGroups_.find(id) != d_->readGroups_.cend(); } + +inline bool BamHeader::HasSequence(const std::string& name) const +{ return d_->sequenceIdLookup_.find(name) != d_->sequenceIdLookup_.cend(); } + +inline size_t BamHeader::NumSequences(void) const +{ return d_->sequences_.size(); } + +inline std::string BamHeader::PacBioBamVersion(void) const +{ return d_->pacbioBamVersion_; } + +inline SequenceInfo BamHeader::Sequence(const int32_t id) const +{ return d_->sequences_.at(id); } + +inline std::string BamHeader::SequenceLength(const int32_t id) const +{ return Sequence(id).Length(); } + +inline std::string BamHeader::SequenceName(const int32_t id) const +{ return Sequence(id).Name(); } + +inline std::vector BamHeader::Sequences(void) const +{ return d_->sequences_; } + +inline std::string BamHeader::SortOrder(void) const +{ return d_->sortOrder_; } + +inline BamHeader& BamHeader::SortOrder(const std::string& order) +{ d_->sortOrder_ = order; return *this; } + +inline std::string BamHeader::Version(void) const +{ return d_->version_; } + +inline BamHeader& BamHeader::Version(const std::string& version) +{
d_->version_ = version; return *this; } + +} // namespace BAM +} // namespace PacBio diff --git a/include/pbbam/internal/BamRecord.inl b/include/pbbam/internal/BamRecord.inl new file mode 100644 index 0000000..d4a7cbd --- /dev/null +++ b/include/pbbam/internal/BamRecord.inl @@ -0,0 +1,86 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED.
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file BamRecord.inl +/// \brief Inline implementations for the BamRecord class. +// +// Author: Derek Barnett + +#include "pbbam/BamRecord.h" + +namespace PacBio { +namespace BAM { + +inline BamRecord BamRecord::Clipped(const BamRecord& input, + const ClipType clipType, + const PacBio::BAM::Position start, + const PacBio::BAM::Position end) +{ + return input.Clipped(clipType, start, end); +} + +inline BamRecord BamRecord::Clipped(const ClipType clipType, + const PacBio::BAM::Position start, + const PacBio::BAM::Position end) const +{ + BamRecord result(*this); + result.Clip(clipType, start, end); + return result; +} + +inline BamRecord BamRecord::Mapped(const BamRecord& input, + const int32_t referenceId, + const Position refStart, + const Strand strand, + const Cigar& cigar, + const uint8_t mappingQuality) +{ + return input.Mapped(referenceId, refStart, strand, cigar, mappingQuality); +} + +inline BamRecord BamRecord::Mapped(const int32_t referenceId, + const Position refStart, + const Strand strand, + const Cigar& cigar, + const uint8_t mappingQuality) const +{ + BamRecord result(*this); + result.Map(referenceId, refStart, strand, cigar, mappingQuality); + return result; +} + +} // namespace BAM +} // namespace PacBio diff --git a/include/pbbam/internal/BamRecordBuilder.inl b/include/pbbam/internal/BamRecordBuilder.inl new file mode 100644 index 0000000..212e831 --- /dev/null +++ 
b/include/pbbam/internal/BamRecordBuilder.inl @@ -0,0 +1,84 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file BamRecordBuilder.inl +/// \brief Inline implementations for the BamRecordBuilder class. 
+// +// Author: Derek Barnett + +#include "pbbam/BamRecordBuilder.h" + +namespace PacBio { +namespace BAM { + +inline BamRecordBuilder& BamRecordBuilder::Bin(const uint32_t bin) +{ core_.bin = bin; return *this; } + +inline BamRecordBuilder& BamRecordBuilder::Flag(const uint32_t flag) +{ core_.flag = flag; return *this; } + +inline BamRecordBuilder& BamRecordBuilder::InsertSize(const int32_t iSize) +{ core_.isize = iSize; return *this; } + +inline BamRecordBuilder& BamRecordBuilder::MapQuality(const uint8_t mapQual) +{ core_.qual = mapQual; return *this; } + +inline BamRecordBuilder& BamRecordBuilder::MatePosition(const int32_t pos) +{ core_.mpos = pos; return *this; } + +inline BamRecordBuilder& BamRecordBuilder::MateReferenceId(const int32_t id) +{ core_.mtid = id; return *this; } + +inline BamRecordBuilder& BamRecordBuilder::Position(const int32_t pos) +{ core_.pos = pos; return *this; } + +inline BamRecordBuilder& BamRecordBuilder::Qualities(const std::string& qualities) +{ qualities_ = qualities; return *this; } + +inline BamRecordBuilder& BamRecordBuilder::Qualities(std::string&& qualities) +{ qualities_ = std::move(qualities); return *this; } + +inline BamRecordBuilder& BamRecordBuilder::ReferenceId(const int32_t id) +{ core_.tid = id; return *this; } + +inline BamRecordBuilder& BamRecordBuilder::Tags(const TagCollection& tags) +{ tags_ = tags; return *this; } + +inline BamRecordBuilder& BamRecordBuilder::Tags(TagCollection&& tags) +{ tags_ = std::move(tags); return *this; } + +} // namespace BAM +} // namespace PacBio diff --git a/include/pbbam/internal/BamRecordImpl.inl b/include/pbbam/internal/BamRecordImpl.inl new file mode 100644 index 0000000..6c0ecef --- /dev/null +++ b/include/pbbam/internal/BamRecordImpl.inl @@ -0,0 +1,216 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file BamRecordImpl.inl +/// \brief Inline implementations for the BamRecordImpl class. 
+// +// Author: Derek Barnett + +#include "pbbam/BamRecordImpl.h" + +namespace PacBio { +namespace BAM { + +inline uint32_t BamRecordImpl::Bin(void) const +{ return d_->core.bin; } + +inline BamRecordImpl& BamRecordImpl::Bin(uint32_t bin) +{ d_->core.bin = bin; return *this; } + +inline uint32_t BamRecordImpl::Flag(void) const +{ return d_->core.flag; } + +inline BamRecordImpl& BamRecordImpl::Flag(uint32_t flag) +{ d_->core.flag = flag; return *this; } + +inline int32_t BamRecordImpl::InsertSize(void) const +{ return d_->core.isize; } + +inline BamRecordImpl& BamRecordImpl::InsertSize(int32_t iSize) +{ d_->core.isize = iSize; return *this; } + +inline uint8_t BamRecordImpl::MapQuality(void) const +{ return d_->core.qual; } + +inline BamRecordImpl& BamRecordImpl::MapQuality(uint8_t mapQual) +{ d_->core.qual = mapQual; return *this; } + +inline PacBio::BAM::Position BamRecordImpl::MatePosition(void) const +{ return d_->core.mpos; } + +inline BamRecordImpl& BamRecordImpl::MatePosition(PacBio::BAM::Position pos) +{ d_->core.mpos = pos; return *this; } + +inline int32_t BamRecordImpl::MateReferenceId(void) const +{ return d_->core.mtid; } + +inline BamRecordImpl& BamRecordImpl::MateReferenceId(int32_t id) +{ d_->core.mtid = id; return *this; } + +inline PacBio::BAM::Position BamRecordImpl::Position(void) const +{ return d_->core.pos; } + +inline BamRecordImpl& BamRecordImpl::Position(PacBio::BAM::Position pos) +{ d_->core.pos = pos; return *this; } + +inline int32_t BamRecordImpl::ReferenceId(void) const +{ return d_->core.tid; } + +inline BamRecordImpl& BamRecordImpl::ReferenceId(int32_t id) +{ d_->core.tid = id; return *this; } + +inline bool BamRecordImpl::IsDuplicate(void) const +{ return (d_->core.flag & BamRecordImpl::DUPLICATE) != 0; } + +inline BamRecordImpl& BamRecordImpl::SetDuplicate(bool ok) +{ + if (ok) d_->core.flag |= BamRecordImpl::DUPLICATE; + else d_->core.flag &= ~BamRecordImpl::DUPLICATE; + return *this; +} + +inline bool 
BamRecordImpl::IsFailedQC(void) const +{ return (d_->core.flag & BamRecordImpl::FAILED_QC) != 0; } + +inline BamRecordImpl& BamRecordImpl::SetFailedQC(bool ok) +{ + if (ok) d_->core.flag |= BamRecordImpl::FAILED_QC; + else d_->core.flag &= ~BamRecordImpl::FAILED_QC; + return *this; +} + +inline bool BamRecordImpl::IsFirstMate(void) const +{ return (d_->core.flag & BamRecordImpl::MATE_1) != 0; } + +inline BamRecordImpl& BamRecordImpl::SetFirstMate(bool ok) +{ + if (ok) d_->core.flag |= BamRecordImpl::MATE_1; + else d_->core.flag &= ~BamRecordImpl::MATE_1; + return *this; +} + +inline bool BamRecordImpl::IsMapped(void) const +{ return (d_->core.flag & BamRecordImpl::UNMAPPED) == 0; } + +inline BamRecordImpl& BamRecordImpl::SetMapped(bool ok) +{ + if (ok) d_->core.flag &= ~BamRecordImpl::UNMAPPED; + else d_->core.flag |= BamRecordImpl::UNMAPPED; + return *this; +} + +inline bool BamRecordImpl::IsMateMapped(void) const +{ return (d_->core.flag & BamRecordImpl::MATE_UNMAPPED) == 0; } + +inline BamRecordImpl& BamRecordImpl::SetMateMapped(bool ok) +{ + if (ok) d_->core.flag &= ~BamRecordImpl::MATE_UNMAPPED; + else d_->core.flag |= BamRecordImpl::MATE_UNMAPPED; + return *this; +} + +inline bool BamRecordImpl::IsMateReverseStrand(void) const +{ return (d_->core.flag & BamRecordImpl::MATE_REVERSE_STRAND) != 0; } + +inline BamRecordImpl& BamRecordImpl::SetMateReverseStrand(bool ok) +{ + if (ok) d_->core.flag |= BamRecordImpl::MATE_REVERSE_STRAND; + else d_->core.flag &= ~BamRecordImpl::MATE_REVERSE_STRAND; + return *this; +} + +inline bool BamRecordImpl::IsPaired(void) const +{ return (d_->core.flag & BamRecordImpl::PAIRED) != 0; } + +inline BamRecordImpl& BamRecordImpl::SetPaired(bool ok) +{ + if (ok) d_->core.flag |= BamRecordImpl::PAIRED; + else d_->core.flag &= ~BamRecordImpl::PAIRED; + return *this; +} + +inline bool BamRecordImpl::IsPrimaryAlignment(void) const +{ return (d_->core.flag & BamRecordImpl::SECONDARY) == 0; } + +inline BamRecordImpl& 
BamRecordImpl::SetPrimaryAlignment(bool ok) +{ + if (ok) d_->core.flag &= ~BamRecordImpl::SECONDARY; + else d_->core.flag |= BamRecordImpl::SECONDARY; + return *this; +} + +inline bool BamRecordImpl::IsProperPair(void) const +{ return (d_->core.flag & BamRecordImpl::PROPER_PAIR) != 0; } + +inline BamRecordImpl& BamRecordImpl::SetProperPair(bool ok) +{ + if (ok) d_->core.flag |= BamRecordImpl::PROPER_PAIR; + else d_->core.flag &= ~BamRecordImpl::PROPER_PAIR; + return *this; +} + +inline bool BamRecordImpl::IsReverseStrand(void) const +{ return (d_->core.flag & BamRecordImpl::REVERSE_STRAND) != 0; } + +inline BamRecordImpl& BamRecordImpl::SetReverseStrand(bool ok) +{ + if (ok) d_->core.flag |= BamRecordImpl::REVERSE_STRAND; + else d_->core.flag &= ~BamRecordImpl::REVERSE_STRAND; + return *this; +} + +inline bool BamRecordImpl::IsSecondMate(void) const +{ return (d_->core.flag & BamRecordImpl::MATE_2) != 0; } + +inline BamRecordImpl& BamRecordImpl::SetSecondMate(bool ok) +{ + if (ok) d_->core.flag |= BamRecordImpl::MATE_2; + else d_->core.flag &= ~BamRecordImpl::MATE_2; + return *this; +} + +inline bool BamRecordImpl::IsSupplementaryAlignment(void) const +{ return (d_->core.flag & BamRecordImpl::SUPPLEMENTARY) != 0; } + +inline BamRecordImpl& BamRecordImpl::SetSupplementaryAlignment(bool ok) +{ + if (ok) d_->core.flag |= BamRecordImpl::SUPPLEMENTARY; + else d_->core.flag &= ~BamRecordImpl::SUPPLEMENTARY; + return *this; +} + +} // namespace BAM +} // namespace PacBio diff --git a/include/pbbam/internal/BamRecordView.inl b/include/pbbam/internal/BamRecordView.inl new file mode 100644 index 0000000..35486be --- /dev/null +++ b/include/pbbam/internal/BamRecordView.inl @@ -0,0 +1,129 @@ +// Copyright (c) 2016, Pacific Biosciences of California, Inc. +// +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file BamRecordView.inl +/// \brief Inline implementations for the BamRecordView class. 
+// +// Author: Derek Barnett + +#include "pbbam/BamRecordView.h" + +namespace PacBio { +namespace BAM { + +inline BamRecordView::BamRecordView(const BamRecord& record, + const Orientation orientation, + const bool aligned, + const bool exciseSoftClips, + const PulseBehavior pulseBehavior) + : record_(record) + , orientation_(orientation) + , aligned_(aligned) + , exciseSoftClips_(exciseSoftClips) + , pulseBehavior_(pulseBehavior) +{ } + +inline QualityValues BamRecordView::AltLabelQVs(void) const +{ return record_.AltLabelQV(orientation_, aligned_, exciseSoftClips_, pulseBehavior_); } + +inline std::string BamRecordView::AltLabelTags(void) const +{ return record_.AltLabelTag(orientation_, aligned_, exciseSoftClips_, pulseBehavior_); } + +inline QualityValues BamRecordView::DeletionQVs(void) const +{ return record_.DeletionQV(orientation_, aligned_, exciseSoftClips_); } + +inline std::string BamRecordView::DeletionTags(void) const +{ return record_.DeletionTag(orientation_, aligned_, exciseSoftClips_); } + +inline QualityValues BamRecordView::InsertionQVs(void) const +{ return record_.InsertionQV(orientation_, aligned_, exciseSoftClips_); } + +inline Frames BamRecordView::IPD(void) const +{ return record_.IPD(orientation_, aligned_, exciseSoftClips_); } + +inline Frames BamRecordView::PrebaseFrames(void) const +{ return record_.IPD(orientation_, aligned_, exciseSoftClips_); } + +inline QualityValues BamRecordView::LabelQVs(void) const +{ return record_.LabelQV(orientation_, aligned_, exciseSoftClips_, pulseBehavior_); } + +inline QualityValues BamRecordView::MergeQVs(void) const +{ return record_.MergeQV(orientation_, aligned_, exciseSoftClips_); } + +inline QualityValues BamRecordView::PulseMergeQVs(void) const +{ return record_.PulseMergeQV(orientation_, aligned_, exciseSoftClips_, pulseBehavior_); } + +inline std::vector BamRecordView::Pkmean(void) const +{ return record_.Pkmean(orientation_, aligned_, exciseSoftClips_, pulseBehavior_); } + +inline std::vector 
BamRecordView::Pkmid(void) const +{ return record_.Pkmid(orientation_, aligned_, exciseSoftClips_, pulseBehavior_); } + +inline std::vector BamRecordView::Pkmean2(void) const +{ return record_.Pkmean2(orientation_, aligned_, exciseSoftClips_, pulseBehavior_); } + +inline std::vector BamRecordView::Pkmid2(void) const +{ return record_.Pkmid2(orientation_, aligned_, exciseSoftClips_, pulseBehavior_); } + +inline Frames BamRecordView::PrePulseFrames(void) const +{ return record_.PrePulseFrames(orientation_, aligned_, exciseSoftClips_, pulseBehavior_); } + +inline std::string BamRecordView::PulseCalls(void) const +{ return record_.PulseCall(orientation_, aligned_, exciseSoftClips_, pulseBehavior_); } + +inline Frames BamRecordView::PulseCallWidth(void) const +{ return record_.PulseCallWidth(orientation_, aligned_, exciseSoftClips_, pulseBehavior_); } + +inline Frames BamRecordView::PulseWidths(void) const +{ return record_.PulseWidth(orientation_, aligned_, exciseSoftClips_); } + +inline QualityValues BamRecordView::Qualities(void) const +{ return record_.Qualities(orientation_, aligned_, exciseSoftClips_); } + +inline std::string BamRecordView::Sequence(void) const +{ return record_.Sequence(orientation_, aligned_, exciseSoftClips_); } + +inline std::vector BamRecordView::StartFrames(void) const +{ return record_.StartFrame(orientation_, aligned_, exciseSoftClips_, pulseBehavior_); } + +inline QualityValues BamRecordView::SubstitutionQVs(void) const +{ return record_.SubstitutionQV(orientation_, aligned_, exciseSoftClips_); } + +inline std::string BamRecordView::SubstitutionTags(void) const +{ return record_.SubstitutionTag(orientation_, aligned_, exciseSoftClips_); } + +} // namespace BAM +} // namespace PacBio diff --git a/include/pbbam/internal/Cigar.inl b/include/pbbam/internal/Cigar.inl new file mode 100644 index 0000000..4799a72 --- /dev/null +++ b/include/pbbam/internal/Cigar.inl @@ -0,0 +1,77 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, 
Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file Cigar.inl +/// \brief Inline implemenations for the Cigar class. 
+// +// Author: Derek Barnett + +#include "pbbam/Cigar.h" + +namespace PacBio { +namespace BAM { + +inline Cigar::Cigar(void) + : std::vector() +{ } + +inline Cigar::Cigar(const Cigar& other) + : std::vector(other) +{ } + +inline Cigar::Cigar(Cigar&& other) + : std::vector(std::move(other)) +{ } + +inline Cigar& Cigar::operator=(const Cigar& other) +{ + std::vector::operator=(other); + return *this; +} + +inline Cigar& Cigar::operator=(Cigar&& other) +{ + std::vector::operator=(std::move(other)); + return *this; +} + +inline Cigar::~Cigar(void) { } + +inline Cigar Cigar::FromStdString(const std::string& stdString) +{ return Cigar(stdString); } + +} // namespace BAM +} // namespace PacBio diff --git a/include/pbbam/internal/CigarOperation.inl b/include/pbbam/internal/CigarOperation.inl new file mode 100644 index 0000000..167528c --- /dev/null +++ b/include/pbbam/internal/CigarOperation.inl @@ -0,0 +1,111 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. 
THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file CigarOperation.inl +/// \brief Inline implemenations for the CigarOperation class. +// +// Author: Derek Barnett + +#include "pbbam/CigarOperation.h" + +namespace PacBio { +namespace BAM { + +inline CigarOperation::CigarOperation(void) + : type_(CigarOperationType::UNKNOWN_OP) + , length_(0) +{ } + +inline CigarOperation::CigarOperation(char c, uint32_t length) + : type_(CigarOperation::CharToType(c)) + , length_(length) +{ + if (type_ == CigarOperationType::ALIGNMENT_MATCH) + throw std::runtime_error("CIGAR operation 'M' is not allowed in PacBio BAM files. Use 'X/=' instead."); +} + +inline CigarOperation::CigarOperation(CigarOperationType op, uint32_t length) + : type_(op) + , length_(length) +{ + if (type_ == CigarOperationType::ALIGNMENT_MATCH) + throw std::runtime_error("CIGAR operation 'M' is not allowed in PacBio BAM files. 
Use 'X/=' instead."); +} + +inline CigarOperation::CigarOperation(const CigarOperation& other) + : type_(other.type_) + , length_(other.length_) +{ } + +inline CigarOperation::CigarOperation(CigarOperation&& other) + : type_(std::move(other.type_)) + , length_(std::move(other.length_)) +{ } + +inline CigarOperation::~CigarOperation(void) { } + +inline uint32_t CigarOperation::Length(void) const +{ return length_; } + +inline CigarOperation& CigarOperation::Length(const uint32_t length) +{ length_ = length; return *this; } + +inline CigarOperationType CigarOperation::Type(void) const +{ return type_; } + +inline CigarOperation &CigarOperation::Type(const CigarOperationType opType) +{ type_ = opType; return *this; } + +inline char CigarOperation::Char(void) const +{ return CigarOperation::TypeToChar(type_); } + +inline CigarOperation &CigarOperation::Char(const char opChar) +{ type_ = CigarOperation::CharToType(opChar);return *this; } + +inline CigarOperation& CigarOperation::operator=(const CigarOperation& other) +{ type_ = other.type_; length_ = other.length_; return *this; } + +inline CigarOperation& CigarOperation::operator=(CigarOperation&& other) +{ type_ = std::move(other.type_); length_ = std::move(other.length_); return *this; } + +inline bool CigarOperation::operator==(const CigarOperation& other) const +{ return type_ == other.type_ && length_ == other.length_; } + +inline bool CigarOperation::operator!=(const CigarOperation& other) const +{ return !(*this == other); } + +} // namespace BAM +} // namespace PacBio diff --git a/include/pbbam/internal/Compare.inl b/include/pbbam/internal/Compare.inl new file mode 100644 index 0000000..4eb5ccf --- /dev/null +++ b/include/pbbam/internal/Compare.inl @@ -0,0 +1,78 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file Compare.inl +/// \brief Inline implementations for the Compare class & inner classes. 
+// +// Author: Derek Barnett + +#include "pbbam/Compare.h" + +namespace PacBio { +namespace BAM { +namespace internal { + +template struct MemberFnProxy; + +template +struct MemberFnProxy +{ + static R call(const T& obj, Args&&... args) + { + return (obj.*fn)(std::forward(args)...); + } +}; + +} // namespace internal + +template::MemberFnType fn, + typename CompareType> +inline bool Compare::MemberFunctionBase::operator()(const BamRecord& lhs, + const BamRecord& rhs) const +{ + using MemberFnType = typename Compare::MemberFunctionBaseHelper::MemberFnType; + using Proxy = internal::MemberFnProxy; + + CompareType cmp; + return cmp(Proxy::call(lhs), Proxy::call(rhs)); +} + +inline bool Compare::None::operator()(const BamRecord&, const BamRecord&) const +{ return false; } + +} // namespace BAM +} // namespace PacBio diff --git a/include/pbbam/internal/CompositeBamReader.inl b/include/pbbam/internal/CompositeBamReader.inl new file mode 100644 index 0000000..f8f301e --- /dev/null +++ b/include/pbbam/internal/CompositeBamReader.inl @@ -0,0 +1,397 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. 
+// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file CompositeBamReader.inl +/// \brief Inline implementations for the composite BAM readers, for +/// working with multiple input files. 
+// +// Author: Derek Barnett + +#include "pbbam/CompositeBamReader.h" +#include +#include +#include +#include + +namespace PacBio { +namespace BAM { +namespace internal { + +// ----------------------------------- +// Merging helpers +// ----------------------------------- + +// A CompositeMergeItem pairs a reader with a record; the constructors and the +// assignment operator below take both members by move. +inline CompositeMergeItem::CompositeMergeItem(std::unique_ptr&& rdr) + : reader(std::move(rdr)) +{ } + +inline CompositeMergeItem::CompositeMergeItem(std::unique_ptr&& rdr, + BamRecord&& rec) + : reader(std::move(rdr)) + , record(std::move(rec)) +{ } + +inline CompositeMergeItem::CompositeMergeItem(CompositeMergeItem&& other) + : reader(std::move(other.reader)) + , record(std::move(other.record)) +{ } + +inline CompositeMergeItem& CompositeMergeItem::operator=(CompositeMergeItem&& other) +{ + reader = std::move(other.reader); + record = std::move(other.record); + return *this; +} + +inline CompositeMergeItem::~CompositeMergeItem(void) { } + +// Orders two merge items by applying a default-constructed CompareType to their records. +template +inline bool CompositeMergeItemSorter::operator()(const CompositeMergeItem& lhs, + const CompositeMergeItem& rhs) +{ + const BamRecord& l = lhs.record; + const BamRecord& r = rhs.record; + return CompareType()(l, r); +} + +} // namespace internal + +// ----------------------------------- +// GenomicIntervalCompositeBamReader +// ----------------------------------- + +// Each constructor records the input filenames, then calls Interval(interval), +// which creates the readers. +inline GenomicIntervalCompositeBamReader::GenomicIntervalCompositeBamReader(const GenomicInterval& interval, + const std::vector& bamFiles) +{ + filenames_.reserve(bamFiles.size()); + for(const auto& bamFile : bamFiles) + filenames_.push_back(bamFile.Filename()); + Interval(interval); +} + +inline GenomicIntervalCompositeBamReader::GenomicIntervalCompositeBamReader(const GenomicInterval& interval, + std::vector&& bamFiles) +{ + filenames_.reserve(bamFiles.size()); + for(auto&& bamFile : bamFiles) + filenames_.push_back(bamFile.Filename()); + Interval(interval); +} + +inline GenomicIntervalCompositeBamReader::GenomicIntervalCompositeBamReader(const GenomicInterval& interval, + const
DataSet& dataset) + : GenomicIntervalCompositeBamReader(interval, dataset.BamFiles()) +{ } + +// Stores the front item's record in 'record' and returns true; returns false +// once no merge items remain. +inline bool GenomicIntervalCompositeBamReader::GetNext(BamRecord& record) +{ + // nothing left to read + if (mergeItems_.empty()) + return false; + + // non-destructive 'pop' of first item from queue + auto firstIter = mergeItems_.begin(); + auto firstItem = internal::CompositeMergeItem{ std::move(firstIter->reader), std::move(firstIter->record) }; + mergeItems_.pop_front(); + + // store its record in our output record + std::swap(record, firstItem.record); + + // try fetch 'next' from first item's reader + // if successful, re-insert it into container & re-sort on our new values + // otherwise, this item will go out of scope & reader destroyed + if (firstItem.reader->GetNext(firstItem.record)) { + mergeItems_.push_front(std::move(firstItem)); + UpdateSort(); + } + + // return success + return true; +} + +inline const GenomicInterval& GenomicIntervalCompositeBamReader::Interval(void) const +{ return interval_; } + +// Points every existing reader at 'interval', creates readers for the remaining +// files, and throws std::runtime_error if any of those files has no BAI index. +inline GenomicIntervalCompositeBamReader& GenomicIntervalCompositeBamReader::Interval(const GenomicInterval& interval) +{ + auto updatedMergeItems = std::deque{ }; + auto filesToCreate = std::set{ filenames_.cbegin(), filenames_.cend() }; + + // update existing readers + while (!mergeItems_.empty()) { + + // non-destructive 'pop' of first item from queue + auto firstIter = mergeItems_.begin(); + auto firstItem = internal::CompositeMergeItem{ std::move(firstIter->reader), std::move(firstIter->record) }; + mergeItems_.pop_front(); + + // reset interval + BaiIndexedBamReader* baiReader = dynamic_cast(firstItem.reader.get()); + assert(baiReader); + baiReader->Interval(interval); + + // try fetch 'next' from first item's reader + // if successful, re-insert it into container & re-sort on our new values + // otherwise, this item will go out of scope & reader destroyed + if (firstItem.reader->GetNext(firstItem.record)) { +
// erase before moving: the move below leaves firstItem.reader empty + filesToCreate.erase(firstItem.reader->Filename()); + updatedMergeItems.push_front(std::move(firstItem)); + } + } + + // create readers for files that were not 'active' for the previous interval + std::vector missingBai; + for (auto&& fn : filesToCreate) { + auto bamFile = BamFile{ fn }; + if (bamFile.StandardIndexExists()) { + auto item = internal::CompositeMergeItem{ std::unique_ptr{ new BaiIndexedBamReader{ interval, std::move(bamFile) } } }; + if (item.reader->GetNext(item.record)) + updatedMergeItems.push_back(std::move(item)); + // else not an error, simply no data matching interval + } + else { + // maybe handle PBI-backed interval searches if BAI missing, but for now treat as error + missingBai.push_back(bamFile.Filename()); + } + } + + // throw if any files missing BAI + if (!missingBai.empty()) { + std::stringstream e; + e << "failed to open GenomicIntervalCompositeBamReader because the following files are missing a BAI file:" << std::endl; + for (const auto& fn : missingBai) + e << " " << fn << std::endl; + throw std::runtime_error(e.str()); + } + + // update our actual container, remember the interval for Interval() const, and return + mergeItems_ = std::move(updatedMergeItems); + interval_ = interval; + UpdateSort(); + return *this; +} + +// Position ordering for records: a record whose ReferenceId() is -1 is never +// 'less than' another, so such records sort after all others. +struct OrderByPosition +{ + static inline bool less_than(const BamRecord& lhs, const BamRecord& rhs) + { + const int32_t lhsId = lhs.ReferenceId(); + const int32_t rhsId = rhs.ReferenceId(); + if (lhsId == -1) return false; + if (rhsId == -1) return true; + + if (lhsId == rhsId) + return lhs.ReferenceStart() < rhs.ReferenceStart(); + else return lhsId < rhsId; + } + + static inline bool equals(const BamRecord& lhs, const BamRecord& rhs) + { + return lhs.ReferenceId() == rhs.ReferenceId() && + lhs.ReferenceStart() == rhs.ReferenceStart(); + } +}; + +// Compares two merge items by the position of their records (OrderByPosition::less_than). +struct PositionSorter : std::binary_function +{ + bool operator()(const internal::CompositeMergeItem& lhs, + const internal::CompositeMergeItem& rhs) + { + const BamRecord& l = lhs.record; + const BamRecord& r = rhs.record; + return
OrderByPosition::less_than(l, r); + } +}; + +inline void GenomicIntervalCompositeBamReader::UpdateSort(void) +{ std::sort(mergeItems_.begin(), mergeItems_.end(), PositionSorter{ }); } + +// ------------------------------ +// PbiFilterCompositeBamReader +// ------------------------------ + +// Each constructor records the input filenames, then calls Filter(filter), +// which creates the readers. +template +inline PbiFilterCompositeBamReader::PbiFilterCompositeBamReader(const PbiFilter& filter, + const std::vector& bamFiles) +{ + filenames_.reserve(bamFiles.size()); + for(const auto& bamFile : bamFiles) + filenames_.push_back(bamFile.Filename()); + Filter(filter); +} + +template +inline PbiFilterCompositeBamReader::PbiFilterCompositeBamReader(const PbiFilter& filter, + std::vector&& bamFiles) +{ + filenames_.reserve(bamFiles.size()); + for(auto&& bamFile : bamFiles) + filenames_.push_back(bamFile.Filename()); + Filter(filter); +} + +template +inline PbiFilterCompositeBamReader::PbiFilterCompositeBamReader(const PbiFilter& filter, + const DataSet& dataset) + : PbiFilterCompositeBamReader(filter, std::move(dataset.BamFiles())) +{ } + +// Stores the front item's record in 'record' and returns true; returns false +// once the merge queue is empty. +template +inline bool PbiFilterCompositeBamReader::GetNext(BamRecord& record) +{ + // nothing left to read + if (mergeQueue_.empty()) + return false; + + // non-destructive 'pop' of first item from queue + auto firstIter = mergeQueue_.begin(); + auto firstItem = value_type{ std::move(firstIter->reader), std::move(firstIter->record) }; + mergeQueue_.pop_front(); + + // store its record in our output record + std::swap(record, firstItem.record); + + // try fetch 'next' from first item's reader + // if successful, re-insert it into container & re-sort on our new values + // otherwise, this item will go out of scope & reader destroyed + if (firstItem.reader->GetNext(firstItem.record)) { + mergeQueue_.push_front(std::move(firstItem)); + UpdateSort(); + } + + // return success + return true; +} + +// Applies 'filter' to every existing reader, creates readers for the remaining +// files, and throws std::runtime_error if any of those files has no PBI index. +template +inline PbiFilterCompositeBamReader& +PbiFilterCompositeBamReader::Filter(const PbiFilter& filter) +{ + auto updatedMergeItems = container_type{
}; + auto filesToCreate = std::set{ filenames_.cbegin(), filenames_.cend() }; + + // update existing readers + while (!mergeQueue_.empty()) { + + // non-destructive 'pop' of first item from queue + auto firstIter = mergeQueue_.begin(); + auto firstItem = internal::CompositeMergeItem{ std::move(firstIter->reader), std::move(firstIter->record) }; + mergeQueue_.pop_front(); + + // reset request + PbiIndexedBamReader* pbiReader = dynamic_cast(firstItem.reader.get()); + assert(pbiReader); + pbiReader->Filter(filter); + + // try fetch 'next' from first item's reader + // if successful, re-insert it into container & re-sort on our new values + // otherwise, this item will go out of scope & reader destroyed + if (firstItem.reader->GetNext(firstItem.record)) { + // erase before moving: the move below leaves firstItem.reader empty + filesToCreate.erase(firstItem.reader->Filename()); + updatedMergeItems.push_front(std::move(firstItem)); + } + } + + // create readers for files that were not 'active' for the previous filter + std::vector missingPbi; + for (auto&& fn : filesToCreate) { + auto bamFile = BamFile{ fn }; + if (bamFile.PacBioIndexExists()) { + auto item = internal::CompositeMergeItem{ std::unique_ptr{ new PbiIndexedBamReader{ filter, std::move(bamFile) } } }; + if (item.reader->GetNext(item.record)) + updatedMergeItems.push_back(std::move(item)); + // else not an error, simply no data matching filter + } + else + missingPbi.push_back(fn); + } + + // throw if any files missing PBI + if (!missingPbi.empty()) { + std::stringstream e; + e << "failed to open PbiFilterCompositeBamReader because the following files are missing a PBI file:" << std::endl; + for (const auto& fn : missingPbi) + e << " " << fn << std::endl; + throw std::runtime_error(e.str()); + } + + // update our actual container and return + mergeQueue_ = std::move(updatedMergeItems); + UpdateSort(); + return *this; +} + +template +inline void PbiFilterCompositeBamReader::UpdateSort(void) +{ std::stable_sort(mergeQueue_.begin(), mergeQueue_.end(), merge_sorter_type{}); } + +//
------------------------------ +// SequentialCompositeBamReader +// ------------------------------ + +// Each constructor appends one new BamReader per input file to readers_, +// in the order the files are given. +inline SequentialCompositeBamReader::SequentialCompositeBamReader(const std::vector& bamFiles) +{ + for (auto&& bamFile : bamFiles) + readers_.emplace_back(new BamReader{ bamFile }); +} + +inline SequentialCompositeBamReader::SequentialCompositeBamReader(std::vector&& bamFiles) +{ + for (auto&& bamFile : bamFiles) + readers_.emplace_back(new BamReader{ std::move(bamFile) }); +} + +inline SequentialCompositeBamReader::SequentialCompositeBamReader(const DataSet& dataset) + : SequentialCompositeBamReader(dataset.BamFiles()) +{ } + +// Returns true after the front reader fills 'record'; a reader whose GetNext +// fails is dropped, and false is returned once readers_ is empty. +inline bool SequentialCompositeBamReader::GetNext(BamRecord& record) +{ + // try first reader, if successful return true + // else pop reader and try next, until all readers exhausted + while (!readers_.empty()) { + auto& reader = readers_.front(); + if (reader->GetNext(record)) + return true; + else + readers_.pop_front(); + } + + // no readers available + return false; +} + +} // namespace BAM +} // namespace PacBio diff --git a/include/pbbam/internal/DataSet.inl b/include/pbbam/internal/DataSet.inl new file mode 100644 index 0000000..6627ddf --- /dev/null +++ b/include/pbbam/internal/DataSet.inl @@ -0,0 +1,201 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution.
+// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file DataSet.inl +/// \brief Inline implementations for the DataSet class. 
+// +// Author: Derek Barnett + +#include "pbbam/DataSet.h" + +namespace PacBio { +namespace BAM { + +// Unless noted otherwise, the accessors below forward to the same-named member +// of d_. Most properties have three overloads: a const getter, a mutable getter, +// and a setter that returns *this. +inline const std::string& DataSet::Attribute(const std::string& name) const +{ return d_->Attribute(name); } + +inline std::string& DataSet::Attribute(const std::string& name) +{ return d_->Attribute(name); } + +inline DataSet& DataSet::Attribute(const std::string& name, const std::string& value) +{ d_->Attribute(name, value); return *this; } + +inline const std::string& DataSet::CreatedAt(void) const +{ return d_->CreatedAt(); } + +inline std::string& DataSet::CreatedAt(void) +{ return d_->CreatedAt(); } + +inline DataSet& DataSet::CreatedAt(const std::string& createdAt) +{ d_->CreatedAt(createdAt); return *this; } + +inline const PacBio::BAM::Extensions& DataSet::Extensions(void) const +{ return d_->Extensions(); } + +inline PacBio::BAM::Extensions& DataSet::Extensions(void) +{ return d_->Extensions(); } + +inline DataSet& DataSet::Extensions(const PacBio::BAM::Extensions& extensions) +{ d_->Extensions(extensions); return *this; } + +inline const PacBio::BAM::ExternalResources& DataSet::ExternalResources(void) const +{ return d_->ExternalResources(); } + +inline PacBio::BAM::ExternalResources& DataSet::ExternalResources(void) +{ return d_->ExternalResources(); } + +inline DataSet& DataSet::ExternalResources(const PacBio::BAM::ExternalResources& resources) +{ d_->ExternalResources(resources); return *this; } + +inline const PacBio::BAM::Filters& DataSet::Filters(void) const +{ return d_->Filters(); } + +inline PacBio::BAM::Filters& DataSet::Filters(void) +{ return d_->Filters(); } + +inline DataSet& DataSet::Filters(const PacBio::BAM::Filters& filters) +{ d_->Filters(filters); return *this; } + +inline const std::string& DataSet::Format(void) const +{ return d_->Format(); } + +inline std::string& DataSet::Format(void) +{ return d_->Format(); } + +inline DataSet& DataSet::Format(const std::string& format) +{ d_->Format(format); return *this; } + +inline const
PacBio::BAM::DataSetMetadata& DataSet::Metadata(void) const +{ return d_->Metadata(); } + +inline PacBio::BAM::DataSetMetadata& DataSet::Metadata(void) +{ return d_->Metadata(); } + +inline DataSet& DataSet::Metadata(const PacBio::BAM::DataSetMetadata& metadata) +{ d_->Metadata(metadata); return *this; } + +inline const std::string& DataSet::MetaType(void) const +{ return d_->MetaType(); } + +inline std::string& DataSet::MetaType(void) +{ return d_->MetaType(); } + +inline DataSet& DataSet::MetaType(const std::string& metatype) +{ d_->MetaType(metatype); return *this; } + +inline const std::string& DataSet::ModifiedAt(void) const +{ return d_->ModifiedAt(); } + +inline std::string& DataSet::ModifiedAt(void) +{ return d_->ModifiedAt(); } + +inline DataSet& DataSet::ModifiedAt(const std::string& modifiedAt) +{ d_->ModifiedAt(modifiedAt); return *this; } + +inline const std::string& DataSet::Name(void) const +{ return d_->Name(); } + +inline std::string& DataSet::Name(void) +{ return d_->Name(); } + +inline DataSet& DataSet::Name(const std::string& name) +{ d_->Name(name); return *this; } + +inline const std::string& DataSet::ResourceId(void) const +{ return d_->ResourceId(); } + +inline std::string& DataSet::ResourceId(void) +{ return d_->ResourceId(); } + +inline DataSet& DataSet::ResourceId(const std::string& resourceId) +{ d_->ResourceId(resourceId); return *this; } + +inline const PacBio::BAM::SubDataSets& DataSet::SubDataSets(void) const +{ return d_->SubDataSets(); } + +inline PacBio::BAM::SubDataSets& DataSet::SubDataSets(void) +{ return d_->SubDataSets(); } + +inline DataSet& DataSet::SubDataSets(const PacBio::BAM::SubDataSets& subdatasets) +{ d_->SubDataSets(subdatasets); return *this; } + +inline const std::string& DataSet::Tags(void) const +{ return d_->Tags(); } + +inline std::string& DataSet::Tags(void) +{ return d_->Tags(); } + +inline DataSet& DataSet::Tags(const std::string& tags) +{ d_->Tags(tags); return *this; } + +inline const std::string&
DataSet::TimeStampedName(void) const +{ return d_->TimeStampedName(); } + +inline std::string& DataSet::TimeStampedName(void) +{ return d_->TimeStampedName(); } + +inline DataSet& DataSet::TimeStampedName(const std::string& timeStampedName) +{ d_->TimeStampedName(timeStampedName); return *this; } + +// Type is derived rather than forwarded: the getter converts TypeName() with +// NameToType, the setter writes TypeToName(type) through d_->Label, and +// TypeName() itself reads d_->LocalNameLabel(). +inline PacBio::BAM::DataSet::TypeEnum DataSet::Type(void) const +{ return DataSet::NameToType(TypeName()); } + +inline DataSet& DataSet::Type(const DataSet::TypeEnum type) +{ d_->Label(DataSet::TypeToName(type)); return *this; } + +inline std::string DataSet::TypeName(void) const +{ return d_->LocalNameLabel().to_string(); } + +inline const std::string& DataSet::UniqueId(void) const +{ return d_->UniqueId(); } + +inline std::string& DataSet::UniqueId(void) +{ return d_->UniqueId(); } + +inline DataSet& DataSet::UniqueId(const std::string& uuid) +{ d_->UniqueId(uuid); return *this; } + +inline const std::string& DataSet::Version(void) const +{ return d_->Version(); } + +inline std::string& DataSet::Version(void) +{ return d_->Version(); } + +inline DataSet& DataSet::Version(const std::string& version) +{ d_->Version(version); return *this; } + +} // namespace BAM +} // namespace PacBio diff --git a/include/pbbam/internal/DataSetBaseTypes.h b/include/pbbam/internal/DataSetBaseTypes.h new file mode 100644 index 0000000..917162a --- /dev/null +++ b/include/pbbam/internal/DataSetBaseTypes.h @@ -0,0 +1,179 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer.
+// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. 
+ +// Author: Derek Barnett + +#ifndef DATASETBASETYPES_H +#define DATASETBASETYPES_H + +#include "pbbam/Config.h" +#include "pbbam/internal/DataSetElement.h" +#include "pbbam/internal/DataSetListElement.h" +#include + +namespace PacBio { +namespace BAM { + +class DataSetMetadata; +class Extensions; +class ExternalResources; +class FileIndices; +class Filters; +class Properties; +class Provenance; + +namespace internal { + +// DataSetElement with the common entity accessors (CreatedAt, Description, +// Extensions, Format, ModifiedAt, Name, ResourceId, Tags, Version). Each property +// has a const getter, a mutable getter, and a setter returning BaseEntityType&. +// The constructor is protected. +class BaseEntityType : public DataSetElement +{ +protected: + BaseEntityType(const std::string& label, + const XsdType& xsd = XsdType::BASE_DATA_MODEL); + +public: + const std::string& CreatedAt(void) const; + const std::string& Description(void) const; + const PacBio::BAM::Extensions& Extensions(void) const; + const std::string& Format(void) const; + const std::string& ModifiedAt(void) const; + const std::string& Name(void) const; + const std::string& ResourceId(void) const; + const std::string& Tags(void) const; + const std::string& Version(void) const; + + std::string& CreatedAt(void); + std::string& Description(void); + PacBio::BAM::Extensions& Extensions(void); + std::string& Format(void); + std::string& ModifiedAt(void); + std::string& Name(void); + std::string& ResourceId(void); + std::string& Tags(void); + std::string& Version(void); + + BaseEntityType& CreatedAt(const std::string& createdAt); + BaseEntityType& Description(const std::string& description); + BaseEntityType& Extensions(const PacBio::BAM::Extensions& extensions); + BaseEntityType& Format(const std::string& format); + BaseEntityType& ModifiedAt(const std::string& modifiedAt); + BaseEntityType& Name(const std::string& name); + BaseEntityType& ResourceId(const std::string& resourceId); + BaseEntityType& Tags(const std::string& tags); + BaseEntityType& Version(const std::string& version); +}; + +// BaseEntityType plus Checksum, EncodedValue, MetaType, SimpleValue, +// TimeStampedName, UniqueId and ValueDataType accessors. +class DataEntityType : public BaseEntityType +{ +protected: + DataEntityType(const std::string& label, + const XsdType& xsd = XsdType::BASE_DATA_MODEL); + +public: + const std::string&
Checksum(void) const; + const std::string& EncodedValue(void) const; + const std::string& MetaType(void) const; + const std::string& SimpleValue(void) const; + const std::string& TimeStampedName(void) const; + const std::string& UniqueId(void) const; + const std::string& ValueDataType(void) const; + + std::string& Checksum(void); + std::string& EncodedValue(void); + std::string& MetaType(void); + std::string& SimpleValue(void); + std::string& TimeStampedName(void); + std::string& UniqueId(void); + std::string& ValueDataType(void); + + DataEntityType& Checksum(const std::string& checksum); + DataEntityType& EncodedValue(const std::string& encodedValue); + DataEntityType& MetaType(const std::string& metatype); + DataEntityType& SimpleValue(const std::string& simpleValue); + DataEntityType& TimeStampedName(const std::string& timeStampedName); + DataEntityType& UniqueId(const std::string& uuid); + DataEntityType& ValueDataType(const std::string& valueDataType); +}; + +// BaseEntityType plus MetaType, TimeStampedName and UniqueId accessors; its +// constructor additionally takes the metatype. +class StrictEntityType : public BaseEntityType +{ +protected: + StrictEntityType(const std::string& metatype, + const std::string& label, + const XsdType& xsd = XsdType::BASE_DATA_MODEL); + +public: + const std::string& MetaType(void) const; + const std::string& TimeStampedName(void) const; + const std::string& UniqueId(void) const; + + std::string& MetaType(void); + std::string& TimeStampedName(void); + std::string& UniqueId(void); + + StrictEntityType& MetaType(const std::string& metatype); + StrictEntityType& TimeStampedName(const std::string& timeStampedName); + StrictEntityType& UniqueId(const std::string& uuid); +}; + +// StrictEntityType whose constructor also takes a filename; it declares no +// further members here. +class InputOutputDataType : public StrictEntityType +{ +protected: + InputOutputDataType(const std::string& metatype, + const std::string& filename, + const std::string& label, + const XsdType& xsd = XsdType::BASE_DATA_MODEL); +}; + +// InputOutputDataType that adds FileIndices accessors. +class IndexedDataType : public InputOutputDataType +{ +protected: + IndexedDataType(const std::string& metatype, + const std::string& filename, + const
std::string& label, + const XsdType& xsd = XsdType::BASE_DATA_MODEL); + +public: + const PacBio::BAM::FileIndices& FileIndices(void) const; + PacBio::BAM::FileIndices& FileIndices(void); + IndexedDataType& FileIndices(const PacBio::BAM::FileIndices& indices); +}; + +} // namespace internal +} // namespace BAM +} // namespace PacBio + +#include "pbbam/internal/DataSetBaseTypes.inl" + +#endif // DATASETBASETYPES_H diff --git a/include/pbbam/internal/DataSetBaseTypes.inl b/include/pbbam/internal/DataSetBaseTypes.inl new file mode 100644 index 0000000..c1d2f43 --- /dev/null +++ b/include/pbbam/internal/DataSetBaseTypes.inl @@ -0,0 +1,220 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED.
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. + +// Author: Derek Barnett + +#include "pbbam/internal/DataSetBaseTypes.h" + +namespace PacBio { +namespace BAM { +namespace internal { + +// ---------------- +// BaseEntityType +// ---------------- + +// Each accessor in this section reads or writes the attribute of the same name +// through Attribute(); the setters return *this. +inline const std::string& BaseEntityType::CreatedAt(void) const +{ return Attribute("CreatedAt"); } + +inline std::string& BaseEntityType::CreatedAt(void) +{ return Attribute("CreatedAt"); } + +inline BaseEntityType& BaseEntityType::CreatedAt(const std::string& createdAt) +{ Attribute("CreatedAt", createdAt); return *this; } + +inline const std::string& BaseEntityType::Description(void) const +{ return Attribute("Description"); } + +inline std::string& BaseEntityType::Description(void) +{ return Attribute("Description"); } + +inline BaseEntityType& BaseEntityType::Description(const std::string& description) +{ Attribute("Description", description); return *this; } + +inline const std::string& BaseEntityType::Format(void) const +{ return Attribute("Format"); } + +inline std::string& BaseEntityType::Format(void) +{ return Attribute("Format"); } + +inline BaseEntityType& BaseEntityType::Format(const std::string& format) +{ Attribute("Format", format); return *this; } + +inline const std::string& BaseEntityType::ModifiedAt(void) const +{ return Attribute("ModifiedAt"); } + +inline std::string& BaseEntityType::ModifiedAt(void) +{ return Attribute("ModifiedAt"); } + +inline BaseEntityType& BaseEntityType::ModifiedAt(const
std::string& modifiedAt) +{ Attribute("ModifiedAt", modifiedAt); return *this; } + +inline const std::string& BaseEntityType::Name(void) const +{ return Attribute("Name"); } + +inline std::string& BaseEntityType::Name(void) +{ return Attribute("Name"); } + +inline BaseEntityType& BaseEntityType::Name(const std::string& name) +{ Attribute("Name", name); return *this; } + +inline const std::string& BaseEntityType::ResourceId(void) const +{ return Attribute("ResourceId"); } + +inline std::string& BaseEntityType::ResourceId(void) +{ return Attribute("ResourceId"); } + +inline BaseEntityType& BaseEntityType::ResourceId(const std::string& resourceId) +{ Attribute("ResourceId", resourceId); return *this; } + +inline const std::string& BaseEntityType::Tags(void) const +{ return Attribute("Tags"); } + +inline std::string& BaseEntityType::Tags(void) +{ return Attribute("Tags"); } + +inline BaseEntityType& BaseEntityType::Tags(const std::string& tags) +{ Attribute("Tags", tags); return *this; } + +inline const std::string& BaseEntityType::Version(void) const +{ return Attribute("Version"); } + +inline std::string& BaseEntityType::Version(void) +{ return Attribute("Version"); } + +inline BaseEntityType& BaseEntityType::Version(const std::string& version) +{ Attribute("Version", version); return *this; } + +// ---------------- +// DataEntityType +// ---------------- + +// Checksum and EncodedValue go through ChildText(); the remaining properties in +// this section go through Attribute(). +inline const std::string& DataEntityType::Checksum(void) const +{ return ChildText("Checksum"); } + +inline std::string& DataEntityType::Checksum(void) +{ return ChildText("Checksum"); } + +inline DataEntityType& DataEntityType::Checksum(const std::string& checksum) +{ ChildText("Checksum", checksum); return *this; } + +inline const std::string& DataEntityType::EncodedValue(void) const +{ return ChildText("EncodedValue"); } + +inline std::string& DataEntityType::EncodedValue(void) +{ return ChildText("EncodedValue"); } + +inline DataEntityType& DataEntityType::EncodedValue(const std::string& encodedValue) +{
ChildText("EncodedValue", encodedValue); return *this; } + +inline const std::string& DataEntityType::MetaType(void) const +{ return Attribute("MetaType"); } + +inline std::string& DataEntityType::MetaType(void) +{ return Attribute("MetaType"); } + +inline DataEntityType& DataEntityType::MetaType(const std::string& metatype) +{ Attribute("MetaType", metatype); return *this; } + +inline const std::string& DataEntityType::SimpleValue(void) const +{ return Attribute("SimpleValue"); } + +inline std::string& DataEntityType::SimpleValue(void) +{ return Attribute("SimpleValue"); } + +inline DataEntityType& DataEntityType::SimpleValue(const std::string& simpleValue) +{ Attribute("SimpleValue", simpleValue); return *this; } + +inline const std::string& DataEntityType::TimeStampedName(void) const +{ return Attribute("TimeStampedName"); } + +inline std::string& DataEntityType::TimeStampedName(void) +{ return Attribute("TimeStampedName"); } + +inline DataEntityType& DataEntityType::TimeStampedName(const std::string& timeStampedName) +{ Attribute("TimeStampedName", timeStampedName); return *this; } + +inline const std::string& DataEntityType::UniqueId(void) const +{ return Attribute("UniqueId"); } + +inline std::string& DataEntityType::UniqueId(void) +{ return Attribute("UniqueId"); } + +inline DataEntityType& DataEntityType::UniqueId(const std::string& uuid) +{ Attribute("UniqueId", uuid); return *this; } + +inline const std::string& DataEntityType::ValueDataType(void) const +{ return Attribute("ValueDataType"); } + +inline std::string& DataEntityType::ValueDataType(void) +{ return Attribute("ValueDataType"); } + +inline DataEntityType& DataEntityType::ValueDataType(const std::string& valueDataType) +{ Attribute("ValueDataType", valueDataType); return *this; } + +// ---------------- +// StrictEntityType +// ---------------- + +// All three properties in this section go through Attribute(); the setters +// return *this. +inline const std::string& StrictEntityType::MetaType(void) const +{ return Attribute("MetaType"); } + +inline std::string&
StrictEntityType::MetaType(void) +{ return Attribute("MetaType"); } + +inline StrictEntityType& StrictEntityType::MetaType(const std::string& metatype) +{ Attribute("MetaType", metatype); return *this; } + +inline const std::string& StrictEntityType::TimeStampedName(void) const +{ return Attribute("TimeStampedName"); } + +inline std::string& StrictEntityType::TimeStampedName(void) +{ return Attribute("TimeStampedName"); } + +inline StrictEntityType& StrictEntityType::TimeStampedName(const std::string& timeStampedName) +{ Attribute("TimeStampedName", timeStampedName); return *this; } + +inline const std::string& StrictEntityType::UniqueId(void) const +{ return Attribute("UniqueId"); } + +inline std::string& StrictEntityType::UniqueId(void) +{ return Attribute("UniqueId"); } + +inline StrictEntityType& StrictEntityType::UniqueId(const std::string& uuid) +{ Attribute("UniqueId", uuid); return *this; } + +} // namespace internal +} // namespace BAM +} // namespace PacBio diff --git a/include/pbbam/internal/DataSetElement.h b/include/pbbam/internal/DataSetElement.h new file mode 100644 index 0000000..c7f7c8d --- /dev/null +++ b/include/pbbam/internal/DataSetElement.h @@ -0,0 +1,192 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution.
+// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. 
+ +// Author: Derek Barnett + +#ifndef DATASETELEMENT_H +#define DATASETELEMENT_H + +#include "pbbam/DataSetXsd.h" + +#include +#include +#include +#include +#include +#include +#include + +namespace PacBio { +namespace BAM { +namespace internal { + +class XmlName +{ + // qualified name + // | + // -------------- + // + // ---- --------- + // | | + // prefix local name + +public: + XmlName(const std::string& fullName, bool verbatim = false); + XmlName(const std::string& localName, const std::string& prefix); + XmlName(const XmlName& other); + XmlName(XmlName&& other); + XmlName& operator=(const XmlName& other); + XmlName& operator=(XmlName&& other); + ~XmlName(void); + +public: + bool operator==(const XmlName& other) const; + bool operator!=(const XmlName& other) const; + +public: + const boost::string_ref LocalName(void) const; + const boost::string_ref Prefix(void) const; + const std::string& QualifiedName(void) const; + bool Verbatim(void) const; + +private: + std::string qualifiedName_; + size_t prefixSize_; + size_t localNameOffset_; + size_t localNameSize_; + bool verbatim_; +}; + +struct FromInputXml { }; + +class DataSetElement +{ +public: + DataSetElement(const std::string& label, const XsdType& xsd = XsdType::NONE); + DataSetElement(const std::string& label, const FromInputXml& fromInputXml, const XsdType& xsd = XsdType::NONE); + DataSetElement(const DataSetElement& other); + DataSetElement(DataSetElement&& other); + DataSetElement& operator=(const DataSetElement& other); + DataSetElement& operator=(DataSetElement&& other); + virtual ~DataSetElement(void); + +public: + bool operator==(const DataSetElement& other) const; + bool operator!=(const DataSetElement& other) const; + +public: + const std::string& Attribute(const std::string& name) const; + std::string& Attribute(const std::string& name); + const std::map& Attributes(void) const; + std::map& Attributes(void); + bool HasAttribute(const std::string& name) const; + + const std::vector& Children(void) 
const; + std::vector& Children(void); + bool HasChild(const std::string& label) const; + + const boost::string_ref LocalNameLabel(void) const; + const boost::string_ref PrefixLabel(void) const; + const std::string& QualifiedNameLabel(void) const; + bool IsVerbatimLabel(void) const; + + const std::string& Text(void) const; + std::string& Text(void); + + const XsdType& Xsd(void) const; + +public: + void Attribute(const std::string& name, const std::string& value); + void Label(const std::string& label); + void Text(const std::string& text); + +public: + size_t NumAttributes(void) const; + size_t NumChildren(void) const; + +public: + void AddChild(const DataSetElement& e); + void RemoveChild(const DataSetElement& e); + + template + const T& Child(size_t index) const; + + template + T& Child(size_t index); + + template + const T& Child(const std::string& label) const; + + template + T& Child(const std::string& label); + + template + const T& operator[](size_t index) const; + + template + T& operator[](size_t index); + + template + const T& operator[](const std::string& label) const; + + template + T& operator[](const std::string& label); + +protected: + static const std::string& SharedNullString(void); + +public: + const std::string& ChildText(const std::string& label) const; + std::string& ChildText(const std::string& label); + void ChildText(const std::string& label, const std::string& text); + +protected: + XsdType xsd_; + XmlName label_; + std::string text_; + std::map attributes_; + std::vector children_; + +private: + int IndexOf(const std::string& label) const; +}; + +} // namespace internal +} // namespace BAM +} // namespace PacBio + +#include "pbbam/internal/DataSetElement.inl" + +#endif // DATASETELEMENT_H diff --git a/include/pbbam/internal/DataSetElement.inl b/include/pbbam/internal/DataSetElement.inl new file mode 100644 index 0000000..37a673f --- /dev/null +++ b/include/pbbam/internal/DataSetElement.inl @@ -0,0 +1,373 @@ +// Copyright (c) 2014-2015, 
Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. 
+ +// Author: Derek Barnett + +#include "pbbam/internal/DataSetElement.h" + +#include + +namespace PacBio { +namespace BAM { +namespace internal { + +// ---------------- +// DataSetElement +// ---------------- + +inline DataSetElement::DataSetElement(const std::string& label, const XsdType& xsd) + : xsd_(xsd) + , label_(label) +{ } + +inline DataSetElement::DataSetElement(const std::string& label, + const FromInputXml&, + const XsdType& xsd) + : xsd_(xsd) + , label_(label, true) +{ } + +inline DataSetElement::DataSetElement(const DataSetElement& other) + : xsd_(other.xsd_) + , label_(other.label_) + , text_(other.text_) + , attributes_(other.attributes_) + , children_(other.children_) +{ } + +inline DataSetElement::DataSetElement(DataSetElement&& other) + : xsd_(std::move(other.xsd_)) + , label_(std::move(other.label_)) + , text_(std::move(other.text_)) + , attributes_(std::move(other.attributes_)) + , children_(std::move(other.children_)) +{ } + +inline DataSetElement& DataSetElement::operator=(const DataSetElement& other) +{ + xsd_ = other.xsd_; + label_ = other.label_; + text_ = other.text_; + attributes_ = other.attributes_; + children_ = other.children_; + return *this; +} + +inline DataSetElement& DataSetElement::operator=(DataSetElement&& other) +{ + xsd_ = std::move(other.xsd_); + label_ = std::move(other.label_); + text_ = std::move(other.text_); + attributes_ = std::move(other.attributes_); + children_ = std::move(other.children_); + return *this; +} + +inline DataSetElement::~DataSetElement(void) { } + +inline bool DataSetElement::operator==(const DataSetElement& other) const +{ + return xsd_ == other.xsd_ && + label_ == other.label_ && + text_ == other.text_ && + attributes_ == other.attributes_ && + children_ == other.children_; +} + +inline bool DataSetElement::operator!=(const DataSetElement& other) const +{ return !(*this == other); } + +template +const T& DataSetElement::operator[](size_t index) const +{ return Child(index); } + +template +T& 
DataSetElement::operator[](size_t index) +{ return Child(index); } + +template +const T& DataSetElement::operator[](const std::string& label) const +{ return Child(label); } + +template +T& DataSetElement::operator[](const std::string& label) +{ return Child(label); } + +inline void DataSetElement::AddChild(const DataSetElement& e) +{ children_.push_back(e); } + +inline std::string& DataSetElement::Attribute(const std::string& name) +{ return attributes_[name]; } + +inline const std::string& DataSetElement::Attribute(const std::string& name) const +{ + auto iter = attributes_.find(name); + if (iter == attributes_.cend()) + return SharedNullString(); + return iter->second; +} + +inline void DataSetElement::Attribute(const std::string& name, const std::string& value) +{ attributes_[name] = value; } + +inline const std::map& DataSetElement::Attributes(void) const +{ return attributes_; } + +inline std::map& DataSetElement::Attributes(void) +{ return attributes_; } + +template +inline const T& DataSetElement::Child(size_t index) const +{ return static_cast(children_.at(index)); } + +template +inline T& DataSetElement::Child(size_t index) +{ return static_cast(children_.at(index)); } + +template +inline const T& DataSetElement::Child(const std::string& label) const +{ return Child(IndexOf(label)); } + +template +inline T& DataSetElement::Child(const std::string& label) +{ + const int i = IndexOf(label); + if (i >= 0) { + assert(static_cast(i) < NumChildren()); + return Child(i); + } else { + AddChild(DataSetElement(label)); + return Child(NumChildren()-1); + } +} + +inline const std::vector& DataSetElement::Children(void) const +{ return children_; } + +inline std::vector& DataSetElement::Children(void) +{ return children_; } + +inline const std::string& DataSetElement::ChildText(const std::string& label) const +{ + if (!HasChild(label)) + return SharedNullString(); + return Child(label).Text(); +} + +inline std::string& DataSetElement::ChildText(const std::string& 
label) +{ + if (!HasChild(label)) + AddChild(DataSetElement(label)); + return Child(label).Text(); +} + +inline bool DataSetElement::HasAttribute(const std::string& name) const +{ return attributes_.find(name) != attributes_.cend(); } + +inline bool DataSetElement::HasChild(const std::string& label) const +{ return IndexOf(label) != -1; } + +inline int DataSetElement::IndexOf(const std::string& label) const +{ + const size_t count = NumChildren(); + for (size_t i = 0; i < count; ++i) { + const DataSetElement& child = children_.at(i); + if (child.LocalNameLabel() == label || child.label_ == label) + return i; + } + return -1; +} + +inline const boost::string_ref DataSetElement::LocalNameLabel(void) const +{ return label_.LocalName(); } + +inline const boost::string_ref DataSetElement::PrefixLabel(void) const +{ return label_.Prefix(); } + +inline const std::string& DataSetElement::QualifiedNameLabel(void) const +{ return label_.QualifiedName(); } + +//inline std::string& DataSetElement::Label(void) +//{ return label_.QualifiedName(); } + +inline void DataSetElement::Label(const std::string& label) +{ label_ = XmlName(label, true); } + +inline size_t DataSetElement::NumAttributes(void) const +{ return attributes_.size(); } + +inline size_t DataSetElement::NumChildren(void) const +{ return children_.size(); } + +inline void DataSetElement::RemoveChild(const DataSetElement& e) +{ + children_.erase( + std::remove(children_.begin(), + children_.end(), + e), + children_.end() + ); +} + +inline void DataSetElement::ChildText(const std::string& label, + const std::string& text) +{ + if (!HasChild(label)) { + DataSetElement e(label); + e.Text(text); + AddChild(e); + } else { + Child(label).Text(text); + } +} + +inline bool DataSetElement::IsVerbatimLabel(void) const +{ return label_.Verbatim(); } + +inline const std::string& DataSetElement::Text(void) const +{ return text_; } + +inline std::string& DataSetElement::Text(void) +{ return text_; } + +inline void 
DataSetElement::Text(const std::string& text) +{ text_ = text; } + +inline const XsdType& DataSetElement::Xsd(void) const +{ return xsd_; } + +// ---------------- +// XmlName +// ---------------- + +inline XmlName::XmlName(const std::string& fullName, bool verbatim) + : qualifiedName_(fullName) + , prefixSize_(0) + , localNameOffset_(0) + , localNameSize_(0) + , verbatim_(verbatim) +{ + const size_t colonFound = qualifiedName_.find(':'); + if (colonFound == std::string::npos || colonFound == 0) + localNameSize_ = qualifiedName_.size(); + else { + prefixSize_ = colonFound; + localNameSize_ = (qualifiedName_.size() - colonFound) - 1; + } + + // adjust for colon if prefix present + localNameOffset_ = prefixSize_; + if (prefixSize_ != 0) + ++localNameOffset_; +} + +inline XmlName::XmlName(const std::string& localName, + const std::string& prefix) + : prefixSize_(prefix.size()) + , localNameOffset_(prefixSize_) + , localNameSize_(localName.size()) + , verbatim_(true) +{ + qualifiedName_.clear(); + qualifiedName_.reserve(localNameSize_+ prefixSize_ + 1); + qualifiedName_.append(prefix); + if (!qualifiedName_.empty()) + qualifiedName_.append(1, ':'); + qualifiedName_.append(localName); + + // adjust for colon if prefix present + if (prefixSize_ != 0) + ++localNameOffset_; +} + +inline XmlName::XmlName(const XmlName& other) + : qualifiedName_(other.qualifiedName_) + , prefixSize_(other.prefixSize_) + , localNameOffset_(other.localNameOffset_) + , localNameSize_(other.localNameSize_) + , verbatim_(other.verbatim_) +{ } + +inline XmlName::XmlName(XmlName&& other) + : qualifiedName_(std::move(other.qualifiedName_)) + , prefixSize_(std::move(other.prefixSize_)) + , localNameOffset_(std::move(other.localNameOffset_)) + , localNameSize_(std::move(other.localNameSize_)) + , verbatim_(std::move(other.verbatim_)) +{ } + +inline XmlName& XmlName::operator=(const XmlName& other) +{ + qualifiedName_ = other.qualifiedName_; + prefixSize_ = other.prefixSize_; + localNameOffset_ = 
other.localNameOffset_; + localNameSize_ = other.localNameSize_; + verbatim_ = other.verbatim_; + return *this; +} + +inline XmlName& XmlName::operator=(XmlName&& other) +{ + qualifiedName_ = std::move(other.qualifiedName_); + prefixSize_ = std::move(other.prefixSize_); + localNameOffset_ = std::move(other.localNameOffset_); + localNameSize_ = std::move(other.localNameSize_); + verbatim_ = std::move(other.verbatim_); + return *this; +} + +inline XmlName::~XmlName(void) { } + +inline bool XmlName::operator==(const XmlName& other) const +{ return qualifiedName_ == other.qualifiedName_; } + +inline bool XmlName::operator!=(const XmlName& other) const +{ return !(*this == other); } + +inline const boost::string_ref XmlName::LocalName(void) const +{ return boost::string_ref(qualifiedName_.data() + localNameOffset_, localNameSize_); } + +inline const boost::string_ref XmlName::Prefix(void) const +{ return boost::string_ref(qualifiedName_.data(), prefixSize_); } + +inline const std::string& XmlName::QualifiedName(void) const +{ return qualifiedName_; } + +inline bool XmlName::Verbatim(void) const +{ return verbatim_; } + +} // namespace internal +} // namespace BAM +} // namespace PacBio diff --git a/include/pbbam/internal/DataSetListElement.h b/include/pbbam/internal/DataSetListElement.h new file mode 100644 index 0000000..5c44d25 --- /dev/null +++ b/include/pbbam/internal/DataSetListElement.h @@ -0,0 +1,118 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. 
+ +// Author: Derek Barnett + +#ifndef DATASETLISTELEMENT_H +#define DATASETLISTELEMENT_H + +#include "pbbam/internal/DataSetElement.h" + +namespace PacBio { +namespace BAM { +namespace internal { + +// +// adds iterators for convenience +// +template class DataSetListElement; + +template +class DataSetListIteratorBase +{ +public: + bool operator==(const DataSetListIteratorBase& other) const; + bool operator!=(const DataSetListIteratorBase& other) const; + +protected: + DataSetListIteratorBase(const DataSetListElement* parent, size_t i); + void ReadNext(void); + +protected: + const DataSetListElement* parent_; + size_t index_; +}; + +template +class DataSetListIterator : public DataSetListIteratorBase +{ +public: + DataSetListIterator(const DataSetListElement* parent, size_t i); + T& operator*(void); + T* operator->(void); + DataSetListIterator& operator++(void); + DataSetListIterator operator++(int); +}; + +template +class DataSetListConstIterator : public DataSetListIteratorBase +{ +public: + DataSetListConstIterator(const DataSetListElement* parent, size_t i); + const T& operator*(void) const; + const T* operator->(void) const; + DataSetListConstIterator& operator++(void); + DataSetListConstIterator operator++(int); +}; + +template +class DataSetListElement : public DataSetElement +{ +public: + DataSetListElement(const std::string& label, const XsdType& xsd = XsdType::NONE); + +// child access through index +public: + const T& operator[](size_t index) const; + T& operator[](size_t index); + size_t Size(void) const; + +// child access through iterators +public: + DataSetListIterator begin(void); + DataSetListConstIterator begin(void) const; + DataSetListConstIterator cbegin(void) const; + DataSetListIterator end(void); + DataSetListConstIterator end(void) const; + DataSetListConstIterator cend(void) const; +}; + +} // namespace internal +} // namespace BAM +} // namespace PacBio + +#include "pbbam/internal/DataSetListElement.inl" + +#endif // DATASETLISTELEMENT_H 
diff --git a/include/pbbam/internal/DataSetListElement.inl b/include/pbbam/internal/DataSetListElement.inl new file mode 100644 index 0000000..1479fa5 --- /dev/null +++ b/include/pbbam/internal/DataSetListElement.inl @@ -0,0 +1,181 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. + +// Author: Derek Barnett + +#include "pbbam/internal/DataSetListElement.h" +#include + +namespace PacBio { +namespace BAM { +namespace internal { + +// -------------------- +// DataSetListElement +// -------------------- + +template +inline DataSetListElement::DataSetListElement(const std::string& label, + const XsdType& xsd) + : DataSetElement(label, xsd) +{ } + +template +inline const T& DataSetListElement::operator[](size_t index) const +{ return static_cast(children_.at(index)); } + +template +inline T& DataSetListElement::operator[](size_t index) +{ return static_cast(children_.at(index)); } + +template +inline size_t DataSetListElement::Size(void) const +{ return NumChildren(); } + +template +inline DataSetListIterator DataSetListElement::begin(void) +{ return DataSetListIterator(this, 0); } + +template +inline DataSetListConstIterator DataSetListElement::begin(void) const +{ return DataSetListConstIterator(this, 0); } + +template +inline DataSetListConstIterator DataSetListElement::cbegin(void) const +{ return DataSetListConstIterator(this, 0); } + +template +inline DataSetListIterator DataSetListElement::end(void) +{ return DataSetListIterator(this, NumChildren()); } + +template +inline DataSetListConstIterator DataSetListElement::end(void) const +{ return DataSetListConstIterator(this, NumChildren()); } + +template +inline DataSetListConstIteratorDataSetListElement::cend(void) const +{ return 
DataSetListConstIterator(this, NumChildren()); } + +// ------------------------- +// DataSetListIteratorBase +// ------------------------- + +template +inline bool DataSetListIteratorBase::operator==(const DataSetListIteratorBase& other) const +{ return parent_ == other.parent_ && + index_ == other.index_; +} + +template +inline bool DataSetListIteratorBase::operator!=(const DataSetListIteratorBase& other) const +{ return !(*this == other); } + +template +inline DataSetListIteratorBase::DataSetListIteratorBase(const DataSetListElement* parent, size_t i) + : parent_(parent) + , index_(i) +{ } + +template +inline void DataSetListIteratorBase::ReadNext(void) +{ + if (index_ >= parent_->NumChildren()) { + parent_ = nullptr; + return; + } + ++index_; +} + +// --------------------- +// DataSetListIterator +// --------------------- + +template +inline DataSetListIterator::DataSetListIterator(const DataSetListElement* parent, size_t i) + : DataSetListIteratorBase(parent, i) +{ } + +template +inline T& DataSetListIterator::operator*(void) +{ return DataSetListIteratorBase::parent_->template Child(DataSetListIteratorBase::index_); } + +template +inline T* DataSetListIterator::operator->(void) +{ return &(operator*()); } + +template +inline DataSetListIterator& DataSetListIterator::operator++(void) +{ DataSetListIteratorBase::ReadNext(); return *this; } + +template +inline DataSetListIterator DataSetListIterator::operator++(int) +{ + DataSetListIterator result(*this); + ++(*this); + return result; +} + +// -------------------------- +// DataSetListConstIterator +// -------------------------- + +template +inline DataSetListConstIterator::DataSetListConstIterator(const DataSetListElement* parent, size_t i) + : DataSetListIteratorBase(parent, i) +{ } + +template +inline const T& DataSetListConstIterator::operator*(void) const +{ return DataSetListIteratorBase::parent_->template Child(DataSetListIteratorBase::index_); } + +template +inline const T* 
DataSetListConstIterator::operator->(void) const +{ return &(operator*()); } + +template +inline DataSetListConstIterator& DataSetListConstIterator::operator++(void) +{ DataSetListIteratorBase::ReadNext(); return *this; } + +template +inline DataSetListConstIterator DataSetListConstIterator::operator++(int) +{ + DataSetListConstIterator result(*this); + ++(*this); + return result; +} + +} // namespace internal +} // namespace BAM +} // namespace PacBio diff --git a/include/pbbam/internal/DataSetTypes.inl b/include/pbbam/internal/DataSetTypes.inl new file mode 100644 index 0000000..dbcbd26 --- /dev/null +++ b/include/pbbam/internal/DataSetTypes.inl @@ -0,0 +1,154 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file DataSetTypes.inl +/// \brief Inline implementations for the public DataSet component classes. +// +// Author: Derek Barnett + +#include "pbbam/DataSetTypes.h" + +namespace PacBio { +namespace BAM { + +// ------------- +// DataSetBase +// -------------- + +inline const NamespaceRegistry& DataSetBase::Namespaces(void) const +{ return registry_; } + +inline NamespaceRegistry& DataSetBase::Namespaces(void) +{ return registry_; } + +// --------------------- +// DataSetMetadata +// --------------------- + +inline const std::string& DataSetMetadata::NumRecords(void) const +{ return ChildText("NumRecords"); } + +inline std::string& DataSetMetadata::NumRecords(void) +{ return ChildText("NumRecords"); } + +inline DataSetMetadata& DataSetMetadata::NumRecords(const std::string& numRecords) +{ ChildText("NumRecords", numRecords); return *this; } + +inline const std::string& DataSetMetadata::TotalLength(void) const +{ return ChildText("TotalLength"); } + +inline std::string& DataSetMetadata::TotalLength(void) +{ return ChildText("TotalLength"); } + +inline DataSetMetadata& DataSetMetadata::TotalLength(const std::string& totalLength) +{ ChildText("TotalLength", totalLength); return *this; } + +// ---------- +// Property +// ---------- + +inline const std::string& Property::Name(void) const +{ return Attribute("Name"); } + +inline std::string& Property::Name(void) +{ return Attribute("Name"); } + +inline 
Property& Property::Name(const std::string& name) +{ Attribute("Name", name); return *this; } + +inline const std::string& Property::Operator(void) const +{ return Attribute("Operator"); } + +inline std::string& Property::Operator(void) +{ return Attribute("Operator"); } + +inline Property& Property::Operator(const std::string& op) +{ Attribute("Operator", op); return *this; } + +inline const std::string& Property::Value(void) const +{ return Attribute("Value"); } + +inline std::string& Property::Value(void) +{ return Attribute("Value"); } + +inline Property& Property::Value(const std::string& value) +{ Attribute("Value", value); return *this; } + +// ------------ +// Provenance +// ------------ + +inline const std::string& Provenance::CreatedBy(void) const +{ return Attribute("CreatedBy"); } + +inline std::string& Provenance::CreatedBy(void) +{ return Attribute("CreatedBy"); } + +inline Provenance& Provenance::CreatedBy(const std::string& createdBy) +{ Attribute("CreatedBy", createdBy); return *this; } + +inline const std::string& Provenance::CommonServicesInstanceId(void) const +{ return ChildText("CommonServicesInstanceId"); } + +inline std::string& Provenance::CommonServicesInstanceId(void) +{ return ChildText("CommonServicesInstanceId"); } + +inline Provenance& Provenance::CommonServicesInstanceId(const std::string& id) +{ ChildText("CommonServicesInstanceId", id); return *this; } + +inline const std::string& Provenance::CreatorUserId(void) const +{ return ChildText("CreatorUserId"); } + +inline std::string& Provenance::CreatorUserId(void) +{ return ChildText("CreatorUserId"); } + +inline Provenance& Provenance::CreatorUserId(const std::string& id) +{ ChildText("CreatorUserId", id); return *this; } + +inline const std::string& Provenance::ParentJobId(void) const +{ return ChildText("ParentJobId"); } + +inline std::string& Provenance::ParentJobId(void) +{ return ChildText("ParentJobId"); } + +inline Provenance& Provenance::ParentJobId(const std::string& id) +{ 
ChildText("ParentJobId", id); return *this; } + +inline Provenance& Provenance::ParentTool(const PacBio::BAM::ParentTool& tool) +{ ParentTool() = tool; return *this; } + +} // namespace BAM +} // namespace PacBio diff --git a/include/pbbam/internal/FastaSequence.inl b/include/pbbam/internal/FastaSequence.inl new file mode 100644 index 0000000..fe28170 --- /dev/null +++ b/include/pbbam/internal/FastaSequence.inl @@ -0,0 +1,60 @@ +// Copyright (c) 2016, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file FastaSequence.inl +/// \brief Inline implementations for the FastaSequence class. +// +// Author: Derek Barnett + +#include "pbbam/FastaSequence.h" + +namespace PacBio { +namespace BAM { + +inline FastaSequence::FastaSequence(std::string name, + std::string bases) + : name_{std::move(name)} // both arguments are taken by value and moved into the members + , bases_{std::move(bases)} +{ } + +inline std::string FastaSequence::Bases(void) const +{ return bases_; } // returns a copy of the stored bases (return type is std::string by value) + +inline std::string FastaSequence::Name(void) const +{ return name_; } // returns a copy of the stored name + +} // namespace BAM +} // namespace PacBio diff --git a/include/pbbam/internal/Frames.inl b/include/pbbam/internal/Frames.inl new file mode 100644 index 0000000..37cb64b --- /dev/null +++ b/include/pbbam/internal/Frames.inl @@ -0,0 +1,93 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. 
+// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file Frames.inl +/// \brief Inline implementations for the Frames class. 
+// +// Author: Derek Barnett + +#include "pbbam/Frames.h" + +namespace PacBio { +namespace BAM { + +inline const std::vector& Frames::Data(void) const +{ return data_; } // read-only view of the stored frame values + +inline std::vector& Frames::DataRaw(void) +{ return data_; } // mutable reference to the same storage; callers can edit it in place + +inline std::vector Frames::Encode(void) const +{ return Frames::Encode(data_); } // forwards the stored values to the overload that takes a vector + +inline Frames& Frames::Data(const std::vector& frames) +{ data_ = frames; return *this; } // copy-assigns, returns *this for chaining + +inline Frames& Frames::Data(std::vector&& frames) +{ data_ = std::move(frames); return *this; } // move-assigns, returns *this for chaining + +inline std::vector::const_iterator Frames::begin(void) const +{ return data_.begin(); } + +inline std::vector::iterator Frames::begin(void) +{ return data_.begin(); } + +inline std::vector::const_iterator Frames::cbegin(void) const +{ return data_.cbegin(); } + +inline std::vector::const_iterator Frames::cend(void) const +{ return data_.cend(); } + +inline std::vector::const_iterator Frames::end(void) const +{ return data_.end(); } + +inline std::vector::iterator Frames::end(void) +{ return data_.end(); } + +inline size_t Frames::size(void) const +{ return data_.size(); } + +inline bool Frames::empty(void) const +{ return data_.empty(); } + +inline bool Frames::operator==(const Frames& other) const +{ return data_ == other.data_; } // element-wise comparison of the stored values + +inline bool Frames::operator!=(const Frames& other) const +{ return !(*this == other); } // defined as the negation of operator== + +} // namespace BAM +} // namespace PacBio diff --git a/include/pbbam/internal/GenomicInterval.inl b/include/pbbam/internal/GenomicInterval.inl new file mode 100644 index 0000000..07c18ef --- /dev/null +++ b/include/pbbam/internal/GenomicInterval.inl @@ -0,0 +1,91 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file GenomicInterval.inl +/// \brief Inline implementations for the GenomicInterval class. 
+// +// Author: Derek Barnett + +#include "pbbam/GenomicInterval.h" + +namespace PacBio { +namespace BAM { + +inline GenomicInterval::~GenomicInterval(void) { } + +inline std::string GenomicInterval::Name(void) const +{ return name_; } // returns a copy of the stored name + +inline GenomicInterval& GenomicInterval::Name(const std::string& name) +{ name_ = name; return *this; } // setter, returns *this for chaining + +inline PacBio::BAM::Interval GenomicInterval::Interval(void) const +{ return interval_; } // returns a copy of the wrapped interval + +inline GenomicInterval& GenomicInterval::Interval(const PacBio::BAM::Interval& interval) +{ interval_ = interval; return *this; } // setter, returns *this for chaining + +inline bool GenomicInterval::IsValid(void) const +{ + return !name_.empty() && // a name must be set + interval_.Start() >= 0 && // both coordinates must be non-negative + interval_.Stop() >= 0 && + interval_.IsValid(); // and the wrapped interval must itself report valid +} + +inline size_t GenomicInterval::Length(void) const +{ return interval_.Length(); } // forwarded to the wrapped interval + +inline Position GenomicInterval::Start(void) const +{ return interval_.Start(); } + +inline GenomicInterval& GenomicInterval::Start(const Position start) +{ interval_.Start(start); return *this; } // only the start changes; name and stop are untouched + +inline Position GenomicInterval::Stop(void) const +{ return interval_.Stop(); } + +inline GenomicInterval& GenomicInterval::Stop(const Position stop) +{ interval_.Stop(stop); return *this; } // only the stop changes; name and start are untouched + +inline bool GenomicInterval::operator==(const GenomicInterval& other) const +{ return name_ == other.name_ && interval_ == other.interval_; } // equal only when both name and interval match + +inline bool GenomicInterval::operator!=(const GenomicInterval& other) const +{ return !(*this == other); } + +} // namespace BAM +} // namespace PacBio diff --git a/include/pbbam/internal/Interval.inl b/include/pbbam/internal/Interval.inl new file mode 100644 index 0000000..e9c7edd --- /dev/null +++ b/include/pbbam/internal/Interval.inl @@ -0,0 +1,118 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file Interval.inl +/// \brief Inline implementations for the Interval class. 
+// +// Author: Derek Barnett + +#include "pbbam/Interval.h" + +namespace PacBio { +namespace BAM { + +template +inline Interval::Interval(void) + : data_(boost::icl::discrete_interval::right_open(0,0)) // default is the right-open interval [0,0), which IsValid() reports as empty +{ } + +template +inline Interval::Interval(const T val) + : data_(boost::icl::discrete_interval::right_open(val,val+1)) // single position: stored as [val, val+1) +{ } + +template +inline Interval::Interval(const T start, const T stop) + : data_(boost::icl::discrete_interval::right_open(start,stop)) // stored as [start, stop) +{ } + +template +inline Interval::Interval(const Interval& other) + : data_(boost::icl::discrete_interval::right_open(other.Start(), other.Stop())) // rebuilt from the other interval's bounds rather than copying data_ +{ } + +template +inline bool Interval::operator==(const Interval& other) const +{ return data_ == other.data_; } + +template +inline bool Interval::operator!=(const Interval& other) const +{ return !(data_ == other.data_); } + +template +inline bool Interval::CoveredBy(const Interval& other) const +{ return boost::icl::within(data_, other.data_); } // true when this interval lies within other + +template +inline bool Interval::Covers(const Interval& other) const +{ return boost::icl::contains(data_, other.data_); } // true when this interval contains other + +template +inline bool Interval::Intersects(const Interval& other) const +{ return boost::icl::intersects(data_, other.data_); } + +template +inline bool Interval::IsValid(void) const +{ return !boost::icl::is_empty(data_); } // "valid" here means non-empty + +template +inline size_t Interval::Length(void) const +{ return boost::icl::length(data_); } + +template +inline T Interval::Start(void) const +{ return data_.lower(); } + +template +inline Interval& Interval::Start(const T& start) +{ + data_ = boost::icl::discrete_interval::right_open(start, data_.upper()); // rebuild with the new lower bound, keeping the current upper bound + return *this; +} + +template +inline T Interval::Stop(void) const +{ return data_.upper(); } + +template +inline Interval& Interval::Stop(const T& stop) +{ + data_ = boost::icl::discrete_interval::right_open(data_.lower(), stop); // rebuild with the new upper bound, keeping the current lower bound + return *this; +} + +} // namespace BAM +} // namespace PacBio diff --git a/include/pbbam/internal/PbiBasicTypes.inl 
b/include/pbbam/internal/PbiBasicTypes.inl new file mode 100644 index 0000000..229841e --- /dev/null +++ b/include/pbbam/internal/PbiBasicTypes.inl @@ -0,0 +1,70 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. 
+// +// File Description +/// \file PbiBasicTypes.inl +/// \brief Inline implementations for the basic data structures used in PBI lookups. +// +// Author: Derek Barnett + +#include "pbbam/PbiBasicTypes.h" + +namespace PacBio { +namespace BAM { + +inline IndexResultBlock::IndexResultBlock(void) + : firstIndex_(0) + , numReads_(0) + , virtualOffset_(-1) // NOTE(review): -1 looks like a "not yet resolved" marker - confirm against the code that fills it in +{ } + +inline IndexResultBlock::IndexResultBlock(size_t idx, size_t numReads) + : firstIndex_(idx) + , numReads_(numReads) + , virtualOffset_(-1) // same initial offset as the default constructor +{ } + +inline bool IndexResultBlock::operator==(const IndexResultBlock& other) const +{ + return firstIndex_ == other.firstIndex_ && // all three members take part in equality + numReads_ == other.numReads_ && + virtualOffset_ == other.virtualOffset_; +} + +inline bool IndexResultBlock::operator!=(const IndexResultBlock& other) const +{ return !(*this == other); } // defined as the negation of operator== + +} // namespace BAM +} // namespace PacBio diff --git a/include/pbbam/internal/PbiFilter.inl b/include/pbbam/internal/PbiFilter.inl new file mode 100644 index 0000000..18c26d0 --- /dev/null +++ b/include/pbbam/internal/PbiFilter.inl @@ -0,0 +1,312 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. 
+// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file PbiFilter.inl +/// \brief Inline implementations for the PbiFilter class. +// +// Author: Derek Barnett + +#include "pbbam/PbiFilter.h" +#include +#include +#include +#include +#include + +namespace PacBio { +namespace BAM { +namespace internal { + +/// \internal +/// +/// This class wraps a basic PBI filter (whether a property filter or some operator, +/// e.g. union, intersect, etc.). The wrapper allows PbiFilters to hold heterogeneous, +/// recursive filter types - without exposing pointers & worrying about memory ownership +/// issues between client & library. +/// +/// Filters can be given by value from client code and we will wrap them for composition. +/// +/// \code{.cpp} +/// PbiFilter f1(PbiZmwFilter(42)); +/// PbiFilter f2; +/// f2.Add(PbiQueryLengthFilter(3000, GREATER_THAN_EQUAL)); +/// f2.Add(MyApplicationCustomFilter("foo")); +/// PbiFilter intersect = PbiFilter::Intersect(f1, f2); +/// ... 
+/// \endcode +/// +struct FilterWrapper +{ +public: + template FilterWrapper(T x); + + FilterWrapper(const FilterWrapper& other); + FilterWrapper(FilterWrapper&&) noexcept = default; + FilterWrapper& operator=(const FilterWrapper& other); + FilterWrapper& operator=(FilterWrapper&&) noexcept = default; + ~FilterWrapper(void); + +public: + bool Accepts(const PacBio::BAM::PbiRawData& idx, const size_t row) const; + +private: + struct WrapperInterface + { + virtual ~WrapperInterface(void) = default; + virtual WrapperInterface* Clone(void) const =0; // returns a newly allocated copy; the caller (FilterWrapper) takes ownership + virtual bool Accepts(const PacBio::BAM::PbiRawData& idx, + const size_t row) const =0; + }; + + template + struct WrapperImpl : public WrapperInterface + { + WrapperImpl(T x); + WrapperImpl(const WrapperImpl& other); + WrapperInterface* Clone(void) const; + bool Accepts(const PacBio::BAM::PbiRawData& idx, const size_t row) const; + T data_; // the wrapped client filter, held by value + }; + +private: + std::unique_ptr self_; // owning pointer to the type-erased filter +}; + +// --------------- +// FilterWrapper +// --------------- + +template +inline FilterWrapper::FilterWrapper(T x) + : self_(new WrapperImpl(std::move(x))) +{ } + +inline FilterWrapper::FilterWrapper(const FilterWrapper& other) + : self_(other.self_->Clone()) // deep copy through the virtual Clone(); dereferences other.self_ unconditionally +{ } + +inline FilterWrapper& FilterWrapper::operator=(const FilterWrapper& other) +{ + self_.reset(other.self_->Clone()); // the clone is created before the old payload is released, so self-assignment is safe + return *this; +} + +inline FilterWrapper::~FilterWrapper(void) { } + +inline bool FilterWrapper::Accepts(const PbiRawData& idx, const size_t row) const +{ return self_->Accepts(idx, row); } // forwards to the wrapped filter + +// ---------------- +// WrapperImpl +// ---------------- + +template +inline FilterWrapper::WrapperImpl::WrapperImpl(T x) + : FilterWrapper::WrapperInterface() + , data_(std::move(x)) +{ + BOOST_CONCEPT_ASSERT((PbiFilterConcept)); // compile-time check on the wrapped type +} + +template +inline FilterWrapper::WrapperImpl::WrapperImpl(const WrapperImpl& other) + : FilterWrapper::WrapperInterface() + , data_(other.data_) +{ } + +template +inline FilterWrapper::WrapperInterface* FilterWrapper::WrapperImpl::Clone(void) 
const +{ return new WrapperImpl(*this); } // raw new: ownership passes to the unique_ptr that receives the result + +template +inline bool FilterWrapper::WrapperImpl::Accepts(const PbiRawData& idx, + const size_t row) const +{ return data_.Accepts(idx, row); } + +struct PbiFilterPrivate +{ + PbiFilterPrivate(PbiFilter::CompositionType type) + : type_(type) + { } + + template + void Add(T&& filter) + { + filters_.emplace_back(std::move(filter)); // always moves from the argument; every caller in this file passes std::move(...) + } + + std::unique_ptr DeepCopy(void) + { + auto copy = std::unique_ptr{ new PbiFilterPrivate{type_} }; + copy->filters_ = this->filters_; // copies the child list; each FilterWrapper copy clones its payload + return copy; + } + + bool Accepts(const PbiRawData& idx, const size_t row) const + { + // no filter -> accepts every record + if (filters_.empty()) + return true; + + // intersection of child filters + if (type_ == PbiFilter::INTERSECT) { + for (const auto& filter : filters_) { + if (!filter.Accepts(idx, row)) + return false; // break early on failure + } + return true; // all passed + } + + // union of child filters + else if (type_ == PbiFilter::UNION) { + for (const auto& filter : filters_) { + if (filter.Accepts(idx, row)) + return true; // break early on pass + } + return false; // none passed + } + + else + //assert(false); // invalid composite filter type + throw std::runtime_error("invalid composite filter type in PbiFilterPrivate::Accepts"); + } + + PbiFilter::CompositionType type_; + std::vector filters_; +}; + +} // namespace internal + +inline PbiFilter::PbiFilter(const CompositionType type) + : d_{ new internal::PbiFilterPrivate{ type } } +{ } + +template inline +PbiFilter::PbiFilter(const T& filter) + : d_{ new internal::PbiFilterPrivate{ PbiFilter::INTERSECT } } // a filter built from child filters is always an INTERSECT composite +{ + Add(filter); +} + +template inline +PbiFilter::PbiFilter(T&& filter) + : d_{ new internal::PbiFilterPrivate{ PbiFilter::INTERSECT } } +{ + Add(std::move(filter)); +} + +inline PbiFilter::PbiFilter(const std::vector& filters) + : d_{ new internal::PbiFilterPrivate{ PbiFilter::INTERSECT } } +{ + Add(filters); +} + +inline PbiFilter::PbiFilter(std::vector&& filters) + : d_{ new 
internal::PbiFilterPrivate{ PbiFilter::INTERSECT} } +{ + Add(std::move(filters)); +} + +inline PbiFilter::PbiFilter(const PbiFilter& other) + : d_{ other.d_->DeepCopy() } // dereferences other.d_ unconditionally +{ } + +inline PbiFilter::PbiFilter(PbiFilter&& other) noexcept + : d_{ std::move(other.d_) } // leaves other.d_ null +{ } + +inline PbiFilter& PbiFilter::operator=(const PbiFilter& other) +{ + d_ = other.d_->DeepCopy(); // the copy is built before the old state is released, so self-assignment is safe + return *this; +} + +inline PbiFilter& PbiFilter::operator=(PbiFilter&& other) noexcept +{ + d_ = std::move(other.d_); + return *this; +} + +inline PbiFilter::~PbiFilter(void) { } + +inline bool PbiFilter::Accepts(const PacBio::BAM::PbiRawData& idx, + const size_t row) const +{ return d_->Accepts(idx, row); } + +template +inline PbiFilter& PbiFilter::Add(const T& filter) +{ + T copy = filter; // copy first, then reuse the rvalue overload + return Add(std::move(copy)); +} + +template +inline PbiFilter& PbiFilter::Add(T&& filter) +{ + d_->Add(std::move(filter)); + return *this; +} + +inline PbiFilter& PbiFilter::Add(const PbiFilter& filter) +{ + PbiFilter copy = filter; // copy first, then reuse the rvalue overload + return Add(std::move(copy)); +} + +inline PbiFilter& PbiFilter::Add(PbiFilter&& filter) +{ + d_->Add(std::move(filter)); // the whole composite becomes a single child of this filter + return *this; +} + +inline PbiFilter& PbiFilter::Add(const std::vector& filters) +{ + std::vector copy = filters; // copy first, then reuse the rvalue overload + return Add(std::move(copy)); +} + +inline PbiFilter& PbiFilter::Add(std::vector&& filters) +{ + for (auto&& filter : filters) // each element is moved out and added as its own child + d_->Add(std::move(filter)); + return *this; +} + +inline bool PbiFilter::IsEmpty(void) const +{ return d_->filters_.empty(); } // true when no child filter has been added + +} // namespace BAM +} // namespace PacBio diff --git a/include/pbbam/internal/PbiFilterTypes.inl b/include/pbbam/internal/PbiFilterTypes.inl new file mode 100644 index 0000000..a7a8787 --- /dev/null +++ b/include/pbbam/internal/PbiFilterTypes.inl @@ -0,0 +1,548 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file PbiFilterTypes.inl +/// \brief Inline implementations for the built-in PBI filters. 
+// +// Author: Derek Barnett + +#include "pbbam/PbiFilterTypes.h" +#include +#include + +namespace PacBio { +namespace BAM { + +namespace internal { + +template +inline FilterBase::FilterBase(const T& value, const Compare::Type cmp) + : value_(value) + , cmp_(cmp) +{ } + +template +inline FilterBase::FilterBase(T&& value, const Compare::Type cmp) + : value_(std::move(value)) + , cmp_(cmp) +{ } + +template +inline FilterBase::FilterBase(const std::vector& values) + : multiValue_(values) +{ } + +template +inline FilterBase::FilterBase(std::vector&& values) + : multiValue_(std::move(values)) +{ } + +template +inline bool FilterBase::CompareHelper(const T& lhs) const +{ + if (multiValue_ == boost::none) + return CompareSingleHelper(lhs); + else + return CompareMultiHelper(lhs); +} + +template +inline bool FilterBase::CompareMultiHelper(const T& lhs) const +{ + // check provided value against all filter criteria, + // return true on any exact match + auto iter = multiValue_.get().cbegin(); + const auto end = multiValue_.get().cend(); + for (; iter != end; ++iter) { + if (*iter == lhs) + return true; + } + return false; // no matches +} + +template +inline bool FilterBase::CompareSingleHelper(const T& lhs) const +{ + switch(cmp_) { + case Compare::EQUAL: return lhs == value_; + case Compare::LESS_THAN: return lhs < value_; + case Compare::LESS_THAN_EQUAL: return lhs <= value_; + case Compare::GREATER_THAN: return lhs > value_; + case Compare::GREATER_THAN_EQUAL: return lhs >= value_; + case Compare::NOT_EQUAL: return lhs != value_; + default: + assert(false); + throw std::runtime_error("unsupported compare type requested"); + } +} + +template<> +inline bool FilterBase::CompareSingleHelper(const LocalContextFlags& lhs) const +{ + switch(cmp_) { + case Compare::EQUAL: return lhs == value_; + case Compare::LESS_THAN: return lhs < value_; + case Compare::LESS_THAN_EQUAL: return lhs <= value_; + case Compare::GREATER_THAN: return lhs > value_; + case 
Compare::GREATER_THAN_EQUAL: return lhs >= value_; + case Compare::NOT_EQUAL: return lhs != value_; + case Compare::CONTAINS: return ((lhs & value_) != 0); + case Compare::NOT_CONTAINS: return ((lhs & value_) == 0); + + default: + assert(false); + throw std::runtime_error("unsupported compare type requested"); + } +} + +// BarcodeDataFilterBase + +template +inline BarcodeDataFilterBase::BarcodeDataFilterBase(const T& value, const Compare::Type cmp) + : FilterBase(value, cmp) +{ } + +template +inline BarcodeDataFilterBase::BarcodeDataFilterBase(T&& value, const Compare::Type cmp) + : FilterBase(std::move(value), cmp) +{ } + +template +inline BarcodeDataFilterBase::BarcodeDataFilterBase(const std::vector& values) + : FilterBase(values) +{ } + +template +inline BarcodeDataFilterBase::BarcodeDataFilterBase(std::vector&& values) + : FilterBase(std::move(values)) +{ } + +template +inline bool BarcodeDataFilterBase::BarcodeDataFilterBase::Accepts(const PbiRawData& idx, + const size_t row) const +{ + const PbiRawBarcodeData& barcodeData = idx.BarcodeData(); + switch (field) { + case BarcodeLookupData::BC_FORWARD: return FilterBase::CompareHelper(barcodeData.bcForward_.at(row)); + case BarcodeLookupData::BC_REVERSE: return FilterBase::CompareHelper(barcodeData.bcReverse_.at(row)); + case BarcodeLookupData::BC_QUALITY: return FilterBase::CompareHelper(barcodeData.bcQual_.at(row)); + default: + assert(false); + throw std::runtime_error("unsupported BarcodeData field requested"); + } +} + +// BasicDataFilterBase + +template +inline BasicDataFilterBase::BasicDataFilterBase(const T& value, const Compare::Type cmp) + : FilterBase(value, cmp) +{ } + +template +inline BasicDataFilterBase::BasicDataFilterBase(T&& value, const Compare::Type cmp) + : FilterBase(std::move(value), cmp) +{ } + +template +inline BasicDataFilterBase::BasicDataFilterBase(const std::vector& values) + : FilterBase(values) +{ } + +template +inline BasicDataFilterBase::BasicDataFilterBase(std::vector&& values) 
+ : FilterBase(std::move(values)) +{ } + +template +inline bool BasicDataFilterBase::BasicDataFilterBase::Accepts(const PbiRawData& idx, + const size_t row) const +{ + const PbiRawBasicData& basicData = idx.BasicData(); + switch (field) { + case BasicLookupData::RG_ID: return FilterBase::CompareHelper(basicData.rgId_.at(row)); + case BasicLookupData::Q_START: return FilterBase::CompareHelper(basicData.qStart_.at(row)); + case BasicLookupData::Q_END: return FilterBase::CompareHelper(basicData.qEnd_.at(row)); + case BasicLookupData::ZMW: return FilterBase::CompareHelper(basicData.holeNumber_.at(row)); + case BasicLookupData::READ_QUALITY: return FilterBase::CompareHelper(basicData.readQual_.at(row)); + // BasicLookupData::CONTEXT_FLAG has its own specialization + default: + assert(false); + throw std::runtime_error("unsupported BasicData field requested"); + } +} + +// this typedef exists purely so that the next method signature isn't 2 screen widths long +typedef BasicDataFilterBase LocalContextFilter__; + +template<> +inline bool LocalContextFilter__::BasicDataFilterBase::Accepts(const PbiRawData& idx, + const size_t row) const +{ + const PbiRawBasicData& basicData = idx.BasicData(); + const LocalContextFlags rowFlags = static_cast(basicData.ctxtFlag_.at(row)); + return FilterBase::CompareHelper(rowFlags); +} + +template +inline MappedDataFilterBase::MappedDataFilterBase(const T& value, const Compare::Type cmp) + : FilterBase(value, cmp) +{ } + +template +inline MappedDataFilterBase::MappedDataFilterBase(T&& value, const Compare::Type cmp) + : FilterBase(std::move(value), cmp) +{ } + +template +inline MappedDataFilterBase::MappedDataFilterBase(const std::vector& values) + : FilterBase(values) +{ } + +template +inline MappedDataFilterBase::MappedDataFilterBase(std::vector&& values) + : FilterBase(std::move(values)) +{ } + +template<> +inline bool MappedDataFilterBase::MappedDataFilterBase::Accepts(const PbiRawData& idx, + const size_t row) const +{ + const 
PbiRawMappedData& mappedData = idx.MappedData(); + const Strand strand = (mappedData.revStrand_.at(row) == 1 ? Strand::REVERSE : Strand::FORWARD); + return FilterBase::CompareHelper(strand); +} + +template +inline bool MappedDataFilterBase::MappedDataFilterBase::Accepts(const PbiRawData& idx, + const size_t row) const +{ + const PbiRawMappedData& mappedData = idx.MappedData(); + switch (field) { + case MappedLookupData::T_ID: return FilterBase::CompareHelper(mappedData.tId_.at(row)); + case MappedLookupData::T_START: return FilterBase::CompareHelper(mappedData.tStart_.at(row)); + case MappedLookupData::T_END: return FilterBase::CompareHelper(mappedData.tEnd_.at(row)); + case MappedLookupData::A_START: return FilterBase::CompareHelper(mappedData.aStart_.at(row)); + case MappedLookupData::A_END: return FilterBase::CompareHelper(mappedData.aEnd_.at(row)); + case MappedLookupData::N_M: return FilterBase::CompareHelper(mappedData.nM_.at(row)); + case MappedLookupData::N_MM: return FilterBase::CompareHelper(mappedData.nMM_.at(row)); + case MappedLookupData::N_DEL: return FilterBase::CompareHelper(mappedData.NumDeletedBasesAt(row)); + case MappedLookupData::N_INS: return FilterBase::CompareHelper(mappedData.NumInsertedBasesAt(row)); + case MappedLookupData::MAP_QUALITY: return FilterBase::CompareHelper(mappedData.mapQV_.at(row)); + default: + assert(false); + throw std::runtime_error("unsupported MappedData field requested"); + } +} + +} // namespace internal + +// PbiAlignedEndFilter + +inline PbiAlignedEndFilter::PbiAlignedEndFilter(const uint32_t position, const Compare::Type cmp) + : internal::MappedDataFilterBase(position, cmp) +{ } + +// PbiAlignedLengthFilter + +inline PbiAlignedLengthFilter::PbiAlignedLengthFilter(const uint32_t length, const Compare::Type cmp) + : internal::FilterBase(length, cmp) +{ } + +// PbiAlignedStartFilter + +inline PbiAlignedStartFilter::PbiAlignedStartFilter(const uint32_t position, const Compare::Type cmp) + : 
internal::MappedDataFilterBase(position, cmp) +{ } + +// PbiAlignedStrandFilter + +inline PbiAlignedStrandFilter::PbiAlignedStrandFilter(const Strand strand, const Compare::Type cmp) + : internal::MappedDataFilterBase(strand, cmp) +{ + if (cmp != Compare::EQUAL && cmp != Compare::NOT_EQUAL) { + auto msg = std::string{ "Compare type: " }; + msg += Compare::TypeToName(cmp); + msg += " not supported for PbiAlignedStrandFilter (use one of Compare::EQUAL or Compare::NOT_EQUAL)."; + throw std::runtime_error(msg); + } +} + +// PbiBarcodeFilter + +inline PbiBarcodeFilter::PbiBarcodeFilter(const int16_t barcode, const Compare::Type cmp) + : compositeFilter_{ PbiFilter::Union({ PbiBarcodeForwardFilter{barcode,cmp}, + PbiBarcodeReverseFilter{barcode,cmp} + }) + } +{ } + +inline PbiBarcodeFilter::PbiBarcodeFilter(const std::vector& whitelist) + : compositeFilter_{ PbiFilter::Union({ PbiBarcodeForwardFilter{whitelist}, + PbiBarcodeReverseFilter{whitelist} + }) + } +{ } + +// Builds the union of a forward-barcode and a reverse-barcode whitelist filter +// from an rvalue whitelist. Braced-init-list elements are evaluated left to +// right, so the whitelist is copied into the forward filter first and only then +// moved into the reverse filter. Moving it into both (as was done previously) +// would construct the reverse filter from an already moved-from vector. +inline PbiBarcodeFilter::PbiBarcodeFilter(std::vector&& whitelist) + : compositeFilter_{ PbiFilter::Union({ PbiBarcodeForwardFilter{whitelist}, + PbiBarcodeReverseFilter{std::move(whitelist)} + }) + } +{ } + +inline bool PbiBarcodeFilter::Accepts(const PbiRawData& idx, const size_t row) const +{ return compositeFilter_.Accepts(idx, row); } + +// PbiBarcodeForwardFilter + +inline PbiBarcodeForwardFilter::PbiBarcodeForwardFilter(const int16_t bcFwdId, const Compare::Type cmp) + : internal::BarcodeDataFilterBase(bcFwdId, cmp) +{ } + +inline PbiBarcodeForwardFilter::PbiBarcodeForwardFilter(const std::vector& whitelist) + : internal::BarcodeDataFilterBase(whitelist) +{ } + +inline PbiBarcodeForwardFilter::PbiBarcodeForwardFilter(std::vector&& whitelist) + : internal::BarcodeDataFilterBase(std::move(whitelist)) +{ } + +// PbiBarcodeQualityFilter + +inline PbiBarcodeQualityFilter::PbiBarcodeQualityFilter(const uint8_t bcQuality, const Compare::Type cmp) + : internal::BarcodeDataFilterBase(bcQuality, cmp) 
+{ } + +// PbiBarcodeReverseFilter + +inline PbiBarcodeReverseFilter::PbiBarcodeReverseFilter(const int16_t bcRevId, const Compare::Type cmp) + : internal::BarcodeDataFilterBase(bcRevId, cmp) +{ } + +inline PbiBarcodeReverseFilter::PbiBarcodeReverseFilter(const std::vector& whitelist) + : internal::BarcodeDataFilterBase(whitelist) +{ } + +inline PbiBarcodeReverseFilter::PbiBarcodeReverseFilter(std::vector&& whitelist) + : internal::BarcodeDataFilterBase(std::move(whitelist)) +{ } + +// PbiBarcodesFilter + +inline PbiBarcodesFilter::PbiBarcodesFilter(const std::pair barcodes, const Compare::Type cmp) + : PbiBarcodesFilter(barcodes.first, barcodes.second, cmp) +{ } + +inline PbiBarcodesFilter::PbiBarcodesFilter(const int16_t bcForward, const int16_t bcReverse, const Compare::Type cmp) + : compositeFilter_{ PbiFilter::Intersection({ PbiBarcodeForwardFilter{bcForward,cmp}, + PbiBarcodeReverseFilter{bcReverse,cmp} + }) + } +{ } + +inline bool PbiBarcodesFilter::Accepts(const PbiRawData& idx, const size_t row) const +{ return compositeFilter_.Accepts(idx, row); } + +// PbiIdentityFilter + +inline PbiIdentityFilter::PbiIdentityFilter(const float identity, + const Compare::Type cmp) + : internal::FilterBase(identity, cmp) +{ } + +// PbiLocalContextFilter + +inline PbiLocalContextFilter::PbiLocalContextFilter(const LocalContextFlags& flags, + const Compare::Type cmp) + : internal::BasicDataFilterBase(flags, cmp) +{ } + +// PbiMapQualityFilter + +inline PbiMapQualityFilter::PbiMapQualityFilter(const uint8_t mapQual, const Compare::Type cmp) + : internal::MappedDataFilterBase(mapQual, cmp) +{ } + +// PbiMovieNameFilter + +inline bool PbiMovieNameFilter::Accepts(const PbiRawData& idx, const size_t row) const +{ return compositeFilter_.Accepts(idx, row); } + +// PbiNumDeletedBasesFilter + +inline PbiNumDeletedBasesFilter::PbiNumDeletedBasesFilter(const size_t numDeletions, const Compare::Type cmp) + : internal::MappedDataFilterBase(numDeletions, cmp) +{ } + +// 
PbiNumInsertedBasesFilter + +inline PbiNumInsertedBasesFilter::PbiNumInsertedBasesFilter(const size_t numInsertions, const Compare::Type cmp) + : internal::MappedDataFilterBase(numInsertions, cmp) +{ } + +// PbiNumMatchesFilter + +inline PbiNumMatchesFilter::PbiNumMatchesFilter(const size_t numMatchedBases, const Compare::Type cmp) + : internal::MappedDataFilterBase(numMatchedBases, cmp) +{ } + +// PbiNumMismatchesFilter + +inline PbiNumMismatchesFilter::PbiNumMismatchesFilter(const size_t numMismatchedBases, const Compare::Type cmp) + : internal::MappedDataFilterBase(numMismatchedBases, cmp) +{ } + +// PbiQueryEndFilter + +inline PbiQueryEndFilter::PbiQueryEndFilter(const int32_t position, const Compare::Type cmp) + : internal::BasicDataFilterBase(position, cmp) +{ } + +// PbiQueryLengthFilter + +inline PbiQueryLengthFilter::PbiQueryLengthFilter(const int32_t length, const Compare::Type cmp) + : internal::FilterBase(length, cmp) +{ } + +// PbiQueryStartFilter + +inline PbiQueryStartFilter::PbiQueryStartFilter(const int32_t position, const Compare::Type cmp) + : internal::BasicDataFilterBase(position, cmp) +{ } + +// PbiReadAccuracyFilter + +inline PbiReadAccuracyFilter::PbiReadAccuracyFilter(const Accuracy accuracy, const Compare::Type cmp) + : internal::BasicDataFilterBase(accuracy, cmp) +{ } + +// PbiReadGroupFilter + +inline PbiReadGroupFilter::PbiReadGroupFilter(const int32_t rgId, const Compare::Type cmp) + : internal::BasicDataFilterBase(rgId, cmp) +{ } + +inline PbiReadGroupFilter::PbiReadGroupFilter(const std::string rgId, const Compare::Type cmp) + : PbiReadGroupFilter(ReadGroupInfo::IdToInt(rgId), cmp) +{ } + +inline PbiReadGroupFilter::PbiReadGroupFilter(const ReadGroupInfo& rg, const Compare::Type cmp) + : PbiReadGroupFilter(rg.Id(), cmp) +{ } + +inline PbiReadGroupFilter::PbiReadGroupFilter(const std::vector& whitelist) + : internal::BasicDataFilterBase(whitelist) +{ } + +inline PbiReadGroupFilter::PbiReadGroupFilter(std::vector&& whitelist) + : 
internal::BasicDataFilterBase(std::move(whitelist)) +{ } + +inline PbiReadGroupFilter::PbiReadGroupFilter(const std::vector& whitelist) + : internal::BasicDataFilterBase(std::vector()) +{ + multiValue_->reserve(whitelist.size()); + for (const auto& rg : whitelist) + multiValue_->push_back(ReadGroupInfo::IdToInt(rg)); +} + +inline PbiReadGroupFilter::PbiReadGroupFilter(std::vector&& whitelist) + : internal::BasicDataFilterBase(std::vector()) +{ + multiValue_->reserve(whitelist.size()); + for (auto&& rg : whitelist) + multiValue_->push_back(ReadGroupInfo::IdToInt(rg)); +} + +inline PbiReadGroupFilter::PbiReadGroupFilter(const std::vector& whitelist) + : internal::BasicDataFilterBase(std::vector()) +{ + multiValue_->reserve(whitelist.size()); + for (const auto& rg : whitelist) + multiValue_->push_back(ReadGroupInfo::IdToInt(rg.Id())); +} + +inline PbiReadGroupFilter::PbiReadGroupFilter(std::vector&& whitelist) + : internal::BasicDataFilterBase(std::vector()) +{ + multiValue_->reserve(whitelist.size()); + for (auto&& rg : whitelist) + multiValue_->push_back(ReadGroupInfo::IdToInt(rg.Id())); +} + +// PbiReferenceEndFilter + +inline PbiReferenceEndFilter::PbiReferenceEndFilter(const uint32_t tEnd, const Compare::Type cmp) + : internal::MappedDataFilterBase(tEnd, cmp) +{ } + +// PbiReferenceIdFilter + +inline PbiReferenceIdFilter::PbiReferenceIdFilter(const int32_t tId, const Compare::Type cmp) + : internal::MappedDataFilterBase(tId, cmp) +{ } + +inline PbiReferenceIdFilter::PbiReferenceIdFilter(const std::vector& whitelist) + : internal::MappedDataFilterBase(whitelist) +{ } + +inline PbiReferenceIdFilter::PbiReferenceIdFilter(std::vector&& whitelist) + : internal::MappedDataFilterBase(std::move(whitelist)) +{ } + +// PbiReferenceStartFilter + +inline PbiReferenceStartFilter::PbiReferenceStartFilter(const uint32_t tStart, const Compare::Type cmp) + : internal::MappedDataFilterBase(tStart, cmp) +{ } + +// PbiZmwFilter + +inline PbiZmwFilter::PbiZmwFilter(const int32_t zmw, 
const Compare::Type cmp) + : internal::BasicDataFilterBase(zmw, cmp) +{ } + +inline PbiZmwFilter::PbiZmwFilter(const std::vector& whitelist) + : internal::BasicDataFilterBase(whitelist) +{ } + +inline PbiZmwFilter::PbiZmwFilter(std::vector&& whitelist) + : internal::BasicDataFilterBase(std::move(whitelist)) +{ } + +} // namespace BAM +} // namespace PacBio diff --git a/include/pbbam/internal/PbiIndex.inl b/include/pbbam/internal/PbiIndex.inl new file mode 100644 index 0000000..ca4c4ce --- /dev/null +++ b/include/pbbam/internal/PbiIndex.inl @@ -0,0 +1,165 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file PbiIndex.inl +/// \brief Inline implementations for the PbiIndex class. +// +// Author: Derek Barnett + +#include "pbbam/BamRecord.h" +#include "pbbam/PbiFile.h" +#include "pbbam/PbiIndex.h" +#include "pbbam/PbiRawData.h" + +#include +#include +#include +#include +#include +#include +#include + +namespace PacBio { +namespace BAM { +namespace internal { + +// -------------------------- +// Pbi Lookup Aggregate +// -------------------------- + +class PbiIndexPrivate +{ +public: + PbiIndexPrivate(void); + PbiIndexPrivate(const PbiRawData& rawIndex); + PbiIndexPrivate(PbiRawData&& rawIndex); + + std::unique_ptr DeepCopy(void) const; + +public: + bool HasSection(const PbiFile::Section flag) const; + void SetSection(const PbiFile::Section flag, bool ok = true); + +public: + IndexResultBlocks LookupReference(const int32_t tId) const; + +private: + IndexResultBlocks MergeBlocksWithOffsets(const IndexList& indices) const; + +public: + std::string filename_; + PbiFile::VersionEnum version_; + PbiFile::Sections sections_; + uint32_t numReads_; + + // lookup structures + BasicLookupData basicData_; + MappedLookupData mappedData_; + ReferenceLookupData referenceData_; + BarcodeLookupData barcodeData_; + +private: + // not-implemented - ensure no copy + PbiIndexPrivate(const PbiIndexPrivate& other); + PbiIndexPrivate& operator=(const PbiIndexPrivate& other); +}; + +inline bool 
PbiIndexPrivate::HasSection(const PbiFile::Section flag) const +{ return (sections_ & flag) != 0; } + +inline void PbiIndexPrivate::SetSection(const PbiFile::Section flag, bool ok) +{ if (ok) sections_ |= flag; else sections_ &= ~flag; } + +inline IndexResultBlocks +PbiIndexPrivate::LookupReference(const int32_t tId) const +{ + if (!HasSection(PbiFile::REFERENCE)) + return IndexResultBlocks{ }; + + const auto& indexRange = referenceData_.Indices(tId); + if (indexRange.first == nullIndex() && indexRange.second == nullIndex()) + return IndexResultBlocks{ }; + const auto numReads = indexRange.second - indexRange.first; + auto blocks = IndexResultBlocks{ IndexResultBlock(indexRange.first, numReads) }; + basicData_.ApplyOffsets(blocks); + return blocks; +} + +inline IndexResultBlocks +PbiIndexPrivate::MergeBlocksWithOffsets(const IndexList& indices) const +{ + auto blocks = mergedIndexBlocks(indices); + basicData_.ApplyOffsets(blocks); + return blocks; +} + +} // namespace internal + +inline PbiFile::Sections PbiIndex::FileSections(void) const +{ return d_->sections_; } + +inline bool PbiIndex::HasBarcodeData(void) const +{ return d_->HasSection(PbiFile::BARCODE); } + +inline bool PbiIndex::HasMappedData(void) const +{ return d_->HasSection(PbiFile::MAPPED); } + +inline bool PbiIndex::HasReferenceData(void) const +{ return d_->HasSection(PbiFile::REFERENCE); } + +inline bool PbiIndex::HasSection(const PbiFile::Section section) const +{ return d_->HasSection(section); } + +inline uint32_t PbiIndex::NumReads(void) const +{ return d_->numReads_; } + +inline PbiFile::VersionEnum PbiIndex::Version(void) const +{ return d_->version_; } + +inline const BarcodeLookupData& PbiIndex::BarcodeData(void) const +{ return d_->barcodeData_; } + +inline const BasicLookupData& PbiIndex::BasicData(void) const +{ return d_->basicData_; } + +inline const MappedLookupData& PbiIndex::MappedData(void) const +{ return d_->mappedData_; } + +inline const ReferenceLookupData& 
PbiIndex::ReferenceData(void) const +{ return d_->referenceData_; } + +} // namespace BAM +} // namespace PacBio diff --git a/include/pbbam/internal/PbiLookupData.inl b/include/pbbam/internal/PbiLookupData.inl new file mode 100644 index 0000000..2ca38f3 --- /dev/null +++ b/include/pbbam/internal/PbiLookupData.inl @@ -0,0 +1,531 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file PbiLookupData.inl +/// \brief Inline implementations for the classes used for PBI data lookup. +// +// Author: Derek Barnett + +#include "pbbam/PbiLookupData.h" +#include "pbbam/PbiRawData.h" +#include "pbbam/Strand.h" +#include +#include +#include + +namespace PacBio { +namespace BAM { + +// ---------------- +// helper methods +// ---------------- + +inline IndexResultBlocks mergedIndexBlocks(IndexList&& indices) +{ + if (indices.empty()) + return IndexResultBlocks{ }; + + std::sort(indices.begin(), indices.end()); + auto newEndIter = std::unique(indices.begin(), indices.end()); + auto numIndices = std::distance(indices.begin(), newEndIter); + assert(!indices.empty()); + auto result = IndexResultBlocks{ IndexResultBlock(indices.at(0), 1) }; + for (auto i = 1; i < numIndices; ++i) { + if (indices.at(i) == indices.at(i-1)+1) + ++result.back().numReads_; + else + result.push_back(IndexResultBlock(indices.at(i), 1)); + } + return result; +} + +inline IndexResultBlocks mergedIndexBlocks(const IndexList& indices) +{ + auto copy = indices; + return mergedIndexBlocks(std::move(copy)); +} + +inline size_t nullIndex(void) +{ return static_cast(-1); } + +inline void pushBackIndices(IndexList& result, + const IndexList& toAppend) +{ + result.reserve(result.size() + toAppend.size()); + for (auto element : toAppend) + result.push_back(element); +} + +// ----------------- +// OrderedLookup +// 
----------------- + +template +inline OrderedLookup::OrderedLookup(void) { } + +template +inline OrderedLookup::OrderedLookup(const container_type& data) + : data_(data) +{ } + +template +inline OrderedLookup::OrderedLookup(container_type&& data) + : data_(std::move(data)) +{ } + +template +inline OrderedLookup::OrderedLookup(const std::vector& rawData) +{ + const auto numElements = rawData.size(); + for (auto i = decltype(numElements){0}; i < numElements; ++i) + data_[rawData.at(i)].push_back(i); +} + +template +inline OrderedLookup::OrderedLookup(std::vector&& rawData) +{ + const auto numElements = rawData.size(); + for (auto i = decltype(numElements){0}; i < numElements; ++i) + data_[rawData.at(i)].push_back(i); +} + +template +inline bool OrderedLookup::operator==(const OrderedLookup& other) const +{ return data_ == other.data_; } + +template +inline bool OrderedLookup::operator!=(const OrderedLookup& other) const +{ return !(*this == other); } + +template +inline typename OrderedLookup::iterator OrderedLookup::begin(void) +{ return data_.begin(); } + +template +inline typename OrderedLookup::const_iterator OrderedLookup::begin(void) const +{ return data_.cbegin(); } + +template +inline typename OrderedLookup::const_iterator OrderedLookup::cbegin(void) const +{ return data_.cbegin(); } + +template +inline typename OrderedLookup::iterator OrderedLookup::end(void) +{ return data_.end(); } + +template +inline typename OrderedLookup::const_iterator OrderedLookup::end(void) const +{ return data_.cend(); } + +template +inline typename OrderedLookup::const_iterator OrderedLookup::cend(void) const +{ return data_.cend(); } + +template +inline bool OrderedLookup::empty(void) const +{ return data_.empty(); } + +template +inline size_t OrderedLookup::size(void) const +{ return data_.size(); } + +template +inline IndexList +OrderedLookup::LookupInclusiveRange(const const_iterator &begin, + const const_iterator &end) const +{ + auto result = IndexList{ }; + for (auto iter = 
begin; iter != end; ++iter) + pushBackIndices(result, iter->second); + std::sort(result.begin(), result.end()); + return result; +} + +template +inline IndexList +OrderedLookup::LookupExclusiveRange(const const_iterator& begin, + const const_iterator& end, + const key_type& key) const +{ + auto result = IndexList{ }; + for (auto iter = begin; iter != end; ++iter) { + if (iter->first != key) + pushBackIndices(result, iter->second); + } + std::sort(result.begin(), result.end()); + return result; +} + +template +inline IndexList +OrderedLookup::LookupIndices(const OrderedLookup::key_type& key, + const Compare::Type& compare) const +{ + auto begin = data_.cbegin(); + auto end = data_.cend(); + switch(compare) + { + case Compare::EQUAL: + { + const auto found = data_.find(key); + if (found != end) + return found->second; + return IndexList(); + } + case Compare::LESS_THAN: return LookupExclusiveRange(begin, data_.upper_bound(key), key); + case Compare::LESS_THAN_EQUAL: return LookupInclusiveRange(begin, data_.upper_bound(key)); + case Compare::GREATER_THAN: return LookupExclusiveRange(data_.lower_bound(key), end, key); + case Compare::GREATER_THAN_EQUAL: return LookupInclusiveRange(data_.lower_bound(key), end); + case Compare::NOT_EQUAL: return LookupExclusiveRange(begin, end, key); + default: + assert(false); + } + return IndexList{ }; +} + +template +inline std::vector OrderedLookup::Unpack(void) const +{ + auto result = std::vector{ }; + auto iter = cbegin(); + const auto end = cend(); + for ( ; iter != end; ++iter ) { + const auto& indices = iter->second; + for (auto&& i : indices) { + if (result.size() <= i) + result.resize(i+1); + result[i] = iter->first; + } + } + return result; +} + +// ----------------- +// UnorderedLookup +// ----------------- + +template +inline UnorderedLookup::UnorderedLookup(void) { } + +template +inline UnorderedLookup::UnorderedLookup(const container_type& data) + : data_(data) +{ } + +template +inline 
UnorderedLookup::UnorderedLookup(container_type&& data) + : data_(std::move(data)) +{ } + +template +inline UnorderedLookup::UnorderedLookup(const std::vector& rawData) +{ + const auto numElements = rawData.size(); + for (auto i = decltype(numElements){0}; i < numElements; ++i) + data_[rawData.at(i)].push_back(i); +} + +template +inline UnorderedLookup::UnorderedLookup(std::vector&& rawData) +{ + const auto numElements = rawData.size(); + for (auto i = decltype(numElements){0}; i < numElements; ++i) + data_[rawData.at(i)].push_back(i); +} + +template +inline bool UnorderedLookup::operator==(const UnorderedLookup& other) const +{ return data_ == other.data_; } + +template +inline bool UnorderedLookup::operator!=(const UnorderedLookup& other) const +{ return !(*this == other); } + +template +inline typename UnorderedLookup::iterator UnorderedLookup::begin(void) +{ return data_.begin(); } + +template +inline typename UnorderedLookup::const_iterator UnorderedLookup::begin(void) const +{ return data_.cbegin(); } + +template +inline typename UnorderedLookup::const_iterator UnorderedLookup::cbegin(void) const +{ return data_.cbegin(); } + +template +inline typename UnorderedLookup::iterator UnorderedLookup::end(void) +{ return data_.end(); } + +template +inline typename UnorderedLookup::const_iterator UnorderedLookup::end(void) const +{ return data_.cend(); } + +template +inline typename UnorderedLookup::const_iterator UnorderedLookup::cend(void) const +{ return data_.cend(); } + +template +inline bool UnorderedLookup::empty(void) const +{ return data_.empty(); } + +template +inline size_t UnorderedLookup::size(void) const +{ return data_.size(); } + +template +template +inline IndexList +UnorderedLookup::LookupHelper(const UnorderedLookup::key_type& key, + const Compare& cmp) const +{ + auto result = IndexList{ }; // init with some avg size ?? 
+    const auto end = data_.cend();
+    for (auto iter = data_.cbegin(); iter != end; ++iter) {
+        const auto e = (iter->first);
+        if (cmp(e, key))
+            pushBackIndices(result, iter->second);
+    }
+    std::sort(result.begin(), result.end());
+    return result;
+}
+
+/// \brief Returns the indices stored for keys that satisfy \p compare
+///        against \p key.
+///
+/// EQUAL is answered with a direct find() and returns the stored list as-is.
+/// Every other comparison goes through LookupHelper with the matching
+/// standard function object.
+///
+/// \param[in] key      value the stored keys are compared against
+/// \param[in] compare  comparison to apply
+/// \returns matching indices; empty if nothing matches or (with asserts
+///          compiled out) if \p compare is not a handled value
+template<typename T>
+inline IndexList
+UnorderedLookup<T>::LookupIndices(const UnorderedLookup::key_type& key,
+                                  const Compare::Type& compare) const
+{
+    switch (compare) {
+        case Compare::EQUAL:
+        {
+            const auto found = data_.find(key);
+            if (found != data_.cend())
+                return found->second;
+            else
+                return IndexList();
+        }
+        case Compare::LESS_THAN:          return LookupHelper(key, std::less<T>());
+        case Compare::LESS_THAN_EQUAL:    return LookupHelper(key, std::less_equal<T>());
+        case Compare::GREATER_THAN:       return LookupHelper(key, std::greater<T>());
+        case Compare::GREATER_THAN_EQUAL: return LookupHelper(key, std::greater_equal<T>());
+        case Compare::NOT_EQUAL:          return LookupHelper(key, std::not_equal_to<T>());
+        default:
+            assert(false);
+    }
+    return IndexList{ };
+}
+
+/// \brief Expands the lookup into a flat vector: element i holds the key
+///        whose index list contains i.
+///
+/// The result grows on demand, so positions that no key refers to are left
+/// value-initialized.
+template<typename T>
+inline std::vector<T> UnorderedLookup<T>::Unpack(void) const
+{
+    auto result = std::vector<T>{ };
+    auto iter = cbegin();
+    const auto end = cend();
+    for ( ; iter != end; ++iter ) {
+        const auto& indices = iter->second;
+        for (auto&& i : indices) {
+            if (result.size() <= i)
+                result.resize(i+1);
+            result[i] = iter->first;
+        }
+    }
+    return result;
+}
+
+// -------------------
+// BasicLookupData
+// -------------------
+
+/// \brief Sets each block's virtualOffset_ to the file offset recorded for
+///        the block's firstIndex_.
+///
+/// The lookup uses at(), so an out-of-range firstIndex_ is reported rather
+/// than read past the end.
+inline
+void BasicLookupData::ApplyOffsets(IndexResultBlocks& blocks) const
+{
+    for (IndexResultBlock& block : blocks)
+        block.virtualOffset_ = fileOffset_.at(block.firstIndex_);
+}
+
+/// \brief Dispatches an index lookup to the container that backs \p field.
+///
+/// \param[in] field        basic-data field to query
+/// \param[in] value        value to compare against
+/// \param[in] compareType  comparison to apply
+/// \returns matching indices; empty for VIRTUAL_OFFSET or an unknown field
+///          when asserts are compiled out
+template<typename T>
+inline IndexList BasicLookupData::Indices(const BasicLookupData::Field& field,
+                                          const T& value,
+                                          const Compare::Type& compareType) const
+{
+    switch(field) {
+        case BasicLookupData::RG_ID:        return rgId_.LookupIndices(value, compareType);
+        case BasicLookupData::Q_START:      return qStart_.LookupIndices(value, compareType);
+        case BasicLookupData::Q_END:        return qEnd_.LookupIndices(value, compareType);
+        case BasicLookupData::ZMW:          return holeNumber_.LookupIndices(value, compareType);
+        case BasicLookupData::READ_QUALITY: return readQual_.LookupIndices(value, compareType);
+        case BasicLookupData::CONTEXT_FLAG: return ctxtFlag_.LookupIndices(value, compareType);
+
+        case BasicLookupData::VIRTUAL_OFFSET : // fall-through, not supported this way
+        default:
+            assert(false);
+    }
+    return IndexList{ };
+}
+
+/// \brief Concatenates the EQUAL matches for every entry of \p values, in
+///        the order the values are given.
+template<typename T>
+inline IndexList BasicLookupData::IndicesMulti(const BasicLookupData::Field& field,
+                                               const std::vector<T>& values) const
+{
+    auto result = IndexList{ };
+    for (const auto& value : values) {
+        const auto valueIndices = Indices(field, value, Compare::EQUAL);
+        // one range append; an exact-size reserve() per value would force a
+        // reallocation on every iteration
+        result.insert(result.end(), valueIndices.begin(), valueIndices.end());
+    }
+    return result;
+}
+
+/// \returns the stored per-record file offsets.
+// NOTE(review): the element type was lost from this copy of the patch and is
+// reconstructed as int64_t -- confirm against PbiLookupData.h.
+inline const std::vector<int64_t>& BasicLookupData::VirtualFileOffsets(void) const
+{ return fileOffset_; }
+
+// -------------------
+// MappedLookupData
+// -------------------
+
+/// \brief Dispatches an index lookup to the container that backs \p field.
+///
+/// STRAND is not handled here; it has its own specialization below.
+template<typename T>
+inline IndexList MappedLookupData::Indices(const MappedLookupData::Field& field,
+                                           const T& value,
+                                           const Compare::Type& compareType) const
+{
+    switch(field) {
+        case MappedLookupData::T_ID:        return tId_.LookupIndices(value, compareType);
+        case MappedLookupData::T_START:     return tStart_.LookupIndices(value, compareType);
+        case MappedLookupData::T_END:       return tEnd_.LookupIndices(value, compareType);
+        case MappedLookupData::A_START:     return aStart_.LookupIndices(value, compareType);
+        case MappedLookupData::A_END:       return aEnd_.LookupIndices(value, compareType);
+        case MappedLookupData::N_M:         return nM_.LookupIndices(value, compareType);
+        case MappedLookupData::N_MM:        return nMM_.LookupIndices(value, compareType);
+        case MappedLookupData::N_DEL:       return nDel_.LookupIndices(value, compareType);
+        case MappedLookupData::N_INS:       return nIns_.LookupIndices(value, compareType);
+        case MappedLookupData::MAP_QUALITY: return mapQV_.LookupIndices(value, compareType);
+
+        // MappedField::STRAND has its own specialization
+
+        default:
+            assert(false);
+    }
+    return IndexList{ };
+}
+
+/// \brief Strand lookup: returns the forward or reverse index list.
+///
+/// Only EQUAL and NOT_EQUAL are supported; NOT_EQUAL returns the list for
+/// the opposite strand. Any other comparison asserts and, with asserts
+/// compiled out, returns an empty list.
+template<>
+inline IndexList MappedLookupData::Indices(const MappedLookupData::Field& field,
+                                           const Strand& strand,
+                                           const Compare::Type& compareType) const
+{
+    assert(field == MappedLookupData::STRAND);
+    (void)field; // quash warnings building in release mode
+
+    if (compareType == Compare::EQUAL) {
+        if (strand == Strand::FORWARD)
+            return forwardStrand_;
+        else
+            return reverseStrand_;
+    } else if (compareType == Compare::NOT_EQUAL) {
+        if (strand == Strand::FORWARD)
+            return reverseStrand_;
+        else
+            return forwardStrand_;
+    }
+
+    // only EQUAL/NOT_EQUAL supported
+    assert(false);
+    return IndexList{ };
+}
+
+/// \brief Concatenates the EQUAL matches for every entry of \p values, in
+///        the order the values are given.
+template<typename T>
+inline IndexList MappedLookupData::IndicesMulti(const MappedLookupData::Field& field,
+                                                const std::vector<T>& values) const
+{
+    auto result = IndexList{ };
+    for (const auto& value : values) {
+        const auto valueIndices = Indices(field, value, Compare::EQUAL);
+        // one range append; an exact-size reserve() per value would force a
+        // reallocation on every iteration
+        result.insert(result.end(), valueIndices.begin(), valueIndices.end());
+    }
+    return result;
+}
+
+
+// ---------------------
+// ReferenceLookupData
+// ---------------------
+
+/// \returns the index range stored for \p tId, or
+///          { nullIndex(), nullIndex() } if \p tId is not present.
+inline IndexRange ReferenceLookupData::Indices(const int32_t tId) const
+{
+    auto found = references_.find(tId);
+    if (found == references_.cend())
+        return IndexRange{ nullIndex(), nullIndex() };
+    return found->second;
+}
+
+// -------------------
+// BarcodeLookupData
+// -------------------
+
+/// \brief Dispatches an index lookup to the container that backs \p field.
+template<typename T>
+inline IndexList BarcodeLookupData::Indices(const BarcodeLookupData::Field &field,
+                                            const T& value,
+                                            const Compare::Type &compareType) const
+{
+    switch(field) {
+        case BarcodeLookupData::BC_FORWARD: return bcForward_.LookupIndices(value, compareType);
+        case BarcodeLookupData::BC_REVERSE: return bcReverse_.LookupIndices(value, compareType);
+        case BarcodeLookupData::BC_QUALITY: return bcQual_.LookupIndices(value, compareType);
+        default:
+            assert(false);
+    }
+    return IndexList{ };
+}
+
+/// \brief Concatenates the EQUAL matches for every entry of \p values, in
+///        the order the values are given.
+template<typename T>
+inline IndexList BarcodeLookupData::IndicesMulti(const BarcodeLookupData::Field &field,
+                                                 const std::vector<T>& values) const
+{
+    IndexList result;
+    for (const auto& value : values) {
+        const IndexList valueIndices = Indices(field, value, Compare::EQUAL);
+        // one range append; an exact-size reserve() per value would force a
+        // reallocation on every iteration
+        result.insert(result.end(), valueIndices.begin(), valueIndices.end());
+    }
+    return result;
+}
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/PbiRawData.inl b/include/pbbam/internal/PbiRawData.inl
new file mode 100644
index 0000000..af24376
--- /dev/null
+++ b/include/pbbam/internal/PbiRawData.inl
@@ -0,0 +1,113 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED.
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file PbiRawData.inl
+/// \brief Inline implementations for the classes used for working with raw PBI
+///        data.
+//
+// Author: Derek Barnett
+
+#include "pbbam/PbiRawData.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \returns the barcode data member (read-only).
+inline const PbiRawBarcodeData& PbiRawData::BarcodeData(void) const
+{
+    return barcodeData_;
+}
+
+/// \returns the barcode data member (mutable).
+inline PbiRawBarcodeData& PbiRawData::BarcodeData(void)
+{
+    return barcodeData_;
+}
+
+/// \returns the basic data member (read-only).
+inline const PbiRawBasicData& PbiRawData::BasicData(void) const
+{
+    return basicData_;
+}
+
+/// \returns the basic data member (mutable).
+inline PbiRawBasicData& PbiRawData::BasicData(void)
+{
+    return basicData_;
+}
+
+/// \returns a copy of the stored filename.
+inline std::string PbiRawData::Filename(void) const
+{
+    return filename_;
+}
+
+/// \returns the stored section flags.
+inline PbiFile::Sections PbiRawData::FileSections(void) const
+{
+    return sections_;
+}
+
+/// \brief Replaces the stored section flags.
+/// \returns reference to this object, for call chaining
+inline PbiRawData& PbiRawData::FileSections(PbiFile::Sections sections)
+{
+    sections_ = sections;
+    return *this;
+}
+
+/// \returns true if the BARCODE flag is set in the stored section flags.
+inline bool PbiRawData::HasBarcodeData(void) const
+{
+    return HasSection(PbiFile::BARCODE);
+}
+
+/// \returns true if the MAPPED flag is set in the stored section flags.
+inline bool PbiRawData::HasMappedData(void) const
+{
+    return HasSection(PbiFile::MAPPED);
+}
+
+/// \returns true if the REFERENCE flag is set in the stored section flags.
+inline bool PbiRawData::HasReferenceData(void) const
+{
+    return HasSection(PbiFile::REFERENCE);
+}
+
+/// \returns true if \p section has any bit in common with the stored
+///          section flags.
+inline bool PbiRawData::HasSection(const PbiFile::Section section) const
+{
+    const bool isSet = ((sections_ & section) != 0);
+    return isSet;
+}
+
+/// \returns the stored read count.
+inline uint32_t PbiRawData::NumReads(void) const
+{
+    return numReads_;
+}
+
+/// \brief Replaces the stored read count.
+/// \returns reference to this object, for call chaining
+inline PbiRawData& PbiRawData::NumReads(uint32_t num)
+{
+    numReads_ = num;
+    return *this;
+}
+
+/// \returns the mapped data member (read-only).
+inline const PbiRawMappedData& PbiRawData::MappedData(void) const
+{
+    return mappedData_;
+}
+
+/// \returns the mapped data member (mutable).
+inline PbiRawMappedData& PbiRawData::MappedData(void)
+{
+    return mappedData_;
+}
+
+/// \returns the reference data member (read-only).
+inline const PbiRawReferenceData& PbiRawData::ReferenceData(void) const
+{
+    return referenceData_;
+}
+
+/// \returns the reference data member (mutable).
+inline PbiRawReferenceData& PbiRawData::ReferenceData(void)
+{
+    return referenceData_;
+}
+
+/// \returns the stored file version.
+inline PbiFile::VersionEnum PbiRawData::Version(void) const
+{
+    return version_;
+}
+
+/// \brief Replaces the stored file version.
+/// \returns reference to this object, for call chaining
+inline PbiRawData& PbiRawData::Version(PbiFile::VersionEnum version)
+{
+    version_ = version;
+    return *this;
+}
+
+/// \returns true if tId_, beginRow_ and endRow_ all match those of \p other.
+inline bool PbiReferenceEntry::operator==(const PbiReferenceEntry& other) const
+{
+    if (tId_ != other.tId_)
+        return false;
+    if (beginRow_ != other.beginRow_)
+        return false;
+    return endRow_ == other.endRow_;
+}
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/ProgramInfo.inl b/include/pbbam/internal/ProgramInfo.inl
new file mode 100644
index 0000000..2f0287f
--- /dev/null
+++ b/include/pbbam/internal/ProgramInfo.inl
@@ -0,0 +1,97 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE.
THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file ProgramInfo.inl
+/// \brief Inline implementations for the ProgramInfo class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/ProgramInfo.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \returns a copy of the stored command line.
+inline std::string ProgramInfo::CommandLine(void) const
+{ return commandLine_; }
+
+/// \brief Replaces the stored command line.
+/// \returns reference to this object, for call chaining
+inline ProgramInfo& ProgramInfo::CommandLine(const std::string& cmd)
+{ commandLine_ = cmd; return *this; }
+
+/// \returns a copy of the stored custom tags.
+// NOTE(review): the map's template arguments were lost from this copy of the
+// patch and are reconstructed as <std::string, std::string> -- confirm
+// against ProgramInfo.h.
+inline std::map<std::string, std::string> ProgramInfo::CustomTags(void) const
+{ return custom_; }
+
+/// \brief Replaces the stored custom tags.
+/// \returns reference to this object, for call chaining
+inline ProgramInfo& ProgramInfo::CustomTags(const std::map<std::string, std::string>& custom)
+{ custom_ = custom; return *this; }
+
+/// \returns a copy of the stored description.
+inline std::string ProgramInfo::Description(void) const
+{ return description_; }
+
+/// \brief Replaces the stored description.
+/// \returns reference to this object, for call chaining
+inline ProgramInfo& ProgramInfo::Description(const std::string& description)
+{ description_ = description; return *this; }
+
+/// \returns a copy of the stored ID.
+inline std::string ProgramInfo::Id(void) const
+{ return id_; }
+
+/// \brief Replaces the stored ID.
+/// \returns reference to this object, for call chaining
+inline ProgramInfo& ProgramInfo::Id(const std::string& id)
+{ id_ = id; return *this; }
+
+/// \returns true if the stored ID is non-empty.
+inline bool ProgramInfo::IsValid(void) const
+{ return !id_.empty(); }
+
+/// \returns a copy of the stored name.
+inline std::string ProgramInfo::Name(void) const
+{ return name_; }
+
+/// \brief Replaces the stored name.
+/// \returns reference to this object, for call chaining
+inline ProgramInfo& ProgramInfo::Name(const std::string& name)
+{ name_ = name; return *this; }
+
+/// \returns a copy of the stored previous-program ID.
+inline std::string ProgramInfo::PreviousProgramId(void) const
+{ return previousProgramId_; }
+
+/// \brief Replaces the stored previous-program ID.
+/// \returns reference to this object, for call chaining
+inline ProgramInfo& ProgramInfo::PreviousProgramId(const std::string& id)
+{ previousProgramId_ = id; return *this; }
+
+/// \brief Static convenience overload; forwards to \p prog's ToSam().
+inline std::string ProgramInfo::ToSam(const ProgramInfo& prog)
+{ return prog.ToSam(); }
+
+/// \returns a copy of the stored version.
+inline std::string ProgramInfo::Version(void) const
+{ return version_; }
+
+/// \brief Replaces the stored version.
+/// \returns reference to this object, for call chaining
+inline ProgramInfo& ProgramInfo::Version(const std::string& version)
+{ version_ = version; return *this; }
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/QualityValue.inl b/include/pbbam/internal/QualityValue.inl
new file mode 100644
index 0000000..07db35b
--- /dev/null
+++ b/include/pbbam/internal/QualityValue.inl
@@ -0,0 +1,71 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE.
THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file QualityValue.inl
+/// \brief Inline implementations for the QualityValue class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/QualityValue.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief Stores \p value, clamped so it never exceeds QualityValue::MAX.
+inline QualityValue::QualityValue(const uint8_t value)
+    : value_(value)
+{
+    // clamp QV
+    if (value_ > QualityValue::MAX)
+        value_ = QualityValue::MAX;
+}
+
+/// \brief Copies the stored value of \p other.
+inline QualityValue::QualityValue(const QualityValue& other)
+    : value_(other.value_)
+{ }
+
+inline QualityValue::~QualityValue(void) { }
+
+/// \returns the FASTQ character for this value (stored value + 33).
+inline char QualityValue::Fastq(void) const
+{ return static_cast<char>(value_ + 33); }
+
+/// \returns the stored numeric value.
+inline QualityValue::operator uint8_t(void) const
+{ return value_; }
+
+/// \brief Builds a QualityValue from a FASTQ character (character - 33).
+///
+/// The difference is narrowed to uint8_t before construction, so a character
+/// below 33 wraps to a large value and is then clamped by the constructor.
+inline QualityValue QualityValue::FromFastq(const char c)
+{ return QualityValue(static_cast<uint8_t>(c-33)); }
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/QualityValues.inl b/include/pbbam/internal/QualityValues.inl
new file mode 100644
index 0000000..0eabf49
--- /dev/null
+++ b/include/pbbam/internal/QualityValues.inl
@@ -0,0 +1,148 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file QualityValues.inl +/// \brief Inline implementations for the QualityValues class. 
+//
+// Author: Derek Barnett
+
+#include "pbbam/QualityValues.h"
+#include <algorithm>
+
+namespace PacBio {
+namespace BAM {
+
+// NOTE(review): every std::vector template argument in this file was lost
+// from this copy of the patch. <QualityValue> follows from the element calls
+// below (Fastq(), QualityValue::FromFastq); <uint8_t> for the raw-value
+// overloads is inferred from QualityValue's uint8_t constructor -- confirm
+// against QualityValues.h.
+
+/// \brief Creates an empty container.
+inline QualityValues::QualityValues(void)
+    : std::vector<QualityValue>()
+{ }
+
+/// \brief Creates one QualityValue per character of \p fastqString, each
+///        converted with QualityValue::FromFastq.
+inline QualityValues::QualityValues(const std::string& fastqString)
+    : std::vector<QualityValue>()
+{
+    resize(fastqString.size());
+    std::transform(fastqString.cbegin(), fastqString.cend(),
+                   begin(), QualityValue::FromFastq);
+}
+
+/// \brief Copies the elements of \p quals.
+inline QualityValues::QualityValues(const std::vector<QualityValue>& quals)
+    : std::vector<QualityValue>(quals)
+{ }
+
+/// \brief Creates one QualityValue per raw number in \p quals.
+inline QualityValues::QualityValues(const std::vector<uint8_t>& quals)
+    : std::vector<QualityValue>()
+{
+    resize(quals.size());
+    std::copy(quals.cbegin(), quals.cend(), begin());
+}
+
+/// \brief Creates the container from the raw-number range [first, last).
+inline QualityValues::QualityValues(const std::vector<uint8_t>::const_iterator first,
+                                    const std::vector<uint8_t>::const_iterator last)
+    : std::vector<QualityValue>(first, last)
+{ }
+
+/// \brief Creates the container from the QualityValue range [first, last).
+inline QualityValues::QualityValues(const QualityValues::const_iterator first,
+                                    const QualityValues::const_iterator last)
+    : std::vector<QualityValue>()
+{
+    assign(first, last);
+}
+
+/// \brief Copy constructor.
+inline QualityValues::QualityValues(const QualityValues& other)
+    : std::vector<QualityValue>(other)
+{ }
+
+/// \brief Takes over the elements of \p quals.
+inline QualityValues::QualityValues(std::vector<QualityValue>&& quals)
+    : std::vector<QualityValue>(std::move(quals))
+{ }
+
+/// \brief Move constructor.
+inline QualityValues::QualityValues(QualityValues&& other)
+    : std::vector<QualityValue>(std::move(other))
+{ }
+
+/// \brief Copy assignment.
+inline QualityValues& QualityValues::operator=(const QualityValues& other)
+{ std::vector<QualityValue>::operator=(other); return *this; }
+
+/// \brief Replaces the contents with a copy of \p quals.
+inline QualityValues& QualityValues::operator=(const std::vector<QualityValue>& quals)
+{ std::vector<QualityValue>::operator=(quals); return *this; }
+
+/// \brief Move assignment.
+inline QualityValues& QualityValues::operator=(QualityValues&& other)
+{ std::vector<QualityValue>::operator=(std::move(other)); return *this; }
+
+/// \brief Replaces the contents by taking over the elements of \p quals.
+inline QualityValues& QualityValues::operator=(std::vector<QualityValue>&& quals)
+{ std::vector<QualityValue>::operator=(std::move(quals)); return *this; }
+
+inline QualityValues::~QualityValues(void) { }
+
+// The iterator accessors below forward to the std::vector base.
+
+inline std::vector<QualityValue>::const_iterator QualityValues::cbegin(void) const
+{ return std::vector<QualityValue>::cbegin(); }
+
+inline std::vector<QualityValue>::const_iterator QualityValues::cend(void) const
+{ return std::vector<QualityValue>::cend(); }
+
+inline std::vector<QualityValue>::const_iterator QualityValues::begin(void) const
+{ return std::vector<QualityValue>::begin(); }
+
+inline std::vector<QualityValue>::const_iterator QualityValues::end(void) const
+{ return std::vector<QualityValue>::end(); }
+
+inline std::vector<QualityValue>::iterator QualityValues::begin(void)
+{ return std::vector<QualityValue>::begin(); }
+
+inline std::vector<QualityValue>::iterator QualityValues::end(void)
+{ return std::vector<QualityValue>::end(); }
+
+/// \brief Named constructor; same as QualityValues(fastq).
+inline QualityValues QualityValues::FromFastq(const std::string& fastq)
+{ return QualityValues(fastq); }
+
+/// \returns a string holding the FASTQ character of every element, in order.
+inline std::string QualityValues::Fastq(void) const
+{
+    std::string result;
+    result.reserve(size());
+    auto iter = cbegin();
+    const auto end = cend();
+    for (; iter != end; ++iter)
+        result.push_back((*iter).Fastq());
+    return result;
+}
+
+/// \returns true if this container equals QualityValues(fastq).
+inline bool QualityValues::operator==(const std::string& fastq) const
+{ return *this == QualityValues(fastq); }
+
+/// \returns true if this container differs from QualityValues(fastq).
+inline bool QualityValues::operator!=(const std::string& fastq) const
+{ return *this != QualityValues(fastq); }
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/QueryBase.h b/include/pbbam/internal/QueryBase.h
new file mode 100644
index 0000000..e012f86
--- /dev/null
+++ b/include/pbbam/internal/QueryBase.h
@@ -0,0 +1,138 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. 
+
+// Author: Derek Barnett
+
+#ifndef QUERYBASE_H
+#define QUERYBASE_H
+
+#include "pbbam/BamFile.h"
+#include "pbbam/BamRecord.h"
+#include "pbbam/DataSet.h"
+// NOTE(review): the three standard header names were lost from this copy of
+// the patch. <cassert> and <vector> are implied by assert() in QueryBase.inl
+// and by std::vector below; <memory> is a guess -- confirm against upstream.
+#include <cassert>
+#include <memory>
+#include <vector>
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+template<typename T>
+class QueryBase;
+
+/// \brief State shared by the query iterators: the query being read and the
+///        most recently fetched item.
+///
+/// Two iterators compare equal when they point at the same query. An
+/// iterator whose query ran out of data holds a null query pointer, which is
+/// also what a default-constructed (end) iterator holds.
+template<typename T>
+class QueryIteratorBase
+{
+public:
+    virtual ~QueryIteratorBase(void);
+
+    bool operator==(const QueryIteratorBase<T>& other) const;
+    bool operator!=(const QueryIteratorBase<T>& other) const;
+
+protected:
+    /// Creates an end iterator (no query attached).
+    QueryIteratorBase(void);
+
+    /// Attaches to \p query and immediately fetches the first item.
+    QueryIteratorBase(QueryBase<T>& query);
+
+    /// Fetches the next item from the query into record_; detaches from the
+    /// query when GetNext() reports that nothing is left.
+    void ReadNext(void);
+
+protected:
+    QueryBase<T>* query_;
+    T record_;
+};
+
+/// \brief Mutable iterator over the items produced by a QueryBase.
+template<typename T>
+class QueryIterator : public QueryIteratorBase<T>
+{
+public:
+    QueryIterator(void);
+    QueryIterator(QueryBase<T>& query);
+
+    T& operator*(void);
+    T* operator->(void);
+
+    QueryIterator<T>& operator++(void);
+    QueryIterator<T> operator++(int);
+};
+
+/// \brief Read-only iterator over the items produced by a QueryBase.
+template<typename T>
+class QueryConstIterator : public QueryIteratorBase<T>
+{
+public:
+    QueryConstIterator(void);
+    QueryConstIterator(const QueryBase<T>& query);
+
+    const T& operator*(void) const;
+    const T* operator->(void) const;
+
+    QueryConstIterator<T>& operator++(void);
+    QueryConstIterator<T> operator++(int);
+};
+
+/// \brief Base class for queries that hand out items of type T one at a time
+///        through GetNext(), with begin()/end() for iterator-style access.
+template<typename T>
+class QueryBase {
+
+public:
+    typedef QueryIterator<T>      iterator;
+    typedef QueryConstIterator<T> const_iterator;
+
+public:
+    virtual ~QueryBase(void);
+
+public:
+    QueryConstIterator<T> begin(void) const;
+    QueryConstIterator<T> cbegin(void) const;
+    QueryIterator<T> begin(void);
+
+    QueryConstIterator<T> end(void) const;
+    QueryConstIterator<T> cend(void) const;
+    QueryIterator<T> end(void);
+
+public:
+    /// Stores the next item in \p r.
+    /// \returns false when no further item is available
+    virtual bool GetNext(T& r) =0;
+
+protected:
+    QueryBase(void);
+};
+
+// NOTE(review): the template arguments of these typedefs were lost from this
+// copy of the patch; BamRecord and std::vector<BamRecord> are reconstructed
+// from the typedef names and the headers included above -- confirm.
+typedef QueryBase<BamRecord>               IQuery;
+typedef QueryBase<std::vector<BamRecord> > IGroupQuery;
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
+
+#include "pbbam/internal/QueryBase.inl"
+
+#endif // QUERYBASE_H
diff --git a/include/pbbam/internal/QueryBase.inl b/include/pbbam/internal/QueryBase.inl
new file mode 100644
index 0000000..7f2376f
--- /dev/null +++ b/include/pbbam/internal/QueryBase.inl @@ -0,0 +1,177 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. 
+
+// Author: Derek Barnett
+
+#include "pbbam/internal/QueryBase.h"
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// -------------------
+// QueryIteratorBase
+// -------------------
+
+/// Creates an end iterator: no query attached.
+template<typename T>
+inline QueryIteratorBase<T>::QueryIteratorBase(void)
+    : query_(nullptr)
+{ }
+
+/// Attaches to \p query and fetches the first item right away.
+template<typename T>
+inline QueryIteratorBase<T>::QueryIteratorBase(QueryBase<T>& query)
+    : query_(&query)
+{ ReadNext(); }
+
+template<typename T> inline
+QueryIteratorBase<T>::~QueryIteratorBase(void) { }
+
+/// Iterators are equal when they point at the same query. The current
+/// record is not compared.
+template<typename T> inline
+bool QueryIteratorBase<T>::operator==(const QueryIteratorBase<T>& other) const
+{ return query_ == other.query_; }
+
+template<typename T> inline
+bool QueryIteratorBase<T>::operator!=(const QueryIteratorBase<T>& other) const
+{ return !(*this == other); }
+
+// -------------------
+// QueryIterator
+// -------------------
+
+template<typename T> inline
+QueryIterator<T>::QueryIterator(void) : QueryIteratorBase<T>() { }
+
+template<typename T> inline
+QueryIterator<T>::QueryIterator(QueryBase<T>& query)
+    : QueryIteratorBase<T>(query)
+{ }
+
+/// \returns the most recently fetched item.
+template<typename T> inline
+T& QueryIterator<T>::operator*(void)
+{ return QueryIteratorBase<T>::record_; }
+
+template<typename T> inline
+T* QueryIterator<T>::operator->(void)
+{ return &(operator*()); }
+
+/// Pre-increment: fetches the next item from the query.
+template<typename T> inline
+QueryIterator<T>& QueryIterator<T>::operator++(void)
+{ QueryIteratorBase<T>::ReadNext(); return *this; }
+
+/// Post-increment: returns a copy taken before advancing.
+template<typename T> inline
+QueryIterator<T> QueryIterator<T>::operator++(int)
+{
+    QueryIterator<T> result(*this);
+    ++(*this);
+    return result;
+}
+
+// --------------------
+// QueryConstIterator
+// --------------------
+
+template<typename T> inline
+QueryConstIterator<T>::QueryConstIterator(void) : QueryIteratorBase<T>() { }
+
+// The base class stores a non-const query pointer because fetching the next
+// item calls the non-const GetNext(); hence the const_cast.
+template<typename T> inline
+QueryConstIterator<T>::QueryConstIterator(const QueryBase<T>& query)
+    : QueryIteratorBase<T>(const_cast<QueryBase<T>&>(query))
+{ }
+
+/// \returns the most recently fetched item (read-only).
+template<typename T> inline
+const T& QueryConstIterator<T>::operator*(void) const
+{ return QueryIteratorBase<T>::record_; }
+
+template<typename T> inline
+const T* QueryConstIterator<T>::operator->(void) const
+{ return &(operator*()); }
+
+/// Pre-increment: fetches the next item from the query.
+template<typename T> inline
+QueryConstIterator<T>& QueryConstIterator<T>::operator++(void)
+{ QueryIteratorBase<T>::ReadNext(); return *this; }
+
+/// Post-increment: returns a copy taken before advancing.
+template<typename T> inline
+QueryConstIterator<T> QueryConstIterator<T>::operator++(int)
+{
+    QueryConstIterator<T> result(*this);
+    ++(*this);
+    return result;
+}
+
+// -----------
+// QueryBase
+// -----------
+
+template<typename T> inline
+QueryBase<T>::QueryBase(void) { }
+
+template<typename T> inline
+QueryBase<T>::~QueryBase(void) { }
+
+// Each begin()/cbegin() call builds an iterator attached to this query, and
+// building it fetches one item. end()/cend() return detached iterators.
+
+template<typename T> inline
+QueryConstIterator<T> QueryBase<T>::begin(void) const
+{ return QueryConstIterator<T>(*this); }
+
+template<typename T> inline
+QueryIterator<T> QueryBase<T>::begin(void)
+{ return QueryIterator<T>(*this); }
+
+template<typename T> inline
+QueryConstIterator<T> QueryBase<T>::cbegin(void) const
+{ return QueryConstIterator<T>(*this); }
+
+template<typename T> inline
+QueryConstIterator<T> QueryBase<T>::cend(void) const
+{ return QueryConstIterator<T>(); }
+
+template<typename T> inline
+QueryConstIterator<T> QueryBase<T>::end(void) const
+{ return QueryConstIterator<T>(); }
+
+template<typename T> inline
+QueryIterator<T> QueryBase<T>::end(void)
+{ return QueryIterator<T>(); }
+
+/// Fetches the next item into record_. When GetNext() returns false the
+/// iterator detaches from the query and so compares equal to end().
+/// Must not be called on a detached iterator (asserted).
+template<typename T>
+inline void QueryIteratorBase<T>::ReadNext(void)
+{
+    assert(query_);
+    if (!query_->GetNext(record_))
+        query_ = nullptr;
+}
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/ReadGroupInfo.inl b/include/pbbam/internal/ReadGroupInfo.inl
new file mode 100644
index 0000000..b8a24e0
--- /dev/null
+++ b/include/pbbam/internal/ReadGroupInfo.inl
@@ -0,0 +1,279 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file ReadGroupInfo.inl +/// \brief Inline implementations for the ReadGroupInfo class. 
+//
+// Author: Derek Barnett
+
+#include <stdexcept>
+#include "pbbam/ReadGroupInfo.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \returns the stored barcode count
+/// \throws std::runtime_error if no barcode data is set
+inline size_t ReadGroupInfo::BarcodeCount(void) const
+{
+    if (!hasBarcodeData_)
+        throw std::runtime_error("barcode count requested but barcode data is missing");
+    return barcodeCount_;
+}
+
+/// \brief Stores all barcode attributes at once and marks barcode data as
+///        present.
+/// \returns reference to this object, for call chaining
+inline ReadGroupInfo& ReadGroupInfo::BarcodeData(const std::string& barcodeFile,
+                                                 const std::string& barcodeHash,
+                                                 size_t barcodeCount,
+                                                 BarcodeModeType barcodeMode,
+                                                 BarcodeQualityType barcodeQuality)
+{
+    barcodeFile_ = barcodeFile;
+    barcodeHash_ = barcodeHash;
+    barcodeCount_ = barcodeCount;
+    barcodeMode_ = barcodeMode;
+    barcodeQuality_ = barcodeQuality;
+    hasBarcodeData_ = true;
+    return *this;
+}
+
+/// \returns the stored barcode file
+/// \throws std::runtime_error if no barcode data is set
+inline std::string ReadGroupInfo::BarcodeFile(void) const
+{
+    if (!hasBarcodeData_)
+        throw std::runtime_error("barcode file requested but barcode data is missing");
+    return barcodeFile_;
+}
+
+/// \returns the stored barcode hash
+/// \throws std::runtime_error if no barcode data is set
+inline std::string ReadGroupInfo::BarcodeHash(void) const
+{
+    if (!hasBarcodeData_)
+        throw std::runtime_error("barcode hash requested but barcode data is missing");
+    return barcodeHash_;
+}
+
+/// \returns the stored barcode mode
+/// \throws std::runtime_error if no barcode data is set
+inline BarcodeModeType ReadGroupInfo::BarcodeMode(void) const
+{
+    if (!hasBarcodeData_)
+        throw std::runtime_error("barcode mode requested but barcode data is missing");
+    return barcodeMode_;
+}
+
+/// \returns the stored barcode quality type
+/// \throws std::runtime_error if no barcode data is set
+inline BarcodeQualityType ReadGroupInfo::BarcodeQuality(void) const
+{
+    if (!hasBarcodeData_)
+        throw std::runtime_error("barcode quality requested but barcode data is missing");
+    return barcodeQuality_;
+}
+
+/// \returns a copy of the stored basecaller version.
+inline std::string ReadGroupInfo::BasecallerVersion(void) const
+{ return basecallerVersion_; }
+
+/// \brief Replaces the stored basecaller version.
+/// \returns reference to this object, for call chaining
+inline ReadGroupInfo& ReadGroupInfo::BasecallerVersion(const std::string& versionNumber)
+{ basecallerVersion_ = versionNumber; return *this; }
+
+/// \returns the tag stored for \p feature, or an empty string if the feature
+///          is not present.
+inline std::string ReadGroupInfo::BaseFeatureTag(const BaseFeature& feature) const
+{
+    const auto iter = features_.find(feature);
+    if (iter == features_.end())
+        return std::string();
+    return iter->second;
+}
+
+/// \brief Stores \p tag for \p feature, replacing any existing entry.
+/// \returns reference to this object, for call chaining
+inline ReadGroupInfo& ReadGroupInfo::BaseFeatureTag(const BaseFeature& feature,
+                                                    const std::string& tag)
+{ features_[feature] = tag; return *this; }
+
+/// \returns a copy of the stored binding kit.
+inline std::string ReadGroupInfo::BindingKit(void) const
+{ return bindingKit_; }
+
+/// \brief Replaces the stored binding kit.
+/// \returns reference to this object, for call chaining
+inline ReadGroupInfo& ReadGroupInfo::BindingKit(const std::string& kitNumber)
+{ bindingKit_ = kitNumber; return *this; }
+
+/// \brief Clears the barcode file and hash and marks barcode data as absent.
+///
+/// The count, mode and quality members are left as they are; their accessors
+/// throw while barcode data is marked absent.
+/// \returns reference to this object, for call chaining
+inline ReadGroupInfo& ReadGroupInfo::ClearBarcodeData(void)
+{
+    barcodeFile_.clear();
+    barcodeHash_.clear();
+    hasBarcodeData_ = false;
+    return *this;
+}
+
+/// \brief Removes every stored base feature.
+/// \returns reference to this object, for call chaining
+inline ReadGroupInfo& ReadGroupInfo::ClearBaseFeatures(void)
+{
+    features_.clear();
+    return *this;
+}
+
+/// \returns the stored control flag.
+inline bool ReadGroupInfo::Control(void) const
+{ return control_; }
+
+/// \brief Replaces the stored control flag.
+/// \returns reference to this object, for call chaining
+inline ReadGroupInfo& ReadGroupInfo::Control(const bool ctrl)
+{ control_ = ctrl; return *this; }
+
+/// \returns a copy of the stored custom tags.
+// NOTE(review): the map's template arguments were lost from this copy of the
+// patch and are reconstructed as <std::string, std::string> -- confirm
+// against ReadGroupInfo.h.
+inline std::map<std::string, std::string> ReadGroupInfo::CustomTags(void) const
+{ return custom_; }
+
+/// \brief Replaces the stored custom tags.
+/// \returns reference to this object, for call chaining
+inline ReadGroupInfo& ReadGroupInfo::CustomTags(const std::map<std::string, std::string>& custom)
+{ custom_ = custom; return *this; }
+
+/// \returns a copy of the stored date.
+inline std::string ReadGroupInfo::Date(void) const
+{ return date_; }
+
+/// \brief Replaces the stored date.
+/// \returns reference to this object, for call chaining
+inline ReadGroupInfo& ReadGroupInfo::Date(const std::string& date)
+{ date_ = date; return *this; }
+
+/// \returns a copy of the stored flow order.
+inline std::string ReadGroupInfo::FlowOrder(void) const
+{ return flowOrder_; }
+
+/// \brief Replaces the stored flow order.
+/// \returns reference to this object, for call chaining
+inline ReadGroupInfo& ReadGroupInfo::FlowOrder(const std::string& order)
+{ flowOrder_ = order; return *this; }
+
+/// \returns a copy of the stored frame rate string.
+inline std::string ReadGroupInfo::FrameRateHz(void) const
+{ return frameRateHz_; }
+
+/// \brief Replaces the stored frame rate string.
+/// \returns reference to this object, for call chaining
+inline ReadGroupInfo& ReadGroupInfo::FrameRateHz(const std::string& frameRateHz)
+{ frameRateHz_ = frameRateHz; return *this; }
+
+/// \returns true if barcode data is marked as present.
+inline bool ReadGroupInfo::HasBarcodeData(void) const
+{ return hasBarcodeData_; }
+
+/// \returns true if a tag is stored for \p feature.
+inline bool ReadGroupInfo::HasBaseFeature(const BaseFeature& feature) const
+{ return features_.find(feature) != features_.end(); }
+
+/// \returns a copy of the stored ID.
+inline std::string ReadGroupInfo::Id(void) const
+{ return id_; }
+
+/// \brief Replaces the stored ID.
+/// \returns reference to this object, for call chaining
+inline ReadGroupInfo& ReadGroupInfo::Id(const std::string& id)
+{ id_ = id; return *this; }
+
+/// \brief Sets the ID to MakeReadGroupId(movieName, readType).
+/// \returns reference to this object, for call chaining
+inline ReadGroupInfo& ReadGroupInfo::Id(const std::string& movieName,
+                                        const std::string& readType)
+{ id_ = MakeReadGroupId(movieName, readType); return *this; }
+
+/// \brief Parses \p rgId as a hexadecimal number and returns its 32-bit
+///        value as a signed integer.
+///
+/// Parsing is done by std::stoul (base 16), whose exceptions propagate to
+/// the caller.
+inline int32_t ReadGroupInfo::IdToInt(const std::string& rgId)
+{
+    const uint32_t rawid = std::stoul(rgId, nullptr, 16);
+    return static_cast<int32_t>(rawid);
+}
+
+/// \returns the stored IPD codec.
+inline FrameCodec ReadGroupInfo::IpdCodec(void) const
+{ return ipdCodec_; }
+
+/// \returns true if the stored ID is non-empty.
+inline bool ReadGroupInfo::IsValid(void) const
+{ return !id_.empty(); }
+
+/// \returns a copy of the stored key sequence.
+inline std::string ReadGroupInfo::KeySequence(void) const
+{ return keySequence_; }
+
+/// \brief Replaces the stored key sequence.
+/// \returns reference to this object, for call chaining
+inline ReadGroupInfo& ReadGroupInfo::KeySequence(const std::string& sequence)
+{ keySequence_ = sequence; return *this; }
+
+/// \returns a copy of the stored library.
+inline std::string ReadGroupInfo::Library(void) const
+{ return library_; }
+
+/// \brief Replaces the stored library.
+/// \returns reference to this object, for call chaining
+inline ReadGroupInfo& ReadGroupInfo::Library(const std::string& library)
+{ library_ = library; return *this; }
+
+/// \returns a copy of the stored movie name.
+inline std::string ReadGroupInfo::MovieName(void) const
+{ return movieName_; }
+
+/// \brief Replaces the stored movie name.
+/// \returns reference to this object, for call chaining
+inline ReadGroupInfo& ReadGroupInfo::MovieName(const std::string& movieName)
+{ movieName_ = movieName; return *this; }
+
+/// \returns the fixed string "PACBIO".
+inline std::string ReadGroupInfo::Platform(void) const
+{ return std::string("PACBIO"); }
+
+/// \returns the stored platform model.
+inline PlatformModelType ReadGroupInfo::PlatformModel(void) const
+{ return platformModel_; }
+
+/// \brief Replaces the stored platform model.
+/// \returns reference to this object, for call chaining
+inline ReadGroupInfo& ReadGroupInfo::PlatformModel(const PlatformModelType& platform)
+{ platformModel_ = platform; return *this; }
+
+/// \returns a copy of the stored predicted insert size string.
+inline std::string ReadGroupInfo::PredictedInsertSize(void) const
+{ return predictedInsertSize_; }
+
+/// \brief Replaces the stored predicted insert size string.
+/// \returns reference to this object, for call chaining
+inline ReadGroupInfo& ReadGroupInfo::PredictedInsertSize(const std::string& size)
+{ predictedInsertSize_ = size; return *this; }
+
+/// \returns a copy of the stored programs string.
+inline std::string ReadGroupInfo::Programs(void) const
+{ return programs_; }
+
+/// \brief Replaces the stored programs string.
+/// \returns reference to this object, for call chaining
+inline ReadGroupInfo& ReadGroupInfo::Programs(const std::string& programs)
+{ programs_ = programs; return *this; }
+
+/// \returns the stored pulse-width codec.
+inline FrameCodec ReadGroupInfo::PulseWidthCodec(void) const
+{ return pulseWidthCodec_; }
+
+/// \returns a copy of the stored read type.
+inline std::string ReadGroupInfo::ReadType(void) const
+{ return readType_; }
+
+/// \brief Replaces the stored read type.
+/// \returns reference to this object, for call chaining
+inline ReadGroupInfo& ReadGroupInfo::ReadType(const std::string& type)
+{ readType_ = type; return *this; }
+
+/// \brief Removes the entry for \p feature, if there is one.
+/// \returns reference to this object, for call chaining
+inline ReadGroupInfo& ReadGroupInfo::RemoveBaseFeature(const BaseFeature& feature)
+{
+    auto iter = features_.find(feature);
+    if (iter != features_.end())
+        features_.erase(iter);
+    return *this;
+}
+
+/// \returns a copy of the stored sample.
+inline std::string ReadGroupInfo::Sample(void) const
+{ return sample_; }
+
+/// \brief Replaces the stored sample.
+/// \returns reference to this object, for call chaining
+inline ReadGroupInfo& ReadGroupInfo::Sample(const std::string& sample)
+{ sample_ = sample; return *this; }
+
+/// \returns a copy of the stored sequencing center.
+inline std::string ReadGroupInfo::SequencingCenter(void) const
+{ return sequencingCenter_; }
+
+/// \brief Replaces the stored sequencing center.
+/// \returns reference to this object, for call chaining
+inline ReadGroupInfo& ReadGroupInfo::SequencingCenter(const std::string& center)
+{ sequencingCenter_ = center; return *this; }
+
+/// \returns the result of SequencingChemistryFromTriple() for this read
+///          group's binding kit, sequencing kit and basecaller version.
+inline std::string ReadGroupInfo::SequencingChemistry(void) const
+{
+    return SequencingChemistryFromTriple(BindingKit(),
+                                         SequencingKit(),
+                                         BasecallerVersion());
+}
+
+/// \returns a copy of the stored sequencing kit.
+inline std::string ReadGroupInfo::SequencingKit(void) const
+{ return sequencingKit_; }
+
+/// \brief Replaces the stored sequencing kit.
+/// \returns reference to this object, for call chaining
+inline ReadGroupInfo& ReadGroupInfo::SequencingKit(const std::string& kitNumber)
+{ sequencingKit_ = kitNumber; return *this; }
+
+/// \brief Static convenience overload; forwards to \p rg's ToSam().
+inline std::string ReadGroupInfo::ToSam(const ReadGroupInfo& rg)
+{ return rg.ToSam(); }
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/SequenceInfo.inl b/include/pbbam/internal/SequenceInfo.inl
new file mode 100644
index 0000000..93b653d
--- /dev/null
+++ b/include/pbbam/internal/SequenceInfo.inl
@@ -0,0 +1,107 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file SequenceInfo.inl +/// \brief Inline implementations for the SequenceInfo class. 
+// +// Author: Derek Barnett + +#include "pbbam/SequenceInfo.h" + +namespace PacBio { +namespace BAM { + +inline bool SequenceInfo::operator==(const SequenceInfo& other) const +{ + return assemblyId_ == other.assemblyId_ && + checksum_ == other.checksum_ && + length_ == other.length_ && + name_ == other.name_ && + species_ == other.species_ && + uri_ == other.uri_ && + custom_ == other.custom_; +} + +inline bool SequenceInfo::operator!=(const SequenceInfo& other) const +{ return !(*this == other); } + +inline std::string SequenceInfo::AssemblyId(void) const +{ return assemblyId_; } + +inline SequenceInfo& SequenceInfo::AssemblyId(const std::string& id) +{ assemblyId_ = id; return *this; } + +inline std::string SequenceInfo::Checksum(void) const +{ return checksum_; } + +inline SequenceInfo& SequenceInfo::Checksum(const std::string& checksum) +{ checksum_ = checksum; return *this; } + +inline std::map SequenceInfo::CustomTags(void) const +{ return custom_; } + +inline SequenceInfo& SequenceInfo::CustomTags(const std::map& custom) +{ custom_ = custom; return *this; } + +inline std::string SequenceInfo::Length(void) const +{ return length_; } + +inline SequenceInfo& SequenceInfo::Length(const std::string& length) +{ length_ = length; return *this; } + +inline std::string SequenceInfo::Name(void) const +{ return name_; } + +inline SequenceInfo& SequenceInfo::Name(const std::string& name) +{ name_ = name; return *this; } + +inline std::string SequenceInfo::Species(void) const +{ return species_; } + +inline SequenceInfo& SequenceInfo::Species(const std::string& species) +{ species_ = species; return *this; } + +inline std::string SequenceInfo::ToSam(const SequenceInfo& seq) +{ return seq.ToSam(); } + +inline std::string SequenceInfo::Uri(void) const +{ return uri_; } + +inline SequenceInfo& SequenceInfo::Uri(const std::string& uri) +{ uri_ = uri; return *this; } + +} // namespace BAM +} // namespace PacBio diff --git a/include/pbbam/internal/Tag.inl 
b/include/pbbam/internal/Tag.inl new file mode 100644 index 0000000..f8d4af2 --- /dev/null +++ b/include/pbbam/internal/Tag.inl @@ -0,0 +1,323 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. 
+// +// File Description +/// \file Tag.inl +/// \brief Inline implementations for the Tag class. +// +// Author: Derek Barnett + +#include "pbbam/Tag.h" +#include +#include + +namespace PacBio { +namespace BAM { +namespace internal { + +template +inline bool InAsciiRange(const T& x) +{ return (x >=33 && x <= 127); } + +struct AsciiConvertVisitor : public boost::static_visitor +{ + // only valid for numeric types - maybe even more restrictive? + char operator() (const int8_t& x) const { return Helper(x); } + char operator() (const uint8_t& x) const { return Helper(x); } + char operator() (const int16_t& x) const { return Helper(x); } + char operator() (const uint16_t& x) const { return Helper(x); } + char operator() (const int32_t& x) const { return Helper(x); } + char operator() (const uint32_t& x) const { return Helper(x); } + + // anything else always throws + template + char operator()(const T&) const + { throw std::runtime_error("conversion not supported"); return 0; } + +private: + template + char Helper(const T& x) const + { + if (!InAsciiRange(x)) + throw std::runtime_error("not valid ASCII"); + return static_cast(x); + } +}; + +template +struct NumericConvertVisitor : public boost::static_visitor +{ + // only valid for integral types + DesiredType operator() (const int8_t& x) const { return boost::numeric_cast(x); } + DesiredType operator() (const uint8_t& x) const { return boost::numeric_cast(x); } + DesiredType operator() (const int16_t& x) const { return boost::numeric_cast(x); } + DesiredType operator() (const uint16_t& x) const { return boost::numeric_cast(x); } + DesiredType operator() (const int32_t& x) const { return boost::numeric_cast(x); } + DesiredType operator() (const uint32_t& x) const { return boost::numeric_cast(x); } + + // anything else always throws + template DesiredType operator()(const T& t) const + { + const std::string from = typeid(t).name(); + const std::string to = typeid(DesiredType).name(); + const std::string msg = 
std::string("conversion not supported: ") + from + " -> " + to; + throw std::runtime_error(msg); + return 0; + } +}; + +typedef NumericConvertVisitor ToInt8ConvertVisitor; +typedef NumericConvertVisitor ToUInt8ConvertVisitor; +typedef NumericConvertVisitor ToInt16ConvertVisitor; +typedef NumericConvertVisitor ToUInt16ConvertVisitor; +typedef NumericConvertVisitor ToInt32ConvertVisitor; +typedef NumericConvertVisitor ToUInt32ConvertVisitor; + +struct IsEqualVisitor : public boost::static_visitor +{ + template + bool operator() (const T&, const U&) const + { + // maybe allow conversions down the road? + // but for now, just fail if types are different + return false; + } + + bool operator() (const boost::blank&, const boost::blank&) const + { return true; } + + template + bool operator() (const T& lhs, const T& rhs) const + { return lhs == rhs; } +}; + +struct TypenameVisitor : public boost::static_visitor +{ + std::string operator() (const boost::blank&) const { return "none"; } + std::string operator() (const int8_t&) const { return "int8_t"; } + std::string operator() (const uint8_t&) const { return "uint8_t"; } + std::string operator() (const int16_t&) const { return "int16_t"; } + std::string operator() (const uint16_t&) const { return "uint16_t"; } + std::string operator() (const int32_t&) const { return "int32_t"; } + std::string operator() (const uint32_t&) const { return "uint32_t"; } + std::string operator() (const float&) const { return "float"; } + std::string operator() (const std::string&) const { return "string"; } + std::string operator() (const std::vector&) const { return "vector"; } + std::string operator() (const std::vector&) const { return "vector"; } + std::string operator() (const std::vector&) const { return "vector"; } + std::string operator() (const std::vector&) const { return "vector"; } + std::string operator() (const std::vector&) const { return "vector"; } + std::string operator() (const std::vector&) const { return "vector"; } + 
std::string operator() (const std::vector&) const { return "vector"; } +}; + +} // namespace internal + +inline bool Tag::operator== (const Tag& other) const +{ + return boost::apply_visitor(internal::IsEqualVisitor(), data_, other.data_) && + (modifier_ == other.modifier_) ; +} + +inline bool Tag::operator!= (const Tag& other) const +{ return !(*this == other); } + +inline bool Tag::HasModifier(const TagModifier m) const +{ + // we just allow one at a time (for now at least) + return modifier_ == m; +} + +inline bool Tag::IsNull(void) const +{ return Type() == TagDataType::INVALID; } + +inline bool Tag::IsInt8(void) const +{ return Type() == TagDataType::INT8; } + +inline bool Tag::IsUInt8(void) const +{ return Type() == TagDataType::UINT8; } + +inline bool Tag::IsInt16(void) const +{ return Type() == TagDataType::INT16; } + +inline bool Tag::IsUInt16(void) const +{ return Type() == TagDataType::UINT16; } + +inline bool Tag::IsInt32(void) const +{ return Type() == TagDataType::INT32; } + +inline bool Tag::IsUInt32(void) const +{ return Type() == TagDataType::UINT32; } + +inline bool Tag::IsFloat(void) const +{ return Type() == TagDataType::FLOAT; } + +inline bool Tag::IsString(void) const +{ return Type() == TagDataType::STRING; } + +inline bool Tag::IsHexString(void) const +{ return IsString() && modifier_ == TagModifier::HEX_STRING; } + +inline bool Tag::IsInt8Array(void) const +{ return Type() == TagDataType::INT8_ARRAY; } + +inline bool Tag::IsUInt8Array(void) const +{ return Type() == TagDataType::UINT8_ARRAY; } + +inline bool Tag::IsInt16Array(void) const +{ return Type() == TagDataType::INT16_ARRAY; } + +inline bool Tag::IsUInt16Array(void) const +{ return Type() == TagDataType::UINT16_ARRAY; } + +inline bool Tag::IsInt32Array(void) const +{ return Type() == TagDataType::INT32_ARRAY; } + +inline bool Tag::IsUInt32Array(void) const +{ return Type() == TagDataType::UINT32_ARRAY; } + +inline bool Tag::IsFloatArray(void) const +{ return Type() == 
TagDataType::FLOAT_ARRAY; } + +inline bool Tag::IsSignedInt(void) const +{ return IsInt8() || IsInt16() || IsInt32(); } + +inline bool Tag::IsUnsignedInt(void) const +{ return IsUInt8() || IsUInt16() || IsUInt32(); } + +inline bool Tag::IsIntegral(void) const +{ return IsSignedInt() || IsUnsignedInt(); } + +inline bool Tag::IsNumeric(void) const +{ return IsIntegral() || IsFloat(); } + +inline bool Tag::IsSignedArray(void) const +{ return IsInt8Array() || IsInt16Array() || IsInt32Array(); } + +inline bool Tag::IsUnsignedArray(void) const +{ return IsUInt8Array() || IsUInt16Array() || IsUInt32Array(); } + +inline bool Tag::IsIntegralArray(void) const +{ return IsSignedArray() || IsUnsignedArray(); } + +inline bool Tag::IsArray(void) const +{ return IsIntegralArray() || IsFloatArray(); } + +inline TagModifier Tag::Modifier(void) const +{ return modifier_; } + +inline Tag& Tag::Modifier(const TagModifier m) +{ modifier_ = m; return *this; } + +inline char Tag::ToAscii(void) const +{ return boost::apply_visitor(internal::AsciiConvertVisitor(), data_); } + +inline int8_t Tag::ToInt8(void) const +{ + if (IsInt8()) + return boost::get(data_); + return boost::apply_visitor(internal::ToInt8ConvertVisitor(), data_); +} + +inline uint8_t Tag::ToUInt8(void) const +{ + if (IsUInt8()) + return boost::get(data_); + return boost::apply_visitor(internal::ToUInt8ConvertVisitor(), data_); +} + +inline int16_t Tag::ToInt16(void) const +{ + if (IsInt16()) + return boost::get(data_); + return boost::apply_visitor(internal::ToInt16ConvertVisitor(), data_); +} + +inline uint16_t Tag::ToUInt16(void) const +{ + if (IsUInt16()) + return boost::get(data_); + return boost::apply_visitor(internal::ToUInt16ConvertVisitor(), data_); +} + +inline int32_t Tag::ToInt32(void) const +{ + if (IsInt32()) + return boost::get(data_); + return boost::apply_visitor(internal::ToInt32ConvertVisitor(), data_); +} + +inline uint32_t Tag::ToUInt32(void) const +{ + if (IsUInt32()) + return boost::get(data_); + 
return boost::apply_visitor(internal::ToUInt32ConvertVisitor(), data_); +} + +inline float Tag::ToFloat(void) const +{ return boost::get(data_); } + +inline std::string Tag::ToString(void) const +{ return boost::get(data_); } + +inline std::vector Tag::ToInt8Array(void) const +{ return boost::get< std::vector >(data_); } + +inline std::vector Tag::ToUInt8Array(void) const +{ return boost::get< std::vector >(data_); } + +inline std::vector Tag::ToInt16Array(void) const +{ return boost::get< std::vector >(data_); } + +inline std::vector Tag::ToUInt16Array(void) const +{ return boost::get< std::vector >(data_); } + +inline std::vector Tag::ToInt32Array(void) const +{ return boost::get< std::vector >(data_); } + +inline std::vector Tag::ToUInt32Array(void) const +{ return boost::get< std::vector >(data_); } + +inline std::vector Tag::ToFloatArray(void) const +{ return boost::get< std::vector >(data_); } + +inline TagDataType Tag::Type(void) const +{ return TagDataType(data_.which() ); } + +inline std::string Tag::Typename(void) const +{ return boost::apply_visitor(internal::TypenameVisitor(), data_); } + +} // namespace BAM +} // namespace PacBio diff --git a/include/pbbam/internal/Validator.inl b/include/pbbam/internal/Validator.inl new file mode 100644 index 0000000..123cfad --- /dev/null +++ b/include/pbbam/internal/Validator.inl @@ -0,0 +1,92 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file Validator.inl +/// \brief Inline implementations for the Validator class. 
+// +// Author: Derek Barnett + +#include "pbbam/Validator.h" +#include + +namespace PacBio { +namespace BAM { + +inline bool Validator::IsValid(const BamFile& file, const bool entireFile) +{ + try { + if (entireFile) + ValidateEntireFile(file, 1); + else + ValidateFileMetadata(file, 1); + return true; + } catch (std::exception&) { + return false; + } +} + +inline bool Validator::IsValid(const BamHeader& header) +{ + try { + Validate(header, 1); + return true; + } catch (std::exception&) { + return false; + } +} + +inline bool Validator::IsValid(const BamRecord& record) +{ + try { + Validate(record, 1); + return true; + } catch (std::exception&) { + return false; + } +} + +inline bool Validator::IsValid(const ReadGroupInfo& rg) +{ + try { + Validate(rg, 1); + return true; + } catch (std::exception&) { + return false; + } +} + +} // namespace BAM +} // namespace PacBio diff --git a/include/pbbam/virtual/VirtualPolymeraseBamRecord.h b/include/pbbam/virtual/VirtualPolymeraseBamRecord.h new file mode 100644 index 0000000..ee473b7 --- /dev/null +++ b/include/pbbam/virtual/VirtualPolymeraseBamRecord.h @@ -0,0 +1,56 @@ +// Copyright (c) 2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. 
+// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file VirtualPolymeraseBamRecord.h +/// \brief Defines the VirtualPolymeraseBamRecord class. +// +// Author: Armin Töpfer + +#ifndef VIRTUALPOLYMERASEBAMRECORD_H +#define VIRTUALPOLYMERASEBAMRECORD_H + +#include "pbbam/virtual/VirtualZmwBamRecord.h" + +namespace PacBio { +namespace BAM { + +/// \deprecated Use VirtualZmwBamRecord instead. +typedef VirtualZmwBamRecord VirtualPolymeraseBamRecord; + +} // namespace BAM +} // namespace PacBio + +#endif // VIRTUALPOLYMERASEBAMRECORD_H diff --git a/include/pbbam/virtual/VirtualPolymeraseCompositeReader.h b/include/pbbam/virtual/VirtualPolymeraseCompositeReader.h new file mode 100644 index 0000000..7d37240 --- /dev/null +++ b/include/pbbam/virtual/VirtualPolymeraseCompositeReader.h @@ -0,0 +1,57 @@ +// Copyright (c) 2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file VirtualPolymeraseCompositeReader.h +/// \brief Defines the VirtualPolymeraseCompositeReader class. 
+// +// Author: Derek Barnett + +#ifndef VIRTUALPOLYMERASECOMPOSITEREADER_H +#define VIRTUALPOLYMERASECOMPOSITEREADER_H + +#include "pbbam/virtual/VirtualPolymeraseBamRecord.h" +#include "pbbam/virtual/ZmwReadStitcher.h" + +namespace PacBio { +namespace BAM { + +/// \deprecated Use ZmwReadStitcher instead. +typedef ZmwReadStitcher VirtualPolymeraseCompositeReader; + +} // namespace BAM +} // namespace PacBio + +#endif // VIRTUALPOLYMERASECOMPOSITEREADER_H diff --git a/include/pbbam/virtual/VirtualPolymeraseReader.h b/include/pbbam/virtual/VirtualPolymeraseReader.h new file mode 100644 index 0000000..5ccfa27 --- /dev/null +++ b/include/pbbam/virtual/VirtualPolymeraseReader.h @@ -0,0 +1,57 @@ +// Copyright (c) 2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file VirtualPolymeraseReader.h +/// \brief Defines the VirtualPolymeraseReader class. +// +// Author: Armin Töpfer + +#ifndef VIRTUALPOLYMERASEREADER_H +#define VIRTUALPOLYMERASEREADER_H + +#include "pbbam/virtual/VirtualPolymeraseBamRecord.h" +#include "pbbam/virtual/ZmwReadStitcher.h" + +namespace PacBio { +namespace BAM { + +/// \deprecated Use ZmwReadStitcher instead. +typedef ZmwReadStitcher VirtualPolymeraseReader; + +} // namespace BAM +} // namespace PacBio + +#endif // VIRTUALPOLYMERASEREADER_H diff --git a/include/pbbam/virtual/VirtualRegion.h b/include/pbbam/virtual/VirtualRegion.h new file mode 100644 index 0000000..facce7d --- /dev/null +++ b/include/pbbam/virtual/VirtualRegion.h @@ -0,0 +1,130 @@ +// Copyright (c) 2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. 
+// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file VirtualRegion.h +/// \brief Defines the VirtualRegion class. +// +// Author: Armin Töpfer + +#ifndef VIRTUALREGION_H +#define VIRTUALREGION_H + +#include "pbbam/Config.h" +#include "pbbam/virtual/VirtualRegionType.h" +#include "pbbam/LocalContextFlags.h" + +namespace PacBio { +namespace BAM { + +/// \brief The VirtualRegion represents an annotation of a polymerase region. +/// +struct VirtualRegion +{ +public: + VirtualRegionType type; + int beginPos; + int endPos; + LocalContextFlags cxTag = LocalContextFlags::NO_LOCAL_CONTEXT; + int barcodeLeft = -1; + int barcodeRight = -1; + int score = 0; + +public: + /// \brief Creates a virtual region with basic type & position info. 
+ /// + VirtualRegion(const VirtualRegionType type, + const int beginPos, + const int endPos, + const int score = 0); + + /// \brief Creates a virtual region with type/position info, as well as context & barcode. + /// + VirtualRegion(const VirtualRegionType type, + const int beginPos, + const int endPos, + const LocalContextFlags cxTag, + const int barcodeLeft, + const int barcodeRight, + const int score = 0); + + VirtualRegion(void) = default; + VirtualRegion(const VirtualRegion&) = default; + VirtualRegion(VirtualRegion&&) = default; + VirtualRegion& operator=(const VirtualRegion&) = default; // un-"delete"-ed for SWIG + VirtualRegion& operator=(VirtualRegion&&) = default; + ~VirtualRegion(void) = default; + + bool operator==(const VirtualRegion &v1) const; + +}; + +inline VirtualRegion::VirtualRegion(const VirtualRegionType type, + const int beginPos, + const int endPos, + const int score) + : type(type) + , beginPos(beginPos) + , endPos(endPos), cxTag() + , score(score) +{} + +inline VirtualRegion::VirtualRegion(const VirtualRegionType type, + const int beginPos, + const int endPos, + const LocalContextFlags cxTag, + const int barcodeLeft, + const int barcodeRight, + const int score) + : type(type) + , beginPos(beginPos) + , endPos(endPos) + , cxTag(cxTag) + , barcodeLeft(barcodeLeft) + , barcodeRight(barcodeRight) + , score(score) +{} + +inline bool VirtualRegion::operator==(const VirtualRegion& v1) const +{ + return (v1.type == this->type && + v1.beginPos == this->beginPos && + v1.endPos == this->endPos); +} + +} // namespace BAM +} // namespace PacBio + +#endif // VIRTUALREGION_H diff --git a/include/pbbam/virtual/VirtualRegionType.h b/include/pbbam/virtual/VirtualRegionType.h new file mode 100644 index 0000000..d359094 --- /dev/null +++ b/include/pbbam/virtual/VirtualRegionType.h @@ -0,0 +1,65 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file VirtualRegionType.h +/// \brief Defines the VirtualRegionType enum. 
+// +// Author: Derek Barnett + +#ifndef REGIONTYPE_H +#define REGIONTYPE_H + +#include "pbbam/Config.h" + +namespace PacBio { +namespace BAM { + +/// \brief This enum defines the types of annotated region. +/// +enum class VirtualRegionType // : char +{ + ADAPTER = 0x41, ///< Adapter region ('A') + BARCODE = 0x42, ///< Barcode region ('B') + FILTERED = 0x46, ///< Filtered subread ('F') + SUBREAD = 0x53, ///< Subread ('S') + HQREGION = 0x48, ///< High-quality region ('H') + LQREGION = 0x4C ///< Low-quality region ('L'), i.e. outside the HQ region +}; + +} // namespace BAM +} // namespace PacBio + +#endif // REGIONTYPE_H diff --git a/include/pbbam/virtual/VirtualRegionTypeMap.h b/include/pbbam/virtual/VirtualRegionTypeMap.h new file mode 100644 index 0000000..200f12f --- /dev/null +++ b/include/pbbam/virtual/VirtualRegionTypeMap.h @@ -0,0 +1,65 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. 
THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file VirtualRegionTypeMap.h +/// \brief Defines the VirtualRegionTypeMap class. +// +// Author: Derek Barnett + +#ifndef VIRTUALREGIONTYPEMAP_H +#define VIRTUALREGIONTYPEMAP_H + +#include + +#include "pbbam/Config.h" +#include "pbbam/virtual/VirtualRegionType.h" + +namespace PacBio { +namespace BAM { + +/// \brief The VirtualRegionTypeMap class provides mapping between char codes and +/// VirtualRegionType enum keys. +/// +class VirtualRegionTypeMap +{ +public: + static std::map ParseChar; +}; + +} // namespace BAM +} // namespace PacBio + +#endif // VIRTUALREGIONTYPEMAP_H diff --git a/include/pbbam/virtual/VirtualZmwBamRecord.h b/include/pbbam/virtual/VirtualZmwBamRecord.h new file mode 100644 index 0000000..32149bd --- /dev/null +++ b/include/pbbam/virtual/VirtualZmwBamRecord.h @@ -0,0 +1,122 @@ +// Copyright (c) 2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file VirtualZmwBamRecord.h +/// \brief Defines the VirtualZmwBamRecord class. 
+// +// Author: Armin Töpfer + +#ifndef VirtualZmwBAMRECORD_H +#define VirtualZmwBAMRECORD_H + +#include +#include + +#include "pbbam/BamHeader.h" +#include "pbbam/BamRecord.h" +#include "pbbam/Config.h" +#include "pbbam/virtual/VirtualRegion.h" +#include "pbbam/virtual/VirtualRegionType.h" + +namespace PacBio { +namespace BAM { + +/// \brief The VirtualZmwBamRecord class represents a ZMW read stitched +/// on-the-fly from subreads|hqregion + scraps. +/// +class VirtualZmwBamRecord : public BamRecord +{ +public: + /// \name Constructors & Related Methods + /// \{ + + /// \brief Creates a "virtual" ZMW %BAM record, by re-stitching its + /// constituent segments. + /// + /// \param[in] unorderedSources source data (subreads, scraps, etc.) + /// \param[in] header %BAM header to associate with the new record + /// + /// \throws std::runtime_error on failure to stitch virtual record + /// + VirtualZmwBamRecord(std::vector&& unorderedSources, + const BamHeader& header); + + VirtualZmwBamRecord(void) = delete; + VirtualZmwBamRecord(const VirtualZmwBamRecord&) = default; + VirtualZmwBamRecord(VirtualZmwBamRecord&&) = default; + VirtualZmwBamRecord& operator=(const VirtualZmwBamRecord&) = default; + VirtualZmwBamRecord& operator=(VirtualZmwBamRecord&&) = default; + virtual ~VirtualZmwBamRecord() = default; + + /// \} + +public: + /// \name Virtual Record Attributes + /// \{ + + /// \returns true if requested VirtualRegionType has been annotated. + /// + bool HasVirtualRegionType(const VirtualRegionType regionType) const; + + /// \returns IPD frame data + /// + Frames IPDV1Frames(Orientation orientation = Orientation::NATIVE) const; + + /// \brief Provides all annotations of the polymerase read as a map (type => regions) + /// + std::map> VirtualRegionsMap(void) const; + + /// \brief Provides annotations of the polymerase read for a given VirtualRegionType.
+ /// + /// \param[in] regionType requested region type + /// \returns regions that match the requested type (empty vector if none found). + /// + std::vector VirtualRegionsTable(const VirtualRegionType regionType) const; + + /// \} + +private: + std::vector sources_; + std::map> virtualRegionsMap_; + +private: + void StitchSources(void); +}; + +} // namespace BAM +} // namespace PacBio + +#endif // VirtualZmwBAMRECORD_H diff --git a/include/pbbam/virtual/WhitelistedZmwReadStitcher.h b/include/pbbam/virtual/WhitelistedZmwReadStitcher.h new file mode 100644 index 0000000..d6fa13a --- /dev/null +++ b/include/pbbam/virtual/WhitelistedZmwReadStitcher.h @@ -0,0 +1,142 @@ +// Copyright (c) 2016, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file WhitelistedZmwReadStitcher.h +/// \brief Defines the WhitelistedZmwReadStitcher class. +// +// Author: Derek Barnett + +#ifndef WHITELISTEDZMWREADSTITCHER_H +#define WHITELISTEDZMWREADSTITCHER_H + +#include "pbbam/Config.h" +#include "pbbam/virtual/VirtualZmwBamRecord.h" +#include +#include +#include + +namespace PacBio { +namespace BAM { + +class DataSet; +class PbiFilter; + +/// \brief The WhitelistedZmwReadStitcher class provides an interface for +/// re-stitching "virtual" ZMW reads from their constituent parts, +/// limiting results to only those reads originating from a 'whitelist' +/// of ZMW hole numbers. +/// +/// Whitelisted ZMWs that are not present in both primary and scraps BAMs +/// will be "pre-removed." This ensures that, given client code like this: +/// +/// \include code/WhitelistedZmwReadStitcher.txt +/// +/// each iteration will always provide valid data - either a valid virtual +/// record from Next() or a non-empty vector from NextRaw(). +/// +/// \note This reader requires that both input %BAM files also have associated +/// PBI files available for query. See BamFile::EnsurePacBioIndexExists . +/// +class PBBAM_EXPORT WhitelistedZmwReadStitcher +{ +public: + /// \name Constructors & Related Methods + /// \{ + + /// \brief Creates a reader that will operate on a primary %BAM file (e.g. subread data) + /// and a scraps file, using a ZMW whitelist to filter the input.
+ /// + /// \param[in] zmwWhitelist list of ZMWs to restrict iteration over + /// \param[in] primaryBamFilePath hqregion.bam or subreads.bam file path + /// \param[in] scrapsBamFilePath scraps.bam file path + /// + /// \note This reader requires that both input %BAM files also have associated PBI + /// files available for query. See BamFile::EnsurePacBioIndexExists . + /// + /// \throws std::runtime_error if any files (*.bam and/or *.pbi) were not available for reading, or + /// if malformed data encountered + /// + WhitelistedZmwReadStitcher(const std::vector& zmwWhitelist, + const std::string& primaryBamFilePath, + const std::string& scrapsBamFilePath); + + WhitelistedZmwReadStitcher(void) = delete; + WhitelistedZmwReadStitcher(const WhitelistedZmwReadStitcher&) = delete; + WhitelistedZmwReadStitcher(WhitelistedZmwReadStitcher&&) = delete; + WhitelistedZmwReadStitcher& operator=(const WhitelistedZmwReadStitcher&) = delete; + WhitelistedZmwReadStitcher& operator=(WhitelistedZmwReadStitcher&&) = delete; + ~WhitelistedZmwReadStitcher(void); + + /// \} + +public: + /// \name Stitched Record Reading + /// \{ + + /// \returns true if more ZMWs are available for reading. + bool HasNext(void) const; + + /// \returns the re-stitched polymerase read from the next ZMW in the whitelist + VirtualZmwBamRecord Next(void); + + /// \returns the set of reads that belong to the next ZMW in the whitelist. + /// This enables stitching records in a distinct thread. 
+ /// + std::vector NextRaw(void); + + /// \} + +public: + /// \name File Headers + /// \{ + + /// \returns the BamHeader associated with this reader's "primary" %BAM file + BamHeader PrimaryHeader(void) const; + + /// \returns the BamHeader associated with this reader's "scraps" %BAM file + BamHeader ScrapsHeader(void) const; + + /// \} + +private: + struct WhitelistedZmwReadStitcherPrivate; + std::unique_ptr d_; +}; + +} // namespace BAM +} // namespace PacBio + +#endif // WHITELISTEDZMWREADSTITCHER diff --git a/include/pbbam/virtual/ZmwReadStitcher.h b/include/pbbam/virtual/ZmwReadStitcher.h new file mode 100644 index 0000000..a2e020a --- /dev/null +++ b/include/pbbam/virtual/ZmwReadStitcher.h @@ -0,0 +1,128 @@ +// Copyright (c) 2016, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file ZmwReadStitcher.h +/// \brief Defines the ZmwReadStitcher class. +// +// Author: Derek Barnett + +#ifndef ZMWREADSTITCHER_H +#define ZMWREADSTITCHER_H + +#include "pbbam/BamRecord.h" +#include "pbbam/Config.h" +#include "pbbam/virtual/VirtualZmwBamRecord.h" +#include +#include +#include + +namespace PacBio { +namespace BAM { + +class DataSet; +class PbiFilter; + +/// \brief The ZmwReadStitcher class provides an interface for re-stitching +/// "virtual" polymerase reads from their constituent parts. +/// +/// \note This reader requires that any input %BAM files also have associated PBI +/// files available for query. See BamFile::EnsurePacBioIndexExists . 
+/// +class PBBAM_EXPORT ZmwReadStitcher +{ +public: + /// \name Constructors & Related Methods + /// \{ + + /// entire file, from BAM names + ZmwReadStitcher(const std::string& primaryBamFilePath, + const std::string& scrapsBamFilePath); + + /// filtered input from BAM names + ZmwReadStitcher(const std::string& primaryBamFilePath, + const std::string& scrapsBamFilePath, + const PbiFilter& filter); + + /// maybe filtered, from DataSet input + ZmwReadStitcher(const DataSet& dataset); + + ZmwReadStitcher(void) = delete; + ZmwReadStitcher(const ZmwReadStitcher&) = delete; + ZmwReadStitcher(ZmwReadStitcher&&) = delete; + ZmwReadStitcher& operator=(const ZmwReadStitcher&) = delete; + ZmwReadStitcher& operator=(ZmwReadStitcher&&) = delete; + ~ZmwReadStitcher(void); + + /// \} + +public: + /// \name File Headers + /// \{ + + /// \returns the BamHeader associated with this reader's "primary" %BAM file + BamHeader PrimaryHeader(void) const; + + /// \returns the BamHeader associated with this reader's "scraps" %BAM file + BamHeader ScrapsHeader(void) const; + + /// \} + +public: + /// \name Stitched Record Reading + /// \{ + + /// \returns true if more ZMWs are available for reading. + bool HasNext(void); + + /// \returns the next stitched polymerase read + VirtualZmwBamRecord Next(void); + + /// \returns the next set of reads that belong to one ZMW. + /// This enables stitching records in a distinct thread. + /// + std::vector NextRaw(void); + + /// \} + +private: + struct ZmwReadStitcherPrivate; + std::unique_ptr d_; +}; + +} // namespace BAM +} // namespace PacBio + +#endif // ZMWREADSTITCHER_H diff --git a/include/pbbam/virtual/ZmwWhitelistVirtualReader.h b/include/pbbam/virtual/ZmwWhitelistVirtualReader.h new file mode 100644 index 0000000..8b99e3c --- /dev/null +++ b/include/pbbam/virtual/ZmwWhitelistVirtualReader.h @@ -0,0 +1,56 @@ +// Copyright (c) 2015, Pacific Biosciences of California, Inc. +// +// All rights reserved.
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file ZmwWhitelistVirtualReader.h +/// \brief Defines the ZmwWhitelistVirtualReader class. 
+// +// Author: Derek Barnett + +#ifndef ZMWWHITELISTVIRTUALREADER_H +#define ZMWWHITELISTVIRTUALREADER_H + +#include "pbbam/virtual/WhitelistedZmwReadStitcher.h" + +namespace PacBio { +namespace BAM { + +/// \deprecated Use WhitelistedZmwReadStitcher instead. +typedef WhitelistedZmwReadStitcher ZmwWhitelistVirtualReader; + +} // namespace BAM +} // namespace PacBio + +#endif // ZMWWHITELISTVIRTUALREADER_H diff --git a/src/Accuracy.cpp b/src/Accuracy.cpp new file mode 100644 index 0000000..e335abf --- /dev/null +++ b/src/Accuracy.cpp @@ -0,0 +1,47 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file Accuracy.cpp +/// \brief Implements the Accuracy class. +// +// Author: Derek Barnett + +#include "pbbam/Accuracy.h" +using namespace PacBio; +using namespace PacBio::BAM; + +const float Accuracy::MIN = 0.0f; +const float Accuracy::MAX = 1.0f; diff --git a/src/AlignmentPrinter.cpp b/src/AlignmentPrinter.cpp new file mode 100644 index 0000000..5155859 --- /dev/null +++ b/src/AlignmentPrinter.cpp @@ -0,0 +1,155 @@ +// Copyright (c) 2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. 
THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file AlignmentPrinter.cpp +/// \brief Implements the AlignmentPrinter class. +// +// Author: Armin Töpfer + +#include "pbbam/AlignmentPrinter.h" + +#include +#include +#include +#include +#include + +using namespace PacBio; +using namespace PacBio::BAM; + +AlignmentPrinter::AlignmentPrinter(const IndexedFastaReader& ifr) + : ifr_(std::unique_ptr(new IndexedFastaReader(ifr))) +{ } + +std::string AlignmentPrinter::Print(const BamRecord& record, + const Orientation orientation) +{ + std::string seq = record.Sequence(orientation, true, true); + std::string ref = ifr_->ReferenceSubsequence(record, orientation, true, true); + + if (seq.size() != ref.size()) + throw std::runtime_error("Sequence and reference parts are of different size"); + + int seqLength = 0; + float matches = 0; + std::string pretty; + Position refCoord = record.ReferenceStart(); + Position seqCoord = record.QueryStart(); + + for (size_t i = 0; i < seq.size();) + { + std::string refCoordStr = std::to_string(refCoord); + std::string seqCoordStr = std::to_string(seqCoord); + + size_t maxCoordLength = std::max(refCoordStr.size(), seqCoordStr.size()); + while (refCoordStr.size() < 
maxCoordLength) + refCoordStr = " "+refCoordStr; + while (seqCoordStr.size() < maxCoordLength) + seqCoordStr = " "+seqCoordStr; + + std::string seqWrap = seqCoordStr + " : "; + std::string refWrap = refCoordStr + " : "; + std::string prettyWrap(maxCoordLength+3, ' '); + prettyWrap.reserve(seq.size()); + for (int j = 0; i < seq.size() && j < 40; ++i, ++j) + { + refWrap += ref[i]; + + if (seq[i] == ref[i]) + { + ++matches; + if (refCoord == 0 || refCoord % 10) + prettyWrap += '|'; + else + { + prettyWrap += "\033[1m\x1b[31m"; + prettyWrap += '|'; + prettyWrap += "\033[0m\x1b[39;49m"; + } + seqWrap += seq[i]; + } + else if (seq[i] == '-' || ref[i] == '-') + { + prettyWrap += ' '; + seqWrap += seq[i]; + } + else + { + prettyWrap += '.'; + seqWrap += "\033[1m\x1b[31m"; + seqWrap += seq[i]; + seqWrap += "\033[0m\x1b[39;49m"; + } + if (seq[i] != '-') + { + ++seqLength; + ++seqCoord; + } + if (ref[i] != '-') + { + ++refCoord; + } + } + + refCoordStr = std::to_string(refCoord); + seqCoordStr = std::to_string(seqCoord); + + maxCoordLength = std::max(refCoordStr.size(), seqCoordStr.size()); + while (refCoordStr.size() < maxCoordLength) + refCoordStr = " "+refCoordStr; + while (seqCoordStr.size() < maxCoordLength) + seqCoordStr = " "+seqCoordStr; + + seqWrap += " : " + seqCoordStr; + refWrap += " : " + refCoordStr; + + pretty += refWrap + '\n' + prettyWrap + '\n' + seqWrap + "\n\n"; + } + float similarity = matches/seq.size(); + + std::stringstream output; + + output << "Read : " << record.FullName() << std::endl; + output << "Reference : " << record.ReferenceName() << std::endl; + output << std::endl; + output << "Read-length : " << seqLength << std::endl; + output << "Concordance : " << std::setprecision(3) << (similarity); + output << std::endl; + output << std::endl; + output << pretty; + + return output.str(); +} diff --git a/src/AssertUtils.cpp b/src/AssertUtils.cpp new file mode 100644 index 0000000..42f4ea8 --- /dev/null +++ b/src/AssertUtils.cpp @@ -0,0 +1,90 @@ +// 
Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. 
+ +// Author: Derek Barnett + +#include "AssertUtils.h" +#include +#include + +namespace PacBio { +namespace BAM { +namespace internal { + +static inline +void message_out(FILE* stream, + const char* format, + const char* msg) +{ + fprintf(stream, format, msg); + fprintf(stream, "\n"); + fflush(stream); +} + +void printInfo(const char* msg, ...) { + + va_list ap; + va_start(ap, msg); + + char buffer[256] = {'\0' }; + buffer[255] = '\0'; + if (msg) + vsnprintf(buffer, 255, msg, ap); + va_end(ap); + + message_out(stdout, "%s", buffer); +} + +void printError(const char* msg, ...) { + + va_list ap; + va_start(ap, msg); + + char buffer[256] = {'\0' }; + buffer[255] = '\0'; + if (msg) + vsnprintf(buffer, 255, msg, ap); + va_end(ap); + + message_out(stderr, "%s", buffer); +} + +void printFailedAssert(const char* msg) { + printError("ASSERT FAILED: %s", msg); +} + +} // namespace internal +} // namespace BAM +} // namespace PacBio diff --git a/src/AssertUtils.h b/src/AssertUtils.h new file mode 100644 index 0000000..d098964 --- /dev/null +++ b/src/AssertUtils.h @@ -0,0 +1,93 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. 
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifndef ASSERTUTILS_H
+#define ASSERTUTILS_H
+
+// ---------------------------------------------------
+// This file contains dev/debugging helper utilities
+// ---------------------------------------------------
+
+// Marks 'x' as intentionally unused by casting it to void.
+// NOTE: the expansion already ends in ';', so a call site that adds its own
+// ';' yields an extra (harmless) empty statement. The argument is not
+// parenthesized, so pass a plain identifier rather than an expression.
+#ifndef PBBAM_UNUSED
+#   define PBBAM_UNUSED(x) (void)x;
+#endif
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// Does nothing. Used as the 'action' of PB_ASSERT_UNREACHABLE.
+inline void pbbam_noop(void) { }
+
+// a la fprintf(...). Auto-adds a newline
+// 'msg' is a printf-style format string followed by its arguments.
+// printError writes to stderr, printInfo writes to stdout.
+void printError(const char* msg, ...);
+void printInfo(const char* msg, ...);
+
+// Prints 'msg' via printError, prefixed with "ASSERT FAILED: ".
+void printFailedAssert(const char* msg);
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
+
+//
+// This assert construct below allows us to report failures as well as take some
+// fallback action (return, break, continue, etc) so as not to crash at runtime.
+// In other words, it's basically a 'weak' assert with customized information &
+// failure response.
+//
+// PB_VERIFY(cond)                      if condition fails, print message
+// PB_ASSERT(cond, action)              if condition fails, print message & perform action
+// PB_ASSERT_OR_BREAK                   overload of ASSERT where action is 'break'
+// PB_ASSERT_OR_CONTINUE                overload of ASSERT where action is 'continue'
+// PB_ASSERT_OR_RETURN                  overload of ASSERT where action is 'return'
+// PB_ASSERT_OR_RETURN_VALUE            overload of ASSERT where action is 'return <value>'
+// PB_ASSERT_UNREACHABLE                overload of ASSERT(false) where action is a no-op. Used as a visual marker for
+//                                      unreachable code-paths (e.g. invalid values in a switch statement)
+//
+// Implementation notes:
+//  - PB_ASSERT_STRING builds the message from adjacent string literals, so
+//    'cond' must already be a string literal (callers pass #cond).
+//  - The two-level STRINGIFY is needed so that __LINE__ is expanded to its
+//    number before being turned into a string.
+//  - The macros expand to a bare if/else followed by an empty 'do {} while (0)'
+//    that absorbs the caller's trailing ';'. The if/else is deliberately NOT
+//    wrapped in a loop, so that 'break'/'continue' actions apply to the loop
+//    (or switch) enclosing the call site.
+//  - Because the expansion is more than one statement, use these macros inside
+//    a braced block; as the sole statement of an unbraced 'if' that has its own
+//    'else', the expansion does not parse.
+//
+#define PB_ASSERT_STRINGIFY2(x) #x
+#define PB_ASSERT_STRINGIFY(x) PB_ASSERT_STRINGIFY2(x)
+#define PB_ASSERT_STRING(cond) ::PacBio::BAM::internal::printFailedAssert( \
+    "\"" cond"\" in file " __FILE__ ", line " PB_ASSERT_STRINGIFY(__LINE__))
+
+#define PB_VERIFY(cond) if (cond) {} else { PB_ASSERT_STRING(#cond); } do {} while (0)
+#define PB_ASSERT(cond, action) if (cond) {} else { PB_ASSERT_STRING(#cond); action; } do {} while (0)
+#define PB_ASSERT_OR_BREAK(cond) PB_ASSERT(cond, break)
+#define PB_ASSERT_OR_CONTINUE(cond) PB_ASSERT(cond, continue)
+#define PB_ASSERT_OR_RETURN(cond) PB_ASSERT(cond, return)
+#define PB_ASSERT_OR_RETURN_VALUE(cond, value) PB_ASSERT(cond, return value)
+
+#define PB_ASSERT_UNREACHABLE PB_ASSERT(false, ::PacBio::BAM::internal::pbbam_noop())
+
+#endif // ASSERTUTILS_H
diff --git a/src/BaiIndexedBamReader.cpp b/src/BaiIndexedBamReader.cpp
new file mode 100644
index 0000000..3f9d538
--- /dev/null
+++ b/src/BaiIndexedBamReader.cpp
@@ -0,0 +1,141 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file BaiIndexedBamReader.cpp +/// \brief Implements the BaiIndexedBamReader class. 
+//
+// Author: Derek Barnett
+
+#include "pbbam/BaiIndexedBamReader.h"
+#include "MemoryUtils.h"
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace PacBio::BAM::internal;
+using namespace std;
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// Private implementation: owns the loaded BAI index and the htslib iterator
+// for the currently requested genomic interval.
+struct BaiIndexedBamReaderPrivate
+{
+public:
+    // Loads the index for 'file' and positions the iterator on 'interval'.
+    // Throws std::runtime_error if either step fails.
+    BaiIndexedBamReaderPrivate(const BamFile& file,
+                               const GenomicInterval& interval)
+        : htsIndex_(nullptr)
+        , htsIterator_(nullptr)
+    {
+        LoadIndex(file.Filename());
+        Interval(file.Header(), interval);
+    }
+
+    // Re-targets the iterator at 'interval' (resolved against 'header').
+    // Throws std::runtime_error if the reference is unknown or htslib cannot
+    // create an iterator; in that case the stored interval is left unchanged.
+    void Interval(const BamHeader& header,
+                  const GenomicInterval& interval)
+    {
+        htsIterator_.reset(nullptr);
+
+        if (header.HasSequence(interval.Name())) {
+            auto id = header.SequenceId(interval.Name());
+            if (id >= 0 && static_cast<size_t>(id) < header.NumSequences()) {
+                htsIterator_.reset(bam_itr_queryi(htsIndex_.get(),
+                                                  id,
+                                                  interval.Start(),
+                                                  interval.Stop()));
+            }
+        }
+
+        if (!htsIterator_)
+            throw std::runtime_error("could not create iterator for requested region");
+
+        // remember the active interval; BaiIndexedBamReader::Interval() const
+        // returns this member, which was previously never assigned
+        interval_ = interval;
+    }
+
+    // Loads the index associated with BAM file 'fn'.
+    // Throws std::runtime_error if no index could be loaded.
+    void LoadIndex(const string& fn)
+    {
+        htsIndex_.reset(bam_index_load(fn.c_str()));
+        if (!htsIndex_)
+            throw std::runtime_error("could not load BAI index data");
+    }
+
+    // Reads the next record within the current interval into 'b' and returns
+    // the htslib status code from hts_itr_next().
+    int ReadRawData(BGZF* bgzf, bam1_t* b)
+    {
+        assert(htsIterator_.get());
+        return hts_itr_next(bgzf, htsIterator_.get(), b, nullptr);
+    }
+
+public:
+    GenomicInterval interval_;
+    // NOTE(review): template arguments were lost in extraction. The element
+    // types follow from bam_index_load()/bam_itr_queryi(); the deleter names
+    // are presumed to be the ones declared in MemoryUtils.h - confirm.
+    std::unique_ptr<hts_idx_t, internal::HtslibIndexDeleter>    htsIndex_;
+    std::unique_ptr<hts_itr_t, internal::HtslibIteratorDeleter> htsIterator_;
+};
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
+
+BaiIndexedBamReader::BaiIndexedBamReader(const GenomicInterval& interval,
+                                         const std::string& filename)
+    : BaiIndexedBamReader(interval, BamFile(filename))
+{ }
+
+BaiIndexedBamReader::BaiIndexedBamReader(const GenomicInterval& interval,
+                                         const BamFile& bamFile)
+    : BamReader(bamFile)
+    , d_(new BaiIndexedBamReaderPrivate(File(), interval))
+{ }
+
+BaiIndexedBamReader::BaiIndexedBamReader(const GenomicInterval& interval,
+                                         BamFile&& bamFile)
+    : BamReader(std::move(bamFile))
+    , d_(new BaiIndexedBamReaderPrivate(File(), interval))
+{ }
+
+// Returns the interval most recently applied to this reader.
+const GenomicInterval& BaiIndexedBamReader::Interval(void) const
+{
+    assert(d_);
+    return d_->interval_;
+}
+
+int BaiIndexedBamReader::ReadRawData(BGZF* bgzf, bam1_t* b)
+{
+    assert(d_);
+    return d_->ReadRawData(bgzf, b);
+}
+
+// Restricts subsequent reads to 'interval'; throws std::runtime_error if an
+// iterator cannot be created for it.
+BaiIndexedBamReader& BaiIndexedBamReader::Interval(const GenomicInterval& interval)
+{
+    assert(d_);
+    d_->Interval(Header(), interval);
+    return *this;
+}
diff --git a/src/BamFile.cpp b/src/BamFile.cpp
new file mode 100644
index 0000000..ed942b9
--- /dev/null
+++ b/src/BamFile.cpp
@@ -0,0 +1,245 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED.
+// IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file BamFile.cpp
+/// \brief Implements the BamFile class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/BamFile.h"
+#include "pbbam/PbiFile.h"
+#include "FileUtils.h"
+#include "MemoryUtils.h"
+// NOTE(review): the five system include targets were lost in extraction; the
+// headers below are the ones this file's code visibly requires - confirm
+// against upstream.
+#include <htslib/sam.h>
+#include <cassert>
+#include <memory>
+#include <sstream>
+#include <stdexcept>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// Private implementation: opens the file once on construction to validate it,
+// cache its header and record where the first alignment starts.
+//
+// NOTE(review): unique_ptr template arguments in this class were lost in
+// extraction. Element types follow from the htslib calls; the deleter names
+// are presumed to be the ones declared in MemoryUtils.h - confirm.
+class BamFilePrivate
+{
+public:
+    // Throws std::runtime_error if the file cannot be opened, is not BAM,
+    // fails the EOF check (when enabled), or has an unreadable header.
+    BamFilePrivate(const string& fn)
+        : filename_(fn)
+        , firstAlignmentOffset_(-1)
+    {
+        // ensure we've updated htslib verbosity with requested verbosity here
+        hts_verbose = ( PacBio::BAM::HtslibVerbosity == -1 ? 0 : PacBio::BAM::HtslibVerbosity);
+
+        // attempt open
+        auto f = RawOpen();
+
+#if !defined (PBBAM_NO_CHECK_EOF) || PBBAM_AUTOVALIDATE
+        // sanity check on file
+        const int eofCheck = bgzf_check_EOF(f->fp.bgzf);
+        if (eofCheck <= 0 ) {
+            //  1: EOF present & correct
+            //  2: not seekable (e.g. reading from stdin)
+            //  0: EOF absent
+            // -1: some other error
+            stringstream e;
+            if (eofCheck == 0)
+                e << fn << " : is missing EOF block" << endl;
+            else
+                e << fn << " : unknown error while checking EOF block" << endl;
+            throw std::runtime_error(e.str());
+        }
+#endif
+
+        // attempt fetch header
+        std::unique_ptr<bam_hdr_t, internal::HtslibHeaderDeleter> hdr(sam_hdr_read(f.get()));
+
+        // sam_hdr_read() returns NULL on failure; report it here with the
+        // filename rather than handing a null header to FromRawData()
+        if (!hdr)
+            throw std::runtime_error(string("could not read BAM header from: ") + fn);
+        header_ = internal::BamHeaderMemory::FromRawData(hdr.get());
+
+        // cache first alignment offset
+        firstAlignmentOffset_ = bgzf_tell(f->fp.bgzf);
+    }
+
+    // Creates an independent copy by re-opening the same filename.
+    unique_ptr<BamFilePrivate> DeepCopy(void)
+    {
+        return unique_ptr<BamFilePrivate>(new BamFilePrivate(filename_));
+    }
+
+    // Returns true only if the file has a correct EOF block.
+    bool HasEOF(void) const
+    {
+        // streamed input is unknown, since it's not random-accessible
+        if (filename_ == "-")
+            return false;
+
+        // attempt open
+        auto f = RawOpen();
+        return RawEOFCheck(f) == 1;
+    }
+
+    // Returns the raw bgzf_check_EOF() status for an already-open file.
+    int RawEOFCheck(const std::unique_ptr<samFile, internal::HtslibFileDeleter>& f) const
+    {
+        assert(f);
+        assert(f->fp.bgzf);
+        return bgzf_check_EOF(f->fp.bgzf);
+    }
+
+    // Opens filename_ for reading; throws std::runtime_error if the file
+    // cannot be opened or is not in BAM format.
+    std::unique_ptr<samFile, internal::HtslibFileDeleter> RawOpen(void) const
+    {
+        std::unique_ptr<samFile, internal::HtslibFileDeleter> f(sam_open(filename_.c_str(), "rb"));
+        if (!f || !f->fp.bgzf)
+            throw std::runtime_error(string("could not open BAM file: ") + filename_);
+        if (f->format.format != bam)
+            throw std::runtime_error("expected BAM, unknown format");
+        return f;
+    }
+
+public:
+    std::string filename_;
+    BamHeader header_;
+    int64_t firstAlignmentOffset_;
+};
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
+
+// ------------------------
+// BamFile implementation
+// ------------------------
+
+BamFile::BamFile(const std::string& filename)
+    : d_(new internal::BamFilePrivate(filename))
+{ }
+
+BamFile::BamFile(const BamFile& other)
+    : d_(other.d_->DeepCopy())
+{ }
+
+BamFile::BamFile(BamFile&& other)
+    : d_(std::move(other.d_))
+{ }
+
+BamFile& BamFile::operator=(const BamFile& other)
+{
+    d_ = other.d_->DeepCopy();
+    return *this;
+}
+
+BamFile& BamFile::operator=(BamFile&& other)
+{ d_ = std::move(other.d_); return *this; }
+
+BamFile::~BamFile(void) { }
+
+void BamFile::CreatePacBioIndex(void) const
+{
+    PbiFile::CreateFrom(*this);
+}
+
+void BamFile::CreateStandardIndex(void) const
+{
+    if (bam_index_build(d_->filename_.c_str(), 0) != 0)
+        throw std::runtime_error("could not build BAI index");
+}
+
+void BamFile::EnsurePacBioIndexExists(void) const
+{
+    if (!PacBioIndexExists())
+        CreatePacBioIndex();
+}
+
+void BamFile::EnsureStandardIndexExists(void) const
+{
+    if (!StandardIndexExists())
+        CreateStandardIndex();
+}
+
+std::string BamFile::Filename(void) const
+{ return d_->filename_; }
+
+int64_t BamFile::FirstAlignmentOffset(void) const
+{ return d_->firstAlignmentOffset_; }
+
+bool BamFile::HasEOF(void) const
+{ return d_->HasEOF(); }
+
+bool BamFile::HasReference(const std::string& name) const
+{ return d_->header_.HasSequence(name); }
+
+const BamHeader& BamFile::Header(void) const
+{ return d_->header_; }
+
+bool BamFile::IsPacBioBAM(void) const
+{ return !d_->header_.PacBioBamVersion().empty(); }
+
+bool BamFile::PacBioIndexExists(void) const
+{ return internal::FileUtils::Exists(PacBioIndexFilename()); }
+
+std::string BamFile::PacBioIndexFilename(void) const
+{ return d_->filename_ + ".pbi"; }
+
+// True if the .pbi timestamp is the same as or later than the BAM's.
+bool BamFile::PacBioIndexIsNewer(void) const
+{
+    const auto bamTimestamp = internal::FileUtils::LastModified(Filename());
+    const auto pbiTimestamp = internal::FileUtils::LastModified(PacBioIndexFilename());
+    return bamTimestamp <= pbiTimestamp;
+}
+
+int BamFile::ReferenceId(const std::string& name) const
+{ return d_->header_.SequenceId(name); }
+
+uint32_t BamFile::ReferenceLength(const std::string& name) const
+{ return ReferenceLength(ReferenceId(name)); }
+
+uint32_t BamFile::ReferenceLength(const int id) const
+{ return std::stoul(d_->header_.SequenceLength(id)); }
+
+std::string BamFile::ReferenceName(const int id) const
+{ return d_->header_.SequenceName(id); }
+
+bool BamFile::StandardIndexExists(void) const
+{ return internal::FileUtils::Exists(StandardIndexFilename()); }
+
+std::string BamFile::StandardIndexFilename(void) const
+{ return d_->filename_ + ".bai"; }
+
+// True if the .bai timestamp is the same as or later than the BAM's.
+bool BamFile::StandardIndexIsNewer(void) const
+{
+    const auto bamTimestamp = internal::FileUtils::LastModified(Filename());
+    const auto baiTimestamp = internal::FileUtils::LastModified(StandardIndexFilename());
+    return bamTimestamp <= baiTimestamp;
+}
+
diff --git a/src/BamHeader.cpp b/src/BamHeader.cpp
new file mode 100644
index 0000000..b69f4a3
--- /dev/null
+++ b/src/BamHeader.cpp
@@ -0,0 +1,386 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED.
+// IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file BamHeader.cpp
+/// \brief Implements the BamHeader class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/BamHeader.h"
+#include "StringUtils.h"
+#include "Version.h"
+// NOTE(review): the four system include targets were lost in extraction; the
+// headers below are the ones this file's code visibly requires - confirm
+// against upstream.
+#include <htslib/hts.h>
+#include <cassert>
+#include <sstream>
+#include <stdexcept>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// SAM header line prefixes
+static const string prefix_HD = string("@HD");
+static const string prefix_SQ = string("@SQ");
+static const string prefix_RG = string("@RG");
+static const string prefix_PG = string("@PG");
+static const string prefix_CO = string("@CO");
+
+// @HD field tags
+static const string token_VN = string("VN");
+static const string token_SO = string("SO");
+static const string token_pb = string("pb");
+
+// Sort orders are compatible only if identical.
+static inline
+bool CheckSortOrder(const string& lhs, const string& rhs)
+{ return lhs == rhs; }
+
+// PacBio BAM versions are compatible if both meet the supported minimum.
+static inline
+bool CheckPbVersion(const string& lhs, const string& rhs)
+{
+    return ( Version{ lhs } >= Version::Minimum &&
+             Version{ rhs } >= Version::Minimum);
+}
+
+// Sequence lists must match exactly, but only for coordinate-sorted data.
+static inline
+bool CheckSequences(const string& sortOrder,
+                    const vector<SequenceInfo>& lhs,
+                    const vector<SequenceInfo>& rhs)
+{
+    return ( (sortOrder == "coordinate") ? lhs == rhs : true);
+}
+
+// Throws std::runtime_error, listing every failed check, if 'lhs' and 'rhs'
+// cannot be merged.
+static
+void EnsureCanMerge(const BamHeader& lhs, const BamHeader& rhs)
+{
+    // check compatibility
+    const bool sortOrderOk = CheckSortOrder(lhs.SortOrder(), rhs.SortOrder());
+    const bool pbVersionOk = CheckPbVersion(lhs.PacBioBamVersion(), rhs.PacBioBamVersion());
+    const bool sequencesOk = CheckSequences(lhs.SortOrder(), lhs.Sequences(), rhs.Sequences());
+    if (sortOrderOk && pbVersionOk && sequencesOk)
+        return;
+
+    // if any checks failed, format error message & throw
+    stringstream e;
+    e << "could not merge BAM headers:" << endl;
+
+    if (!sortOrderOk) {
+        e << " mismatched sort orders (@HD:SO) : ("
+          << lhs.SortOrder() << ", " << rhs.SortOrder()
+          << ")" << endl;
+    }
+
+    if (!pbVersionOk) {
+        e << " incompatible PacBio BAM versions (@HD:pb) : ("
+          << lhs.PacBioBamVersion() << ", " << rhs.PacBioBamVersion()
+          << ")" << endl;
+    }
+
+    if (!sequencesOk)
+        e << " mismatched sequence lists (@SQ entries)" << endl;
+
+    throw std::runtime_error(e.str());
+}
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
+
+// Parses SAM-formatted header text. Lines shorter than 5 characters, and
+// lines with unrecognized prefixes, are ignored.
+BamHeader::BamHeader(const string& samHeaderText)
+    : d_(new internal::BamHeaderPrivate)
+{
+    istringstream s(samHeaderText);
+    string line("");
+    string firstToken;
+    while (getline(s, line)) {
+
+        // skip if line is not long enough to contain true values
+        if (line.length() < 5)
+            continue;
+
+        // determine token at beginning of line
+        firstToken = line.substr(0,3);
+
+        if (firstToken == internal::prefix_HD) {
+
+            // pop off '@HD\t', then split HD lines into tokens
+            const vector<string> tokens = internal::Split(line.substr(4), '\t');
+            for (const string& token : tokens) {
+
+                // a well-formed field is "TG:value"; anything shorter than 3
+                // characters (e.g. an empty field from a doubled or trailing
+                // tab) would make substr(3) throw std::out_of_range
+                if (token.size() < 3)
+                    continue;
+
+                const string tokenTag   = token.substr(0,2);
+                const string tokenValue = token.substr(3);
+
+                // set header contents
+                if      (tokenTag == internal::token_VN) Version(tokenValue);
+                else if (tokenTag == internal::token_SO) SortOrder(tokenValue);
+                else if (tokenTag == internal::token_pb) PacBioBamVersion(tokenValue);
+            }
+
+            // check for required tags
+            if (Version().empty())
+                Version(string(hts_version()));
+        }
+
+        else if (firstToken == internal::prefix_SQ)
+            AddSequence(SequenceInfo::FromSam(line));
+
+        else if (firstToken == internal::prefix_RG)
+            AddReadGroup(ReadGroupInfo::FromSam(line));
+
+        else if (firstToken == internal::prefix_PG)
+            AddProgram(ProgramInfo::FromSam(line));
+
+        else if (firstToken == internal::prefix_CO)
+            AddComment(line.substr(4));
+    }
+}
+
+// Merges read groups, programs and comments from 'other' into this header.
+// Read groups/programs whose ID is already present are skipped; comments are
+// always appended. Throws std::runtime_error if the headers are incompatible.
+BamHeader& BamHeader::operator+=(const BamHeader& other)
+{
+    internal::EnsureCanMerge(*this, other);
+
+    // merge read groups
+    for (const auto& rg : other.ReadGroups()) {
+        if (!HasReadGroup(rg.Id()))
+            AddReadGroup(rg);
+    }
+
+    // merge programs
+    for (const auto& pg : other.Programs()) {
+        if (!HasProgram(pg.Id()))
+            AddProgram(pg);
+    }
+
+    // merge comments
+    for (const auto& comment : other.Comments())
+        AddComment(comment);
+
+    return *this;
+}
+
+// Appends 'sequence' and records its index under its name.
+BamHeader& BamHeader::AddSequence(const SequenceInfo& sequence)
+{
+    d_->sequences_.push_back(sequence);
+    d_->sequenceIdLookup_[sequence.Name()] = d_->sequences_.size() - 1;
+    return *this;
+}
+
+BamHeader& BamHeader::ClearSequences(void)
+{
+    d_->sequenceIdLookup_.clear();
+    d_->sequences_.clear();
+    return *this;
+}
+
+// Returns a header with its own copy of every field.
+BamHeader BamHeader::DeepCopy(void) const
+{
+    BamHeader result;
+    result.d_->version_          = d_->version_;
+    result.d_->pacbioBamVersion_ = d_->pacbioBamVersion_;
+    result.d_->sortOrder_        = d_->sortOrder_;
+    result.d_->headerLineCustom_ = d_->headerLineCustom_;
+    result.d_->readGroups_       = d_->readGroups_;
+    result.d_->programs_         = d_->programs_;
+    result.d_->comments_         = d_->comments_;
+    result.d_->sequences_        = d_->sequences_;
+    result.d_->sequenceIdLookup_ = d_->sequenceIdLookup_;
+    return result;
+}
+
+// Sets the PacBio BAM version (@HD:pb). Throws std::runtime_error if
+// 'version' is older than the minimum supported version; the stored value is
+// only updated once the new one has been accepted.
+BamHeader& BamHeader::PacBioBamVersion(const std::string& version)
+{
+    const auto fileVersion = internal::Version{ version };
+    if (fileVersion < internal::Version::Minimum) {
+        auto msg = string{ "invalid PacBio BAM version number" };
+        msg += ( " (" + fileVersion.ToString() + ")");
+        msg += string{ " is older than the minimum supported version" };
+        msg += ( " (" + internal::Version::Minimum.ToString() + ")");
+        throw std::runtime_error(msg);
+    }
+    d_->pacbioBamVersion_ = version;
+    return *this;
+}
+
+// Throws std::runtime_error if 'id' is unknown.
+ProgramInfo BamHeader::Program(const std::string& id) const
+{
+    const auto iter = d_->programs_.find(id);
+    if (iter == d_->programs_.cend())
+        throw std::runtime_error("program ID not found");
+    return iter->second;
+}
+
+vector<string> BamHeader::ProgramIds(void) const
+{
+    vector<string> result;
+    result.reserve(d_->programs_.size());
+    for (const auto& entry : d_->programs_)
+        result.push_back(entry.first);
+    return result;
+}
+
+vector<ProgramInfo> BamHeader::Programs(void) const
+{
+    vector<ProgramInfo> result;
+    result.reserve(d_->programs_.size());
+    for (const auto& entry : d_->programs_)
+        result.push_back(entry.second);
+    return result;
+}
+
+// Replaces all program entries with 'programs', keyed by ID.
+BamHeader& BamHeader::Programs(const vector<ProgramInfo>& programs)
+{
+    d_->programs_.clear();
+    for (const ProgramInfo& pg : programs)
+        d_->programs_[pg.Id()] = pg;
+    return *this;
+}
+
+// Throws std::runtime_error if 'id' is unknown.
+ReadGroupInfo BamHeader::ReadGroup(const std::string& id) const
+{
+    const auto iter = d_->readGroups_.find(id);
+    if (iter == d_->readGroups_.cend())
+        throw std::runtime_error("read group ID not found");
+    return iter->second;
+}
+
+vector<string> BamHeader::ReadGroupIds(void) const
+{
+    vector<string> result;
+    result.reserve(d_->readGroups_.size());
+    for (const auto& entry : d_->readGroups_)
+        result.push_back(entry.first);
+    return result;
+}
+
+vector<ReadGroupInfo> BamHeader::ReadGroups(void) const
+{
+    vector<ReadGroupInfo> result;
+    result.reserve(d_->readGroups_.size());
+    for (const auto& entry : d_->readGroups_)
+        result.push_back(entry.second);
+    return result;
+}
+
+// Replaces all read group entries with 'readGroups', keyed by ID.
+BamHeader& BamHeader::ReadGroups(const vector<ReadGroupInfo>& readGroups)
+{
+    d_->readGroups_.clear();
+    for (const ReadGroupInfo& rg : readGroups)
+        d_->readGroups_[rg.Id()] = rg;
+    return *this;
+}
+
+// Returns the named sequence, or a default-constructed SequenceInfo if the
+// name is unknown.
+SequenceInfo BamHeader::Sequence(const std::string& name) const
+{
+    // TODO: SequenceId(name) throws if not found, should we do so here as well?
+
+    const auto iter = d_->sequenceIdLookup_.find(name);
+    if (iter == d_->sequenceIdLookup_.cend())
+        return SequenceInfo();
+    const int index = iter->second;
+    assert(index >= 0 && static_cast<size_t>(index) < d_->sequences_.size());
+    return d_->sequences_.at(index);
+}
+
+// Throws std::runtime_error if 'name' is unknown.
+int32_t BamHeader::SequenceId(const std::string& name) const
+{
+    const auto iter = d_->sequenceIdLookup_.find(name);
+    if (iter == d_->sequenceIdLookup_.cend())
+        throw std::runtime_error("sequence not found");
+    return iter->second;
+}
+
+vector<string> BamHeader::SequenceNames(void) const
+{
+    vector<string> result;
+    result.reserve(d_->sequences_.size());
+    for (const auto& seq : d_->sequences_)
+        result.push_back(seq.Name());
+    return result;
+}
+
+// Replaces the sequence list with 'sequences'.
+BamHeader& BamHeader::Sequences(const vector<SequenceInfo>& sequences)
+{
+    d_->sequences_.clear();
+    for (const SequenceInfo& seq : sequences)
+        AddSequence(seq);
+    return *this;
+}
+
+// Serializes the header as SAM text: @HD, then @SQ, @RG, @PG and @CO lines.
+// Missing @HD values fall back to the htslib version, "unknown" sort order
+// and the current PacBio BAM version.
+string BamHeader::ToSam(void) const
+{
+    // init stream
+    stringstream out("");
+
+    // @HD
+    const string& outputVersion   = (d_->version_.empty() ? string(hts_version()) : d_->version_);
+    const string& outputSortOrder = (d_->sortOrder_.empty() ? string("unknown") : d_->sortOrder_);
+    const string& outputPbBamVersion = (d_->pacbioBamVersion_.empty() ? internal::Version::Current.ToString()
+                                                                     : d_->pacbioBamVersion_);
+
+    out << internal::prefix_HD
+        << internal::MakeSamTag(internal::token_VN, outputVersion)
+        << internal::MakeSamTag(internal::token_SO, outputSortOrder)
+        << internal::MakeSamTag(internal::token_pb, outputPbBamVersion)
+        << endl;
+
+    // @SQ
+    for (const SequenceInfo& seq : d_->sequences_)
+        out << seq.ToSam() << endl;
+
+    // @RG
+    for (const auto& rgIter : d_->readGroups_)
+        out << rgIter.second.ToSam() << endl;
+
+    // @PG
+    for (const auto& progIter : d_->programs_)
+        out << progIter.second.ToSam() << endl;
+
+    // @CO
+    for (const string& comment : d_->comments_)
+        out << internal::prefix_CO << '\t' << comment << endl;
+
+    // return result
+    return out.str();
+}
diff --git a/src/BamReader.cpp b/src/BamReader.cpp
new file mode 100644
index 0000000..e10eeba
--- /dev/null
+++ b/src/BamReader.cpp
@@ -0,0 +1,195 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE.
+// THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file BamReader.cpp
+/// \brief Implements the BamReader class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/BamReader.h"
+#include "pbbam/Validator.h"
+#include "MemoryUtils.h"
+// NOTE(review): the five system include targets were lost in extraction; the
+// headers below are the ones this file's code visibly requires - confirm
+// against upstream.
+#include <htslib/bgzf.h>
+#include <htslib/sam.h>
+#include <cassert>
+#include <stdexcept>
+#include <string>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace PacBio::BAM::internal;
+using namespace std;
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// Private implementation: owns the open htslib file handle and the BamFile
+// it was opened from.
+struct BamReaderPrivate
+{
+public:
+    BamReaderPrivate(const BamFile& bamFile)
+        : htsFile_(nullptr)
+        , bamFile_(bamFile)
+    {
+        DoOpen();
+    }
+
+    BamReaderPrivate(BamFile&& bamFile)
+        : htsFile_(nullptr)
+        , bamFile_(std::move(bamFile))
+    {
+        DoOpen();
+    }
+
+    // Opens bamFile_ for reading; throws std::runtime_error on failure.
+    void DoOpen(void) {
+
+        // fetch file pointer
+        htsFile_.reset(sam_open(bamFile_.Filename().c_str(), "rb"));
+        if (!htsFile_)
+            throw std::runtime_error("could not open BAM file for reading");
+    }
+
+public:
+    // NOTE(review): template arguments were lost in extraction. samFile is
+    // what sam_open() returns; the deleter name is presumed to be the one
+    // declared in MemoryUtils.h - confirm.
+    std::unique_ptr<samFile, internal::HtslibFileDeleter> htsFile_;
+    BamFile bamFile_;
+};
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
+
+BamReader::BamReader(const string& fn)
+    : BamReader(BamFile(fn))
+{ }
+
+BamReader::BamReader(const BamFile& bamFile)
+    : d_(new internal::BamReaderPrivate(bamFile))
+{
+    // skip header
+    VirtualSeek(d_->bamFile_.FirstAlignmentOffset());
+}
+
+BamReader::BamReader(BamFile&& bamFile)
+    : d_(new internal::BamReaderPrivate(std::move(bamFile)))
+{
+    // skip header
+    VirtualSeek(d_->bamFile_.FirstAlignmentOffset());
+}
+
+BamReader::~BamReader(void) { }
+
+BGZF* BamReader::Bgzf(void) const
+{
+    assert(d_);
+    assert(d_->htsFile_);
+    assert(d_->htsFile_->fp.bgzf);
+    return d_->htsFile_->fp.bgzf;
+}
+
+const BamFile& BamReader::File(void) const
+{
+    assert(d_);
+    return d_->bamFile_;
+}
+
+std::string BamReader::Filename(void) const
+{
+    assert(d_);
+    return d_->bamFile_.Filename();
+}
+
+const BamHeader& BamReader::Header(void) const
+{
+    assert(d_);
+    return d_->bamFile_.Header();
+}
+
+// Reads the next record into 'record'.
+// Returns true on success and false at EOF / end of the requested data range.
+// Throws std::runtime_error, naming the file, if the data is corrupted.
+bool BamReader::GetNext(BamRecord& record)
+{
+    assert(Bgzf());
+    assert(internal::BamRecordMemory::GetRawData(record).get());
+
+    auto result = ReadRawData(Bgzf(), internal::BamRecordMemory::GetRawData(record).get());
+
+    // success
+    if (result >= 0) {
+        internal::BamRecordMemory::UpdateRecordTags(record);
+        record.header_ = Header();
+        record.ResetCachedPositions();
+
+#if PBBAM_AUTOVALIDATE
+        Validator::Validate(record);
+#endif
+        return true;
+    }
+
+    // EOF or end-of-data range (not an error)
+    else if (result == -1)
+        return false;
+
+    // error corrupted file
+    else {
+        auto errorMsg = string{"corrupted BAM file: "};
+        if (result == -2)
+            errorMsg += "probably truncated";
+        else if (result == -3)
+            errorMsg += "could not read BAM record's core data";
+        else if (result == -4)
+            errorMsg += "could not read BAM record's variable-length data";
+        else
+            errorMsg += "unknown reason " + to_string(result);
+        errorMsg += string{" ("};
+        errorMsg += Filename();
+        errorMsg += string{")"};
+        throw std::runtime_error{errorMsg};
+    }
+}
+
+// Default raw read: the next record in file order. Returns the bam_read1()
+// status code.
+int BamReader::ReadRawData(BGZF* bgzf, bam1_t* b)
+{
+    return bam_read1(bgzf, b);
+}
+
+// Seeks to a BGZF virtual offset; throws std::runtime_error on failure.
+void BamReader::VirtualSeek(int64_t virtualOffset)
+{
+    auto result = bgzf_seek(Bgzf(), virtualOffset, SEEK_SET);
+    if (result != 0)
+        throw std::runtime_error("Failed to seek in BAM file");
+}
+
+// Returns the current BGZF virtual offset.
+int64_t BamReader::VirtualTell(void) const
+{
+    return bgzf_tell(Bgzf());
+}
diff --git a/src/BamRecord.cpp b/src/BamRecord.cpp
new file mode 100644
index 0000000..ae5253e
--- /dev/null
+++ b/src/BamRecord.cpp
@@ -0,0 +1,2457 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED.
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file BamRecord.cpp +/// \brief Implements the BamRecord class. +// +// Author: Derek Barnett + +#include "pbbam/BamRecord.h" +#include "pbbam/virtual/VirtualRegionTypeMap.h" +#include "pbbam/ZmwTypeMap.h" +#include "AssertUtils.h" +#include "BamRecordTags.h" +#include "MemoryUtils.h" +#include "Pulse2BaseCache.h" +#include "SequenceUtils.h" +#include +#include +#include +#include + +using namespace PacBio; +using namespace PacBio::BAM; +using namespace std; + +namespace PacBio { +namespace BAM { +namespace internal { + +// record type names +static const string recordTypeName_ZMW = "ZMW"; +static const string recordTypeName_Polymerase = "POLYMERASE"; +static const string recordTypeName_HqRegion = "HQREGION"; +static const string recordTypeName_Subread = "SUBREAD"; +static const string recordTypeName_CCS = "CCS"; +static const string recordTypeName_Scrap = "SCRAP"; +static const string recordTypeName_Unknown = "UNKNOWN"; + +static +int32_t HoleNumberFromName(const string& fullName) +{ + const auto mainTokens = Split(fullName, '/'); + if (mainTokens.size() != 3) + throw std::runtime_error("malformed record name"); + return stoi(mainTokens.at(1)); +} + +static +Position QueryEndFromName(const string& fullName) +{ + const auto mainTokens = Split(fullName, '/'); + if (mainTokens.size() != 3) + throw std::runtime_error("malformed record name"); + const auto queryTokens = Split(mainTokens.at(2), '_'); 
+ if (queryTokens.size() != 2) + throw std::runtime_error("malformed record name"); + return stoi(queryTokens.at(1)); +} + +static +Position QueryStartFromName(const string& fullName) +{ + const auto mainTokens = Split(fullName, '/'); + if (mainTokens.size() != 3) + throw std::runtime_error("malformed record name"); + const auto queryTokens = Split(mainTokens.at(2), '_'); + if (queryTokens.size() != 2) + throw std::runtime_error("malformed record name"); + return stoi(queryTokens.at(0)); +} + +static inline +string Label(const BamRecordTag tag) +{ + return BamRecordTags::LabelFor(tag); +} + +static +BamRecordImpl* CreateOrEdit(const BamRecordTag tag, + const Tag& value, + BamRecordImpl* impl) +{ + if (impl->HasTag(tag)) + impl->EditTag(tag, value); + else + impl->AddTag(tag, value); + return impl; +} + +static +pair AlignedOffsets(const BamRecord& record, + const int seqLength) +{ + int32_t startOffset = 0; + int32_t endOffset = seqLength; + + PBBAM_SHARED_PTR b = internal::BamRecordMemory::GetRawData(record); + uint32_t* cigarData = bam_get_cigar(b.get()); + const size_t numCigarOps = b->core.n_cigar; + if (numCigarOps > 0) { + + // start offset + for (size_t i = 0; i < numCigarOps; ++i) { + const CigarOperationType type = static_cast(bam_cigar_op(cigarData[i])); + if (type == CigarOperationType::HARD_CLIP) { + if (startOffset != 0 && startOffset != seqLength) { + startOffset = -1; + break; + } + } + else if (type == CigarOperationType::SOFT_CLIP) + startOffset += bam_cigar_oplen(cigarData[i]); + else + break; + } + + // end offset + for (int i = numCigarOps-1; i >= 0; --i) { + const CigarOperationType type = static_cast(bam_cigar_op(cigarData[i])); + if (type == CigarOperationType::HARD_CLIP) { + if (endOffset != 0 && endOffset != seqLength) { + endOffset = -1; + break; + } + } + else if (type == CigarOperationType::SOFT_CLIP) + endOffset -= bam_cigar_oplen(cigarData[i]); + else + break; + + } + + if (endOffset == 0) + endOffset = seqLength; + } + return 
std::make_pair(startOffset, endOffset); +} + +template +T Clip(const T& input, const size_t pos, const size_t len) +{ + if (input.empty()) + return T(); + return T{ input.cbegin() + pos, + input.cbegin() + pos + len }; +} + +template +T ClipPulse(const T& input, + internal::Pulse2BaseCache* p2bCache, + const size_t pos, + const size_t len) +{ + assert(p2bCache); + if (input.empty()) + return T(); + + // find start + size_t start = p2bCache->FindFirst(); + size_t basesSeen = 0; + while (basesSeen < pos) { + start = p2bCache->FindNext(start); + ++basesSeen; + } + + // find end + size_t end = start; + size_t seen = 1; + while (seen < len) { + end = p2bCache->FindNext(end); + ++seen; + } + + // return clipped + return T{ input.cbegin() + start, + input.cbegin() + end + 1 }; +} + +template< class InputIt, class Size, class OutputIt> +OutputIt Move_N(InputIt first, Size count, OutputIt result) +{ + return std::move(first, first+count, result); +} + +template +static void ClipAndGapify(const BamRecordImpl& impl, + const bool aligned, + const bool exciseSoftClips, + F* seq, + N paddingNullValue, + N deletionNullValue) +{ + assert(seq); + + const bool clipOrGapRequested = aligned || exciseSoftClips; + if (impl.IsMapped() && clipOrGapRequested) + { + // determine final container length + auto incrementsOutputLength = [](const CigarOperationType type, + const bool aligned, + const bool exciseSoftClips) + { + if (type == CigarOperationType::HARD_CLIP || + type == CigarOperationType::REFERENCE_SKIP) + { + return false; + } + else if (type == CigarOperationType::SOFT_CLIP && exciseSoftClips) + { + return false; + } + else if (!aligned && (type == CigarOperationType::DELETION || + type == CigarOperationType::PADDING)) + { + return false; + } + else + return true; + }; + + size_t outputLength = 0; + const auto cigar = impl.CigarData(); + for (const CigarOperation& op : cigar) { + if (incrementsOutputLength(op.Type(), aligned, exciseSoftClips)) + outputLength += op.Length(); + } + 
+ // move original data to temp, prep output container size + F originalSeq = std::move(*seq); + seq->resize(outputLength); + + // apply CIGAR ops + size_t srcIndex = 0; + size_t dstIndex = 0; + for (const CigarOperation& op : cigar) { + const auto opType = op.Type(); + const auto opLength = op.Length(); + + // nothing to do for hard-clipped & ref-skipped positions + if (opType == CigarOperationType::HARD_CLIP || + opType == CigarOperationType::REFERENCE_SKIP) + { + continue; + } + + // maybe skip soft-clipped positions + else if (opType == CigarOperationType::SOFT_CLIP) { + if (exciseSoftClips) + srcIndex += opLength; + else { + Move_N(originalSeq.begin() + srcIndex, + opLength, + seq->begin() + dstIndex); + srcIndex += opLength; + dstIndex += opLength; + } + } + + // maybe add deletion/padding values + else if (aligned && opType == CigarOperationType::DELETION) { + for (size_t i = 0; i < opLength; ++i) + (*seq)[dstIndex++] = deletionNullValue; + } + else if (aligned && opType == CigarOperationType::PADDING) { + for (size_t i = 0; i < opLength; ++i) + (*seq)[dstIndex++] = paddingNullValue; + } + + // all other CIGAR ops + else { + Move_N(originalSeq.begin() + srcIndex, + opLength, + seq->begin() + dstIndex); + srcIndex += opLength; + dstIndex += opLength; + } + } + } +} + +static inline +void ClipAndGapifyBases(const BamRecordImpl& impl, + const bool aligned, + const bool exciseSoftClips, + string* seq) +{ + ClipAndGapify(impl, aligned, exciseSoftClips, + seq, char('*'), char('-')); +} + +static inline +void ClipAndGapifyFrames(const BamRecordImpl& impl, + const bool aligned, + const bool exciseSoftClips, + Frames* frames) +{ + assert(frames); + std::vector data = std::move(frames->Data()); + ClipAndGapify, uint16_t>(impl, aligned, exciseSoftClips, + &data, 0, 0); + frames->Data(data); +} + +static inline +void ClipAndGapifyPhotons(const BamRecordImpl& impl, + const bool aligned, + const bool exciseSoftClips, + std::vector* data) +{ + ClipAndGapify, float>(impl, 
aligned, exciseSoftClips, + data, 0.0, 0.0); +} + +static inline +void ClipAndGapifyQualities(const BamRecordImpl& impl, + const bool aligned, + const bool exciseSoftClips, + QualityValues* quals) +{ + ClipAndGapify(impl, aligned, exciseSoftClips, + quals, QualityValue(0), QualityValue(0)); +} + +static inline +void ClipAndGapifyUInts(const BamRecordImpl& impl, + const bool aligned, + const bool exciseSoftClips, + std::vector* data) +{ + ClipAndGapify, uint32_t>(impl, aligned, exciseSoftClips, + data, 0, 0); +} + +static +RecordType NameToType(const string& name) +{ + if (name == recordTypeName_Subread) + return RecordType::SUBREAD; + if (name == recordTypeName_ZMW || name == recordTypeName_Polymerase) + return RecordType::ZMW; + if (name == recordTypeName_HqRegion) + return RecordType::HQREGION; + if (name == recordTypeName_CCS) + return RecordType::CCS; + if (name == recordTypeName_Scrap) + return RecordType::SCRAP; + return RecordType::UNKNOWN; +} + +static +void OrientBasesAsRequested(string* bases, + Orientation current, + Orientation requested, + bool isReverseStrand, + bool isPulse) +{ + assert(bases); + if (current != requested && isReverseStrand) { + if (isPulse) + internal::ReverseComplementCaseSens(*bases); + else + internal::ReverseComplement(*bases); + } +} + +template inline +void OrientTagDataAsRequested(Container* data, + Orientation current, + Orientation requested, + bool isReverseStrand) +{ + assert(data); + if (current != requested && isReverseStrand) + std::reverse(data->begin(), data->end()); +} + +static inline +bool ConsumesQuery(const CigarOperationType type) +{ return (bam_cigar_type(static_cast(type)) & 0x1) != 0; } + +static inline +bool ConsumesReference(const CigarOperationType type) +{ return (bam_cigar_type(static_cast(type)) & 0x2) != 0; } + +} // namespace internal +} // namespace BAM +} // namespace PacBio + +const float BamRecord::photonFactor = 10.0; + +BamRecord::BamRecord(void) + : alignedStart_(PacBio::BAM::UnmappedPosition) 
+ , alignedEnd_(PacBio::BAM::UnmappedPosition) + , p2bCache_(nullptr) +{ } + +BamRecord::BamRecord(const BamHeader& header) + : header_(header) + , alignedStart_(PacBio::BAM::UnmappedPosition) + , alignedEnd_(PacBio::BAM::UnmappedPosition) + , p2bCache_(nullptr) +{ } + +BamRecord::BamRecord(const BamRecordImpl& impl) + : impl_(impl) + , alignedStart_(PacBio::BAM::UnmappedPosition) + , alignedEnd_(PacBio::BAM::UnmappedPosition) + , p2bCache_(nullptr) +{ } + +BamRecord::BamRecord(BamRecordImpl&& impl) + : impl_(std::move(impl)) + , alignedStart_(PacBio::BAM::UnmappedPosition) + , alignedEnd_(PacBio::BAM::UnmappedPosition) + , p2bCache_(nullptr) +{ } + +BamRecord::BamRecord(const BamRecord& other) + : impl_(other.impl_) + , header_(other.header_) + , alignedStart_(other.alignedStart_) + , alignedEnd_(other.alignedEnd_) + , p2bCache_(nullptr) // just reset, for now at least +{ } + +BamRecord::BamRecord(BamRecord&& other) + : impl_(std::move(other.impl_)) + , header_(std::move(other.header_)) + , alignedStart_(std::move(other.alignedStart_)) + , alignedEnd_(std::move(other.alignedEnd_)) + , p2bCache_(std::move(other.p2bCache_)) +{ } + +BamRecord& BamRecord::operator=(const BamRecord& other) +{ + impl_ = other.impl_; + header_ = other.header_; + alignedStart_ = other.alignedStart_; + alignedEnd_ = other.alignedEnd_; + p2bCache_.reset(nullptr); // just reset, for now at least + return *this; +} + +BamRecord& BamRecord::operator=(BamRecord&& other) +{ + impl_ = std::move(other.impl_); + header_ = std::move(other.header_); + alignedStart_ = std::move(other.alignedStart_); + alignedEnd_ = std::move(other.alignedEnd_); + p2bCache_ = std::move(other.p2bCache_); + return *this; +} + +BamRecord::~BamRecord(void) { } + +Position BamRecord::AlignedEnd(void) const +{ + if (alignedEnd_ == PacBio::BAM::UnmappedPosition) + CalculateAlignedPositions(); + return alignedEnd_; +} + +Position BamRecord::AlignedStart(void) const +{ + if (alignedStart_ == PacBio::BAM::UnmappedPosition) + 
CalculateAlignedPositions(); + return alignedStart_; +} + +Strand BamRecord::AlignedStrand(void) const +{ return impl_.IsReverseStrand() ? Strand::REVERSE : Strand::FORWARD; } + +QualityValues BamRecord::AltLabelQV(Orientation orientation, + bool aligned, + bool exciseSoftClips, + PulseBehavior pulseBehavior) const +{ + return FetchQualities(BamRecordTag::ALT_LABEL_QV, + orientation, + aligned, + exciseSoftClips, + pulseBehavior); +} + +BamRecord& BamRecord::AltLabelQV(const QualityValues& altLabelQVs) +{ + internal::CreateOrEdit(BamRecordTag::ALT_LABEL_QV, + altLabelQVs.Fastq(), + &impl_); + return *this; +} + +string BamRecord::AltLabelTag(Orientation orientation, + bool aligned, + bool exciseSoftClips, + PulseBehavior pulseBehavior) const +{ + return FetchBases(BamRecordTag::ALT_LABEL_TAG, + orientation, + aligned, + exciseSoftClips, + pulseBehavior); +} + +BamRecord& BamRecord::AltLabelTag(const string& tags) +{ + internal::CreateOrEdit(BamRecordTag::ALT_LABEL_TAG, + tags, + &impl_); + return *this; +} + +int16_t BamRecord::BarcodeForward(void) const +{ return Barcodes().first; } + +int16_t BamRecord::BarcodeReverse(void) const +{ return Barcodes().second; } + +uint8_t BamRecord::BarcodeQuality(void) const +{ + const auto tagName = internal::BamRecordTags::LabelFor(BamRecordTag::BARCODE_QUALITY); + const auto bq = impl_.TagValue(tagName); + if (bq.IsNull()) + return 0; // ?? "missing" value for tags ?? should we consider boost::optional for these kind of guys ?? 
+ return bq.ToUInt8(); +} + +BamRecord& BamRecord::BarcodeQuality(const uint8_t quality) +{ + internal::CreateOrEdit(BamRecordTag::BARCODE_QUALITY, + quality, + &impl_); + return *this; +} + +std::pair BamRecord::Barcodes(void) const +{ + const auto tagName = internal::BamRecordTags::LabelFor(BamRecordTag::BARCODES); + const Tag& bc = impl_.TagValue(tagName); + if (bc.IsNull()) + throw std::runtime_error("barcode tag (bc) was requested but is missing"); + + // NOTE: barcodes are still stored, per the spec, as uint16, even though + // we're now using them as int16_t in the API (bug 31511) + // + if (!bc.IsUInt16Array()) + throw std::runtime_error("barcode tag (bc) is malformed: should be a uint16_t array of size==2."); + const auto bcArray = bc.ToUInt16Array(); + if (bcArray.size() != 2) + throw std::runtime_error("barcode tag (bc) is malformed: should be a uint16_t array of size==2."); + + return std::make_pair(boost::numeric_cast(bcArray[0]), + boost::numeric_cast(bcArray[1])); +} + +BamRecord& BamRecord::Barcodes(const std::pair& barcodeIds) +{ + const vector data = + { + boost::numeric_cast(barcodeIds.first), + boost::numeric_cast(barcodeIds.second) + }; + internal::CreateOrEdit(BamRecordTag::BARCODES, + data, + &impl_); + return *this; +} + +void BamRecord::CalculateAlignedPositions(void) const +{ + // reset + ResetCachedPositions(); + + // skip if unmapped, or has no queryStart/End + if (!impl_.IsMapped()) + return; + + // get the query start/end + const size_t seqLength = impl_.SequenceLength(); + const RecordType type = Type(); + const Position qStart = (type == RecordType::CCS) ? Position(0) : QueryStart(); + const Position qEnd = (type == RecordType::CCS) ? 
Position(seqLength) : QueryEnd(); + + if (qStart == PacBio::BAM::UnmappedPosition || qEnd == PacBio::BAM::UnmappedPosition) + return; + + // determine clipped end ranges + const std::pair alignedOffsets = internal::AlignedOffsets(*this, seqLength); + const int32_t startOffset = alignedOffsets.first; + const int32_t endOffset = alignedOffsets.second; + if (endOffset == -1 || startOffset == -1) + return; // TODO: handle error more?? + + // store aligned positions (polymerase read coordinates) + if (impl_.IsReverseStrand()) { + alignedStart_ = qStart + (seqLength - endOffset); + alignedEnd_ = qEnd - startOffset; + } + else { + alignedStart_ = qStart + startOffset; + alignedEnd_ = qEnd - (seqLength - endOffset); + } +} + +void BamRecord::CalculatePulse2BaseCache(void) const +{ + // skip already calculated + if (p2bCache_) + return; + + // else try to calculate p2b cache. + if (!HasPulseCall()) + throw std::runtime_error("BamRecord cannot calculate pulse2base mapping without 'pc' tag."); + const auto pulseCalls = FetchBases(BamRecordTag::PULSE_CALL, + Orientation::NATIVE, + false, + false, + PulseBehavior::ALL); + p2bCache_.reset(new internal::Pulse2BaseCache{ pulseCalls }); +} + +Cigar BamRecord::CigarData(bool exciseAllClips) const +{ + auto isClippingOp = [](const CigarOperation& op) + { + const auto type = op.Type(); + return type == CigarOperationType::SOFT_CLIP || + type == CigarOperationType::HARD_CLIP; + }; + + auto cigar = impl_.CigarData(); + if (exciseAllClips) { + cigar.erase(std::remove_if(cigar.begin(), + cigar.end(), + isClippingOp), + cigar.end()); + } + return cigar; +} + +BamRecord& BamRecord::Clip(const ClipType clipType, + const Position start, + const Position end) +{ + switch (clipType) + { + case ClipType::CLIP_NONE : return *this; + case ClipType::CLIP_TO_QUERY : return ClipToQuery(start, end); + case ClipType::CLIP_TO_REFERENCE : return ClipToReference(start, end); + default: + throw std::runtime_error("unsupported clip type requested"); + } +} 
+ +void BamRecord::ClipFields(const size_t clipFrom, + const size_t clipLength) +{ + const bool isForwardStrand = (AlignedStrand() == Strand::FORWARD); + + // clip seq, quals + string sequence = internal::Clip(Sequence(Orientation::NATIVE), clipFrom, clipLength); + QualityValues qualities = internal::Clip(Qualities(Orientation::NATIVE), clipFrom, clipLength); + if (!isForwardStrand) { + internal::ReverseComplement(sequence); + internal::Reverse(qualities); + } + impl_.SetSequenceAndQualities(sequence, qualities.Fastq()); + + // update BAM tags + TagCollection tags = impl_.Tags(); + if (HasDeletionQV()) + tags[internal::Label(BamRecordTag::DELETION_QV)] = internal::Clip(DeletionQV(Orientation::NATIVE), clipFrom, clipLength).Fastq(); + if (HasInsertionQV()) + tags[internal::Label(BamRecordTag::INSERTION_QV)] = internal::Clip(InsertionQV(Orientation::NATIVE), clipFrom, clipLength).Fastq(); + if (HasMergeQV()) + tags[internal::Label(BamRecordTag::MERGE_QV)] = internal::Clip(MergeQV(Orientation::NATIVE), clipFrom, clipLength).Fastq(); + if (HasSubstitutionQV()) + tags[internal::Label(BamRecordTag::SUBSTITUTION_QV)] = internal::Clip(SubstitutionQV(Orientation::NATIVE), clipFrom, clipLength).Fastq(); + if (HasIPD()) + tags[internal::Label(BamRecordTag::IPD)] = internal::Clip(IPD(Orientation::NATIVE).Data(), clipFrom, clipLength); + if (HasPulseWidth()) + tags[internal::Label(BamRecordTag::PULSE_WIDTH)] = internal::Clip(PulseWidth(Orientation::NATIVE).Data(), clipFrom, clipLength); + if (HasDeletionTag()) + tags[internal::Label(BamRecordTag::DELETION_TAG)] = internal::Clip(DeletionTag(Orientation::NATIVE), clipFrom, clipLength); + if (HasSubstitutionTag()) + tags[internal::Label(BamRecordTag::SUBSTITUTION_TAG)] = internal::Clip(SubstitutionTag(Orientation::NATIVE), clipFrom, clipLength); + + // internal BAM tags + if (HasPulseCall()) { + + // ensure p2bCache initialized + CalculatePulse2BaseCache(); + internal::Pulse2BaseCache* p2bCache = p2bCache_.get(); + + if 
(HasAltLabelQV()) + tags[internal::Label(BamRecordTag::ALT_LABEL_QV)] = internal::ClipPulse(AltLabelQV(Orientation::NATIVE), p2bCache, clipFrom, clipLength).Fastq(); + if (HasLabelQV()) + tags[internal::Label(BamRecordTag::LABEL_QV)] = internal::ClipPulse(LabelQV(Orientation::NATIVE), p2bCache, clipFrom, clipLength).Fastq(); + if (HasPulseMergeQV()) + tags[internal::Label(BamRecordTag::PULSE_MERGE_QV)] = internal::ClipPulse(PulseMergeQV(Orientation::NATIVE), p2bCache, clipFrom, clipLength).Fastq(); + if (HasAltLabelTag()) + tags[internal::Label(BamRecordTag::ALT_LABEL_TAG)] = internal::ClipPulse(AltLabelTag(Orientation::NATIVE), p2bCache, clipFrom, clipLength); + if (HasPulseCall()) + tags[internal::Label(BamRecordTag::PULSE_CALL)] = internal::ClipPulse(PulseCall(Orientation::NATIVE), p2bCache, clipFrom, clipLength); + if (HasPkmean()) + tags[internal::Label(BamRecordTag::PKMEAN)] = EncodePhotons(internal::ClipPulse(Pkmean(Orientation::NATIVE), p2bCache, clipFrom, clipLength)); + if (HasPkmid()) + tags[internal::Label(BamRecordTag::PKMID)] = EncodePhotons(internal::ClipPulse(Pkmid(Orientation::NATIVE), p2bCache, clipFrom, clipLength)); + if (HasPkmean2()) + tags[internal::Label(BamRecordTag::PKMEAN_2)] = EncodePhotons(internal::ClipPulse(Pkmean2(Orientation::NATIVE), p2bCache, clipFrom, clipLength)); + if (HasPkmid2()) + tags[internal::Label(BamRecordTag::PKMID_2)] = EncodePhotons(internal::ClipPulse(Pkmid2(Orientation::NATIVE), p2bCache, clipFrom, clipLength)); + if (HasPrePulseFrames()) + tags[internal::Label(BamRecordTag::PRE_PULSE_FRAMES)] = internal::ClipPulse(PrePulseFrames(Orientation::NATIVE).Data(), p2bCache, clipFrom, clipLength); + if (HasPulseCallWidth()) + tags[internal::Label(BamRecordTag::PULSE_CALL_WIDTH)] = internal::ClipPulse(PulseCallWidth(Orientation::NATIVE).Data(), p2bCache, clipFrom, clipLength); + if (HasStartFrame()) + tags[internal::Label(BamRecordTag::START_FRAME)] = internal::ClipPulse(StartFrame(Orientation::NATIVE), p2bCache, clipFrom, 
clipLength); + + } + + impl_.Tags(tags); +} + +BamRecord& BamRecord::ClipToQuery(const Position start, + const Position end) +{ + // cache original coords, skip out if clip not needed + const size_t seqLength = impl_.SequenceLength(); + const RecordType type = Type(); + const Position origQStart = (type == RecordType::CCS) ? Position(0) : QueryStart(); + const Position origQEnd = (type == RecordType::CCS) ? Position(seqLength) : QueryEnd(); + if (start <= origQStart && end >= origQEnd) + return *this; + + // determine new offsets into data + const size_t startOffset = start - origQStart; + const size_t endOffset = origQEnd - end; + + // maybe update CIGAR & aligned position + if (IsMapped()) { + + // fetch a 'working copy' of CIGAR data + Cigar cigar = impl_.CigarData(); + + // clip leading CIGAR ops + size_t referencePositionOffset = 0; + size_t remaining = startOffset; + while (remaining > 0 && !cigar.empty()) { + CigarOperation& firstOp = cigar.front(); + const size_t firstOpLength = firstOp.Length(); + const bool consumesQuery = internal::ConsumesQuery(firstOp.Type()); + const bool consumesRef = internal::ConsumesReference(firstOp.Type()); + + // if (!consumesQuery) + // just pop (e.g. deletion) ? + // else { + // check bounds, like clip to reference ? 
+ // } + + // CIGAR op ends at or before clip + if (firstOpLength <= remaining) { + cigar.erase(cigar.begin()); + if (consumesQuery) + remaining -= firstOpLength; + if (consumesRef) + referencePositionOffset += firstOpLength; + } + + // CIGAR op straddles clip + else { + firstOp.Length(firstOpLength - remaining); + if (consumesRef) + referencePositionOffset += remaining; + remaining = 0; + } + } + + // clip trailing CIGAR ops + remaining = endOffset; + while (remaining > 0 && !cigar.empty()) { + CigarOperation& lastOp = cigar.back(); + const size_t lastOpLength = lastOp.Length(); + const bool consumesQuery = internal::ConsumesQuery(lastOp.Type()); + + // CIGAR op ends at or after clip + if (lastOpLength <= remaining) { + cigar.pop_back(); + if (consumesQuery) + remaining -= lastOpLength; + } + + // CIGAR op straddles clip + else { + lastOp.Length(lastOpLength - remaining); + remaining = 0; + } + } + + // update CIGAR & position + impl_.CigarData(cigar); + const Position origPosition = impl_.Position(); + impl_.Position(origPosition + referencePositionOffset); + } + + // clip SEQ, QUAL, & tags + const size_t clipFrom = startOffset; + const size_t clipLength = (end - start); + ClipFields(clipFrom, clipLength); + + // update query start/end + // TODO: update name to reflect new QS/QE ??? + internal::CreateOrEdit(BamRecordTag::QUERY_START, start, &impl_); + internal::CreateOrEdit(BamRecordTag::QUERY_END, end, &impl_); +// UpdateName(); + + // reset any cached aligned start/end + ResetCachedPositions(); + return *this; +} + +BamRecord& BamRecord::ClipToReference(const Position start, + const Position end) +{ + // skip if not mapped, clipping to reference doesn't make sense + // or should we even consider throwing here? + if (!IsMapped()) + return *this; + + const bool isForwardStrand = (AlignedStrand() == Strand::FORWARD); + return (isForwardStrand ? 
ClipToReferenceForward(start, end) + : ClipToReferenceReverse(start, end)); +} + +BamRecord& BamRecord::ClipToReferenceForward(const PacBio::BAM::Position start, + const PacBio::BAM::Position end) +{ + assert(IsMapped()); + assert(AlignedStrand() == Strand::FORWARD); + + // cache original coords + const size_t seqLength = impl_.SequenceLength(); + const RecordType type = Type(); + const Position origQStart = (type == RecordType::CCS) ? Position(0) : QueryStart(); + const Position origQEnd = (type == RecordType::CCS) ? Position(seqLength) : QueryEnd(); + const Position origTStart = ReferenceStart(); + const Position origTEnd = ReferenceEnd(); + assert(AlignedStart() >= origQStart); + assert(AlignedEnd() <= origQEnd); + + // skip if already within requested clip range + if (start <= origTStart && end >= origTEnd) + return *this; + + const Position newTStart = std::max(origTStart, start); + const Position newTEnd = std::min(origTEnd, end); + + // fetch a 'working copy' of CIGAR data + Cigar cigar = impl_.CigarData(); + + // we're going to skip query sequence outside aligned region + size_t queryPosRemovedFront = 0; + size_t queryPosRemovedBack = 0; + + // ------------------------ + // clip leading CIGAR ops + // ------------------------ + + size_t remaining = newTStart - origTStart; + while (remaining > 0 && !cigar.empty()) { + CigarOperation& firstOp = cigar.front(); + const size_t firstOpLength = firstOp.Length(); + const bool consumesQuery = internal::ConsumesQuery(firstOp.Type()); + const bool consumesRef = internal::ConsumesReference(firstOp.Type()); + + if (!consumesRef) { + + // e.g. 
softclip - just pop it completely + cigar.erase(cigar.begin()); + if (consumesQuery) + queryPosRemovedFront += firstOpLength; + + } else { + assert(consumesRef); + + // CIGAR ends at or before clip + if (firstOpLength <= remaining) { + cigar.erase(cigar.begin()); + if (consumesQuery) + queryPosRemovedFront += firstOpLength; + if (consumesRef) + remaining -= firstOpLength; + } + + // CIGAR straddles clip + else { + assert(firstOpLength > remaining); + firstOp.Length(firstOpLength - remaining); + if (consumesQuery) + queryPosRemovedFront += remaining; + remaining = 0; + } + } + } + + // ------------------------- + // clip trailing CIGAR ops + // ------------------------- + + remaining = origTEnd - newTEnd; + while (remaining > 0 && !cigar.empty()) { + CigarOperation& lastOp = cigar.back(); + const size_t lastOpLength = lastOp.Length(); + const bool consumesQuery = internal::ConsumesQuery(lastOp.Type()); + const bool consumesRef = internal::ConsumesReference(lastOp.Type()); + + if (!consumesRef) { + + // e.g. 
softclip - just pop it completely + cigar.pop_back(); + if (consumesQuery) + queryPosRemovedBack += lastOpLength; + + } else { + assert(consumesRef); + + // CIGAR ends at or after clip + if (lastOpLength <= remaining) { + cigar.pop_back(); + if (consumesQuery) + queryPosRemovedBack += lastOpLength; + if (consumesRef) + remaining -= lastOpLength; + } + + // CIGAR straddles clip + else { + assert(lastOpLength > remaining); + lastOp.Length(lastOpLength - remaining); + if (consumesQuery) + queryPosRemovedBack += remaining; + remaining = 0; + } + } + } + + // update CIGAR and position + impl_.CigarData(cigar); + impl_.Position(newTStart); + + // clip SEQ, QUAL, tags + const Position qStart = origQStart + queryPosRemovedFront; + const Position qEnd = origQEnd - queryPosRemovedBack; + const size_t clipFrom = queryPosRemovedFront; + const size_t clipLength = qEnd - qStart; + ClipFields(clipFrom, clipLength); + + // update query start/end + internal::CreateOrEdit(BamRecordTag::QUERY_START, qStart, &impl_); + internal::CreateOrEdit(BamRecordTag::QUERY_END, qEnd, &impl_); +// UpdateName(); + + // reset any cached aligned start/end + ResetCachedPositions(); + return *this; +} + +BamRecord& BamRecord::ClipToReferenceReverse(const PacBio::BAM::Position start, + const PacBio::BAM::Position end) +{ + assert(IsMapped()); + assert(AlignedStrand() == Strand::REVERSE); + + // cache original coords + const size_t seqLength = impl_.SequenceLength(); + const RecordType type = Type(); + const Position origQStart = (type == RecordType::CCS) ? Position(0) : QueryStart(); + const Position origQEnd = (type == RecordType::CCS) ? 
Position(seqLength) : QueryEnd(); + const Position origTStart = ReferenceStart(); + const Position origTEnd = ReferenceEnd(); + + // skip if already within requested clip range + if (start <= origTStart && end >= origTEnd) + return *this; + assert(AlignedStart() >= origQStart); + assert(AlignedEnd() <= origQEnd); + + const Position newTStart = std::max(origTStart, start); + const Position newTEnd = std::min(origTEnd, end); + + Cigar cigar = impl_.CigarData(); + + size_t queryPosRemovedFront = 0; + size_t queryPosRemovedBack = 0; + + // update CIGAR - clip front ops, then clip back ops + size_t remaining = newTStart - origTStart; + while (remaining > 0 && !cigar.empty()) { + CigarOperation& firstOp = cigar.front(); + const CigarOperationType firstOpType = firstOp.Type(); + const size_t firstOpLength = firstOp.Length(); + const bool consumesQuery = internal::ConsumesQuery(firstOpType); + const bool consumesRef = internal::ConsumesReference(firstOpType); + + if (!consumesRef) { + + // e.g. softclip - just pop it completely + cigar.erase(cigar.begin()); + if (consumesQuery) + queryPosRemovedBack += firstOpLength; + + } else { + assert(consumesRef); + + // CIGAR ends at or before clip + if (firstOpLength <= remaining) { + cigar.erase(cigar.begin()); + if (consumesQuery) + queryPosRemovedBack += firstOpLength; + if (consumesRef) + remaining -= firstOpLength; + } + + // CIGAR straddles clip + else { + assert(firstOpLength > remaining); + firstOp.Length(firstOpLength - remaining); + if (consumesQuery) + queryPosRemovedBack += remaining; + remaining = 0; + } + } + } + + remaining = origTEnd - newTEnd; + while (remaining > 0 && !cigar.empty()) { + CigarOperation& lastOp = cigar.back(); + const CigarOperationType lastOpType = lastOp.Type(); + const size_t lastOpLength = lastOp.Length(); + const bool consumesQuery = internal::ConsumesQuery(lastOpType); + const bool consumesRef = internal::ConsumesReference(lastOpType); + + if (!consumesRef) { + + // e.g. 
softclip - just pop it completely + cigar.pop_back(); + if (consumesQuery) + queryPosRemovedFront += lastOpLength; + + } else { + assert(consumesRef); + + // CIGAR ends at or before clip + if (lastOpLength <= remaining) { + cigar.pop_back(); + if (consumesQuery) + queryPosRemovedFront += lastOpLength; + if (consumesRef) + remaining -= lastOpLength; + } + + // CIGAR straddles clip + else { + assert(lastOpLength > remaining); + lastOp.Length(lastOpLength - remaining); + if (consumesQuery) + queryPosRemovedFront += remaining; + remaining = 0; + } + } + } + impl_.CigarData(cigar); + + // update aligned reference position + impl_.Position(newTStart); + + // clip SEQ, QUAL, tags + const Position qStart = origQStart + queryPosRemovedFront; + const Position qEnd = origQEnd - queryPosRemovedBack; + const size_t clipFrom = queryPosRemovedFront; + const size_t clipLength = qEnd - qStart; + ClipFields(clipFrom, clipLength); + + // update query start/end + internal::CreateOrEdit(BamRecordTag::QUERY_START, qStart, &impl_); + internal::CreateOrEdit(BamRecordTag::QUERY_END, qEnd, &impl_); +// UpdateName(); + + // reset any cached aligned start/end + ResetCachedPositions(); + return *this; +} + +QualityValues BamRecord::DeletionQV(Orientation orientation, + bool aligned, + bool exciseSoftClips) const +{ + return FetchQualities(BamRecordTag::DELETION_QV, + orientation, + aligned, + exciseSoftClips); +} + +BamRecord& BamRecord::DeletionQV(const QualityValues& deletionQVs) +{ + internal::CreateOrEdit(BamRecordTag::DELETION_QV, + deletionQVs.Fastq(), + &impl_); + return *this; +} + + +string BamRecord::DeletionTag(Orientation orientation, + bool aligned, + bool exciseSoftClips) const +{ + return FetchBases(BamRecordTag::DELETION_TAG, + orientation, + aligned, + exciseSoftClips); +} + +BamRecord& BamRecord::DeletionTag(const string& tags) +{ + internal::CreateOrEdit(BamRecordTag::DELETION_TAG, + tags, + &impl_); + return *this; +} + +vector +BamRecord::EncodePhotons(const vector& data) 
+{ + vector encoded; + encoded.reserve(data.size()); + for (const auto& d : data) + encoded.emplace_back(d * photonFactor); + return encoded; +} + +string BamRecord::FetchBasesRaw(const BamRecordTag tag) const +{ + const Tag& seqTag = impl_.TagValue(tag); + return seqTag.ToString(); +} + +string BamRecord::FetchBases(const BamRecordTag tag, + const Orientation orientation, + const bool aligned, + const bool exciseSoftClips, + const PulseBehavior pulseBehavior) const +{ + const bool isBamSeq = (tag == BamRecordTag::SEQ); + const bool isPulse = internal::BamRecordTags::IsPulse(tag); + + // fetch raw + string bases; + Orientation current; + if (isBamSeq) { // SEQ stored in genomic orientation + bases = impl_.Sequence(); + current = Orientation::GENOMIC; + } else { // all tags stored in native orientation + bases = FetchBasesRaw(tag); + current = Orientation::NATIVE; + } + + // maybe strip 'squashed' pulse loci + if (isPulse && pulseBehavior == PulseBehavior::BASECALLS_ONLY) { + CalculatePulse2BaseCache(); + bases = p2bCache_->RemoveSquashedPulses(bases); + } + + // if we need to touch CIGAR + if (aligned || exciseSoftClips) { + + if (isPulse && pulseBehavior != PulseBehavior::BASECALLS_ONLY) + throw std::runtime_error("Cannot return data at all pulses when gapping and/or soft-clipping are requested. 
" + "Use PulseBehavior::BASECALLS_ONLY instead."); + + // force into genomic orientation + internal::OrientBasesAsRequested(&bases, + current, + Orientation::GENOMIC, + impl_.IsReverseStrand(), + isPulse); + current = Orientation::GENOMIC; + + // clip & gapify as requested + internal::ClipAndGapifyBases(impl_, + aligned, + exciseSoftClips, + &bases); + } + + // return in the orientation requested + internal::OrientBasesAsRequested(&bases, + current, + orientation, + impl_.IsReverseStrand(), + isPulse); + return bases; +} + +Frames BamRecord::FetchFramesRaw(const BamRecordTag tag) const +{ + Frames frames; + const Tag& frameTag = impl_.TagValue(tag); + if (frameTag.IsNull()) + return frames; // throw ? + + // lossy frame codes + if (frameTag.IsUInt8Array()) { + const vector codes = frameTag.ToUInt8Array(); + frames = Frames::Decode(codes); + } + + // lossless frame data + else { + assert(frameTag.IsUInt16Array()); + frames.Data(frameTag.ToUInt16Array()); + } + + return frames; +} + +Frames BamRecord::FetchFrames(const BamRecordTag tag, + const Orientation orientation, + const bool aligned, + const bool exciseSoftClips, + const PulseBehavior pulseBehavior) const +{ + const bool isPulse = internal::BamRecordTags::IsPulse(tag); + + // fetch raw + Frames frames = FetchFramesRaw(tag); + Orientation current = Orientation::NATIVE; + + // maybe strip 'squashed' pulse loci + if (isPulse && pulseBehavior == PulseBehavior::BASECALLS_ONLY) { + CalculatePulse2BaseCache(); + frames.DataRaw() = p2bCache_->RemoveSquashedPulses(frames.Data()); + } + + // if we need to touch the CIGAR + if (aligned || exciseSoftClips) { + + if (isPulse && pulseBehavior != PulseBehavior::BASECALLS_ONLY) + throw std::runtime_error("Cannot return data at all pulses when gapping and/or soft-clipping are requested. 
" + "Use PulseBehavior::BASECALLS_ONLY instead."); + + // force into genomic orientation + internal::OrientTagDataAsRequested(&frames, + current, + Orientation::GENOMIC, + impl_.IsReverseStrand()); + current = Orientation::GENOMIC; + + // clip & gapify as requested + internal::ClipAndGapifyFrames(impl_, + aligned, + exciseSoftClips, + &frames); + } + + // return in the orientation requested + internal::OrientTagDataAsRequested(&frames, + current, + orientation, + impl_.IsReverseStrand()); + return frames; + +} + +vector BamRecord::FetchPhotonsRaw(const BamRecordTag tag) const +{ + const Tag& frameTag = impl_.TagValue(tag); + if (frameTag.IsNull()) + return vector(); + if(!frameTag.IsUInt16Array()) + throw std::runtime_error("Photons are not a uint16_t array, tag " + + internal::BamRecordTags::LabelFor(tag)); + const vector data = frameTag.ToUInt16Array(); + + vector photons; + photons.reserve(data.size()); + for (const auto& d : data) + photons.emplace_back(d / photonFactor); + return photons; +} + +vector BamRecord::FetchPhotons(const BamRecordTag tag, + const Orientation orientation, + const bool aligned, + const bool exciseSoftClips, + const PulseBehavior pulseBehavior) const +{ + const bool isPulse = internal::BamRecordTags::IsPulse(tag); + + // fetch raw + auto data = FetchPhotonsRaw(tag); + Orientation current = Orientation::NATIVE; + + if (isPulse && pulseBehavior == PulseBehavior::BASECALLS_ONLY) { + // strip 'squashed' pulse loci + CalculatePulse2BaseCache(); + data = p2bCache_->RemoveSquashedPulses(data); + } + + if (aligned || exciseSoftClips) { + + if (isPulse && pulseBehavior != PulseBehavior::BASECALLS_ONLY) + throw std::runtime_error("Cannot return data at all pulses when gapping and/or soft-clipping are requested. 
" + "Use PulseBehavior::BASECALLS_ONLY instead."); + + // force into genomic orientation + internal::OrientTagDataAsRequested(&data, + current, + Orientation::GENOMIC, + impl_.IsReverseStrand()); + current = Orientation::GENOMIC; + + // clip & gapify as requested + internal::ClipAndGapifyPhotons(impl_, + aligned, + exciseSoftClips, + &data); + } + + // return in the orientation requested + internal::OrientTagDataAsRequested(&data, + current, + orientation, + impl_.IsReverseStrand()); + return data; +} + +QualityValues BamRecord::FetchQualitiesRaw(const BamRecordTag tag) const +{ + const Tag& qvsTag = impl_.TagValue(tag); + return QualityValues::FromFastq(qvsTag.ToString()); +} + +QualityValues BamRecord::FetchQualities(const BamRecordTag tag, + const Orientation orientation, + const bool aligned, + const bool exciseSoftClips, + const PulseBehavior pulseBehavior) const +{ + // requested data info + const bool isBamQual = (tag == BamRecordTag::QUAL); + const bool isPulse = internal::BamRecordTags::IsPulse(tag); + + // fetch raw + QualityValues quals; + Orientation current; + if (isBamQual) { // QUAL stored in genomic orientation + quals = impl_.Qualities(); + current = Orientation::GENOMIC; + } else { // all tags stored in native orientation + quals = FetchQualitiesRaw(tag); + current = Orientation::NATIVE; + } + + if (isPulse && pulseBehavior == PulseBehavior::BASECALLS_ONLY) { + // strip 'squashed' pulse loci + CalculatePulse2BaseCache(); + quals = p2bCache_->RemoveSquashedPulses(quals); + } + + // if we need to touch CIGAR + if (aligned || exciseSoftClips) { + + if (isPulse && pulseBehavior != PulseBehavior::BASECALLS_ONLY) + throw std::runtime_error("Cannot return data at all pulses when gapping and/or soft-clipping are requested. 
" + "Use PulseBehavior::BASECALLS_ONLY instead."); + + // force into genomic orientation + internal::OrientTagDataAsRequested(&quals, + current, + Orientation::GENOMIC, + impl_.IsReverseStrand()); + current = Orientation::GENOMIC; + + // clip & gapify as requested + internal::ClipAndGapifyQualities(impl_, + aligned, + exciseSoftClips, + &quals); + } + + // return in the orientation requested + internal::OrientTagDataAsRequested(&quals, + current, + orientation, + impl_.IsReverseStrand()); + return quals; +} + +vector BamRecord::FetchUIntsRaw(const BamRecordTag tag) const +{ + // fetch tag data + const Tag& frameTag = impl_.TagValue(tag); + if (frameTag.IsNull()) + return std::vector(); + if(!frameTag.IsUInt32Array()) + throw std::runtime_error("Tag data are not a uint32_t array, tag " + + internal::BamRecordTags::LabelFor(tag)); + return frameTag.ToUInt32Array(); +} + +vector BamRecord::FetchUInts(const BamRecordTag tag, + const Orientation orientation, + const bool aligned, + const bool exciseSoftClips, + const PulseBehavior pulseBehavior) const +{ + const bool isPulse = internal::BamRecordTags::IsPulse(tag); + + // fetch raw + auto arr = FetchUIntsRaw(tag); + Orientation current = Orientation::NATIVE; + + if (isPulse && pulseBehavior == PulseBehavior::BASECALLS_ONLY) { + // strip 'squashed' pulse loci + CalculatePulse2BaseCache(); + arr = p2bCache_->RemoveSquashedPulses(arr); + } + + if (aligned || exciseSoftClips) { + + if (isPulse && pulseBehavior != PulseBehavior::BASECALLS_ONLY) + throw std::runtime_error("Cannot return data at all pulses when gapping and/or soft-clipping are requested. 
" + "Use PulseBehavior::BASECALLS_ONLY instead."); + + // force into genomic orientation + internal::OrientTagDataAsRequested(&arr, + current, + Orientation::GENOMIC, + impl_.IsReverseStrand()); + current = Orientation::GENOMIC; + + // clip & gapify as requested + internal::ClipAndGapifyUInts(impl_, + aligned, + exciseSoftClips, + &arr); + } + + // return in the orientation requested + internal::OrientTagDataAsRequested(&arr, + current, + orientation, + impl_.IsReverseStrand()); + return arr; +} + +string BamRecord::FullName(void) const +{ return impl_.Name(); } + +bool BamRecord::HasAltLabelQV(void) const +{ return impl_.HasTag(BamRecordTag::ALT_LABEL_QV); } + +bool BamRecord::HasAltLabelTag(void) const +{ return impl_.HasTag(BamRecordTag::ALT_LABEL_TAG); } + +bool BamRecord::HasBarcodes(void) const +{ return impl_.HasTag(BamRecordTag::BARCODES); } + +bool BamRecord::HasBarcodeQuality(void) const +{ return impl_.HasTag(BamRecordTag::BARCODE_QUALITY); } + +bool BamRecord::HasLabelQV(void) const +{ return impl_.HasTag(BamRecordTag::LABEL_QV); } + +bool BamRecord::HasDeletionQV(void) const +{ return impl_.HasTag(BamRecordTag::DELETION_QV); } + +bool BamRecord::HasDeletionTag(void) const +{ return impl_.HasTag(BamRecordTag::DELETION_TAG); } + +bool BamRecord::HasHoleNumber(void) const +{ + return impl_.HasTag(BamRecordTag::HOLE_NUMBER) + && !impl_.TagValue(BamRecordTag::HOLE_NUMBER).IsNull(); +} + +bool BamRecord::HasInsertionQV(void) const +{ return impl_.HasTag(BamRecordTag::INSERTION_QV); } + +bool BamRecord::HasNumPasses(void) const +{ return impl_.HasTag(BamRecordTag::NUM_PASSES); } + +bool BamRecord::HasPreBaseFrames(void) const +{ return HasIPD(); } + +bool BamRecord::HasIPD(void) const +{ return impl_.HasTag(BamRecordTag::IPD); } + +bool BamRecord::HasLocalContextFlags(void) const +{ return impl_.HasTag(BamRecordTag::CONTEXT_FLAGS); } + +bool BamRecord::HasMergeQV(void) const +{ return impl_.HasTag(BamRecordTag::MERGE_QV); } + +bool 
BamRecord::HasPulseMergeQV(void) const +{ return impl_.HasTag(BamRecordTag::PULSE_MERGE_QV); } + +bool BamRecord::HasPkmean(void) const +{ return impl_.HasTag(BamRecordTag::PKMEAN); } + +bool BamRecord::HasPkmean2(void) const +{ return impl_.HasTag(BamRecordTag::PKMEAN_2); } + +bool BamRecord::HasPkmid(void) const +{ return impl_.HasTag(BamRecordTag::PKMID); } + +bool BamRecord::HasPkmid2(void) const +{ return impl_.HasTag(BamRecordTag::PKMID_2); } + +bool BamRecord::HasPrePulseFrames(void) const +{ return impl_.HasTag(BamRecordTag::PRE_PULSE_FRAMES); } + +bool BamRecord::HasPulseCall(void) const +{ return impl_.HasTag(BamRecordTag::PULSE_CALL) + && !impl_.TagValue(BamRecordTag::PULSE_CALL).IsNull(); +} + +bool BamRecord::HasPulseCallWidth(void) const +{ return impl_.HasTag(BamRecordTag::PULSE_CALL_WIDTH); } + +bool BamRecord::HasPulseWidth(void) const +{ return impl_.HasTag(BamRecordTag::PULSE_WIDTH); } + +bool BamRecord::HasQueryEnd(void) const +{ return impl_.HasTag(BamRecordTag::QUERY_END); } + +bool BamRecord::HasQueryStart(void) const +{ return impl_.HasTag(BamRecordTag::QUERY_START); } + +bool BamRecord::HasReadAccuracy(void) const +{ return impl_.HasTag(BamRecordTag::READ_ACCURACY) + && !impl_.TagValue(BamRecordTag::READ_ACCURACY).IsNull(); +} + +bool BamRecord::HasScrapRegionType(void) const +{ return impl_.HasTag(BamRecordTag::SCRAP_REGION_TYPE) + && !impl_.TagValue(BamRecordTag::SCRAP_REGION_TYPE).IsNull(); +} + +bool BamRecord::HasScrapZmwType(void) const +{ return impl_.HasTag(BamRecordTag::SCRAP_ZMW_TYPE) + && !impl_.TagValue(BamRecordTag::SCRAP_ZMW_TYPE).IsNull(); +} + +bool BamRecord::HasStartFrame(void) const +{ return impl_.HasTag(BamRecordTag::START_FRAME); } + +bool BamRecord::HasSignalToNoise(void) const +{ return impl_.HasTag(BamRecordTag::SNR); } + +bool BamRecord::HasSubstitutionQV(void) const +{ return impl_.HasTag(BamRecordTag::SUBSTITUTION_QV); } + +bool BamRecord::HasSubstitutionTag(void) const +{ return 
impl_.HasTag(BamRecordTag::SUBSTITUTION_TAG); } + +BamHeader BamRecord::Header(void) const +{ return header_; } + +int32_t BamRecord::HoleNumber(void) const +{ + const Tag& holeNumber = impl_.TagValue(BamRecordTag::HOLE_NUMBER); + if (!holeNumber.IsNull()) + return holeNumber.ToInt32(); + + // missing zm tag - try to pull from name + return internal::HoleNumberFromName(FullName()); +} + +BamRecord& BamRecord::HoleNumber(const int32_t holeNumber) +{ + internal::CreateOrEdit(BamRecordTag::HOLE_NUMBER, + holeNumber, + &impl_); + return *this; +} + +BamRecordImpl& BamRecord::Impl(void) +{ return impl_; } + +const BamRecordImpl& BamRecord::Impl(void) const +{ return impl_; } + +QualityValues BamRecord::InsertionQV(Orientation orientation, + bool aligned, + bool exciseSoftClips) const +{ + return FetchQualities(BamRecordTag::INSERTION_QV, + orientation, + aligned, + exciseSoftClips); +} + +BamRecord& BamRecord::InsertionQV(const QualityValues& insertionQVs) +{ + internal::CreateOrEdit(BamRecordTag::INSERTION_QV, + insertionQVs.Fastq(), + &impl_); + return *this; +} + +Frames BamRecord::IPD(Orientation orientation, + bool aligned, + bool exciseSoftClips) const +{ + return FetchFrames(BamRecordTag::IPD, + orientation, + aligned, + exciseSoftClips); +} + +BamRecord& BamRecord::IPD(const Frames& frames, + const FrameEncodingType encoding) +{ + if (encoding == FrameEncodingType::LOSSY) + internal::CreateOrEdit(BamRecordTag::IPD, frames.Encode(), &impl_); + else + internal::CreateOrEdit(BamRecordTag::IPD, frames.Data(), &impl_); + return *this; +} + +Frames BamRecord::IPDRaw(Orientation orientation) const +{ + Frames frames; + const auto tagName = internal::BamRecordTags::LabelFor(BamRecordTag::IPD); + const Tag& frameTag = impl_.TagValue(tagName); + if (frameTag.IsNull()) + return frames; + + // lossy frame codes + if (frameTag.IsUInt8Array()) { + const vector codes = frameTag.ToUInt8Array(); + const vector codes16(codes.begin(), codes.end()); + 
frames.Data(std::move(codes16)); + } + + // lossless frame data + else { + assert(frameTag.IsUInt16Array()); + frames.Data(frameTag.ToUInt16Array()); + } + + // return in requested orientation + internal::OrientTagDataAsRequested(&frames, + Orientation::NATIVE, // current + orientation, // requested + impl_.IsReverseStrand()); + return frames; +} + +bool BamRecord::IsMapped(void) const +{ return impl_.IsMapped(); } + +QualityValues BamRecord::LabelQV(Orientation orientation, + bool aligned, + bool exciseSoftClips, + PulseBehavior pulseBehavior) const +{ + return FetchQualities(BamRecordTag::LABEL_QV, + orientation, + aligned, + exciseSoftClips, + pulseBehavior); +} + +BamRecord& BamRecord::LabelQV(const QualityValues& labelQVs) +{ + internal::CreateOrEdit(BamRecordTag::LABEL_QV, + labelQVs.Fastq(), + &impl_); + return *this; +} + +LocalContextFlags BamRecord::LocalContextFlags(void) const +{ + const auto tagName = internal::BamRecordTags::LabelFor(BamRecordTag::CONTEXT_FLAGS); + const Tag& cxTag = impl_.TagValue(tagName); + return static_cast(cxTag.ToUInt8()); +} + +BamRecord& BamRecord::LocalContextFlags(const PacBio::BAM::LocalContextFlags flags) +{ + internal::CreateOrEdit(BamRecordTag::CONTEXT_FLAGS, + static_cast(flags), + &impl_); + return *this; +} + +BamRecord& BamRecord::Map(const int32_t referenceId, + const Position refStart, + const Strand strand, + const Cigar& cigar, + const uint8_t mappingQuality) +{ + impl_.Position(refStart); + impl_.ReferenceId(referenceId); + impl_.CigarData(cigar); + impl_.MapQuality(mappingQuality); + impl_.SetMapped(true); + + if (strand == Strand::FORWARD) + impl_.SetReverseStrand(false); + + else { + assert(strand == Strand::REVERSE); + impl_.SetReverseStrand(true); + + // switch seq & qual + string sequence = impl_.Sequence(); + QualityValues qualities = impl_.Qualities(); + + internal::ReverseComplement(sequence); + internal::Reverse(qualities); + + impl_.SetSequenceAndQualities(sequence, qualities.Fastq()); + } + + // 
reset any cached aligned start/end + alignedStart_ = PacBio::BAM::UnmappedPosition; + alignedEnd_ = PacBio::BAM::UnmappedPosition; + + return *this; +} + +uint8_t BamRecord::MapQuality(void) const +{ return impl_.MapQuality(); } + +QualityValues BamRecord::MergeQV(Orientation orientation, + bool aligned, + bool exciseSoftClips) const +{ + return FetchQualities(BamRecordTag::MERGE_QV, + orientation, + aligned, + exciseSoftClips); +} + +BamRecord& BamRecord::MergeQV(const QualityValues& mergeQVs) +{ + internal::CreateOrEdit(BamRecordTag::MERGE_QV, + mergeQVs.Fastq(), + &impl_); + return *this; +} + +string BamRecord::MovieName(void) const +{ return ReadGroup().MovieName(); } + +size_t BamRecord::NumDeletedBases(void) const +{ + auto tEnd = ReferenceEnd(); + auto tStart = ReferenceStart(); + auto numMatchesAndMismatches = NumMatchesAndMismatches(); + auto nM = numMatchesAndMismatches.first; + auto nMM = numMatchesAndMismatches.second; + return (tEnd - tStart - nM - nMM); +} + +size_t BamRecord::NumInsertedBases(void) const +{ + auto aEnd = AlignedEnd(); + auto aStart = AlignedStart(); + auto numMatchesAndMismatches = NumMatchesAndMismatches(); + auto nM = numMatchesAndMismatches.first; + auto nMM = numMatchesAndMismatches.second; + return (aEnd - aStart - nM - nMM); +} + +size_t BamRecord::NumMatches(void) const +{ return NumMatchesAndMismatches().first; } + +pair BamRecord::NumMatchesAndMismatches(void) const +{ + pair result = make_pair(0,0); + PBBAM_SHARED_PTR b = internal::BamRecordMemory::GetRawData(this); + uint32_t* cigarData = bam_get_cigar(b.get()); + for (uint32_t i = 0; i < b->core.n_cigar; ++i) { + const CigarOperationType type = static_cast(bam_cigar_op(cigarData[i])); + if (type == CigarOperationType::SEQUENCE_MATCH) + result.first += bam_cigar_oplen(cigarData[i]); + else if (type == CigarOperationType::SEQUENCE_MISMATCH) + result.second += bam_cigar_oplen(cigarData[i]); + } + return result; +} + +size_t BamRecord::NumMismatches(void) const +{ return 
NumMatchesAndMismatches().second; } + +int32_t BamRecord::NumPasses(void) const +{ + const auto tagName = internal::BamRecordTags::LabelFor(BamRecordTag::NUM_PASSES); + const Tag& numPasses = impl_.TagValue(tagName); + return numPasses.ToInt32(); +} + +BamRecord& BamRecord::NumPasses(const int32_t numPasses) +{ + internal::CreateOrEdit(BamRecordTag::NUM_PASSES, + numPasses, + &impl_); + return *this; +} + +vector BamRecord::Pkmean(Orientation orientation, + bool aligned, + bool exciseSoftClips, + PulseBehavior pulseBehavior) const +{ + return FetchPhotons(BamRecordTag::PKMEAN, + orientation, + aligned, + exciseSoftClips, + pulseBehavior); +} + +BamRecord& BamRecord::Pkmean(const vector& photons) +{ + Pkmean(EncodePhotons(photons)); + return *this; +} + +BamRecord& BamRecord::Pkmean(const vector& encodedPhotons) +{ + internal::CreateOrEdit(BamRecordTag::PKMEAN, + encodedPhotons, + &impl_); + return *this; +} + +vector BamRecord::Pkmid(Orientation orientation, + bool aligned, + bool exciseSoftClips, + PulseBehavior pulseBehavior) const +{ + return FetchPhotons(BamRecordTag::PKMID, + orientation, + aligned, + exciseSoftClips, + pulseBehavior); +} + +BamRecord& BamRecord::Pkmid(const vector& photons) +{ + Pkmid(EncodePhotons(photons)); + return *this; +} + +BamRecord& BamRecord::Pkmid(const vector& encodedPhotons) +{ + internal::CreateOrEdit(BamRecordTag::PKMID, + encodedPhotons, + &impl_); + return *this; +} + +vector BamRecord::Pkmean2(Orientation orientation, + bool aligned, + bool exciseSoftClips, + PulseBehavior pulseBehavior) const +{ + return FetchPhotons(BamRecordTag::PKMEAN_2, + orientation, + aligned, + exciseSoftClips, + pulseBehavior); +} + +BamRecord& BamRecord::Pkmean2(const vector& photons) +{ + Pkmean2(EncodePhotons(photons)); + return *this; +} + +BamRecord& BamRecord::Pkmean2(const vector& encodedPhotons) +{ + internal::CreateOrEdit(BamRecordTag::PKMEAN_2, + encodedPhotons, + &impl_); + return *this; +} + +vector BamRecord::Pkmid2(Orientation 
orientation, + bool aligned, + bool exciseSoftClips, + PulseBehavior pulseBehavior) const +{ + return FetchPhotons(BamRecordTag::PKMID_2, + orientation, + aligned, + exciseSoftClips, + pulseBehavior); +} + +BamRecord& BamRecord::Pkmid2(const vector& photons) +{ + Pkmid2(EncodePhotons(photons)); + return *this; +} + +BamRecord& BamRecord::Pkmid2(const vector& encodedPhotons) +{ + internal::CreateOrEdit(BamRecordTag::PKMID_2, + encodedPhotons, + &impl_); + return *this; +} + +Frames BamRecord::PreBaseFrames(Orientation orientation, + bool aligned, + bool exciseSoftClips) const +{ return IPD(orientation, aligned, exciseSoftClips); } + +BamRecord& BamRecord::PreBaseFrames(const Frames& frames, + const FrameEncodingType encoding) +{ return IPD(frames, encoding); } + +Frames BamRecord::PrePulseFrames(Orientation orientation, + bool aligned, + bool exciseSoftClips, + PulseBehavior pulseBehavior) const +{ + return FetchFrames(BamRecordTag::PRE_PULSE_FRAMES, + orientation, + aligned, + exciseSoftClips, + pulseBehavior); +} + +BamRecord& BamRecord::PrePulseFrames(const Frames& frames, + const FrameEncodingType encoding) +{ + if (encoding == FrameEncodingType::LOSSY) { + internal::CreateOrEdit(BamRecordTag::PRE_PULSE_FRAMES, + frames.Encode(), + &impl_); + } else { + internal::CreateOrEdit(BamRecordTag::PRE_PULSE_FRAMES, + frames.Data(), + &impl_); + } + return *this; +} + +Frames BamRecord::PulseWidthRaw(Orientation orientation, + bool aligned, + bool exciseSoftClips) const +{ + Frames frames; + const auto tagName = internal::BamRecordTags::LabelFor(BamRecordTag::PULSE_WIDTH); + const Tag& frameTag = impl_.TagValue(tagName); + if (frameTag.IsNull()) + return frames; + + // lossy frame codes + if (frameTag.IsUInt8Array()) { + const vector codes = frameTag.ToUInt8Array(); + const vector codes16(codes.begin(), codes.end()); + frames.Data(std::move(codes16)); + } + + // lossless frame data + else { + assert(frameTag.IsUInt16Array()); + frames.Data(frameTag.ToUInt16Array()); + } 
+ + // return in requested orientation + internal::OrientTagDataAsRequested(&frames, + Orientation::NATIVE, // current + orientation, // requested + impl_.IsReverseStrand()); + return frames; +} + + +QualityValues BamRecord::PulseMergeQV(Orientation orientation, + bool aligned, + bool exciseSoftClips, + PulseBehavior pulseBehavior) const +{ + return FetchQualities(BamRecordTag::PULSE_MERGE_QV, + orientation, + aligned, + exciseSoftClips, + pulseBehavior); +} + +BamRecord& BamRecord::PulseMergeQV(const QualityValues& mergeQVs) +{ + internal::CreateOrEdit(BamRecordTag::PULSE_MERGE_QV, + mergeQVs.Fastq(), + &impl_); + return *this; +} + + +string BamRecord::PulseCall(Orientation orientation, + bool aligned, + bool exciseSoftClips, + PulseBehavior pulseBehavior) const +{ + return FetchBases(BamRecordTag::PULSE_CALL, + orientation, + aligned, + exciseSoftClips, + pulseBehavior); +} + +BamRecord& BamRecord::PulseCall(const string& tags) +{ + internal::CreateOrEdit(BamRecordTag::PULSE_CALL, + tags, + &impl_); + return *this; +} + +Frames BamRecord::PulseCallWidth(Orientation orientation, + bool aligned, + bool exciseSoftClips, + PulseBehavior pulseBehavior) const +{ + return FetchFrames(BamRecordTag::PULSE_CALL_WIDTH, + orientation, + aligned, + exciseSoftClips, + pulseBehavior); +} + +BamRecord& BamRecord::PulseCallWidth(const Frames& frames, + const FrameEncodingType encoding) +{ + if (encoding == FrameEncodingType::LOSSY) { + internal::CreateOrEdit(BamRecordTag::PULSE_CALL_WIDTH, + frames.Encode(), + &impl_); + } else { + internal::CreateOrEdit(BamRecordTag::PULSE_CALL_WIDTH, + frames.Data(), + &impl_); + } + return *this; +} + +Frames BamRecord::PulseWidth(Orientation orientation, + bool aligned, + bool exciseSoftClips) const +{ + return FetchFrames(BamRecordTag::PULSE_WIDTH, + orientation, + aligned, + exciseSoftClips, + PulseBehavior::ALL); +} + +BamRecord& BamRecord::PulseWidth(const Frames& frames, + const FrameEncodingType encoding) +{ + if (encoding == 
FrameEncodingType::LOSSY) { + internal::CreateOrEdit(BamRecordTag::PULSE_WIDTH, + frames.Encode(), + &impl_); + } else { + internal::CreateOrEdit(BamRecordTag::PULSE_WIDTH, + frames.Data(), + &impl_); + } + return *this; +} + +QualityValues BamRecord::Qualities(Orientation orientation, + bool aligned, + bool exciseSoftClips) const +{ + return FetchQualities(BamRecordTag::QUAL, + orientation, + aligned, + exciseSoftClips); +} + +Position BamRecord::QueryEnd(void) const +{ + // try 'qe' tag + const auto tagName = internal::BamRecordTags::LabelFor(BamRecordTag::QUERY_END); + const Tag& qe = impl_.TagValue(tagName); + if (!qe.IsNull()) + return qe.ToInt32(); + + // tag missing, need to check movie name (fallback for non-PB BAMs, but ignore for CCS reads) + RecordType type; + try { + type = Type(); + } catch (std::exception&) { + return Position(0); + } + if (type == RecordType::CCS) + throw std::runtime_error("no query end for CCS read type"); + + // PacBio BAM, non-CCS + try { + return internal::QueryEndFromName(FullName()); + } catch (std::exception&) { + // return fallback position + return Position(0); + } +} + +BamRecord& BamRecord::QueryEnd(const Position pos) +{ + internal::CreateOrEdit(BamRecordTag::QUERY_END, + static_cast(pos), + &impl_); + UpdateName(); + return *this; +} + +Position BamRecord::QueryStart(void) const +{ + // try 'qs' tag + const auto tagName = internal::BamRecordTags::LabelFor(BamRecordTag::QUERY_START); + const Tag& qs = impl_.TagValue(tagName); + if (!qs.IsNull()) + return qs.ToInt32(); + + // tag missing, need to check movie name (fallback for non-PB BAMs, but ignore for CCS reads) + RecordType type; + try { + type = Type(); + } catch (std::exception&) { + return Position(0); + } + if (type == RecordType::CCS) + throw std::runtime_error("no query start for CCS read type"); + + // PacBio BAM, non-CCS + try { + return internal::QueryStartFromName(FullName()); + } catch (std::exception&) { + // return fallback position + return Position(0); 
+ } +} + +BamRecord& BamRecord::QueryStart(const Position pos) +{ + internal::CreateOrEdit(BamRecordTag::QUERY_START, + static_cast(pos), + &impl_); + UpdateName(); + return *this; +} + +Accuracy BamRecord::ReadAccuracy(void) const +{ + const auto tagName = internal::BamRecordTags::LabelFor(BamRecordTag::READ_ACCURACY); + const Tag& readAccuracy = impl_.TagValue(tagName); + return Accuracy(readAccuracy.ToFloat()); +} + +BamRecord& BamRecord::ReadAccuracy(const Accuracy& accuracy) +{ + internal::CreateOrEdit(BamRecordTag::READ_ACCURACY, + static_cast(accuracy), + &impl_); + return *this; +} + +ReadGroupInfo BamRecord::ReadGroup(void) const +{ return header_.ReadGroup(ReadGroupId()); } + +BamRecord& BamRecord::ReadGroup(const ReadGroupInfo& rg) +{ + internal::CreateOrEdit(BamRecordTag::READ_GROUP, + rg.Id(), + &impl_); + UpdateName(); + return *this; +} + +string BamRecord::ReadGroupId(void) const +{ + const auto tagName = internal::BamRecordTags::LabelFor(BamRecordTag::READ_GROUP); + const Tag& rgTag = impl_.TagValue(tagName); + if (rgTag.IsNull()) + return string(); + return rgTag.ToString(); +} + +BamRecord& BamRecord::ReadGroupId(const string& id) +{ + internal::CreateOrEdit(BamRecordTag::READ_GROUP, + id, + &impl_); + UpdateName(); + return *this; +} + +int32_t BamRecord::ReadGroupNumericId(void) const +{ return ReadGroupInfo::IdToInt(ReadGroupId()); } + +Position BamRecord::ReferenceEnd(void) const +{ + if (!impl_.IsMapped()) + return PacBio::BAM::UnmappedPosition; + PBBAM_SHARED_PTR htsData = internal::BamRecordMemory::GetRawData(impl_); + if (!htsData) + return PacBio::BAM::UnmappedPosition; + return bam_endpos(htsData.get()); +} + +int32_t BamRecord::ReferenceId(void) const +{ return impl_.ReferenceId(); } + +string BamRecord::ReferenceName(void) const +{ + if (IsMapped()) + return Header().SequenceName(ReferenceId()); + else + throw std::runtime_error("unmapped record has no associated reference name"); +} + +Position BamRecord::ReferenceStart(void) const 
+{ return impl_.Position(); } + +void BamRecord::ResetCachedPositions(void) const +{ + alignedEnd_ = PacBio::BAM::UnmappedPosition; + alignedStart_ = PacBio::BAM::UnmappedPosition; +} + +void BamRecord::ResetCachedPositions(void) +{ + alignedEnd_ = PacBio::BAM::UnmappedPosition; + alignedStart_ = PacBio::BAM::UnmappedPosition; +} + +VirtualRegionType BamRecord::ScrapRegionType(void) const +{ + const auto tagName = internal::BamRecordTags::LabelFor(BamRecordTag::SCRAP_REGION_TYPE); + const Tag& srTag = impl_.TagValue(tagName); + return VirtualRegionTypeMap::ParseChar[srTag.ToUInt8()]; +} + +BamRecord& BamRecord::ScrapRegionType(const VirtualRegionType type) +{ + internal::CreateOrEdit(BamRecordTag::SCRAP_REGION_TYPE, + static_cast(type), + &impl_); + return *this; +} + +BamRecord& BamRecord::ScrapRegionType(const char type) +{ + internal::CreateOrEdit(BamRecordTag::SCRAP_REGION_TYPE, + type, + &impl_); + return *this; +} + +ZmwType BamRecord::ScrapZmwType(void) const +{ + const auto tagName = internal::BamRecordTags::LabelFor(BamRecordTag::SCRAP_ZMW_TYPE); + const Tag& szTag = impl_.TagValue(tagName); + return ZmwTypeMap::ParseChar[szTag.ToUInt8()]; +} + +BamRecord& BamRecord::ScrapZmwType(const ZmwType type) +{ + internal::CreateOrEdit(BamRecordTag::SCRAP_ZMW_TYPE, + static_cast(type), + &impl_); + return *this; +} + +BamRecord& BamRecord::ScrapZmwType(const char type) +{ + internal::CreateOrEdit(BamRecordTag::SCRAP_ZMW_TYPE, + type, + &impl_); + return *this; +} + +string BamRecord::Sequence(const Orientation orientation, + bool aligned, + bool exciseSoftClips) const +{ + return FetchBases(BamRecordTag::SEQ, + orientation, + aligned, + exciseSoftClips); +} + +vector BamRecord::SignalToNoise(void) const +{ + const auto tagName = internal::BamRecordTags::LabelFor(BamRecordTag::SNR); + const Tag& snTag = impl_.TagValue(tagName); + return snTag.ToFloatArray(); +} + +BamRecord& BamRecord::SignalToNoise(const vector& snr) +{ + internal::CreateOrEdit(BamRecordTag::SNR, + 
snr, + &impl_); + return *this; +} + +vector BamRecord::StartFrame(Orientation orientation, + bool aligned, + bool exciseSoftClips, + PulseBehavior pulseBehavior) const +{ + return FetchUInts(BamRecordTag::START_FRAME, + orientation, + aligned, + exciseSoftClips, + pulseBehavior); +} + +BamRecord& BamRecord::StartFrame(const vector& startFrame) +{ + internal::CreateOrEdit(BamRecordTag::START_FRAME, + startFrame, + &impl_); + return *this; +} + +QualityValues BamRecord::SubstitutionQV(Orientation orientation, + bool aligned, + bool exciseSoftClips) const +{ + return FetchQualities(BamRecordTag::SUBSTITUTION_QV, + orientation, + aligned, + exciseSoftClips); +} + +BamRecord& BamRecord::SubstitutionQV(const QualityValues& substitutionQVs) +{ + internal::CreateOrEdit(BamRecordTag::SUBSTITUTION_QV, + substitutionQVs.Fastq(), + &impl_); + return *this; +} + +string BamRecord::SubstitutionTag(Orientation orientation, + bool aligned, + bool exciseSoftClips) const +{ + return FetchBases(BamRecordTag::SUBSTITUTION_TAG, + orientation, + aligned, + exciseSoftClips); +} + +BamRecord& BamRecord::SubstitutionTag(const string& tags) +{ + internal::CreateOrEdit(BamRecordTag::SUBSTITUTION_TAG, + tags, + &impl_); + return *this; +} + +RecordType BamRecord::Type(void) const +{ + try { + const string& typeName = ReadGroup().ReadType(); + return internal::NameToType(typeName); + } catch (std::exception&) { + + // read group not found + // peek at name to see if we're CCS + if (FullName().find("ccs") != string::npos) + return RecordType::CCS; + + // otherwise unknown + else + return RecordType::UNKNOWN; + } +} + +void BamRecord::UpdateName() +{ + string newName; + newName.reserve(100); + + newName += MovieName(); + newName += "/"; + + if (HasHoleNumber()) + newName += std::to_string(HoleNumber()); + else + newName += "?"; + + newName += "/"; + + if (Type() == RecordType::CCS) + newName += "ccs"; + else { + if (HasQueryStart()) + newName += std::to_string(QueryStart()); + else + newName += 
"?"; + + newName += '_'; + + if (HasQueryEnd()) + newName += std::to_string(QueryEnd()); + else + newName += "?"; + } + + impl_.Name(newName); +} diff --git a/src/BamRecordBuilder.cpp b/src/BamRecordBuilder.cpp new file mode 100644 index 0000000..004a0ea --- /dev/null +++ b/src/BamRecordBuilder.cpp @@ -0,0 +1,393 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. + +// Author: Derek Barnett + +#include "pbbam/BamRecordBuilder.h" +#include "pbbam/BamTagCodec.h" +#include "AssertUtils.h" +#include "MemoryUtils.h" +#include +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace std; + +BamRecordBuilder::BamRecordBuilder(void) +{ + // ensure proper clean slate + Reset(); + + // initialize with some space for data + name_.reserve(256); + sequence_.reserve(2096); + qualities_.reserve(2096); + cigar_.reserve(256); +} + +BamRecordBuilder::BamRecordBuilder(const BamHeader& header) + : header_(header) +{ + // ensure proper clean slate + Reset(); + + // initialize with some space for data + name_.reserve(256); + sequence_.reserve(2096); + qualities_.reserve(2096); + cigar_.reserve(256); +} + +BamRecordBuilder::BamRecordBuilder(const BamRecord& prototype) + : header_(prototype.Header()) +{ + Reset(prototype); +} + +BamRecordBuilder::BamRecordBuilder(const BamRecordBuilder& other) + : core_(other.core_) + , name_(other.name_) + , sequence_(other.sequence_) + , qualities_(other.qualities_) + , cigar_(other.cigar_) + , tags_(other.tags_) +{ } + +BamRecordBuilder::BamRecordBuilder(BamRecordBuilder&& other) + : core_(std::move(other.core_)) + , name_(std::move(other.name_)) + , sequence_(std::move(other.sequence_)) + , qualities_(std::move(other.qualities_)) + , cigar_(std::move(other.cigar_)) + , tags_(std::move(other.tags_)) +{ } + +BamRecordBuilder& 
BamRecordBuilder::operator=(const BamRecordBuilder& other) +{ + core_ = other.core_; + name_ = other.name_; + sequence_ = other.sequence_; + qualities_ = other.qualities_; + cigar_ = other.cigar_; + tags_ = other.tags_; + return *this; +} + +BamRecordBuilder& BamRecordBuilder::operator=(BamRecordBuilder&& other) +{ + core_ = std::move(other.core_); + name_ = std::move(other.name_); + sequence_ = std::move(other.sequence_); + qualities_ = std::move(other.qualities_); + cigar_ = std::move(other.cigar_); + tags_ = std::move(other.tags_); + return *this; +} + +BamRecordBuilder::~BamRecordBuilder(void) { } + +BamRecord BamRecordBuilder::Build(void) const +{ + BamRecord result(header_); + BuildInPlace(result); + return result; +} + +bool BamRecordBuilder::BuildInPlace(BamRecord& record) const +{ + // initialize with basic 'core data' + PBBAM_SHARED_PTR recordRawData = internal::BamRecordMemory::GetRawData(record); /* record.impl_.RawData().get();*/ + PB_ASSERT_OR_RETURN_VALUE(recordRawData, false); + PB_ASSERT_OR_RETURN_VALUE(recordRawData->data, false); + recordRawData->core = core_; + + // setup variable length data + const vector encodedTags = BamTagCodec::Encode(tags_); + + const size_t nameLength = name_.size() + 1; + const size_t numCigarOps = cigar_.size(); + const size_t cigarLength = numCigarOps * sizeof(uint32_t); + const size_t seqLength = sequence_.size(); + const size_t qualLength = seqLength; + const size_t tagLength = encodedTags.size(); + const size_t dataLength = nameLength + cigarLength + seqLength + qualLength + tagLength; + + // realloc if necessary + uint8_t* varLengthDataBlock = recordRawData->data; + PB_ASSERT_OR_RETURN_VALUE(varLengthDataBlock, false); + size_t allocatedDataLength = recordRawData->m_data; + if (allocatedDataLength < dataLength) { + allocatedDataLength = dataLength; + kroundup32(allocatedDataLength); + varLengthDataBlock = (uint8_t*)realloc(varLengthDataBlock, allocatedDataLength); + } + recordRawData->data = varLengthDataBlock; + 
recordRawData->l_data = dataLength; + recordRawData->m_data = allocatedDataLength; + + size_t index = 0; + + // name + memcpy(&varLengthDataBlock[index], name_.c_str(), nameLength); + index += nameLength; + + // cigar + if (cigarLength > 0) { + vector encodedCigar(numCigarOps); + for (size_t i = 0; i < numCigarOps; ++i) { + const CigarOperation& op = cigar_.at(i); + encodedCigar[i] = op.Length() << BAM_CIGAR_SHIFT; + const uint8_t type = static_cast(op.Type()); + PB_ASSERT_OR_RETURN_VALUE(type >= 0 && type < 8, false); + encodedCigar[i] |= type; + } + memcpy(&varLengthDataBlock[index], &encodedCigar[0], cigarLength); + index += cigarLength; + + // update bin after we've calculated cigar info + const int32_t endPosition = bam_cigar2rlen(recordRawData->core.n_cigar, &encodedCigar[0]); + recordRawData->core.bin = hts_reg2bin(core_.pos, endPosition, 14, 5); + } + + // seq & qual + if (seqLength > 0) { + + uint8_t* s = &varLengthDataBlock[index]; + for (size_t i = 0; i < seqLength; ++i) + s[i>>1] |= ( seq_nt16_table[static_cast(sequence_.at(i))] << ((~i&1)<<2) ); + index += seqLength; + + uint8_t* q = &varLengthDataBlock[index]; + if (!qualities_.empty()) + memset(q, 0xFF, seqLength); + else { + for (size_t i = 0; i < seqLength; ++i) + q[i] = qualities_.at(i) - 33; + } + index += seqLength; + } + + // tags + if (tagLength > 0) { + PB_ASSERT_OR_RETURN_VALUE(!encodedTags.empty(), false); + memcpy(&varLengthDataBlock[index], &encodedTags[0], tagLength); + index += tagLength; + } + + // sanity check + PB_ASSERT_OR_RETURN_VALUE(index == dataLength, false); + return true; +} + +BamRecordBuilder& BamRecordBuilder::Cigar(const PacBio::BAM::Cigar& cigar) +{ + core_.n_cigar = cigar.size(); + cigar_ = cigar; + return *this; +} + +BamRecordBuilder& BamRecordBuilder::Cigar(PacBio::BAM::Cigar&& cigar) +{ + core_.n_cigar = cigar.size(); + cigar_ = std::move(cigar); + return *this; +} + +BamRecordBuilder& BamRecordBuilder::Name(const std::string& name) +{ + core_.l_qname = name.size() 
+ 1; // (NULL-term) + name_ = name; + return *this; +} + +BamRecordBuilder& BamRecordBuilder::Name(std::string&& name) +{ + core_.l_qname = name.size() + 1; // (NULL-term) + name_ = std::move(name); + return *this; +} + +void BamRecordBuilder::Reset(void) +{ + // zeroize fixed-length data + memset(&core_, 0, sizeof(bam1_core_t)); + core_.l_qname = 1; // always has a NULL-term + + // reset variable-length data + name_.clear(); + sequence_.clear(); + qualities_.clear(); + cigar_.clear(); + tags_.clear(); +} + +void BamRecordBuilder::Reset(const BamRecord& prototype) +{ + // ensure clean slate + Reset(); + header_ = prototype.Header(); + + // reset core data + const PBBAM_SHARED_PTR rawData = internal::BamRecordMemory::GetRawData(prototype); // prototype.impl_.RawData().get(); + PB_ASSERT_OR_RETURN(rawData); + core_ = rawData->core; + + // reset variable-length data + const BamRecordImpl& impl = internal::BamRecordMemory::GetImpl(prototype); + name_ = impl.Name(); + sequence_ = impl.Sequence(); + qualities_ = impl.Qualities().Fastq(); + cigar_ = impl.CigarData(); + tags_ = impl.Tags(); +} + +void BamRecordBuilder::Reset(BamRecord&& prototype) +{ + // ensure clean slate + Reset(); + header_ = prototype.Header(); + + // reset core data + const PBBAM_SHARED_PTR rawData = internal::BamRecordMemory::GetRawData(prototype); // prototype.impl_.RawData().get(); + PB_ASSERT_OR_RETURN(rawData); + core_ = std::move(rawData->core); + + // reset variable-length data + const BamRecordImpl& impl = internal::BamRecordMemory::GetImpl(prototype); + name_ = impl.Name(); + sequence_ = impl.Sequence(); + qualities_ = impl.Qualities().Fastq(); + cigar_ = impl.CigarData(); + tags_ = impl.Tags(); +} + +BamRecordBuilder& BamRecordBuilder::Sequence(const std::string& sequence) +{ + core_.l_qseq = sequence.size(); + sequence_ = sequence; + return *this; +} + +BamRecordBuilder& BamRecordBuilder::Sequence(std::string&& sequence) +{ + core_.l_qseq = sequence.size(); + sequence_ = 
std::move(sequence); + return *this; +} + +BamRecordBuilder& BamRecordBuilder::SetDuplicate(bool ok) +{ + if (ok) core_.flag |= BamRecordImpl::DUPLICATE; + else core_.flag &= ~BamRecordImpl::DUPLICATE; + return *this; +} + +BamRecordBuilder& BamRecordBuilder::SetFailedQC(bool ok) +{ + if (ok) core_.flag |= BamRecordImpl::FAILED_QC; + else core_.flag &= ~BamRecordImpl::FAILED_QC; + return *this; +} + +BamRecordBuilder& BamRecordBuilder::SetFirstMate(bool ok) +{ + if (ok) core_.flag |= BamRecordImpl::MATE_1; + else core_.flag &= ~BamRecordImpl::MATE_1; + return *this; +} + +BamRecordBuilder& BamRecordBuilder::SetMapped(bool ok) +{ + if (ok) core_.flag &= ~BamRecordImpl::UNMAPPED; + else core_.flag |= BamRecordImpl::UNMAPPED; + return *this; +} + +BamRecordBuilder& BamRecordBuilder::SetMateMapped(bool ok) +{ + if (ok) core_.flag &= ~BamRecordImpl::MATE_UNMAPPED; + else core_.flag |= BamRecordImpl::MATE_UNMAPPED; + return *this; +} + +BamRecordBuilder& BamRecordBuilder::SetMateReverseStrand(bool ok) +{ + if (ok) core_.flag |= BamRecordImpl::MATE_REVERSE_STRAND; + else core_.flag &= ~BamRecordImpl::MATE_REVERSE_STRAND; + return *this; +} + +BamRecordBuilder& BamRecordBuilder::SetPaired(bool ok) +{ + if (ok) core_.flag |= BamRecordImpl::PAIRED; + else core_.flag &= ~BamRecordImpl::PAIRED; + return *this; +} + +BamRecordBuilder& BamRecordBuilder::SetPrimaryAlignment(bool ok) +{ + if (ok) core_.flag &= ~BamRecordImpl::SECONDARY; + else core_.flag |= BamRecordImpl::SECONDARY; + return *this; +} + +BamRecordBuilder& BamRecordBuilder::SetProperPair(bool ok) +{ + if (ok) core_.flag |= BamRecordImpl::PROPER_PAIR; + else core_.flag &= ~BamRecordImpl::PROPER_PAIR; + return *this; +} + +BamRecordBuilder& BamRecordBuilder::SetReverseStrand(bool ok) +{ + if (ok) core_.flag |= BamRecordImpl::REVERSE_STRAND; + else core_.flag &= ~BamRecordImpl::REVERSE_STRAND; + return *this; +} + +BamRecordBuilder& BamRecordBuilder::SetSecondMate(bool ok) +{ + if (ok) core_.flag |= 
BamRecordImpl::MATE_2; + else core_.flag &= ~BamRecordImpl::MATE_2; + return *this; +} + +BamRecordBuilder& BamRecordBuilder::SetSupplementaryAlignment(bool ok) +{ + if (ok) core_.flag |= BamRecordImpl::SUPPLEMENTARY; + else core_.flag &= ~BamRecordImpl::SUPPLEMENTARY; + return *this; +} diff --git a/src/BamRecordImpl.cpp b/src/BamRecordImpl.cpp new file mode 100644 index 0000000..c6c127e --- /dev/null +++ b/src/BamRecordImpl.cpp @@ -0,0 +1,598 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. + +// Author: Derek Barnett + +#include "pbbam/BamRecordImpl.h" +#include "pbbam/BamTagCodec.h" +#include "AssertUtils.h" +#include "BamRecordTags.h" +#include "MemoryUtils.h" +#include +#include +#include +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace std; + +BamRecordImpl::BamRecordImpl(void) + : d_(nullptr) +{ + InitializeData(); +} + +BamRecordImpl::BamRecordImpl(const BamRecordImpl& other) + : d_(bam_dup1(other.d_.get()), internal::HtslibRecordDeleter()) + , tagOffsets_(other.tagOffsets_) +{ } + +BamRecordImpl::BamRecordImpl(BamRecordImpl&& other) + : d_(nullptr) + , tagOffsets_(std::move(other.tagOffsets_)) +{ + d_.swap(other.d_); + other.d_.reset(); +} + +BamRecordImpl& BamRecordImpl::operator=(const BamRecordImpl& other) +{ + if (this != & other) { + if (d_ == nullptr) + InitializeData(); + bam_copy1(d_.get(), other.d_.get()); + tagOffsets_ = other.tagOffsets_; + } + return *this; +} + +BamRecordImpl& BamRecordImpl::operator=(BamRecordImpl&& other) +{ + if (this != & other) { + d_.swap(other.d_); + other.d_.reset(); + + tagOffsets_ = std::move(other.tagOffsets_); + } + return *this; +} + +BamRecordImpl::~BamRecordImpl(void) { } + +bool BamRecordImpl::AddTag(const string& tagName, + const Tag &value) +{ + return AddTag(tagName, value, TagModifier::NONE); +} + +bool BamRecordImpl::AddTag(const BamRecordTag tag, + const Tag& value) +{ + return 
AddTag(internal::BamRecordTags::LabelFor(tag), + value, + TagModifier::NONE); +} + +bool BamRecordImpl::AddTag(const string& tagName, + const Tag& value, + const TagModifier additionalModifier) +{ + if (tagName.size() != 2 || HasTag(tagName)) + return false; + const bool added = AddTagImpl(tagName, value, additionalModifier); + if (added) + UpdateTagMap(); + return added; +} + +bool BamRecordImpl::AddTag(const BamRecordTag tag, + const Tag& value, + const TagModifier additionalModifier) +{ + return AddTag(internal::BamRecordTags::LabelFor(tag), + value, + additionalModifier); +} + +bool BamRecordImpl::AddTagImpl(const string& tagName, + const Tag& value, + const TagModifier additionalModifier) +{ + const vector rawData = BamTagCodec::ToRawData(value, additionalModifier); + if (rawData.empty()) + return false; + + bam_aux_append(d_.get(), + tagName.c_str(), + BamTagCodec::TagTypeCode(value, additionalModifier), + rawData.size(), + const_cast(rawData.data())); + return true; +} + +Cigar BamRecordImpl::CigarData(void) const +{ + Cigar result; + result.reserve(d_->core.n_cigar); + uint32_t* cigarData = bam_get_cigar(d_); + for (uint32_t i = 0; i < d_->core.n_cigar; ++i) { + const uint32_t length = bam_cigar_oplen(cigarData[i]); + const CigarOperationType type = static_cast(bam_cigar_op(cigarData[i])); + result.push_back(CigarOperation(type, length)); + } + + return result; +} + +BamRecordImpl& BamRecordImpl::CigarData(const Cigar& cigar) +{ + // determine change in memory needed + // diffNumBytes: pos -> growing, neg -> shrinking + const size_t numCigarOps = cigar.size(); + const int diffNumCigars = numCigarOps - d_->core.n_cigar; + const int diffNumBytes = diffNumCigars * sizeof(uint32_t); + const int oldLengthData = d_->l_data; + d_->l_data += diffNumBytes; + MaybeReallocData(); + + // shift trailing data (seq, qual, tags) as needed + const uint8_t* oldSequenceStart = bam_get_seq(d_); + const size_t trailingDataLength = oldLengthData - (oldSequenceStart - d_->data); 
+ d_->core.n_cigar = numCigarOps; + uint8_t* newSequenceStart = bam_get_seq(d_); + memmove(newSequenceStart, oldSequenceStart, trailingDataLength); + + // fill in new CIGAR data + uint32_t* cigarDataStart = bam_get_cigar(d_); + for (size_t i = 0; i < numCigarOps; ++i) { + const CigarOperation& cigarOp = cigar.at(i); + cigarDataStart[i] = bam_cigar_gen(cigarOp.Length(), static_cast(cigarOp.Type())); + } + + return *this; +} + +BamRecordImpl& BamRecordImpl::CigarData(const std::string& cigarString) +{ + return CigarData(Cigar::FromStdString(cigarString)); +} + +bool BamRecordImpl::EditTag(const string& tagName, + const Tag& newValue) +{ + return EditTag(tagName, newValue, TagModifier::NONE); +} + +bool BamRecordImpl::EditTag(const BamRecordTag tag, + const Tag& newValue) +{ + return EditTag(internal::BamRecordTags::LabelFor(tag), + newValue, + TagModifier::NONE); +} + +bool BamRecordImpl::EditTag(const string& tagName, + const Tag& newValue, + const TagModifier additionalModifier) +{ + // try remove old value (with delayed tag map update) + const bool removed = RemoveTagImpl(tagName); + if (!removed) + return false; + + // if old value removed, add new value + const bool added = AddTagImpl(tagName, newValue, additionalModifier); + if (added) + UpdateTagMap(); + return added; +} + +bool BamRecordImpl::EditTag(const BamRecordTag tag, + const Tag& newValue, + const TagModifier additionalModifier) +{ + return EditTag(internal::BamRecordTags::LabelFor(tag), + newValue, + additionalModifier); +} + +BamRecordImpl BamRecordImpl::FromRawData(const PBBAM_SHARED_PTR& rawData) +{ + BamRecordImpl result; + bam_copy1(result.d_.get(), rawData.get()); + return result; +} + +bool BamRecordImpl::HasTag(const string& tagName) const +{ + if (tagName.size() != 2) + return false; + return TagOffset(tagName) != -1; + + // 27635 +// return bam_aux_get(d_.get(), tagName.c_str()) != 0; +} + +bool BamRecordImpl::HasTag(const BamRecordTag tag) const +{ + return 
HasTag(internal::BamRecordTags::LabelFor(tag)); +} + +void BamRecordImpl::InitializeData(void) +{ + d_.reset(bam_init1(), internal::HtslibRecordDeleter()); + d_->data = (uint8_t*)(calloc(0x800, sizeof(uint8_t))); // maybe make this value tune-able later? + + // init unmapped + Position(PacBio::BAM::UnmappedPosition); + MatePosition(PacBio::BAM::UnmappedPosition); + ReferenceId(-1); + MateReferenceId(-1); + SetMapped(false); + MapQuality(255); + + // initialized with NULL term for qname + d_->core.l_qname = 1; + d_->l_data = 1; + d_->m_data = 0x800; +} + +void BamRecordImpl::MaybeReallocData(void) +{ + // about to grow data contents to l_data size, but m_data is our current max. + // so we may need to grow. if so, use kroundup to double to next power of 2 + if (d_->m_data < d_->l_data) { + d_->m_data = d_->l_data; + kroundup32(d_->m_data); + d_->data = static_cast(realloc(d_->data, d_->m_data)); + } +} + +string BamRecordImpl::Name(void) const +{ + return string(bam_get_qname(d_)); +} + +BamRecordImpl& BamRecordImpl::Name(const std::string& name) +{ + // determine change in memory needed + // diffNumBytes: pos -> growing, neg -> shrinking + const size_t numChars = name.size() + 1; // +1 for NULL-term + const int diffNumBytes = numChars - d_->core.l_qname; + const int oldLengthData = d_->l_data; + d_->l_data += diffNumBytes; + MaybeReallocData(); + + // shift trailing data (cigar, seq, qual, tags) as needed + const uint32_t* oldCigarStart = bam_get_cigar(d_); + const size_t trailingDataLength = oldLengthData - ((uint8_t*)oldCigarStart - d_->data); + d_->core.l_qname = numChars; + uint32_t* newCigarStart = bam_get_cigar(d_); + memmove(newCigarStart, oldCigarStart, trailingDataLength); + + // fill in new name + memcpy(d_->data, name.c_str(), numChars); + return *this; +} + +QualityValues BamRecordImpl::Qualities(void) const +{ + if (d_->core.l_qseq == 0) + return QualityValues(); + + uint8_t* qualData = bam_get_qual(d_); + if (qualData[0] == 0xff) + return 
QualityValues(); + + const size_t numQuals = d_->core.l_qseq; + QualityValues result; + result.reserve(numQuals); + for (size_t i = 0; i < numQuals; ++i) + result.push_back(QualityValue(qualData[i])); + return result; +} + +bool BamRecordImpl::RemoveTag(const string& tagName) +{ + const bool removed = RemoveTagImpl(tagName); + if (removed) + UpdateTagMap(); + return removed; +} + +bool BamRecordImpl::RemoveTag(const BamRecordTag tag) +{ + return RemoveTag(internal::BamRecordTags::LabelFor(tag)); +} + +bool BamRecordImpl::RemoveTagImpl(const string &tagName) +{ + if (tagName.size() != 2) + return false; + uint8_t* data = bam_aux_get(d_.get(), tagName.c_str()); + if (data == 0) + return false; + const bool ok = bam_aux_del(d_.get(), data) == 0; + return ok; +} + +string BamRecordImpl::Sequence(void) const +{ + string result; + result.reserve(d_->core.l_qseq); + static const string DnaLookup = string("=ACMGRSVTWYHKDBN"); + const uint8_t* seqData = bam_get_seq(d_); + for (int i = 0; i < d_->core.l_qseq; ++i) + result.append(1, DnaLookup[bam_seqi(seqData, i)]); + return result; +} + +size_t BamRecordImpl::SequenceLength(void) const +{ return d_->core.l_qseq; } + +BamRecordImpl& BamRecordImpl::SetSequenceAndQualities(const std::string& sequence, + const std::string& qualities) +{ + // TODO: I'm ok with the assert for now, but how to handle at runtime? 
+ if (!qualities.empty()) { + PB_ASSERT_OR_RETURN_VALUE(sequence.size() == qualities.size(), *this); + } + + return SetSequenceAndQualitiesInternal(sequence.c_str(), + sequence.size(), + qualities.c_str(), + false); +} + +BamRecordImpl& BamRecordImpl::SetSequenceAndQualities(const char* sequence, + const size_t sequenceLength, + const char* qualities) +{ + return SetSequenceAndQualitiesInternal(sequence, + sequenceLength, + qualities, + false); +} + +BamRecordImpl& BamRecordImpl::SetPreencodedSequenceAndQualities(const char* encodedSequence, + const size_t rawSequenceLength, + const char* qualities) +{ + return SetSequenceAndQualitiesInternal(encodedSequence, + rawSequenceLength, + qualities, + true); +} + +BamRecordImpl& BamRecordImpl::SetSequenceAndQualitiesInternal(const char* sequence, + const size_t sequenceLength, + const char* qualities, + bool isPreencoded) +{ + // determine change in memory needed + // diffNumBytes: pos -> growing, neg -> shrinking + const int encodedSequenceLength = static_cast((sequenceLength+1)/2); + const int oldSeqAndQualLength = static_cast((d_->core.l_qseq+1)/2) + d_->core.l_qseq; // encoded seq + qual + const int newSeqAndQualLength = encodedSequenceLength + sequenceLength; // encoded seq + qual + const int diffNumBytes = newSeqAndQualLength - oldSeqAndQualLength; + const int oldLengthData = d_->l_data; + d_->l_data += diffNumBytes; + MaybeReallocData(); + + // shift trailing data (tags) as needed + const uint8_t* oldTagStart = bam_get_aux(d_); + const size_t trailingDataLength = oldLengthData - ((uint8_t*)oldTagStart - d_->data); + d_->core.l_qseq = sequenceLength; + uint8_t* newTagStart = bam_get_aux(d_); + memmove(newTagStart, oldTagStart, trailingDataLength); + + // fill in new sequence + uint8_t* pEncodedSequence = bam_get_seq(d_); + if (isPreencoded) { + memcpy(pEncodedSequence, sequence, encodedSequenceLength); + } else { + memset(pEncodedSequence, 0, encodedSequenceLength); + for (size_t i = 0; i < sequenceLength; ++i) + 
pEncodedSequence[i>>1] |= seq_nt16_table[(int)sequence[i]] << ((~i&1)<<2); + } + + // fill in quality values + uint8_t* encodedQualities = bam_get_qual(d_); + if ( (qualities == 0 ) || (strlen(qualities) == 0) ) + memset(encodedQualities, 0xff, sequenceLength); + else { + for (size_t i = 0; i < sequenceLength; ++i) + encodedQualities[i] = qualities[i] - 33; // FASTQ ASCII -> int conversion + } + return *this; +} + +int BamRecordImpl::TagOffset(const string& tagName) const +{ + if (tagName.size() != 2) + throw std::runtime_error("invalid tag name size"); + + if (tagOffsets_.empty()) + UpdateTagMap(); + + const uint16_t tagCode = (static_cast(tagName.at(0)) << 8) | static_cast(tagName.at(1)); + const auto found = tagOffsets_.find(tagCode); + return (found != tagOffsets_.cend() ? found->second : -1); +} + +BamRecordImpl& BamRecordImpl::Tags(const TagCollection& tags) +{ + // convert tags to binary + const vector& tagData = BamTagCodec::Encode(tags); + const size_t numBytes = tagData.size(); + const uint8_t* data = tagData.data(); + + // determine change in memory needed + uint8_t* tagStart = bam_get_aux(d_); + const size_t oldNumBytes = d_->l_data - (tagStart - d_->data); + const int diffNumBytes = numBytes - oldNumBytes; + d_->l_data += diffNumBytes; + MaybeReallocData(); + tagStart = bam_get_aux(d_); + + // fill in new tag data + memcpy((void*)tagStart, data, numBytes); + + // update tag info + UpdateTagMap(); + return *this; +} + +TagCollection BamRecordImpl::Tags(void) const +{ + const uint8_t* tagDataStart = bam_get_aux(d_); + const size_t numBytes = d_->l_data - (tagDataStart - d_->data); + return BamTagCodec::Decode(vector(tagDataStart, tagDataStart+numBytes)); +} + +Tag BamRecordImpl::TagValue(const string& tagName) const +{ + if (tagName.size() != 2) + return Tag(); + + const int offset = TagOffset(tagName); + if (offset == -1) + return Tag(); + + bam1_t* b = d_.get(); + assert(bam_get_aux(b)); + uint8_t* tagData = bam_get_aux(b) + offset; + if (offset >= 
b->l_data) + return Tag(); + + // skip tag name + return BamTagCodec::FromRawData(tagData); +} + +Tag BamRecordImpl::TagValue(const BamRecordTag tag) const +{ + return TagValue(internal::BamRecordTags::LabelFor(tag)); +} + +void BamRecordImpl::UpdateTagMap(void) const +{ + // clear out offsets, leave map structure basically intact + auto tagIter = tagOffsets_.begin(); + auto tagEnd = tagOffsets_.end(); + for ( ; tagIter != tagEnd; ++tagIter ) + tagIter->second = -1; + + const uint8_t* tagStart = bam_get_aux(d_); + if (tagStart == 0) + return; + const ptrdiff_t numBytes = d_->l_data - (tagStart - d_->data); + + // NOTE: using a 16-bit 'code' for tag name here instead of string, to avoid + // a lot of string constructions & comparisons. All valid tags will be 2 chars + // anyway, so this should be a nice lookup mechanism. + // + uint16_t tagNameCode; + int64_t i = 0; + while(i < numBytes) { + + // store (tag name code -> start offset into tag data) + tagNameCode = static_cast(tagStart[i]) << 8 | static_cast(tagStart[i+1]); + i += 2; + tagOffsets_[tagNameCode] = i; + + // skip tag contents + const char tagType = static_cast(tagStart[i++]); + switch (tagType) { + case 'A' : + case 'a' : + case 'c' : + case 'C' : + { + i += 1; + break; + } + case 's' : + case 'S' : + { + i += 2; + break; + } + case 'i' : + case 'I' : + case 'f' : + { + i += 4; + break; + } + + case 'Z' : + case 'H' : + { + // null-terminated string + i += strlen((const char*)&tagStart[i]) + 1; + break; + } + + case 'B' : + { + const char subTagType = tagStart[i++]; + size_t elementSize = 0; + switch (subTagType) { + case 'c' : + case 'C' : elementSize = 1; break; + case 's' : + case 'S' : elementSize = 2; break; + case 'i' : + case 'I' : + case 'f' : elementSize = 4; break; + + // unknown subTagType + default: + PB_ASSERT_OR_RETURN(false); + } + + uint32_t numElements = 0; + memcpy(&numElements, &tagStart[i], sizeof(uint32_t)); + i += (4 + (elementSize * numElements)); + break; + } + + // unknown 
tagType + default: + PB_ASSERT_OR_RETURN(false); + } + } +} diff --git a/src/BamRecordTags.cpp b/src/BamRecordTags.cpp new file mode 100644 index 0000000..c039aae --- /dev/null +++ b/src/BamRecordTags.cpp @@ -0,0 +1,99 @@ +// Copyright (c) 2016, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file BamRecordTags.h +/// \brief Implements the BamRecordTags utility class. +// +// Author: Derek Barnett + +#include "BamRecordTags.h" +#include "EnumClassHash.h" +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace PacBio::BAM::internal; +using namespace std; + +namespace PacBio { +namespace BAM { +namespace internal { + +const BamRecordTags::TagLookupType BamRecordTags::tagLookup = +{ + // enum name label isPulse? 
+ // --------- ----- -------- + { BamRecordTag::ALT_LABEL_QV, {"pv", true} }, + { BamRecordTag::ALT_LABEL_TAG, {"pt", true} }, + { BamRecordTag::BARCODE_QUALITY, {"bq", false} }, + { BamRecordTag::BARCODES, {"bc", false} }, + { BamRecordTag::CONTEXT_FLAGS, {"cx", false} }, + { BamRecordTag::DELETION_QV, {"dq", false} }, + { BamRecordTag::DELETION_TAG, {"dt", false} }, + { BamRecordTag::HOLE_NUMBER, {"zm", false} }, + { BamRecordTag::INSERTION_QV, {"iq", false} }, + { BamRecordTag::IPD, {"ip", false} }, + { BamRecordTag::LABEL_QV, {"pq", true} }, + { BamRecordTag::MERGE_QV, {"mq", false} }, + { BamRecordTag::NUM_PASSES, {"np", false} }, + { BamRecordTag::PKMEAN, {"pa", true} }, + { BamRecordTag::PKMEAN_2, {"ps", true} }, + { BamRecordTag::PKMID, {"pm", true} }, + { BamRecordTag::PKMID_2, {"pi", true} }, + { BamRecordTag::PRE_PULSE_FRAMES, {"pd", true} }, + { BamRecordTag::PULSE_CALL, {"pc", true} }, + { BamRecordTag::PULSE_CALL_WIDTH, {"px", true} }, + { BamRecordTag::PULSE_MERGE_QV, {"pg", true} }, + { BamRecordTag::PULSE_WIDTH, {"pw", false} }, // 'pulse' in the name; but stored per-base, not per-pulse + { BamRecordTag::QUERY_END, {"qe", false} }, + { BamRecordTag::QUERY_START, {"qs", false} }, + { BamRecordTag::READ_ACCURACY, {"rq", false} }, + { BamRecordTag::READ_GROUP, {"RG", false} }, + { BamRecordTag::SCRAP_REGION_TYPE, {"sc", false} }, + { BamRecordTag::SCRAP_ZMW_TYPE, {"sz", false} }, + { BamRecordTag::SNR, {"sn", false} }, + { BamRecordTag::START_FRAME, {"sf", true} }, + { BamRecordTag::SUBSTITUTION_QV, {"sq", false} }, + { BamRecordTag::SUBSTITUTION_TAG, {"st", false} }, + + // faux tags + { BamRecordTag::SEQ, {" ", false} }, + { BamRecordTag::QUAL, {" ", false} } +}; + +} // namespace internal +} // namespace BAM +} // namespace PacBio diff --git a/src/BamRecordTags.h b/src/BamRecordTags.h new file mode 100644 index 0000000..002142a --- /dev/null +++ b/src/BamRecordTags.h @@ -0,0 +1,93 @@ +// Copyright (c) 2016, Pacific Biosciences of California, Inc. 
+// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file BamRecordTags.h +/// \brief Defines the BamRecordTags utility class. 
+// +// Author: Derek Barnett + +#ifndef BAMRECORDTAGS_H +#define BAMRECORDTAGS_H + +#include "pbbam/BamRecord.h" +#include "pbbam/BamRecordImpl.h" +#include "pbbam/BamRecordTag.h" +#include "EnumClassHash.h" +#include +#include +#include + +namespace PacBio { +namespace BAM { +namespace internal { + +class BamRecordTags +{ +public: + // tag info + static inline bool IsPulse(const BamRecordTag tag); + static inline std::string LabelFor(const BamRecordTag tag); + +private: + struct BamRecordTagData + { + const std::string label_; //[3]; // 2-char tag plus NULL + const bool isPulse_; + }; + typedef std::unordered_map TagLookupType; + + static const TagLookupType tagLookup; +}; + +inline bool BamRecordTags::IsPulse(const BamRecordTag tag) +{ + assert(tagLookup.find(tag) != tagLookup.cend()); + return tagLookup.at(tag).isPulse_; +} + +inline std::string BamRecordTags::LabelFor(const BamRecordTag tag) +{ + assert(tagLookup.find(tag) != tagLookup.cend()); + return tagLookup.at(tag).label_; +} + +} // namespace internal +} // namespace BAM +} // namespace PacBio + +#endif // BAMRECORDTAGS_H diff --git a/src/BamTagCodec.cpp b/src/BamTagCodec.cpp new file mode 100644 index 0000000..fca2cbe --- /dev/null +++ b/src/BamTagCodec.cpp @@ -0,0 +1,515 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. 
+// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file BamTagCodec.cpp +/// \brief Implements the BamTagCodec class. 
+// +// Author: Derek Barnett + +#include "pbbam/BamTagCodec.h" +#include "AssertUtils.h" +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace std; + +template +inline void appendBamValue(const T& value, kstring_t* str) +{ + kputsn_((char*)&value, sizeof(value), str); +} + +template +inline void appendBamMultiValue(const vector& container, kstring_t* str) +{ + const uint32_t n = container.size(); + kputsn_(&n, sizeof(n), str); + kputsn_((char*)&container[0], n*sizeof(T), str); +} + +template +inline T readBamValue(const uint8_t* src, size_t& offset) +{ + T value; + memcpy(&value, &src[offset], sizeof(value)); + offset += sizeof(value); + return value; +} + +template +vector readBamMultiValue(const uint8_t* src, size_t& offset) +{ + uint32_t numElements; + memcpy(&numElements, &src[offset], sizeof(uint32_t)); + offset += 4; + + vector result; + result.reserve(numElements); + for (size_t i = 0; i < numElements; ++i) { + const T& value = readBamValue(src, offset); + result.push_back(value); + } + return result; +} + +TagCollection BamTagCodec::Decode(const vector& data) +{ + TagCollection tags; + + // NOTE: not completely safe - no real bounds-checking yet on input data + + const uint8_t* pData = data.data(); + const size_t numBytes = data.size(); + size_t i = 0; + while (i < numBytes) { + + string tagName; + tagName.reserve(2); + tagName.append(1, pData[i++]); + tagName.append(1, pData[i++]); + + const char tagType = static_cast(pData[i++]); + switch (tagType) { + case 'A' : + case 'a' : + { + tags[tagName] = readBamValue(pData, i); + tags[tagName].Modifier(TagModifier::ASCII_CHAR); + break; + } + + case 'c' : tags[tagName] = readBamValue(pData, i); break; + case 'C' : tags[tagName] = readBamValue(pData, i); break; + case 's' : tags[tagName] = readBamValue(pData, i); break; + case 'S' : tags[tagName] = readBamValue(pData, i); break; + case 'i' : tags[tagName] = readBamValue(pData, i); break; + case 'I' : tags[tagName] = 
readBamValue(pData, i); break; + case 'f' : tags[tagName] = readBamValue(pData, i); break; + + case 'Z' : + case 'H' : + { + const size_t dataLength = strlen((const char*)&pData[i]); + string value; + value.resize(dataLength); + memcpy((char*)value.data(), &pData[i], dataLength); + tags[tagName] = value; + if (tagType == 'H') + tags[tagName].Modifier(TagModifier::HEX_STRING); + i += dataLength + 1; + break; + } + + case 'B' : + { + const char subTagType = pData[i++]; + switch (subTagType) { + case 'c' : tags[tagName] = readBamMultiValue(pData, i); break; + case 'C' : tags[tagName] = readBamMultiValue(pData, i); break; + case 's' : tags[tagName] = readBamMultiValue(pData, i); break; + case 'S' : tags[tagName] = readBamMultiValue(pData, i); break; + case 'i' : tags[tagName] = readBamMultiValue(pData, i); break; + case 'I' : tags[tagName] = readBamMultiValue(pData, i); break; + case 'f' : tags[tagName] = readBamMultiValue(pData, i); break; + + // unknown subTagType + default: + PB_ASSERT_OR_RETURN_VALUE(false, TagCollection()); + } + break; + } + + // unknown tagType + default: + PB_ASSERT_OR_RETURN_VALUE(false, TagCollection()); + } + } + + return tags; +} + +vector BamTagCodec::Encode(const TagCollection& tags) +{ + kstring_t str = { 0, 0, NULL }; + + const auto tagEnd = tags.cend(); + for (auto tagIter = tags.cbegin(); tagIter != tagEnd; ++tagIter) { + const string& name = (*tagIter).first; + const Tag& tag = (*tagIter).second; + PB_ASSERT_OR_CONTINUE(name.size() == 2); + if (tag.IsNull()) + continue; + + // ":" + kputsn_(name.c_str(), 2, &str); + + // ":" for printable, ASCII char + if (tag.HasModifier(TagModifier::ASCII_CHAR)) { + char c = tag.ToAscii(); + if (c != '\0') { + kputc_('A', &str); + kputc_(c, &str); + continue; + } + } + + // ":" for all other data + switch ( tag.Type() ) { + case TagDataType::INT8 : + { + kputc_('c', &str); + appendBamValue(tag.ToInt8(), &str); + break; + } + case TagDataType::UINT8 : + { + kputc_('C', &str); + 
appendBamValue(tag.ToUInt8(), &str); + break; + } + case TagDataType::INT16 : + { + kputc_('s', &str); + appendBamValue(tag.ToInt16(), &str); + break; + } + case TagDataType::UINT16 : + { + kputc_('S', &str); + appendBamValue(tag.ToUInt16(), &str); + break; + } + case TagDataType::INT32 : + { + kputc_('i', &str); + appendBamValue(tag.ToInt32(), &str); + break; + } + case TagDataType::UINT32 : + { + kputc_('I', &str); + appendBamValue(tag.ToUInt32(), &str); + break; + } + case TagDataType::FLOAT : + { + kputc_('f', &str); + appendBamValue(tag.ToFloat(), &str); + break; + } + + case TagDataType::STRING : + { + if (tag.HasModifier(TagModifier::HEX_STRING)) + kputc_('H', &str); + else + kputc_('Z', &str); + const string& s = tag.ToString(); + kputsn_(s.c_str(), s.size()+1, &str); // this adds the null-term + break; + } + + case TagDataType::INT8_ARRAY : + { + kputc_('B', &str); + kputc_('c', &str); + appendBamMultiValue(tag.ToInt8Array(), &str); + break; + } + case TagDataType::UINT8_ARRAY : + { + kputc_('B', &str); + kputc_('C', &str); + appendBamMultiValue(tag.ToUInt8Array(), &str); + break; + } + case TagDataType::INT16_ARRAY : + { + kputc_('B', &str); + kputc_('s', &str); + appendBamMultiValue(tag.ToInt16Array(), &str); + break; + } + case TagDataType::UINT16_ARRAY : + { + kputc_('B', &str); + kputc_('S', &str); + appendBamMultiValue(tag.ToUInt16Array(), &str); + break; + } + case TagDataType::INT32_ARRAY : + { + kputc_('B', &str); + kputc_('i', &str); + appendBamMultiValue(tag.ToInt32Array(), &str); + break; + } + case TagDataType::UINT32_ARRAY : + { + kputc_('B', &str); + kputc_('I', &str); + appendBamMultiValue(tag.ToUInt32Array(), &str); + break; + } + case TagDataType::FLOAT_ARRAY : + { + kputc_('B', &str); + kputc_('f', &str); + appendBamMultiValue(tag.ToFloatArray(), &str); + break; + } + + // unsupported tag type + default : + free(str.s); + PB_ASSERT_OR_RETURN_VALUE(false, vector()); + } + } + + vector result; + result.resize(str.l); + 
memcpy((char*)&result[0], str.s, str.l); + free(str.s); + return result; +} + +Tag BamTagCodec::FromRawData(uint8_t* rawData) +{ + size_t offset = 0; + const char tagType = static_cast(*rawData++); + switch (tagType) { + case 'A' : + case 'a' : + { + Tag t = Tag(readBamValue(rawData, offset)); + t.Modifier(TagModifier::ASCII_CHAR); + return t; + } + + case 'c' : return Tag(readBamValue(rawData, offset)); + case 'C' : return Tag(readBamValue(rawData, offset)); + case 's' : return Tag(readBamValue(rawData, offset)); + case 'S' : return Tag(readBamValue(rawData, offset)); + case 'i' : return Tag(readBamValue(rawData, offset)); + case 'I' : return Tag(readBamValue(rawData, offset)); + case 'f' : return Tag(readBamValue(rawData, offset)); + + case 'Z' : + case 'H' : + { + const size_t dataLength = strlen((const char*)&rawData[0]); + string value; + value.resize(dataLength); + memcpy((char*)value.data(), &rawData[0], dataLength); + Tag t(value); + if (tagType == 'H') + t.Modifier(TagModifier::HEX_STRING); + return t; + } + + case 'B' : + { + const char subTagType = *rawData++; + switch (subTagType) { + + case 'c' : return Tag(readBamMultiValue(rawData, offset)); + case 'C' : return Tag(readBamMultiValue(rawData, offset)); + case 's' : return Tag(readBamMultiValue(rawData, offset)); + case 'S' : return Tag(readBamMultiValue(rawData, offset)); + case 'i' : return Tag(readBamMultiValue(rawData, offset)); + case 'I' : return Tag(readBamMultiValue(rawData, offset)); + case 'f' : return Tag(readBamMultiValue(rawData, offset)); + + // unknown subTagType + default: + PB_ASSERT_OR_RETURN_VALUE(false, Tag()); + } + break; + } + + // unknown tagType + default: + PB_ASSERT_OR_RETURN_VALUE(false, Tag()); + } + return Tag(); // to avoid compiler warning +} + +vector BamTagCodec::ToRawData(const Tag& tag, + const TagModifier& additionalModifier) +{ + // temp raw data destination (for use with htslib methods) + kstring_t str = { 0, 0, NULL }; + + // ":" for printable, ASCII char + if 
(tag.HasModifier(TagModifier::ASCII_CHAR) || additionalModifier == TagModifier::ASCII_CHAR) { + const char c = tag.ToAscii(); + if (c != '\0') + kputc_(c, &str); + } + + // for all others + else { + switch (tag.Type()) { + + // single, numeric values + case TagDataType::INT8 : appendBamValue(tag.ToInt8(), &str); break; + case TagDataType::UINT8 : appendBamValue(tag.ToUInt8(), &str); break; + case TagDataType::INT16 : appendBamValue(tag.ToInt16(), &str); break; + case TagDataType::UINT16 : appendBamValue(tag.ToUInt16(), &str); break; + case TagDataType::INT32 : appendBamValue(tag.ToInt32(), &str); break; + case TagDataType::UINT32 : appendBamValue(tag.ToUInt32(), &str); break; + case TagDataType::FLOAT : appendBamValue(tag.ToFloat(), &str); break; + + // string & hex-string values + case TagDataType::STRING : + { + const string& s = tag.ToString(); + kputsn_(s.c_str(), s.size()+1, &str); // this adds the null-term + break; + } + + // array-type values + case TagDataType::INT8_ARRAY : + { + kputc_('c', &str); + appendBamMultiValue(tag.ToInt8Array(), &str); + break; + } + case TagDataType::UINT8_ARRAY : + { + kputc_('C', &str); + appendBamMultiValue(tag.ToUInt8Array(), &str); + break; + } + case TagDataType::INT16_ARRAY : + { + kputc_('s', &str); + appendBamMultiValue(tag.ToInt16Array(), &str); + break; + } + case TagDataType::UINT16_ARRAY : + { + kputc_('S', &str); + appendBamMultiValue(tag.ToUInt16Array(), &str); + break; + } + case TagDataType::INT32_ARRAY : + { + kputc_('i', &str); + appendBamMultiValue(tag.ToInt32Array(), &str); + break; + } + case TagDataType::UINT32_ARRAY : + { + kputc_('I', &str); + appendBamMultiValue(tag.ToUInt32Array(), &str); + break; + } + case TagDataType::FLOAT_ARRAY : + { + kputc_('f', &str); + appendBamMultiValue(tag.ToFloatArray(), &str); + break; + } + + // unsupported tag type + default : + free(str.s); + PB_ASSERT_OR_RETURN_VALUE(false, vector()); + } + } + + // store temp contents in actual destination + vector result; + 
result.resize(str.l); + memcpy((char*)&result[0], str.s, str.l); + free(str.s); + return result; +} + +uint8_t BamTagCodec::TagTypeCode(const Tag& tag, + const TagModifier& additionalModifier) +{ + if (tag.HasModifier(TagModifier::ASCII_CHAR) || additionalModifier == TagModifier::ASCII_CHAR) { + int64_t value = 0; + switch (tag.Type()) { + case TagDataType::INT8 : value = static_cast(tag.ToInt8()); break; + case TagDataType::UINT8 : value = static_cast(tag.ToUInt8()); break; + case TagDataType::INT16 : value = static_cast(tag.ToInt16()); break; + case TagDataType::UINT16 : value = static_cast(tag.ToUInt16()); break; + case TagDataType::INT32 : value = static_cast(tag.ToInt32()); break; + case TagDataType::UINT32 : value = static_cast(tag.ToUInt32()); break; + default: + // non integers not allowed + PB_ASSERT_OR_RETURN_VALUE(false, 0); + } + // printable range + PB_ASSERT_OR_RETURN_VALUE(value >= 33, 0); + PB_ASSERT_OR_RETURN_VALUE(value <= 126, 0); + return static_cast('A'); + } + + switch (tag.Type()) { + case TagDataType::INT8 : return static_cast('c'); + case TagDataType::UINT8 : return static_cast('C'); + case TagDataType::INT16 : return static_cast('s'); + case TagDataType::UINT16 : return static_cast('S'); + case TagDataType::INT32 : return static_cast('i'); + case TagDataType::UINT32 : return static_cast('I'); + case TagDataType::FLOAT : return static_cast('f'); + + case TagDataType::STRING : + { + if (tag.HasModifier(TagModifier::HEX_STRING) || additionalModifier == TagModifier::HEX_STRING) + return static_cast('H'); + else + return static_cast('Z'); + } + + case TagDataType::INT8_ARRAY : // fall through + case TagDataType::UINT8_ARRAY : // . + case TagDataType::INT16_ARRAY : // . + case TagDataType::UINT16_ARRAY : // . + case TagDataType::INT32_ARRAY : // . + case TagDataType::UINT32_ARRAY : // . 
+ case TagDataType::FLOAT_ARRAY : return static_cast('B'); + + default: + PB_ASSERT_OR_RETURN_VALUE(false, 0); + } + return 0; // to avoid compiler warning +} diff --git a/src/BamWriter.cpp b/src/BamWriter.cpp new file mode 100644 index 0000000..f7d8400 --- /dev/null +++ b/src/BamWriter.cpp @@ -0,0 +1,200 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. + +// Author: Derek Barnett + +#include "pbbam/BamWriter.h" +#include "pbbam/BamFile.h" +#include "pbbam/Validator.h" +#include "AssertUtils.h" +#include "FileProducer.h" +#include "MemoryUtils.h" +#include +#include +#include +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace std; + +namespace PacBio { +namespace BAM { +namespace internal { + +class BamWriterPrivate : public internal::FileProducer +{ +public: + BamWriterPrivate(const std::string& filename, + const PBBAM_SHARED_PTR rawHeader, + const BamWriter::CompressionLevel compressionLevel, + const size_t numThreads, + const BamWriter::BinCalculationMode binCalculationMode); + +public: + void Write(const BamRecord& record); + void Write(const BamRecord& record, int64_t* vOffset); + void Write(const BamRecordImpl& recordImpl); + +public: + bool calculateBins_; + std::unique_ptr file_; + PBBAM_SHARED_PTR header_; +}; + +BamWriterPrivate::BamWriterPrivate(const string& filename, + const PBBAM_SHARED_PTR rawHeader, + const BamWriter::CompressionLevel compressionLevel, + const size_t numThreads, + const BamWriter::BinCalculationMode binCalculationMode) + : internal::FileProducer(filename) + , calculateBins_(binCalculationMode == BamWriter::BinCalculation_ON) + , file_(nullptr) + , header_(rawHeader) +{ + if (!header_) + throw std::runtime_error("null header"); + + // open file + const string& usingFilename = TempFilename(); + const 
string& mode = string("wb") + to_string(static_cast(compressionLevel)); + file_.reset(sam_open(usingFilename.c_str(), mode.c_str())); + if (!file_) + throw std::runtime_error("could not open file for writing"); + + // if no explicit thread count given, attempt built-in check + size_t actualNumThreads = numThreads; + if (actualNumThreads == 0) { + actualNumThreads = thread::hardware_concurrency(); + + // if still unknown, default to single-threaded + if (actualNumThreads == 0) + actualNumThreads = 1; + } + + // if multithreading requested, enable it + if (actualNumThreads > 1) + hts_set_threads(file_.get(), actualNumThreads); + + // write header + const int ret = sam_hdr_write(file_.get(), header_.get()); + if (ret != 0) + throw std::runtime_error("could not write header"); +} + +void BamWriterPrivate::Write(const BamRecord& record) +{ +#if PBBAM_AUTOVALIDATE + Validator::Validate(record); +#endif + + const auto rawRecord = internal::BamRecordMemory::GetRawData(record); + + // (probably) store bins + // min_shift=14 & n_lvls=5 are BAM "magic numbers" + if (calculateBins_) + rawRecord->core.bin = hts_reg2bin(rawRecord->core.pos, bam_endpos(rawRecord.get()), 14, 5); + + // write record to file + const int ret = sam_write1(file_.get(), header_.get(), rawRecord.get()); + if (ret <= 0) + throw std::runtime_error("could not write record"); +} + +void BamWriterPrivate::Write(const BamRecord& record, int64_t* vOffset) +{ + BGZF* bgzf = file_.get()->fp.bgzf; + assert(bgzf); + assert(vOffset); + + // ensure offsets up-to-date + bgzf_flush(bgzf); + + // capture virtual offset where we’re about to write + const off_t rawTell = htell(bgzf->fp); + const int length = bgzf->block_offset; + *vOffset = (rawTell << 16) | length ; + + // now write data + Write(record); +} + +inline void BamWriterPrivate::Write(const BamRecordImpl& recordImpl) +{ Write(BamRecord(recordImpl)); } + +} // namespace internal +} // namespace BAM +} // namespace PacBio + +BamWriter::BamWriter(const 
std::string& filename, + const BamHeader& header, + const BamWriter::CompressionLevel compressionLevel, + const size_t numThreads, + const BinCalculationMode binCalculationMode) + : IRecordWriter() + , d_(nullptr) +{ +#if PBBAM_AUTOVALIDATE + Validator::Validate(header); +#endif + d_.reset(new internal::BamWriterPrivate{ filename, + internal::BamHeaderMemory::MakeRawHeader(header), + compressionLevel, + numThreads, + binCalculationMode + }); +} + +BamWriter::~BamWriter(void) +{ + bgzf_flush(d_->file_.get()->fp.bgzf); +} + +void BamWriter::TryFlush(void) +{ + // TODO: sanity checks on file_ & fp + const int ret = bgzf_flush(d_->file_.get()->fp.bgzf); + if (ret != 0) + throw std::runtime_error("could not flush output buffer contents"); +} + +void BamWriter::Write(const BamRecord& record) +{ d_->Write(record); } + +void BamWriter::Write(const BamRecord& record, int64_t* vOffset) +{ d_->Write(record, vOffset); } + +void BamWriter::Write(const BamRecordImpl& recordImpl) +{ d_->Write(recordImpl); } diff --git a/src/BarcodeQuery.cpp b/src/BarcodeQuery.cpp new file mode 100644 index 0000000..47af230 --- /dev/null +++ b/src/BarcodeQuery.cpp @@ -0,0 +1,68 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. 
+// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file BarcodeQuery.cpp +/// \brief Implements the BarcodeQuery class. 
+// +// Author: Derek Barnett + +#include "pbbam/BarcodeQuery.h" +#include "pbbam/PbiFilterTypes.h" +#include "pbbam/CompositeBamReader.h" +using namespace PacBio; +using namespace PacBio::BAM; +using namespace PacBio::BAM::internal; +using namespace std; + +struct BarcodeQuery::BarcodeQueryPrivate +{ + BarcodeQueryPrivate(const int16_t barcode, const DataSet& dataset) + : reader_(PbiBarcodeFilter(barcode), dataset) + { } + + PbiFilterCompositeBamReader reader_; // unsorted +}; + +BarcodeQuery::BarcodeQuery(const int16_t barcode, + const DataSet& dataset) + : internal::IQuery() + , d_(new BarcodeQueryPrivate(barcode, dataset)) +{ } + +BarcodeQuery::~BarcodeQuery(void) { } + +bool BarcodeQuery::GetNext(BamRecord &r) +{ return d_->reader_.GetNext(r); } diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt new file mode 100644 index 0000000..4b0b2dc --- /dev/null +++ b/src/CMakeLists.txt @@ -0,0 +1,89 @@ + +# grab library source files +include(files.cmake) +set(SOURCES + ${PacBioBAM_H} + ${PacBioBAM_CPP} +) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${PacBioBAM_CXX_FLAGS}") + +# define actual library +add_library(pbbam ${SOURCES}) + +# library properties +target_compile_definitions(pbbam + PRIVATE "-DPBBAM_LIBRARY" +) +set_target_properties(pbbam PROPERTIES + ARCHIVE_OUTPUT_DIRECTORY ${PacBioBAM_LibDir} + RUNTIME_OUTPUT_DIRECTORY ${PacBioBAM_LibDir} + LIBRARY_OUTPUT_DIRECTORY ${PacBioBAM_LibDir} +) + +if(PacBioBAM_wrap_r) + # SWIG R does not support std::shared_ptr, but it does support boost::shared_ptr + # So force boost if we're wrapping for R. 
+ target_compile_definitions(pbbam + PUBLIC -DPBBAM_USE_BOOST_SHARED_PTR + ) +endif() + +if(PacBioBAM_auto_validate) + target_compile_definitions(pbbam + PUBLIC "-DPBBAM_AUTOVALIDATE=1" + ) +endif() + +# pbbam includes +target_include_directories(pbbam + PUBLIC + ${PacBioBAM_IncludeDir} + ${HTSLIB_INCLUDE_DIRS} + ${Boost_INCLUDE_DIRS} + ${ZLIB_INCLUDE_DIRS} +) + +# set link dependencies +# if htslib provided externally +if(HTSLIB_LIBRARIES) + set(pbbam_all_dependency_libs + ${HTSLIB_LIBRARIES} + ${ZLIB_LIBRARIES} + ${SOCKET_LIBRARIES} + ${CMAKE_THREAD_LIBS_INIT} + ) +# otherwise, use the "in-project" htslib target +else() + set(pbbam_all_dependency_libs + $ + ${ZLIB_LIBRARIES} + ${SOCKET_LIBRARIES} + ${CMAKE_THREAD_LIBS_INIT} + ) +endif() + +target_link_libraries(pbbam + PUBLIC + ${pbbam_all_dependency_libs} +) + +# define include paths for projects that use pbbam +set(PacBioBAM_INCLUDE_DIRS + ${PacBioBAM_IncludeDir} + ${HTSLIB_INCLUDE_DIRS} + ${Boost_INCLUDE_DIRS} + ${ZLIB_INCLUDE_DIRS} + CACHE INTERNAL + "${PROJECT_NAME}: Include Directories" + FORCE +) +set(PacBioBAM_LIBRARIES + $ + ${pbbam_all_dependency_libs} + CACHE INTERNAL + "${PROJECT_NAME}: Libraries" + FORCE +) + +# add SWIG directory +add_subdirectory(swig) diff --git a/src/ChemistryTable.cpp b/src/ChemistryTable.cpp new file mode 100644 index 0000000..fbc161f --- /dev/null +++ b/src/ChemistryTable.cpp @@ -0,0 +1,85 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. 
+ +// Author: Lance Hepler + +#include "ChemistryTable.h" + +namespace PacBio { +namespace BAM { +namespace internal { + +extern const std::vector> ChemistryTable = { + + // BindingKit, SequencingKit, BasecallerVersion, Chemistry + + // RS + {{"100356300", "100356200", "2.1", "P6-C4"}}, + {{"100356300", "100356200", "2.3", "P6-C4"}}, + {{"100356300", "100612400", "2.1", "P6-C4"}}, + {{"100356300", "100612400", "2.3", "P6-C4"}}, + {{"100372700", "100356200", "2.1", "P6-C4"}}, + {{"100372700", "100356200", "2.3", "P6-C4"}}, + {{"100372700", "100612400", "2.1", "P6-C4"}}, + {{"100372700", "100612400", "2.3", "P6-C4"}}, + + // 3.0 ("Dromedary"): S/P1-C1/beta + {{"100-619-300", "100-620-000", "3.0", "S/P1-C1/beta"}}, + {{"100-619-300", "100-620-000", "3.1", "S/P1-C1/beta"}}, + + // 3.1 ("Echidna"): S/P1-C1.1 + {{"100-619-300", "100-867-300", "3.1", "S/P1-C1.1"}}, + {{"100-619-300", "100-867-300", "3.2", "S/P1-C1.1"}}, + {{"100-619-300", "100-867-300", "3.3", "S/P1-C1.1"}}, + + // 3.1.1 ("Flea"): S/P1-C1.2 + {{"100-619-300", "100-902-100", "3.1", "S/P1-C1.2"}}, + {{"100-619-300", "100-902-100", "3.2", "S/P1-C1.2"}}, + {{"100-619-300", "100-902-100", "3.3", "S/P1-C1.2"}}, + {{"100-619-300", "100-902-100", "4.0", "S/P1-C1.2"}}, + + // 3.2 ("Goat"): S/P1-C1.3 + {{"100-619-300", "100-972-200", "3.2", "S/P1-C1.3"}}, + {{"100-619-300", "100-972-200", "3.3", "S/P1-C1.3"}}, + {{"100-619-300", "100-972-200", "4.0", "S/P1-C1.3"}}, + + // 4.0 ("Seabiscuit"); S/P2-C2 + {{"100-862-200", "100-861-800", "4.0", "S/P2-C2"}} + +}; + +} // namespace internal +} // namespace BAM +} // namespace PacBio diff --git a/src/ChemistryTable.h b/src/ChemistryTable.h new file mode 100644 index 0000000..6caacaa --- /dev/null +++ b/src/ChemistryTable.h @@ -0,0 +1,55 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. 
+
+// Author: Lance Hepler
+
+#ifndef CHEMISTRYTABLE_H
+#define CHEMISTRYTABLE_H
+
+// NOTE(review): the three header names and the template arguments below were
+// lost in this copy of the patch; they are reconstructed from the types the
+// declaration needs - confirm against upstream.
+#include <array>
+#include <string>
+#include <vector>
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// Table of known chemistries; defined in ChemistryTable.cpp.
+extern const std::vector<std::array<std::string, 4>> ChemistryTable;
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
+
+#endif // CHEMISTRYTABLE_H
diff --git a/src/Cigar.cpp b/src/Cigar.cpp
new file mode 100644
index 0000000..f099f54
--- /dev/null
+++ b/src/Cigar.cpp
@@ -0,0 +1,74 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED.
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file Cigar.cpp
+/// \brief Implements the Cigar class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/Cigar.h"
+#include <cctype>
+#include <sstream>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+/// Builds the operation list from a CIGAR string such as "10M2I5D".
+///
+/// Every run of digits that is followed by a non-digit character becomes one
+/// CigarOperation(opChar, length). Digits that are not followed by an
+/// operation character (a trailing "12", say) produce no operation.
+///
+/// \throws whatever std::stoul throws (e.g. std::invalid_argument) when an
+///         operation character has no digits in front of it
+///
+// NOTE(review): the base-class template argument was lost in this copy of
+// the patch ("vector()"); vector<CigarOperation> is reconstructed from the
+// push_back() call below - confirm against pbbam/Cigar.h.
+Cigar::Cigar(const string& cigarString)
+    : vector<CigarOperation>()
+{
+    size_t numberStart = 0;
+    const size_t numChars = cigarString.size();
+    for (size_t i = 0; i < numChars; ++i) {
+        const char c = cigarString[i];
+
+        // isdigit() is undefined for negative values other than EOF, so a
+        // byte >= 0x80 must be passed as unsigned char
+        if (isdigit(static_cast<unsigned char>(c)))
+            continue;
+
+        // [numberStart, i) is the length, c is the operation character
+        const size_t distance = i - numberStart;
+        const uint32_t length =
+            static_cast<uint32_t>(stoul(cigarString.substr(numberStart, distance)));
+        push_back(CigarOperation(c, length));
+        numberStart = i+1;
+    }
+}
+
+/// Returns the CIGAR string form: "<length><op char>" for every operation,
+/// concatenated in order. An empty Cigar yields an empty string.
+string Cigar::ToStdString(void) const
+{
+    ostringstream s;
+    for (const CigarOperation& op : *this)
+        s << op.Length() << op.Char();
+    return s.str();
+}
diff --git a/src/CigarOperation.cpp b/src/CigarOperation.cpp
new file mode 100644
index 0000000..a207a58
--- /dev/null
+++ b/src/CigarOperation.cpp
@@ -0,0 +1,67 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file CigarOperation.cpp +/// \brief Implements the CigarOperation class. 
+//
+// Author: Derek Barnett
+
+#include "pbbam/CigarOperation.h"
+// NOTE(review): this header name was lost in this copy of the patch; htslib's
+// sam.h is reconstructed from the use of bam_cigar_opchr() - confirm upstream.
+#include <htslib/sam.h>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+/// Maps a CIGAR operation character to its enum value.
+///
+/// \returns CigarOperationType::UNKNOWN_OP for any character that is not one
+///          of the nine listed below
+CigarOperationType CigarOperation::CharToType(const char c)
+{
+    switch(c)
+    {
+        case 'S' : return CigarOperationType::SOFT_CLIP;
+        case '=' : return CigarOperationType::SEQUENCE_MATCH;
+        case 'X' : return CigarOperationType::SEQUENCE_MISMATCH;
+        case 'I' : return CigarOperationType::INSERTION;
+        case 'D' : return CigarOperationType::DELETION;
+        case 'N' : return CigarOperationType::REFERENCE_SKIP;
+        case 'H' : return CigarOperationType::HARD_CLIP;
+        case 'P' : return CigarOperationType::PADDING;
+        case 'M' : return CigarOperationType::ALIGNMENT_MATCH;
+        default:
+            return CigarOperationType::UNKNOWN_OP;
+    }
+}
+
+/// Maps an operation type back to its character via bam_cigar_opchr().
+///
+// NOTE(review): the cast target was lost in this copy of the patch
+// ("static_cast(type)"); int is assumed - confirm upstream.
+char CigarOperation::TypeToChar(const CigarOperationType type)
+{ return bam_cigar_opchr(static_cast<int>(type)); }
diff --git a/src/Compare.cpp b/src/Compare.cpp
new file mode 100644
index 0000000..43874f2
--- /dev/null
+++ b/src/Compare.cpp
@@ -0,0 +1,141 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file Compare.cpp
+/// \brief Implements the Compare class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/Compare.h"
+#include <functional>
+#include <stdexcept>
+#include <unordered_map>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// The three printable forms of one comparison type: enum name, symbolic
+// operator and alphabetic operator.
+struct TypeAlias
+{
+    string name_;
+    string op_;
+    string opAlpha_;
+
+    TypeAlias(const string& name = string(),
+              const string& op = string(),
+              const string& opAlpha = string())
+        : name_(name)
+        , op_(op)
+        , opAlpha_(opAlpha)
+    { }
+};
+
+// Hash functor so Compare::Type can be used as an unordered_map key.
+//
+// NOTE(review): the template arguments in this file were lost in this copy of
+// the patch; int (here) and the map key/value types (below) are reconstructed
+// from how the objects are used - confirm upstream.
+struct CompareTypeHash
+{
+    size_t operator()(const Compare::Type& t) const
+    { return std::hash<int>()(static_cast<int>(t)); }
+};
+
+// Operator text -> comparison type.
+//
+// NOTE(review): this copy of the patch listed "<", "<=", ">" and ">=" twice
+// each. The second occurrence is taken to be the XML-escaped spelling whose
+// entity was decoded during extraction, so it is written back as "&lt;" /
+// "&gt;" here - confirm upstream.
+static const unordered_map<string, Compare::Type> opToTypeMap =
+{
+    // basic operators plus some permissiveness for other representations
+
+    { "==",    Compare::EQUAL },
+    { "=",     Compare::EQUAL },
+    { "eq",    Compare::EQUAL },
+    { "!=",    Compare::NOT_EQUAL },
+    { "ne",    Compare::NOT_EQUAL },
+    { "<",     Compare::LESS_THAN },
+    { "lt",    Compare::LESS_THAN },
+    { "&lt;",  Compare::LESS_THAN },
+    { "<=",    Compare::LESS_THAN_EQUAL },
+    { "lte",   Compare::LESS_THAN_EQUAL },
+    { "&lt;=", Compare::LESS_THAN_EQUAL },
+    { ">",     Compare::GREATER_THAN },
+    { "gt",    Compare::GREATER_THAN },
+    { "&gt;",  Compare::GREATER_THAN },
+    { ">=",    Compare::GREATER_THAN_EQUAL },
+    { "gte",   Compare::GREATER_THAN_EQUAL },
+    { "&gt;=", Compare::GREATER_THAN_EQUAL },
+    { "&",     Compare::CONTAINS },
+    { "~",     Compare::NOT_CONTAINS }
+};
+
+// Comparison type -> its printable forms.
+static const unordered_map<Compare::Type, TypeAlias, CompareTypeHash> typeAliases =
+{
+    { Compare::EQUAL,              TypeAlias{ "Compare::EQUAL",              "==", "eq"  } },
+    { Compare::NOT_EQUAL,          TypeAlias{ "Compare::NOT_EQUAL",          "!=", "ne"  } },
+    { Compare::LESS_THAN,          TypeAlias{ "Compare::LESS_THAN",          "<",  "lt"  } },
+    { Compare::LESS_THAN_EQUAL,    TypeAlias{ "Compare::LESS_THAN_EQUAL",    "<=", "lte" } },
+    { Compare::GREATER_THAN,       TypeAlias{ "Compare::GREATER_THAN",       ">",  "gt"  } },
+    { Compare::GREATER_THAN_EQUAL, TypeAlias{ "Compare::GREATER_THAN_EQUAL", ">=", "gte" } },
+    { Compare::CONTAINS,           TypeAlias{ "Compare::CONTAINS",           "&",  "and" } },
+    { Compare::NOT_CONTAINS,       TypeAlias{ "Compare::NOT_CONTAINS",       "~",  "not" } }
+};
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
+
+/// Looks up the comparison type for an operator string ("==", "lt", ...).
+///
+/// \throws std::runtime_error if \p opString is not a known operator
+Compare::Type Compare::TypeFromOperator(const string& opString)
+{
+    const auto found = internal::opToTypeMap.find(opString);
+    if (found == internal::opToTypeMap.cend())
+        throw std::runtime_error(opString + " is not a valid comparison operator." );
+    return found->second;
+}
+
+/// Returns the enum name of \p type, e.g. "Compare::EQUAL".
+///
+/// \throws std::runtime_error if \p type has no entry in the alias table
+string Compare::TypeToName(const Compare::Type& type)
+{
+    const auto found = internal::typeAliases.find(type);
+    if (found == internal::typeAliases.cend())
+        throw std::runtime_error("invalid comparison type encountered" );
+    return found->second.name_;
+}
+
+/// Returns the operator text for \p type: the alphabetic form ("eq") when
+/// \p asAlpha is true, otherwise the symbolic form ("==").
+///
+/// \throws std::runtime_error if \p type has no entry in the alias table
+string Compare::TypeToOperator(const Compare::Type& type, bool asAlpha)
+{
+    const auto found = internal::typeAliases.find(type);
+    if (found == internal::typeAliases.cend())
+        throw std::runtime_error("invalid comparison type encountered" );
+    return asAlpha ? found->second.opAlpha_
+                   : found->second.op_;
+}
diff --git a/src/Config.cpp b/src/Config.cpp
new file mode 100644
index 0000000..095aa37
--- /dev/null
+++ b/src/Config.cpp
@@ -0,0 +1,60 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED.
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file Config.cpp
+/// \brief Initializes global variable defaults.
+//
+// Author: Derek Barnett
+
+#include "pbbam/Config.h"
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+namespace PacBio {
+namespace BAM {
+
+// Verbosity level that client code may request for htslib.
+//
+// Starts out as -1, meaning "not set by the client". Code that passes the
+// value on should map -1 to the default (OFF), like this:
+//
+//   hts_verbose = ( PacBio::BAM::HtslibVerbosity == -1 ? 0 : PacBio::BAM::HtslibVerbosity);
+//
+int HtslibVerbosity = -1;
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/src/DataSet.cpp b/src/DataSet.cpp
new file mode 100644
index 0000000..af6141c
--- /dev/null
+++ b/src/DataSet.cpp
@@ -0,0 +1,298 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file DataSet.cpp +/// \brief Implements the DataSet class. 
+//
+// Author: Derek Barnett
+
+#include "pbbam/DataSet.h"
+#include "pbbam/DataSetTypes.h"
+#include "pbbam/internal/DataSetBaseTypes.h"
+#include "DataSetIO.h"
+#include "FileUtils.h"
+#include "TimeUtils.h"
+// NOTE(review): the system header names and all template arguments in this
+// file were lost in this copy of the patch; they are reconstructed from how
+// the names are used below - confirm upstream.
+#include <boost/algorithm/string.hpp>
+#include <cassert>
+#include <unordered_map>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace PacBio::BAM::internal;
+using namespace std;
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+static const string defaultVersion{ "4.0.0" };
+
+// Supplies the 'CreatedAt' and 'Version' attributes, but only where the
+// dataset does not already carry them (e.g. from the XML it was read from).
+// Calling it again on the same dataset therefore changes nothing.
+static inline void InitDefaults(DataSet& ds)
+{
+    if (ds.CreatedAt().empty())
+        ds.CreatedAt(internal::ToIso8601(CurrentTime()));
+
+    if (ds.Version().empty())
+        ds.Version(internal::defaultVersion);
+}
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
+
+/// Creates an empty, generic dataset.
+DataSet::DataSet(void)
+    : DataSet(DataSet::GENERIC)
+{
+    // the delegated-to constructor has already applied InitDefaults()
+}
+
+/// Creates an empty dataset of the requested type, rooted at the current
+/// working directory.
+///
+/// \throws std::runtime_error if \p type is not one of the listed types
+DataSet::DataSet(const DataSet::TypeEnum type)
+    : d_(nullptr)
+    , path_(FileUtils::CurrentWorkingDirectory())
+{
+    switch(type) {
+        case DataSet::GENERIC             : d_.reset(new DataSetBase); break;
+        case DataSet::ALIGNMENT           : d_.reset(new AlignmentSet); break;
+        case DataSet::BARCODE             : d_.reset(new BarcodeSet); break;
+        case DataSet::CONSENSUS_ALIGNMENT : d_.reset(new ConsensusAlignmentSet); break;
+        case DataSet::CONSENSUS_READ      : d_.reset(new ConsensusReadSet); break;
+        case DataSet::CONTIG              : d_.reset(new ContigSet); break;
+        case DataSet::HDF_SUBREAD         : d_.reset(new HdfSubreadSet); break;
+        case DataSet::REFERENCE           : d_.reset(new ReferenceSet); break;
+        case DataSet::SUBREAD             : d_.reset(new SubreadSet); break;
+        default:
+            throw std::runtime_error("unsupported dataset type"); // unknown type
+    }
+
+    InitDefaults(*this);
+}
+
+/// Creates a dataset from a single BAM file.
+DataSet::DataSet(const BamFile& bamFile)
+    : d_(DataSetIO::FromUri(bamFile.Filename()))
+    , path_(FileUtils::CurrentWorkingDirectory())
+{
+    InitDefaults(*this);
+}
+
+/// Creates a dataset from a file; DataSetIO::FromUri() decides how to read it.
+DataSet::DataSet(const string& filename)
+    : d_(DataSetIO::FromUri(filename))
+    , path_(FileUtils::DirectoryName(filename))
+{
+    // for FOFN contents and raw BAM filenames, we can just use the current
+    // directory as the starting path.
+    //
+    // (any relative paths in the FOFN have already been resolved)
+    //
+    if (boost::algorithm::iends_with(filename, ".fofn") ||
+        boost::algorithm::iends_with(filename, ".bam"))
+    {
+        path_ = FileUtils::CurrentWorkingDirectory();
+    }
+    InitDefaults(*this);
+}
+
+/// Creates a dataset from several files; their contents are merged.
+DataSet::DataSet(const vector<string>& filenames)
+    : d_(DataSetIO::FromUris(filenames))
+    , path_(FileUtils::CurrentWorkingDirectory())
+{
+    InitDefaults(*this);
+}
+
+/// Copies \p other. Copying a moved-from DataSet (null d_) yields another
+/// empty DataSet instead of dereferencing a null pointer.
+///
+// NOTE(review): the copy is made as a DataSetElement and then cast to
+// DataSetBase*, exactly as in the original; whether that downcast is sound
+// depends on the class layout - confirm.
+DataSet::DataSet(const DataSet& other)
+    : d_(nullptr)
+    , path_(other.path_)
+{
+    if (other.d_) {
+        DataSetElement* copyDataset = new DataSetElement(*other.d_);
+        d_.reset(static_cast<DataSetBase*>(copyDataset));
+    }
+}
+
+DataSet::DataSet(DataSet&& other)
+    : d_(std::move(other.d_))
+    , path_(std::move(other.path_))
+{
+    assert(other.d_.get() == nullptr);
+}
+
+/// Copy-assigns from \p other; self-assignment is a no-op and a moved-from
+/// source (null d_) leaves this object empty as well.
+DataSet& DataSet::operator=(const DataSet& other)
+{
+    if (this != &other) {
+        if (other.d_) {
+            DataSetElement* copyDataset = new DataSetElement(*other.d_);
+            d_.reset(static_cast<DataSetBase*>(copyDataset));
+        } else {
+            d_.reset();
+        }
+        path_ = other.path_;
+    }
+    return *this;
+}
+
+DataSet& DataSet::operator=(DataSet&& other)
+{
+    d_ = std::move(other.d_);
+    path_ = std::move(other.path_);
+    return *this;
+}
+
+DataSet::~DataSet(void) { }
+
+/// Merges the contents of \p other into this dataset.
+DataSet& DataSet::operator+=(const DataSet& other)
+{
+    *d_ += *other.d_;
+    return *this;
+}
+
+/// Returns a BamFile for every external resource whose metatype contains
+/// "bam" (case-insensitive), with its path resolved via ResolvePath().
+vector<BamFile> DataSet::BamFiles(void) const
+{
+    const PacBio::BAM::ExternalResources& resources = ExternalResources();
+
+    vector<BamFile> result;
+    result.reserve(resources.Size());
+    for (const ExternalResource& ext : resources) {
+
+        // only bother resolving file path if this is a BAM file
+        if (boost::algorithm::icontains(ext.MetaType(), "bam"))
+            result.push_back(BamFile(ResolvePath(ext.ResourceId())));
+    }
+    return result;
+}
+
+/// Creates a dataset from XML text held in memory.
+DataSet DataSet::FromXml(const string& xml)
+{
+    DataSet result;
+    result.d_ = internal::DataSetIO::FromXmlString(xml);
+    InitDefaults(result);
+    return result;
+}
+
+const NamespaceRegistry& DataSet::Namespaces(void) const
+{ return d_->Namespaces(); }
+
+NamespaceRegistry& DataSet::Namespaces(void)
+{ return d_->Namespaces(); }
+
+/// Maps a dataset element name ("SubreadSet", ...) to its type enum.
+///
+/// \throws std::out_of_range if \p typeName is not a known name
+DataSet::TypeEnum DataSet::NameToType(const string& typeName)
+{
+    // built once, by the (thread-safe) initialization of a function-local
+    // static, instead of being filled in on first use
+    static const std::unordered_map<std::string, DataSet::TypeEnum> lookup =
+    {
+        { "DataSet",               DataSet::GENERIC },
+        { "AlignmentSet",          DataSet::ALIGNMENT },
+        { "BarcodeSet",            DataSet::BARCODE },
+        { "ConsensusAlignmentSet", DataSet::CONSENSUS_ALIGNMENT },
+        { "ConsensusReadSet",      DataSet::CONSENSUS_READ },
+        { "ContigSet",             DataSet::CONTIG },
+        { "HdfSubreadSet",         DataSet::HDF_SUBREAD },
+        { "ReferenceSet",          DataSet::REFERENCE },
+        { "SubreadSet",            DataSet::SUBREAD }
+    };
+    return lookup.at(typeName); // throws if unknown typename
+}
+
+/// Returns the resolved path of every external resource, in order.
+vector<string> DataSet::ResolvedResourceIds(void) const
+{
+    const PacBio::BAM::ExternalResources& resources = ExternalResources();
+
+    vector<string> result;
+    result.reserve(resources.Size());
+    for (const ExternalResource& ext : resources)
+        result.push_back(ResolvePath(ext.ResourceId()));
+    return result;
+}
+
+/// Resolves \p originalPath against this dataset's own directory.
+string DataSet::ResolvePath(const string& originalPath) const
+{ return internal::FileUtils::ResolvedFilePath(originalPath, path_); }
+
+void DataSet::Save(const std::string& outputFilename)
+{ DataSetIO::ToFile(d_, outputFilename); }
+
+void DataSet::SaveToStream(ostream& out)
+{ DataSetIO::ToStream(d_, out); }
+
+/// Collects the sequencing chemistry of every read group in every BAM file
+/// of this dataset.
+///
+/// \throws std::runtime_error if any of the BAM files is not a PacBio BAM
+set<string> DataSet::SequencingChemistries(void) const
+{
+    const vector<BamFile> bamFiles{ BamFiles() };
+
+    set<string> result;
+    for (const BamFile& bf : bamFiles) {
+        if (!bf.IsPacBioBAM())
+            throw std::runtime_error{ "only PacBio BAMs are supported" };
+        const vector<ReadGroupInfo> readGroups{ bf.Header().ReadGroups() };
+        for (const ReadGroupInfo& rg : readGroups)
+            result.insert(rg.SequencingChemistry());
+    }
+    return result;
+}
+
+/// Maps a dataset type enum to its element name ("SubreadSet", ...).
+///
+/// \throws std::runtime_error if \p type is not one of the listed types
+string DataSet::TypeToName(const DataSet::TypeEnum& type)
+{
+    switch(type) {
+        case DataSet::GENERIC             : return "DataSet";
+        case DataSet::ALIGNMENT           : return "AlignmentSet";
+        case DataSet::BARCODE             : return "BarcodeSet";
+        case DataSet::CONSENSUS_ALIGNMENT : return "ConsensusAlignmentSet";
+        case DataSet::CONSENSUS_READ      : return "ConsensusReadSet";
+        case DataSet::CONTIG              : return "ContigSet";
+        case DataSet::HDF_SUBREAD         : return "HdfSubreadSet";
+        case DataSet::REFERENCE           : return "ReferenceSet";
+        case DataSet::SUBREAD             : return "SubreadSet";
+        default:
+            throw std::runtime_error("unsupported dataset type"); // unknown type
+    }
+}
+
+// Exposed timestamp utils
+
+namespace PacBio {
+namespace BAM {
+
+string CurrentTimestamp(void)
+{ return internal::ToDataSetFormat(internal::CurrentTime()); }
+
+string ToDataSetFormat(const chrono::system_clock::time_point &tp)
+{ return internal::ToDataSetFormat(tp); }
+
+string ToDataSetFormat(const time_t &t)
+{ return ToDataSetFormat(chrono::system_clock::from_time_t(t)); }
+
+string ToIso8601(const chrono::system_clock::time_point &tp)
+{ return internal::ToIso8601(tp); }
+
+string ToIso8601(const time_t &t)
+{ return ToIso8601(chrono::system_clock::from_time_t(t)); }
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/src/DataSetBaseTypes.cpp b/src/DataSetBaseTypes.cpp
new file mode 100644
index 0000000..2c19e0b
--- /dev/null
+++ b/src/DataSetBaseTypes.cpp
@@ -0,0 +1,126 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. 
+
+// Author: Derek Barnett
+
+#include "pbbam/DataSetTypes.h"
+#include "pbbam/internal/DataSetBaseTypes.h"
+#include "DataSetUtils.h"
+#include "TimeUtils.h"
+// NOTE(review): the name of the one system header included here was lost in
+// this copy of the patch; <cctype> is what the code below needs (tolower) -
+// confirm upstream.
+#include <cctype>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace PacBio::BAM::internal;
+using namespace std;
+
+// ----------------
+// BaseEntityType
+// ----------------
+
+// Sets 'Version' to internal::XML_VERSION unless a version is already present.
+BaseEntityType::BaseEntityType(const std::string& label, const XsdType& xsd)
+    : DataSetElement(label, xsd)
+{
+    if (Version().empty())
+        Version(internal::XML_VERSION);
+}
+
+DEFINE_ACCESSORS(BaseEntityType, Extensions, Extensions)
+
+BaseEntityType& BaseEntityType::Extensions(const PacBio::BAM::Extensions& extensions)
+{ Extensions() = extensions; return *this; }
+
+// ----------------
+// DataEntityType
+// ----------------
+
+DataEntityType::DataEntityType(const std::string& label, const XsdType& xsd)
+    : BaseEntityType(label, xsd)
+{ }
+
+// -----------------
+// IndexedDataType
+// -----------------
+
+IndexedDataType::IndexedDataType(const string& metatype,
+                                 const string& filename,
+                                 const string& label,
+                                 const XsdType &xsd)
+    : InputOutputDataType(metatype, filename, label, xsd)
+{ }
+
+DEFINE_ACCESSORS(IndexedDataType, FileIndices, FileIndices)
+
+IndexedDataType& IndexedDataType::FileIndices(const PacBio::BAM::FileIndices& indices)
+{ FileIndices() = indices; return *this; }
+
+// ---------------------
+// InputOutputDataType
+// ---------------------
+
+// Records \p filename as the element's ResourceId.
+InputOutputDataType::InputOutputDataType(const string& metatype,
+                                         const string& filename,
+                                         const string& label,
+                                         const XsdType &xsd)
+    : StrictEntityType(metatype, label, xsd)
+{
+    ResourceId(filename);
+}
+
+// ----------------
+// StrictEntityType
+// ----------------
+
+// Sets the three attributes of a strict entity: MetaType, TimeStampedName
+// and UniqueId.
+StrictEntityType::StrictEntityType(const string& metatype,
+                                   const string& label,
+                                   const XsdType& xsd)
+    : BaseEntityType(label, xsd)
+{
+    // MetaType
+    MetaType(metatype);
+
+    // TimeStampedName: the metatype lower-cased, with every '.' replaced by
+    // '_', followed by "-" and the current time in dataset format
+    string transformedMetatype{ metatype };
+    for (char& c : transformedMetatype) {
+        // tolower() is undefined for negative values other than EOF, so a
+        // byte >= 0x80 must be passed as unsigned char
+        c = (c == '.') ? '_'
+                       : static_cast<char>(tolower(static_cast<unsigned char>(c)));
+    }
+    const string tsn = transformedMetatype + "-" + internal::ToDataSetFormat(internal::CurrentTime());
+    TimeStampedName(tsn);
+
+    // UniqueId
+    UniqueId(internal::GenerateUuid());
+}
diff --git a/src/DataSetElement.cpp b/src/DataSetElement.cpp
new file mode 100644
index 0000000..6854fd2
--- /dev/null
+++ b/src/DataSetElement.cpp
@@ -0,0 +1,47 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED.
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#include "pbbam/internal/DataSetElement.h"
+#include "DataSetUtils.h"
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace PacBio::BAM::internal;
+
+// Returns whatever internal::NullObject hands back for std::string.
+//
+// NOTE(review): the explicit template argument was lost in this copy of the
+// patch ("NullObject()"); <std::string> is reconstructed from this function's
+// return type - confirm against DataSetUtils.h.
+const std::string& DataSetElement::SharedNullString(void)
+{
+    return internal::NullObject<std::string>();
+}
diff --git a/src/DataSetIO.cpp b/src/DataSetIO.cpp
new file mode 100644
index 0000000..741ffc8
--- /dev/null
+++ b/src/DataSetIO.cpp
@@ -0,0 +1,162 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE.
THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#include "DataSetIO.h"
+#include "FileUtils.h"
+#include "FofnReader.h"
+#include "StringUtils.h"
+#include "XmlReader.h"
+#include "XmlWriter.h"
+// NOTE(review): the system header names and all template arguments in this
+// file were lost in this copy of the patch; they are reconstructed from how
+// the names are used below - confirm upstream.
+#include <boost/algorithm/string.hpp>
+#include <cassert>
+#include <fstream>
+#include <memory>
+#include <sstream>
+#include <stdexcept>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace PacBio::BAM::internal;
+using namespace std;
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// Reads a dataset from an XML file.
+// Throws std::runtime_error if the file cannot be opened.
+static
+unique_ptr<DataSetBase> FromXml(const string& xmlFn)
+{
+    ifstream in(xmlFn);
+    if (!in)
+        throw std::runtime_error("could not open XML file for reading");
+    return XmlReader::FromStream(in);
+}
+
+// Wraps a single BAM file in a new SubreadSet.
+static
+unique_ptr<DataSetBase> FromBam(const string& bamFn)
+{
+    unique_ptr<DataSetBase> dataset(new SubreadSet);
+    ExternalResources& resources = dataset->ExternalResources();
+    resources.Add(ExternalResource(BamFile(bamFn)));
+    return dataset;
+}
+
+// Reads a file-of-filenames and builds one dataset from the files it lists.
+// Throws std::runtime_error if the FOFN cannot be opened.
+static
+unique_ptr<DataSetBase> FromFofn(const string& fofn)
+{
+    const string fofnDir = internal::FileUtils::DirectoryName(fofn);
+    ifstream in(fofn);
+    if (!in)
+        throw std::runtime_error("could not open FOFN for reading");
+
+    // resolve every listed file against the FOFN's own directory
+    vector<string> filenames = FofnReader::Files(in);
+    for (string& fn : filenames)
+        fn = internal::FileUtils::ResolvedFilePath(fn, fofnDir);
+    return DataSetIO::FromUris(filenames);
+}
+
+// Dispatches on the (case-insensitive) filename extension.
+// Throws std::runtime_error for anything other than .xml, .bam or .fofn.
+static
+unique_ptr<DataSetBase> FromUri(const string& uri)
+{
+    // NOTE: this says URI, but we're not quite handling filenames as true URIs
+    //       basically just treating as a regular filename for now
+
+    // handle on extension
+    if (boost::algorithm::iends_with(uri, ".xml"))
+        return FromXml(uri);
+    else if (boost::algorithm::iends_with(uri, ".bam"))
+        return FromBam(uri);
+    else if (boost::algorithm::iends_with(uri, ".fofn"))
+        return FromFofn(uri);
+
+    // unknown filename extension
+    throw std::runtime_error("unsupported input file extension");
+}
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
+
+/// Creates a dataset from a single file.
+std::unique_ptr<DataSetBase> DataSetIO::FromUri(const std::string& uri)
+{
+    return FromUris(vector<string>(1, uri));
+}
+
+/// Creates one dataset from one or more files; with several inputs, each
+/// further dataset is merged into the first using operator+=.
+///
+/// \throws std::runtime_error if \p uris is empty, or if any input cannot be
+///         read
+std::unique_ptr<DataSetBase> DataSetIO::FromUris(const std::vector<std::string>& uris)
+{
+    if (uris.empty())
+        throw std::runtime_error("empty input URI list"); // or just return empty, generic DataSet?
+
+    // the first input is the merge target; the rest are merged as they are
+    // read, so only one extra dataset is alive at any time
+    unique_ptr<DataSetBase> result = internal::FromUri(uris.front());
+    for (size_t i = 1; i < uris.size(); ++i) {
+        const unique_ptr<DataSetBase> next = internal::FromUri(uris[i]);
+        *result += *next;
+    }
+    return result;
+}
+
+/// Creates a dataset from XML text held in memory.
+///
+/// \throws std::runtime_error if \p xml is empty
+std::unique_ptr<DataSetBase> DataSetIO::FromXmlString(const string& xml)
+{
+    if (xml.empty())
+        throw std::runtime_error("empty XML string");
+    stringstream s(xml);
+    return XmlReader::FromStream(s);
+}
+
+/// Writes \p dataset as XML to the file \p fn.
+///
+/// \throws std::runtime_error if the file cannot be opened for writing
+void DataSetIO::ToFile(const std::unique_ptr<DataSetBase>& dataset,
+                       const string& fn)
+{
+    ofstream out(fn);
+    if (!out)
+        throw std::runtime_error("could not open XML for writing");
+    XmlWriter::ToStream(dataset, out);
+}
+
+/// Writes \p dataset as XML to \p out.
+void DataSetIO::ToStream(const std::unique_ptr<DataSetBase>& dataset, ostream &out)
+{ XmlWriter::ToStream(dataset, out); }
diff --git a/src/DataSetIO.h b/src/DataSetIO.h
new file mode 100644
index 0000000..b03c23b
--- /dev/null
+++ b/src/DataSetIO.h
@@ -0,0 +1,73 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. 
+ +// Author: Derek Barnett + +#ifndef DATASETIO_H +#define DATASETIO_H + +#include "pbbam/DataSet.h" +#include +#include +#include +#include + +namespace PacBio { +namespace BAM { +namespace internal { + +class DataSetIO +{ +public: + + // input + static std::unique_ptr FromUri(const std::string& uri); + static std::unique_ptr FromUris(const std::vector& uris); + + static std::unique_ptr FromXmlString(const std::string& xml); + +// static DataSetBase FromUri(const std::string& uri); +// static DataSetBase FromUris(const std::vector& uris); + +// // output + static void ToFile(const std::unique_ptr& dataset, const std::string& fn); + static void ToStream(const std::unique_ptr& dataset, std::ostream& out); +}; + +} // namespace internal +} // namespace BAM +} // namespace PacBio + +#endif // DATASETIO_H diff --git a/src/DataSetTypes.cpp b/src/DataSetTypes.cpp new file mode 100644 index 0000000..9dd7b27 --- /dev/null +++ b/src/DataSetTypes.cpp @@ -0,0 +1,480 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. 
THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file DataSetTypes.cpp +/// \brief Implementations for the public DataSet component classes. +// +// Author: Derek Barnett + +#include "pbbam/DataSetTypes.h" +#include "pbbam/internal/DataSetBaseTypes.h" +#include "DataSetUtils.h" +#include "FileUtils.h" +#include "TimeUtils.h" +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace PacBio::BAM::internal; +using namespace std; + +// ------------------- +// AlignmentSet +// ------------------- + +AlignmentSet::AlignmentSet(void) + : DataSetBase("PacBio.DataSet.AlignmentSet", + "AlignmentSet", + XsdType::DATASETS) +{ } + +// ------------------- +// BarcodeSet +// ------------------- + +BarcodeSet::BarcodeSet(void) + : DataSetBase("PacBio.DataSet.BarcodeSet", + "BarcodeSet", + XsdType::DATASETS) +{ } + +// ----------------------- +// ConsensusAlignmentSet +// ----------------------- + +ConsensusAlignmentSet::ConsensusAlignmentSet(void) + : DataSetBase("PacBio.DataSet.ConsensusAlignmentSet", + "ConsensusAlignmentSet", + XsdType::DATASETS) +{ } + +// ------------------- +// ConsensusReadSet +// ------------------- + +ConsensusReadSet::ConsensusReadSet(void) + : 
DataSetBase("PacBio.DataSet.ConsensusReadSet", + "ConsensusReadSet", + XsdType::DATASETS) +{ } + +// ------------------- +// ContigSet +// ------------------- + +ContigSet::ContigSet(void) + : DataSetBase("PacBio.DataSet.ContigSet", + "ContigSet", + XsdType::DATASETS) +{ } + +// ------------------- +// DataSetBase +// ------------------- + +DataSetBase::DataSetBase(void) + : StrictEntityType("PacBio.DataSet.DataSet", + "DataSet", + XsdType::DATASETS) +{ } + +DataSetBase::DataSetBase(const string& metatype, + const string& label, + const XsdType& xsd) + : StrictEntityType(metatype, label, xsd) +{ } + +DEFINE_ACCESSORS(DataSetBase, ExternalResources, ExternalResources) + +DataSetBase& DataSetBase::ExternalResources(const PacBio::BAM::ExternalResources& resources) +{ ExternalResources() = resources; return *this; } + +DEFINE_ACCESSORS(DataSetBase, Filters, Filters) + +DataSetBase& DataSetBase::Filters(const PacBio::BAM::Filters& filters) +{ Filters() = filters; return *this; } + +DEFINE_ACCESSORS(DataSetBase, DataSetMetadata, Metadata) + +DataSetBase& DataSetBase::Metadata(const PacBio::BAM::DataSetMetadata& metadata) +{ Metadata() = metadata; return *this; } + +const PacBio::BAM::SubDataSets& DataSetBase::SubDataSets(void) const +{ + try { + return Child("DataSets"); + } catch (std::exception&) { + return internal::NullObject(); + } +} + +PacBio::BAM::SubDataSets& DataSetBase::SubDataSets(void) +{ + if (!HasChild("DataSets")) + AddChild(internal::NullObject()); + return Child("DataSets"); +} + +DataSetBase& DataSetBase::SubDataSets(const PacBio::BAM::SubDataSets &subdatasets) +{ SubDataSets() = subdatasets; return *this; } + +DataSetBase* DataSetBase::DeepCopy(void) const +{ + DataSetElement* copyDataset = new DataSetElement(*this); + DataSetBase* result = static_cast(copyDataset); + result->registry_ = registry_; + return result; +} + +DataSetBase& DataSetBase::operator+=(const DataSetBase& other) +{ + // must be same dataset types (or 'other' must be generic) + if 
(other.LocalNameLabel() != LocalNameLabel() && other.LocalNameLabel() != "DataSet") + throw std::runtime_error("cannot merge incompatible dataset types"); + + // check filter match + // check object metadata + Metadata() += other.Metadata(); + ExternalResources() += other.ExternalResources(); + Filters() += other.Filters(); + SubDataSets() += other; + + return *this; +} + +std::shared_ptr DataSetBase::Create(const string& typeName) +{ + if (typeName == string("DataSet")) return make_shared(); + if (typeName == string("SubreadSet")) return make_shared(); + if (typeName == string("AlignmentSet")) return make_shared(); + if (typeName == string("BarcodeSet")) return make_shared(); + if (typeName == string("ConsensusAlignmentSet")) return make_shared(); + if (typeName == string("ConsensusReadSet")) return make_shared(); + if (typeName == string("ContigSet")) return make_shared(); + if (typeName == string("HdfSubreadSet")) return make_shared(); + if (typeName == string("ReferenceSet")) return make_shared(); + + // unknown typename + throw std::runtime_error("unsupported dataset type"); +} + +// ------------------- +// DataSetMetadata +// ------------------- + +DataSetMetadata::DataSetMetadata(const std::string& numRecords, + const std::string& totalLength) + : DataSetElement("DataSetMetadata", XsdType::DATASETS) +{ + NumRecords(numRecords); + TotalLength(totalLength); +} + +DEFINE_ACCESSORS(DataSetMetadata, Provenance, Provenance) + +DataSetMetadata& DataSetMetadata::Provenance(const PacBio::BAM::Provenance& provenance) +{ Provenance() = provenance; return *this; } + +DataSetMetadata& DataSetMetadata::operator+=(const DataSetMetadata& other) +{ + NumRecords() = NumRecords() + other.NumRecords(); + TotalLength() = TotalLength() + other.TotalLength(); + // merge add'l + return *this; +} + +// ------------------- +// ExtensionElement +// ------------------- + +ExtensionElement::ExtensionElement(void) + : DataSetElement("ExtensionElement", XsdType::BASE_DATA_MODEL) +{ } + 
+// ------------------- +// Extensions +// ------------------- + +Extensions::Extensions(void) + : DataSetListElement("Extensions", XsdType::BASE_DATA_MODEL) +{ } + +// ------------------- +// ExternalResource +// ------------------- + +ExternalResource::ExternalResource(const BamFile& bamFile) + : IndexedDataType("PacBio.SubreadFile.SubreadBamFile", + bamFile.Filename(), + "ExternalResource", + XsdType::BASE_DATA_MODEL) +{ } + +ExternalResource::ExternalResource(const string& metatype, + const string& filename) + : IndexedDataType(metatype, + filename, + "ExternalResource", + XsdType::BASE_DATA_MODEL) +{ } + +DEFINE_ACCESSORS(ExternalResource, ExternalResources, ExternalResources) + +ExternalResource& ExternalResource::ExternalResources(const PacBio::BAM::ExternalResources& resources) +{ ExternalResources() = resources; return *this; } + +BamFile ExternalResource::ToBamFile(void) const +{ return BamFile(ResourceId()); } + +// ------------------- +// ExternalResources +// ------------------- + +ExternalResources::ExternalResources(void) + : DataSetListElement("ExternalResources", + XsdType::BASE_DATA_MODEL) +{ } + +ExternalResources& ExternalResources::operator+=(const ExternalResources& other) +{ + // only keep unique resource ids + + set myResourceIds; + for (size_t i = 0; i < Size(); ++i) { + const ExternalResource& resource = this->operator[](i); + myResourceIds.insert(resource.ResourceId()); + } + + vector newResourceIndices; + const size_t numOtherResourceIds = other.Size(); + for (size_t i = 0; i < numOtherResourceIds; ++i) { + const string& resourceId = other[i].ResourceId(); + auto found = myResourceIds.find(resourceId); + if (found == myResourceIds.cend()) + newResourceIndices.push_back(i); + } + + for (size_t index : newResourceIndices) + Add(other[index]); + + return *this; +} + +void ExternalResources::Add(const ExternalResource& ext) +{ + // disallow external resources w/ duplicate ResourceIds + set myResourceIds; + for (size_t i = 0; i < Size(); ++i) 
{ + const ExternalResource& resource = this->operator[](i); + myResourceIds.insert(resource.ResourceId()); + } + if (myResourceIds.find(ext.ResourceId()) == myResourceIds.cend()) + AddChild(ext); +} + +vector ExternalResources::BamFiles(void) const +{ + vector result; + const int numResources = Size(); + result.reserve(numResources); + for( const ExternalResource& ext : *this ) + result.push_back(ext.ToBamFile()); + return result; +} + +void ExternalResources::Remove(const ExternalResource& ext) +{ RemoveChild(ext); } + +// ------------------- +// FileIndex +// ------------------- + +FileIndex::FileIndex(const string& metatype, const string& filename) + : InputOutputDataType(metatype, + filename, + "FileIndex", + XsdType::BASE_DATA_MODEL) +{ } + +// ------------------- +// FileIndices +// ------------------- + +FileIndices::FileIndices(void) + : DataSetListElement("FileIndices", XsdType::BASE_DATA_MODEL) +{ } + +void FileIndices::Add(const FileIndex& index) +{ AddChild(index); } + +void FileIndices::Remove(const FileIndex& index) +{ RemoveChild(index); } + +// ------------------- +// Filter +// ------------------- + +Filter::Filter(void) + : DataSetElement("Filter", XsdType::DATASETS) +{ } + +DEFINE_ACCESSORS(Filter, Properties, Properties) + +Filter& Filter::Properties(const PacBio::BAM::Properties& properties) +{ Properties() = properties; return *this; } + +// ------------------- +// Filters +// ------------------- + +Filters::Filters(void) + : DataSetListElement("Filters", XsdType::DATASETS) +{ } + +Filters& Filters::operator+=(const Filters& other) +{ + for (auto& newFilter : other) + AddChild(newFilter); + return *this; +} + +void Filters::Add(const Filter& filter) +{ AddChild(filter); } + +void Filters::Remove(const Filter& filter) +{ RemoveChild(filter); } + +// ------------------- +// HdfSubreadSet +// ------------------- + +HdfSubreadSet::HdfSubreadSet(void) + : DataSetBase("PacBio.DataSet.HdfSubreadSet", + "HdfSubreadSet", + XsdType::DATASETS) +{ } + +// 
------------------- +// ParentTool +// ------------------- + +ParentTool::ParentTool(void) + : BaseEntityType("ParentTool", XsdType::DATASETS) +{ } + +// ------------------- +// Properties +// ------------------- + +Properties::Properties(void) + : DataSetListElement("Properties", XsdType::BASE_DATA_MODEL) +{ } + +void Properties::Add(const Property &property) +{ AddChild(property); } + +void Properties::Remove(const Property& property) +{ RemoveChild(property); } + +// ------------------- +// Property +// ------------------- + +Property::Property(const std::string& name, + const std::string& value, + const std::string& op) + : DataSetElement("Property", XsdType::BASE_DATA_MODEL) +{ + Name(name); + Value(value); + Operator(op); +} + +// ------------------- +// Provenance +// ------------------- + +Provenance::Provenance(void) + : DataSetElement("Provenance", XsdType::DATASETS) +{ } + +DEFINE_ACCESSORS(Provenance, ParentTool, ParentTool) + +// ------------------- +// ReferenceSet +// ------------------- + +ReferenceSet::ReferenceSet(void) + : DataSetBase("PacBio.DataSet.ReferenceSet", + "ReferenceSet", + XsdType::DATASETS) +{ } + +// ------------------- +// SubDataSets +// ------------------- + +SubDataSets::SubDataSets(void) + : internal::DataSetListElement("DataSets", XsdType::DATASETS) +{ } + +SubDataSets& SubDataSets::operator+=(const DataSetBase& other) +{ + AddChild(other); + return *this; +} + +SubDataSets& SubDataSets::operator+=(const SubDataSets& other) +{ + for (auto& newSubDataset : other) + AddChild(newSubDataset); + return *this; +} + +void SubDataSets::Add(const DataSetBase& subdataset) +{ AddChild(subdataset); } + +void SubDataSets::Remove(const DataSetBase& subdataset) +{ RemoveChild(subdataset); } + +// ------------------- +// SubreadSet +// ------------------- + +SubreadSet::SubreadSet(void) + : DataSetBase("PacBio.DataSet.SubreadSet", + "SubreadSet", + XsdType::DATASETS) +{ } diff --git a/src/DataSetUtils.h b/src/DataSetUtils.h new file mode 
100644 index 0000000..dcf234c --- /dev/null +++ b/src/DataSetUtils.h @@ -0,0 +1,107 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. 
+ +// Author: Derek Barnett + +#ifndef DATASETUTILS_H +#define DATASETUTILS_H + +#include "pbbam/DataSetTypes.h" +#include +#include + +namespace PacBio { +namespace BAM { +namespace internal { + +static const std::string XML_VERSION = std::string { "3.0.1" }; + +template +inline const T& NullObject(void) +{ + static const T empty; + return empty; +} + +template<> +inline const PacBio::BAM::DataSetMetadata& NullObject(void) +{ + static const PacBio::BAM::DataSetMetadata empty("", ""); + return empty; +} + +inline +std::string GenerateUuid(void) +{ + static boost::uuids::random_generator gen; + const boost::uuids::uuid uuid = gen(); + return boost::uuids::to_string(uuid); +} + +} // namespace internal +} // namespace BAM +} // namespace PacBio + +#ifndef FETCH_CHILD_CONST_REF +#define FETCH_CHILD_CONST_REF(Class, Type, Method) \ + \ + const PacBio::BAM::Type& Class::Method(void) const \ + { \ + try { \ + return Child(#Type); \ + } catch (std::exception&) { \ + return internal::NullObject(); \ + } \ + } +#endif + +#ifndef FETCH_CHILD_REF +#define FETCH_CHILD_REF(Class, Type, Method) \ + \ + PacBio::BAM::Type& Class::Method(void) \ + { \ + if (!HasChild(#Type)) \ + AddChild(internal::NullObject()); \ + return Child(#Type); \ + } +#endif + +#ifndef DEFINE_ACCESSORS +#define DEFINE_ACCESSORS(Class, Type, Method) \ + FETCH_CHILD_CONST_REF(Class, Type, Method) \ + FETCH_CHILD_REF(Class, Type, Method) +#endif + +#endif // DATASETUTILS_H diff --git a/src/DataSetXsd.cpp b/src/DataSetXsd.cpp new file mode 100644 index 0000000..88d2d91 --- /dev/null +++ b/src/DataSetXsd.cpp @@ -0,0 +1,263 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file DataSetXsd.cpp +/// \brief Implements the XSD- and namespace-related classes for DataSetXML. 
+// +// Author: Derek Barnett + +#include "pbbam/DataSetXsd.h" +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace std; + +namespace PacBio { +namespace BAM { +namespace internal { + +static map DefaultRegistry(void) +{ + const auto result = map + { + { XsdType::NONE, NamespaceInfo{ "", "" } }, + { XsdType::AUTOMATION_CONSTRAINTS, NamespaceInfo{ "", "http://pacificbiosciences.com/PacBioAutomationConstraints.xsd" } }, + { XsdType::BASE_DATA_MODEL, NamespaceInfo{ "pbbase", "http://pacificbiosciences.com/PacBioBaseDataModel.xsd" } }, + { XsdType::COLLECTION_METADATA, NamespaceInfo{ "pbmeta", "http://pacificbiosciences.com/PacBioCollectionMetadata.xsd" } }, + { XsdType::COMMON_MESSAGES, NamespaceInfo{ "", "http://pacificbiosciences.com/PacBioCommonMessages.xsd" } }, + { XsdType::DATA_MODEL, NamespaceInfo{ "pbdm", "http://pacificbiosciences.com/PacBioDataModel.xsd" } }, + { XsdType::DATA_STORE, NamespaceInfo{ "", "http://pacificbiosciences.com/PacBioDataStore.xsd" } }, + { XsdType::DATASETS, NamespaceInfo{ "pbds", "http://pacificbiosciences.com/PacBioDatasets.xsd" } }, + { XsdType::DECL_DATA, NamespaceInfo{ "", "http://pacificbiosciences.com/PacBioDeclData.xsd" } }, + { XsdType::PART_NUMBERS, NamespaceInfo{ "pbpn", "http://pacificbiosciences.com/PacBioPartNumbers.xsd" } }, + { XsdType::PRIMARY_METRICS, NamespaceInfo{ "", "http://pacificbiosciences.com/PacBioPrimaryMetrics.xsd" } }, + { XsdType::REAGENT_KIT, NamespaceInfo{ "pbrk", "http://pacificbiosciences.com/PacBioReagentKit.xsd" } }, + { XsdType::RIGHTS_AND_ROLES, NamespaceInfo{ "", "http://pacificbiosciences.com/PacBioRightsAndRoles.xsd" } }, + { XsdType::SAMPLE_INFO, NamespaceInfo{ "pbsample", "http://pacificbiosciences.com/PacBioSampleInfo.xsd" } }, + { XsdType::SEEDING_DATA, NamespaceInfo{ "", "http://pacificbiosciences.com/PacBioSeedingData.xsd" } } + }; + return result; +} + +static const auto elementRegistry = unordered_map +{ + // 'pbbase' elements + // + { "AutomationParameter" 
, XsdType::BASE_DATA_MODEL }, + { "AutomationParameters" , XsdType::BASE_DATA_MODEL }, + { "BinCount" , XsdType::BASE_DATA_MODEL }, + { "BinCounts" , XsdType::BASE_DATA_MODEL }, + { "BinLabel" , XsdType::BASE_DATA_MODEL }, + { "BinLabels" , XsdType::BASE_DATA_MODEL }, + { "BinWidth" , XsdType::BASE_DATA_MODEL }, + { "ExternalResource" , XsdType::BASE_DATA_MODEL }, + { "ExternalResources" , XsdType::BASE_DATA_MODEL }, + { "FileIndex" , XsdType::BASE_DATA_MODEL }, + { "FileIndices" , XsdType::BASE_DATA_MODEL }, + { "MaxBinValue" , XsdType::BASE_DATA_MODEL }, + { "MaxOutlierValue" , XsdType::BASE_DATA_MODEL }, + { "MetricDescription" , XsdType::BASE_DATA_MODEL }, + { "NumBins" , XsdType::BASE_DATA_MODEL }, + { "Properties" , XsdType::BASE_DATA_MODEL }, + { "Property" , XsdType::BASE_DATA_MODEL }, + { "Sample95thPct" , XsdType::BASE_DATA_MODEL }, + { "SampleMean" , XsdType::BASE_DATA_MODEL }, + { "SampleMed" , XsdType::BASE_DATA_MODEL }, + { "SampleSize" , XsdType::BASE_DATA_MODEL }, + { "SampleStd" , XsdType::BASE_DATA_MODEL }, + + // 'pbds' elements + // + { "AdapterDimerFraction", XsdType::DATASETS }, + { "AlignmentSet", XsdType::DATASETS }, + { "BarcodeConstruction", XsdType::DATASETS }, + { "BarcodeSet", XsdType::DATASETS }, + { "ConsensusAlignmentSet", XsdType::DATASETS }, + { "ConsensusReadSet", XsdType::DATASETS }, + { "Contig", XsdType::DATASETS }, + { "Contigs", XsdType::DATASETS }, + { "ContigSet", XsdType::DATASETS }, + { "ControlReadLenDist", XsdType::DATASETS }, + { "ControlReadQualDist", XsdType::DATASETS }, + { "DataSetMetdata", XsdType::DATASETS }, + { "DataSet", XsdType::DATASETS }, + { "DataSets", XsdType::DATASETS }, + { "Filter", XsdType::DATASETS }, + { "Filters", XsdType::DATASETS }, + { "HdfSubreadSet", XsdType::DATASETS }, + { "InsertReadLenDist", XsdType::DATASETS }, + { "InsertReadQualDist" , XsdType::DATASETS }, + { "MedianInsertDist", XsdType::DATASETS }, + { "NumRecords", XsdType::DATASETS }, + { "NumSequencingZmws", XsdType::DATASETS }, + 
{ "Organism", XsdType::DATASETS }, + { "ParentTool", XsdType::DATASETS }, + { "Ploidy", XsdType::DATASETS }, + { "ProdDist", XsdType::DATASETS }, + { "Provenance", XsdType::DATASETS }, + { "ReadLenDist", XsdType::DATASETS }, + { "ReadQualDist", XsdType::DATASETS }, + { "ReadTypeDist", XsdType::DATASETS }, + { "ReferenceSet", XsdType::DATASETS }, + { "ShortInsertFraction", XsdType::DATASETS }, + { "SubreadSet", XsdType::DATASETS }, + { "SummaryStats", XsdType::DATASETS }, + { "TotalLength", XsdType::DATASETS }, + + // 'pbmeta' elements + // + { "Automation", XsdType::COLLECTION_METADATA }, + { "AutomationName", XsdType::COLLECTION_METADATA }, + { "CellIndex", XsdType::COLLECTION_METADATA }, + { "CellPac", XsdType::COLLECTION_METADATA }, + { "CollectionFileCopy", XsdType::COLLECTION_METADATA }, + { "CollectionMetadata", XsdType::COLLECTION_METADATA }, + { "CollectionNumber", XsdType::COLLECTION_METADATA }, + { "CollectionPathUri", XsdType::COLLECTION_METADATA }, + { "Collections", XsdType::COLLECTION_METADATA }, + { "Concentration", XsdType::COLLECTION_METADATA }, + { "ConfigFileName", XsdType::COLLECTION_METADATA }, + { "CopyFiles", XsdType::COLLECTION_METADATA }, + { "InstCtrlVer", XsdType::COLLECTION_METADATA }, + { "MetricsVerbosity", XsdType::COLLECTION_METADATA }, + { "Name", XsdType::COLLECTION_METADATA }, + { "OutputOptions", XsdType::COLLECTION_METADATA }, + { "PlateId", XsdType::COLLECTION_METADATA }, + { "Primary", XsdType::COLLECTION_METADATA }, + { "Readout", XsdType::COLLECTION_METADATA }, + { "ResultsFolder", XsdType::COLLECTION_METADATA }, + { "RunDetails", XsdType::COLLECTION_METADATA }, + { "RunId", XsdType::COLLECTION_METADATA }, + { "SampleReuseEnabled", XsdType::COLLECTION_METADATA }, + { "SequencingCondition", XsdType::COLLECTION_METADATA }, + { "SigProcVer", XsdType::COLLECTION_METADATA }, + { "SizeSelectionEnabled", XsdType::COLLECTION_METADATA }, + { "StageHotstartEnabled", XsdType::COLLECTION_METADATA }, + { "UseCount", 
XsdType::COLLECTION_METADATA }, + { "WellName", XsdType::COLLECTION_METADATA }, + { "WellSample", XsdType::COLLECTION_METADATA }, + + // 'pbsample' elements + // + { "BioSample", XsdType::SAMPLE_INFO }, + { "BioSamplePointer", XsdType::SAMPLE_INFO }, + { "BioSamplePointers", XsdType::SAMPLE_INFO }, + { "BioSamples", XsdType::SAMPLE_INFO } +}; + +} // namespace internal +} // namespace BAM +} // namespace PacBio + +// --------------- +// NamespaceInfo +// --------------- + +NamespaceInfo::NamespaceInfo(void) { } + +NamespaceInfo::NamespaceInfo(const string& name, + const string& uri) + : name_(name) + , uri_(uri) +{ } + +// ------------------- +// NamespaceRegistry +// ------------------- + +NamespaceRegistry::NamespaceRegistry(void) + : data_(internal::DefaultRegistry()) + , defaultXsdType_(XsdType::DATASETS) +{ } + +NamespaceRegistry::NamespaceRegistry(const NamespaceRegistry &other) + : data_(other.data_) + , defaultXsdType_(other.defaultXsdType_) +{ } + +NamespaceRegistry::NamespaceRegistry(NamespaceRegistry &&other) + : data_(std::move(other.data_)) + , defaultXsdType_(std::move(other.defaultXsdType_)) +{ } + +NamespaceRegistry& NamespaceRegistry::operator=(const NamespaceRegistry& other) +{ + data_ = other.data_; + defaultXsdType_ = other.defaultXsdType_; + return *this; +} + +NamespaceRegistry& NamespaceRegistry::operator=(NamespaceRegistry&& other) +{ + data_ = std::move(other.data_); + defaultXsdType_ = std::move(other.defaultXsdType_); + return *this; +} + +NamespaceRegistry::~NamespaceRegistry(void) { } + +const NamespaceInfo& NamespaceRegistry::DefaultNamespace(void) const +{ return Namespace(DefaultXsd()); } + +XsdType NamespaceRegistry::DefaultXsd(void) const +{ return defaultXsdType_; } + +const NamespaceInfo& NamespaceRegistry::Namespace(const XsdType& xsd) const +{ return data_.at(xsd); } + +void NamespaceRegistry::Register(const XsdType& xsd, const NamespaceInfo& namespaceInfo) +{ data_[xsd] = namespaceInfo; } + +void 
NamespaceRegistry::SetDefaultXsd(const XsdType& xsd) +{ defaultXsdType_ = xsd; } + +XsdType NamespaceRegistry::XsdForElement(const std::string& elementLabel) const +{ + const auto iter = internal::elementRegistry.find(elementLabel); + return (iter == internal::elementRegistry.cend() ? XsdType::NONE : iter->second); +} + +XsdType NamespaceRegistry::XsdForUri(const std::string& uri) const +{ + map::const_iterator iter = data_.cbegin(); + map::const_iterator end = data_.cend(); + for ( ; iter != end; ++iter ) { + const NamespaceInfo& info = iter->second; + if (info.Uri() == uri) + return iter->first; + } + return XsdType::NONE; +} diff --git a/src/EntireFileQuery.cpp b/src/EntireFileQuery.cpp new file mode 100644 index 0000000..6813492 --- /dev/null +++ b/src/EntireFileQuery.cpp @@ -0,0 +1,65 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. 
THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file EntireFileQuery.cpp +/// \brief Implements the EntireFileQuery class. +// +// Author: Derek Barnett + +#include "pbbam/EntireFileQuery.h" +#include "pbbam/CompositeBamReader.h" +using namespace PacBio; +using namespace PacBio::BAM; +using namespace std; + +struct EntireFileQuery::EntireFileQueryPrivate +{ + EntireFileQueryPrivate(const DataSet& dataset) + : reader_(dataset) + { } + + SequentialCompositeBamReader reader_; +}; + +EntireFileQuery::EntireFileQuery(const DataSet &dataset) + : internal::IQuery() + , d_(new EntireFileQueryPrivate(dataset)) +{ } + +EntireFileQuery::~EntireFileQuery(void) { } + +bool EntireFileQuery::GetNext(BamRecord &r) +{ return d_->reader_.GetNext(r); } diff --git a/src/EnumClassHash.h b/src/EnumClassHash.h new file mode 100644 index 0000000..53740f7 --- /dev/null +++ b/src/EnumClassHash.h @@ -0,0 +1,85 @@ +// Copyright (c) 2016, Pacific Biosciences of California, Inc. +// +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file EnumClassHash.h +/// \brief Defines the EnumClassHash class. 
+// +// Author: Derek Barnett + +#ifndef ENUMCLASSHASH_H +#define ENUMCLASSHASH_H + +#include + +namespace PacBio { +namespace BAM { +namespace internal { + +/// +/// \brief The EnumClassHash struct enables the use of enum class types as keys +/// for std::unordered_map. +/// +/// Allows something like: +/// +/// \code{.cpp} +/// std::unordered_map myLookup; +/// \endcode +/// +/// where Key_t is an enum class. Without this sort of extra hand-holding to +/// provide a 'manual' hash value, enum classes as keys will fail to compile. +/// +/// \note This approach might be unnecessary in C++14, if I understand some of +/// the changes correctly. But this works for C++11 and should continue beyond. +/// +/// \sa http://stackoverflow.com/questions/18837857/cant-use-enum-class-as-unordered-map-key +/// +struct EnumClassHash +{ + // *** NOTE *** + // + // Remove this when we integrate pbcopper. + // This is a duplicate of pbcopper/utility/EnumClassHash.h + // + + template size_t operator()(const T t) const + { return static_cast(t); } +}; + +} // namespace internal +} // namespace BAM +} // namespace PacBio + +#endif // ENUMCLASSHASH_H diff --git a/src/FastaReader.cpp b/src/FastaReader.cpp new file mode 100644 index 0000000..f82d635 --- /dev/null +++ b/src/FastaReader.cpp @@ -0,0 +1,155 @@ +// Copyright (c) 2016, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. 
+// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file FastaReader.cpp +/// \brief Implements the FastaReader class. 
+// +// Author: Derek Barnett + +#include "pbbam/FastaReader.h" +#include +#include +#include +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace std; + +namespace PacBio { +namespace BAM { +namespace internal { + +struct FastaReaderPrivate +{ + ifstream stream_; + string name_; + string bases_; + + FastaReaderPrivate(const std::string& fn) + : stream_(fn) + { + if (!stream_) + throw std::runtime_error("FastaReader - could not open " + fn + " for reading"); + FetchNext(); + } + + bool GetNext(FastaSequence& record) + { + if (name_.empty() && bases_.empty()) + return false; + record = FastaSequence { name_, bases_ }; + FetchNext(); + return true; + } + +private: + void FetchNext(void) + { + name_.clear(); + bases_.clear(); + + SkipNewlines(); + ReadName(); + ReadBases(); + } + + inline void SkipNewlines(void) + { + if (!stream_) + return; + if (stream_.peek() == '\n') + stream_.ignore(std::numeric_limits::max(), '\n'); + } + + void ReadName(void) { + if (!stream_) + return; + if (stream_.get() == '>') + std::getline(stream_, name_, '\n'); + } + + void ReadBases(void) + { + if (!stream_) + return; + char c = static_cast(stream_.peek()); + string line; + while (c != '>') { + if (!stream_) + return; + std::getline(stream_, line, '\n'); + bases_ += line; + if (!stream_) + return; + c = static_cast(stream_.peek()); + } + } +}; + +} // namespace internal +} // namespace BAM +} // namespace PacBio + +FastaReader::FastaReader(const std::string& fn) + : d_{ new internal::FastaReaderPrivate{ fn } } +{ } + +FastaReader::FastaReader(FastaReader&& other) + : d_{ std::move(other.d_) } +{ } + +FastaReader& FastaReader::operator=(FastaReader&& other) +{ + d_.swap(other.d_); + return *this; +} + +FastaReader::~FastaReader(void) { } + +bool FastaReader::GetNext(FastaSequence& record) +{ return d_->GetNext(record); } + +vector FastaReader::ReadAll(const string& fn) +{ + vector result; + result.reserve(256); + FastaReader reader{ fn }; + 
FastaSequence s; + while(reader.GetNext(s)) + result.emplace_back(s); + return result; +} diff --git a/src/FileProducer.cpp b/src/FileProducer.cpp new file mode 100644 index 0000000..8cec89e --- /dev/null +++ b/src/FileProducer.cpp @@ -0,0 +1,71 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. + +// Author: Derek Barnett + +#include "FileProducer.h" +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace PacBio::BAM::internal; +using namespace std; + +FileProducer::FileProducer(const string& targetFilename) + : FileProducer(targetFilename, targetFilename + ".tmp") +{ } + +FileProducer::FileProducer(const string& targetFilename, + const string& tempFilename) + : targetFilename_(targetFilename) + , tempFilename_(tempFilename) +{ + // override renaming if writing to stdout + // + // setting temp filename to '-' keeps consistent interfaces + // for derived classes to actually operate on temp filename + if (targetFilename_ == "-") + tempFilename_ = "-"; +} + +FileProducer::~FileProducer(void) +{ + // skip renaming if there is a 'live' exception + // or if writing to stdout + if ((std::current_exception() == nullptr) && (tempFilename_ != "-")) { + std::rename(tempFilename_.c_str(), + targetFilename_.c_str()); + } +} diff --git a/src/FileProducer.h b/src/FileProducer.h new file mode 100644 index 0000000..aee8c85 --- /dev/null +++ b/src/FileProducer.h @@ -0,0 +1,96 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. + +// Author: Derek Barnett + +#ifndef FILEPRODUCER_H +#define FILEPRODUCER_H + +#include +#include + +namespace PacBio { +namespace BAM { +namespace internal { + +// The FileProducer class provides functionality for working with a temp +// file until successful destruction of a FileProducer-derived class. 
+// +// Derived classes should be sure to flush/close the temp file, and the +// FileProducer's destructor will ensure that the temp file will be renamed to +// the target filename. +// +// If destruction is triggered by an exception, no renaming will occur. +// +class FileProducer { + +protected: + FileProducer(void) = delete; + + // Initializes FileProducer with specified target filename. Temp filename is + // set to target filename plus ".tmp" suffix. + FileProducer(const std::string& targetFilename); + + // Initializes FileProducer with specified target filename & explicit temp + // filename. + FileProducer(const std::string& targetFilename, + const std::string& tempFilename); + + // Renames temp file to target filename. + // + // Derived classes should ensure that data is flushed and file handle closed + // before or during their destructor. + // + // Renaming will not occur if there is a 'live' exception being thrown. + // + ~FileProducer(void); + +protected: + const std::string& TargetFilename(void) const + { return targetFilename_; } + + const std::string& TempFilename(void) const + { return tempFilename_; } + +private: + std::string targetFilename_; + std::string tempFilename_; +}; + +} // namespace internal +} // namespace BAM +} // namespace PacBio + +#endif // FILEPRODUCER_H diff --git a/src/FileUtils.cpp b/src/FileUtils.cpp new file mode 100644 index 0000000..a0a59af --- /dev/null +++ b/src/FileUtils.cpp @@ -0,0 +1,246 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer.
+// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. 
+ +// Author: Derek Barnett + +#include "FileUtils.h" +#include "StringUtils.h" +#include +#include +#include +#include +#include +#include +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace PacBio::BAM::internal; +using namespace std; + +namespace PacBio { +namespace BAM { +namespace internal { + +// pops "file://" scheme off the front of a URI/filepath, if found +static string removeFileUriScheme(const string& uri) +{ + assert(!uri.empty()); + + auto schemeLess = uri; + const auto fileScheme = string{"file://"}; + const auto schemeFound = schemeLess.find(fileScheme); + if (schemeFound != string::npos) { + if (schemeFound != 0) + throw runtime_error("Malformed URI: scheme not at beginning"); + schemeLess = schemeLess.substr(fileScheme.size()); + } + return schemeLess; +} + +#ifdef PBBAM_WIN_FILEPATHS + +static +string removeDiskName(const string& filePath) +{ + if (filePath.size() >= 2) { + const char firstChar = filePath.at(0); + if ((isalpha(firstChar) != 0) && (filePath.at(1) == ':')) + return filePath.substr(2); + } + return filePath; +} + +static const char native_pathSeparator = '\\'; + +static bool native_pathIsAbsolute(const string& filePath) +{ + assert(!filePath.empty()); + + // if starts with single slash or double slash + if (boost::algorithm::starts_with(filePath, "\\")) + return true; + + // if starts with single or double-dots -> not absolute + if (boost::algorithm::starts_with(filePath, ".")) + return false; + + // if starts with disk drive name and colon ("C:\foo\bar.txt") + // strip the drive name and check to see if the remaining path is absolute + if (filePath.size() >= 2) { + const char firstChar = filePath.at(0); + if ((isalpha(firstChar) != 0) && (filePath.at(1) == ':')) + return native_pathIsAbsolute(removeDiskName(filePath)); + } + + // otherwise, likely relative + return false; +} + +static string native_resolvedFilePath(const string& filePath, + const string& from) +{ + // strip file:// scheme if 
present + auto schemeLess = removeFileUriScheme(filePath); + + // if empty or already absolute path, just return it + // upfront empty check simplifies further parsing logic + if (schemeLess.empty() || native_pathIsAbsolute(schemeLess)) + return schemeLess; + + // else make relative from the provided 'from' directory + // + // first pop disk name, then any leading single-dot '.' + // + // since we're prepending the 'from' directory, we can remove + // any leading './' from our file path. this may just mean that + // we pop it off to add it right back (when from == '.'), but this + // keeps it consistent with other 'from' parent directories + // + schemeLess = removeDiskName(schemeLess); + + const bool thisDirAtStart = (schemeLess.find(".") == 0); + if (thisDirAtStart) { + if (schemeLess.find(native_pathSeparator) == 1) + schemeLess = schemeLess.substr(2); + } + return from + native_pathSeparator + schemeLess; +} + +#else // else for non-Windows systems + +static const char native_pathSeparator = '/'; + +static bool native_pathIsAbsolute(const string& filePath) +{ return filePath.at(0) == '/'; } + +static string native_resolvedFilePath(const string& filePath, + const string& from) +{ + // strip file:// scheme if present + auto schemeLess = removeFileUriScheme(filePath); + + // if empty or already absolute path, just return it + // upfront empty check simplifies further parsing logic + if (schemeLess.empty() || native_pathIsAbsolute(schemeLess)) + return schemeLess; + + // else make relative from the provided 'from' directory + // + // since we're prepending the 'from' directory, we can remove + // any leading './' from our file path.
this may just mean that + // we pop it off to add it right back (when from == '.'), but this + // keeps it consistent with other 'from' parent directories + // + const bool thisDirAtStart = (schemeLess.find(".") == 0); + if (thisDirAtStart) { + if (schemeLess.find(native_pathSeparator) == 1) + schemeLess = schemeLess.substr(2); + } + return from + native_pathSeparator + schemeLess; +} + +#endif // PBBAM_WIN_FILEPATHS + +// see http://stackoverflow.com/questions/2869594/how-return-a-stdstring-from-cs-getcwd-function +string FileUtils::CurrentWorkingDirectory(void) +{ + const size_t chunkSize = 1024; + const size_t maxNumChunks = 20; + + // stack-based buffer for 'normal' case + char buffer[chunkSize]; + if (getcwd(buffer, sizeof(buffer)) != NULL) + return string(buffer); + + // if error is not ERANGE, then it's not a problem of too-long name... something else happened + if (errno != ERANGE) + throw runtime_error("could not determine current working directory path"); + + // long path - use heap, trying progressively longer buffers + for (size_t chunks = 2; chunks < maxNumChunks; ++chunks) { + unique_ptr cwd(new char[chunkSize*chunks]); + if (getcwd(cwd.get(), chunkSize*chunks) != NULL) + return string(cwd.get()); + + // if error is not ERANGE, then it's not a problem of too-long name... 
something else happened + if (errno != ERANGE) + throw runtime_error("could not determine current working directory path"); + } + + // crazy long path name + throw runtime_error("could determine current working directory - extremely long path"); +} + +string FileUtils::DirectoryName(const string& file) +{ + const size_t found = file.rfind(Separator(), file.length()); + if (found != string::npos) + return file.substr(0, found); + return string("."); +} + +bool FileUtils::Exists(const char* fn) +{ + struct stat buf; + return (stat(fn, &buf) != -1); +} + +chrono::system_clock::time_point FileUtils::LastModified(const char* fn) +{ + struct stat s; + if (stat(fn, &s) != 0) + throw runtime_error("could not get file timestamp"); + return chrono::system_clock::from_time_t(s.st_mtime); +} + +string FileUtils::ResolvedFilePath(const string& filePath, + const string& from) +{ return native_resolvedFilePath(filePath, from); } + +constexpr char FileUtils::Separator(void) +{ return native_pathSeparator; } + +off_t FileUtils::Size(const char* fn) +{ + struct stat s; + if (stat(fn, &s) != 0) + throw runtime_error("could not determine file size"); + return s.st_size; +} + +} // namespace internal +} // namespace BAM +} // namespace PacBio diff --git a/src/FileUtils.h b/src/FileUtils.h new file mode 100644 index 0000000..112223e --- /dev/null +++ b/src/FileUtils.h @@ -0,0 +1,145 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. + +// Author: Derek Barnett + +#ifndef FILEUTILS_H +#define FILEUTILS_H + +#include +#include + +namespace PacBio { +namespace BAM { +namespace internal { + +struct FileUtils +{ +public: + + /// \returns application's current working directory + static std::string CurrentWorkingDirectory(void); + + /// Parses a filepath for the directory name for a file. + /// + /// Essentially this method strips the filename from the string provided (/path/to/file => /path/to). + /// If only a filename is provided, then "." is returned to indicate the current directory.
+ /// + /// \param[in] file name of file (can be just a filename or path/to/filename) + /// \returns file's directory name + /// + static std::string DirectoryName(const std::string& file); + + /// Check for existence of a file. + /// + /// \param[in] fn full path to file + /// \returns true if file exists & can be opened + /// + static bool Exists(const char* fn); + + /// Check for existence of a file. + /// + /// \param[in] fn full path to file + /// \returns true if file exists & can be opened + /// + static bool Exists(const std::string& fn); + + /// Check "last modified" timestamp for a file. + /// + /// \param[in] fn full path to file + /// \returns time of last modification + /// \throws runtime_error if file info can't be accessed + /// + static std::chrono::system_clock::time_point LastModified(const char* fn); + + /// Check "last modified" timestamp for a file. + /// + /// \param[in] fn full path to file + /// \returns time of last modification + /// \throws runtime_error if file info can't be accessed + /// + static std::chrono::system_clock::time_point LastModified(const std::string& fn); + + /// Resolves input file path using optional starting directory. + /// + /// \verbatim + /// /absolute/path/to/file.txt => /absolute/path/to/file.txt + /// ../relative/path/to/file.txt => <from>/../relative/path/to/file.txt + /// file.txt => <from>/file.txt + /// \endverbatim + /// + /// \note This method will strip any URI scheme as well ("file://") so that the result is immediately ready for I/O operations. + /// + /// \param[in] filePath file path to be resolved + /// \param[in] from optional starting directory (useful if not same as application's working directory) + /// \returns resolved file path + /// + static std::string ResolvedFilePath(const std::string& filePath, + const std::string& from = "."); + + /// \returns native path separator + constexpr static char Separator(void); + + /// Check size of file.
+ /// + /// \param[in] fn full path to file + /// \returns file size in bytes + /// \throws runtime_error if file info can't be accessed + /// + static off_t Size(const char* fn); + + /// Check size of file. + /// + /// \param[in] fn full path to file + /// \returns file size in bytes + /// \throws runtime_error if file info can't be accessed + /// + static off_t Size(const std::string& fn); +}; + +inline bool FileUtils::Exists(const std::string& fn) +{ return FileUtils::Exists(fn.c_str()); } + +inline std::chrono::system_clock::time_point FileUtils::LastModified(const std::string& fn) +{ return FileUtils::LastModified(fn.c_str()); } + +inline off_t FileUtils::Size(const std::string& fn) +{ return FileUtils::Size(fn.c_str()); } + +} // namespace internal +} // namespace BAM +} // namespace PacBio + +#endif // FILEUTILS_H diff --git a/src/FofnReader.cpp b/src/FofnReader.cpp new file mode 100644 index 0000000..a0d9280 --- /dev/null +++ b/src/FofnReader.cpp @@ -0,0 +1,52 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. 
THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. + +// Author: Derek Barnett + +#include "FofnReader.h" +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace PacBio::BAM::internal; +using namespace std; + +vector FofnReader::Files(istream& in) +{ + vector files; + string fn; + while (getline(in, fn)) + files.push_back(fn); + return files; +} diff --git a/src/FofnReader.h b/src/FofnReader.h new file mode 100644 index 0000000..ee09fc5 --- /dev/null +++ b/src/FofnReader.h @@ -0,0 +1,60 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. 
+// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. + +// Author: Derek Barnett + +#ifndef FOFNREADER_H +#define FOFNREADER_H + +#include "pbbam/DataSet.h" +#include +#include +#include + +namespace PacBio { +namespace BAM { +namespace internal { + +class FofnReader +{ +public: + static std::vector Files(std::istream& in); +}; + +} // namespace internal +} // namespace BAM +} // namespace PacBio + +#endif // FOFNREADER_H diff --git a/src/Frames.cpp b/src/Frames.cpp new file mode 100644 index 0000000..e54729a --- /dev/null +++ b/src/Frames.cpp @@ -0,0 +1,181 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file Frames.cpp +/// \brief Implements the Frames class. 
+//
+// Author: Derek Barnett
+
+#include "pbbam/Frames.h"
+#include
+#include
+#include
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// Lookup tables shared by the encode/decode helpers below. They are filled
+// once by InitIpdDownsampling():
+//   framepoints   - indexed by code, gives the frame value for that code
+//   frameToCode   - indexed by frame value, gives the code for that frame
+//   maxFramepoint - largest frame value that has an entry in frameToCode
+static vector framepoints;
+static vector frameToCode;
+static uint16_t maxFramepoint;
+
+// Builds the three tables above on first use; later calls return at once
+// because framepoints is no longer empty.
+// NOTE(review): the first-use check is not synchronized - confirm that
+// callers never race on the first Encode/Decode.
+static
+void InitIpdDownsampling(void)
+{
+    if (!framepoints.empty())
+        return;
+
+    // liftover from Dave's python code:
+    // .../bioinformatics/tools/kineticsTools/kineticsTools/_downsampling.py
+
+    const int B = 2;
+    const int t = 6;
+    const double T = pow(B, t);   // 2^6 = 64 framepoints per segment
+
+    int next = 0;
+    double grain;
+    const int end = 256/T;        // 256/64 = 4 segments, 256 framepoints in total
+    for (int i = 0; i < end; ++i) {
+        // spacing between framepoints doubles with each segment: 1, 2, 4, 8
+        grain = pow(B, i);
+        vector nextOnes;
+        for (double j = 0; j < T; ++j)
+            nextOnes.push_back(j*grain + next);
+        // the next segment starts one grain past the last point of this one
+        next = nextOnes.back() + grain;
+        framepoints.insert(framepoints.end(), nextOnes.cbegin(), nextOnes.cend());
+    }
+    // the largest index into framepoints must be representable as a uint8_t code
+    assert(framepoints.size()-1 <= UINT8_MAX);
+
+    const uint16_t maxElement = (*max_element(framepoints.cbegin(), framepoints.cend()));
+    frameToCode.assign(maxElement+1, 0);
+
+    // Walk adjacent framepoint pairs (fl, fu). Frames below the midpoint of
+    // the pair get the lower code i; frames from the midpoint up to fu-1 get
+    // the upper code i+1.
+    const int fpEnd = framepoints.size() - 1;
+    uint8_t i = 0;
+    uint16_t fl = 0;
+    uint16_t fu = 0;
+    for (; i < fpEnd; ++i) {
+        fl = framepoints[i];
+        fu = framepoints[i+1];
+        if (fu > fl+1) {
+            const int middle = (fl+fu)/2;
+            for (int f = fl; f < middle; ++f)
+                frameToCode[f] = i;
+            for (int f = middle; f < fu; ++f)
+                frameToCode[f] = i+1;
+        } else
+            frameToCode[fl] = i;
+    }
+
+    // this next line differs from the python implementation (there, it's "i+1")
+    // our C++ for loop has incremented our index counter one more time than the indexes from python enumerate(...)
+    frameToCode[fu] = i;
+    maxFramepoint = fu;
+}
+
+// Decodes one code by direct table lookup. Does not call
+// InitIpdDownsampling() itself; the vector overload below does so first.
+static inline
+uint16_t CodeToFrames(const uint8_t code)
+{
+    return framepoints[code];
+}
+
+// Decodes every element of codedData and returns the results in the same
+// order. Ensures the tables are built before the first lookup.
+static
+vector CodeToFrames(const vector& codedData)
+{
+    InitIpdDownsampling();
+
+    const size_t length = codedData.size();
+    vector frames(length, 0);
+    for (size_t i = 0; i < length; ++i)
+        frames[i] = CodeToFrames(codedData[i]);
+    return frames;
+}
+
+// Encodes one frame value by table lookup. Values above maxFramepoint are
+// clamped to it, so the index never runs past the end of frameToCode.
+// Does not call InitIpdDownsampling() itself.
+static inline
+uint8_t FramesToCode(const uint16_t frame)
+{
+    return frameToCode[std::min(maxFramepoint, frame)];
+}
+
+// Encodes every element of frames and returns the codes in the same order.
+// Ensures the tables are built before the first lookup.
+static
+vector FramesToCode(const vector& frames)
+{
+    InitIpdDownsampling();
+
+    const size_t length = frames.size();
+    vector result(length, 0);
+    for (size_t i = 0; i < length; ++i)
+        result[i] = FramesToCode(frames[i]);
+    return result;
+}
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
+
+// Default constructor: data_ is default-initialized.
+Frames::Frames(void)
+{ }
+
+// Copies the given vector into data_.
+Frames::Frames(const std::vector& frames)
+    : data_(frames)
+{ }
+
+// Moves the given vector into data_.
+Frames::Frames(std::vector&& frames)
+    : data_(std::move(frames))
+{ }
+
+// Copy constructor: copies other's data_.
+Frames::Frames(const Frames& other)
+    : data_(other.data_)
+{ }
+
+// Move constructor: moves other's data_.
+Frames::Frames(Frames&& other)
+    : data_(std::move(other.data_))
+{ }
+
+Frames::~Frames(void) { }
+
+// Copy assignment: copies other's data_ and returns *this.
+Frames& Frames::operator=(const Frames& other)
+{ data_ = other.data_; return *this; }
+
+// Move assignment: moves other's data_ and returns *this.
+Frames& Frames::operator=(Frames&& other)
+{ data_ = std::move(other.data_); return *this; }
+
+// Builds a Frames object from coded data by running it through
+// internal::CodeToFrames.
+Frames Frames::Decode(const std::vector& codedData)
+{ return Frames(internal::CodeToFrames(codedData)); }
+
+// Returns the coded form of the given frame values, as produced by
+// internal::FramesToCode.
+std::vector Frames::Encode(const std::vector& frames)
+{ return internal::FramesToCode(frames); }
diff --git a/src/GenomicInterval.cpp b/src/GenomicInterval.cpp
new file mode 100644
index 0000000..10ebc23
--- /dev/null
+++ b/src/GenomicInterval.cpp
@@ -0,0 +1,136 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file GenomicInterval.cpp +/// \brief Implements the GenomicInterval class. 
+//
+// Author: Derek Barnett
+
+#include "pbbam/GenomicInterval.h"
+#include "AssertUtils.h"
+#include "StringUtils.h"
+#include
+#include
+#include
+#include
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// Parses a region string of the form "name" or "name:begin-end".
+//
+// Returns the sequence name and writes the interval through `begin`/`end`.
+// When only a name is given, the interval defaults to [0, 1<<29).
+//
+// Throws std::runtime_error("malformed region string") when the input does
+// not split into one or two ':'-separated parts, or when the interval part
+// does not split into exactly two '-'-separated parts. std::stoi may still
+// throw std::invalid_argument / std::out_of_range for non-numeric or
+// oversized coordinates.
+string parseRegionString(const string& reg,
+                         PacBio::BAM::Position* begin,
+                         PacBio::BAM::Position* end)
+{
+    const vector<string> parts = internal::Split(reg, ':');
+    if (parts.empty() || parts.size() > 2)
+        throw std::runtime_error("malformed region string");
+
+    // given name only, default min,max intervals
+    if (parts.size() == 1) {
+        *begin = 0;
+        *end = 1<<29;
+    }
+
+    // parse interval from input
+    else if (parts.size() == 2) {
+        const vector<string> intervalParts = internal::Split(parts.at(1), '-');
+        // Both coordinates are read below, so exactly two parts are needed.
+        // A single part (e.g. "chr1:100") used to get past this check and
+        // fail in at(1) with std::out_of_range.
+        if (intervalParts.size() != 2)
+            throw std::runtime_error("malformed region string");
+        *begin = std::stoi(intervalParts.at(0));
+        *end = std::stoi(intervalParts.at(1));
+    }
+
+    return parts.at(0);
+}
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
+
+// Default constructor: name_ and interval_ are default-initialized.
+GenomicInterval::GenomicInterval(void) { }
+
+// Builds an interval on sequence `name` spanning (start, stop).
+GenomicInterval::GenomicInterval(const std::string& name,
+                                 const Position& start,
+                                 const Position& stop)
+    : name_(name)
+    , interval_(start, stop)
+{ }
+
+// Builds an interval from a region string; see internal::parseRegionString
+// for the accepted forms and the exceptions it throws.
+GenomicInterval::GenomicInterval(const string& samtoolsRegionString)
+{
+    // both are always assigned by parseRegionString unless it throws
+    Position begin;
+    Position end;
+    name_ = internal::parseRegionString(samtoolsRegionString, &begin, &end);
+    interval_ = PacBio::BAM::Interval(begin, end);
+}
+
+// Copy constructor: copies name and interval.
+GenomicInterval::GenomicInterval(const GenomicInterval& other)
+    : name_(other.name_)
+    , interval_(other.interval_)
+{ }
+
+// Copy assignment: copies name and interval, returns *this.
+GenomicInterval& GenomicInterval::operator=(const GenomicInterval& other)
+{
+    name_ = other.name_;
+    interval_ = other.interval_;
+    return *this;
+}
+
+// Returns false when the names differ; otherwise the result of
+// interval_.CoveredBy on the other interval.
+bool GenomicInterval::CoveredBy(const GenomicInterval& other) const
+{
+    if (name_ != other.name_)
+        return false;
+    return interval_.CoveredBy(other.interval_);
+}
+
+// Returns false when the names differ; otherwise the result of
+// interval_.Covers on the other interval.
+bool GenomicInterval::Covers(const GenomicInterval& other) const
+{
+    if (name_ != other.name_)
+        return false;
+    return interval_.Covers(other.interval_);
+}
+
+// Returns false when the names differ; otherwise the result of
+// interval_.Intersects on the other interval.
+bool GenomicInterval::Intersects(const GenomicInterval& other) const
+{
+    if (name_ != other.name_)
+        return false;
+    return interval_.Intersects(other.interval_);
+}
diff --git a/src/GenomicIntervalQuery.cpp b/src/GenomicIntervalQuery.cpp
new file mode 100644
index 0000000..b6ead9f
--- /dev/null
+++ b/src/GenomicIntervalQuery.cpp
@@ -0,0 +1,73 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following
+// disclaimer in the documentation and/or other materials provided
+// with the distribution.
+//
+// * Neither the name of Pacific Biosciences nor the names of its
+// contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED.
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file GenomicIntervalQuery.cpp
+/// \brief Implements the GenomicIntervalQuery class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/GenomicIntervalQuery.h"
+#include "pbbam/CompositeBamReader.h"
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+// Private implementation object: owns the composite reader that every
+// public method of GenomicIntervalQuery forwards to.
+struct GenomicIntervalQuery::GenomicIntervalQueryPrivate
+{
+    // Constructs the reader directly from the interval and dataset.
+    GenomicIntervalQueryPrivate(const GenomicInterval& interval,
+                                const DataSet& dataset)
+        : reader_(interval, dataset)
+    { }
+
+    GenomicIntervalCompositeBamReader reader_;
+};
+
+// Creates the query and its private reader for `interval` over `dataset`.
+GenomicIntervalQuery::GenomicIntervalQuery(const GenomicInterval& interval,
+                                           const DataSet &dataset)
+    : internal::IQuery()
+    , d_(new GenomicIntervalQueryPrivate(interval, dataset))
+{ }
+
+GenomicIntervalQuery::~GenomicIntervalQuery(void) { }
+
+// Forwards to the reader's GetNext and returns its result unchanged.
+bool GenomicIntervalQuery::GetNext(BamRecord &r)
+{ return d_->reader_.GetNext(r); }
+
+// Sets a new interval on the underlying reader; returns *this.
+GenomicIntervalQuery& GenomicIntervalQuery::Interval(const GenomicInterval& interval)
+{ d_->reader_.Interval(interval); return *this; }
+
+// Returns the interval reported by the underlying reader.
+const GenomicInterval& GenomicIntervalQuery::Interval(void) const
+{ return d_->reader_.Interval(); }
diff --git a/src/IRecordWriter.cpp b/src/IRecordWriter.cpp
new file mode 100644
index 0000000..7333182
--- /dev/null
+++ b/src/IRecordWriter.cpp
@@ -0,0 +1,48 @@
+// Copyright (c) 2016, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file IRecordWriter.cpp +/// \brief Implements the IRecordWriter class. 
+//
+// Author: Derek Barnett
+
+#include "pbbam/IRecordWriter.h"
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+// Out-of-line constructor with an empty body.
+IRecordWriter::IRecordWriter(void) { }
+
+// Out-of-line destructor with an empty body.
+IRecordWriter::~IRecordWriter(void) { }
diff --git a/src/IndexedFastaReader.cpp b/src/IndexedFastaReader.cpp
new file mode 100644
index 0000000..715dd03
--- /dev/null
+++ b/src/IndexedFastaReader.cpp
@@ -0,0 +1,236 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following
+// disclaimer in the documentation and/or other materials provided
+// with the distribution.
+//
+// * Neither the name of Pacific Biosciences nor the names of its
+// contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED.
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file IndexedFastaReader.cpp
+/// \brief Implements the IndexedFastaReader class.
+//
+// Author: David Alexander
+
+#include "pbbam/IndexedFastaReader.h"
+
+#include "pbbam/BamRecord.h"
+#include "pbbam/GenomicInterval.h"
+#include "pbbam/Orientation.h"
+#include "SequenceUtils.h"
+#include
+#include
+#include
+
+namespace PacBio {
+namespace BAM {
+
+// Creates a reader and tries to open `filename`.
+//
+// A failed open is not reported here: the reader is simply left without a
+// loaded index, and every query method then throws.
+IndexedFastaReader::IndexedFastaReader(const std::string& filename)
+    // Open(), Close() and the loaded-check all read handle_, so it must be
+    // null before Open() runs (the class header is not visible from this
+    // file; the explicit initializer is harmless if one already exists)
+    : handle_(nullptr)
+{
+    Open(filename);
+}
+
+// Copy constructor: loads a separate index handle for the same file.
+//
+// Throws std::runtime_error when the source's file cannot be opened.
+IndexedFastaReader::IndexedFastaReader(const IndexedFastaReader& src)
+    : handle_(nullptr)
+{
+    if (!Open(src.filename_))
+        throw std::runtime_error("Cannot open file " + src.filename_);
+}
+
+// Copy assignment: re-opens the file `rhs` refers to.
+//
+// Open() releases the previously loaded index on success. If the open
+// fails, this reader keeps its current file and index; unlike the copy
+// constructor, the failure is not reported.
+IndexedFastaReader& IndexedFastaReader::operator=(const IndexedFastaReader& rhs)
+{
+    if (&rhs == this)
+        return *this;
+
+    Open(rhs.filename_);
+    return *this;
+}
+
+IndexedFastaReader::~IndexedFastaReader(void)
+{
+    Close();
+}
+
+// Loads the FASTA index for `filename`.
+//
+// Returns true on success, after releasing any index that was loaded
+// before. Returns false when the index cannot be loaded; the reader's
+// state is unchanged in that case.
+bool IndexedFastaReader::Open(const std::string &filename)
+{
+    faidx_t* handle = fai_load(filename.c_str());
+    if (handle == nullptr)
+        return false;
+
+    // do not leak the index from an earlier Open() / copy assignment
+    if (handle_ != nullptr)
+        fai_destroy(handle_);
+    filename_ = filename;
+    handle_ = handle;
+    return true;
+}
+
+// Releases the loaded index, if any, and clears the stored filename.
+void IndexedFastaReader::Close(void)
+{
+    filename_ = "";
+    if (handle_ != nullptr)
+        fai_destroy(handle_);
+    handle_ = nullptr;
+}
+
+// Guard used at the top of every query method. std::runtime_error derives
+// from std::exception, so handlers written for std::exception still match.
+#define REQUIRE_FAIDX_LOADED if (handle_ == nullptr) throw std::runtime_error("IndexedFastaReader: no FASTA index loaded")
+
+// Returns the bases of sequence `id` for the half-open range [begin, end).
+//
+// Throws std::runtime_error when no index is loaded or the fetch fails.
+std::string IndexedFastaReader::Subsequence(const std::string& id,
+                                            Position begin,
+                                            Position end) const
+{
+    REQUIRE_FAIDX_LOADED;
+
+    int len;
+    // Derek: *Annoyingly* htslib seems to interpret "end" as inclusive in
+    // faidx_fetch_seq, whereas it considers it exclusive in the region spec in
+    // fai_fetch. Can you please verify?
+    char* rawSeq = faidx_fetch_seq(handle_, id.c_str(), begin, end - 1, &len);
+    if (rawSeq == nullptr)
+        throw std::runtime_error("could not fetch FASTA sequence");
+    else {
+        // copy into a std::string, then release the malloc'ed htslib buffer
+        std::string seq(rawSeq);
+        free(rawSeq);
+        return seq;
+    }
+}
+
+// Convenience overload: fetches interval.Name() from Start() to Stop().
+std::string IndexedFastaReader::Subsequence(const GenomicInterval& interval) const
+{
+    REQUIRE_FAIDX_LOADED;
+    return Subsequence(interval.Name(), interval.Start(), interval.Stop());
+}
+
+// Fetches the sequence described by an htslib region string via fai_fetch.
+//
+// Throws std::runtime_error when no index is loaded or the fetch fails.
+std::string IndexedFastaReader::Subsequence(const char *htslibRegion) const
+{
+    REQUIRE_FAIDX_LOADED;
+
+    int len;
+    char* rawSeq = fai_fetch(handle_, htslibRegion, &len);
+    if (rawSeq == nullptr)
+        throw std::runtime_error("could not fetch FASTA sequence");
+    else {
+        std::string seq(rawSeq);
+        free(rawSeq);
+        return seq;
+    }
+}
+
+
+// Returns the reference bases spanned by `bamRecord`.
+//
+// With `gapped` set on a mapped record, the record's CIGAR is walked and
+// '-' is inserted for insertions (and for soft clips unless
+// `exciseSoftClips` is set) and '*' for padding. The result is
+// reverse-complemented when the record is on the reverse strand and
+// `orientation` is not Orientation::GENOMIC.
+std::string
+IndexedFastaReader::ReferenceSubsequence(const BamRecord& bamRecord,
+                                         const Orientation orientation,
+                                         const bool gapped,
+                                         const bool exciseSoftClips) const
+{
+    REQUIRE_FAIDX_LOADED;
+
+    std::string subseq = Subsequence(bamRecord.ReferenceName(),
+                                     bamRecord.ReferenceStart(),
+                                     bamRecord.ReferenceEnd());
+    const auto reverse = orientation != Orientation::GENOMIC &&
+                         bamRecord.Impl().IsReverseStrand();
+
+    if (bamRecord.Impl().IsMapped() && gapped)
+    {
+        // position in subseq where the next CIGAR operation applies
+        size_t seqIndex = 0;
+        const Cigar& cigar = bamRecord.Impl().CigarData();
+        Cigar::const_iterator cigarIter = cigar.cbegin();
+        Cigar::const_iterator cigarEnd = cigar.cend();
+        for (; cigarIter != cigarEnd; ++cigarIter)
+        {
+            const CigarOperation& op = (*cigarIter);
+            const CigarOperationType& type = op.Type();
+
+            // do nothing for hard clips
+            if (type != CigarOperationType::HARD_CLIP)
+            {
+                const size_t opLength = op.Length();
+
+                // maybe remove soft clips
+                if (type == CigarOperationType::SOFT_CLIP)
+                {
+                    if (!exciseSoftClips)
+                    {
+                        subseq.reserve(subseq.size() + opLength);
+                        subseq.insert(seqIndex, opLength, '-');
+                        seqIndex += opLength;
+                    }
+                }
+
+                // for non-clipping operations
+                else {
+
+                    // maybe add gaps/padding
+                    if (type == CigarOperationType::INSERTION)
+                    {
+                        subseq.reserve(subseq.size() + opLength);
+                        subseq.insert(seqIndex, opLength, '-');
+                    }
+                    else if (type == CigarOperationType::PADDING)
+                    {
+                        subseq.reserve(subseq.size() + opLength);
+                        subseq.insert(seqIndex, opLength, '*');
+                    }
+
+                    // update index
+                    seqIndex += opLength;
+                }
+            }
+        }
+    }
+
+    if (reverse)
+        internal::ReverseComplementCaseSens(subseq);
+
+    return subseq;
+}
+
+
+// Returns the number of sequences in the loaded index.
+int IndexedFastaReader::NumSequences(void) const
+{
+    REQUIRE_FAIDX_LOADED;
+    return faidx_nseq(handle_);
+}
+
+// Returns true when the loaded index contains a sequence called `name`.
+bool IndexedFastaReader::HasSequence(const std::string& name) const
+{
+    REQUIRE_FAIDX_LOADED;
+    return (faidx_has_seq(handle_, name.c_str()) != 0);
+}
+
+// Returns the length of sequence `name`.
+//
+// Throws std::runtime_error when htslib reports a negative length.
+int IndexedFastaReader::SequenceLength(const std::string& name) const
+{
+    REQUIRE_FAIDX_LOADED;
+    int len = faidx_seq_len(handle_, name.c_str());
+    if (len < 0)
+        throw std::runtime_error("could not determine FASTA sequence length");
+    else return len;
+}
+
+}} // PacBio::BAM
diff --git a/src/MD5.cpp b/src/MD5.cpp
new file mode 100644
index 0000000..b2262d5
--- /dev/null
+++ b/src/MD5.cpp
@@ -0,0 +1,70 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following
+// disclaimer in the documentation and/or other materials provided
+// with the distribution.
+// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. 
+//
+// File Description
+/// \file MD5.cpp
+/// \brief Implements basic MD5 hash utilities
+//
+// Author: Brett Bowman
+
+#include "pbbam/MD5.h"
+#include
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief MD5 hash of a string as a 32-digit hexadecimal string
+///
+/// \param[in] str  input; all str.size() bytes are hashed
+/// \returns the 16-byte digest rendered as 32 lowercase hex characters
+///
+std::string MD5Hash(const std::string& str)
+{
+    MD5_CTX md5;
+    unsigned char digest[16];
+    // 32 hex characters plus the terminator sprintf writes after the last pair
+    char hexdigest[33];
+
+    MD5_Init(&md5);
+    MD5_Update(&md5, reinterpret_cast(const_cast(str.c_str())), str.size());
+    MD5_Final(digest, &md5);
+
+    // each digest byte becomes two hex characters at offset 2*i
+    for (int i = 0; i < 16; ++i)
+        sprintf(&hexdigest[2*i], "%02x", digest[i]);
+
+    // explicit length: exactly the 32 hex characters, terminator excluded
+    return std::string{hexdigest, 32};
+}
+
+} // namespace BAM
+} // namespace PacBio
+
+
+
diff --git a/src/MemoryUtils.cpp b/src/MemoryUtils.cpp
new file mode 100644
index 0000000..e3ec6a2
--- /dev/null
+++ b/src/MemoryUtils.cpp
@@ -0,0 +1,82 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following
+// disclaimer in the documentation and/or other materials provided
+// with the distribution.
+//
+// * Neither the name of Pacific Biosciences nor the names of its
+// contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE.
THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#include "MemoryUtils.h"
+#include
+#include
+#include
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace PacBio::BAM::internal;
+using namespace std;
+
+// -----------------
+// BamHeaderMemory
+// -----------------
+
+// Builds a BamHeader from a raw htslib header.
+//
+// Throws std::runtime_error when `hdr` is null. A header with no text
+// yields a default-constructed BamHeader.
+BamHeader BamHeaderMemory::FromRawData(bam_hdr_t* hdr)
+{
+    // null input - error
+    if (hdr == nullptr)
+        throw std::runtime_error("invalid BAM header");
+
+    // empty text input - ok
+    if (hdr->text == nullptr || hdr->l_text == 0)
+        return BamHeader();
+
+    // parse normal SAM text input
+    return BamHeader(string(hdr->text, hdr->l_text));
+}
+
+// Builds a raw htslib header from `header`'s SAM text. The result is owned
+// by the returned shared pointer and released through HtslibHeaderDeleter.
+//
+// Throws std::runtime_error when htslib cannot parse the text or when the
+// copy of the text cannot be allocated.
+PBBAM_SHARED_PTR<bam_hdr_t> BamHeaderMemory::MakeRawHeader(const BamHeader& header)
+{
+    const string text = header.ToSam();
+    PBBAM_SHARED_PTR<bam_hdr_t> rawData(sam_hdr_parse(text.size(), text.c_str()),
+                                        internal::HtslibHeaderDeleter());
+    // sam_hdr_parse reports failure with a null pointer; the fields below
+    // must not be written through it
+    if (!rawData)
+        throw std::runtime_error("could not parse BAM header text");
+
+    // zero-filled buffer one byte longer than the text, so the copy stays
+    // NUL-terminated
+    char* textCopy = static_cast<char*>(calloc(text.size() + 1, 1));
+    if (textCopy == nullptr)
+        throw std::runtime_error("could not allocate BAM header text");
+    memcpy(textCopy, text.c_str(), text.size());
+
+    rawData->ignore_sam_err = 0;
+    rawData->cigar_tab = nullptr;
+    rawData->l_text = text.size();
+    rawData->text = textCopy;
+    return rawData;
+}
+
+//PBBAM_SHARED_PTR BamHeaderMemory::MakeRawHeader(const BamHeader& header)
+//{
+// if (!header)
+// return
PBBAM_SHARED_PTR(nullptr); +// return MakeRawHeader(*header.get()); +//} diff --git a/src/MemoryUtils.h b/src/MemoryUtils.h new file mode 100644 index 0000000..c22f9f5 --- /dev/null +++ b/src/MemoryUtils.h @@ -0,0 +1,168 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifndef MEMORYUTILS_H
+#define MEMORYUTILS_H
+
+#include "pbbam/Config.h"
+#include "pbbam/BamHeader.h"
+#include "pbbam/BamRecord.h"
+#include "pbbam/BamRecordImpl.h"
+#include
+#include
+#include
+
+namespace PacBio {
+namespace BAM {
+
+class BamHeader;
+
+namespace internal {
+
+// intended for use with PBBAM_SHARED_PTR, std::unique_ptr, etc
+//
+// Each deleter below releases a non-null htslib object with the matching
+// htslib call and ignores a null pointer. The trailing "= nullptr" only
+// resets the deleter's own by-value parameter; the caller's pointer is
+// not touched.
+
+// Closes a BGZF stream with bgzf_close.
+struct HtslibBgzfDeleter
+{
+    void operator()(BGZF* bgzf)
+    {
+        if (bgzf)
+            bgzf_close(bgzf);
+        bgzf = nullptr;
+    }
+};
+
+// Closes a SAM/BAM file with sam_close.
+struct HtslibFileDeleter
+{
+    void operator()(samFile* file)
+    {
+        if (file)
+            sam_close(file);
+        file = nullptr;
+    }
+};
+
+// Destroys a header with bam_hdr_destroy.
+struct HtslibHeaderDeleter
+{
+    void operator()(bam_hdr_t* hdr)
+    {
+        if (hdr)
+            bam_hdr_destroy(hdr);
+        hdr = nullptr;
+    }
+};
+
+// Destroys an index with hts_idx_destroy.
+struct HtslibIndexDeleter
+{
+    void operator()(hts_idx_t* index)
+    {
+        if (index)
+            hts_idx_destroy(index);
+        index = nullptr;
+    }
+};
+
+// Destroys an iterator with hts_itr_destroy.
+struct HtslibIteratorDeleter
+{
+    void operator()(hts_itr_t* iter)
+    {
+        if (iter)
+            hts_itr_destroy(iter);
+        iter = nullptr;
+    }
+};
+
+// Destroys a record with bam_destroy1.
+struct HtslibRecordDeleter
+{
+    void operator()(bam1_t* b)
+    {
+        if (b)
+            bam_destroy1(b);
+        b = nullptr;
+    }
+};
+
+// Static conversions between BamHeader and the raw htslib header.
+class BamHeaderMemory
+{
+public:
+    static BamHeader FromRawData(bam_hdr_t* header);
+    static PBBAM_SHARED_PTR MakeRawHeader(const BamHeader& header);
+// static PBBAM_SHARED_PTR MakeRawHeader(const BamHeader& header);
+};
+
+// Static accessors that reach into BamRecord / BamRecordImpl members
+// (impl_, d_, UpdateTagMap) on behalf of internal code.
+class BamRecordMemory
+{
+public:
+    static const BamRecordImpl& GetImpl(const BamRecord& r);
+    static const BamRecordImpl& GetImpl(const BamRecord* r);
+    static PBBAM_SHARED_PTR GetRawData(const BamRecord& r);
+    static PBBAM_SHARED_PTR GetRawData(const BamRecord* r);
+    static PBBAM_SHARED_PTR GetRawData(const BamRecordImpl& impl);
+    static PBBAM_SHARED_PTR GetRawData(const BamRecordImpl* impl);
+
+    static void UpdateRecordTags(const BamRecord& r);
+    static void UpdateRecordTags(const BamRecordImpl& r);
+};
+
+// Returns the record's impl_ member.
+inline const BamRecordImpl& BamRecordMemory::GetImpl(const BamRecord& r)
+{ return r.impl_; }
+
+// Pointer overload; `r` is dereferenced without a null check.
+inline const BamRecordImpl& BamRecordMemory::GetImpl(const BamRecord* r)
+{ return r->impl_; }
+
+// Returns the raw data held by the record's impl_.
+inline PBBAM_SHARED_PTR BamRecordMemory::GetRawData(const BamRecord& r)
+{ return GetRawData(r.impl_); }
+
+// Pointer overload; `r` is dereferenced without a null check.
+inline PBBAM_SHARED_PTR BamRecordMemory::GetRawData(const BamRecord* r)
+{ return GetRawData(r->impl_); }
+
+// Returns the impl's d_ member.
+inline PBBAM_SHARED_PTR BamRecordMemory::GetRawData(const BamRecordImpl& impl)
+{ return impl.d_; }
+
+// Pointer overload; `impl` is dereferenced without a null check.
+inline PBBAM_SHARED_PTR BamRecordMemory::GetRawData(const BamRecordImpl* impl)
+{ return impl->d_; }
+
+// Forwards to the BamRecordImpl overload using the record's impl_.
+inline void BamRecordMemory::UpdateRecordTags(const BamRecord& r)
+{ UpdateRecordTags(r.impl_); }
+
+// Calls UpdateTagMap() on the given impl.
+inline void BamRecordMemory::UpdateRecordTags(const BamRecordImpl& r)
+{ r.UpdateTagMap(); }
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
+
+#endif // MEMORYUTILS_H
diff --git a/src/PbiBuilder.cpp b/src/PbiBuilder.cpp
new file mode 100644
index 0000000..5cb6d6c
--- /dev/null
+++ b/src/PbiBuilder.cpp
@@ -0,0 +1,405 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file PbiBuilder.cpp +/// \brief Implements the PbiBuilder class. 
+// +// Author: Derek Barnett + +#include "pbbam/PbiBuilder.h" +#include "pbbam/BamRecord.h" +#include "pbbam/PbiRawData.h" +#include "FileProducer.h" +#include "MemoryUtils.h" +#include "PbiIndexIO.h" +#include +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace std; + +namespace PacBio { +namespace BAM { +namespace internal { + +// ------------------------------------------- +// PbiRawReferenceDataBuilder implementation +// ------------------------------------------- + +// helper for reference data +class PbiRawReferenceDataBuilder +{ +public: + PbiRawReferenceDataBuilder(const size_t numReferenceSequences); + +public: + bool AddRecord(const BamRecord& record, + const PbiReferenceEntry::Row rowNumber); + PbiRawReferenceData Result(void) const; + +private: + int32_t lastRefId_; + Position lastPos_; + map rawReferenceEntries_; +}; + +PbiRawReferenceDataBuilder::PbiRawReferenceDataBuilder(const size_t numReferenceSequences) + : lastRefId_(-1) + , lastPos_(-1) +{ + // initialize with number of references we expect to see + // + // we can add more later, but want to ensure known references have an entry + // even if no records are observed mapping to it + // + for (size_t i = 0; i < numReferenceSequences; ++i) + rawReferenceEntries_[i] = PbiReferenceEntry(i); + + // also create an "unmapped" entry + rawReferenceEntries_[PbiReferenceEntry::UNMAPPED_ID] = PbiReferenceEntry(); +} + +bool PbiRawReferenceDataBuilder::AddRecord(const BamRecord& record, + const PbiReferenceEntry::Row rowNumber) +{ + // fetch ref ID & pos for record + const int32_t tId = record.ReferenceId(); + const int32_t pos = record.ReferenceStart(); + + // sanity checks to protect against non-coordinate-sorted BAMs + if (lastRefId_ != tId || (lastRefId_ >= 0 && tId < 0)) { + if (tId >= 0) { + + // if we've already seen unmapped reads, but our current tId is valid + // + // error: unmapped reads should all be at the end (can stop checking refs) + // + 
PbiReferenceEntry& unmappedEntry = + rawReferenceEntries_[PbiReferenceEntry::UNMAPPED_ID]; + if (unmappedEntry.beginRow_ != PbiReferenceEntry::UNSET_ROW) + return false; + + // if we've already seen data for this new tId + // (remember we're coming from another tId) + // + // error: refs are out of order (can stop checking refs) + // + PbiReferenceEntry& currentEntry = + rawReferenceEntries_[(uint32_t)tId]; + if (currentEntry.beginRow_ != PbiReferenceEntry::UNSET_ROW) + return false; + } + lastRefId_ = tId; + } + else if (tId >= 0 && lastPos_ > pos) + return false; //error: positions out of order + + // update row numbers + PbiReferenceEntry& entry = rawReferenceEntries_[(uint32_t)tId]; + if (entry.beginRow_ == PbiReferenceEntry::UNSET_ROW) + entry.beginRow_ = rowNumber; + entry.endRow_ = rowNumber+1; + + // update pos (for sorting check next go-round) + lastPos_ = pos; + return true; +} + +PbiRawReferenceData PbiRawReferenceDataBuilder::Result(void) const { + // PbiReferenceEntries will be sorted thanks to std::map + // tId will be at end since we're sorting on the uint cast of -1 + PbiRawReferenceData result; + result.entries_.reserve(rawReferenceEntries_.size()); + auto refIter = rawReferenceEntries_.cbegin(); + const auto refEnd = rawReferenceEntries_.cend(); + for ( ; refIter != refEnd; ++refIter ) + result.entries_.push_back(refIter->second); + return result; +} + +// ---------------------------------- +// PbiBuilderPrivate implementation +// ---------------------------------- + +class PbiBuilderPrivate : public internal::FileProducer +{ +public: + PbiBuilderPrivate(const string& filename, + const size_t numReferenceSequences, + const PbiBuilder::CompressionLevel compressionLevel, + const size_t numThreads); + PbiBuilderPrivate(const string& filename, + const size_t numReferenceSequences, + const bool isCoordinateSorted, + const PbiBuilder::CompressionLevel compressionLevel, + const size_t numThreads); + ~PbiBuilderPrivate(void); + +public: + void 
AddRecord(const BamRecord& record, const int64_t vOffset); + +public: + bool HasBarcodeData(void) const; + bool HasMappedData(void) const; + bool HasReferenceData(void) const; + +public: + unique_ptr bgzf_; + PbiRawData rawData_; + PbiReferenceEntry::Row currentRow_; + unique_ptr refDataBuilder_; +}; + +// Overload without a sortedness flag. +// +// Delegates to the full constructor with isCoordinateSorted = true, so +// reference data is collected whenever numReferenceSequences > 0. This is +// the same behavior as the previous hand-copied body, without duplicating +// the BGZF-open and thread-setup logic. +// +PbiBuilderPrivate::PbiBuilderPrivate(const string& filename, + const size_t numReferenceSequences, + const PbiBuilder::CompressionLevel compressionLevel, + const size_t numThreads) + : PbiBuilderPrivate(filename, + numReferenceSequences, + true, + compressionLevel, + numThreads) +{ } + +PbiBuilderPrivate::PbiBuilderPrivate(const string& filename, + const size_t numReferenceSequences, + const bool isCoordinateSorted, + const PbiBuilder::CompressionLevel compressionLevel, + const size_t numThreads) + : internal::FileProducer(filename) + , bgzf_(nullptr) + , currentRow_(0) + , refDataBuilder_(nullptr) +{ + const string& usingFilename = TempFilename(); + const string& mode = string("wb") + to_string(static_cast(compressionLevel)); + bgzf_.reset(bgzf_open(usingFilename.c_str(), mode.c_str())); + if (bgzf_.get() == 0) + throw std::runtime_error("could not open PBI file for writing"); + + size_t actualNumThreads = numThreads; +
if (actualNumThreads == 0) { + actualNumThreads = thread::hardware_concurrency(); + + // if still unknown, default to single-threaded + if (actualNumThreads == 0) + actualNumThreads = 1; + } + + // if multithreading requested, enable it + if (actualNumThreads > 1) + bgzf_mt(bgzf_.get(), actualNumThreads, 256); + + if (isCoordinateSorted && numReferenceSequences > 0) + refDataBuilder_.reset(new PbiRawReferenceDataBuilder(numReferenceSequences)); +} + +PbiBuilderPrivate::~PbiBuilderPrivate(void) +{ + rawData_.NumReads(currentRow_); + + const auto hasBarcodeData = HasBarcodeData(); + const auto hasMappedData = HasMappedData(); + const auto hasReferenceData = HasReferenceData(); + + // fetch reference data, if available + if (hasReferenceData) { + assert(refDataBuilder_); + rawData_.ReferenceData() = refDataBuilder_->Result(); + } + + // determine flags + PbiFile::Sections sections = PbiFile::BASIC; + if (hasMappedData) sections |= PbiFile::MAPPED; + if (hasBarcodeData) sections |= PbiFile::BARCODE; + if (hasReferenceData) sections |= PbiFile::REFERENCE; + rawData_.FileSections(sections); + + // write index contents to file + BGZF* fp = bgzf_.get(); + PbiIndexIO::WriteHeader(rawData_, fp); + const uint32_t numReads = rawData_.NumReads(); + if (numReads > 0) { + PbiIndexIO::WriteBasicData(rawData_.BasicData(), numReads, fp); + if (hasMappedData) PbiIndexIO::WriteMappedData(rawData_.MappedData(), numReads, fp); + if (hasReferenceData) PbiIndexIO::WriteReferenceData(rawData_.ReferenceData(), fp); + if (hasBarcodeData) PbiIndexIO::WriteBarcodeData(rawData_.BarcodeData(), numReads, fp); + } +} + +void PbiBuilderPrivate::AddRecord(const BamRecord& record, const int64_t vOffset) +{ + // ensure updated data + record.ResetCachedPositions(); + + // store data + rawData_.BarcodeData().AddRecord(record); + rawData_.BasicData().AddRecord(record, vOffset); + rawData_.MappedData().AddRecord(record); + + if (refDataBuilder_) { + + // stop storing coordinate-sorted reference data if we 
encounter out-of-order record + const bool sorted = refDataBuilder_->AddRecord(record, currentRow_); + if (!sorted) + refDataBuilder_.reset(); + } + + // increment row counter + ++currentRow_; +} + +bool PbiBuilderPrivate::HasBarcodeData(void) const +{ + // fetch data components + const auto& barcodeData = rawData_.BarcodeData(); + const auto& bcForward = barcodeData.bcForward_; + const auto& bcReverse = barcodeData.bcReverse_; + const auto& bcQuality = barcodeData.bcQual_; + + // ensure valid sizes + if (bcForward.size() != bcReverse.size() && + bcForward.size() != bcQuality.size()) + { + auto msg = string{ "error: inconsistency in PBI barcode data:\n" }; + msg += string{ " bcForward has " } + to_string(bcForward.size()) + string{ " elements\n" }; + msg += string{ " bcReverse has " } + to_string(bcReverse.size()) + string{ " elements\n" }; + msg += string{ " bcQuality has " } + to_string(bcQuality.size()) + string{ " elements\n" }; + msg += string{ "\n" }; + msg += string{ " these containers should contain equal number of elements.\n" }; + throw std::runtime_error(msg); + } + assert(bcForward.size() == rawData_.NumReads()); + + // check for data + for (uint32_t i = 0; i < rawData_.NumReads(); ++i) { + if (bcForward.at(i) != -1 || + bcReverse.at(i) != -1 || + bcQuality.at(i) != -1 ) + { + return true; + } + } + // no actual data found + return false; +} + +bool PbiBuilderPrivate::HasMappedData(void) const +{ + const auto& mappedData = rawData_.MappedData(); + const auto& tIds = mappedData.tId_; + assert(tIds.size() == rawData_.NumReads()); + for (const auto tId : tIds) { + if (tId >= 0) + return true; + } + return false; // all reads unmapped +} + +bool PbiBuilderPrivate::HasReferenceData(void) const +{ return bool(refDataBuilder_); } + +} // namespace internal +} // namespace BAM +} // namespace PacBio + +// --------------------------- +// PbiBuilder implementation +// --------------------------- + +PbiBuilder::PbiBuilder(const string& pbiFilename, + const 
CompressionLevel compressionLevel, + const size_t numThreads) + : d_(new internal::PbiBuilderPrivate(pbiFilename, + 0, + compressionLevel, + numThreads)) +{ } + +PbiBuilder::PbiBuilder(const string& pbiFilename, + const size_t numReferenceSequences, + const CompressionLevel compressionLevel, + const size_t numThreads) + : d_(new internal::PbiBuilderPrivate(pbiFilename, + numReferenceSequences, + compressionLevel, + numThreads)) +{ } + +PbiBuilder::PbiBuilder(const string& pbiFilename, + const size_t numReferenceSequences, + const bool isCoordinateSorted, + const CompressionLevel compressionLevel, + const size_t numThreads) + : d_(new internal::PbiBuilderPrivate(pbiFilename, + numReferenceSequences, + isCoordinateSorted, + compressionLevel, + numThreads)) +{ } + +PbiBuilder::~PbiBuilder(void) { } + +void PbiBuilder::AddRecord(const BamRecord& record, const int64_t vOffset) +{ + internal::BamRecordMemory::UpdateRecordTags(record); + d_->AddRecord(record, vOffset); +} + +const PbiRawData& PbiBuilder::Index(void) const +{ return d_->rawData_; } diff --git a/src/PbiFile.cpp b/src/PbiFile.cpp new file mode 100644 index 0000000..144c847 --- /dev/null +++ b/src/PbiFile.cpp @@ -0,0 +1,74 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. 
+// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file PbiFile.cpp +/// \brief Implements the PbiFile methods. 
+// +// Author: Derek Barnett + +#include "pbbam/PbiFile.h" +#include "pbbam/BamFile.h" +#include "pbbam/PbiBuilder.h" +#include "pbbam/BamReader.h" +using namespace PacBio; +using namespace PacBio::BAM; +using namespace PacBio::BAM::PbiFile; +using namespace std; + +namespace PacBio { +namespace BAM { +namespace PbiFile { + +void CreateFrom(const BamFile& bamFile, + const PbiBuilder::CompressionLevel compressionLevel, + const size_t numThreads) +{ + PbiBuilder builder(bamFile.PacBioIndexFilename(), + bamFile.Header().Sequences().size(), + compressionLevel, + numThreads); + BamReader reader(bamFile); + BamRecord b; + int64_t offset = reader.VirtualTell(); + while (reader.GetNext(b)) { + builder.AddRecord(b, offset); + offset = reader.VirtualTell(); + } +} + +} // namespace PbiFile +} // namespace BAM +} // namespace PacBio diff --git a/src/PbiFilter.cpp b/src/PbiFilter.cpp new file mode 100644 index 0000000..0ce8930 --- /dev/null +++ b/src/PbiFilter.cpp @@ -0,0 +1,365 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. 
THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file PbiFilter.cpp +/// \brief Implements the PbiFilter class. +// +// Author: Derek Barnett + +#include "pbbam/PbiFilter.h" +#include "pbbam/PbiFilterTypes.h" +#include "StringUtils.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace PacBio::BAM::internal; +using namespace std; + +namespace PacBio { +namespace BAM { +namespace internal { + +enum class BuiltIn +{ + AlignedEndFilter + , AlignedLengthFilter + , AlignedStartFilter + , AlignedStrandFilter + , BarcodeFilter + , BarcodeForwardFilter + , BarcodeQualityFilter + , BarcodeReverseFilter + , BarcodesFilter + , IdentityFilter + , LocalContextFilter + , MovieNameFilter + , NumDeletedBasesFilter + , NumInsertedBasesFilter + , NumMatchesFilter + , NumMismatchesFilter + , QueryEndFilter + , QueryLengthFilter + , QueryNameFilter + , QueryNamesFromFileFilter + , QueryStartFilter + , ReadAccuracyFilter + , ReadGroupFilter + , ReferenceEndFilter + , ReferenceIdFilter + , ReferenceNameFilter + , ReferenceStartFilter + , ZmwFilter +}; + +static const unordered_map 
builtInLookup = +{ + // property name built-in filter + { "ae", BuiltIn::AlignedEndFilter }, + { "aend", BuiltIn::AlignedEndFilter }, + { "alignedlength", BuiltIn::AlignedLengthFilter }, + { "as", BuiltIn::AlignedStartFilter }, + { "astart", BuiltIn::AlignedStartFilter }, + { "readstart", BuiltIn::AlignedStartFilter }, + { "bc", BuiltIn::BarcodeFilter }, + { "barcode", BuiltIn::BarcodeFilter }, + { "bcf", BuiltIn::BarcodeForwardFilter }, + { "bq", BuiltIn::BarcodeQualityFilter }, + { "bcq", BuiltIn::BarcodeQualityFilter }, + { "bcr", BuiltIn::BarcodeReverseFilter }, + { "accuracy", BuiltIn::IdentityFilter }, + { "identity", BuiltIn::IdentityFilter }, + { "cx", BuiltIn::LocalContextFilter }, + { "movie", BuiltIn::MovieNameFilter }, + { "qe", BuiltIn::QueryEndFilter }, + { "qend", BuiltIn::QueryEndFilter }, + { "length", BuiltIn::QueryLengthFilter }, + { "querylength", BuiltIn::QueryLengthFilter }, + { "qname", BuiltIn::QueryNameFilter }, + { "qname_file", BuiltIn::QueryNamesFromFileFilter }, + { "qs", BuiltIn::QueryStartFilter }, + { "qstart", BuiltIn::QueryStartFilter }, + { "rq", BuiltIn::ReadAccuracyFilter }, + { "te", BuiltIn::ReferenceEndFilter }, + { "tend", BuiltIn::ReferenceEndFilter }, + { "rname", BuiltIn::ReferenceNameFilter }, + { "ts", BuiltIn::ReferenceStartFilter }, + { "tstart", BuiltIn::ReferenceStartFilter }, + { "pos", BuiltIn::ReferenceStartFilter }, + { "zm", BuiltIn::ZmwFilter }, + { "zmw", BuiltIn::ZmwFilter } +}; + +static const unordered_map contextFlagNames = +{ + { "NO_LOCAL_CONTEXT", LocalContextFlags::NO_LOCAL_CONTEXT }, + { "ADAPTER_BEFORE", LocalContextFlags::ADAPTER_BEFORE }, + { "ADAPTER_AFTER", LocalContextFlags::ADAPTER_AFTER }, + { "BARCODE_BEFORE", LocalContextFlags::BARCODE_BEFORE }, + { "BARCODE_AFTER", LocalContextFlags::BARCODE_AFTER }, + { "FORWARD_PASS", LocalContextFlags::FORWARD_PASS }, + { "REVERSE_PASS", LocalContextFlags::REVERSE_PASS } +}; + +// helper methods (for handling maybe-list strings)) +static inline bool 
isBracketed(const string& value) +{ + static const string openBrackets = "[({"; + static const string closeBrackets = "])}"; + return openBrackets.find(value.at(0)) != string::npos && + closeBrackets.find(value.at(value.length()-1)) != string::npos; +}; + +static inline bool isList(const string& value) +{ + return value.find(',') != string::npos; +} + +static +PbiFilter CreateBarcodeFilter(string value, + const Compare::Type compareType) +{ + if (value.empty()) + throw std::runtime_error("empty value for barcode filter property"); + + if (isBracketed(value)) { + value.erase(0,1); + value.pop_back(); + } + + if (isList(value)) { + vector barcodes = internal::Split(value, ','); + if (barcodes.size() != 2) + throw std::runtime_error("only 2 barcode values expected"); + return PbiBarcodesFilter{ boost::numeric_cast(stoi(barcodes.at(0))), + boost::numeric_cast(stoi(barcodes.at(1))), + compareType + }; + } else + return PbiBarcodeFilter{ boost::numeric_cast(stoi(value)), compareType }; +} + +static +PbiFilter CreateBarcodeForwardFilter(string value, + const Compare::Type compareType) +{ + if (value.empty()) + throw std::runtime_error("empty value for barcode_forward filter property"); + + if (isBracketed(value)) { + value.erase(0,1); + value.pop_back(); + } + + if (isList(value)) { + vector tokens = internal::Split(value, ','); + vector barcodes; + barcodes.reserve(tokens.size()); + for (const auto& t : tokens) + barcodes.push_back(boost::numeric_cast(stoi(t))); + return PbiBarcodeForwardFilter{ std::move(barcodes) }; + } else + return PbiBarcodeForwardFilter{ boost::numeric_cast(stoi(value)), compareType }; +} + +static +PbiFilter CreateBarcodeReverseFilter(string value, + const Compare::Type compareType) +{ + if (value.empty()) + throw std::runtime_error("empty value for barcode_reverse filter property"); + + if (isBracketed(value)) { + value.erase(0,1); + value.pop_back(); + } + + if (isList(value)) { + vector tokens = internal::Split(value, ','); + vector barcodes; + 
barcodes.reserve(tokens.size()); + for (const auto& t : tokens) + barcodes.push_back(boost::numeric_cast(stoi(t))); + return PbiBarcodeReverseFilter{ std::move(barcodes) }; + } else + return PbiBarcodeReverseFilter{ boost::numeric_cast(stoi(value)), compareType }; +} + +static +PbiFilter CreateLocalContextFilter(const string& value, + const Compare::Type compareType) +{ + if (value.empty()) + throw std::runtime_error("empty value for local context filter property"); + + LocalContextFlags filterValue = LocalContextFlags::NO_LOCAL_CONTEXT; + + // if raw integer + if (isdigit(value.at(0))) + filterValue = static_cast(stoi(value)); + + // else interpret as flag names + else { + vector tokens = internal::Split(value, '|'); + for (string& token : tokens) { + boost::algorithm::trim(token); // trim whitespace + filterValue = (filterValue | contextFlagNames.at(token)); + } + } + + return PbiFilter{ PbiLocalContextFilter{filterValue, compareType} }; +} + +static +PbiFilter CreateQueryNamesFilterFromFile(const string& value, + const DataSet& dataset) +{ + // resolve file from dataset, value + const string resolvedFilename = dataset.ResolvePath(value); + vector whitelist; + string fn; + ifstream in(resolvedFilename); + while (getline(in, fn)) + whitelist.push_back(fn); + return PbiQueryNameFilter{ whitelist }; +} + +static +PbiFilter FromDataSetProperty(const Property& property, + const DataSet& dataset) +{ + try { + const string& value = property.Value(); + const Compare::Type compareType = Compare::TypeFromOperator(property.Operator()); + const BuiltIn builtInCode = builtInLookup.at(boost::algorithm::to_lower_copy(property.Name())); + switch (builtInCode) { + + // single-value filters + case BuiltIn::AlignedEndFilter : return PbiAlignedEndFilter{ static_cast(stoul(value)), compareType }; + case BuiltIn::AlignedLengthFilter : return PbiAlignedLengthFilter{ static_cast(stoul(value)), compareType }; + case BuiltIn::AlignedStartFilter : return PbiAlignedStartFilter{ 
static_cast(stoul(value)), compareType }; + case BuiltIn::BarcodeQualityFilter : return PbiBarcodeQualityFilter{ static_cast(stoul(value)), compareType }; + case BuiltIn::IdentityFilter : return PbiIdentityFilter{ stof(value), compareType }; + case BuiltIn::MovieNameFilter : return PbiMovieNameFilter{ value }; + case BuiltIn::QueryEndFilter : return PbiQueryEndFilter{ stoi(value), compareType }; + case BuiltIn::QueryLengthFilter : return PbiQueryLengthFilter{ stoi(value), compareType }; + case BuiltIn::QueryNameFilter : return PbiQueryNameFilter{ value }; + case BuiltIn::QueryStartFilter : return PbiQueryStartFilter{ stoi(value), compareType }; + case BuiltIn::ReadAccuracyFilter : return PbiReadAccuracyFilter{ stof(value), compareType }; + case BuiltIn::ReadGroupFilter : return PbiReadGroupFilter{ value, compareType }; + case BuiltIn::ReferenceEndFilter : return PbiReferenceEndFilter{ static_cast(stoul(value)), compareType }; + case BuiltIn::ReferenceIdFilter : return PbiReferenceIdFilter{ stoi(value), compareType }; + case BuiltIn::ReferenceNameFilter : return PbiReferenceNameFilter{ value }; + case BuiltIn::ReferenceStartFilter : return PbiReferenceStartFilter{ static_cast(stoul(value)), compareType }; + case BuiltIn::ZmwFilter : return PbiZmwFilter{ stoi(value), compareType }; + + // (maybe) list-value filters + case BuiltIn::BarcodeFilter : return CreateBarcodeFilter(value, compareType); + case BuiltIn::BarcodeForwardFilter : return CreateBarcodeForwardFilter(value, compareType); + case BuiltIn::BarcodeReverseFilter : return CreateBarcodeReverseFilter(value, compareType); + case BuiltIn::LocalContextFilter : return CreateLocalContextFilter(value, compareType); + + // other built-ins + case BuiltIn::QueryNamesFromFileFilter : return CreateQueryNamesFilterFromFile(value, dataset); // compareType ignored + + default : + throw std::exception(); + } + // unreachable + return PbiFilter{ }; + + } catch (std::exception& e) { + stringstream s; + s << "error: could not 
create filter from XML Property element: " << endl + << " Name: " << property.Name() << endl + << " Value: " << property.Value() << endl + << " Operator: " << property.Operator() << endl + << " reason: " << e.what() << endl; + throw std::runtime_error(s.str()); + } +} + +} // namespace internal +} // namespace BAM +} // namespace PacBio + +PbiFilter PbiFilter::FromDataSet(const DataSet& dataset) +{ + auto datasetFilter = PbiFilter{ PbiFilter::UNION }; + for (auto&& xmlFilter : dataset.Filters()) { + auto propertiesFilter = PbiFilter{ }; + for (auto&& xmlProperty : xmlFilter.Properties()) + propertiesFilter.Add(internal::FromDataSetProperty(xmlProperty, dataset)); + datasetFilter.Add(propertiesFilter); + } + return datasetFilter; +} + +PbiFilter PbiFilter::Intersection(const std::vector& filters) +{ + auto result = PbiFilter{ PbiFilter::INTERSECT }; + result.Add(filters); + return result; +} + +PbiFilter PbiFilter::Intersection(std::vector&& filters) +{ + auto result = PbiFilter{ PbiFilter::INTERSECT }; + result.Add(std::move(filters)); + return result; +} + +PbiFilter PbiFilter::Union(const std::vector& filters) +{ + auto result = PbiFilter{ PbiFilter::UNION }; + result.Add(filters); + return result; +} + +PbiFilter PbiFilter::Union(std::vector&& filters) +{ + auto result = PbiFilter{ PbiFilter::UNION }; + result.Add(std::move(filters)); + return result; +} diff --git a/src/PbiFilterQuery.cpp b/src/PbiFilterQuery.cpp new file mode 100644 index 0000000..19d2b31 --- /dev/null +++ b/src/PbiFilterQuery.cpp @@ -0,0 +1,70 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file PbiFilterQuery.cpp +/// \brief Implements the PbiFilterQuery class. 
+// +// Author: Derek Barnett + +#include "pbbam/PbiFilterQuery.h" +#include "pbbam/CompositeBamReader.h" + + +#include + +using namespace PacBio; +using namespace PacBio::BAM; +using namespace PacBio::BAM::internal; +using namespace std; + +struct PbiFilterQuery::PbiFilterQueryPrivate +{ + PbiFilterQueryPrivate(const PbiFilter& filter, const DataSet& dataset) + : reader_(filter, dataset) + { } + + PbiFilterCompositeBamReader reader_; // unsorted +}; + +PbiFilterQuery::PbiFilterQuery(const PbiFilter& filter, const DataSet& dataset) + : internal::IQuery() + , d_(new PbiFilterQueryPrivate(filter, dataset)) +{ } + +PbiFilterQuery::~PbiFilterQuery(void) { } + +bool PbiFilterQuery::GetNext(BamRecord &r) +{ return d_->reader_.GetNext(r); } diff --git a/src/PbiFilterTypes.cpp b/src/PbiFilterTypes.cpp new file mode 100644 index 0000000..13ff375 --- /dev/null +++ b/src/PbiFilterTypes.cpp @@ -0,0 +1,450 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. 
THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file PbiFilterTypes.cpp +/// \brief Implements the built-in PBI filters. +// +// Author: Derek Barnett + +#include "pbbam/PbiFilterTypes.h" +#include "StringUtils.h" +#include +#include +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace PacBio::BAM::internal; +using namespace std; + +namespace PacBio { +namespace BAM { +namespace internal { + +template +IndexList readLengthHelper(const std::vector& start, + const std::vector& end, + const T& value, + const Compare::Type cmp) +{ + assert(start.size() == end.size()); + + auto result = IndexList{ }; + const auto numElements = start.size(); + for (size_t i = 0; i < numElements; ++i) { + const auto readLength = end[i] - start[i]; + bool keep = false; + switch(cmp) { + case Compare::EQUAL : keep = (readLength == value); break; + case Compare::NOT_EQUAL : keep = (readLength != value); break; + case Compare::LESS_THAN : keep = (readLength < value); break; + case Compare::LESS_THAN_EQUAL : keep = (readLength <= value); break; + case Compare::GREATER_THAN : keep = (readLength > value); break; + case Compare::GREATER_THAN_EQUAL : keep = (readLength >= value); 
break; + default: + assert(false); + throw std::runtime_error(string{"read length filter encountered unknown Compare::Type: "} + + Compare::TypeToName(cmp)); + } + + if (keep) + result.push_back(i); + } + return result; +} + +static +PbiFilter filterFromMovieName(const string& movieName, bool includeCcs) +{ + // we'll match on any rgIds from our candidate list + auto filter = PbiFilter{ PbiFilter::UNION }; + filter.Add( + { + PbiReadGroupFilter{ MakeReadGroupId(movieName, "POLYMERASE") }, + PbiReadGroupFilter{ MakeReadGroupId(movieName, "HQREGION") }, + PbiReadGroupFilter{ MakeReadGroupId(movieName, "SUBREAD") }, + PbiReadGroupFilter{ MakeReadGroupId(movieName, "SCRAP") }, + PbiReadGroupFilter{ MakeReadGroupId(movieName, "UNKNOWN") } + }); + if (includeCcs) + filter.Add(PbiReadGroupFilter{ MakeReadGroupId(movieName, "CCS") }); + + return filter; +} + +//static +//PbiFilter filterFromQueryName(const string& queryName) +//{ +// // split full name into moviename, holenumber +// const auto nameParts = internal::Split(queryName, '/'); +// if (nameParts.size() != 3) { +// auto msg = string{ "PbiQueryNameFilter error: requested QNAME (" } + queryName; +// msg += string{ ") is not a valid PacBio BAM QNAME. 
See spec for details"}; +// throw std::runtime_error(msg); +// } +// +// // main filter: {union of candidate rgIds} && zmw [&& qStart && qEnd](non-CCS reads) +// auto filter = PbiFilter{ }; +// filter.Add(PbiZmwFilter{ stoi(nameParts.at(1)) }); // hole number +// +// const auto movieName = nameParts.at(0); +// +// // CCS (only 1 possible candidate rgId) +// if (nameParts.at(2) == "ccs") +// filter.Add(PbiReadGroupFilter{ MakeReadGroupId(movieName, "CCS") }); +// +// // all other read types +// else { +// // we'll match on any read type that matches our qname +// // (except for CCS since it has a different QNAME anyway) +// const auto rgIdFilter = filterFromMovieName(movieName, false); +// filter.Add(rgIdFilter); +// +// // add qStart/qEnd filters to our main filter +// const auto queryIntervalParts = internal::Split(nameParts.at(2), '_'); +// if (queryIntervalParts.size() != 2) { +// auto msg = string{ "PbiQueryNameFilter error: requested QNAME (" } + queryName; +// msg += string{ ") is not a valid PacBio BAM QNAME. 
See spec for details"}; +// throw std::runtime_error(msg); +// } +// filter.Add(PbiQueryStartFilter{ stoi(queryIntervalParts.at(0)) }); +// filter.Add(PbiQueryEndFilter{ stoi(queryIntervalParts.at(1)) }); +// } +// return filter; +//} + +} // namespace internal +} // namespace BAM +} // namespace PacBio + +// PbiAlignedLengthFilter + +bool PbiAlignedLengthFilter::Accepts(const PbiRawData& idx, const size_t row) const +{ + const auto& mappedData = idx.MappedData(); + const auto& aEnd = mappedData.aEnd_.at(row) ; + const auto& aStart = mappedData.aStart_.at(row); + const auto aLength = aEnd - aStart; + return CompareHelper(aLength); +} + +// PbiIdentityFilter + +bool PbiIdentityFilter::Accepts(const PbiRawData& idx, const size_t row) const +{ + const auto& mappedData = idx.MappedData(); + const auto& nMM = mappedData.nMM_.at(row); + const auto& nIndels = mappedData.NumDeletedAndInsertedBasesAt(row); + const auto& nDel = nIndels.first; + const auto& nIns = nIndels.second; + + const auto& basicData = idx.BasicData(); + const auto& qStart = basicData.qStart_.at(row); + const auto& qEnd = basicData.qEnd_.at(row); + + const auto readLength = qEnd - qStart; + const auto nonMatches = nMM + nDel + nIns; + const float identity = 1.0 - (static_cast(nonMatches)/static_cast(readLength)); + + return CompareHelper(identity); +} + +// PbiMovieNameFilter + +PbiMovieNameFilter::PbiMovieNameFilter(const std::string& movieName) + : compositeFilter_(internal::filterFromMovieName(movieName, true)) // include CCS +{ } + +PbiMovieNameFilter::PbiMovieNameFilter(const std::vector& whitelist) + : compositeFilter_(PbiFilter::UNION) +{ + for (const auto& movieName : whitelist) + compositeFilter_.Add(internal::filterFromMovieName(movieName, true)); // include CCS +} + +PbiMovieNameFilter::PbiMovieNameFilter(std::vector&& whitelist) + : compositeFilter_(PbiFilter::UNION) +{ + for (auto&& movieName : whitelist) + compositeFilter_.Add(internal::filterFromMovieName(movieName, true)); // include CCS 
+} + +// PbiQueryLengthFilter + +bool PbiQueryLengthFilter::Accepts(const PbiRawData& idx, const size_t row) const +{ + const auto& basicData = idx.BasicData(); + const auto& qStart = basicData.qStart_.at(row); + const auto& qEnd = basicData.qEnd_.at(row); + const auto readLength = qEnd - qStart; + return CompareHelper(readLength); +} + +// PbiQueryNameFilter + +struct PbiQueryNameFilter::PbiQueryNameFilterPrivate +{ +public: + typedef pair QueryInterval; + typedef set QueryIntervals; + typedef unordered_map ZmwLookup; + typedef shared_ptr ZmwLookupPtr; + typedef unordered_map RgIdLookup; + +public: + PbiQueryNameFilterPrivate(const vector& whitelist) + { + for (const auto& queryName : whitelist) { + + // split name into main parts + auto nameParts = internal::Split(queryName, '/'); + if (nameParts.size() != 3) { + auto msg = string{ "PbiQueryNameFilter error: requested QNAME (" } + queryName; + msg += string{ ") is not a valid PacBio BAM QNAME. See spec for details"}; + throw std::runtime_error(msg); + } + + // + // generate candidate read group IDs from movie name + // + // then, ensure read group IDs in lookup table, creating or fetching + // shared ZmwLookup table if new movie + // + const string& movieName = nameParts.at(0); + const bool isCCS = (nameParts.at(2) == "ccs" || nameParts.at(2) == "CCS"); + vector rgIds; + if (isCCS) { + rgIds.push_back( ReadGroupInfo::IdToInt(MakeReadGroupId(movieName, "CCS")) ); + } else { + rgIds.reserve(6); + rgIds.push_back( ReadGroupInfo::IdToInt(MakeReadGroupId(movieName, "POLYMERASE"))); + rgIds.push_back( ReadGroupInfo::IdToInt(MakeReadGroupId(movieName, "HQREGION"))); + rgIds.push_back( ReadGroupInfo::IdToInt(MakeReadGroupId(movieName, "SUBREAD"))); + rgIds.push_back( ReadGroupInfo::IdToInt(MakeReadGroupId(movieName, "SCRAP"))); + rgIds.push_back( ReadGroupInfo::IdToInt(MakeReadGroupId(movieName, "UNKNOWN"))); + rgIds.push_back( ReadGroupInfo::IdToInt(MakeReadGroupId(movieName, "ZMW"))); + } + assert(!rgIds.empty()); + 
auto rgFound = lookup_.find(rgIds.front()); + ZmwLookupPtr zmwPtr = nullptr; + if (rgFound == lookup_.end()) { + zmwPtr = ZmwLookupPtr(new ZmwLookup); + for (const auto& rg : rgIds) { + assert(lookup_.find(rg) == lookup_.end()); + lookup_.emplace(rg, zmwPtr); + } + } + else { +#ifndef NDEBUG + for (const auto& rg : rgIds) + assert(lookup_.find(rg) != lookup_.end()); +#endif + zmwPtr = rgFound->second; + } + + // fetch ZMW & QueryStart/QEnd from query name + const int32_t zmw = stoi(nameParts.at(1)); + int32_t queryStart = -1; + int32_t queryEnd = -1; + if (!isCCS) { + const auto queryIntervalParts = internal::Split(nameParts.at(2), '_'); + if (queryIntervalParts.size() != 2) { + auto msg = string{ "PbiQueryNameFilter error: requested QNAME (" } + queryName; + msg += string{ ") is not a valid PacBio BAM QNAME. See spec for details"}; + throw std::runtime_error(msg); + } + queryStart = stoi(queryIntervalParts.at(0)); + queryEnd = stoi(queryIntervalParts.at(1)); + } + + // creating new ZMW entry if not yet seen & store QS/QE pair + // + const auto zmwFound = zmwPtr->find(zmw); + if (zmwFound == zmwPtr->end()) + zmwPtr->emplace(zmw, QueryIntervals{}); + QueryIntervals& queryIntervals = zmwPtr->at(zmw); + queryIntervals.emplace(make_pair(queryStart, queryEnd)); + } + } + + PbiQueryNameFilterPrivate(const unique_ptr& other) + { + if (other) + lookup_ = other->lookup_; + } + + bool Accepts(const PbiRawData& idx, const size_t row) const + { + const auto& basicData = idx.BasicData(); + + // see if row's RGID known + const auto& rgId = basicData.rgId_.at(row); + const auto rgFound = lookup_.find(rgId); + if (rgFound == lookup_.end()) + return false; + + // see if row's ZMW known + const auto& zmwPtr = rgFound->second; + const auto zmw = basicData.holeNumber_.at(row); + const auto zmwFound = zmwPtr->find(zmw); + if (zmwFound == zmwPtr->end()) + return false; + + // see if row's QueryStart/QueryEnd known + // CCS names already covered in lookup construction phase + const auto& 
queryIntervals = zmwFound->second; + const auto qStart = basicData.qStart_.at(row); + const auto qEnd = basicData.qEnd_.at(row); + const auto queryInterval = make_pair(qStart, qEnd); + return queryIntervals.find(queryInterval) != queryIntervals.end(); + } + +private: + RgIdLookup lookup_; +}; + +PbiQueryNameFilter::PbiQueryNameFilter(const std::string& qname) + : d_(new PbiQueryNameFilter::PbiQueryNameFilterPrivate(vector{1, qname})) +{ } +// : compositeFilter_(internal::filterFromQueryName(qname)) +//{ } + +PbiQueryNameFilter::PbiQueryNameFilter(const std::vector& whitelist) + : d_(new PbiQueryNameFilter::PbiQueryNameFilterPrivate(whitelist)) +{ } +// : compositeFilter_(PbiFilter::UNION) +//{ +// try { +// for (const auto& qname : whitelist) +// compositeFilter_.Add(internal::filterFromQueryName(qname)); +// } +// // simply re-throw our own exception +// catch (std::runtime_error&) { +// throw; +// } +// // we may hit other exceptions (e.g. in stoi()) - but we'll pin on a bit of extra data +// catch (std::exception& e) { +// auto msg = string{ "PbiQueryNameFilter encountered error: " } + e.what(); +// throw std::runtime_error(msg); +// } +//} + +//PbiQueryNameFilter::PbiQueryNameFilter(std::vector&& whitelist) +// : d_(new PbiQueryNameFilter::PbiQueryNameFilterPrivate(whitelist)) +//{ } +// : compositeFilter_(PbiFilter::UNION) +//{ +// try { +// for (const auto& qname : whitelist) +// compositeFilter_.Add(internal::filterFromQueryName(qname)); +// } +// // simply re-throw our own exception +// catch (std::runtime_error&) { +// throw; +// } +// // we may hit other exceptions (e.g. 
in stoi()) - but we'll pin on a bit of extra data +// catch (std::exception& e) { +// auto msg = string{ "PbiQueryNameFilter encountered error: " } + e.what(); +// throw std::runtime_error(msg); +// } +//} + +PbiQueryNameFilter::PbiQueryNameFilter(const PbiQueryNameFilter& other) + : d_(new PbiQueryNameFilter::PbiQueryNameFilterPrivate(other.d_)) +{ } + +PbiQueryNameFilter::~PbiQueryNameFilter(void) { } + +bool PbiQueryNameFilter::Accepts(const PbiRawData& idx, const size_t row) const +{ return d_->Accepts(idx, row); } +//{ return compositeFilter_.Accepts(idx, row); } + +// PbiReferenceNameFilter + +PbiReferenceNameFilter::PbiReferenceNameFilter(const std::string& rname, + const Compare::Type cmp) + : initialized_(false) + , rname_(rname) + , cmp_(cmp) +{ + if (cmp != Compare::EQUAL && cmp != Compare::NOT_EQUAL) { + auto msg = std::string{ "Compare type: " }; + msg += Compare::TypeToName(cmp); + msg += " not supported for PbiReferenceNameFilter (use one of Compare::EQUAL or Compare::NOT_EQUAL)."; + throw std::runtime_error(msg); + } +} + +PbiReferenceNameFilter::PbiReferenceNameFilter(const std::vector& whitelist) + : initialized_(false) + , rnameWhitelist_(whitelist) + , cmp_(Compare::EQUAL) +{ } + +PbiReferenceNameFilter::PbiReferenceNameFilter(std::vector&& whitelist) + : initialized_(false) + , rnameWhitelist_(std::move(whitelist)) + , cmp_(Compare::EQUAL) +{ } + +bool PbiReferenceNameFilter::Accepts(const PbiRawData& idx, const size_t row) const +{ + if (!initialized_) + Initialize(idx); + return subFilter_.Accepts(idx, row); +} + +void PbiReferenceNameFilter::Initialize(const PbiRawData& idx) const +{ + const auto pbiFilename = idx.Filename(); + const auto bamFilename = pbiFilename.substr(0, pbiFilename.length() - 4); + const auto bamFile = BamFile{ bamFilename }; + + // single-value + if (rnameWhitelist_ == boost::none) { + const auto tId = bamFile.ReferenceId(rname_); + subFilter_ = PbiReferenceIdFilter{ tId, cmp_ }; + } + + // multi-value whitelist + else 
{ + subFilter_ = PbiFilter(PbiFilter::UNION); + for (const auto& rname : rnameWhitelist_.get()) + subFilter_.Add(PbiReferenceIdFilter{ bamFile.ReferenceId(rname) }); + } + initialized_ = true; +} + diff --git a/src/PbiIndex.cpp b/src/PbiIndex.cpp new file mode 100644 index 0000000..da874b7 --- /dev/null +++ b/src/PbiIndex.cpp @@ -0,0 +1,218 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file PbiIndex.cpp +/// \brief Implements the PbiIndex class. +// +// Author: Derek Barnett + +#include "pbbam/PbiIndex.h" +#include "PbiIndexIO.h" +using namespace PacBio; +using namespace PacBio::BAM; +using namespace PacBio::BAM::internal; +using namespace std; + +// ---------------------------------- +// SubreadLookupData implementation +// ---------------------------------- + +BasicLookupData::BasicLookupData(void) { } + +BasicLookupData::BasicLookupData(const PbiRawBasicData& rawData) + : rgId_(rawData.rgId_) + , qStart_(rawData.qStart_) + , qEnd_(rawData.qEnd_) + , holeNumber_(rawData.holeNumber_) + , readQual_(rawData.readQual_) + , ctxtFlag_(rawData.ctxtFlag_) + , fileOffset_(rawData.fileOffset_) +{ } + +// ---------------------------------- +// MappedLookupData implementation +// ---------------------------------- + +MappedLookupData::MappedLookupData(void) { } + +MappedLookupData::MappedLookupData(const PbiRawMappedData& rawData) + : tId_(rawData.tId_) + , tStart_(rawData.tStart_) + , tEnd_(rawData.tEnd_) + , aStart_(rawData.aStart_) + , aEnd_(rawData.aEnd_) + , nM_(rawData.nM_) + , nMM_(rawData.nMM_) + , mapQV_(rawData.mapQV_) +{ + const size_t numElements = rawData.revStrand_.size(); + reverseStrand_.reserve(numElements/2); + forwardStrand_.reserve(numElements/2); + + std::map insRawData; + std::map delRawData; + for (size_t i = 0; i < numElements; ++i) { + + // nDel, nIns + const 
auto indels = rawData.NumDeletedAndInsertedBasesAt(i); + delRawData[indels.first].push_back(i); + insRawData[indels.second].push_back(i); + + // strand + if (rawData.revStrand_.at(i) == 0) + forwardStrand_.push_back(i); + else + reverseStrand_.push_back(i); + } + + nIns_ = OrderedLookup(std::move(insRawData)); + nDel_ = OrderedLookup(std::move(delRawData)); +} + +// ---------------------------------- +// BarcodeLookupData implementation +// ---------------------------------- + +BarcodeLookupData::BarcodeLookupData(void) { } + +BarcodeLookupData::BarcodeLookupData(const PbiRawBarcodeData& rawData) + : bcForward_(rawData.bcForward_) + , bcReverse_(rawData.bcReverse_) + , bcQual_(rawData.bcQual_) + +{ } + +// ---------------------------------- +// ReferenceLookupData implementation +// ---------------------------------- + +ReferenceLookupData::ReferenceLookupData(void) { } + +ReferenceLookupData::ReferenceLookupData(const PbiRawReferenceData& rawData) +{ + const size_t numEntries = rawData.entries_.size(); + references_.reserve(numEntries); + for (size_t i = 0; i < numEntries; ++i) { + const PbiReferenceEntry& entry = rawData.entries_.at(i); + references_[entry.tId_] = IndexRange(entry.beginRow_, entry.endRow_); + } +} + +// -------------------------------- +// PbiIndexPrivate implementation +// -------------------------------- + +PbiIndexPrivate::PbiIndexPrivate(void) + : version_(PbiFile::CurrentVersion) + , sections_(PbiFile::BASIC) + , numReads_(0) +{ } + +PbiIndexPrivate::PbiIndexPrivate(const PbiRawData& rawIndex) + : filename_(rawIndex.Filename()) + , version_(rawIndex.Version()) + , sections_(rawIndex.FileSections()) + , numReads_(rawIndex.NumReads()) + , basicData_(rawIndex.BasicData()) + , mappedData_(rawIndex.MappedData()) + , referenceData_(rawIndex.ReferenceData()) + , barcodeData_(rawIndex.BarcodeData()) +{ } + +PbiIndexPrivate::PbiIndexPrivate(PbiRawData&& rawIndex) + : filename_(rawIndex.Filename()) + , version_(std::move(rawIndex.Version())) + , 
sections_(std::move(rawIndex.FileSections())) + , numReads_(std::move(rawIndex.NumReads())) + , basicData_(std::move(rawIndex.BasicData())) + , mappedData_(std::move(rawIndex.MappedData())) + , referenceData_(std::move(rawIndex.ReferenceData())) + , barcodeData_(std::move(rawIndex.BarcodeData())) +{ } + +unique_ptr PbiIndexPrivate::DeepCopy(void) const +{ + std::unique_ptr copy(new PbiIndexPrivate); + copy->filename_ = filename_; + copy->version_ = version_; + copy->sections_ = sections_; + copy->numReads_ = numReads_; + copy->basicData_ = basicData_; + copy->mappedData_ = mappedData_; + copy->referenceData_ = referenceData_; + copy->barcodeData_ = barcodeData_; + return copy; +} + +// ------------------------- +// PbiIndex implementation +// ------------------------- + +PbiIndex::PbiIndex(void) + : d_(new PbiIndexPrivate) +{ } + +PbiIndex::PbiIndex(const string& pbiFilename) + : d_(new PbiIndexPrivate(PbiRawData(pbiFilename))) +{ } + +PbiIndex::PbiIndex(const PbiIndex& other) + : d_(std::forward>(other.d_->DeepCopy())) +{ + // move is ok, since it's a deep-copied, new object +} + +PbiIndex::PbiIndex(PbiIndex&& other) + : d_(std::move(other.d_)) +{ } + +PbiIndex& PbiIndex::operator=(const PbiIndex& other) +{ + // move is ok, since it's a deep-copied, new object + d_ = other.d_->DeepCopy(); + return *this; +} + +PbiIndex& PbiIndex::operator=(PbiIndex&& other) +{ + d_ = std::move(other.d_); + return *this; +} + +PbiIndex::~PbiIndex(void) { } + +string PbiIndex::Filename(void) const +{ return d_->filename_; } diff --git a/src/PbiIndexIO.cpp b/src/PbiIndexIO.cpp new file mode 100644 index 0000000..7d1f615 --- /dev/null +++ b/src/PbiIndexIO.cpp @@ -0,0 +1,474 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. 
+// +// Author: Derek Barnett + +#include "PbiIndexIO.h" + +#include "pbbam/BamFile.h" +#include "pbbam/BamRecord.h" +#include "pbbam/EntireFileQuery.h" +#include "pbbam/PbiBuilder.h" +#include "MemoryUtils.h" +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace PacBio::BAM::internal; +using namespace std; + +namespace PacBio { +namespace BAM { +namespace internal { + +// \brief Appends content of src vector to dst vector using move semantics. +/// +/// \param[in] src Input vector that will be empty after execution +/// \param[in,out] dst Output vector that will be appended to +/// +template +inline void MoveAppend(std::vector& src, std::vector& dst) noexcept +{ + if (dst.empty()) + { + dst = std::move(src); + } + else + { + dst.reserve(dst.size() + src.size()); + std::move(src.begin(), src.end(), std::back_inserter(dst)); + src.clear(); + } +} + +/// \brief Appends content of src vector to dst vector using move semantics. +/// +/// \param[in] src Input vector via perfect forwarding +/// \param[in,out] dst Output vector that will be appended to +/// +template +inline void MoveAppend(std::vector&& src, std::vector& dst) noexcept +{ + if (dst.empty()) + { + dst = std::move(src); + } + else + { + dst.reserve(dst.size() + src.size()); + std::move(src.begin(), src.end(), std::back_inserter(dst)); + src.clear(); + } +} + +} // namespace internal +} // namespace BAM +} // namespace PacBio + +// --------------------------- +// PbiIndexIO implementation +// --------------------------- + +PbiRawData PbiIndexIO::Load(const std::string& pbiFilename) +{ + PbiRawData rawData; + Load(rawData, pbiFilename); + return rawData; +} + +void PbiIndexIO::Load(PbiRawData& rawData, + const string& filename) +{ + // open file for reading + if (!boost::algorithm::iends_with(filename, ".pbi")) + throw std::runtime_error("unsupported file extension"); + std::unique_ptr bgzf(bgzf_open(filename.c_str(), "rb")); + BGZF* fp = bgzf.get(); + if (fp == 0) + 
throw std::runtime_error("could not open PBI file for reading"); + + // load data + LoadHeader(rawData, fp); + const uint32_t numReads = rawData.NumReads(); + if (numReads > 0) { + LoadBasicData(rawData.BasicData(), numReads, fp); + if (rawData.HasMappedData()) + LoadMappedData(rawData.MappedData(), numReads, fp); + if (rawData.HasReferenceData()) + LoadReferenceData(rawData.ReferenceData(), fp); + if (rawData.HasBarcodeData()) + LoadBarcodeData(rawData.BarcodeData(), numReads, fp); + } +} + +void PbiIndexIO::LoadFromDataSet(PbiRawData& aggregateData, + const DataSet& dataset) +{ + aggregateData.NumReads(0); + aggregateData.FileSections(PbiFile::BASIC | PbiFile::MAPPED | PbiFile::BARCODE); + aggregateData.Version(PbiFile::CurrentVersion); + + const auto bamFiles = dataset.BamFiles(); + uint16_t fileNumber = 0; + for (const auto& bamFile : bamFiles) { + PbiRawData currentPbi{bamFile.PacBioIndexFilename()}; + const size_t currentPbiCount = currentPbi.NumReads(); + + // read count + aggregateData.NumReads(aggregateData.NumReads()+currentPbiCount); + + // BasicData + PbiRawBasicData& aggregateBasicData = aggregateData.BasicData(); + PbiRawBasicData& currentBasicData = currentPbi.BasicData(); + MoveAppend(std::move(currentBasicData.rgId_), aggregateBasicData.rgId_); + MoveAppend(std::move(currentBasicData.qStart_), aggregateBasicData.qStart_); + MoveAppend(std::move(currentBasicData.qEnd_), aggregateBasicData.qEnd_); + MoveAppend(std::move(currentBasicData.holeNumber_), aggregateBasicData.holeNumber_); + MoveAppend(std::move(currentBasicData.readQual_), aggregateBasicData.readQual_); + MoveAppend(std::move(currentBasicData.ctxtFlag_), aggregateBasicData.ctxtFlag_); + MoveAppend(std::move(currentBasicData.fileOffset_), aggregateBasicData.fileOffset_); + MoveAppend(std::vector(currentPbiCount, fileNumber), aggregateBasicData.fileNumber_); + + // BarcodeData + PbiRawBarcodeData& aggregateBarcodeData = aggregateData.BarcodeData(); + if (currentPbi.HasBarcodeData()) { + 
PbiRawBarcodeData& currentBarcodeData = currentPbi.BarcodeData(); + MoveAppend(std::move(currentBarcodeData.bcForward_), aggregateBarcodeData.bcForward_); + MoveAppend(std::move(currentBarcodeData.bcReverse_), aggregateBarcodeData.bcReverse_); + MoveAppend(std::move(currentBarcodeData.bcQual_), aggregateBarcodeData.bcQual_); + } else { + MoveAppend(std::vector(currentPbiCount, -1), aggregateBarcodeData.bcForward_); + MoveAppend(std::vector(currentPbiCount, -1), aggregateBarcodeData.bcReverse_); + MoveAppend(std::vector(currentPbiCount, -1), aggregateBarcodeData.bcQual_); + } + + // MappedData + PbiRawMappedData& aggregateMappedData = aggregateData.MappedData(); + if (currentPbi.HasMappedData()) { + PbiRawMappedData& currentMappedData = currentPbi.MappedData(); + MoveAppend(std::move(currentMappedData.tId_), aggregateMappedData.tId_); + MoveAppend(std::move(currentMappedData.tStart_), aggregateMappedData.tStart_); + MoveAppend(std::move(currentMappedData.tEnd_), aggregateMappedData.tEnd_); + MoveAppend(std::move(currentMappedData.aStart_), aggregateMappedData.aStart_); + MoveAppend(std::move(currentMappedData.aEnd_), aggregateMappedData.aEnd_); + MoveAppend(std::move(currentMappedData.revStrand_), aggregateMappedData.revStrand_); + MoveAppend(std::move(currentMappedData.nM_), aggregateMappedData.nM_); + MoveAppend(std::move(currentMappedData.nMM_), aggregateMappedData.nMM_); + MoveAppend(std::move(currentMappedData.mapQV_), aggregateMappedData.mapQV_); + } else { + MoveAppend(std::vector(currentPbiCount, -1), aggregateMappedData.tId_); + MoveAppend(std::vector(currentPbiCount, UnmappedPosition), aggregateMappedData.tStart_); + MoveAppend(std::vector(currentPbiCount, UnmappedPosition), aggregateMappedData.tEnd_); + MoveAppend(std::vector(currentPbiCount, UnmappedPosition), aggregateMappedData.aStart_); + MoveAppend(std::vector(currentPbiCount, UnmappedPosition), aggregateMappedData.aEnd_); + MoveAppend(std::vector(currentPbiCount, 0), aggregateMappedData.revStrand_); 
+ MoveAppend(std::vector(currentPbiCount, 0), aggregateMappedData.nM_); + MoveAppend(std::vector(currentPbiCount, 0), aggregateMappedData.nMM_); + MoveAppend(std::vector(currentPbiCount, 255), aggregateMappedData.mapQV_); + } + + ++fileNumber; + } +} + +void PbiIndexIO::LoadBarcodeData(PbiRawBarcodeData& barcodeData, + const uint32_t numReads, + BGZF* fp) +{ + assert(numReads > 0); + (void)numReads; // quash warnings building in release mode + + LoadBgzfVector(fp, barcodeData.bcForward_, numReads); + LoadBgzfVector(fp, barcodeData.bcReverse_, numReads); + LoadBgzfVector(fp, barcodeData.bcQual_, numReads); + + assert(barcodeData.bcForward_.size() == numReads); + assert(barcodeData.bcReverse_.size() == numReads); + assert(barcodeData.bcQual_.size() == numReads); +} + +void PbiIndexIO::LoadHeader(PbiRawData& index, + BGZF* fp) +{ + size_t bytesRead = 0; + + // 'magic' string + char magic[4]; + bytesRead = bgzf_read(fp, magic, 4); + if (bytesRead != 4 || strncmp(magic, "PBI\1", 4)) + throw std::runtime_error("expected PBI file, found unknown format instead"); + + // version, pbi_flags, & n_reads + uint32_t version; + uint16_t sections; + uint32_t numReads; + bgzf_read(fp, &version, sizeof(version)); + bgzf_read(fp, &sections, sizeof(sections)); + bgzf_read(fp, &numReads, sizeof(numReads)); + if (fp->is_be) { + version = ed_swap_4(version); + sections = ed_swap_2(sections); + numReads = ed_swap_4(numReads); + } + + index.Version(PbiFile::VersionEnum(version)); + index.FileSections(PbiFile::Sections(sections)); + index.NumReads(numReads); + + // skip reserved section + size_t reservedLength = 18; + // adjust depending on version + char reserved[18]; + bytesRead = bgzf_read(fp, &reserved, reservedLength); +} + +void PbiIndexIO::LoadMappedData(PbiRawMappedData& mappedData, + const uint32_t numReads, + BGZF* fp) +{ + assert(numReads > 0); + (void)numReads; // quash warnings building in release mode + + LoadBgzfVector(fp, mappedData.tId_, numReads); + LoadBgzfVector(fp, 
mappedData.tStart_, numReads); + LoadBgzfVector(fp, mappedData.tEnd_, numReads); + LoadBgzfVector(fp, mappedData.aStart_, numReads); + LoadBgzfVector(fp, mappedData.aEnd_, numReads); + LoadBgzfVector(fp, mappedData.revStrand_, numReads); + LoadBgzfVector(fp, mappedData.nM_, numReads); + LoadBgzfVector(fp, mappedData.nMM_, numReads); + LoadBgzfVector(fp, mappedData.mapQV_, numReads); + + assert(mappedData.tId_.size() == numReads); + assert(mappedData.tStart_.size() == numReads); + assert(mappedData.tEnd_.size() == numReads); + assert(mappedData.aStart_.size() == numReads); + assert(mappedData.aEnd_.size() == numReads); + assert(mappedData.revStrand_.size() == numReads); + assert(mappedData.nM_.size() == numReads); + assert(mappedData.nMM_.size() == numReads); + assert(mappedData.mapQV_.size() == numReads); +} + +void PbiIndexIO::LoadReferenceData(PbiRawReferenceData& referenceData, + BGZF* fp) +{ + assert(sizeof(PbiReferenceEntry::ID) == 4); + assert(sizeof(PbiReferenceEntry::Row) == 4); + + // num refs + uint32_t numRefs; + bgzf_read(fp, &numRefs, 4); + if (fp->is_be) + numRefs = ed_swap_4(numRefs); + + // reference entries + referenceData.entries_.clear(); + referenceData.entries_.resize(numRefs); + for (size_t i = 0; i < numRefs; ++i) { + PbiReferenceEntry& entry = referenceData.entries_[i]; + bgzf_read(fp, &entry.tId_, 4); + bgzf_read(fp, &entry.beginRow_, 4); + bgzf_read(fp, &entry.endRow_, 4); + if (fp->is_be) { + entry.tId_ = ed_swap_4(entry.tId_); + entry.beginRow_ = ed_swap_4(entry.beginRow_); + entry.endRow_ = ed_swap_4(entry.endRow_); + } + } +} + +void PbiIndexIO::LoadBasicData(PbiRawBasicData& basicData, + const uint32_t numReads, + BGZF* fp) +{ + assert(numReads > 0); + (void)numReads; // quash warnings building in release mode + + LoadBgzfVector(fp, basicData.rgId_, numReads); + LoadBgzfVector(fp, basicData.qStart_, numReads); + LoadBgzfVector(fp, basicData.qEnd_, numReads); + LoadBgzfVector(fp, basicData.holeNumber_, numReads); + LoadBgzfVector(fp, 
basicData.readQual_, numReads); + LoadBgzfVector(fp, basicData.ctxtFlag_, numReads); + LoadBgzfVector(fp, basicData.fileOffset_, numReads); + + assert(basicData.rgId_.size() == numReads); + assert(basicData.qStart_.size() == numReads); + assert(basicData.qEnd_.size() == numReads); + assert(basicData.holeNumber_.size() == numReads); + assert(basicData.readQual_.size() == numReads); + assert(basicData.ctxtFlag_.size() == numReads); + assert(basicData.fileOffset_.size() == numReads); +} + +void PbiIndexIO::Save(const PbiRawData& index, + const std::string& filename) +{ + std::unique_ptr bgzf(bgzf_open(filename.c_str(), "wb")); + BGZF* fp = bgzf.get(); + if (fp == 0) + throw std::runtime_error("could not open PBI file for writing"); + + WriteHeader(index, fp); + const uint32_t numReads = index.NumReads(); + if (numReads > 0) { + WriteBasicData(index.BasicData(), numReads, fp); + + if (index.HasMappedData()) + WriteMappedData(index.MappedData(), numReads, fp); + if (index.HasReferenceData()) + WriteReferenceData(index.ReferenceData(), fp); + if (index.HasBarcodeData()) + WriteBarcodeData(index.BarcodeData(), numReads, fp); + } +} + +void PbiIndexIO::WriteBarcodeData(const PbiRawBarcodeData& barcodeData, + const uint32_t numReads, + BGZF* fp) +{ + assert(numReads > 0); + assert(barcodeData.bcForward_.size() == numReads); + assert(barcodeData.bcReverse_.size() == numReads); + assert(barcodeData.bcQual_.size() == numReads); + (void)numReads; // quash warnings building in release mode + + WriteBgzfVector(fp, barcodeData.bcForward_); + WriteBgzfVector(fp, barcodeData.bcReverse_); + WriteBgzfVector(fp, barcodeData.bcQual_); +} + +void PbiIndexIO::WriteHeader(const PbiRawData& index, + BGZF* fp) +{ + // 'magic' string + char magic[4]; + strncpy(magic, "PBI\1", 4); + bgzf_write(fp, magic, 4); + + // version, pbi_flags, & n_reads + uint32_t version = static_cast(index.Version()); + uint16_t pbi_flags = static_cast(index.FileSections()); + uint32_t numReads = index.NumReads(); + 
if (fp->is_be) { + version = ed_swap_4(version); + pbi_flags = ed_swap_2(pbi_flags); + numReads = ed_swap_4(numReads); + } + bgzf_write(fp, &version, 4); + bgzf_write(fp, &pbi_flags, 2); + bgzf_write(fp, &numReads, 4); + + // reserved space + char reserved[18]; + memset(reserved, 0, 18); + bgzf_write(fp, reserved, 18); +} + +void PbiIndexIO::WriteMappedData(const PbiRawMappedData& mappedData, + const uint32_t numReads, + BGZF* fp) +{ + assert(mappedData.tId_.size() == numReads); + assert(mappedData.tStart_.size() == numReads); + assert(mappedData.tEnd_.size() == numReads); + assert(mappedData.aStart_.size() == numReads); + assert(mappedData.aEnd_.size() == numReads); + assert(mappedData.revStrand_.size() == numReads); + assert(mappedData.nM_.size() == numReads); + assert(mappedData.nMM_.size() == numReads); + assert(mappedData.mapQV_.size() == numReads); + (void)numReads; // quash warnings building in release mode + + WriteBgzfVector(fp, mappedData.tId_); + WriteBgzfVector(fp, mappedData.tStart_); + WriteBgzfVector(fp, mappedData.tEnd_); + WriteBgzfVector(fp, mappedData.aStart_); + WriteBgzfVector(fp, mappedData.aEnd_); + WriteBgzfVector(fp, mappedData.revStrand_); + WriteBgzfVector(fp, mappedData.nM_); + WriteBgzfVector(fp, mappedData.nMM_); + WriteBgzfVector(fp, mappedData.mapQV_); +} + +void PbiIndexIO::WriteReferenceData(const PbiRawReferenceData& referenceData, + BGZF* fp) +{ + // num_refs + uint32_t numRefs = referenceData.entries_.size(); + if (fp->is_be) + numRefs = ed_swap_4(numRefs); + bgzf_write(fp, &numRefs, 4); + + // reference entries + numRefs = referenceData.entries_.size(); // need to reset after maybe endian-swapping + for (size_t i = 0; i < numRefs; ++i) { + const PbiReferenceEntry& entry = referenceData.entries_[i]; + uint32_t tId = entry.tId_; + uint32_t beginRow = entry.beginRow_; + uint32_t endRow = entry.endRow_; + if (fp->is_be) { + tId = ed_swap_4(tId); + beginRow = ed_swap_4(beginRow); + endRow = ed_swap_4(endRow); + } + bgzf_write(fp, 
&tId, 4); + bgzf_write(fp, &beginRow, 4); + bgzf_write(fp, &endRow, 4); + } +} + +void PbiIndexIO::WriteBasicData(const PbiRawBasicData& basicData, + const uint32_t numReads, + BGZF* fp) +{ + assert(basicData.rgId_.size() == numReads); + assert(basicData.qStart_.size() == numReads); + assert(basicData.qEnd_.size() == numReads); + assert(basicData.holeNumber_.size() == numReads); + assert(basicData.readQual_.size() == numReads); + assert(basicData.ctxtFlag_.size() == numReads); + assert(basicData.fileOffset_.size() == numReads); + (void)numReads; // quash warnings building in release mode + + WriteBgzfVector(fp, basicData.rgId_); + WriteBgzfVector(fp, basicData.qStart_); + WriteBgzfVector(fp, basicData.qEnd_); + WriteBgzfVector(fp, basicData.holeNumber_); + WriteBgzfVector(fp, basicData.readQual_); + WriteBgzfVector(fp, basicData.ctxtFlag_); + WriteBgzfVector(fp, basicData.fileOffset_); +} diff --git a/src/PbiIndexIO.h b/src/PbiIndexIO.h new file mode 100644 index 0000000..927173c --- /dev/null +++ b/src/PbiIndexIO.h @@ -0,0 +1,165 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. 
+// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// Author: Derek Barnett + +#ifndef PBIINDEXIO_H +#define PBIINDEXIO_H + +#include "pbbam/BamFile.h" +#include "pbbam/DataSet.h" +#include "pbbam/PbiFile.h" +#include "pbbam/PbiRawData.h" +#include +#include +#include +#include +#include +#include + +namespace PacBio { +namespace BAM { +namespace internal { + +class PbiIndexIO +{ +public: + // top-level entry points + static PbiRawData Load(const std::string& filename); + static void Load(PbiRawData& rawData, const std::string& filename); + static void LoadFromDataSet(PbiRawData& aggregateData, const DataSet& dataset); + static void Save(const PbiRawData& rawData, const std::string& filename); + +public: + // per-component load + static void LoadBarcodeData(PbiRawBarcodeData& barcodeData, + const uint32_t numReads, + BGZF* fp); + static void LoadHeader(PbiRawData& index, + BGZF* fp); + static void LoadMappedData(PbiRawMappedData& mappedData, + const uint32_t numReads, + BGZF* fp); + static void LoadReferenceData(PbiRawReferenceData& referenceData, + BGZF* fp); + static void LoadBasicData(PbiRawBasicData& basicData, + 
const uint32_t numReads, + BGZF* fp); + + // per-data-field load + // (throws std::runtime_error if fewer than numReads elements could be read) + template<typename T> + static void LoadBgzfVector(BGZF* fp, + std::vector<T>& data, + const uint32_t numReads); + +public: + // per-component write + static void WriteBarcodeData(const PbiRawBarcodeData& barcodeData, + const uint32_t numReads, + BGZF* fp); + static void WriteHeader(const PbiRawData& index, + BGZF* fp); + static void WriteMappedData(const PbiRawMappedData& mappedData, + const uint32_t numReads, + BGZF* fp); + static void WriteReferenceData(const PbiRawReferenceData& referenceData, + BGZF* fp); + static void WriteBasicData(const PbiRawBasicData& subreadData, + const uint32_t numReads, + BGZF* fp); + + // per-data-field write + // (throws std::runtime_error if the field could not be written completely) + template<typename T> + static void WriteBgzfVector(BGZF* fp, + const std::vector<T>& data); + +private: + // helper functions + template<typename T> + static void SwapEndianness(std::vector<T>& data); +}; + +/// \brief Loads one PBI data field (numReads elements of type T) from a BGZF stream. +/// +/// \param[in] fp BGZF handle to read from (must not be null) +/// \param[out] data resized to numReads and filled with the values read +/// \param[in] numReads number of elements to read +/// +/// \throws std::runtime_error if fewer bytes than requested could be read +/// +template<typename T> +inline void PbiIndexIO::LoadBgzfVector(BGZF* fp, + std::vector<T>& data, + const uint32_t numReads) +{ + assert(fp); + data.resize(numReads); + + // nothing to fetch - also avoids touching the storage of an empty vector + if (numReads == 0) + return; + + // check the byte count, so a truncated or unreadable file is reported + // instead of silently leaving value-initialized elements behind + const size_t numBytes = static_cast<size_t>(numReads) * sizeof(T); + const auto numBytesRead = bgzf_read(fp, data.data(), numBytes); + if (numBytesRead < 0 || static_cast<size_t>(numBytesRead) != numBytes) + throw std::runtime_error("could not read expected data from PBI file"); + + if (fp->is_be) + SwapEndianness(data); +} + +/// \brief Byte-swaps every element of \p data, dispatching on sizeof(T). +/// +/// \throws std::runtime_error if sizeof(T) is not 1, 2, 4, or 8 +/// +template<typename T> +inline void PbiIndexIO::SwapEndianness(std::vector<T>& data) +{ + const size_t elementSize = sizeof(T); + const size_t numReads = data.size(); + switch (elementSize) { + case 1 : break; // no swapping necessary + case 2 : + for (size_t i = 0; i < numReads; ++i) + ed_swap_2p(&data[i]); + break; + case 4: + for (size_t i = 0; i < numReads; ++i) + ed_swap_4p(&data[i]); + break; + case 8: + for (size_t i = 0; i < numReads; ++i) + ed_swap_8p(&data[i]); + break; + default: + throw std::runtime_error("unsupported element size"); + } +} + +/// \brief Writes one PBI data field (all elements of \p data) to a BGZF stream. +/// +/// \param[in] fp BGZF handle to write to (must not be null) +/// \param[in] data values to write; left unmodified +/// +/// \throws std::runtime_error if fewer bytes than requested could be written +/// +template<typename T> +inline void PbiIndexIO::WriteBgzfVector(BGZF* fp, + const std::vector<T>& data) +{ + assert(fp); + + // nothing to write - also avoids touching the storage of an empty vector + if (data.empty()) + return; + + // a byte-swapped copy is only needed when fp->is_be is set; + // otherwise write straight from the caller's buffer + const T* buffer = data.data(); + std::vector<T> swapped; + if (fp->is_be) { + swapped = data; + SwapEndianness(swapped); + buffer = swapped.data(); + } + + const size_t numBytes = data.size() * sizeof(T); + const auto numBytesWritten = bgzf_write(fp, buffer, numBytes); + if (numBytesWritten < 0 || static_cast<size_t>(numBytesWritten) != numBytes) + throw std::runtime_error("could not write data to PBI file"); +} + +} // namespace internal +} // namespace BAM +} // namespace PacBio + +#endif // PBIINDEXIO_H 
diff --git a/src/PbiIndexedBamReader.cpp b/src/PbiIndexedBamReader.cpp new file mode 100644 index 0000000..e9aeeb7 --- /dev/null +++ b/src/PbiIndexedBamReader.cpp @@ -0,0 +1,189 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. 
+// +// File Description +/// \file PbiIndexedBamReader.cpp +/// \brief Implements the PbiIndexedBamReader class. +// +// Author: Derek Barnett + +#include "pbbam/PbiIndexedBamReader.h" +#include + +#include + + +using namespace PacBio; +using namespace PacBio::BAM; +using namespace PacBio::BAM::internal; +using namespace std; + +namespace PacBio { +namespace BAM { +namespace internal { + +struct PbiIndexedBamReaderPrivate +{ +public: + // loads the raw PBI data from pbiFilename; no blocks are selected until Filter() is called + PbiIndexedBamReaderPrivate(const string& pbiFilename) + : index_(pbiFilename) + , currentBlockReadCount_(0) + { } + + // sets each result block's virtual offset to the stored file offset of the block's first record + void ApplyOffsets(void) + { + const std::vector& fileOffsets = index_.BasicData().fileOffset_; + for (IndexResultBlock& block : blocks_) + block.virtualOffset_ = fileOffsets.at(block.firstIndex_); + } + + // replaces the current filter & rebuilds the list of record blocks to read; + // any progress within a partially-read block is discarded + void Filter(const PbiFilter& filter) + { + // store request & reset counters + filter_ = filter; + currentBlockReadCount_ = 0; + blocks_.clear(); + + // find blocks of reads passing filter criteria + const uint32_t numReads = index_.NumReads(); + if (numReads == 0) { // empty PBI - no reads to use + return; + } else if (filter_.IsEmpty()) { // empty filter - use all reads + blocks_.push_back(IndexResultBlock{0, numReads}); + } else { + IndexList indices; + indices.reserve(numReads); + for (size_t i = 0; i < numReads; ++i) { + if (filter_.Accepts(index_, i)) + indices.push_back(i); + } + blocks_ = mergedIndexBlocks(std::move(indices)); + } + + // apply offsets + ApplyOffsets(); + } + + // reads the next record of the front block into b. returns -1 once no blocks remain, + // otherwise the bam_read1 result. throws std::runtime_error if seeking to a new block fails + int ReadRawData(BGZF* bgzf, bam1_t* b) + { + // no data to fetch, report end-of-data + if (blocks_.empty()) + return -1; // "EOF" + + // if on new block, seek to its first record + if (currentBlockReadCount_ == 0) { + auto seekResult = bgzf_seek(bgzf, blocks_.at(0).virtualOffset_, SEEK_SET); + if (seekResult == -1) + throw std::runtime_error("could not seek in BAM file"); + } + + // read next record + auto result = bam_read1(bgzf, b); + + // update counters. 
if block finished, pop & reset + // (note: the counter advances regardless of the bam_read1 result) + ++currentBlockReadCount_; + if (currentBlockReadCount_ == blocks_.at(0).numReads_) { + blocks_.pop_front(); + currentBlockReadCount_ = 0; + } + + return result; + } + +public: + PbiFilter filter_; + PbiRawData index_; + IndexResultBlocks blocks_; + size_t currentBlockReadCount_; +}; + +} // namespace internal +} // namespace BAM +} // namespace PacBio + +// filter-taking constructors delegate to the unfiltered ones, then apply the filter +PbiIndexedBamReader::PbiIndexedBamReader(const PbiFilter& filter, + const std::string& filename) + : PbiIndexedBamReader(filter, BamFile(filename)) +{ } + +PbiIndexedBamReader::PbiIndexedBamReader(const PbiFilter& filter, + const BamFile& bamFile) + : PbiIndexedBamReader(bamFile) +{ + Filter(filter); +} + +PbiIndexedBamReader::PbiIndexedBamReader(const PbiFilter& filter, + BamFile&& bamFile) + : PbiIndexedBamReader(std::move(bamFile)) +{ + Filter(filter); +} + +PbiIndexedBamReader::PbiIndexedBamReader(const std::string& bamFilename) + : PbiIndexedBamReader(BamFile(bamFilename)) +{ } + +// the PBI to load is the one named by File().PacBioIndexFilename() +PbiIndexedBamReader::PbiIndexedBamReader(const BamFile& bamFile) + : BamReader(bamFile) + , d_(new internal::PbiIndexedBamReaderPrivate(File().PacBioIndexFilename())) +{ } + +PbiIndexedBamReader::PbiIndexedBamReader(BamFile&& bamFile) + : BamReader(std::move(bamFile)) + , d_(new internal::PbiIndexedBamReaderPrivate(File().PacBioIndexFilename())) +{ } + +PbiIndexedBamReader::~PbiIndexedBamReader(void) { } + +// forwards to the private implementation's block-aware read +int PbiIndexedBamReader::ReadRawData(BGZF* bgzf, bam1_t* b) +{ + assert(d_); + return d_->ReadRawData(bgzf, b); +} + +// returns the filter most recently applied +const PbiFilter& PbiIndexedBamReader::Filter(void) const +{ + assert(d_); + return d_->filter_; +} + +// applies a new filter & returns *this, so calls can be chained +PbiIndexedBamReader& PbiIndexedBamReader::Filter(const PbiFilter& filter) +{ + assert(d_); + d_->Filter(filter); + return *this; +} + diff --git a/src/PbiRawData.cpp b/src/PbiRawData.cpp new file mode 100644 index 0000000..a0e9d1d --- /dev/null +++ b/src/PbiRawData.cpp @@ -0,0 +1,508 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file PbiRawData.cpp +/// \brief Implements the classes used for working with raw PBI data. 
+// +// Author: Derek Barnett + +#include "pbbam/PbiRawData.h" +#include "pbbam/BamFile.h" +#include "pbbam/BamRecord.h" +#include "PbiIndexIO.h" +#include +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace std; + +namespace PacBio { +namespace BAM { +namespace internal { + +static +string ToString(const RecordType type) +{ + static const auto lookup = map + { + { RecordType::ZMW, "ZMW" }, + { RecordType::HQREGION, "HQREGION" }, + { RecordType::SUBREAD, "SUBREAD" }, + { RecordType::CCS, "CCS" }, + { RecordType::SCRAP, "SCRAP" }, + { RecordType::UNKNOWN, "UNKNOWN" } + }; + + try { + return lookup.at(type); + } catch (std::exception&) { + throw std::runtime_error("error: unknown RecordType encountered"); + } +} + +} // namespace internal +} // namespace BAM +} // namespace PacBio + +// ---------------------------------- +// PbiRawBarcodeData implementation +// ---------------------------------- + +PbiRawBarcodeData::PbiRawBarcodeData(void) { } + +PbiRawBarcodeData::PbiRawBarcodeData(uint32_t numReads) +{ + bcForward_.reserve(numReads); + bcReverse_.reserve(numReads); + bcQual_.reserve(numReads); +} + +PbiRawBarcodeData::PbiRawBarcodeData(const PbiRawBarcodeData& other) + : bcForward_(other.bcForward_) + , bcReverse_(other.bcReverse_) + , bcQual_(other.bcQual_) +{ } + +PbiRawBarcodeData::PbiRawBarcodeData(PbiRawBarcodeData&& other) + : bcForward_(std::move(other.bcForward_)) + , bcReverse_(std::move(other.bcReverse_)) + , bcQual_(std::move(other.bcQual_)) +{ } + +PbiRawBarcodeData& PbiRawBarcodeData::operator=(const PbiRawBarcodeData& other) +{ + bcForward_ = other.bcForward_; + bcReverse_ = other.bcReverse_; + bcQual_ = other.bcQual_; + return *this; +} + +PbiRawBarcodeData& PbiRawBarcodeData::operator=(PbiRawBarcodeData&& other) +{ + bcForward_ = std::move(other.bcForward_); + bcReverse_ = std::move(other.bcReverse_); + bcQual_ = std::move(other.bcQual_); + return *this; +} + +void PbiRawBarcodeData::AddRecord(const 
BamRecord& b) +{ + // check for any barcode data (both required) + if (b.HasBarcodes() && b.HasBarcodeQuality()) { + + // fetch data from record + const auto barcodes = b.Barcodes(); + const auto barcodeQuality = b.BarcodeQuality(); + const auto bcForward = barcodes.first; + const auto bcReverse = barcodes.second; + const auto bcQuality = boost::numeric_cast(barcodeQuality); + + // only store actual data if all values >= 0 + if (bcForward >= 0 && bcReverse >=0 && bcQuality >= 0) { + bcForward_.push_back(bcForward); + bcReverse_.push_back(bcReverse); + bcQual_.push_back(bcQuality); + return; + } + } + + // if we get here, at least one value is either missing or is -1 + bcForward_.push_back(-1); + bcReverse_.push_back(-1); + bcQual_.push_back(-1); +} + +// ---------------------------------- +// PbiRawMappedData implementation +// ---------------------------------- + +PbiRawMappedData::PbiRawMappedData(void) { } + +PbiRawMappedData::PbiRawMappedData(uint32_t numReads) +{ + tId_.reserve(numReads); + tStart_.reserve(numReads); + tEnd_.reserve(numReads); + aStart_.reserve(numReads); + aEnd_.reserve(numReads); + revStrand_.reserve(numReads); + nM_.reserve(numReads); + nMM_.reserve(numReads); + mapQV_.reserve(numReads); +} + +PbiRawMappedData::PbiRawMappedData(const PbiRawMappedData& other) + : tId_(other.tId_) + , tStart_(other.tStart_) + , tEnd_(other.tEnd_) + , aStart_(other.aStart_) + , aEnd_(other.aEnd_) + , revStrand_(other.revStrand_) + , nM_(other.nM_) + , nMM_(other.nMM_) + , mapQV_(other.mapQV_) +{ } + +PbiRawMappedData::PbiRawMappedData(PbiRawMappedData&& other) + : tId_(std::move(other.tId_)) + , tStart_(std::move(other.tStart_)) + , tEnd_(std::move(other.tEnd_)) + , aStart_(std::move(other.aStart_)) + , aEnd_(std::move(other.aEnd_)) + , revStrand_(std::move(other.revStrand_)) + , nM_(std::move(other.nM_)) + , nMM_(std::move(other.nMM_)) + , mapQV_(std::move(other.mapQV_)) +{ } + +PbiRawMappedData& PbiRawMappedData::operator=(const PbiRawMappedData& other) +{ + 
tId_ = other.tId_; + tStart_ = other.tStart_; + tEnd_ = other.tEnd_; + aStart_ = other.aStart_; + aEnd_ = other.aEnd_; + revStrand_ = other.revStrand_; + nM_ = other.nM_; + nMM_ = other.nMM_; + mapQV_ = other.mapQV_; + return *this; +} + +PbiRawMappedData& PbiRawMappedData::operator=(PbiRawMappedData&& other) +{ + tId_ = std::move(other.tId_); + tStart_ = std::move(other.tStart_); + tEnd_ = std::move(other.tEnd_); + aStart_ = std::move(other.aStart_); + aEnd_ = std::move(other.aEnd_); + revStrand_ = std::move(other.revStrand_); + nM_ = std::move(other.nM_); + nMM_ = std::move(other.nMM_); + mapQV_ = std::move(other.mapQV_); + return *this; +} + +void PbiRawMappedData::AddRecord(const BamRecord& b) +{ + tId_.push_back(b.ReferenceId()); + tStart_.push_back(b.ReferenceStart()); + tEnd_.push_back(b.ReferenceEnd()); + aStart_.push_back(b.AlignedStart()); + aEnd_.push_back(b.AlignedEnd()); + revStrand_.push_back( (b.AlignedStrand() == Strand::REVERSE ? 1 : 0) ); + mapQV_.push_back(b.MapQuality()); + + const auto matchesAndMismatches = b.NumMatchesAndMismatches(); + nM_.push_back(matchesAndMismatches.first); + nMM_.push_back(matchesAndMismatches.second); +} + +uint32_t PbiRawMappedData::NumDeletedBasesAt(size_t recordIndex) const +{ return NumDeletedAndInsertedBasesAt(recordIndex).first; } + +std::pair PbiRawMappedData::NumDeletedAndInsertedBasesAt(size_t recordIndex) const +{ + const auto aStart = aStart_.at(recordIndex); + const auto aEnd = aEnd_.at(recordIndex); + const auto tStart = tStart_.at(recordIndex); + const auto tEnd = tEnd_.at(recordIndex); + const auto nM = nM_.at(recordIndex); + const auto nMM = nMM_.at(recordIndex); + const auto numIns = (aEnd - aStart - nM - nMM); + const auto numDel = (tEnd - tStart - nM - nMM); + return std::make_pair(numDel, numIns); +} + +uint32_t PbiRawMappedData::NumInsertedBasesAt(size_t recordIndex) const +{ return NumDeletedAndInsertedBasesAt(recordIndex).second; } + +// ------------------------------------ +// PbiReferenceEntry 
implementation +// ------------------------------------ + +const PbiReferenceEntry::ID PbiReferenceEntry::UNMAPPED_ID = static_cast(-1); +const PbiReferenceEntry::Row PbiReferenceEntry::UNSET_ROW = static_cast(-1); + +PbiReferenceEntry::PbiReferenceEntry(void) + : tId_(UNMAPPED_ID) + , beginRow_(UNSET_ROW) + , endRow_(UNSET_ROW) +{ } + +PbiReferenceEntry::PbiReferenceEntry(ID id) + : tId_(id) + , beginRow_(UNSET_ROW) + , endRow_(UNSET_ROW) +{ } + +PbiReferenceEntry::PbiReferenceEntry(ID id, Row beginRow, Row endRow) + : tId_(id) + , beginRow_(beginRow) + , endRow_(endRow) +{ } + +PbiReferenceEntry::PbiReferenceEntry(const PbiReferenceEntry& other) + : tId_(other.tId_) + , beginRow_(other.beginRow_) + , endRow_(other.endRow_) +{ } + +PbiReferenceEntry::PbiReferenceEntry(PbiReferenceEntry&& other) + : tId_(std::move(other.tId_)) + , beginRow_(std::move(other.beginRow_)) + , endRow_(std::move(other.endRow_)) +{ } + +PbiReferenceEntry& PbiReferenceEntry::operator=(const PbiReferenceEntry& other) +{ + tId_ = other.tId_; + beginRow_ = other.beginRow_; + endRow_ = other.endRow_; + return *this; +} + +PbiReferenceEntry& PbiReferenceEntry::operator=(PbiReferenceEntry&& other) +{ + tId_ = std::move(other.tId_); + beginRow_ = std::move(other.beginRow_); + endRow_ = std::move(other.endRow_); + return *this; +} + +// ------------------------------------ +// PbiRawReferenceData implementation +// ------------------------------------ + +PbiRawReferenceData::PbiRawReferenceData(void) { } + +PbiRawReferenceData::PbiRawReferenceData(uint32_t numRefs) +{ entries_.reserve(numRefs); } + +PbiRawReferenceData::PbiRawReferenceData(const PbiRawReferenceData& other) + : entries_(other.entries_) +{ } + +PbiRawReferenceData::PbiRawReferenceData(PbiRawReferenceData&& other) + : entries_(std::move(other.entries_)) +{ } + +PbiRawReferenceData& PbiRawReferenceData::operator=(const PbiRawReferenceData& other) +{ + entries_ = other.entries_; + return *this; +} + +PbiRawReferenceData& 
PbiRawReferenceData::operator=(PbiRawReferenceData&& other) +{ + entries_ = std::move(other.entries_); + return *this; +} + +// ---------------------------------- +// PbiRawBasicData implementation +// ---------------------------------- + +PbiRawBasicData::PbiRawBasicData(void) { } + +PbiRawBasicData::PbiRawBasicData(uint32_t numReads) +{ + rgId_.reserve(numReads); + qStart_.reserve(numReads); + qEnd_.reserve(numReads); + holeNumber_.reserve(numReads); + readQual_.reserve(numReads); + ctxtFlag_.reserve(numReads); + fileOffset_.reserve(numReads); + fileNumber_.reserve(numReads); +} + +PbiRawBasicData::PbiRawBasicData(const PbiRawBasicData& other) + : rgId_(other.rgId_) + , qStart_(other.qStart_) + , qEnd_(other.qEnd_) + , holeNumber_(other.holeNumber_) + , readQual_(other.readQual_) + , ctxtFlag_(other.ctxtFlag_) + , fileOffset_(other.fileOffset_) + , fileNumber_(other.fileNumber_) +{ } + +PbiRawBasicData::PbiRawBasicData(PbiRawBasicData&& other) + : rgId_(std::move(other.rgId_)) + , qStart_(std::move(other.qStart_)) + , qEnd_(std::move(other.qEnd_)) + , holeNumber_(std::move(other.holeNumber_)) + , readQual_(std::move(other.readQual_)) + , ctxtFlag_(std::move(other.ctxtFlag_)) + , fileOffset_(std::move(other.fileOffset_)) + , fileNumber_(std::move(other.fileNumber_)) +{ } + +PbiRawBasicData& PbiRawBasicData::operator=(const PbiRawBasicData& other) +{ + rgId_ = other.rgId_; + qStart_ = other.qStart_; + qEnd_ = other.qEnd_; + holeNumber_ = other.holeNumber_; + readQual_ = other.readQual_; + ctxtFlag_ = other.ctxtFlag_; + fileOffset_ = other.fileOffset_; + fileNumber_ = other.fileNumber_; + return *this; +} + +PbiRawBasicData& PbiRawBasicData::operator=(PbiRawBasicData&& other) +{ + rgId_ = std::move(other.rgId_); + qStart_ = std::move(other.qStart_); + qEnd_ = std::move(other.qEnd_); + holeNumber_ = std::move(other.holeNumber_); + readQual_ = std::move(other.readQual_); + ctxtFlag_ = std::move(other.ctxtFlag_); + fileOffset_ = std::move(other.fileOffset_); + 
fileNumber_ = std::move(other.fileNumber_); + return *this; +} + +void PbiRawBasicData::AddRecord(const BamRecord& b, int64_t offset) +{ + // read group ID + auto rgId = b.ReadGroupId(); + if (rgId.empty()) + rgId = MakeReadGroupId(b.MovieName(), internal::ToString(b.Type())); + const uint32_t rawid = std::stoul(rgId, nullptr, 16); + const int32_t id = static_cast(rawid); + rgId_.push_back(id); + + // query start/end + if (b.Type() == RecordType::CCS) { + qStart_.push_back(-1); + qEnd_.push_back(-1); + } else { + qStart_.push_back(b.QueryStart()); + qEnd_.push_back(b.QueryEnd()); + } + + // add'l basic data + holeNumber_.push_back(b.HasHoleNumber() ? b.HoleNumber() : 0); + readQual_.push_back(b.HasReadAccuracy() ? static_cast(b.ReadAccuracy()) : 0.0f); + ctxtFlag_.push_back(b.HasLocalContextFlags() ? b.LocalContextFlags() : LocalContextFlags::NO_LOCAL_CONTEXT); + + // virtual offset of record start + fileOffset_.push_back(offset); + + // default file number + fileNumber_.push_back(0); +} + +// ---------------------------------- +// PbiRawData implementation +// ---------------------------------- + +PbiRawData::PbiRawData(void) + : version_(PbiFile::CurrentVersion) + , sections_(PbiFile::ALL) + , numReads_(0) +{ } + +PbiRawData::PbiRawData(const string& pbiFilename) + : filename_(pbiFilename) + , version_(PbiFile::CurrentVersion) + , sections_(PbiFile::ALL) + , numReads_(0) +{ + internal::PbiIndexIO::Load(*this, pbiFilename); +} + +PbiRawData::PbiRawData(const DataSet& dataset) + : version_(PbiFile::CurrentVersion) + , sections_(PbiFile::BASIC | PbiFile::MAPPED | PbiFile::BARCODE) + , numReads_(0) +{ + internal::PbiIndexIO::LoadFromDataSet(*this, dataset); +} + +PbiRawData::PbiRawData(const PbiRawData& other) + : filename_(other.filename_) + , version_(other.version_) + , sections_(other.sections_) + , numReads_(other.numReads_) + , barcodeData_(other.barcodeData_) + , mappedData_(other.mappedData_) + , referenceData_(other.referenceData_) + , 
basicData_(other.basicData_) +{ } + +PbiRawData::PbiRawData(PbiRawData&& other) + : filename_(std::move(other.filename_)) + , version_(std::move(other.version_)) + , sections_(std::move(other.sections_)) + , numReads_(std::move(other.numReads_)) + , barcodeData_(std::move(other.barcodeData_)) + , mappedData_(std::move(other.mappedData_)) + , referenceData_(std::move(other.referenceData_)) + , basicData_(std::move(other.basicData_)) +{ } + +PbiRawData& PbiRawData::operator=(const PbiRawData& other) +{ + filename_ = other.filename_; + version_ = other.version_; + sections_ = other.sections_; + numReads_ = other.numReads_; + barcodeData_ = other.barcodeData_; + mappedData_ = other.mappedData_; + referenceData_ = other.referenceData_; + basicData_ = other.basicData_; + return *this; +} + +PbiRawData& PbiRawData::operator=(PbiRawData&& other) +{ + filename_ = std::move(other.filename_); + version_ = std::move(other.version_); + sections_ = std::move(other.sections_); + numReads_ = std::move(other.numReads_); + barcodeData_ = std::move(other.barcodeData_); + mappedData_ = std::move(other.mappedData_); + referenceData_ = std::move(other.referenceData_); + basicData_ = std::move(other.basicData_); + return *this; +} + +PbiRawData::~PbiRawData(void) { } diff --git a/src/ProgramInfo.cpp b/src/ProgramInfo.cpp new file mode 100644 index 0000000..75f193a --- /dev/null +++ b/src/ProgramInfo.cpp @@ -0,0 +1,165 @@ + +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file ProgramInfo.cpp +/// \brief Implements the ProgramInfo class. 
+// +// Author: Derek Barnett + +#include "pbbam/ProgramInfo.h" +#include "SequenceUtils.h" +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace std; + +namespace PacBio { +namespace BAM { +namespace internal { + +static string token_ID = string("ID"); +static string token_CL = string("CL"); +static string token_DS = string("DS"); +static string token_PN = string("PN"); +static string token_PP = string("PP"); +static string token_VN = string("VN"); + +} // namespace internal +} // namespace BAM +} // namespace PacBio + +ProgramInfo::ProgramInfo(void) { } + +ProgramInfo::ProgramInfo(const std::string& id) + : id_(id) +{ } + +// NOTE: custom_ must be carried along by every copy/move operation below, +// otherwise custom tags parsed by FromSam() are lost before ToSam() emits them +ProgramInfo::ProgramInfo(const ProgramInfo& other) + : commandLine_(other.commandLine_) + , description_(other.description_) + , id_(other.id_) + , name_(other.name_) + , previousProgramId_(other.previousProgramId_) + , version_(other.version_) + , custom_(other.custom_) +{ } + +ProgramInfo::ProgramInfo(ProgramInfo&& other) + : commandLine_(std::move(other.commandLine_)) + , description_(std::move(other.description_)) + , id_(std::move(other.id_)) + , name_(std::move(other.name_)) + , previousProgramId_(std::move(other.previousProgramId_)) + , version_(std::move(other.version_)) + , custom_(std::move(other.custom_)) +{ } + +ProgramInfo::~ProgramInfo(void) { } + +ProgramInfo& ProgramInfo::operator=(const ProgramInfo& other) +{ + commandLine_ = other.commandLine_; + description_ = other.description_; + id_ = other.id_; + name_ = other.name_; + previousProgramId_ = other.previousProgramId_; + version_ = other.version_; + custom_ = other.custom_; + return *this; +} + +ProgramInfo& ProgramInfo::operator=(ProgramInfo&& other) +{ + commandLine_ = std::move(other.commandLine_); + description_ = std::move(other.description_); + id_ = std::move(other.id_); + name_ = std::move(other.name_); + previousProgramId_ = std::move(other.previousProgramId_); + version_ = std::move(other.version_); + custom_ = std::move(other.custom_); + return *this; +} + +/// \brief Builds a ProgramInfo from the text of a SAM '@PG' header line. +/// +/// \param[in] sam full header line, including the leading '@PG\t' +/// \returns the parsed ProgramInfo; a default-constructed one if \p sam holds no tags +/// +/// Tokens are expected as 'XX:value'. Tokens too short to hold a tag and +/// its separator are skipped. Unrecognized tags are stored as custom tags. +/// +ProgramInfo ProgramInfo::FromSam(const string& sam) +{ + // too short to contain anything after '@PG\t' (substr below would throw) + if (sam.size() < 4) + return ProgramInfo(); + + // pop off '@PG\t', then split rest of line into 
tokens + const vector<string>& tokens = internal::Split(sam.substr(4), '\t'); + if (tokens.empty()) + return ProgramInfo(); + + ProgramInfo prog; + map<string, string> custom; + + // iterate over tokens + for (const string& token : tokens) { + + // skip tokens too short for 'XX:' (substr(3) below would throw) + if (token.size() < 3) + continue; + + const string& tokenTag = token.substr(0,2); + const string& tokenValue = token.substr(3); + + // set program contents + if (tokenTag == internal::token_ID) prog.Id(tokenValue); + else if (tokenTag == internal::token_CL) prog.CommandLine(tokenValue); + else if (tokenTag == internal::token_DS) prog.Description(tokenValue); + else if (tokenTag == internal::token_PN) prog.Name(tokenValue); + else if (tokenTag == internal::token_PP) prog.PreviousProgramId(tokenValue); + else if (tokenTag == internal::token_VN) prog.Version(tokenValue); + + // otherwise, "custom" tag + else + custom[tokenTag] = tokenValue; + } + + prog.CustomTags(custom); + return prog; +} + +/// \brief Formats this program entry as the text of a SAM '@PG' header line. +/// +/// The ID tag is always written; the other standard tags are written only +/// when non-empty, followed by any custom tags. +/// +string ProgramInfo::ToSam(void) const +{ + stringstream out; + out << "@PG" + << internal::MakeSamTag(internal::token_ID, id_); + + if (!name_.empty()) out << internal::MakeSamTag(internal::token_PN, name_); + if (!version_.empty()) out << internal::MakeSamTag(internal::token_VN, version_); + if (!description_.empty()) out << internal::MakeSamTag(internal::token_DS, description_); + if (!previousProgramId_.empty()) out << internal::MakeSamTag(internal::token_PP, previousProgramId_); + if (!commandLine_.empty()) out << internal::MakeSamTag(internal::token_CL, commandLine_); + + // append any custom tags + map<string, string>::const_iterator customIter = custom_.cbegin(); + map<string, string>::const_iterator customEnd = custom_.cend(); + for ( ; customIter != customEnd; ++customIter ) + out << internal::MakeSamTag(customIter->first, customIter->second); + + return out.str(); +} + diff --git a/src/Pulse2BaseCache.h b/src/Pulse2BaseCache.h new file mode 100644 index 0000000..ae5bb1c --- /dev/null +++ b/src/Pulse2BaseCache.h @@ -0,0 +1,154 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. 
+// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. 
+//
+// Author: Derek Barnett
+
+#ifndef PULSE2BASECACHE_H
+#define PULSE2BASECACHE_H
+
+#include "pbbam/Config.h"
+#include <boost/dynamic_bitset.hpp>
+#include <cassert>
+#include <cctype>
+#include <string>
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+/// \brief Bit mask over a record's pulses: one bit per pulse, set when the
+///        pulse was basecalled and clear when it was squashed.
+///
+class Pulse2BaseCache
+{
+public:
+    /// \brief Creates a Pulse2BaseCache from pulseCall data ('pc' tag)
+    ///
+    /// Computes & stores cache of basecalled vs. squashed pulse positions for
+    /// later masking of pulse data.
+    ///
+    /// \param pulseCalls[in]   string contents of 'pc' tag
+    ///
+    Pulse2BaseCache(const std::string& pulseCalls)
+        : data_(pulseCalls.size())
+    {
+        // basecalled pulse -> data[i] == 1  (upper-case character)
+        // squashed pulse   -> data[i] == 0  (anything else)
+        //
+        const size_t numPulses = pulseCalls.size();
+        for (size_t i = 0; i < numPulses; ++i) {
+            // std::isupper is only defined for values representable as
+            // unsigned char (or EOF), so a plain, possibly negative, char
+            // must be converted first.
+            const auto pulse = static_cast<unsigned char>(pulseCalls[i]);
+            data_[i] = (std::isupper(pulse) != 0);
+        }
+    }
+
+    Pulse2BaseCache(void) = delete;
+    Pulse2BaseCache(const Pulse2BaseCache& other) = default;
+    Pulse2BaseCache(Pulse2BaseCache&& other) = default;
+    Pulse2BaseCache& operator=(const Pulse2BaseCache&) = default;
+    Pulse2BaseCache& operator=(Pulse2BaseCache&&) = default;
+    ~Pulse2BaseCache(void) noexcept {}
+
+public:
+
+    /// \brief Finds the first basecalled pulse.
+    ///
+    /// \returns the result of boost::dynamic_bitset<>::find_first() on the
+    ///          cached mask (index of the lowest set bit, or the bitset's
+    ///          npos when no bit is set)
+    ///
+    size_t FindFirst(void) const
+    { return data_.find_first(); }
+
+    /// \brief Finds the next basecalled pulse after a given position.
+    ///
+    /// \param[in] from  pulse index to search after
+    /// \returns the result of boost::dynamic_bitset<>::find_next(\p from)
+    ///
+    size_t FindNext(size_t from) const
+    { return data_.find_next(from); }
+
+    /// \brief Tests whether a pulse was basecalled.
+    ///
+    /// \param[in] pos  pulse index; not range-checked here
+    /// \returns true if the pulse at \p pos is a basecall
+    ///
+    bool IsBasecallAt(const size_t pos) const
+    { return data_[pos]; }
+
+    /// \returns the total number of pulses (basecalled & squashed)
+    ///
+    size_t NumPulses(void) const
+    {
+        return data_.size();
+    }
+
+    /// \returns the total number of basecalled pulses
+    ///
+    size_t NumBases(void) const
+    {
+        return data_.count();
+    }
+
+    /// \brief Removes squashed pulse positions from input data.
+    ///
+    /// \param[in] pulseData  contents of any per-pulse tag; must hold exactly
+    ///                       NumPulses() elements (checked by assert only)
+    /// \returns Input \p pulseData less all squashed pulses
+    ///
+    template <typename T>
+    T RemoveSquashedPulses(const T& pulseData) const
+    {
+        const auto numPulses = pulseData.size();
+        assert(numPulses == data_.size());
+
+        // The reserve() below overshoots the required space, but numPulses is cheap
+        // to compute, and by definition will be sufficient to hold the result. Thus
+        // we only ever need to do one allocation.
+        //
+        T result;
+        result.reserve(numPulses);
+
+        // Only include data at positions that match our cached pulse data.
+        //
+        for (size_t i = 0; i < numPulses; ++i) {
+            if (data_[i])
+                result.push_back(pulseData.at(i));
+        }
+        return result;
+    }
+
+private:
+    boost::dynamic_bitset<> data_;
+};
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
+
+#endif // PULSE2BASECACHE_H
diff --git a/src/QNameQuery.cpp b/src/QNameQuery.cpp
new file mode 100644
index 0000000..e544664
--- /dev/null
+++ b/src/QNameQuery.cpp
@@ -0,0 +1,104 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE.
THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file QNameQuery.cpp
+/// \brief Implements the QNameQuery class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/QNameQuery.h"
+#include "pbbam/CompositeBamReader.h"
+#include <boost/optional.hpp>
+#include <cassert>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace PacBio::BAM::internal;
+using namespace std;
+
+// Groups consecutive records that share the same FullName().
+//
+// Records are pulled from the reader in order. Detecting the end of a group
+// requires reading the first record of the next group, so that record is
+// parked in nextRecord_ and used to seed the following call.
+struct QNameQuery::QNameQueryPrivate
+{
+public:
+    QNameQueryPrivate(const DataSet& dataset)
+        : reader_(new SequentialCompositeBamReader(dataset))
+        , nextRecord_(boost::none)
+    { }
+
+    // Replaces the contents of 'records' with the next group of same-named
+    // records. Returns false only when no records remain.
+    bool GetNext(vector<BamRecord>& records)
+    {
+        records.clear();
+
+        string groupRecordName;
+
+        // start the group with the look-ahead record left by the previous call
+        if (nextRecord_.is_initialized()) {
+            BamRecord r = nextRecord_.get();
+            groupRecordName = r.FullName();
+            records.push_back(std::move(r));
+            nextRecord_ = boost::none;
+        }
+
+        BamRecord record;
+        while (reader_->GetNext(record)) {
+            if (records.empty()) {
+                // first record overall: it defines the group name
+                groupRecordName = record.FullName();
+                records.push_back(record);
+            }
+            else {
+                assert(!records.empty());
+                if (record.FullName() == groupRecordName)
+                    records.push_back(record);
+                else {
+                    // belongs to the next group; 'record' is not used again,
+                    // so hand its contents over instead of copying
+                    nextRecord_ = std::move(record);
+                    return true;
+                }
+            }
+        }
+        return !records.empty();
+    }
+
+public:
+    unique_ptr<SequentialCompositeBamReader> reader_;
+    boost::optional<BamRecord> nextRecord_;
+};
+
+QNameQuery::QNameQuery(const DataSet& dataset)
+    : internal::IGroupQuery()
+    , d_(new QNameQueryPrivate(dataset))
+{ }
+
+QNameQuery::~QNameQuery(void) { }
+
+bool QNameQuery::GetNext(vector<BamRecord>& records)
+{ return d_->GetNext(records); }
diff --git a/src/QualityValue.cpp b/src/QualityValue.cpp
new file mode 100644
index 0000000..e9f63c9
--- /dev/null
+++ b/src/QualityValue.cpp
@@ -0,0 +1,47 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED.
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file QualityValue.cpp
+/// \brief Implements the QualityValue class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/QualityValue.h"
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+// Out-of-line definition of the class's static upper bound.
+// NOTE(review): 93 is presumably the largest value that still maps to a
+// printable character under Phred+33 encoding - confirm against the header.
+const uint8_t QualityValue::MAX = 93;
diff --git a/src/ReadAccuracyQuery.cpp b/src/ReadAccuracyQuery.cpp
new file mode 100644
index 0000000..8535189
--- /dev/null
+++ b/src/ReadAccuracyQuery.cpp
@@ -0,0 +1,71 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE.
THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file ReadAccuracyQuery.cpp
+/// \brief Implements the ReadAccuracyQuery class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/ReadAccuracyQuery.h"
+#include "pbbam/PbiFilterTypes.h"
+#include "pbbam/CompositeBamReader.h"
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace PacBio::BAM::internal;
+using namespace std;
+
+// Private implementation: owns a composite reader constructed from a
+// PbiReadAccuracyFilter(accuracy, compareType) and the dataset.
+struct ReadAccuracyQuery::ReadAccuracyQueryPrivate
+{
+    ReadAccuracyQueryPrivate(const Accuracy accuracy,
+                             const Compare::Type compareType,
+                             const DataSet& dataset)
+        : reader_(PbiReadAccuracyFilter(accuracy, compareType), dataset)
+    { }
+
+    // NOTE(review): this patch text lost its angle-bracket content during
+    // extraction, so any template arguments that followed the reader type
+    // name are missing here - restore from the upstream source.
+    PbiFilterCompositeBamReader reader_; // unsorted
+};
+
+// Builds a query over 'dataset' using the given accuracy value and comparison.
+ReadAccuracyQuery::ReadAccuracyQuery(const Accuracy accuracy,
+                                     const Compare::Type compareType,
+                                     const DataSet& dataset)
+    : internal::IQuery()
+    , d_(new ReadAccuracyQueryPrivate(accuracy, compareType, dataset))
+{ }
+
+ReadAccuracyQuery::~ReadAccuracyQuery(void) { }
+
+// Forwards to the underlying reader's GetNext() and returns its result.
+bool ReadAccuracyQuery::GetNext(BamRecord &r)
+{ return d_->reader_.GetNext(r); }
diff --git a/src/ReadGroupInfo.cpp b/src/ReadGroupInfo.cpp
new file mode 100644
index 0000000..765c60e
--- /dev/null
+++ b/src/ReadGroupInfo.cpp
@@ -0,0 +1,803
@@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file ReadGroupInfo.cpp +/// \brief Implements the ReadGroupInfo class. 
+//
+// Author: Derek Barnett
+
+#include "pbbam/ReadGroupInfo.h"
+#include "pbbam/MD5.h"
+#include "ChemistryTable.h"
+#include "SequenceUtils.h"
+// NOTE(review): the header names on the standard #include lines were lost when
+// this patch text was extracted. The list below is a reconstruction (a
+// superset is harmless) - confirm against the upstream source.
+#include <cstdio>
+#include <iomanip>
+#include <map>
+#include <set>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// SAM @RG tag names
+static const string sam_ID = string{ "ID" };
+static const string sam_CN = string{ "CN" };
+static const string sam_DS = string{ "DS" };
+static const string sam_DT = string{ "DT" };
+static const string sam_FO = string{ "FO" };
+static const string sam_KS = string{ "KS" };
+static const string sam_LB = string{ "LB" };
+static const string sam_PG = string{ "PG" };
+static const string sam_PI = string{ "PI" };
+static const string sam_PL = string{ "PL" };
+static const string sam_PM = string{ "PM" };
+static const string sam_PU = string{ "PU" };
+static const string sam_SM = string{ "SM" };
+
+// base feature names, as they appear as keys in the description string
+static const string feature_DQ = string{ "DeletionQV" };
+static const string feature_DT = string{ "DeletionTag" };
+static const string feature_IQ = string{ "InsertionQV" };
+static const string feature_MQ = string{ "MergeQV" };
+static const string feature_SQ = string{ "SubstitutionQV" };
+static const string feature_ST = string{ "SubstitutionTag" };
+static const string feature_IP = string{ "Ipd" };
+static const string feature_PW = string{ "PulseWidth" };
+static const string feature_PM = string{ "PkMid" };
+static const string feature_PA = string{ "PkMean" };
+static const string feature_PI = string{ "PkMid2" };
+static const string feature_PS = string{ "PkMean2" };
+// NOTE(review): feature_LT is not referenced by the name<->feature mappings
+// below - confirm whether a "Label" mapping is intentionally absent.
+static const string feature_LT = string{ "Label" };
+static const string feature_PQ = string{ "LabelQV" };
+static const string feature_PT = string{ "AltLabel" };
+static const string feature_PV = string{ "AltLabelQV" };
+static const string feature_PG = string{ "PulseMergeQV" };
+static const string feature_PC = string{ "PulseCall" };
+static const string feature_PD = string{ "PrePulseFrames" };
+static const string feature_PX = string{ "PulseCallWidth" };
+static const string feature_SF = string{ "StartFrame" };
+
+// description keys for general read group attributes
+static const string token_RT = string{ "READTYPE" };
+static const string token_BK = string{ "BINDINGKIT" };
+static const string token_SK = string{ "SEQUENCINGKIT" };
+static const string token_BV = string{ "BASECALLERVERSION" };
+static const string token_FR = string{ "FRAMERATEHZ" };
+static const string token_CT = string{ "CONTROL" };
+
+// description keys for barcode attributes
+static const string token_BF = string{ "BarcodeFile" };
+static const string token_BH = string{ "BarcodeHash" };
+static const string token_BC = string{ "BarcodeCount" };
+static const string token_BM = string{ "BarcodeMode" };
+static const string token_BQ = string{ "BarcodeQuality" };
+
+static const string codec_RAW = string{ "Frames" };
+static const string codec_V1  = string{ "CodecV1" };
+
+static const string barcodemode_NONE = string{ "None" };
+static const string barcodemode_SYM  = string{ "Symmetric" };
+static const string barcodemode_ASYM = string{ "Asymmetric" };
+
+static const string barcodequal_NONE  = string{ "None" };
+static const string barcodequal_SCORE = string{ "Score" };
+static const string barcodequal_PROB  = string{ "Probability" };
+
+static const string platformModelType_ASTRO  = string{ "ASTRO" };
+static const string platformModelType_RS     = string{ "RS" };
+static const string platformModelType_SEQUEL = string{ "SEQUEL" };
+
+// Returns the description-string name for a base feature.
+// Throws std::runtime_error for a value without a case below.
+static
+string BaseFeatureName(const BaseFeature& feature)
+{
+    switch(feature) {
+        case BaseFeature::DELETION_QV      : return feature_DQ;
+        case BaseFeature::DELETION_TAG     : return feature_DT;
+        case BaseFeature::INSERTION_QV     : return feature_IQ;
+        case BaseFeature::MERGE_QV         : return feature_MQ;
+        case BaseFeature::SUBSTITUTION_QV  : return feature_SQ;
+        case BaseFeature::SUBSTITUTION_TAG : return feature_ST;
+        case BaseFeature::IPD              : return feature_IP;
+        case BaseFeature::PULSE_WIDTH      : return feature_PW;
+        case BaseFeature::PKMID            : return feature_PM;
+        case BaseFeature::PKMEAN           : return feature_PA;
+        case BaseFeature::PKMID2           : return feature_PI;
+        case BaseFeature::PKMEAN2          : return feature_PS;
+        case BaseFeature::LABEL_QV         : return feature_PQ;
+        case BaseFeature::ALT_LABEL        : return feature_PT;
+        case BaseFeature::ALT_LABEL_QV     : return feature_PV;
+        case BaseFeature::PULSE_MERGE_QV   : return feature_PG;
+        case BaseFeature::PULSE_CALL       : return feature_PC;
+        case BaseFeature::PRE_PULSE_FRAMES : return feature_PD;
+        case BaseFeature::PULSE_CALL_WIDTH : return feature_PX;
+        case BaseFeature::START_FRAME      : return feature_SF;
+        default:
+            throw std::runtime_error{ "unrecognized base feature" };
+    }
+    return string{ }; // unreachable
+}
+
+// Returns the description-string name for a frame codec.
+// Throws std::runtime_error for a value without a case below.
+static
+string FrameCodecName(const FrameCodec& codec)
+{
+    switch (codec) {
+        case FrameCodec::RAW : return codec_RAW;
+        case FrameCodec::V1  : return codec_V1;
+        default:
+            throw std::runtime_error{ "unrecognized frame codec" };
+    }
+    return string{ }; // unreachable
+}
+
+// Returns the description-string name for a barcode mode.
+// Throws std::runtime_error for a value without a case below.
+static
+string BarcodeModeName(const BarcodeModeType& mode)
+{
+    switch (mode) {
+        case BarcodeModeType::NONE       : return barcodemode_NONE;
+        case BarcodeModeType::SYMMETRIC  : return barcodemode_SYM;
+        case BarcodeModeType::ASYMMETRIC : return barcodemode_ASYM;
+        default:
+            throw std::runtime_error{ "unrecognized barcode mode" };
+    }
+    return string{ }; // unreachable
+}
+
+// Returns the description-string name for a barcode quality type.
+// Throws std::runtime_error for a value without a case below.
+static
+string BarcodeQualityName(const BarcodeQualityType& type)
+{
+    switch (type) {
+        case BarcodeQualityType::NONE        : return barcodequal_NONE;
+        case BarcodeQualityType::SCORE       : return barcodequal_SCORE;
+        case BarcodeQualityType::PROBABILITY : return barcodequal_PROB;
+        default:
+            throw std::runtime_error{ "unrecognized barcode quality type" };
+    }
+    return string{ }; // unreachable
+}
+
+// Returns the name for a platform model.
+// Throws std::runtime_error for a value without a case below.
+static
+string PlatformModelName(const PlatformModelType& type)
+{
+    switch (type) {
+        case PlatformModelType::ASTRO  : return platformModelType_ASTRO;
+        case PlatformModelType::RS     : return platformModelType_RS;
+        case PlatformModelType::SEQUEL : return platformModelType_SEQUEL;
+        default:
+            throw std::runtime_error{ "unrecognized platform model" };
+    }
+    return string{ }; // unreachable
+}
+
+// Reverse lookup tables (name -> enum value). Each is filled on first use by
+// its InitNameTo...() function.
+// NOTE(review): the "fill if empty" check is not synchronized; concurrent
+// first use from several threads would race on these maps.
+static map<string, BaseFeature>        nameToFeature;
+static map<string, FrameCodec>         nameToCodec;
+static map<string, BarcodeModeType>    nameToBarcodeMode;
+static map<string, BarcodeQualityType> nameToBarcodeQuality;
+static map<string, PlatformModelType>  nameToPlatformModel;
+
+static inline
+void InitNameToFeature(void)
+{
+    if (nameToFeature.empty()) {
+        nameToFeature[feature_DQ] = BaseFeature::DELETION_QV;
+        nameToFeature[feature_DT] = BaseFeature::DELETION_TAG;
+        nameToFeature[feature_IQ] = BaseFeature::INSERTION_QV;
+        nameToFeature[feature_MQ] = BaseFeature::MERGE_QV;
+        nameToFeature[feature_SQ] = BaseFeature::SUBSTITUTION_QV;
+        nameToFeature[feature_ST] = BaseFeature::SUBSTITUTION_TAG;
+        nameToFeature[feature_IP] = BaseFeature::IPD;
+        nameToFeature[feature_PW] = BaseFeature::PULSE_WIDTH;
+        nameToFeature[feature_PM] = BaseFeature::PKMID;
+        nameToFeature[feature_PA] = BaseFeature::PKMEAN;
+        nameToFeature[feature_PI] = BaseFeature::PKMID2;
+        nameToFeature[feature_PS] = BaseFeature::PKMEAN2;
+        nameToFeature[feature_PQ] = BaseFeature::LABEL_QV;
+        nameToFeature[feature_PT] = BaseFeature::ALT_LABEL;
+        nameToFeature[feature_PV] = BaseFeature::ALT_LABEL_QV;
+        nameToFeature[feature_PC] = BaseFeature::PULSE_CALL;
+        nameToFeature[feature_PG] = BaseFeature::PULSE_MERGE_QV;
+        nameToFeature[feature_PD] = BaseFeature::PRE_PULSE_FRAMES;
+        nameToFeature[feature_PX] = BaseFeature::PULSE_CALL_WIDTH;
+        nameToFeature[feature_SF] = BaseFeature::START_FRAME;
+    }
+}
+
+static inline
+void InitNameToCodec(void)
+{
+    if (nameToCodec.empty()) {
+        nameToCodec[codec_RAW] = FrameCodec::RAW;
+        nameToCodec[codec_V1]  = FrameCodec::V1;
+    }
+}
+
+static inline
+void InitNameToBarcodeMode(void)
+{
+    if (nameToBarcodeMode.empty()) {
+        nameToBarcodeMode[barcodemode_NONE] = BarcodeModeType::NONE;
+        nameToBarcodeMode[barcodemode_SYM]  = BarcodeModeType::SYMMETRIC;
+        nameToBarcodeMode[barcodemode_ASYM] = BarcodeModeType::ASYMMETRIC;
+    }
+}
+
+static inline
+void InitNameToBarcodeQuality(void)
+{
+    if (nameToBarcodeQuality.empty()) {
+        nameToBarcodeQuality[barcodequal_NONE]  = BarcodeQualityType::NONE;
+        nameToBarcodeQuality[barcodequal_SCORE] = BarcodeQualityType::SCORE;
+        nameToBarcodeQuality[barcodequal_PROB]  = BarcodeQualityType::PROBABILITY;
+    }
+}
+
+static inline
+void InitNameToPlatformModel(void)
+{
+    if (nameToPlatformModel.empty()) {
+        nameToPlatformModel[platformModelType_ASTRO]  = PlatformModelType::ASTRO;
+        nameToPlatformModel[platformModelType_RS]     = PlatformModelType::RS;
+        nameToPlatformModel[platformModelType_SEQUEL] = PlatformModelType::SEQUEL;
+    }
+}
+
+// True when the key starts with "Barcode".
+static inline
+bool IsLikelyBarcodeKey(const string& name)
+{ return name.find("Barcode") == 0; }
+
+// True when the name is a key of the feature lookup table.
+static inline
+bool IsBaseFeature(const string& name)
+{
+    InitNameToFeature();
+    return nameToFeature.find(name) != nameToFeature.cend();
+}
+
+// The ...FromName() lookups below use map::at(), so an unknown name raises
+// std::out_of_range.
+
+static inline
+BaseFeature BaseFeatureFromName(const string& name)
+{
+    InitNameToFeature();
+    return nameToFeature.at(name);
+}
+
+static inline
+FrameCodec FrameCodecFromName(const string& name)
+{
+    InitNameToCodec();
+    return nameToCodec.at(name);
+}
+
+static inline
+BarcodeModeType BarcodeModeFromName(const string& name)
+{
+    InitNameToBarcodeMode();
+    return nameToBarcodeMode.at(name);
+}
+
+static inline
+BarcodeQualityType BarcodeQualityFromName(const string& name)
+{
+    InitNameToBarcodeQuality();
+    return nameToBarcodeQuality.at(name);
+}
+
+static inline
+PlatformModelType PlatformModelFromName(const string& name)
+{
+    InitNameToPlatformModel();
+    return nameToPlatformModel.at(name);
+}
+
+} // namespace internal
+
+// Default read group: SEQUEL platform, read type "UNKNOWN", V1 frame codecs.
+ReadGroupInfo::ReadGroupInfo(void)
+    : platformModel_(PlatformModelType::SEQUEL)
+    , readType_("UNKNOWN")
+    , ipdCodec_(FrameCodec::V1)
+    , pulseWidthCodec_(FrameCodec::V1)
+{ }
+
+// Same defaults as above, with an explicit read group ID.
+ReadGroupInfo::ReadGroupInfo(const std::string& id)
+    : id_(id)
+    , platformModel_(PlatformModelType::SEQUEL)
+    , readType_("UNKNOWN")
+    , ipdCodec_(FrameCodec::V1)
+    , pulseWidthCodec_(FrameCodec::V1)
+{ }
+
+// Read group for a movie/read-type pair on the default (SEQUEL) platform.
+// The ID is derived via MakeReadGroupId(movieName, readType).
+ReadGroupInfo::ReadGroupInfo(const std::string& movieName,
+                             const std::string& readType)
+    : id_(MakeReadGroupId(movieName, readType))
+    , movieName_(movieName)
+    , platformModel_(PlatformModelType::SEQUEL)
+    , readType_(readType)
+    , ipdCodec_(FrameCodec::V1)
+    , pulseWidthCodec_(FrameCodec::V1)
+{ }
+
+// As above, with an explicit platform model.
+ReadGroupInfo::ReadGroupInfo(const std::string& movieName,
+                             const std::string& readType,
+                             const PlatformModelType platform)
+    : id_(MakeReadGroupId(movieName, readType))
+    , movieName_(movieName)
+    , platformModel_(platform)
+    , readType_(readType)
+    , ipdCodec_(FrameCodec::V1)
+    , pulseWidthCodec_(FrameCodec::V1)
+{ }
+
+// Copy/move operations are defaulted: the compiler-generated versions copy or
+// move every data member, so nothing can be left behind when a member is
+// added to the class (the previous hand-maintained lists had to be kept in
+// sync manually, and covered only the members named in them).
+ReadGroupInfo::ReadGroupInfo(const ReadGroupInfo& other) = default;
+
+ReadGroupInfo::ReadGroupInfo(ReadGroupInfo&& other) = default;
+
+ReadGroupInfo::~ReadGroupInfo(void) { }
+
+ReadGroupInfo& ReadGroupInfo::operator=(const ReadGroupInfo& other) = default;
+
+ReadGroupInfo& ReadGroupInfo::operator=(ReadGroupInfo&& other) = default;
+
+// Parses a description string of ';'-separated "key=value" items and stores
+// the recognized values on this object.
+//
+// Items without '=' are ignored. Recognized keys, in order of precedence:
+// general attributes (READTYPE, BINDINGKIT, ...), base feature names, keys
+// starting with "Barcode", and finally "<feature>:<codec>" keys for Ipd and
+// PulseWidth. hasBarcodeData_ is set only when all five barcode keys were seen.
+//
+// Throws whatever std::stoul and the internal ...FromName() lookups throw
+// for a malformed barcode count or an unknown mode/quality/codec name.
+void ReadGroupInfo::DecodeSamDescription(const std::string& description)
+{
+    // split on semicolons
+    // for each, split on equal
+    //    determine name ->
+
+    auto tokens = internal::Split(description, ';');
+    if (tokens.empty())
+        return;
+
+    bool hasBarcodeFile = false;
+    bool hasBarcodeHash = false;
+    bool hasBarcodeCount = false;
+    bool hasBarcodeMode = false;
+    bool hasBarcodeQuality = false;
+
+    // iterate over tokens
+    for (auto&& token : tokens) {
+
+        const auto foundEqual = token.find('=');
+        if (foundEqual == string::npos)
+            continue;
+
+        const auto key = token.substr(0,foundEqual);
+        const auto value = token.substr(foundEqual+1);
+
+        // 'mandatory' items
+        if      (key == internal::token_RT) readType_ = value;
+        else if (key == internal::token_BK) bindingKit_ = value;
+        else if (key == internal::token_BV) basecallerVersion_ = value;
+        else if (key == internal::token_SK) sequencingKit_ = value;
+        else if (key == internal::token_FR) frameRateHz_ = value;
+        else if (key == internal::token_CT) control_ = (value == "TRUE");
+
+        // base features
+        else if (internal::IsBaseFeature(key))
+            features_[internal::BaseFeatureFromName(key)] = value;
+
+        // barcode data
+        else if (internal::IsLikelyBarcodeKey(key)) {
+            if (key == internal::token_BF) {
+                barcodeFile_ = value;
+                hasBarcodeFile = true;
+            }
+            else if (key == internal::token_BH) {
+                barcodeHash_ = value;
+                hasBarcodeHash = true;
+            }
+            else if (key == internal::token_BC) {
+                // cast to the member's own type (the original cast target was
+                // lost in extraction of this patch text)
+                barcodeCount_ = static_cast<decltype(barcodeCount_)>(std::stoul(value));
+                hasBarcodeCount = true;
+            }
+            else if (key == internal::token_BM) {
+                barcodeMode_ = internal::BarcodeModeFromName(value);
+                hasBarcodeMode = true;
+            }
+            else if (key == internal::token_BQ) {
+                barcodeQuality_ = internal::BarcodeQualityFromName(value);
+                hasBarcodeQuality = true;
+            }
+        }
+
+        // frame codecs
+        else {
+            // key has the form "<feature>:<codec>"
+            const auto keyParts = internal::Split(key, ':');
+            if (keyParts.size() == 2) {
+                const auto& subkey = keyParts.at(0);
+                if (subkey == internal::feature_IP) {
+                    ipdCodec_ = internal::FrameCodecFromName(keyParts.at(1));
+                    features_[BaseFeature::IPD] = value;
+                }
+                else if (subkey == internal::feature_PW) {
+                    pulseWidthCodec_ = internal::FrameCodecFromName(keyParts.at(1));
+                    features_[BaseFeature::PULSE_WIDTH] = value;
+                }
+            }
+        }
+    }
+
+    hasBarcodeData_ = (hasBarcodeFile &&
+                       hasBarcodeHash &&
+                       hasBarcodeCount &&
+                       hasBarcodeMode &&
+                       hasBarcodeQuality);
+}
+
+std::string ReadGroupInfo::EncodeSamDescription(void) const
+{
+    auto result = string{ };
+    result.reserve(256);
+    result.append(std::string(internal::token_RT+"=" + readType_));
+
+    static const auto SEP = string{";"};
+    static const auto COLON = string{":"};
+    static const auto EQ = 
string{"="}; + + auto featureName = string{ }; + const auto featureEnd = features_.cend(); + auto featureIter = features_.cbegin(); + for ( ; featureIter != featureEnd; ++featureIter ) { + featureName = internal::BaseFeatureName(featureIter->first); + if (featureName.empty() || featureIter->second.empty()) + continue; + else if (featureName == internal::feature_IP) { + featureName.append(COLON); + featureName.append(internal::FrameCodecName(ipdCodec_)); + } + else if (featureName == internal::feature_PW) { + featureName.append(COLON); + featureName.append(internal::FrameCodecName(pulseWidthCodec_)); + } + result.append(string(SEP + featureName + EQ + featureIter->second)); + } + + if (!bindingKit_.empty()) result.append(SEP + internal::token_BK +EQ + bindingKit_); + if (!sequencingKit_.empty()) result.append(SEP + internal::token_SK +EQ + sequencingKit_); + if (!basecallerVersion_.empty()) result.append(SEP + internal::token_BV +EQ + basecallerVersion_); + if (!frameRateHz_.empty()) result.append(SEP + internal::token_FR +EQ + frameRateHz_); + if (control_) result.append(SEP + internal::token_CT +EQ + (control_ ? 
"TRUE" + : "FALSE")); + + if (hasBarcodeData_) { + const auto barcodeData = + string { + SEP + internal::token_BF + EQ + barcodeFile_ + + SEP + internal::token_BH + EQ + barcodeHash_ + + SEP + internal::token_BC + EQ + std::to_string(barcodeCount_) + + SEP + internal::token_BM + EQ + internal::BarcodeModeName(barcodeMode_) + + SEP + internal::token_BQ + EQ + internal::BarcodeQualityName(barcodeQuality_) + }; + result.reserve(result.size() + barcodeData.size()); + result.append(barcodeData); + } + + return result; +} + +ReadGroupInfo ReadGroupInfo::FromSam(const string& sam) +{ + // pop off '@RG\t', then split rest of line into tokens + const auto tokens = internal::Split(sam.substr(4), '\t'); + if (tokens.empty()) + return ReadGroupInfo{ }; + + auto rg = ReadGroupInfo{ }; + auto custom = map{ }; + + for (auto&& token : tokens) { + const auto tokenTag = token.substr(0,2); + const auto tokenValue = token.substr(3); + + // set read group info + if (tokenTag == internal::sam_ID) rg.Id(tokenValue); + else if (tokenTag == internal::sam_CN) rg.SequencingCenter(tokenValue); + else if (tokenTag == internal::sam_DT) rg.Date(tokenValue); + else if (tokenTag == internal::sam_FO) rg.FlowOrder(tokenValue); + else if (tokenTag == internal::sam_KS) rg.KeySequence(tokenValue); + else if (tokenTag == internal::sam_LB) rg.Library(tokenValue); + else if (tokenTag == internal::sam_PG) rg.Programs(tokenValue); + else if (tokenTag == internal::sam_PI) rg.PredictedInsertSize(tokenValue); + else if (tokenTag == internal::sam_PU) rg.MovieName(tokenValue); + else if (tokenTag == internal::sam_SM) rg.Sample(tokenValue); + else if (tokenTag == internal::sam_DS) rg.DecodeSamDescription(tokenValue); + else if (tokenTag == internal::sam_PM) rg.PlatformModel(internal::PlatformModelFromName(tokenValue)); + + // otherwise, "custom" tag + else + custom[tokenTag] = tokenValue; + } + rg.CustomTags(custom); + + return rg; +} + +string ReadGroupInfo::IntToId(const int32_t id) +{ + stringstream s; + s << 
std::setfill('0') << std::setw(8) << std::hex << id; + return s.str(); +} + +ReadGroupInfo& ReadGroupInfo::IpdCodec(const FrameCodec& codec, + const string& tag) +{ + // store desired codec type + ipdCodec_ = codec; + + // update base features map + auto actualTag = tag; + if (actualTag.empty()) + actualTag = "ip"; + BaseFeatureTag(BaseFeature::IPD, actualTag); + return *this; +} + +ReadGroupInfo& ReadGroupInfo::PulseWidthCodec(const FrameCodec& codec, + const string& tag) +{ + // store desired codec type + pulseWidthCodec_ = codec; + + // update base features map + auto actualTag = tag; + if (actualTag.empty()) + actualTag = "pw"; + BaseFeatureTag(BaseFeature::PULSE_WIDTH, actualTag); + return *this; +} + +string ReadGroupInfo::SequencingChemistryFromTriple(const string& bindingKit, + const string& sequencingKit, + const string& basecallerVersion) +{ + const auto verFields = internal::Split(basecallerVersion, '.'); + if (verFields.size() < 2) + throw std::runtime_error("basecaller version too short: " + basecallerVersion); + const string ver = verFields.at(0) + "." 
+ verFields.at(1); +// const string ver{ basecallerVersion.substr(0, 3) }; + for (const auto& row : internal::ChemistryTable) { + if (bindingKit == row[0] && sequencingKit == row[1] && ver == row[2]) + return row[3]; + } + + // not found + throw InvalidSequencingChemistryException(bindingKit, + sequencingKit, + basecallerVersion); +} + +std::string ReadGroupInfo::ToSam(void) const +{ + stringstream out; + out << "@RG" + << internal::MakeSamTag(internal::sam_ID, id_) + << internal::MakeSamTag(internal::sam_PL, Platform()); + + auto description = EncodeSamDescription(); + if (!description.empty()) + out << internal::MakeSamTag(internal::sam_DS, description); + + if (!sequencingCenter_.empty()) out << internal::MakeSamTag(internal::sam_CN, sequencingCenter_); + if (!date_.empty()) out << internal::MakeSamTag(internal::sam_DT, date_); + if (!flowOrder_.empty()) out << internal::MakeSamTag(internal::sam_FO, flowOrder_); + if (!keySequence_.empty()) out << internal::MakeSamTag(internal::sam_KS, keySequence_); + if (!library_.empty()) out << internal::MakeSamTag(internal::sam_LB, library_); + if (!programs_.empty()) out << internal::MakeSamTag(internal::sam_PG, programs_); + if (!predictedInsertSize_.empty()) out << internal::MakeSamTag(internal::sam_PI, predictedInsertSize_); + if (!movieName_.empty()) out << internal::MakeSamTag(internal::sam_PU, movieName_); + if (!sample_.empty()) out << internal::MakeSamTag(internal::sam_SM, sample_); + + out << internal::MakeSamTag(internal::sam_PM, internal::PlatformModelName(platformModel_)); + + // append any custom tags + auto customIter = custom_.cbegin(); + auto customEnd = custom_.cend(); + for ( ; customIter != customEnd; ++customIter ) + out << internal::MakeSamTag(customIter->first, customIter->second); + + return out.str(); +} + +std::string MakeReadGroupId(const std::string& movieName, + const std::string& readType) +{ +/*{ + MD5_CTX md5; + unsigned char digest[16]; + char hexdigest[9]; + + MD5_Init(&md5); + 
MD5_Update(&md5, reinterpret_cast(const_cast(movieName.c_str())), movieName.size()); + MD5_Update(&md5, reinterpret_cast(const_cast("//")), 2); + MD5_Update(&md5, reinterpret_cast(const_cast(readType.c_str())), readType.size()); + MD5_Final(digest, &md5); + + for (int i = 0; i < 4; ++i) + sprintf(&hexdigest[2*i], "%02x", digest[i]); + + return std::string{hexdigest, 8}; +*/ + return MD5Hash(movieName + "//" + readType).substr(0,8); +} + +bool ReadGroupInfo::operator==(const ReadGroupInfo& other) const +{ + return id_ == other.id_ + && sequencingCenter_ == other.sequencingCenter_ + && date_ == other.date_ + && flowOrder_ == other.flowOrder_ + && keySequence_ == other.keySequence_ + && library_ == other.library_ + && programs_ == other.programs_ + && platformModel_ == other.platformModel_ + && predictedInsertSize_ == other.predictedInsertSize_ + && movieName_ == other.movieName_ + && sample_ == other.sample_ + && readType_ == other.readType_ + && bindingKit_ == other.bindingKit_ + && sequencingKit_ == other.sequencingKit_ + && basecallerVersion_ == other.basecallerVersion_ + && frameRateHz_ == other.frameRateHz_ + && control_ == other.control_ + && ipdCodec_ == other.ipdCodec_ + && pulseWidthCodec_ == other.pulseWidthCodec_ + && hasBarcodeData_ == other.hasBarcodeData_ + && barcodeFile_ == other.barcodeFile_ + && barcodeHash_ == other.barcodeHash_ + && barcodeCount_ == other.barcodeCount_ + && barcodeMode_ == other.barcodeMode_ + && barcodeQuality_ == other.barcodeQuality_ + && features_.size() == other.features_.size() + && std::equal(features_.cbegin(), + features_.cend(), + other.features_.cbegin()) + && custom_.size() == other.custom_.size() + && std::equal(custom_.begin(), + custom_.end(), + other.custom_.cbegin()); +} + +} // namespace BAM +} // namespace PacBio diff --git a/src/SamTagCodec.cpp b/src/SamTagCodec.cpp new file mode 100644 index 0000000..43064b8 --- /dev/null +++ b/src/SamTagCodec.cpp @@ -0,0 +1,300 @@ +// Copyright (c) 2014-2015, Pacific 
Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file SamTagCodec.h +/// \brief Implements the SamTagCodec class. 
+// +// Author: Derek Barnett + +#include "pbbam/SamTagCodec.h" +#include "AssertUtils.h" +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace std; + +template +inline void appendSamValue(const T& value, + string& result, + bool force8BitInt = false) +{ + if (force8BitInt) + result.append(boost::lexical_cast(static_cast(value))); + else + result.append(boost::lexical_cast(value)); +} + +template +void appendSamMultiValue(const T& container, + string& result, + bool force8BitInt = false) +{ + auto end = container.cend(); + for (auto iter = container.cbegin(); iter != end; ++iter) { + result.append(1, ','); + if ( force8BitInt ) + result.append(boost::lexical_cast(static_cast(*iter))); + else + result.append(boost::lexical_cast(*iter)); + } +} + +static +vector& split(const string& s, char delim, vector& elems) +{ + stringstream ss(s); + string item; + while (getline(ss, item, delim)) + elems.push_back(item); + return elems; +} + +static +vector split(const string& s, char delim) { + vector elems; + split(s, delim, elems); + return elems; +} + +vector readFloatSamMultiValue(const string& data) +{ + vector result; + char* c = (char*)data.c_str(); + const char* end = c + data.length(); + while (c+1 < end) { + const float value = strtof(c+1, &c); // c+1 to skip comma + result.push_back(value); + } + return result; +} + +template +vector readSignedSamMultiValue(const string& data) +{ + vector result; + char* c = (char*)data.c_str(); + const char* end = c + data.length(); + while (c+1 < end) { + const T value = strtol(c+1, &c, 0); // c+1 to skip comma + result.push_back(value); + } + + return result; +} + +template +vector readUnsignedSamMultiValue(const string& data) +{ + vector result; + char* c = (char*)data.c_str(); + const char* end = c + data.length(); + while (c+1 < end) { + const T value = strtoul(c+1, &c, 0); // c+1 to skip comma + result.push_back(value); + } + return result; +} + +TagCollection SamTagCodec::Decode(const string& 
tagString) +{ + TagCollection tags; + + const vector& tokens = split(tagString, '\t'); + const auto end = tokens.cend(); + for (auto iter = tokens.cbegin(); iter != end; ++iter ) { + const string& token = (*iter); + if (token.size() < 6) // TT:t:X + continue; + + const string& name = token.substr(0, 2); + const char type = token.at(3); + const string& remainder = token.substr(5); + PB_ASSERT_OR_CONTINUE(!remainder.empty()); + + switch (type) { + + // technically only 'A' is allowed in SAM chars, + // but we'll be a little permissive + case 'A' : + case 'a' : + { + tags[name] = Tag(static_cast(remainder.at(0), TagModifier::ASCII_CHAR)); + break; + } + + // technically only 'i' is allowed in SAM ints, but we'll be a little + // permissive since SAM might be a bit more "user-edited" than BAM + case 'c' : + case 'C' : + case 's' : + case 'S' : + case 'i' : + case 'I' : + { + // check out boost::numeric cast for these conversions + + // negative value (force signed int) + if (remainder.at(0) == '-') { + const int32_t x = boost::lexical_cast(remainder); + if ( x >= INT8_MIN ) + tags[name] = static_cast(x); + else if ( x >= INT16_MIN ) + tags[name] = static_cast(x); + else + tags[name] = x; + } + + // unsigned int + else { + const uint32_t x = boost::lexical_cast(remainder); + if ( x <= UINT8_MAX ) + tags[name] = static_cast(x); + else if ( x <= UINT16_MAX ) + tags[name] = static_cast(x); + else + tags[name] = x; + } + break; + } + + case 'f' : + { + tags[name] = boost::lexical_cast(remainder); + break; + } + + case 'Z' : + { + tags[name] = remainder; + break; + } + + case 'H' : + { + tags[name] = Tag(remainder, TagModifier::HEX_STRING); + break; + } + + case 'B' : + { + const char elementType = remainder.at(0); + const string& arrayData = remainder.substr(1); + switch (elementType) { + case 'c' : tags[name] = readSignedSamMultiValue(arrayData); break; + case 'C' : tags[name] = readUnsignedSamMultiValue(arrayData); break; + case 's' : tags[name] = 
readSignedSamMultiValue(arrayData); break; + case 'S' : tags[name] = readUnsignedSamMultiValue(arrayData); break; + case 'i' : tags[name] = readSignedSamMultiValue(arrayData); break; + case 'I' : tags[name] = readUnsignedSamMultiValue(arrayData); break; + case 'f' : tags[name] = readFloatSamMultiValue(arrayData); break; + default: + PB_ASSERT_OR_CONTINUE(false); + } + break; + } + + // unsupported SAM tag type + default : + PB_ASSERT_OR_CONTINUE(false); + } + } + + return tags; +} + +string SamTagCodec::Encode(const TagCollection& tags) +{ + string result; + result.reserve(1024); + + const auto tagEnd = tags.cend(); + for (auto tagIter = tags.cbegin(); tagIter != tagEnd; ++tagIter) { + const string& name = (*tagIter).first; + const Tag& tag = (*tagIter).second; + PB_ASSERT_OR_CONTINUE(name.size() == 2); + if (tag.IsNull()) + continue; + + // tab separator + if (!result.empty()) + result.append(1, '\t'); + + // ":" + result.append(name); + result.append(1, ':'); + + // ":" for printable, ASCII char + if (tag.HasModifier(TagModifier::ASCII_CHAR)) { + char c = tag.ToAscii(); + if (c != '\0') { + result.append("A:"); + result.append(1, c); + continue; + } + } + + // ":" for all other data + switch (tag.Type()) { + case TagDataType::INT8 : result.append("i:"); appendSamValue(tag.ToInt8(), result, true); break; + case TagDataType::UINT8 : result.append("i:"); appendSamValue(tag.ToUInt8(), result, true); break; + case TagDataType::INT16 : result.append("i:"); appendSamValue(tag.ToInt16(), result); break; + case TagDataType::UINT16 : result.append("i:"); appendSamValue(tag.ToUInt16(), result); break; + case TagDataType::INT32 : result.append("i:"); appendSamValue(tag.ToInt32(), result); break; + case TagDataType::UINT32 : result.append("i:"); appendSamValue(tag.ToUInt32(), result); break; + case TagDataType::FLOAT : result.append("f:"); appendSamValue(tag.ToFloat(), result); break; + + case TagDataType::STRING : + { + result.append(tag.HasModifier(TagModifier::HEX_STRING) 
? "H:" : "Z:"); + result.append(tag.ToString()); + break; + } + + case TagDataType::INT8_ARRAY : result.append("B:c"); appendSamMultiValue(tag.ToInt8Array(), result, true); break; + case TagDataType::UINT8_ARRAY : result.append("B:C"); appendSamMultiValue(tag.ToUInt8Array(), result, true); break; + case TagDataType::INT16_ARRAY : result.append("B:s"); appendSamMultiValue(tag.ToInt16Array(), result); break; + case TagDataType::UINT16_ARRAY : result.append("B:S"); appendSamMultiValue(tag.ToUInt16Array(), result); break; + case TagDataType::INT32_ARRAY : result.append("B:i"); appendSamMultiValue(tag.ToInt32Array(), result); break; + case TagDataType::UINT32_ARRAY : result.append("B:I"); appendSamMultiValue(tag.ToUInt32Array(), result); break; + case TagDataType::FLOAT_ARRAY : result.append("B:f"); appendSamMultiValue(tag.ToFloatArray(), result); break; + + default : + PB_ASSERT_OR_RETURN_VALUE(false, string()); + } + } + + return result; +} diff --git a/src/SamWriter.cpp b/src/SamWriter.cpp new file mode 100644 index 0000000..38a28d2 --- /dev/null +++ b/src/SamWriter.cpp @@ -0,0 +1,142 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. 
+// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. + +// Author: Derek Barnett + +#include "pbbam/SamWriter.h" +#include "pbbam/Validator.h" +#include "FileProducer.h" +#include "MemoryUtils.h" +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace std; + +namespace PacBio { +namespace BAM { +namespace internal { + +class SamWriterPrivate : public internal::FileProducer +{ +public: + SamWriterPrivate(const std::string& filename, + const PBBAM_SHARED_PTR rawHeader) + : internal::FileProducer(filename) + , file_(nullptr) + , header_(rawHeader) + { + if (!header_) + throw std::runtime_error("null header"); + + // open file + const string& usingFilename = TempFilename(); + const string& mode = string("w"); + file_.reset(sam_open(usingFilename.c_str(), mode.c_str())); + if (!file_) + throw std::runtime_error("could not open file for writing"); + + // write header + const int ret = sam_hdr_write(file_.get(), header_.get()); + if (ret != 0) + throw std::runtime_error("could not write header"); + } + + void TryFlush(void); + void Write(const BamRecord& record); + +private: + std::unique_ptr 
file_; + PBBAM_SHARED_PTR header_; +}; + +void SamWriterPrivate::TryFlush(void) +{ + const auto ret = file_.get()->fp.hfile; + if (ret != 0) + throw std::runtime_error("could not flush output buffer contents"); +} + +void SamWriterPrivate::Write(const BamRecord& record) +{ +#if PBBAM_AUTOVALIDATE + Validator::Validate(record); +#endif + + const auto rawRecord = internal::BamRecordMemory::GetRawData(record); + + // store bin number + // min_shift=14 & n_lvls=5 are SAM/BAM "magic numbers" + rawRecord->core.bin = hts_reg2bin(rawRecord->core.pos, + bam_endpos(rawRecord.get()), 14, 5); + + // write record to file + const int ret = sam_write1(file_.get(), header_.get(), rawRecord.get()); + if (ret <= 0) + throw std::runtime_error("could not write record"); +} + +} // namespace internal +} // namespace BAM +} // namespace PacBio + +SamWriter::SamWriter(const string& filename, const BamHeader& header) + : IRecordWriter() + , d_(nullptr) +{ +#if PBBAM_AUTOVALIDATE + Validator::Validate(header); +#endif + d_.reset(new internal::SamWriterPrivate{ filename, + internal::BamHeaderMemory::MakeRawHeader(header) + }); +} + +SamWriter::~SamWriter(void) { } + +void SamWriter::TryFlush(void) +{ + d_->TryFlush(); +} + +void SamWriter::Write(const BamRecord& record) +{ + d_->Write(record); +} + +void SamWriter::Write(const BamRecordImpl& recordImpl) +{ + d_->Write( BamRecord{recordImpl} ); +} diff --git a/src/SequenceInfo.cpp b/src/SequenceInfo.cpp new file mode 100644 index 0000000..43e4343 --- /dev/null +++ b/src/SequenceInfo.cpp @@ -0,0 +1,176 @@ + +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file SequenceInfo.cpp +/// \brief Implements the SequenceInfo class. 
+// +// Author: Derek Barnett + +#include "pbbam/SequenceInfo.h" +#include "SequenceUtils.h" +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace std; + +namespace PacBio { +namespace BAM { +namespace internal { + +static string token_SN = string("SN"); +static string token_LN = string("LN"); +static string token_AS = string("AS"); +static string token_M5 = string("M5"); +static string token_SP = string("SP"); +static string token_UR = string("UR"); + +} // namespace internal +} // namespace BAM +} // namespace PacBio + +SequenceInfo::SequenceInfo(void) { } + +SequenceInfo::SequenceInfo(const std::string& name, + const std::string& length) + : name_(name) + , length_(length) +{ } + +SequenceInfo::SequenceInfo(const SequenceInfo& other) + : name_(other.name_) + , length_(other.length_) + , assemblyId_(other.assemblyId_) + , checksum_(other.checksum_) + , species_(other.species_) + , uri_(other.uri_) +{ } + +SequenceInfo::SequenceInfo(SequenceInfo&& other) + : name_(std::move(other.name_)) + , length_(std::move(other.length_)) + , assemblyId_(std::move(other.assemblyId_)) + , checksum_(std::move(other.checksum_)) + , species_(std::move(other.species_)) + , uri_(std::move(other.uri_)) +{ } + +SequenceInfo::~SequenceInfo(void) { } + +SequenceInfo& SequenceInfo::operator=(const SequenceInfo& other) +{ + name_ = other.name_; + length_ = other.length_; + assemblyId_ = other.assemblyId_; + checksum_ = other.checksum_; + species_ = other.species_; + uri_ = other.uri_; + return *this; +} + +SequenceInfo& SequenceInfo::operator=(SequenceInfo&& other) +{ + name_ = std::move(other.name_); + length_ = std::move(other.length_); + assemblyId_ = std::move(other.assemblyId_); + checksum_ = std::move(other.checksum_); + species_ = std::move(other.species_); + uri_ = std::move(other.uri_); + return *this; +} + +SequenceInfo SequenceInfo::FromSam(const std::string& sam) +{ + // pop off '@SQ\t', then split rest of line into tokens + const vector& tokens = 
internal::Split(sam.substr(4), '\t'); + if (tokens.empty()) + return SequenceInfo(); + + SequenceInfo seq; + map custom; + + // iterate over tokens + for (const string& token : tokens) { + const string& tokenTag = token.substr(0,2); + const string& tokenValue = token.substr(3); + + // set sequence info + if (tokenTag == internal::token_SN) seq.Name(tokenValue); + else if (tokenTag == internal::token_LN) seq.Length(tokenValue); + else if (tokenTag == internal::token_AS) seq.AssemblyId(tokenValue); + else if (tokenTag == internal::token_M5) seq.Checksum(tokenValue); + else if (tokenTag == internal::token_SP) seq.Species(tokenValue); + else if (tokenTag == internal::token_UR) seq.Uri(tokenValue); + + // otherwise, "custom" tag + else + custom[tokenTag] = tokenValue; + } + + seq.CustomTags(custom); + return seq; +} + +bool SequenceInfo::IsValid(void) const +{ + if (name_.empty()) + return false; + + // use long instead of int32_t, just to make sure we can catch overflow + const long l = atol(length_.c_str()); + return l >= 0 && l <= INT32_MAX; +} + +std::string SequenceInfo::ToSam(void) const +{ + stringstream out; + out << "@SQ" + << internal::MakeSamTag(internal::token_SN, name_); + + if (!length_.empty()) out << internal::MakeSamTag(internal::token_LN, length_); + if (!assemblyId_.empty()) out << internal::MakeSamTag(internal::token_AS, assemblyId_); + if (!checksum_.empty()) out << internal::MakeSamTag(internal::token_M5, checksum_); + if (!species_.empty()) out << internal::MakeSamTag(internal::token_SP, species_); + if (!uri_.empty()) out << internal::MakeSamTag(internal::token_UR, uri_); + + // append any custom tags + map::const_iterator customIter = custom_.cbegin(); + map::const_iterator customEnd = custom_.cend(); + for ( ; customIter != customEnd; ++customIter ) + out << internal::MakeSamTag(customIter->first, customIter->second); + + return out.str(); +} diff --git a/src/SequenceUtils.h b/src/SequenceUtils.h new file mode 100644 index 0000000..14ad898 --- 
/dev/null +++ b/src/SequenceUtils.h @@ -0,0 +1,145 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. 
namespace PacBio {
namespace BAM {
namespace internal {

/// \brief Returns the complement of a single nucleotide code.
///
/// The lookup is case-insensitive and always yields an upper-case result.
/// '-' and '*' are returned unchanged. Any character without an entry in the
/// table (including everything that is not a letter A-Y) yields '\0'.
///
/// \param character  nucleotide code to complement
/// \returns complemented code, or '\0' if there is none
///
inline char Complement(const char character)
{
    // Indexed by the low five bits of the upper-cased letter: 'A' -> 1 ... 'Y' -> 25.
    static char const complementLookup[] =
    {
        '\0', 'T', 'V', 'G', 'H',
        '\0', '\0', 'C', 'D', '\0',
        '\0', 'M', '\0', 'K', 'N',
        '\0', '\0', '\0', 'Y', 'S',
        'A', 'A', 'B', 'W', '\0', 'R'
    };
    static const int tableSize = static_cast<int>(sizeof(complementLookup));

    if (character == '-' || character == '*')
        return character;

    // toupper() requires a value representable as unsigned char.
    const int index = toupper(static_cast<unsigned char>(character)) & 0x1f;

    // The mask can produce 26..31 (e.g. 'Z', '['), which lie past the table.
    return (index < tableSize) ? complementLookup[index] : '\0';
}

/// \brief Reverses \p input in place.
template <typename T>
void Reverse(T& input)
{ std::reverse(input.begin(), input.end()); }

/// \brief Reverses \p input in place when \p reverse is set, then returns it.
template <typename T>
T MaybeReverse(T&& input, bool reverse)
{
    if (reverse) std::reverse(input.begin(), input.end());
    return input;
}

/// \brief Returns a reversed copy of \p input.
template <typename T>
T Reversed(const T& input)
{
    T result = input;
    Reverse(result);
    return result;
}

/// \brief Reverse-complements \p seq in place (result is upper-case, see Complement()).
inline void ReverseComplement(std::string& seq)
{
    for (char& base : seq)
        base = Complement(base);
    Reverse(seq);
}

/// \brief Reverse-complements \p seq when \p reverse is set, then returns it.
inline std::string MaybeReverseComplement(std::string&& seq, bool reverse)
{
    if (reverse) ReverseComplement(seq);
    return seq;
}

/// \brief Reverse-complements a DNA sequence in place, preserving case.
///
/// Mapped characters: A/C/G/T/U and a/c/g/t/u are complemented (U/u -> A/a);
/// 'N', ' ', '*' and '-' map to themselves. Every other byte - including bytes
/// outside the 7-bit ASCII range - is replaced with the byte value 4.
///
inline void ReverseComplementCaseSens(std::string& seq)
{
    // NOTE(review): lower-case 'n' is not mapped (unlike 'N') and becomes 4;
    //               confirm whether that asymmetry is intended.
    const auto complement = [](const char base) -> char
    {
        switch (base) {
            case ' ': case '*': case '-': case 'N': return base;
            case 'A': return 'T';
            case 'C': return 'G';
            case 'G': return 'C';
            case 'T': case 'U': return 'A';
            case 'a': return 't';
            case 'c': return 'g';
            case 'g': return 'c';
            case 't': case 'u': return 'a';
            default:  return static_cast<char>(4);
        }
    };

    std::transform(seq.begin(), seq.end(), seq.begin(), complement);
    std::reverse(seq.begin(), seq.end());
}

/// \brief Case-preserving reverse-complement of \p seq when \p reverse is set.
inline std::string MaybeReverseComplementCaseSens(std::string&& seq, bool reverse)
{
    if (reverse) ReverseComplementCaseSens(seq);
    return seq;
}

/// \brief Returns a reverse-complemented copy of \p input.
inline std::string ReverseComplemented(const std::string& input)
{
    std::string result = input;
    ReverseComplement(result);
    return result;
}

} // namespace internal
} // namespace BAM
} // namespace PacBio
namespace PacBio {
namespace BAM {
namespace internal {

/// \brief Formats one SAM header field as "<TAB>tag:value".
///
/// \param tag    field tag
/// \param value  field value
/// \returns the formatted field, including the leading tab
///
inline std::string MakeSamTag(const std::string& tag,
                              const std::string& value)
{
    std::string field;
    field.reserve(tag.size() + value.size() + 2);
    field += '\t';
    field += tag;
    field += ':';
    field += value;
    return field;
}

/// \brief Splits \p line on \p delim.
///
/// Uses std::getline, so empty fields between delimiters are kept, while an
/// empty input or a single trailing delimiter produces no extra empty token.
///
/// \param line   text to split
/// \param delim  delimiter character (tab by default)
/// \returns the tokens in order of appearance
///
inline std::vector<std::string> Split(const std::string& line,
                                      const char delim = '\t')
{
    std::vector<std::string> tokens;
    std::stringstream lineStream(line);
    std::string token;
    while (std::getline(lineStream, token, delim))
        tokens.push_back(token);
    return tokens;
}

} // namespace internal
} // namespace BAM
} // namespace PacBio
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file SubreadLengthQuery.cpp +/// \brief Implements the SubreadLengthQuery class. 
+// +// Author: Derek Barnett + +#include "pbbam/SubreadLengthQuery.h" +#include "pbbam/PbiFilterTypes.h" +#include "pbbam/CompositeBamReader.h" +using namespace PacBio; +using namespace PacBio::BAM; +using namespace PacBio::BAM::internal; +using namespace std; + +struct SubreadLengthQuery::SubreadLengthQueryPrivate +{ + SubreadLengthQueryPrivate(const int32_t length, + const Compare::Type compareType, + const DataSet& dataset) + : reader_(PbiQueryLengthFilter(length, compareType), dataset) + { } + + PbiFilterCompositeBamReader reader_; // unsorted +}; + +SubreadLengthQuery::SubreadLengthQuery(const int32_t length, + const Compare::Type compareType, + const DataSet& dataset) + : internal::IQuery() + , d_(new SubreadLengthQueryPrivate(length, compareType, dataset)) +{ } + +SubreadLengthQuery::~SubreadLengthQuery(void) { } + +bool SubreadLengthQuery::GetNext(BamRecord &r) +{ return d_->reader_.GetNext(r); } diff --git a/src/Tag.cpp b/src/Tag.cpp new file mode 100644 index 0000000..5c51321 --- /dev/null +++ b/src/Tag.cpp @@ -0,0 +1,124 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. 
+// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file Tag.cpp +/// \brief Defines the Tag class. +// +// Author: Derek Barnett + +#include "pbbam/Tag.h" +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace std; + +Tag::Tag(void) : data_(), modifier_(TagModifier::NONE) { } +Tag::Tag(int8_t value) : data_(value), modifier_(TagModifier::NONE) { } +Tag::Tag(uint8_t value) : data_(value), modifier_(TagModifier::NONE) { } +Tag::Tag(int16_t value) : data_(value), modifier_(TagModifier::NONE) { } +Tag::Tag(uint16_t value) : data_(value), modifier_(TagModifier::NONE) { } +Tag::Tag(int32_t value) : data_(value), modifier_(TagModifier::NONE) { } +Tag::Tag(uint32_t value) : data_(value), modifier_(TagModifier::NONE) { } +Tag::Tag(float value) : data_(value), modifier_(TagModifier::NONE) { } +Tag::Tag(const std::string& value) : data_(value), modifier_(TagModifier::NONE) { } +Tag::Tag(const vector& value) : data_(value), modifier_(TagModifier::NONE) { } +Tag::Tag(const vector& value) : data_(value), modifier_(TagModifier::NONE) { } +Tag::Tag(const vector& value) : 
data_(value), modifier_(TagModifier::NONE) { } +Tag::Tag(const vector& value) : data_(value), modifier_(TagModifier::NONE) { } +Tag::Tag(const vector& value) : data_(value), modifier_(TagModifier::NONE) { } +Tag::Tag(const vector& value) : data_(value), modifier_(TagModifier::NONE) { } +Tag::Tag(const vector& value) : data_(value), modifier_(TagModifier::NONE) { } + +Tag::Tag(int8_t value, const TagModifier mod) + : data_(value) + , modifier_(mod) +{ + if (mod == TagModifier::HEX_STRING) + throw runtime_error("HEX_STRING is not a valid tag modifier for int8_t data. " + "It is intended for string-type data only."); +} + +Tag::Tag(const std::string& value, const TagModifier mod) + : data_(value) + , modifier_(mod) +{ + if (mod == TagModifier::ASCII_CHAR) + throw runtime_error("ASCII_CHAR is not a valid tag modifier for string-type data. " + "To construct an ASCII char tag, use a single-quoted value (e.g. 'X' instead of \"X\")"); +} + +Tag::Tag(const Tag& other) + : data_(other.data_) + , modifier_(other.modifier_) +{ } + +Tag::Tag(Tag&& other) + : data_(std::move(other.data_)) + , modifier_(std::move(other.modifier_)) +{ } + +Tag::~Tag(void) { } + +Tag& Tag::operator=(boost::blank value) { data_ = value; return *this; } +Tag& Tag::operator=(int8_t value) { data_ = value; return *this; } +Tag& Tag::operator=(uint8_t value) { data_ = value; return *this; } +Tag& Tag::operator=(int16_t value) { data_ = value; return *this; } +Tag& Tag::operator=(uint16_t value) { data_ = value; return *this; } +Tag& Tag::operator=(int32_t value) { data_ = value; return *this; } +Tag& Tag::operator=(uint32_t value) { data_ = value; return *this; } +Tag& Tag::operator=(float value) { data_ = value; return *this; } +Tag& Tag::operator=(const std::string& value) { data_ = value; return *this; } +Tag& Tag::operator=(const vector& value) { data_ = value; return *this; } +Tag& Tag::operator=(const vector& value) { data_ = value; return *this; } +Tag& Tag::operator=(const vector& value) { data_ 
= value; return *this; } +Tag& Tag::operator=(const vector& value) { data_ = value; return *this; } +Tag& Tag::operator=(const vector& value) { data_ = value; return *this; } +Tag& Tag::operator=(const vector& value) { data_ = value; return *this; } +Tag& Tag::operator=(const vector& value) { data_ = value; return *this; } + +Tag& Tag::operator=(const Tag& other) +{ + data_ = other.data_; + modifier_ = other.modifier_; + return *this; +} + +Tag& Tag::operator=(Tag&& other) +{ + data_ = std::move(other.data_); + modifier_ = std::move(other.modifier_); + return *this; +} diff --git a/src/TagCollection.cpp b/src/TagCollection.cpp new file mode 100644 index 0000000..98ed22b --- /dev/null +++ b/src/TagCollection.cpp @@ -0,0 +1,50 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file TagCollection.cpp +/// \brief Implements the TagCollection class. +// +// Author: Derek Barnett + +#include "pbbam/TagCollection.h" +using namespace PacBio; +using namespace PacBio::BAM; +using namespace std; + +bool TagCollection::Contains(const string& name) const +{ + return count(name) != 0; +} diff --git a/src/TimeUtils.h b/src/TimeUtils.h new file mode 100644 index 0000000..b3fd75f --- /dev/null +++ b/src/TimeUtils.h @@ -0,0 +1,100 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. 
namespace PacBio {
namespace BAM {
namespace internal {

/// \brief Formats a time point as a UTC ISO 8601 timestamp.
///
/// Output is "YYYY-MM-DDTHH:MM:SSZ". When the time point has a non-zero
/// millisecond part, it is inserted as a three-digit, zero-padded fraction:
/// "YYYY-MM-DDTHH:MM:SS.mmmZ".
///
/// \param tp  time point to format
/// \returns formatted timestamp
///
inline
std::string ToIso8601(const std::chrono::system_clock::time_point& tp)
{
    // split into whole seconds + leftover milliseconds
    const time_t ttime_t = std::chrono::system_clock::to_time_t(tp);
    const std::chrono::system_clock::time_point tp_sec = std::chrono::system_clock::from_time_t(ttime_t);
    const std::chrono::milliseconds ms = std::chrono::duration_cast<std::chrono::milliseconds>(tp - tp_sec);
    const std::tm* ttm = gmtime(&ttime_t); // static obj, no free needed (may not be thread-safe though)

    // format output
    char date_time_format[] = "%FT%T";
    char date_time_str[50];
    strftime(date_time_str, sizeof(date_time_str), date_time_format, ttm);
    std::string result(date_time_str);
    if (ms.count() > 0) {
        // Pad to three digits: 5 ms must read ".005", not ".5" (= 500 ms).
        const std::string digits = std::to_string(ms.count());
        result.append(".");
        if (digits.size() < 3)
            result.append(3 - digits.size(), '0');
        result.append(digits);
    }
    result.append("Z");
    return result;
}

/// \brief Formats a time point (UTC) as "yymmdd_HHMMSS", followed by a
///        three-digit, zero-padded millisecond field when it is non-zero.
///
/// \param tp  time point to format
/// \returns formatted timestamp
///
inline
std::string ToDataSetFormat(const std::chrono::system_clock::time_point& tp)
{
    // split into whole seconds + leftover milliseconds
    const time_t ttime_t = std::chrono::system_clock::to_time_t(tp);
    const std::chrono::system_clock::time_point tp_sec = std::chrono::system_clock::from_time_t(ttime_t);
    const std::chrono::milliseconds ms = std::chrono::duration_cast<std::chrono::milliseconds>(tp - tp_sec);
    const std::tm* ttm = gmtime(&ttime_t); // static obj, no free needed (may not be thread-safe though)

    // format output
    char date_time_format[] = "%y%m%d_%H%M%S";
    char date_time_str[50];
    strftime(date_time_str, sizeof(date_time_str), date_time_format, ttm);
    std::string result(date_time_str);
    if (ms.count() > 0) {
        // Fixed-width field keeps the value unambiguous next to the seconds.
        const std::string digits = std::to_string(ms.count());
        if (digits.size() < 3)
            result.append(3 - digits.size(), '0');
        result.append(digits);
    }
    return result;
}

/// \brief Returns the current system_clock time.
inline
std::chrono::system_clock::time_point CurrentTime(void)
{ return std::chrono::system_clock::now(); }

} // namespace internal
} // namespace BAM
} // namespace PacBio
+// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. + +// Author: Derek Barnett + +//#include "pbbam/UnmappedReadsQuery.h" +//#include "pbbam/BamFile.h" +//#include "MemoryUtils.h" + +//#include + +//using namespace PacBio; +//using namespace PacBio::BAM; +//using namespace std; + +//UnmappedReadsQuery::UnmappedReadsQuery(const BamFile& file) +// : QueryBase() +//{ +// // open file +// file_.reset(sam_open(file.Filename().c_str(), "rb"), internal::HtslibFileDeleter()); +// if (!file_) { +// error_ = UnmappedReadsQuery::FileOpenError; +// return; +// } + +// // open index +// index_.reset(bam_index_load(file.Filename().c_str()), internal::HtslibIndexDeleter()); +// if (!index_) { +// error_ = UnmappedReadsQuery::IndexFileOpenError; +// return; +// } + +// // initialize query +// iterator_.reset(bam_itr_queryi(index_.get(), HTS_IDX_NOCOOR, 0, 0), internal::HtslibIteratorDeleter()); +// if (iterator_) { + +// cerr << endl +// << "UnmappedQueryReads::iterator" << endl +// << "read_rest: " << iterator_->read_rest << endl +// << "finished: " << iterator_->finished << endl +// << "dummy: " << iterator_->dummy << endl +// << "tid: 
" << iterator_->tid << endl +// << "beg: " << iterator_->beg << endl +// << "end: " << iterator_->end << endl +// << "n_off: " << iterator_->n_off << endl +// << "i: " << iterator_->i << endl +// << "curr_off: " << iterator_->curr_off << endl +// << endl; + + +//// uint32_t read_rest:1, finished:1, dummy:29; +//// int tid, beg, end, n_off, i; +//// uint64_t curr_off; +//// hts_pair64_t *off; +//// hts_readrec_func *readrec; +//// struct { +//// int n, m; +//// int *a; +//// } bins; + +// } +//} + +//bool UnmappedReadsQuery::GetNext(BamRecord& record) +//{ +// if (error_ == UnmappedReadsQuery::NoError && iterator_) { +// const int result = bam_itr_next(file_.get(), iterator_.get(), record.RawData().get()); +// if ( result > 0 ) +// return true; +// else { +// cerr << "ERROR - result: " << result << endl; +// if ( result == -4 ) { + +// bam1_t* b = record.RawData().get(); +// bam1_core_t* c = &b->core; +// bool nonBgzfErrorFound = false; + +// if (b->l_data < 0) { +// cerr << "ERROR: bam1_t::l_data < 0" << endl; +// nonBgzfErrorFound = true; +// } +// if (c->l_qseq < 0) { +// cerr << "ERROR: bam1_t::core::l_qseq < 0" << endl; +// nonBgzfErrorFound = true; +// } +// if (!b->data) { +// cerr << "ERROR: bam1_t::data is null" << endl; +// nonBgzfErrorFound = true; +// } +// if (!nonBgzfErrorFound) +// cerr << "ERROR: in bam_read1(), bgzf_read(fp, b->data, b->l_data) returned unexpected value" << endl; +// } +// } +// } +// else { +// cerr << "UnmappedReadsQuery::HasError() - " << Error() << endl; +// } + + +// return false; +//} diff --git a/src/ValidationErrors.cpp b/src/ValidationErrors.cpp new file mode 100644 index 0000000..fe6e69c --- /dev/null +++ b/src/ValidationErrors.cpp @@ -0,0 +1,144 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file ValidationErrors.cpp +/// \brief Implements the ValidationErrors class. 
+// +// Author: Derek Barnett + +#include "ValidationErrors.h" +#include "pbbam/exception/ValidationException.h" +#include "StringUtils.h" +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace PacBio::BAM::internal; +using namespace std; + +const size_t ValidationErrors::MAX; + +ValidationErrors::ValidationErrors(const size_t maxNumErrors) + : maxNumErrors_(maxNumErrors) + , currentNumErrors_(0) +{ + if (maxNumErrors_ == 0) + maxNumErrors_ = ValidationErrors::MAX; +} + +void ValidationErrors::AddFileError(const std::string& fn, + const std::string& details) +{ + string copy = details; + AddFileError(fn, std::move(copy)); +} + +void ValidationErrors::AddFileError(const std::string& fn, + std::string&& details) +{ + fileErrors_[fn].push_back(std::move(details)); + OnErrorAdded(); +} + +void ValidationErrors::AddReadGroupError(const std::string& rg, + const std::string& details) +{ + string copy = details; + AddReadGroupError(rg, std::move(copy)); +} + +void ValidationErrors::AddReadGroupError(const std::string& rg, + std::string&& details) +{ + readGroupErrors_[rg].push_back(std::move(details)); + OnErrorAdded(); +} + +void ValidationErrors::AddRecordError(const std::string& name, + const std::string& details) +{ + string copy = details; + AddRecordError(name, std::move(copy)); +} + +void ValidationErrors::AddRecordError(const std::string& name, + std::string&& details) +{ + recordErrors_[name].push_back(std::move(details)); + OnErrorAdded(); +} + +void ValidationErrors::AddTagLengthError(const string& name, + const string& tagLabel, + const string& tagName, + const size_t observed, + const size_t expected) +{ + string copy = tagLabel; + string copy2 = tagName; + AddTagLengthError(name, std::move(copy), std::move(copy2), observed, expected); +} + +void ValidationErrors::AddTagLengthError(const string& name, + string&& tagLabel, + string&& tagName, + const size_t observed, + const size_t expected) +{ + // format + stringstream s; + s << 
tagLabel << " tag (" << tagName << ") length: " << observed + << ", does not match expected length: " << expected; + AddRecordError(name, s.str()); +} + +bool ValidationErrors::IsEmpty(void) const +{ + return currentNumErrors_ == 0; +} + +void ValidationErrors::OnErrorAdded(void) +{ + ++currentNumErrors_; + if (currentNumErrors_ == maxNumErrors_) + ThrowErrors(); +} + +void ValidationErrors::ThrowErrors(void) +{ + throw ValidationException(std::move(fileErrors_), + std::move(readGroupErrors_), + std::move(recordErrors_)); +} diff --git a/src/ValidationErrors.h b/src/ValidationErrors.h new file mode 100644 index 0000000..af68ac6 --- /dev/null +++ b/src/ValidationErrors.h @@ -0,0 +1,115 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file ValidationErrors.h +/// \brief Defines the ValidationErrors class. +// +// Author: Derek Barnett + +#ifndef VALIDATIONERRORS_H +#define VALIDATIONERRORS_H + +#include +#include +#include +#include + +namespace PacBio { +namespace BAM { +namespace internal { + +/// The ValidationErrors class catches error messages accumulated during +/// validation (see Validator). +/// +/// Convenience methods are provided for different BAM components, to help +/// format the displayed output. +/// +/// A maximum number of errors can be provided at construction, and this class +/// will automatially throw a ValidationException whenever that count is reached. +/// Otherwise, the Validator will check IsEmpty() and call ThrowErrors() if true. 
namespace PacBio {
namespace BAM {
namespace internal {

/// The ValidationErrors class collects error messages accumulated during
/// validation (see Validator).
///
/// Convenience methods are provided for different BAM components, to help
/// format the displayed output.
///
/// A maximum number of errors can be provided at construction, and this class
/// will automatically throw a ValidationException whenever that count is
/// reached. Otherwise, the caller is expected to check IsEmpty() and call
/// ThrowErrors() when errors are present.
///
/// NOTE(review): the original comment said the Validator calls ThrowErrors()
///               "if true"; that reads as inverted - confirm against Validator.
///
class ValidationErrors
{
public:
    /// List of messages recorded under one key.
    typedef std::vector<std::string> ErrorList;
    /// Messages grouped by key (file name, read group, or record name).
    typedef std::map<std::string, ErrorList> ErrorMap;

public:
    /// Largest representable error limit; also the constructor's default.
    static const size_t MAX = std::numeric_limits<size_t>::max();

public:
    /// \param maxNumErrors  number of errors at which a ValidationException
    ///                      is thrown automatically
    ValidationErrors(const size_t maxNumErrors = ValidationErrors::MAX);

public:
    /// Record an error for file \p fn.
    void AddFileError(const std::string& fn, const std::string& details);
    void AddFileError(const std::string& fn, std::string&& details);

    /// Record an error for read group \p rg.
    void AddReadGroupError(const std::string& rg, const std::string& details);
    void AddReadGroupError(const std::string& rg, std::string&& details);

    /// Record an error for the record called \p name.
    void AddRecordError(const std::string& name, const std::string& details);
    void AddRecordError(const std::string& name, std::string&& details);

    /// Record a tag-length mismatch (observed vs. expected) for the record
    /// called \p name.
    void AddTagLengthError(const std::string& name,
                           const std::string& tagLabel,
                           const std::string& tagName,
                           const size_t observed,
                           const size_t expected);
    void AddTagLengthError(const std::string& name,
                           std::string&& tagLabel,
                           std::string&& tagName,
                           const size_t observed,
                           const size_t expected);

public:
    /// \returns true if no errors have been recorded
    bool IsEmpty(void) const;

    /// Throws the recorded errors as a ValidationException.
    void ThrowErrors(void);

private:
    size_t maxNumErrors_;
    size_t currentNumErrors_;
    ErrorMap fileErrors_;
    ErrorMap readGroupErrors_;
    ErrorMap recordErrors_;

private:
    void OnErrorAdded(void);
};

} // namespace internal
} // namespace BAM
} // namespace PacBio
+// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file ValidationException.cpp +/// \brief Implements the ValidationException class. 
+// +// Author: Derek Barnett + +#include "pbbam/exception/ValidationException.h" +using namespace PacBio; +using namespace PacBio::BAM; + +ValidationException::ValidationException(const ErrorMap& fileErrors, + const ErrorMap& readGroupErrors, + const ErrorMap& recordErrors) + : std::runtime_error("") + , fileErrors_(fileErrors) + , readGroupErrors_(readGroupErrors) + , recordErrors_(recordErrors) +{ + FormatMessage(); +} + +ValidationException::ValidationException(ErrorMap&& fileErrors, + ErrorMap&& readGroupErrors, + ErrorMap&& recordErrors) + : std::runtime_error("") + , fileErrors_(std::move(fileErrors)) + , readGroupErrors_(std::move(readGroupErrors)) + , recordErrors_(std::move(recordErrors)) +{ + FormatMessage(); +} + +const ValidationException::ErrorMap& ValidationException::FileErrors(void) const +{ return fileErrors_; } + +const ValidationException::ErrorMap& ValidationException::ReadGroupErrors(void) const +{ return readGroupErrors_; } + +const ValidationException::ErrorMap& ValidationException::RecordErrors(void) const +{ return recordErrors_; } + +const char* ValidationException::what(void) const noexcept +{ return msg_.c_str(); } + +void ValidationException::FormatMessage(void) +{ + std::stringstream s; + s << "Validation failed: " << std::endl; + + // file errors + if (!fileErrors_.empty()) { + auto fileIter = fileErrors_.cbegin(); + auto fileEnd = fileErrors_.cend(); + for ( ; fileIter != fileEnd; ++fileIter) { + s << " In file (" << fileIter->first << ") : " << std::endl; + const auto& errors = fileIter->second; + for (const auto& e : errors) + s << " " << e << std::endl; + } + } + + // read group errors + if (!readGroupErrors_.empty()) { + auto rgIter = readGroupErrors_.cbegin(); + auto rgEnd = readGroupErrors_.cend(); + for ( ; rgIter != rgEnd; ++rgIter) { + s << " In read group (" << rgIter->first << ") : " << std::endl; + const auto& errors = rgIter->second; + for (const auto& e : errors) + s << " " << e << std::endl; + } + } + + // record 
errors + if (!recordErrors_.empty()) { + auto recIter = recordErrors_.cbegin(); + auto recEnd = recordErrors_.cend(); + for ( ; recIter != recEnd; ++recIter) { + s << " In record (" << recIter->first << ") : " << std::endl; + const auto& errors = recIter->second; + for (const auto& e : errors) + s << " " << e << std::endl; + } + } + + msg_ = s.str(); +} diff --git a/src/Validator.cpp b/src/Validator.cpp new file mode 100644 index 0000000..158f466 --- /dev/null +++ b/src/Validator.cpp @@ -0,0 +1,470 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file Validator.cpp +/// \brief Implements the Validator class. +// +// Author: Derek Barnett + +#include "pbbam/Validator.h" + +#include "pbbam/BamFile.h" +#include "pbbam/BamHeader.h" +#include "pbbam/BamRecord.h" +#include "pbbam/EntireFileQuery.h" +#include "pbbam/ReadGroupInfo.h" +#include "ValidationErrors.h" +#include "Version.h" +#include +#include +#include +#include +#include +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace PacBio::BAM::internal; +using namespace std; + +namespace PacBio { +namespace BAM { +namespace internal { + +struct ilexcompare_wrapper { + bool operator()(const string& lhs, const string& rhs) const + { return boost::ilexicographical_compare(lhs, rhs); } +}; + +static const set AcceptedSortOrders = { + "unknown", + "unsorted", + "queryname", + "coordinate" +}; + +static const set AcceptedReadTypes = { + "POLYMERASE", + "HQREGION", + "SUBREAD", + "CCS", + "SCRAP", + "UNKNOWN" +}; + +static +void ValidateReadGroup(const ReadGroupInfo& rg, + unique_ptr& errors) +{ + const string& id = rg.Id(); + + // has required fields + if (id.empty()) + errors->AddReadGroupError(id, "missing ID"); + if (rg.MovieName().empty()) + errors->AddReadGroupError(id, "missing movie name (PU tag)"); + // 3.0.2 adds required RG:PM - do not check for now, we'll add version-aware + // validation down the road + + // description tag has required 
components + if (rg.ReadType().empty()) + errors->AddReadGroupError(id, "missing READTYPE in description"); + if (rg.BindingKit().empty()) + errors->AddReadGroupError(id, "missing BINDINGKIT in description"); + if (rg.SequencingKit().empty()) + errors->AddReadGroupError(id, "missing SEQUENCINGKIT in description"); + if (rg.BasecallerVersion().empty()) + errors->AddReadGroupError(id, "missing BASECALLERVERSION in description"); + if (rg.FrameRateHz().empty()) + errors->AddReadGroupError(id, "missing FRAMERATEHZ in description"); + + // stored ID matches expected ID (as calculated from movie & type) + if (!id.empty()) { + const auto expectedId = MakeReadGroupId(rg.MovieName(), rg.ReadType()); + if (expectedId != id) { + const string msg = "stored ID: " + id + + " does not match computed ID: " + expectedId; + errors->AddReadGroupError(id, std::move(msg)); + } + } + + // valid read type + if (!rg.ReadType().empty()) { + if (internal::AcceptedReadTypes.find(rg.ReadType()) == internal::AcceptedReadTypes.cend()) + errors->AddReadGroupError(id, "read type: " + rg.ReadType() + " is unknown"); + } + + // valid read chemistry (binding, sequencing, chemistry) + if (!rg.BindingKit().empty() && + !rg.SequencingKit().empty() && + !rg.BasecallerVersion().empty()) + { + try { + auto chem = rg.SequencingChemistry(); + (void)chem; + } catch (std::exception& e) { + errors->AddReadGroupError(id, e.what()); + } + } + + // frame rate convertable to floating point + if (!rg.FrameRateHz().empty()) { + try { + const float frameRate = stof(rg.FrameRateHz()); + (void)frameRate; + } catch (std::exception& e) { + errors->AddReadGroupError(id, e.what()); + } + } +} + +static +void ValidateHeader(const BamHeader& header, + const string& filename, + unique_ptr& errors) +{ + const string& fn = filename; + + // SAM/BAM version + try { + Version v(header.Version()); + (void)v; + } catch (std::exception& e) { + errors->AddFileError(fn, string("SAM version (@HD:VN) failed: ") + e.what()); + } + + // 
sort order + const string sortOrder = header.SortOrder(); + if (AcceptedSortOrders.find(sortOrder) == AcceptedSortOrders.end()) + errors->AddFileError(fn, string("unknown sort order: ") + sortOrder); + + // PacBio version + try { + const Version v(header.PacBioBamVersion()); + const Version minimum(3,0,1); + if (v < minimum) { + string msg = "PacBioBAM version (@HD:pb) "; + msg += v.ToString(); + msg += string{ " is older than the minimum supported version " }; + msg += ( "(" + minimum.ToString() + ")" ); + errors->AddFileError(fn, std::move(msg)); + } + } catch (std::exception& e) { + errors->AddFileError(fn, string("PacBioBAM version (@HD:pb) failed to parse: ") + e.what()); + } + + // sequences? + + // read groups + for (const ReadGroupInfo& rg : header.ReadGroups() ) + ValidateReadGroup(rg, errors); +} + +static +void ValidateMetadata(const BamFile& file, + unique_ptr& errors) +{ + // filename + const string fn = file.Filename(); + if (fn == "-") { + errors->AddFileError(fn, "validation not is available for streamed BAM. Please " + "write to a file and run validation on it."); + errors->ThrowErrors(); // quit early + } + if (boost::algorithm::ends_with(fn, ".bam") || + boost::algorithm::ends_with(fn, ".bam.tmp")) + { + errors->AddFileError(fn, "non-standard file extension"); + } + + // EOF + if (!file.HasEOF()) + errors->AddFileError(fn, "missing end-of-file marker"); + + // has PBI + if (!file.PacBioIndexExists()) + errors->AddFileError(fn, "missing PBI file"); + + // header + ValidateHeader(file.Header(), file.Filename(), errors); +} + +void ValidateMappedRecord(const BamRecord& b, + unique_ptr& errors) +{ + const string& name = b.FullName(); + if (b.ReferenceStart() < 0) + errors->AddRecordError(name, "mapped record position is invalid"); + if (b.ReferenceId() < 0) + errors->AddRecordError(name, "mapped record reference ID is invalid"); + + // what else?? 
+} + +void ValidateRecordCore(const BamRecord& b, + unique_ptr& errors) +{ + const string& name = b.FullName(); + + if (b.Type() != RecordType::CCS) { + const auto qStart = b.QueryStart(); + const auto qEnd = b.QueryEnd(); + if (qStart >= qEnd) + errors->AddRecordError(name, "queryStart (qs) should be < queryEnd (qe)"); + } +} + +void ValidateRecordReadGroup(const BamRecord& b, + unique_ptr& errors) +{ + try { + auto rg = b.ReadGroup(); + (void)rg; + } catch (std::exception& e) { + errors->AddRecordError(b.FullName(), e.what()); + } +} + +void ValidateRecordRequiredTags(const BamRecord& b, + unique_ptr& errors) +{ + const string& name = b.FullName(); + + if (b.Type() != RecordType::CCS) { + + // qe/qs + const bool hasQueryStart = b.HasQueryStart(); + const bool hasQueryEnd = b.HasQueryEnd(); + if (hasQueryStart && hasQueryEnd) { + const auto qStart = b.QueryStart(); + const auto qEnd = b.QueryEnd(); + if (qStart >= qEnd) + errors->AddRecordError(name, "queryStart (qs) should be < queryEnd (qe)"); + } else { + if (!hasQueryStart) + errors->AddRecordError(name, "missing tag: qs (queryStart)"); + if (!hasQueryEnd) + errors->AddRecordError(name, "missing tag: qe (queryEnd)"); + } + } + + // zm + if (!b.HasHoleNumber()) + errors->AddRecordError(name, "missing tag: zm (ZMW hole number)"); + + // np + if (!b.HasNumPasses()) + errors->AddRecordError(name, "missing tag: np (num passes)"); + else { + const auto numPasses = b.NumPasses(); + if (b.Type() != RecordType::CCS && numPasses != 1) + errors->AddRecordError(name, "np (numPasses) tag for non-CCS records should be 1"); + } + + // rq + if (!b.HasReadAccuracy()) + errors->AddRecordError(name, "missing tag: rq (read accuracy)"); + + // sn + if (!b.HasSignalToNoise()) + errors->AddRecordError(name, "missing tag: sn (signal-to-noise ratio)"); +} + +void ValidateRecordTagLengths(const BamRecord& b, + unique_ptr& errors) +{ + const string& name = b.FullName(); + const size_t expectedLength = (b.Type() == RecordType::CCS ? 
b.Sequence().size() + : (b.QueryEnd() - b.QueryStart())); + + // check "per-base"-type data lengths are compatible + if (b.Sequence().size() != expectedLength) + errors->AddRecordError(name, "sequence length does not match expected length"); + + if (b.HasDeletionQV()) { + if (b.DeletionQV().size() != expectedLength) + errors->AddTagLengthError(name, "DeletionQV", "dq", b.DeletionQV().size(), expectedLength); + } + if (b.HasDeletionTag()) { + if (b.DeletionTag().size() != expectedLength) + errors->AddTagLengthError(name, "DeletionTag", "dt", b.DeletionTag().size(), expectedLength); + } + if (b.HasInsertionQV()) { + if (b.InsertionQV().size() != expectedLength) + errors->AddTagLengthError(name, "InsertionQV", "iq", b.InsertionQV().size(), expectedLength); + } + if (b.HasMergeQV()) { + if (b.MergeQV().size() != expectedLength) + errors->AddTagLengthError(name, "MergeQV", "mq", b.MergeQV().size(), expectedLength); + } + if (b.HasSubstitutionQV()) { + if (b.SubstitutionQV().size() != expectedLength) + errors->AddTagLengthError(name, "SubstitutionQV", "sq", b.SubstitutionQV().size(), expectedLength); + } + if (b.HasSubstitutionTag()) { + if (b.SubstitutionTag().size() != expectedLength) + errors->AddTagLengthError(name, "SubstitutionTag", "st", b.SubstitutionTag().size(), expectedLength); + } + if (b.HasIPD()) { + if (b.IPD().size() != expectedLength) + errors->AddTagLengthError(name, "IPD", "ip", b.IPD().size(), expectedLength); + } + + // NOTE: disabling "internal" tag checks for now, only checking "standard" + // PacBioBAM tags + +// if (b.HasAltLabelQV()) { +// if (b.AltLabelQV().size() != expectedLength) +// errors->AddTagLengthError(name, "AltLabelQV", "pv", b.AltLabelQV().size(), expectedLength); +// } +// if (b.HasAltLabelTag()) { +// if (b.AltLabelTag().size() != expectedLength) +// errors->AddTagLengthError(name, "AltLabelTag", "pt", b.AltLabelTag().size(), expectedLength); +// } +// if (b.HasLabelQV()) { +// if (b.LabelQV().size() != expectedLength) +// 
errors->AddTagLengthError(name, "LabelQV", "pq", b.LabelQV().size(), expectedLength); +// } +// if (b.HasPkmean()) { +// if (b.Pkmean().size() != expectedLength) +// errors->AddTagLengthError(name, "Pkmean", "pa", b.Pkmean().size(), expectedLength); +// } +// if (b.HasPkmean2()) { +// if (b.Pkmean2().size() != expectedLength) +// errors->AddTagLengthError(name, "Pkmean2", "ps", b.Pkmean2().size(), expectedLength); +// } +// if (b.HasPkmid()) { +// if (b.Pkmid().size() != expectedLength) +// errors->AddTagLengthError(name, "Pkmid", "pm", b.Pkmid().size(), expectedLength); +// } +// if (b.HasPkmid2()) { +// if (b.Pkmid2().size() != expectedLength) +// errors->AddTagLengthError(name, "Pkmid2", "pi", b.Pkmid2().size(), expectedLength); +// } +// if (b.HasPrePulseFrames()) { +// if (b.PrePulseFrames().size() != expectedLength) +// errors->AddTagLengthError(name, "PrePulseFrames", "pd", b.PrePulseFrames().size(), expectedLength); +// } +// if (b.HasPulseCall()) { +// if (b.PulseCall().size() != expectedLength) +// errors->AddTagLengthError(name, "PulseCall", "pc", b.PulseCall().size(), expectedLength); +// } +// if (b.HasPulseCallWidth()) { +// if (b.PulseCallWidth().size() != expectedLength) +// errors->AddTagLengthError(name, "PulseCallWidth", "px", b.PulseCallWidth().size(), expectedLength); +// } +// if (b.HasPulseMergeQV()) { +// if (b.PulseMergeQV().size() != expectedLength) +// errors->AddTagLengthError(name, "PulseMergeQV", "pg", b.PulseMergeQV().size(), expectedLength); +// } +// if (b.HasPulseWidth()) { +// if (b.PulseWidth().size() != expectedLength) +// errors->AddTagLengthError(name, "PulseWidth", "pw", b.PulseWidth().size(), expectedLength); +// } +} + +void ValidateUnmappedRecord(const BamRecord& b, + unique_ptr& errors) +{ + const string& name = b.FullName(); + if (b.ReferenceStart() != -1) + errors->AddRecordError(name, "unmapped record has a position"); + if (b.ReferenceId() != -1) + errors->AddRecordError(name, "unmapped record has a reference ID"); +} 
+ +static +void ValidateRecord(const BamRecord& b, + unique_ptr& errors) +{ + ValidateRecordCore(b, errors); + ValidateRecordReadGroup(b, errors); + ValidateRecordRequiredTags(b, errors); + ValidateRecordTagLengths(b, errors); + if (b.IsMapped()) + ValidateMappedRecord(b, errors); + else + ValidateUnmappedRecord(b, errors); +} + +} // namespace internal +} // namespace BAM +} // namespace PacBio + +void Validator::Validate(const BamHeader& header, const size_t maxErrors) +{ + unique_ptr errors{ new ValidationErrors(maxErrors) }; + internal::ValidateHeader(header, "unknown", errors); + if (!errors->IsEmpty()) + errors->ThrowErrors(); +} + +void Validator::Validate(const ReadGroupInfo& rg, const size_t maxErrors) +{ + unique_ptr errors{ new ValidationErrors(maxErrors) }; + internal::ValidateReadGroup(rg, errors); + if (!errors->IsEmpty()) + errors->ThrowErrors(); +} + +void Validator::Validate(const BamRecord& b, const size_t maxErrors) +{ + unique_ptr errors{ new ValidationErrors(maxErrors) }; + internal::ValidateRecord(b, errors); + if (!errors->IsEmpty()) + errors->ThrowErrors(); +} + +void Validator::ValidateEntireFile(const BamFile& file, const size_t maxErrors) +{ + unique_ptr errors{ new ValidationErrors(maxErrors) }; + internal::ValidateMetadata(file, errors); + + EntireFileQuery query(file); + for (const BamRecord& record : query) + internal::ValidateRecord(record, errors); + + if (!errors->IsEmpty()) + errors->ThrowErrors(); +} + +void Validator::ValidateFileMetadata(const BamFile& file, const size_t maxErrors) +{ + unique_ptr errors{ new ValidationErrors(maxErrors) }; + internal::ValidateMetadata(file, errors); + if (!errors->IsEmpty()) + errors->ThrowErrors(); +} diff --git a/src/Version.cpp b/src/Version.cpp new file mode 100644 index 0000000..b9089e3 --- /dev/null +++ b/src/Version.cpp @@ -0,0 +1,87 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file Version.cpp +/// \brief Implements the Version class. 
+// +// Author: Derek Barnett + +#include "Version.h" +#include "SequenceUtils.h" +#include + +using namespace PacBio; +using namespace PacBio::BAM; +using namespace PacBio::BAM::internal; +using namespace std; + +const Version Version::Current = Version(3,0,3); +const Version Version::Minimum = Version(3,0,1); + +// string must be ".." +Version::Version(const std::string& v) + : major_(0) + , minor_(0) + , revision_(0) +{ + // parse string + try { + const auto fields = internal::Split(v, '.'); + const auto numFields = fields.size(); + if (numFields == 0) + throw std::runtime_error("invalid version number - empty string"); + major_ = std::stoi(fields.at(0)); + if (numFields > 1) { + minor_ = std::stoi(fields.at(1)); + if (numFields > 2 ) + revision_ = std::stoi(fields.at(2)); + } + } catch (std::exception&) { + auto msg = string{ "invalid version number (" + v + "): failed to parse" }; + throw std::runtime_error(msg); + } + + // ensure valid numbers + Check(); +} + +std::string Version::ToString(void) const +{ + std::stringstream s; + s << major_ << '.' << minor_ << '.' << revision_; + return s.str(); +} + diff --git a/src/Version.h b/src/Version.h new file mode 100644 index 0000000..70427c3 --- /dev/null +++ b/src/Version.h @@ -0,0 +1,209 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. 
namespace PacBio {
namespace BAM {
namespace internal {

// Three-part version number (major, minor, revision) with value semantics,
// full ordering, and string conversion. Components may not be negative: the
// three-int constructor and every setter run Check(), which throws
// std::runtime_error on a negative value.
class Version
{
public:
    static const Version Current;
    static const Version Minimum;

public:
    // 0.0.0
    constexpr Version(void);

    Version(int major, int minor, int revision);

    // builds a version from its string form (defined out of line)
    Version(const std::string& v);

    Version(const Version& other) = default;
    Version(Version&& other) = default;
    Version& operator=(const Version&) = default;
    Version& operator=(Version&&) = default;
    ~Version(void) = default;

public:
    bool operator==(const Version& other) const;
    bool operator!=(const Version& other) const;
    bool operator<(const Version& other) const;
    bool operator<=(const Version& other) const;
    bool operator>(const Version& other) const;
    bool operator>=(const Version& other) const;

public:
    std::string ToString(void) const;
    operator std::string(void) const;

public:
    // getters
    int Major(void) const;
    int Minor(void) const;
    int Revision(void) const;

public:
    // setters; each returns *this so calls can be chained
    Version& Major(int major);
    Version& Minor(int minor);
    Version& Revision(int revision);

private:
    int major_;
    int minor_;
    int revision_;

private:
    void Check(void) const;
};

// Streams the ToString() form of `version`.
inline std::ostream& operator<<(std::ostream& out, const Version& version)
{
    return out << version.ToString();
}

inline constexpr Version::Version(void)
    : major_(0)
    , minor_(0)
    , revision_(0)
{ }

inline Version::Version(int major, int minor, int revision)
    : major_(major)
    , minor_(minor)
    , revision_(revision)
{
    Check();
}

inline bool Version::operator==(const Version& other) const
{
    if (major_ != other.major_)
        return false;
    if (minor_ != other.minor_)
        return false;
    return revision_ == other.revision_;
}

inline bool Version::operator!=(const Version& other) const
{
    return !operator==(other);
}

// Lexicographic comparison: the first differing component (major, then
// minor, then revision) decides.
inline bool Version::operator<(const Version& other) const
{
    if (major_ != other.major_)
        return major_ < other.major_;
    if (minor_ != other.minor_)
        return minor_ < other.minor_;
    return revision_ < other.revision_;
}

inline bool Version::operator<=(const Version& other) const
{
    return !(other < *this);
}

inline bool Version::operator>(const Version& other) const
{
    return other < *this;
}

inline bool Version::operator>=(const Version& other) const
{
    return !(*this < other);
}

inline Version::operator std::string(void) const
{
    return ToString();
}

// Throws if any component is negative.
inline void Version::Check(void) const
{
    const bool anyNegative = (major_ < 0) || (minor_ < 0) || (revision_ < 0);
    if (anyNegative)
        throw std::runtime_error("version cannot contain negative numbers");
}

inline int Version::Major(void) const
{
    return major_;
}

inline Version& Version::Major(int major)
{
    major_ = major;
    Check();
    return *this;
}

inline int Version::Minor(void) const
{
    return minor_;
}

inline Version& Version::Minor(int minor)
{
    minor_ = minor;
    Check();
    return *this;
}

inline int Version::Revision(void) const
{
    return revision_;
}

inline Version& Version::Revision(int revision)
{
    revision_ = revision;
    Check();
    return *this;
}

} // namespace internal
} // namespace BAM
} // namespace PacBio
+// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file VirtualRegionTypeMap.cpp +/// \brief Implements the VirtualRegionTypeMap class. 
+// +// Author: Armin Töpfer + +#include "pbbam/virtual/VirtualRegionTypeMap.h" + +using namespace PacBio; +using namespace PacBio::BAM; + +std::map VirtualRegionTypeMap::ParseChar +{ + { 'A' , VirtualRegionType::ADAPTER }, + { 'B' , VirtualRegionType::BARCODE }, + { 'H' , VirtualRegionType::HQREGION }, + { 'F' , VirtualRegionType::FILTERED }, + { 'L' , VirtualRegionType::LQREGION } +}; diff --git a/src/VirtualZmwBamRecord.cpp b/src/VirtualZmwBamRecord.cpp new file mode 100644 index 0000000..d494f89 --- /dev/null +++ b/src/VirtualZmwBamRecord.cpp @@ -0,0 +1,399 @@ +// Copyright (c) 2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. 
/// \brief Appends content of src vector to dst vector using move semantics.
///
/// If dst is empty it simply takes over src's contents; otherwise the
/// elements of src are moved one by one onto the end of dst.
///
/// \param[in,out] src  Input vector that will be empty after execution
/// \param[in,out] dst  Output vector that will be appended to
///
/// \note Declared noexcept: a failed allocation while growing dst terminates
///       the program instead of propagating.
///
template <typename T>
inline void MoveAppend(std::vector<T>& src, std::vector<T>& dst) noexcept
{
    if (dst.empty())
    {
        dst = std::move(src);
    }
    else
    {
        dst.reserve(dst.size() + src.size());
        std::move(src.begin(), src.end(), std::back_inserter(dst));
    }

    // A moved-from vector is only guaranteed to be in a valid state, not to
    // be empty; clear explicitly so the documented postcondition holds on
    // both paths.
    src.clear();
}
+/// +/// \param[in] src Input vector via perfect forwarding +/// \param[in,out] dst Output vector that will be appended to +/// +template +inline void MoveAppend(std::vector&& src, std::vector& dst) noexcept +{ + if (dst.empty()) + { + dst = std::move(src); + } + else + { + dst.reserve(dst.size() + src.size()); + std::move(src.begin(), src.end(), std::back_inserter(dst)); + src.clear(); + } +} + +} // namespace internal +} // namespace BAM +} // namespace PacBio + +VirtualZmwBamRecord::VirtualZmwBamRecord( + std::vector&& unorderedSources, const BamHeader& header) + : BamRecord(header) + , sources_(std::move(unorderedSources)) +{ + // Sort sources by queryStart + std::sort(sources_.begin(), sources_.end(), + [](const BamRecord& l1, const BamRecord& l2) + { return l1.QueryStart() < l2.QueryStart(); }); + StitchSources(); +} + +bool VirtualZmwBamRecord::HasVirtualRegionType(const VirtualRegionType regionType) const +{ return virtualRegionsMap_.find(regionType) != virtualRegionsMap_.end(); } + +Frames VirtualZmwBamRecord::IPDV1Frames(Orientation orientation) const +{ + const auto rawFrames = this->IPDRaw(orientation); + const std::vector rawData(rawFrames.Data().begin(), rawFrames.Data().end()); + return Frames::Decode(rawData); +} + + +void VirtualZmwBamRecord::StitchSources(void) +{ + const auto& firstRecord = sources_[0]; + const auto& lastRecord = sources_[sources_.size() - 1]; + + std::string sequence; + std::string deletionTag; + std::string substitutionTag; + std::string alternativeLabelTag; + std::string pulseCall; + + QualityValues qualities; + QualityValues deletionQv; + QualityValues insertionQv; + QualityValues mergeQv; + QualityValues pulseMergeQv; + QualityValues substitutionQv; + QualityValues labelQv; + QualityValues alternativeLabelQv; + + Frames ipd; + Frames pw; + Frames pd; + Frames px; + std::vector pa; + std::vector pm; + std::vector sf; + + // initialize capacity + const auto stitchedSize = lastRecord.QueryEnd() - firstRecord.QueryStart(); + 
sequence.reserve(stitchedSize); + deletionTag.reserve(stitchedSize); + substitutionTag.reserve(stitchedSize); + alternativeLabelTag.reserve(stitchedSize); + pulseCall.reserve(stitchedSize); + qualities.reserve(stitchedSize); + deletionQv.reserve(stitchedSize); + insertionQv.reserve(stitchedSize); + mergeQv.reserve(stitchedSize); + pulseMergeQv.reserve(stitchedSize); + substitutionQv.reserve(stitchedSize); + labelQv.reserve(stitchedSize); + alternativeLabelQv.reserve(stitchedSize); + ipd.DataRaw().reserve(stitchedSize); + pw.DataRaw().reserve(stitchedSize); + pd.DataRaw().reserve(stitchedSize); + px.DataRaw().reserve(stitchedSize); + pa.reserve(stitchedSize); + pm.reserve(stitchedSize); + sf.reserve(stitchedSize); + + // Stitch using tmp vars + for(auto& b : sources_) + { + sequence.append(b.Sequence()); + + MoveAppend(b.Qualities(), qualities); + + if (b.HasDeletionQV()) + MoveAppend(std::move(b.DeletionQV()), deletionQv); + + if (b.HasInsertionQV()) + MoveAppend(std::move(b.InsertionQV()), insertionQv); + + if (b.HasMergeQV()) + MoveAppend(std::move(b.MergeQV()), mergeQv); + + if (b.HasPulseMergeQV()) + MoveAppend(std::move(b.PulseMergeQV()), pulseMergeQv); + + if (b.HasSubstitutionQV()) + MoveAppend(std::move(b.SubstitutionQV()), substitutionQv); + + if (b.HasLabelQV()) + MoveAppend(std::move(b.LabelQV()), labelQv); + + if (b.HasAltLabelQV()) + MoveAppend(std::move(b.AltLabelQV()), alternativeLabelQv); + + if (b.HasDeletionTag()) + deletionTag.append(std::move(b.DeletionTag())); + + if (b.HasSubstitutionTag()) + substitutionTag.append(std::move(b.SubstitutionTag())); + + if (b.HasAltLabelTag()) + alternativeLabelTag.append(std::move(b.AltLabelTag())); + + if (b.HasPulseCall()) + pulseCall.append(std::move(b.PulseCall())); + + if (b.HasIPD()) + MoveAppend(b.IPDRaw().DataRaw(), ipd.DataRaw()); + + if (b.HasPulseWidth()) + MoveAppend(b.PulseWidthRaw().DataRaw(), pw.DataRaw()); + + if (b.HasPulseCallWidth()) + MoveAppend(b.PulseCallWidth().DataRaw(), px.DataRaw()); + 
+ if (b.HasPrePulseFrames()) + MoveAppend(b.PrePulseFrames().DataRaw(), pd.DataRaw()); + + if (b.HasPkmid()) + MoveAppend(b.Pkmid(), pm); + + if (b.HasPkmean()) + MoveAppend(b.Pkmean(), pa); + + if (b.HasPkmid2()) + MoveAppend(b.Pkmid2(), pm); + + if (b.HasPkmean2()) + MoveAppend(b.Pkmean2(), pa); + + if (b.HasStartFrame()) + MoveAppend(b.StartFrame(), sf); + + if (b.HasScrapRegionType()) + { + const VirtualRegionType regionType = b.ScrapRegionType(); + + if (!HasVirtualRegionType(regionType)) + virtualRegionsMap_[regionType] = std::vector(); + + virtualRegionsMap_[regionType].emplace_back( + regionType, b.QueryStart(), b.QueryEnd()); + } + + if (b.HasLocalContextFlags()) + { + std::pair barcodes{-1, -1}; + if (b.HasBarcodes()) + barcodes = b.Barcodes(); + + constexpr auto regionType = VirtualRegionType::SUBREAD; + if (!HasVirtualRegionType(regionType)) + virtualRegionsMap_[regionType] = std::vector(); + + virtualRegionsMap_[regionType].emplace_back( + regionType, b.QueryStart(), b.QueryEnd(), b.LocalContextFlags(), + barcodes.first, barcodes.second); + } + + if (b.HasBarcodes() && !this->HasBarcodes()) + this->Barcodes(b.Barcodes()); + + if (b.HasBarcodeQuality() && !this->HasBarcodeQuality()) + this->BarcodeQuality(b.BarcodeQuality()); + + if (b.HasReadAccuracy() && !this->HasReadAccuracy()) + this->ReadAccuracy(b.ReadAccuracy()); + + if (b.HasScrapZmwType()) + { + if (!this->HasScrapZmwType()) + this->ScrapZmwType(b.ScrapZmwType()); + else if (this->ScrapZmwType() != b.ScrapZmwType()) + throw std::runtime_error("ScrapZmwTypes do not match"); + } + } + + // ReadGroup + this->ReadGroup(this->header_.ReadGroups()[0]); + + this->NumPasses(1); + + // All records should contain the same SNR and hole number + if (firstRecord.HasSignalToNoise()) + this->SignalToNoise(firstRecord.SignalToNoise()); + this->HoleNumber(firstRecord.HoleNumber()); + + // QueryStart + this->QueryStart(firstRecord.QueryStart()); + this->QueryEnd(lastRecord.QueryEnd()); + this->UpdateName(); + + 
std::string qualitiesStr = qualities.Fastq(); + if (sequence.size() == qualitiesStr.size()) + this->Impl().SetSequenceAndQualities(sequence, qualitiesStr); + else + this->Impl().SetSequenceAndQualities(sequence); + + // Tags as strings + if (!deletionTag.empty()) + this->DeletionTag(deletionTag); + if (!substitutionTag.empty()) + this->SubstitutionTag(substitutionTag); + if (!alternativeLabelTag.empty()) + this->AltLabelTag(alternativeLabelTag); + if (!pulseCall.empty()) + this->PulseCall(pulseCall); + + // QVs + if (!deletionQv.empty()) + this->DeletionQV(deletionQv); + if (!insertionQv.empty()) + this->InsertionQV(insertionQv); + if (!mergeQv.empty()) + this->MergeQV(mergeQv); + if (!pulseMergeQv.empty()) + this->PulseMergeQV(pulseMergeQv); + if (!substitutionQv.empty()) + this->SubstitutionQV(substitutionQv); + if (!labelQv.empty()) + this->LabelQV(labelQv); + if (!alternativeLabelQv.empty()) + this->AltLabelQV(alternativeLabelQv); + + // 16 bit arrays + if (!ipd.Data().empty()) + this->IPD(ipd, FrameEncodingType::LOSSLESS); + if (!pw.Data().empty()) + this->PulseWidth(pw, FrameEncodingType::LOSSLESS); + if (!pa.empty()) + this->Pkmean(pa); + if (!pm.empty()) + this->Pkmid(pm); + if (!pd.Data().empty()) + this->PrePulseFrames(pd, FrameEncodingType::LOSSLESS); + if (!px.Data().empty()) + this->PulseCallWidth(px, FrameEncodingType::LOSSLESS); + + // 32 bit arrays + if (!sf.empty()) + this->StartFrame(sf); + + // Determine HQREGION based on LQREGIONs + if (HasVirtualRegionType(VirtualRegionType::LQREGION)) + { + if (virtualRegionsMap_[VirtualRegionType::LQREGION].size() == 1) + { + const auto lq = virtualRegionsMap_[VirtualRegionType::LQREGION][0]; + if (lq.beginPos == 0) + virtualRegionsMap_[VirtualRegionType::HQREGION].emplace_back( + VirtualRegionType::HQREGION, lq.endPos, sequence.size()); + else if (lq.endPos == static_cast(sequence.size())) + virtualRegionsMap_[VirtualRegionType::HQREGION].emplace_back( + VirtualRegionType::HQREGION, 0, lq.beginPos); + else +
throw std::runtime_error("Unknown HQREGION"); + } + else + { + int beginPos = 0; + for (const auto& lqregion : virtualRegionsMap_[VirtualRegionType::LQREGION]) + { + if (lqregion.beginPos - beginPos > 0) + virtualRegionsMap_[VirtualRegionType::HQREGION].emplace_back( + VirtualRegionType::HQREGION, beginPos, lqregion.beginPos); + beginPos = lqregion.endPos; + } + } + } + else + { + virtualRegionsMap_[VirtualRegionType::HQREGION].emplace_back( + VirtualRegionType::HQREGION, 0, sequence.size()); + } +} + + +std::map> +VirtualZmwBamRecord::VirtualRegionsMap(void) const +{ return virtualRegionsMap_; } + +std::vector +VirtualZmwBamRecord::VirtualRegionsTable(const VirtualRegionType regionType) const +{ + const auto iter = virtualRegionsMap_.find(regionType); + if (iter != virtualRegionsMap_.cend()) + return iter->second; + return std::vector(); +} diff --git a/src/VirtualZmwCompositeReader.cpp b/src/VirtualZmwCompositeReader.cpp new file mode 100644 index 0000000..686aaae --- /dev/null +++ b/src/VirtualZmwCompositeReader.cpp @@ -0,0 +1,147 @@ +// Copyright (c) 2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. 
+// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file VirtualZmwCompositeReader.cpp +/// \brief Implements the VirtualZmwCompositeReader class. 
+// +// Author: Derek Barnett + +#include "VirtualZmwCompositeReader.h" +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace PacBio::BAM::internal; +using namespace std; + +VirtualZmwCompositeReader::VirtualZmwCompositeReader(const DataSet& dataset) + : currentReader_(nullptr) + , filter_(PbiFilter::FromDataSet(dataset)) +{ + // set up source queue + string primaryFn; + string scrapsFn; + const ExternalResources& resources = dataset.ExternalResources(); + for (const ExternalResource& resource : resources) { + + primaryFn.clear(); + scrapsFn.clear(); + + // if resource is a possible "primary" BAM + const auto& metatype = resource.MetaType(); + if (metatype == "PacBio.SubreadFile.SubreadBamFile" || + metatype == "PacBio.SubreadFile.HqRegionBamFile") + { + // resolve the (possibly relative) path via the dataset + primaryFn = dataset.ResolvePath(resource.ResourceId()); + + // check for associated scraps file + const ExternalResources& childResources = resource.ExternalResources(); + for (const ExternalResource& childResource : childResources) { + const auto& childMetatype = childResource.MetaType(); + if (childMetatype == "PacBio.SubreadFile.ScrapsBamFile" || + childMetatype == "PacBio.SubreadFile.HqScrapsBamFile") + { + // resolve the (possibly relative) path via the dataset + scrapsFn = dataset.ResolvePath(childResource.ResourceId()); + break; + } + } + } + + // queue up source for later, only if both a primary and a scraps file were found + if (!primaryFn.empty() && !scrapsFn.empty()) + sources_.push_back(make_pair(primaryFn,scrapsFn)); + } + + // open first available source + OpenNextReader(); +} + +bool VirtualZmwCompositeReader::HasNext(void) +{ + return (currentReader_ && currentReader_->HasNext()); +} + +VirtualZmwBamRecord VirtualZmwCompositeReader::Next(void) +{ + if (currentReader_) { + const auto result = currentReader_->Next(); + if (!currentReader_->HasNext()) + OpenNextReader(); + return result; + } + + // no reader active + const string msg = { "no readers active, make sure you use " + "VirtualZmwCompositeReader::HasNext
before " + "requesting next record" + }; + throw std::runtime_error(msg); +} + +vector VirtualZmwCompositeReader::NextRaw(void) +{ + if (currentReader_) { + const auto result = currentReader_->NextRaw(); + if (!currentReader_->HasNext()) + OpenNextReader(); + return result; + } + + // no reader active + const string msg = { "no readers active, make sure you use " + "VirtualZmwCompositeReader::HasNext before " + "requesting next group of records" + }; + throw std::runtime_error(msg); +} + +void VirtualZmwCompositeReader::OpenNextReader(void) +{ + currentReader_.reset(nullptr); + + // find next source pair with data + while(!sources_.empty()) { + const auto nextSource = sources_.front(); + sources_.pop_front(); + + currentReader_.reset(new VirtualZmwReader(nextSource.first, + nextSource.second, + filter_)); + if (currentReader_->HasNext()) + return; + } +} diff --git a/src/VirtualZmwCompositeReader.h b/src/VirtualZmwCompositeReader.h new file mode 100644 index 0000000..7c920da --- /dev/null +++ b/src/VirtualZmwCompositeReader.h @@ -0,0 +1,113 @@ +// Copyright (c) 2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. 
+// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file VirtualZmwCompositeReader.h +/// \brief Defines the VirtualZmwCompositeReader class. +// +// Author: Derek Barnett + +#ifndef VIRTUALZMWCOMPOSITEREADER_H +#define VIRTUALZMWCOMPOSITEREADER_H + +#include "pbbam/DataSet.h" +#include "pbbam/PbiFilter.h" +#include "VirtualZmwReader.h" +#include +#include +#include +#include + +namespace PacBio { +namespace BAM { +namespace internal { + +/// \brief The VirtualZmwCompositeReader provides an interface for +/// re-stitching "virtual" polymerase reads from their constituent parts, +/// across multiple %BAM resources from a DataSet. +/// +/// This class is essentially a DataSet-aware wrapper around +/// VirtualZmwReader, enabling multiple resources as input. See that +/// class's documentation for more info. 
+/// +class PBBAM_EXPORT VirtualZmwCompositeReader +{ +public: + /// \name Constructors & Related Methods + /// \{ + + VirtualZmwCompositeReader(const DataSet& dataset); + + VirtualZmwCompositeReader(void) = delete; + VirtualZmwCompositeReader(const VirtualZmwCompositeReader&) = delete; + VirtualZmwCompositeReader(VirtualZmwCompositeReader&&) = delete; + VirtualZmwCompositeReader& operator=(const VirtualZmwCompositeReader&) = delete; + VirtualZmwCompositeReader& operator=(VirtualZmwCompositeReader&&) = delete; + ~VirtualZmwCompositeReader(void) = default; + + /// \} + +public: + /// \name Stitched Record Reading + /// \{ + + /// \returns true if more ZMWs/files are available for reading. + bool HasNext(void); + + /// \returns the next stitched polymerase read + VirtualZmwBamRecord Next(void); + + /// \returns the next set of reads that belong to one ZMW from one %BAM + /// resource (a primary %BAM and/or its scraps file). This enables + /// stitching records in a distinct thread. + /// + std::vector NextRaw(void); + + /// \} + +private: + std::deque< std::pair > sources_; + std::unique_ptr currentReader_; + PbiFilter filter_; + +private: + void OpenNextReader(void); +}; + +} // namespace internal +} // namespace BAM +} // namespace PacBio + +#endif // VIRTUALZMWCOMPOSITEREADER_H diff --git a/src/VirtualZmwReader.cpp b/src/VirtualZmwReader.cpp new file mode 100644 index 0000000..239135d --- /dev/null +++ b/src/VirtualZmwReader.cpp @@ -0,0 +1,143 @@ +// Copyright (c) 2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer.
+// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file VirtualZmwReader.cpp +/// \brief Implements the VirtualZmwReader class. 
+// +// Author: Armin Töpfer + +#include + +#include "VirtualZmwReader.h" +#include "pbbam/ReadGroupInfo.h" + +using namespace PacBio; +using namespace PacBio::BAM; +using namespace PacBio::BAM::internal; +using namespace std; + +VirtualZmwReader::VirtualZmwReader(const std::string& primaryBamFilepath, + const std::string& scrapsBamFilepath) + : VirtualZmwReader(primaryBamFilepath, scrapsBamFilepath, PbiFilter{}) +{ } + +VirtualZmwReader::VirtualZmwReader(const std::string& primaryBamFilepath, + const std::string& scrapsBamFilepath, + const PbiFilter& filter) +{ + primaryBamFile_.reset(new BamFile{ primaryBamFilepath }); + scrapsBamFile_.reset(new BamFile{ scrapsBamFilepath }); + + if (filter.IsEmpty()) { + primaryQuery_.reset(new EntireFileQuery(*primaryBamFile_)); + scrapsQuery_.reset(new EntireFileQuery(*scrapsBamFile_)); + } + else { + primaryQuery_.reset(new PbiFilterQuery{ filter, *primaryBamFile_ }); + scrapsQuery_.reset(new PbiFilterQuery{ filter, *scrapsBamFile_ }); + } + + primaryIt_ = (primaryQuery_->begin()); + scrapsIt_ = (scrapsQuery_->begin()); + + stitchedHeader_.reset(new BamHeader{ primaryBamFile_->Header().ToSam() }); + + // update stitched read group in header + auto readGroups = stitchedHeader_->ReadGroups(); + if (readGroups.empty()) + throw std::runtime_error("Bam header of the primary bam has no read groups."); + readGroups[0].ReadType("POLYMERASE"); + readGroups[0].Id(readGroups[0].MovieName(), "POLYMERASE"); + if (readGroups.size() > 1) + { + std::vector singleGroup; + singleGroup.emplace_back(std::move(readGroups[0])); + readGroups = std::move(singleGroup); + stitchedHeader_->ClearReadGroups(); + } + stitchedHeader_->ReadGroups(readGroups); +} + +VirtualZmwReader::~VirtualZmwReader(void) { } + +bool VirtualZmwReader::HasNext(void) +{ + // Return true until both iterators are at the end of the query + return primaryIt_ != primaryQuery_->end() || + scrapsIt_ != scrapsQuery_->end(); +} + +// This method is not thread safe 
+VirtualZmwBamRecord VirtualZmwReader::Next(void) +{ return VirtualZmwBamRecord{ NextRaw(), *stitchedHeader_ }; } + +std::vector VirtualZmwReader::NextRaw(void) +{ + std::vector bamRecordVec; + + // Current hole number, the smallest of scraps and primary. + // It can be that the next ZMW is scrap only. + int currentHoleNumber; + if (primaryIt_ == primaryQuery_->end()) + currentHoleNumber = (*scrapsIt_).HoleNumber(); + else if (scrapsIt_ == scrapsQuery_->end()) + currentHoleNumber = (*primaryIt_).HoleNumber(); + else + currentHoleNumber = std::min((*primaryIt_).HoleNumber(), + (*scrapsIt_).HoleNumber()); + + // collect subreads or hqregions + while (primaryIt_ != primaryQuery_->end() && + currentHoleNumber == (*primaryIt_).HoleNumber()) + { + bamRecordVec.push_back(*primaryIt_++); + } + + // collect scraps + while (scrapsIt_ != scrapsQuery_->end() && + currentHoleNumber == (*scrapsIt_).HoleNumber()) + { + bamRecordVec.push_back(*scrapsIt_++); + } + + return bamRecordVec; +} + +BamHeader VirtualZmwReader::PrimaryHeader(void) const +{ return primaryBamFile_->Header(); } + +BamHeader VirtualZmwReader::ScrapsHeader(void) const +{ return scrapsBamFile_->Header(); } diff --git a/src/VirtualZmwReader.h b/src/VirtualZmwReader.h new file mode 100644 index 0000000..aaa9797 --- /dev/null +++ b/src/VirtualZmwReader.h @@ -0,0 +1,129 @@ +// Copyright (c) 2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file VirtualZmwReader.h +/// \brief Defines the VirtualZmwReader class. +// +// Author: Armin Töpfer + +#ifndef VIRTUALZMWREADER_H +#define VIRTUALZMWREADER_H + +#include + +#include "pbbam/BamFile.h" +#include "pbbam/BamRecord.h" +#include "pbbam/Config.h" +#include "pbbam/EntireFileQuery.h" +#include "pbbam/PbiFilter.h" +#include "pbbam/PbiFilterQuery.h" +#include "pbbam/virtual/VirtualZmwBamRecord.h" + +namespace PacBio { +namespace BAM { +namespace internal { + +class VirtualZmwReader +{ +public: + /// \brief Creates a reader that will operate on a primary %BAM file (e.g. 
+ /// subread data) and a scraps file, consuming all reads. + /// + /// \param[in] primaryBamFilepath hqregion.bam or subreads.bam file path + /// \param[in] scrapsBamFilepath scraps.bam file path + /// + VirtualZmwReader(const std::string& primaryBamFilepath, + const std::string& scrapsBamFilepath); + + /// \brief Creates a reader that will operate on a primary %BAM file (e.g. + /// subread data) and a scraps file, respecting the provided PBI + /// filter. + /// + /// \note All %BAM files must have a corresponding ".pbi" index file to use + /// the filter. You may need to call BamFile::EnsurePacBioIndexExists + /// before constructing the reader. + /// + /// \param[in] primaryBamFilepath hqregion.bam or subreads.bam file path + /// \param[in] scrapsBamFilepath scraps.bam file path + /// \param[in] filter PBI filter criteria + /// + VirtualZmwReader(const std::string& primaryBamFilepath, + const std::string& scrapsBamFilepath, + const PbiFilter& filter); + + VirtualZmwReader(void) = delete; + VirtualZmwReader(const VirtualZmwReader&) = delete; + VirtualZmwReader(VirtualZmwReader&&) = delete; + VirtualZmwReader& operator=(const VirtualZmwReader&) = delete; + VirtualZmwReader& operator=(VirtualZmwReader&&) = delete; + ~VirtualZmwReader(void); + +public: + + /// \returns the BamHeader associated with this reader's "primary" %BAM file + BamHeader PrimaryHeader(void) const; + + /// \returns the BamHeader associated with this reader's "scraps" %BAM file + BamHeader ScrapsHeader(void) const; + +public: + + /// \returns true if more ZMWs are available for reading. + bool HasNext(void); + + /// \returns the next stitched polymerase read + VirtualZmwBamRecord Next(void); + + /// \returns the next set of reads that belong to one ZMW. + /// This enables stitching records in a distinct thread. 
+ /// + std::vector NextRaw(void); + +private: + std::unique_ptr primaryBamFile_; + std::unique_ptr scrapsBamFile_; + std::unique_ptr primaryQuery_; + std::unique_ptr scrapsQuery_; + internal::IQuery::iterator primaryIt_; + internal::IQuery::iterator scrapsIt_; + std::unique_ptr stitchedHeader_; +}; + +} // namespace internal +} // namespace BAM +} // namespace PacBio + +#endif // VIRTUALZMWREADER_H diff --git a/src/WhitelistedZmwReadStitcher.cpp b/src/WhitelistedZmwReadStitcher.cpp new file mode 100644 index 0000000..621305b --- /dev/null +++ b/src/WhitelistedZmwReadStitcher.cpp @@ -0,0 +1,186 @@ +// Copyright (c) 2016, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED.
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file WhitelistedZmwReadStitcher.cpp +/// \brief Implements the WhitelistedZmwReadStitcher class. +// +// Author: Derek Barnett + +#include "pbbam/virtual/WhitelistedZmwReadStitcher.h" +#include "pbbam/PbiIndexedBamReader.h" +#include "VirtualZmwReader.h" +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace PacBio::BAM::internal; +using namespace std; + +namespace PacBio { +namespace BAM { + +struct WhitelistedZmwReadStitcher::WhitelistedZmwReadStitcherPrivate +{ +public: + WhitelistedZmwReadStitcherPrivate(const vector& zmwWhitelist, + const string& primaryBamFilePath, + const string& scrapsBamFilePath) + : primaryBamFile_(new BamFile{ primaryBamFilePath }) + , scrapsBamFile_(new BamFile{ scrapsBamFilePath }) + , primaryReader_(new PbiIndexedBamReader{ *primaryBamFile_ }) + , scrapsReader_(new PbiIndexedBamReader{ *scrapsBamFile_ }) + { + // setup new header for stitched data + polyHeader_ = unique_ptr(new BamHeader(primaryBamFile_->Header().ToSam())); + auto readGroups = polyHeader_->ReadGroups(); + if (readGroups.empty()) + throw runtime_error("Bam header of the primary bam has no read groups."); + readGroups[0].ReadType("POLYMERASE"); + readGroups[0].Id(readGroups[0].MovieName(), "POLYMERASE"); + if (readGroups.size() > 1) + { + vector singleGroup; + singleGroup.emplace_back(move(readGroups[0])); + readGroups = move(singleGroup); + polyHeader_->ClearReadGroups(); 
+ } + polyHeader_->ReadGroups(readGroups); + + // remove ZMWs up front, that are not found in either file + PreFilterZmws(zmwWhitelist); + } + + bool HasNext(void) const + { + return !zmwWhitelist_.empty(); + } + + VirtualZmwBamRecord Next(void) + { + auto bamRecordVec = NextRaw(); + VirtualZmwBamRecord stitched(move(bamRecordVec), *polyHeader_); + return stitched; + } + + vector NextRaw(void) + { + auto result = vector{ }; + if (!HasNext()) + return result; + + const auto& zmw = zmwWhitelist_.front(); + primaryReader_->Filter(PbiZmwFilter{zmw}); + scrapsReader_->Filter(PbiZmwFilter{zmw}); + + auto record = BamRecord{ }; + while (primaryReader_->GetNext(record)) + result.push_back(record); + while (scrapsReader_->GetNext(record)) + result.push_back(record); + + zmwWhitelist_.pop_front(); + return result; + } + + BamHeader PrimaryHeader(void) const + { return primaryBamFile_->Header(); } + + BamHeader ScrapsHeader(void) const + { return scrapsBamFile_->Header(); } + +private: + unique_ptr primaryBamFile_; + unique_ptr scrapsBamFile_; + unique_ptr primaryReader_; + unique_ptr scrapsReader_; + unique_ptr polyHeader_; + deque zmwWhitelist_; + +private: + void PreFilterZmws(const vector& zmwWhitelist) + { + // fetch input ZMWs + const PbiRawData primaryIndex(primaryBamFile_->PacBioIndexFilename()); + const PbiRawData scrapsIndex(scrapsBamFile_->PacBioIndexFilename()); + const auto& primaryZmws = primaryIndex.BasicData().holeNumber_; + const auto& scrapsZmws = scrapsIndex.BasicData().holeNumber_; + + // toss them all into a set (for uniqueness & lookup here soon) + set inputZmws; + for (const auto& zmw : primaryZmws) + inputZmws.insert(zmw); + for (const auto& zmw : scrapsZmws) + inputZmws.insert(zmw); + + // check our requested whitelist against files' ZMWs, keep if found + const auto inputEnd = inputZmws.cend(); + for (const int32_t zmw : zmwWhitelist) { + if (inputZmws.find(zmw) != inputEnd) + zmwWhitelist_.push_back(zmw); + } + } +}; + +} // namespace BAM +} // 
namespace PacBio + +// -------------------------------- +// WhitelistedZmwReadStitcher implementation +// -------------------------------- + +WhitelistedZmwReadStitcher::WhitelistedZmwReadStitcher(const vector& zmwWhitelist, + const string& primaryBamFilePath, + const string& scrapsBamFilePath) + : d_(new WhitelistedZmwReadStitcherPrivate(zmwWhitelist, + primaryBamFilePath, + scrapsBamFilePath)) +{ } // all setup is done by the private implementation's constructor + +WhitelistedZmwReadStitcher::~WhitelistedZmwReadStitcher(void) { } + +bool WhitelistedZmwReadStitcher::HasNext(void) const // true while whitelisted ZMWs remain to be read +{ return d_->HasNext(); } + +VirtualZmwBamRecord WhitelistedZmwReadStitcher::Next(void) // stitched record for the next whitelisted ZMW +{ return d_->Next(); } + +vector WhitelistedZmwReadStitcher::NextRaw(void) // unstitched primary + scraps records for the next whitelisted ZMW +{ return d_->NextRaw(); } + +BamHeader WhitelistedZmwReadStitcher::PrimaryHeader(void) const // header of the primary BAM file +{ return d_->PrimaryHeader(); } + +BamHeader WhitelistedZmwReadStitcher::ScrapsHeader(void) const // header of the scraps BAM file +{ return d_->ScrapsHeader(); } diff --git a/src/XmlReader.cpp b/src/XmlReader.cpp new file mode 100644 index 0000000..df4e782 --- /dev/null +++ b/src/XmlReader.cpp @@ -0,0 +1,154 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. 
+// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. + +// Author: Derek Barnett + +#include "XmlReader.h" +#include "StringUtils.h" +#include "pugixml/pugixml.hpp" +#include +#include +#include +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace PacBio::BAM::internal; +using namespace std; + +namespace PacBio { +namespace BAM { +namespace internal { + +static +void UpdateRegistry(const string& attributeName, + const string& attributeValue, + NamespaceRegistry& registry) +{ + vector nameParts = Split(attributeName, ':'); + assert(!nameParts.empty()); + if (nameParts.size() > 2) + throw std::runtime_error("malformed xmlns attribute: " + attributeName); + + const bool isDefault = (nameParts.size() == 1); + const XsdType& xsd = registry.XsdForUri(attributeValue); + + if (isDefault) + registry.SetDefaultXsd(xsd); + else { + assert(nameParts.size() == 2); + const string& name = nameParts.at(1); + const string& uri = attributeValue; + NamespaceInfo namespaceInfo(name, uri); + registry.Register(xsd, namespaceInfo); + } +} + +static +void FromXml(const pugi::xml_node& xmlNode, DataSetElement& 
parent) +{ + // ignore non-named XML nodes + // + // pugi::xml separates XML parts into more node types than we use + // + const string& label = xmlNode.name(); + if (label.empty()) + return; + + // label & text + DataSetElement e(xmlNode.name(), FromInputXml()); + e.Text(xmlNode.text().get()); + + // iterate attributes + auto attrIter = xmlNode.attributes_begin(); + auto attrEnd = xmlNode.attributes_end(); + for ( ; attrIter != attrEnd; ++attrIter ) + e.Attribute(attrIter->name(), attrIter->value()); + + // iterate children, recursively building up subtree + auto childIter = xmlNode.begin(); + auto childEnd = xmlNode.end(); + for ( ; childIter != childEnd; ++childIter ) { + pugi::xml_node childNode = *childIter; + FromXml(childNode, e); + } + + // add our element to its parent + parent.AddChild(e); +} + +} // namespace internal +} // namespace BAM +} // namespace PacBio + +std::unique_ptr XmlReader::FromStream(istream& in) +{ + pugi::xml_document doc; + const pugi::xml_parse_result& loadResult = doc.load(in); + if (loadResult.status != pugi::status_ok) + throw std::runtime_error(string("could not read XML file, error code:") + to_string(loadResult.status) ); + + // parse top-level attributes + pugi::xml_node rootNode = doc.document_element(); + if (rootNode == pugi::xml_node()) + throw std::runtime_error("could not fetch XML root node"); + + // create dataset matching type strings + std::unique_ptr dataset(new DataSetBase); + dataset->Label(rootNode.name()); + + // iterate attributes, capture namespace info + const string xmlnsPrefix("xmlns"); + auto attrIter = rootNode.attributes_begin(); + auto attrEnd = rootNode.attributes_end(); + for ( ; attrIter != attrEnd; ++attrIter ) { + const string& name = attrIter->name(); + const string& value = attrIter->value(); + dataset->Attribute(name, value); + + if (name.find(xmlnsPrefix) == 0) + UpdateRegistry(name, value, dataset->Namespaces()); + } + + // iterate children, recursively building up subtree + auto childIter = 
rootNode.begin(); + auto childEnd = rootNode.end(); + for ( ; childIter != childEnd; ++childIter ) { + pugi::xml_node childNode = *childIter; + internal::FromXml(childNode, *dataset.get()); + } + + return dataset; +} diff --git a/src/XmlReader.h b/src/XmlReader.h new file mode 100644 index 0000000..f5830a3 --- /dev/null +++ b/src/XmlReader.h @@ -0,0 +1,59 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. + +// Author: Derek Barnett + +#ifndef XMLREADER_H +#define XMLREADER_H + +#include "pbbam/DataSet.h" +#include +#include + +namespace PacBio { +namespace BAM { +namespace internal { + +class XmlReader +{ +public: + static std::unique_ptr FromStream(std::istream& in); +}; + +} // namespace internal +} // namespace BAM +} // namespace PacBio + +#endif // XMLREADER_H diff --git a/src/XmlWriter.cpp b/src/XmlWriter.cpp new file mode 100644 index 0000000..6c7b7af --- /dev/null +++ b/src/XmlWriter.cpp @@ -0,0 +1,219 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. 
+// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. + +// Author: Derek Barnett + +#include "XmlWriter.h" +#include "pbbam/DataSet.h" +#include "pugixml/pugixml.hpp" +#include +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace PacBio::BAM::internal; +using namespace std; + +namespace PacBio { +namespace BAM { +namespace internal { + +static +string Prefix(const string& input) +{ + const size_t colonFound = input.find(':'); + if (colonFound == std::string::npos || colonFound == 0) + return string(); + return input.substr(0, colonFound); +} + +static +string OutputName(const DataSetElement& node, + const NamespaceRegistry& registry) +{ + // if from input XML, respect the namespaces given + if (node.IsVerbatimLabel()) + return node.QualifiedNameLabel(); + + // otherwise, probably user-generated + else { + // if no namespace prefix, prepend the appropriate one & return + if (node.PrefixLabel().empty()) { + static const string colon = ":"; + XsdType xsdType = node.Xsd(); + if (xsdType == XsdType::NONE) + xsdType = registry.XsdForElement(node.LocalNameLabel().to_string()); + return 
registry.Namespace(xsdType).Name() + colon + node.LocalNameLabel().to_string(); + } + // otherwise, has prefix - return full name + else + return node.QualifiedNameLabel(); + } +} + +static +void ToXml(const DataSetElement& node, + const NamespaceRegistry& registry, + map& xsdPrefixesUsed, + pugi::xml_node& parentXml) +{ + // create child of parent, w/ label & text + const string& label = OutputName(node, registry); + if (label.empty()) + return; // error? + pugi::xml_node xmlNode = parentXml.append_child(label.c_str()); + + if (!node.Text().empty()) + xmlNode.text().set(node.Text().c_str()); + + // store XSD type for later + const string prefix = Prefix(label); + if (!prefix.empty()) + xsdPrefixesUsed[node.Xsd()] = prefix; + + // add attributes + auto attrIter = node.Attributes().cbegin(); + auto attrEnd = node.Attributes().cend(); + for ( ; attrIter != attrEnd; ++attrIter) { + const string& name = attrIter->first; + if (name.empty()) + continue; + pugi::xml_attribute attr = xmlNode.append_attribute(name.c_str()); + attr.set_value(attrIter->second.c_str()); + } + + // additional stuff later? (e.g. 
comments) + + // iterate children, recursively building up subtree + auto childIter = node.Children().cbegin(); + auto childEnd = node.Children().cend(); + for ( ; childIter != childEnd; ++childIter) { + const DataSetElement& child = (*childIter); + ToXml(child, registry, xsdPrefixesUsed, xmlNode); + } +} + +} // namespace internal +} // namespace BAM +} // namespace PacBio + +void XmlWriter::ToStream(const DataSetBase& dataset, + ostream& out) +{ + pugi::xml_document doc; + + const NamespaceRegistry& registry = dataset.Namespaces(); + + // create top-level dataset XML node + const string& label = internal::OutputName(dataset, registry); + if (label.empty()) + throw std::runtime_error("could not convert dataset node to XML"); + pugi::xml_node root = doc.append_child(label.c_str()); + + const string& text = dataset.Text(); + if (!text.empty()) + root.text().set(text.c_str()); + + // add top-level attributes + auto attrIter = dataset.Attributes().cbegin(); + auto attrEnd = dataset.Attributes().cend(); + for ( ; attrIter != attrEnd; ++attrIter) { + const string name = attrIter->first; + const string value = attrIter->second; + if (name.empty()) + continue; + pugi::xml_attribute attr = root.append_attribute(name.c_str()); + attr.set_value(value.c_str()); + } + + map xsdPrefixesUsed; + xsdPrefixesUsed[dataset.Xsd()] = Prefix(label); + + // iterate children, recursively building up subtree + auto childIter = dataset.Children().cbegin(); + auto childEnd = dataset.Children().cend(); + for ( ; childIter != childEnd; ++childIter) { + const DataSetElement& child = (*childIter); + ToXml(child, registry, xsdPrefixesUsed, root); + } + + // write XML to stream + pugi::xml_node decl = doc.prepend_child(pugi::node_declaration); + decl.append_attribute("version") = "1.0"; + decl.append_attribute("encoding") = "utf-8"; + + // add XSD namespace attributes + pugi::xml_attribute xmlnsDefaultAttribute = root.attribute("xmlns"); + if (xmlnsDefaultAttribute.empty()) { + 
xmlnsDefaultAttribute = root.append_attribute("xmlns"); + xmlnsDefaultAttribute.set_value(registry.DefaultNamespace().Uri().c_str()); + } + pugi::xml_attribute xsiAttribute = root.attribute("xmlns:xsi"); + if (xsiAttribute.empty()) { + xsiAttribute = root.append_attribute("xmlns:xsi"); + xsiAttribute.set_value("http://www.w3.org/2001/XMLSchema-instance"); + } + pugi::xml_attribute xsiSchemaLocationAttribute = root.attribute("xsi:schemaLocation"); + if (xsiSchemaLocationAttribute.empty()) { + xsiSchemaLocationAttribute = root.append_attribute("xsi:schemaLocation"); + xsiSchemaLocationAttribute.set_value(registry.DefaultNamespace().Uri().c_str()); + } + + static const string xmlnsPrefix = "xmlns:"; + map::const_iterator prefixIter = xsdPrefixesUsed.cbegin(); + map::const_iterator prefixEnd = xsdPrefixesUsed.cend(); + for ( ; prefixIter != prefixEnd; ++prefixIter ) { + const XsdType& xsd = prefixIter->first; + const string& prefix = prefixIter->second; + if (xsd == XsdType::NONE || prefix.empty()) + continue; + const NamespaceInfo& nsInfo = registry.Namespace(xsd); + assert(nsInfo.Name() == prefix); + const string xmlnsName = xmlnsPrefix + prefix; + pugi::xml_attribute xmlnsAttribute = root.attribute(xmlnsName.c_str()); + if (xmlnsAttribute.empty()) { + xmlnsAttribute = root.append_attribute(xmlnsName.c_str()); + xmlnsAttribute.set_value(nsInfo.Uri().c_str()); + } + } + + // "no escapes" to allow explicit ">" "<" comparison operators in filter parameters + // we may remove this if/when comparison is separated from the value + doc.save(out, "\t", pugi::format_default | pugi::format_no_escapes, pugi::encoding_utf8); +} + +void XmlWriter::ToStream(const unique_ptr& dataset, + ostream& out) +{ ToStream(*dataset.get(), out); } diff --git a/src/XmlWriter.h b/src/XmlWriter.h new file mode 100644 index 0000000..7fc457d --- /dev/null +++ b/src/XmlWriter.h @@ -0,0 +1,62 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. 
+ +// Author: Derek Barnett + +#ifndef XMLWRITER_H +#define XMLWRITER_H + +#include +#include + +namespace PacBio { +namespace BAM { + +class DataSetBase; + +namespace internal { + +class XmlWriter +{ +public: + static void ToStream(const DataSetBase& dataset, std::ostream& out); + static void ToStream(const std::unique_ptr& dataset, std::ostream& out); +}; + +} // namespace internal +} // namespace BAM +} // namespace PacBio + +#endif // XMLWRITER_H diff --git a/src/ZmwGroupQuery.cpp b/src/ZmwGroupQuery.cpp new file mode 100644 index 0000000..d33b34a --- /dev/null +++ b/src/ZmwGroupQuery.cpp @@ -0,0 +1,112 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file ZmwGroupQuery.cpp +/// \brief Implements the ZmwGroupQuery class. +// +// Author: Derek Barnett + +#include "pbbam/ZmwGroupQuery.h" +#include "pbbam/BamRecord.h" +#include "pbbam/CompositeBamReader.h" +#include "pbbam/PbiFilterTypes.h" +#include "MemoryUtils.h" +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace PacBio::BAM::internal; +using namespace std; + +struct ZmwGroupQuery::ZmwGroupQueryPrivate +{ + typedef PbiFilterCompositeBamReader ReaderType; + typedef std::unique_ptr ReaderPtr; + + ZmwGroupQueryPrivate(const std::vector& zmwWhitelist, + const DataSet& dataset) + : whitelist_(zmwWhitelist.cbegin(), zmwWhitelist.cend()) + , reader_(nullptr) + { + std::sort(whitelist_.begin(), whitelist_.end()); // sort + unique: each requested ZMW is visited once, in ascending order + whitelist_.erase(std::unique(whitelist_.begin(), + whitelist_.end()), + whitelist_.end()); + + if (!whitelist_.empty()) { + reader_ = ReaderPtr(new ReaderType(PbiZmwFilter{whitelist_.front()}, dataset)); // start on the first ZMW; the rest stay queued + whitelist_.pop_front(); + } + } + + bool GetNext(std::vector& records) + { + records.clear(); + if (!reader_) + return false; + + // get all records matching the reader's current ZMW filter + BamRecord r; + while (reader_->GetNext(r)) + records.push_back(r); + + // set next ZMW (if any left) + if (!whitelist_.empty()) { + reader_->Filter(PbiZmwFilter{whitelist_.front()}); + whitelist_.pop_front(); + } + + // otherwise destroy reader, next iteration will return false + else + 
reader_.reset(nullptr); + + return true; // also true when 'records' is empty (no reads found for this ZMW) + } + + std::deque whitelist_; + ReaderPtr reader_; +}; + +ZmwGroupQuery::ZmwGroupQuery(const std::vector& zmwWhitelist, + const DataSet& dataset) + : internal::IGroupQuery() + , d_(new ZmwGroupQueryPrivate(zmwWhitelist, dataset)) +{ } + +ZmwGroupQuery::~ZmwGroupQuery(void) { } + +bool ZmwGroupQuery::GetNext(std::vector& records) +{ return d_->GetNext(records); } diff --git a/src/ZmwQuery.cpp b/src/ZmwQuery.cpp new file mode 100644 index 0000000..7a45541 --- /dev/null +++ b/src/ZmwQuery.cpp @@ -0,0 +1,69 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file ZmwQuery.cpp +/// \brief Implements the ZmwQuery class. +// +// Author: Derek Barnett + +#include "pbbam/ZmwQuery.h" +#include "pbbam/PbiFilterTypes.h" +#include "pbbam/CompositeBamReader.h" +using namespace PacBio; +using namespace PacBio::BAM; +using namespace PacBio::BAM::internal; +using namespace std; + +struct ZmwQuery::ZmwQueryPrivate +{ + ZmwQueryPrivate(const std::vector& zmwWhitelist, + const DataSet& dataset) + : reader_(PbiZmwFilter(zmwWhitelist), dataset) + { } + + PbiFilterCompositeBamReader reader_; +}; + +ZmwQuery::ZmwQuery(const std::vector& zmwWhitelist, + const DataSet& dataset) + : internal::IQuery() + , d_(new ZmwQueryPrivate(zmwWhitelist, dataset)) +{ } + +ZmwQuery::~ZmwQuery(void) { } + +bool ZmwQuery::GetNext(BamRecord &r) +{ return d_->reader_.GetNext(r); } diff --git a/src/ZmwReadStitcher.cpp b/src/ZmwReadStitcher.cpp new file mode 100644 index 0000000..98daa48 --- /dev/null +++ b/src/ZmwReadStitcher.cpp @@ -0,0 +1,223 @@ +// Copyright (c) 2016, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file ZmwReadStitcher.cpp +/// \brief Implements the ZmwReadStitcher class. 
+// +// Author: Derek Barnett + +#include "pbbam/virtual/ZmwReadStitcher.h" +#include "pbbam/DataSet.h" +#include "pbbam/EntireFileQuery.h" +#include "pbbam/PbiFilter.h" +#include "pbbam/PbiFilterQuery.h" +#include "VirtualZmwReader.h" +#include +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace PacBio::BAM::internal; +using namespace std; + +namespace PacBio { +namespace BAM { + +struct ZmwReadStitcher::ZmwReadStitcherPrivate +{ +public: + ZmwReadStitcherPrivate(const string& primaryBamFilePath, + const string& scrapsBamFilePath, + const PbiFilter& filter) + : filter_(filter) + { + sources_.push_back(std::make_pair(primaryBamFilePath, scrapsBamFilePath)); + OpenNextReader(); + } + + ZmwReadStitcherPrivate(const DataSet& dataset) + : filter_(PbiFilter::FromDataSet(dataset)) + { + // set up source queue + string primaryFn; + string scrapsFn; + const ExternalResources& resources = dataset.ExternalResources(); + for (const ExternalResource& resource : resources) { + + primaryFn.clear(); + scrapsFn.clear(); + + // if resource is possible "primary" BAM + const auto& metatype = resource.MetaType(); + if (metatype == "PacBio.SubreadFile.SubreadBamFile" || + metatype == "PacBio.SubreadFile.HqRegionBamFile") + { + // resolve a possibly relative path + primaryFn = dataset.ResolvePath(resource.ResourceId()); + + // check for associated scraps file (first matching child wins) + const ExternalResources& childResources = resource.ExternalResources(); + for (const ExternalResource& childResource : childResources) { + const auto& childMetatype = childResource.MetaType(); + if (childMetatype == "PacBio.SubreadFile.ScrapsBamFile" || + childMetatype == "PacBio.SubreadFile.HqScrapsBamFile") + { + // resolve a possibly relative path + scrapsFn = dataset.ResolvePath(childResource.ResourceId()); + break; + } + } + } + + // queue up source for later; a primary BAM without a scraps child is skipped + if (!primaryFn.empty() && !scrapsFn.empty()) + sources_.push_back(make_pair(primaryFn,scrapsFn)); + } + + OpenNextReader(); + } + 
+public: + bool HasNext(void) const // true only while an open reader still has data to deliver + { return (currentReader_ && currentReader_->HasNext()); } + + VirtualZmwBamRecord Next(void) + { + if (currentReader_) { + auto result = currentReader_->Next(); // non-const so the return below can move + if (!currentReader_->HasNext()) + OpenNextReader(); + return result; + } + + // no reader active + const string msg = { "no readers active, make sure you use " + "ZmwReadStitcher::HasNext before " + "requesting next record" + }; + throw std::runtime_error(msg); + } + + vector NextRaw(void) + { + if (currentReader_) { + auto result = currentReader_->NextRaw(); // non-const so the return below can move + if (!currentReader_->HasNext()) + OpenNextReader(); + return result; + } + + // no reader active + const string msg = { "no readers active, make sure you use " + "ZmwReadStitcher::HasNext before " + "requesting next group of records" + }; + throw std::runtime_error(msg); + } + + BamHeader PrimaryHeader(void) const // throws std::runtime_error when no reader is open (was a null dereference) + { if (!currentReader_) throw std::runtime_error("no readers active, cannot fetch primary header"); return currentReader_->PrimaryHeader(); } + + BamHeader ScrapsHeader(void) const // throws std::runtime_error when no reader is open (was a null dereference) + { if (!currentReader_) throw std::runtime_error("no readers active, cannot fetch scraps header"); return currentReader_->ScrapsHeader(); } + +private: + std::deque< std::pair > sources_; + std::unique_ptr currentReader_; + PbiFilter filter_; + +private: + void OpenNextReader(void) + { + currentReader_.reset(nullptr); // stays null when the source queue is already empty + + // find next source pair with data + while(!sources_.empty()) { + const auto nextSource = sources_.front(); + sources_.pop_front(); + + currentReader_.reset(new VirtualZmwReader(nextSource.first, + nextSource.second, + filter_)); + if (currentReader_->HasNext()) + return; + } // note: if the last queued source has no data, its (empty) reader stays installed + } +}; + +} // namespace BAM +} // namespace PacBio + +// -------------------------------- +// ZmwReadStitcher implementation +// -------------------------------- + +ZmwReadStitcher::ZmwReadStitcher(const string& primaryBamFilePath, + const string& scrapsBamFilePath) + : ZmwReadStitcher(primaryBamFilePath, + scrapsBamFilePath, + PbiFilter{}) +{ } + +ZmwReadStitcher::ZmwReadStitcher(const string& primaryBamFilePath, + const string& scrapsBamFilePath, + const PbiFilter& filter) + : d_(new 
ZmwReadStitcherPrivate(primaryBamFilePath, + scrapsBamFilePath, + filter)) +{ } + +ZmwReadStitcher::ZmwReadStitcher(const DataSet& dataset) + : d_(new ZmwReadStitcherPrivate(dataset)) +{ } + +ZmwReadStitcher::~ZmwReadStitcher(void) { } + +bool ZmwReadStitcher::HasNext(void) +{ return d_->HasNext(); } + +VirtualZmwBamRecord ZmwReadStitcher::Next(void) +{ return d_->Next(); } + +vector ZmwReadStitcher::NextRaw(void) +{ return d_->NextRaw(); } + +BamHeader ZmwReadStitcher::PrimaryHeader(void) const +{ return d_->PrimaryHeader().DeepCopy(); } + +BamHeader ZmwReadStitcher::ScrapsHeader(void) const +{ return d_->ScrapsHeader().DeepCopy(); } + diff --git a/src/ZmwTypeMap.cpp b/src/ZmwTypeMap.cpp new file mode 100644 index 0000000..2eea7b7 --- /dev/null +++ b/src/ZmwTypeMap.cpp @@ -0,0 +1,53 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. 
THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// File Description +/// \file ZmwTypeMap.cpp +/// \brief Implements the ZmwTypeMap class. +// +// Author: Armin Töpfer + +#include "pbbam/ZmwTypeMap.h" + +using namespace PacBio; +using namespace PacBio::BAM; + +std::map ZmwTypeMap::ParseChar +{ + { 'C' , ZmwType::CONTROL }, + { 'M' , ZmwType::MALFORMED }, + { 'N' , ZmwType::NORMAL }, + { 'S' , ZmwType::SENTINEL } +}; diff --git a/src/files.cmake b/src/files.cmake new file mode 100644 index 0000000..808b585 --- /dev/null +++ b/src/files.cmake @@ -0,0 +1,235 @@ + +# headers +set( PacBioBAM_H + + # API headers + ${PacBioBAM_IncludeDir}/pbbam/Accuracy.h + ${PacBioBAM_IncludeDir}/pbbam/AlignmentPrinter.h + ${PacBioBAM_IncludeDir}/pbbam/BamFile.h + ${PacBioBAM_IncludeDir}/pbbam/BamHeader.h + ${PacBioBAM_IncludeDir}/pbbam/BamRecord.h + ${PacBioBAM_IncludeDir}/pbbam/BamRecordBuilder.h + ${PacBioBAM_IncludeDir}/pbbam/BamRecordImpl.h + ${PacBioBAM_IncludeDir}/pbbam/BamRecordTag.h + ${PacBioBAM_IncludeDir}/pbbam/BamRecordView.h + ${PacBioBAM_IncludeDir}/pbbam/BamTagCodec.h + ${PacBioBAM_IncludeDir}/pbbam/BaiIndexedBamReader.h + ${PacBioBAM_IncludeDir}/pbbam/BamReader.h + ${PacBioBAM_IncludeDir}/pbbam/CompositeBamReader.h + 
${PacBioBAM_IncludeDir}/pbbam/BamWriter.h + ${PacBioBAM_IncludeDir}/pbbam/BarcodeQuery.h + ${PacBioBAM_IncludeDir}/pbbam/Cigar.h + ${PacBioBAM_IncludeDir}/pbbam/CigarOperation.h + ${PacBioBAM_IncludeDir}/pbbam/ClipType.h + ${PacBioBAM_IncludeDir}/pbbam/Compare.h + ${PacBioBAM_IncludeDir}/pbbam/Config.h + ${PacBioBAM_IncludeDir}/pbbam/DataSet.h + ${PacBioBAM_IncludeDir}/pbbam/DataSetTypes.h + ${PacBioBAM_IncludeDir}/pbbam/DataSetXsd.h + ${PacBioBAM_IncludeDir}/pbbam/EntireFileQuery.h + ${PacBioBAM_IncludeDir}/pbbam/FastaReader.h + ${PacBioBAM_IncludeDir}/pbbam/FastaSequence.h + ${PacBioBAM_IncludeDir}/pbbam/FrameEncodingType.h + ${PacBioBAM_IncludeDir}/pbbam/Frames.h + ${PacBioBAM_IncludeDir}/pbbam/GenomicInterval.h + ${PacBioBAM_IncludeDir}/pbbam/GenomicIntervalQuery.h + ${PacBioBAM_IncludeDir}/pbbam/IndexedFastaReader.h + ${PacBioBAM_IncludeDir}/pbbam/Interval.h + ${PacBioBAM_IncludeDir}/pbbam/IRecordWriter.h + ${PacBioBAM_IncludeDir}/pbbam/LocalContextFlags.h + ${PacBioBAM_IncludeDir}/pbbam/MD5.h + ${PacBioBAM_IncludeDir}/pbbam/Orientation.h + ${PacBioBAM_IncludeDir}/pbbam/PbiBasicTypes.h + ${PacBioBAM_IncludeDir}/pbbam/PbiBuilder.h + ${PacBioBAM_IncludeDir}/pbbam/PbiFile.h + ${PacBioBAM_IncludeDir}/pbbam/PbiFilter.h + ${PacBioBAM_IncludeDir}/pbbam/PbiFilterQuery.h + ${PacBioBAM_IncludeDir}/pbbam/PbiFilterTypes.h + ${PacBioBAM_IncludeDir}/pbbam/PbiIndex.h + ${PacBioBAM_IncludeDir}/pbbam/PbiIndexedBamReader.h + ${PacBioBAM_IncludeDir}/pbbam/PbiLookupData.h + ${PacBioBAM_IncludeDir}/pbbam/PbiRawData.h + ${PacBioBAM_IncludeDir}/pbbam/Position.h + ${PacBioBAM_IncludeDir}/pbbam/ProgramInfo.h + ${PacBioBAM_IncludeDir}/pbbam/PulseBehavior.h + ${PacBioBAM_IncludeDir}/pbbam/QNameQuery.h + ${PacBioBAM_IncludeDir}/pbbam/QualityValue.h + ${PacBioBAM_IncludeDir}/pbbam/QualityValues.h + ${PacBioBAM_IncludeDir}/pbbam/ReadAccuracyQuery.h + ${PacBioBAM_IncludeDir}/pbbam/ReadGroupInfo.h + ${PacBioBAM_IncludeDir}/pbbam/RecordType.h + ${PacBioBAM_IncludeDir}/pbbam/SamTagCodec.h + 
${PacBioBAM_IncludeDir}/pbbam/SamWriter.h + ${PacBioBAM_IncludeDir}/pbbam/SequenceInfo.h + ${PacBioBAM_IncludeDir}/pbbam/Strand.h + ${PacBioBAM_IncludeDir}/pbbam/SubreadLengthQuery.h + ${PacBioBAM_IncludeDir}/pbbam/Tag.h + ${PacBioBAM_IncludeDir}/pbbam/TagCollection.h +# ${PacBioBAM_IncludeDir}/pbbam/UnmappedReadsQuery.h + ${PacBioBAM_IncludeDir}/pbbam/Validator.h + ${PacBioBAM_IncludeDir}/pbbam/ZmwGroupQuery.h + ${PacBioBAM_IncludeDir}/pbbam/ZmwQuery.h + ${PacBioBAM_IncludeDir}/pbbam/ZmwType.h + ${PacBioBAM_IncludeDir}/pbbam/ZmwTypeMap.h + + # exception headers + ${PacBioBAM_IncludeDir}/pbbam/exception/InvalidSequencingChemistryException.h + ${PacBioBAM_IncludeDir}/pbbam/exception/ValidationException.h + + # API-internal headers & inline files + ${PacBioBAM_IncludeDir}/pbbam/internal/Accuracy.inl + ${PacBioBAM_IncludeDir}/pbbam/internal/BamHeader.inl + ${PacBioBAM_IncludeDir}/pbbam/internal/BamRecord.inl + ${PacBioBAM_IncludeDir}/pbbam/internal/BamRecordBuilder.inl + ${PacBioBAM_IncludeDir}/pbbam/internal/BamRecordImpl.inl + ${PacBioBAM_IncludeDir}/pbbam/internal/BamRecordView.inl + ${PacBioBAM_IncludeDir}/pbbam/internal/Cigar.inl + ${PacBioBAM_IncludeDir}/pbbam/internal/CigarOperation.inl + ${PacBioBAM_IncludeDir}/pbbam/internal/Compare.inl + ${PacBioBAM_IncludeDir}/pbbam/internal/CompositeBamReader.inl + ${PacBioBAM_IncludeDir}/pbbam/internal/DataSet.inl + ${PacBioBAM_IncludeDir}/pbbam/internal/DataSetBaseTypes.h + ${PacBioBAM_IncludeDir}/pbbam/internal/DataSetBaseTypes.inl + ${PacBioBAM_IncludeDir}/pbbam/internal/DataSetElement.h + ${PacBioBAM_IncludeDir}/pbbam/internal/DataSetElement.inl + ${PacBioBAM_IncludeDir}/pbbam/internal/DataSetListElement.h + ${PacBioBAM_IncludeDir}/pbbam/internal/DataSetListElement.inl + ${PacBioBAM_IncludeDir}/pbbam/internal/DataSetTypes.inl + ${PacBioBAM_IncludeDir}/pbbam/internal/FastaSequence.inl + ${PacBioBAM_IncludeDir}/pbbam/internal/Frames.inl + ${PacBioBAM_IncludeDir}/pbbam/internal/GenomicInterval.inl + 
${PacBioBAM_IncludeDir}/pbbam/internal/Interval.inl + ${PacBioBAM_IncludeDir}/pbbam/internal/PbiBasicTypes.inl + ${PacBioBAM_IncludeDir}/pbbam/internal/PbiFilter.inl + ${PacBioBAM_IncludeDir}/pbbam/internal/PbiFilterTypes.inl + ${PacBioBAM_IncludeDir}/pbbam/internal/PbiIndex.inl + ${PacBioBAM_IncludeDir}/pbbam/internal/PbiLookupData.inl + ${PacBioBAM_IncludeDir}/pbbam/internal/PbiRawData.inl + ${PacBioBAM_IncludeDir}/pbbam/internal/ProgramInfo.inl + ${PacBioBAM_IncludeDir}/pbbam/internal/QualityValue.inl + ${PacBioBAM_IncludeDir}/pbbam/internal/QualityValues.inl + ${PacBioBAM_IncludeDir}/pbbam/internal/QueryBase.h + ${PacBioBAM_IncludeDir}/pbbam/internal/QueryBase.inl + ${PacBioBAM_IncludeDir}/pbbam/internal/ReadGroupInfo.inl + ${PacBioBAM_IncludeDir}/pbbam/internal/SequenceInfo.inl + ${PacBioBAM_IncludeDir}/pbbam/internal/Tag.inl + ${PacBioBAM_IncludeDir}/pbbam/internal/Validator.inl + + # virtual headers + ${PacBioBAM_IncludeDir}/pbbam/virtual/VirtualPolymeraseBamRecord.h + ${PacBioBAM_IncludeDir}/pbbam/virtual/VirtualPolymeraseCompositeReader.h + ${PacBioBAM_IncludeDir}/pbbam/virtual/VirtualPolymeraseReader.h + ${PacBioBAM_IncludeDir}/pbbam/virtual/VirtualRegion.h + ${PacBioBAM_IncludeDir}/pbbam/virtual/VirtualRegionType.h + ${PacBioBAM_IncludeDir}/pbbam/virtual/VirtualRegionTypeMap.h + ${PacBioBAM_IncludeDir}/pbbam/virtual/VirtualZmwBamRecord.h + ${PacBioBAM_IncludeDir}/pbbam/virtual/WhitelistedZmwReadStitcher.h + ${PacBioBAM_IncludeDir}/pbbam/virtual/ZmwReadStitcher.h + ${PacBioBAM_IncludeDir}/pbbam/virtual/ZmwWhitelistVirtualReader.h + + # library-internal headers + ${PacBioBAM_SourceDir}/AssertUtils.h + ${PacBioBAM_SourceDir}/BamRecordTags.h + ${PacBioBAM_SourceDir}/ChemistryTable.h + ${PacBioBAM_SourceDir}/DataSetIO.h + ${PacBioBAM_SourceDir}/DataSetUtils.h + ${PacBioBAM_SourceDir}/EnumClassHash.h + ${PacBioBAM_SourceDir}/FileProducer.h + ${PacBioBAM_SourceDir}/FileUtils.h + ${PacBioBAM_SourceDir}/FofnReader.h + ${PacBioBAM_SourceDir}/MemoryUtils.h + 
${PacBioBAM_SourceDir}/PbiIndexIO.h + ${PacBioBAM_SourceDir}/Pulse2BaseCache.h + ${PacBioBAM_SourceDir}/SequenceUtils.h + ${PacBioBAM_SourceDir}/StringUtils.h + ${PacBioBAM_SourceDir}/TimeUtils.h + ${PacBioBAM_SourceDir}/ValidationErrors.h + ${PacBioBAM_SourceDir}/Version.h + ${PacBioBAM_SourceDir}/VirtualZmwCompositeReader.h + ${PacBioBAM_SourceDir}/VirtualZmwReader.h + ${PacBioBAM_SourceDir}/XmlReader.h + ${PacBioBAM_SourceDir}/XmlWriter.h + ${PacBioBAM_SourceDir}/pugixml/pugiconfig.hpp + ${PacBioBAM_SourceDir}/pugixml/pugixml.hpp +) + +# sources +set( PacBioBAM_CPP + + ${PacBioBAM_SourceDir}/Accuracy.cpp + ${PacBioBAM_SourceDir}/AlignmentPrinter.cpp + ${PacBioBAM_SourceDir}/AssertUtils.cpp + ${PacBioBAM_SourceDir}/BaiIndexedBamReader.cpp + ${PacBioBAM_SourceDir}/BamFile.cpp + ${PacBioBAM_SourceDir}/BamHeader.cpp + ${PacBioBAM_SourceDir}/BamReader.cpp + ${PacBioBAM_SourceDir}/BamRecord.cpp + ${PacBioBAM_SourceDir}/BamRecordBuilder.cpp + ${PacBioBAM_SourceDir}/BamRecordImpl.cpp + ${PacBioBAM_SourceDir}/BamRecordTags.cpp + ${PacBioBAM_SourceDir}/BamTagCodec.cpp + ${PacBioBAM_SourceDir}/BamWriter.cpp + ${PacBioBAM_SourceDir}/BarcodeQuery.cpp + ${PacBioBAM_SourceDir}/ChemistryTable.cpp + ${PacBioBAM_SourceDir}/Cigar.cpp + ${PacBioBAM_SourceDir}/CigarOperation.cpp + ${PacBioBAM_SourceDir}/Compare.cpp + ${PacBioBAM_SourceDir}/Config.cpp + ${PacBioBAM_SourceDir}/DataSet.cpp + ${PacBioBAM_SourceDir}/DataSetBaseTypes.cpp + ${PacBioBAM_SourceDir}/DataSetElement.cpp + ${PacBioBAM_SourceDir}/DataSetIO.cpp + ${PacBioBAM_SourceDir}/DataSetTypes.cpp + ${PacBioBAM_SourceDir}/DataSetXsd.cpp + ${PacBioBAM_SourceDir}/EntireFileQuery.cpp + ${PacBioBAM_SourceDir}/FastaReader.cpp + ${PacBioBAM_SourceDir}/FileProducer.cpp + ${PacBioBAM_SourceDir}/FileUtils.cpp + ${PacBioBAM_SourceDir}/FofnReader.cpp + ${PacBioBAM_SourceDir}/Frames.cpp + ${PacBioBAM_SourceDir}/GenomicInterval.cpp + ${PacBioBAM_SourceDir}/GenomicIntervalQuery.cpp + ${PacBioBAM_SourceDir}/IndexedFastaReader.cpp + 
${PacBioBAM_SourceDir}/IRecordWriter.cpp + ${PacBioBAM_SourceDir}/MD5.cpp + ${PacBioBAM_SourceDir}/MemoryUtils.cpp + ${PacBioBAM_SourceDir}/PbiBuilder.cpp + ${PacBioBAM_SourceDir}/PbiFile.cpp + ${PacBioBAM_SourceDir}/PbiFilter.cpp + ${PacBioBAM_SourceDir}/PbiFilterQuery.cpp + ${PacBioBAM_SourceDir}/PbiFilterTypes.cpp + ${PacBioBAM_SourceDir}/PbiIndex.cpp + ${PacBioBAM_SourceDir}/PbiIndexedBamReader.cpp + ${PacBioBAM_SourceDir}/PbiIndexIO.cpp + ${PacBioBAM_SourceDir}/PbiRawData.cpp + ${PacBioBAM_SourceDir}/ProgramInfo.cpp + ${PacBioBAM_SourceDir}/QNameQuery.cpp + ${PacBioBAM_SourceDir}/QualityValue.cpp + ${PacBioBAM_SourceDir}/ReadAccuracyQuery.cpp + ${PacBioBAM_SourceDir}/ReadGroupInfo.cpp + ${PacBioBAM_SourceDir}/SamTagCodec.cpp + ${PacBioBAM_SourceDir}/SamWriter.cpp + ${PacBioBAM_SourceDir}/SequenceInfo.cpp + ${PacBioBAM_SourceDir}/SubreadLengthQuery.cpp + ${PacBioBAM_SourceDir}/Tag.cpp + ${PacBioBAM_SourceDir}/TagCollection.cpp +# ${PacBioBAM_SourceDir}/UnmappedReadsQuery.cpp + ${PacBioBAM_SourceDir}/Validator.cpp + ${PacBioBAM_SourceDir}/ValidationErrors.cpp + ${PacBioBAM_SourceDir}/ValidationException.cpp + ${PacBioBAM_SourceDir}/Version.cpp + ${PacBioBAM_SourceDir}/VirtualZmwBamRecord.cpp + ${PacBioBAM_SourceDir}/VirtualZmwCompositeReader.cpp + ${PacBioBAM_SourceDir}/VirtualZmwReader.cpp + ${PacBioBAM_SourceDir}/VirtualRegionTypeMap.cpp + ${PacBioBAM_SourceDir}/XmlReader.cpp + ${PacBioBAM_SourceDir}/XmlWriter.cpp + ${PacBioBAM_SourceDir}/WhitelistedZmwReadStitcher.cpp + ${PacBioBAM_SourceDir}/ZmwGroupQuery.cpp + ${PacBioBAM_SourceDir}/ZmwReadStitcher.cpp + ${PacBioBAM_SourceDir}/ZmwQuery.cpp + ${PacBioBAM_SourceDir}/ZmwTypeMap.cpp + + # XML I/O + ${PacBioBAM_SourceDir}/pugixml/pugixml.cpp +) diff --git a/src/pugixml/pugiconfig.hpp b/src/pugixml/pugiconfig.hpp new file mode 100644 index 0000000..6219dbe --- /dev/null +++ b/src/pugixml/pugiconfig.hpp @@ -0,0 +1,71 @@ +/** + * pugixml parser - version 1.5 + * 
-------------------------------------------------------- + * Copyright (C) 2006-2014, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com) + * Report bugs and download new versions at http://pugixml.org/ + * + * This library is distributed under the MIT License. See notice at the end + * of this file. + * + * This work is based on the pugxml parser, which is: + * Copyright (C) 2003, by Kristen Wegner (kristen@tima.net) + */ + +#ifndef HEADER_PUGICONFIG_HPP +#define HEADER_PUGICONFIG_HPP + +// Uncomment this to enable wchar_t mode +// #define PUGIXML_WCHAR_MODE + +// Uncomment this to disable XPath +// #define PUGIXML_NO_XPATH + +// Uncomment this to disable STL +// #define PUGIXML_NO_STL + +// Uncomment this to disable exceptions +// #define PUGIXML_NO_EXCEPTIONS + +// Set this to control attributes for public classes/functions, i.e.: +// #define PUGIXML_API __declspec(dllexport) // to export all public symbols from DLL +// #define PUGIXML_CLASS __declspec(dllimport) // to import all classes from DLL +// #define PUGIXML_FUNCTION __fastcall // to set calling conventions to all public functions to fastcall +// In absence of PUGIXML_CLASS/PUGIXML_FUNCTION definitions PUGIXML_API is used instead + +// Tune these constants to adjust memory-related behavior +// #define PUGIXML_MEMORY_PAGE_SIZE 32768 +// #define PUGIXML_MEMORY_OUTPUT_STACK 10240 +// #define PUGIXML_MEMORY_XPATH_PAGE_SIZE 4096 + +// Uncomment this to switch to header-only version +// #define PUGIXML_HEADER_ONLY + +// Uncomment this to enable long long support +// #define PUGIXML_HAS_LONG_LONG + +#endif + +/** + * Copyright (c) 2006-2014 Arseny Kapoulkine + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, 
and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ diff --git a/src/pugixml/pugixml.cpp b/src/pugixml/pugixml.cpp new file mode 100644 index 0000000..0f696ab --- /dev/null +++ b/src/pugixml/pugixml.cpp @@ -0,0 +1,11525 @@ +/** + * pugixml parser - version 1.5 + * -------------------------------------------------------- + * Copyright (C) 2006-2014, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com) + * Report bugs and download new versions at http://pugixml.org/ + * + * This library is distributed under the MIT License. See notice at the end + * of this file. 
+ * + * This work is based on the pugxml parser, which is: + * Copyright (C) 2003, by Kristen Wegner (kristen@tima.net) + */ + +#ifndef SOURCE_PUGIXML_CPP +#define SOURCE_PUGIXML_CPP + +#include "pugixml.hpp" + +#include +#include +#include +#include + +#ifdef PUGIXML_WCHAR_MODE +# include +#endif + +#ifndef PUGIXML_NO_XPATH +# include +# include +# ifdef PUGIXML_NO_EXCEPTIONS +# include +# endif +#endif + +#ifndef PUGIXML_NO_STL +# include +# include +# include +#endif + +// For placement new +#include + +#ifdef _MSC_VER +# pragma warning(push) +# pragma warning(disable: 4127) // conditional expression is constant +# pragma warning(disable: 4324) // structure was padded due to __declspec(align()) +# pragma warning(disable: 4611) // interaction between '_setjmp' and C++ object destruction is non-portable +# pragma warning(disable: 4702) // unreachable code +# pragma warning(disable: 4996) // this function or variable may be unsafe +# pragma warning(disable: 4793) // function compiled as native: presence of '_setjmp' makes a function unmanaged +#endif + +#ifdef __INTEL_COMPILER +# pragma warning(disable: 177) // function was declared but never referenced +# pragma warning(disable: 279) // controlling expression is constant +# pragma warning(disable: 1478 1786) // function was declared "deprecated" +# pragma warning(disable: 1684) // conversion from pointer to same-sized integral type +#endif + +#if defined(__BORLANDC__) && defined(PUGIXML_HEADER_ONLY) +# pragma warn -8080 // symbol is declared but never used; disabling this inside push/pop bracket does not make the warning go away +#endif + +#ifdef __BORLANDC__ +# pragma option push +# pragma warn -8008 // condition is always false +# pragma warn -8066 // unreachable code +#endif + +#ifdef __SNC__ +// Using diag_push/diag_pop does not disable the warnings inside templates due to a compiler bug +# pragma diag_suppress=178 // function was declared but never referenced +# pragma diag_suppress=237 // controlling 
expression is constant +#endif + +// Inlining controls +#if defined(_MSC_VER) && _MSC_VER >= 1300 +# define PUGI__NO_INLINE __declspec(noinline) +#elif defined(__GNUC__) +# define PUGI__NO_INLINE __attribute__((noinline)) +#else +# define PUGI__NO_INLINE +#endif + +// Branch weight controls +#if defined(__GNUC__) +# define PUGI__UNLIKELY(cond) __builtin_expect(cond, 0) +#else +# define PUGI__UNLIKELY(cond) (cond) +#endif + +// Simple static assertion +#define PUGI__STATIC_ASSERT(cond) { static const char condition_failed[(cond) ? 1 : -1] = {0}; (void)condition_failed[0]; } + +// Digital Mars C++ bug workaround for passing char loaded from memory via stack +#ifdef __DMC__ +# define PUGI__DMC_VOLATILE volatile +#else +# define PUGI__DMC_VOLATILE +#endif + +// Borland C++ bug workaround for not defining ::memcpy depending on header include order (can't always use std::memcpy because some compilers don't have it at all) +#if defined(__BORLANDC__) && !defined(__MEM_H_USING_LIST) +using std::memcpy; +using std::memmove; +#endif + +// In some environments MSVC is a compiler but the CRT lacks certain MSVC-specific features +#if defined(_MSC_VER) && !defined(__S3E__) +# define PUGI__MSVC_CRT_VERSION _MSC_VER +#endif + +#ifdef PUGIXML_HEADER_ONLY +# define PUGI__NS_BEGIN namespace pugi { namespace impl { +# define PUGI__NS_END } } +# define PUGI__FN inline +# define PUGI__FN_NO_INLINE inline +#else +# if defined(_MSC_VER) && _MSC_VER < 1300 // MSVC6 seems to have an amusing bug with anonymous namespaces inside namespaces +# define PUGI__NS_BEGIN namespace pugi { namespace impl { +# define PUGI__NS_END } } +# else +# define PUGI__NS_BEGIN namespace pugi { namespace impl { namespace { +# define PUGI__NS_END } } } +# endif +# define PUGI__FN +# define PUGI__FN_NO_INLINE PUGI__NO_INLINE +#endif + +// uintptr_t +#if !defined(_MSC_VER) || _MSC_VER >= 1600 +# include +#else +# ifndef _UINTPTR_T_DEFINED +// No native uintptr_t in MSVC6 and in some WinCE versions +typedef size_t 
uintptr_t; +#define _UINTPTR_T_DEFINED +# endif +PUGI__NS_BEGIN + typedef unsigned __int8 uint8_t; + typedef unsigned __int16 uint16_t; + typedef unsigned __int32 uint32_t; +PUGI__NS_END +#endif + +// Memory allocation +PUGI__NS_BEGIN + PUGI__FN void* default_allocate(size_t size) + { + return malloc(size); + } + + PUGI__FN void default_deallocate(void* ptr) + { + free(ptr); + } + + template + struct xml_memory_management_function_storage + { + static allocation_function allocate; + static deallocation_function deallocate; + }; + + // Global allocation functions are stored in class statics so that in header mode linker deduplicates them + // Without a template<> we'll get multiple definitions of the same static + template allocation_function xml_memory_management_function_storage::allocate = default_allocate; + template deallocation_function xml_memory_management_function_storage::deallocate = default_deallocate; + + typedef xml_memory_management_function_storage xml_memory; +PUGI__NS_END + +// String utilities +PUGI__NS_BEGIN + // Get string length + PUGI__FN size_t strlength(const char_t* s) + { + assert(s); + + #ifdef PUGIXML_WCHAR_MODE + return wcslen(s); + #else + return strlen(s); + #endif + } + + // Compare two strings + PUGI__FN bool strequal(const char_t* src, const char_t* dst) + { + assert(src && dst); + + #ifdef PUGIXML_WCHAR_MODE + return wcscmp(src, dst) == 0; + #else + return strcmp(src, dst) == 0; + #endif + } + + // Compare lhs with [rhs_begin, rhs_end) + PUGI__FN bool strequalrange(const char_t* lhs, const char_t* rhs, size_t count) + { + for (size_t i = 0; i < count; ++i) + if (lhs[i] != rhs[i]) + return false; + + return lhs[count] == 0; + } + + // Get length of wide string, even if CRT lacks wide character support + PUGI__FN size_t strlength_wide(const wchar_t* s) + { + assert(s); + + #ifdef PUGIXML_WCHAR_MODE + return wcslen(s); + #else + const wchar_t* end = s; + while (*end) end++; + return static_cast(end - s); + #endif + } + +#ifdef 
PUGIXML_WCHAR_MODE + // Convert string to wide string, assuming all symbols are ASCII + PUGI__FN void widen_ascii(wchar_t* dest, const char* source) + { + for (const char* i = source; *i; ++i) *dest++ = *i; + *dest = 0; + } +#endif +PUGI__NS_END + +#if !defined(PUGIXML_NO_STL) || !defined(PUGIXML_NO_XPATH) +// auto_ptr-like buffer holder for exception recovery +PUGI__NS_BEGIN + struct buffer_holder + { + void* data; + void (*deleter)(void*); + + buffer_holder(void* data_, void (*deleter_)(void*)): data(data_), deleter(deleter_) + { + } + + ~buffer_holder() + { + if (data) deleter(data); + } + + void* release() + { + void* result = data; + data = 0; + return result; + } + }; +PUGI__NS_END +#endif + +PUGI__NS_BEGIN + static const size_t xml_memory_page_size = + #ifdef PUGIXML_MEMORY_PAGE_SIZE + PUGIXML_MEMORY_PAGE_SIZE + #else + 32768 + #endif + ; + + static const uintptr_t xml_memory_page_alignment = 64; + static const uintptr_t xml_memory_page_pointer_mask = ~(xml_memory_page_alignment - 1); + static const uintptr_t xml_memory_page_contents_shared_mask = 32; + static const uintptr_t xml_memory_page_name_allocated_mask = 16; + static const uintptr_t xml_memory_page_value_allocated_mask = 8; + static const uintptr_t xml_memory_page_type_mask = 7; + static const uintptr_t xml_memory_page_name_allocated_or_shared_mask = xml_memory_page_name_allocated_mask | xml_memory_page_contents_shared_mask; + static const uintptr_t xml_memory_page_value_allocated_or_shared_mask = xml_memory_page_value_allocated_mask | xml_memory_page_contents_shared_mask; + + #define PUGI__NODETYPE(n) static_cast(((n)->header & impl::xml_memory_page_type_mask) + 1) + + struct xml_allocator; + + struct xml_memory_page + { + static xml_memory_page* construct(void* memory) + { + xml_memory_page* result = static_cast(memory); + + result->allocator = 0; + result->prev = 0; + result->next = 0; + result->busy_size = 0; + result->freed_size = 0; + + return result; + } + + xml_allocator* allocator; + + 
xml_memory_page* prev; + xml_memory_page* next; + + size_t busy_size; + size_t freed_size; + }; + + struct xml_memory_string_header + { + uint16_t page_offset; // offset from page->data + uint16_t full_size; // 0 if string occupies whole page + }; + + struct xml_allocator + { + xml_allocator(xml_memory_page* root): _root(root), _busy_size(root->busy_size) + { + } + + xml_memory_page* allocate_page(size_t data_size) + { + size_t size = sizeof(xml_memory_page) + data_size; + + // allocate block with some alignment, leaving memory for worst-case padding + void* memory = xml_memory::allocate(size + xml_memory_page_alignment); + if (!memory) return 0; + + // align to next page boundary (note: this guarantees at least 1 usable byte before the page) + char* page_memory = reinterpret_cast((reinterpret_cast(memory) + xml_memory_page_alignment) & ~(xml_memory_page_alignment - 1)); + + // prepare page structure + xml_memory_page* page = xml_memory_page::construct(page_memory); + assert(page); + + page->allocator = _root->allocator; + + // record the offset for freeing the memory block + assert(page_memory > memory && page_memory - static_cast(memory) <= 127); + page_memory[-1] = static_cast(page_memory - static_cast(memory)); + + return page; + } + + static void deallocate_page(xml_memory_page* page) + { + char* page_memory = reinterpret_cast(page); + + xml_memory::deallocate(page_memory - page_memory[-1]); + } + + void* allocate_memory_oob(size_t size, xml_memory_page*& out_page); + + void* allocate_memory(size_t size, xml_memory_page*& out_page) + { + if (_busy_size + size > xml_memory_page_size) return allocate_memory_oob(size, out_page); + + void* buf = reinterpret_cast(_root) + sizeof(xml_memory_page) + _busy_size; + + _busy_size += size; + + out_page = _root; + + return buf; + } + + void deallocate_memory(void* ptr, size_t size, xml_memory_page* page) + { + if (page == _root) page->busy_size = _busy_size; + + assert(ptr >= reinterpret_cast(page) + 
sizeof(xml_memory_page) && ptr < reinterpret_cast(page) + sizeof(xml_memory_page) + page->busy_size); + (void)!ptr; + + page->freed_size += size; + assert(page->freed_size <= page->busy_size); + + if (page->freed_size == page->busy_size) + { + if (page->next == 0) + { + assert(_root == page); + + // top page freed, just reset sizes + page->busy_size = page->freed_size = 0; + _busy_size = 0; + } + else + { + assert(_root != page); + assert(page->prev); + + // remove from the list + page->prev->next = page->next; + page->next->prev = page->prev; + + // deallocate + deallocate_page(page); + } + } + } + + char_t* allocate_string(size_t length) + { + PUGI__STATIC_ASSERT(xml_memory_page_size <= (1 << 16)); + + // allocate memory for string and header block + size_t size = sizeof(xml_memory_string_header) + length * sizeof(char_t); + + // round size up to pointer alignment boundary + size_t full_size = (size + (sizeof(void*) - 1)) & ~(sizeof(void*) - 1); + + xml_memory_page* page; + xml_memory_string_header* header = static_cast(allocate_memory(full_size, page)); + + if (!header) return 0; + + // setup header + ptrdiff_t page_offset = reinterpret_cast(header) - reinterpret_cast(page) - sizeof(xml_memory_page); + + assert(page_offset >= 0 && page_offset < (1 << 16)); + header->page_offset = static_cast(page_offset); + + // full_size == 0 for large strings that occupy the whole page + assert(full_size < (1 << 16) || (page->busy_size == full_size && page_offset == 0)); + header->full_size = static_cast(full_size < (1 << 16) ? 
full_size : 0); + + // round-trip through void* to avoid 'cast increases required alignment of target type' warning + // header is guaranteed a pointer-sized alignment, which should be enough for char_t + return static_cast(static_cast(header + 1)); + } + + void deallocate_string(char_t* string) + { + // this function casts pointers through void* to avoid 'cast increases required alignment of target type' warnings + // we're guaranteed the proper (pointer-sized) alignment on the input string if it was allocated via allocate_string + + // get header + xml_memory_string_header* header = static_cast(static_cast(string)) - 1; + assert(header); + + // deallocate + size_t page_offset = sizeof(xml_memory_page) + header->page_offset; + xml_memory_page* page = reinterpret_cast(static_cast(reinterpret_cast(header) - page_offset)); + + // if full_size == 0 then this string occupies the whole page + size_t full_size = header->full_size == 0 ? page->busy_size : header->full_size; + + deallocate_memory(header, full_size, page); + } + + xml_memory_page* _root; + size_t _busy_size; + }; + + PUGI__FN_NO_INLINE void* xml_allocator::allocate_memory_oob(size_t size, xml_memory_page*& out_page) + { + const size_t large_allocation_threshold = xml_memory_page_size / 4; + + xml_memory_page* page = allocate_page(size <= large_allocation_threshold ? 
xml_memory_page_size : size); + out_page = page; + + if (!page) return 0; + + if (size <= large_allocation_threshold) + { + _root->busy_size = _busy_size; + + // insert page at the end of linked list + page->prev = _root; + _root->next = page; + _root = page; + + _busy_size = size; + } + else + { + // insert page before the end of linked list, so that it is deleted as soon as possible + // the last page is not deleted even if it's empty (see deallocate_memory) + assert(_root->prev); + + page->prev = _root->prev; + page->next = _root; + + _root->prev->next = page; + _root->prev = page; + } + + // allocate inside page + page->busy_size = size; + + return reinterpret_cast(page) + sizeof(xml_memory_page); + } +PUGI__NS_END + +namespace pugi +{ + /// A 'name=value' XML attribute structure. + struct xml_attribute_struct + { + /// Default ctor + xml_attribute_struct(impl::xml_memory_page* page): header(reinterpret_cast(page)), name(0), value(0), prev_attribute_c(0), next_attribute(0) + { + } + + uintptr_t header; + + char_t* name; ///< Pointer to attribute name. + char_t* value; ///< Pointer to attribute value. + + xml_attribute_struct* prev_attribute_c; ///< Previous attribute (cyclic list) + xml_attribute_struct* next_attribute; ///< Next attribute + }; + + /// An XML document tree node. + struct xml_node_struct + { + /// Default ctor + /// \param type - node type + xml_node_struct(impl::xml_memory_page* page, xml_node_type type): header(reinterpret_cast(page) | (type - 1)), parent(0), name(0), value(0), first_child(0), prev_sibling_c(0), next_sibling(0), first_attribute(0) + { + } + + uintptr_t header; + + xml_node_struct* parent; ///< Pointer to parent + + char_t* name; ///< Pointer to element name. + char_t* value; ///< Pointer to any associated string data. 
+ + xml_node_struct* first_child; ///< First child + + xml_node_struct* prev_sibling_c; ///< Left brother (cyclic list) + xml_node_struct* next_sibling; ///< Right brother + + xml_attribute_struct* first_attribute; ///< First attribute + }; +} + +PUGI__NS_BEGIN + struct xml_extra_buffer + { + char_t* buffer; + xml_extra_buffer* next; + }; + + struct xml_document_struct: public xml_node_struct, public xml_allocator + { + xml_document_struct(xml_memory_page* page): xml_node_struct(page, node_document), xml_allocator(page), buffer(0), extra_buffers(0) + { + } + + const char_t* buffer; + + xml_extra_buffer* extra_buffers; + }; + + inline xml_allocator& get_allocator(const xml_node_struct* node) + { + assert(node); + + return *reinterpret_cast(node->header & xml_memory_page_pointer_mask)->allocator; + } + + template inline xml_document_struct& get_document(const Object* object) + { + assert(object); + + return *static_cast(reinterpret_cast(object->header & xml_memory_page_pointer_mask)->allocator); + } +PUGI__NS_END + +// Low-level DOM operations +PUGI__NS_BEGIN + // Allocates and constructs an attribute from the document allocator. + // Returns 0 when the allocator cannot provide memory. + inline xml_attribute_struct* allocate_attribute(xml_allocator& alloc) + { + xml_memory_page* page; + void* memory = alloc.allocate_memory(sizeof(xml_attribute_struct), page); + + // allocate_memory yields 0 when a new page cannot be obtained; never construct into a null address + if (!memory) return 0; + + return new (memory) xml_attribute_struct(page); + } + + // Allocates and constructs a node of the given type from the document allocator. + // Returns 0 when the allocator cannot provide memory. + inline xml_node_struct* allocate_node(xml_allocator& alloc, xml_node_type type) + { + xml_memory_page* page; + void* memory = alloc.allocate_memory(sizeof(xml_node_struct), page); + + // allocate_memory yields 0 when a new page cannot be obtained; never construct into a null address + if (!memory) return 0; + + return new (memory) xml_node_struct(page, type); + } + + inline void destroy_attribute(xml_attribute_struct* a, xml_allocator& alloc) + { + uintptr_t header = a->header; + + if (header & impl::xml_memory_page_name_allocated_mask) alloc.deallocate_string(a->name); + if (header & impl::xml_memory_page_value_allocated_mask) alloc.deallocate_string(a->value); + + alloc.deallocate_memory(a, sizeof(xml_attribute_struct), reinterpret_cast(header & xml_memory_page_pointer_mask)); + } + + inline void 
destroy_node(xml_node_struct* n, xml_allocator& alloc) + { + uintptr_t header = n->header; + + if (header & impl::xml_memory_page_name_allocated_mask) alloc.deallocate_string(n->name); + if (header & impl::xml_memory_page_value_allocated_mask) alloc.deallocate_string(n->value); + + for (xml_attribute_struct* attr = n->first_attribute; attr; ) + { + xml_attribute_struct* next = attr->next_attribute; + + destroy_attribute(attr, alloc); + + attr = next; + } + + for (xml_node_struct* child = n->first_child; child; ) + { + xml_node_struct* next = child->next_sibling; + + destroy_node(child, alloc); + + child = next; + } + + alloc.deallocate_memory(n, sizeof(xml_node_struct), reinterpret_cast(header & xml_memory_page_pointer_mask)); + } + + inline void append_node(xml_node_struct* child, xml_node_struct* node) + { + child->parent = node; + + xml_node_struct* head = node->first_child; + + if (head) + { + xml_node_struct* tail = head->prev_sibling_c; + + tail->next_sibling = child; + child->prev_sibling_c = tail; + head->prev_sibling_c = child; + } + else + { + node->first_child = child; + child->prev_sibling_c = child; + } + } + + inline void prepend_node(xml_node_struct* child, xml_node_struct* node) + { + child->parent = node; + + xml_node_struct* head = node->first_child; + + if (head) + { + child->prev_sibling_c = head->prev_sibling_c; + head->prev_sibling_c = child; + } + else + child->prev_sibling_c = child; + + child->next_sibling = head; + node->first_child = child; + } + + inline void insert_node_after(xml_node_struct* child, xml_node_struct* node) + { + xml_node_struct* parent = node->parent; + + child->parent = parent; + + if (node->next_sibling) + node->next_sibling->prev_sibling_c = child; + else + parent->first_child->prev_sibling_c = child; + + child->next_sibling = node->next_sibling; + child->prev_sibling_c = node; + + node->next_sibling = child; + } + + inline void insert_node_before(xml_node_struct* child, xml_node_struct* node) + { + xml_node_struct* 
parent = node->parent; + + child->parent = parent; + + if (node->prev_sibling_c->next_sibling) + node->prev_sibling_c->next_sibling = child; + else + parent->first_child = child; + + child->prev_sibling_c = node->prev_sibling_c; + child->next_sibling = node; + + node->prev_sibling_c = child; + } + + inline void remove_node(xml_node_struct* node) + { + xml_node_struct* parent = node->parent; + + if (node->next_sibling) + node->next_sibling->prev_sibling_c = node->prev_sibling_c; + else + parent->first_child->prev_sibling_c = node->prev_sibling_c; + + if (node->prev_sibling_c->next_sibling) + node->prev_sibling_c->next_sibling = node->next_sibling; + else + parent->first_child = node->next_sibling; + + node->parent = 0; + node->prev_sibling_c = 0; + node->next_sibling = 0; + } + + inline void append_attribute(xml_attribute_struct* attr, xml_node_struct* node) + { + xml_attribute_struct* head = node->first_attribute; + + if (head) + { + xml_attribute_struct* tail = head->prev_attribute_c; + + tail->next_attribute = attr; + attr->prev_attribute_c = tail; + head->prev_attribute_c = attr; + } + else + { + node->first_attribute = attr; + attr->prev_attribute_c = attr; + } + } + + inline void prepend_attribute(xml_attribute_struct* attr, xml_node_struct* node) + { + xml_attribute_struct* head = node->first_attribute; + + if (head) + { + attr->prev_attribute_c = head->prev_attribute_c; + head->prev_attribute_c = attr; + } + else + attr->prev_attribute_c = attr; + + attr->next_attribute = head; + node->first_attribute = attr; + } + + inline void insert_attribute_after(xml_attribute_struct* attr, xml_attribute_struct* place, xml_node_struct* node) + { + if (place->next_attribute) + place->next_attribute->prev_attribute_c = attr; + else + node->first_attribute->prev_attribute_c = attr; + + attr->next_attribute = place->next_attribute; + attr->prev_attribute_c = place; + place->next_attribute = attr; + } + + inline void insert_attribute_before(xml_attribute_struct* attr, 
xml_attribute_struct* place, xml_node_struct* node) + { + if (place->prev_attribute_c->next_attribute) + place->prev_attribute_c->next_attribute = attr; + else + node->first_attribute = attr; + + attr->prev_attribute_c = place->prev_attribute_c; + attr->next_attribute = place; + place->prev_attribute_c = attr; + } + + inline void remove_attribute(xml_attribute_struct* attr, xml_node_struct* node) + { + if (attr->next_attribute) + attr->next_attribute->prev_attribute_c = attr->prev_attribute_c; + else + node->first_attribute->prev_attribute_c = attr->prev_attribute_c; + + if (attr->prev_attribute_c->next_attribute) + attr->prev_attribute_c->next_attribute = attr->next_attribute; + else + node->first_attribute = attr->next_attribute; + + attr->prev_attribute_c = 0; + attr->next_attribute = 0; + } + + PUGI__FN_NO_INLINE xml_node_struct* append_new_node(xml_node_struct* node, xml_allocator& alloc, xml_node_type type = node_element) + { + xml_node_struct* child = allocate_node(alloc, type); + if (!child) return 0; + + append_node(child, node); + + return child; + } + + PUGI__FN_NO_INLINE xml_attribute_struct* append_new_attribute(xml_node_struct* node, xml_allocator& alloc) + { + xml_attribute_struct* attr = allocate_attribute(alloc); + if (!attr) return 0; + + append_attribute(attr, node); + + return attr; + } +PUGI__NS_END + +// Helper classes for code generation +PUGI__NS_BEGIN + struct opt_false + { + enum { value = 0 }; + }; + + struct opt_true + { + enum { value = 1 }; + }; +PUGI__NS_END + +// Unicode utilities +PUGI__NS_BEGIN + inline uint16_t endian_swap(uint16_t value) + { + return static_cast(((value & 0xff) << 8) | (value >> 8)); + } + + inline uint32_t endian_swap(uint32_t value) + { + return ((value & 0xff) << 24) | ((value & 0xff00) << 8) | ((value & 0xff0000) >> 8) | (value >> 24); + } + + struct utf8_counter + { + typedef size_t value_type; + + static value_type low(value_type result, uint32_t ch) + { + // U+0000..U+007F + if (ch < 0x80) return result + 
1; + // U+0080..U+07FF + else if (ch < 0x800) return result + 2; + // U+0800..U+FFFF + else return result + 3; + } + + static value_type high(value_type result, uint32_t) + { + // U+10000..U+10FFFF + return result + 4; + } + }; + + struct utf8_writer + { + typedef uint8_t* value_type; + + static value_type low(value_type result, uint32_t ch) + { + // U+0000..U+007F + if (ch < 0x80) + { + *result = static_cast(ch); + return result + 1; + } + // U+0080..U+07FF + else if (ch < 0x800) + { + result[0] = static_cast(0xC0 | (ch >> 6)); + result[1] = static_cast(0x80 | (ch & 0x3F)); + return result + 2; + } + // U+0800..U+FFFF + else + { + result[0] = static_cast(0xE0 | (ch >> 12)); + result[1] = static_cast(0x80 | ((ch >> 6) & 0x3F)); + result[2] = static_cast(0x80 | (ch & 0x3F)); + return result + 3; + } + } + + static value_type high(value_type result, uint32_t ch) + { + // U+10000..U+10FFFF + result[0] = static_cast(0xF0 | (ch >> 18)); + result[1] = static_cast(0x80 | ((ch >> 12) & 0x3F)); + result[2] = static_cast(0x80 | ((ch >> 6) & 0x3F)); + result[3] = static_cast(0x80 | (ch & 0x3F)); + return result + 4; + } + + static value_type any(value_type result, uint32_t ch) + { + return (ch < 0x10000) ? 
low(result, ch) : high(result, ch); + } + }; + + struct utf16_counter + { + typedef size_t value_type; + + static value_type low(value_type result, uint32_t) + { + return result + 1; + } + + static value_type high(value_type result, uint32_t) + { + return result + 2; + } + }; + + struct utf16_writer + { + typedef uint16_t* value_type; + + static value_type low(value_type result, uint32_t ch) + { + *result = static_cast(ch); + + return result + 1; + } + + static value_type high(value_type result, uint32_t ch) + { + uint32_t msh = static_cast(ch - 0x10000) >> 10; + uint32_t lsh = static_cast(ch - 0x10000) & 0x3ff; + + result[0] = static_cast(0xD800 + msh); + result[1] = static_cast(0xDC00 + lsh); + + return result + 2; + } + + static value_type any(value_type result, uint32_t ch) + { + return (ch < 0x10000) ? low(result, ch) : high(result, ch); + } + }; + + struct utf32_counter + { + typedef size_t value_type; + + static value_type low(value_type result, uint32_t) + { + return result + 1; + } + + static value_type high(value_type result, uint32_t) + { + return result + 1; + } + }; + + struct utf32_writer + { + typedef uint32_t* value_type; + + static value_type low(value_type result, uint32_t ch) + { + *result = ch; + + return result + 1; + } + + static value_type high(value_type result, uint32_t ch) + { + *result = ch; + + return result + 1; + } + + static value_type any(value_type result, uint32_t ch) + { + *result = ch; + + return result + 1; + } + }; + + struct latin1_writer + { + typedef uint8_t* value_type; + + static value_type low(value_type result, uint32_t ch) + { + *result = static_cast(ch > 255 ? '?' 
: ch); + + return result + 1; + } + + static value_type high(value_type result, uint32_t ch) + { + (void)ch; + + *result = '?'; + + return result + 1; + } + }; + + template struct wchar_selector; + + template <> struct wchar_selector<2> + { + typedef uint16_t type; + typedef utf16_counter counter; + typedef utf16_writer writer; + }; + + template <> struct wchar_selector<4> + { + typedef uint32_t type; + typedef utf32_counter counter; + typedef utf32_writer writer; + }; + + typedef wchar_selector::counter wchar_counter; + typedef wchar_selector::writer wchar_writer; + + template struct utf_decoder + { + static inline typename Traits::value_type decode_utf8_block(const uint8_t* data, size_t size, typename Traits::value_type result) + { + const uint8_t utf8_byte_mask = 0x3f; + + while (size) + { + uint8_t lead = *data; + + // 0xxxxxxx -> U+0000..U+007F + if (lead < 0x80) + { + result = Traits::low(result, lead); + data += 1; + size -= 1; + + // process aligned single-byte (ascii) blocks + if ((reinterpret_cast(data) & 3) == 0) + { + // round-trip through void* to silence 'cast increases required alignment of target type' warnings + while (size >= 4 && (*static_cast(static_cast(data)) & 0x80808080) == 0) + { + result = Traits::low(result, data[0]); + result = Traits::low(result, data[1]); + result = Traits::low(result, data[2]); + result = Traits::low(result, data[3]); + data += 4; + size -= 4; + } + } + } + // 110xxxxx -> U+0080..U+07FF + else if (static_cast(lead - 0xC0) < 0x20 && size >= 2 && (data[1] & 0xc0) == 0x80) + { + result = Traits::low(result, ((lead & ~0xC0) << 6) | (data[1] & utf8_byte_mask)); + data += 2; + size -= 2; + } + // 1110xxxx -> U+0800-U+FFFF + else if (static_cast(lead - 0xE0) < 0x10 && size >= 3 && (data[1] & 0xc0) == 0x80 && (data[2] & 0xc0) == 0x80) + { + result = Traits::low(result, ((lead & ~0xE0) << 12) | ((data[1] & utf8_byte_mask) << 6) | (data[2] & utf8_byte_mask)); + data += 3; + size -= 3; + } + // 11110xxx -> U+10000..U+10FFFF + 
else if (static_cast(lead - 0xF0) < 0x08 && size >= 4 && (data[1] & 0xc0) == 0x80 && (data[2] & 0xc0) == 0x80 && (data[3] & 0xc0) == 0x80) + { + result = Traits::high(result, ((lead & ~0xF0) << 18) | ((data[1] & utf8_byte_mask) << 12) | ((data[2] & utf8_byte_mask) << 6) | (data[3] & utf8_byte_mask)); + data += 4; + size -= 4; + } + // 10xxxxxx or 11111xxx -> invalid + else + { + data += 1; + size -= 1; + } + } + + return result; + } + + static inline typename Traits::value_type decode_utf16_block(const uint16_t* data, size_t size, typename Traits::value_type result) + { + const uint16_t* end = data + size; + + while (data < end) + { + unsigned int lead = opt_swap::value ? endian_swap(*data) : *data; + + // U+0000..U+D7FF + if (lead < 0xD800) + { + result = Traits::low(result, lead); + data += 1; + } + // U+E000..U+FFFF + else if (static_cast(lead - 0xE000) < 0x2000) + { + result = Traits::low(result, lead); + data += 1; + } + // surrogate pair lead + else if (static_cast(lead - 0xD800) < 0x400 && data + 1 < end) + { + uint16_t next = opt_swap::value ? endian_swap(data[1]) : data[1]; + + if (static_cast(next - 0xDC00) < 0x400) + { + result = Traits::high(result, 0x10000 + ((lead & 0x3ff) << 10) + (next & 0x3ff)); + data += 2; + } + else + { + data += 1; + } + } + else + { + data += 1; + } + } + + return result; + } + + static inline typename Traits::value_type decode_utf32_block(const uint32_t* data, size_t size, typename Traits::value_type result) + { + const uint32_t* end = data + size; + + while (data < end) + { + uint32_t lead = opt_swap::value ? 
endian_swap(*data) : *data; + + // U+0000..U+FFFF + if (lead < 0x10000) + { + result = Traits::low(result, lead); + data += 1; + } + // U+10000..U+10FFFF + else + { + result = Traits::high(result, lead); + data += 1; + } + } + + return result; + } + + static inline typename Traits::value_type decode_latin1_block(const uint8_t* data, size_t size, typename Traits::value_type result) + { + for (size_t i = 0; i < size; ++i) + { + result = Traits::low(result, data[i]); + } + + return result; + } + + static inline typename Traits::value_type decode_wchar_block_impl(const uint16_t* data, size_t size, typename Traits::value_type result) + { + return decode_utf16_block(data, size, result); + } + + static inline typename Traits::value_type decode_wchar_block_impl(const uint32_t* data, size_t size, typename Traits::value_type result) + { + return decode_utf32_block(data, size, result); + } + + static inline typename Traits::value_type decode_wchar_block(const wchar_t* data, size_t size, typename Traits::value_type result) + { + return decode_wchar_block_impl(reinterpret_cast::type*>(data), size, result); + } + }; + + template PUGI__FN void convert_utf_endian_swap(T* result, const T* data, size_t length) + { + for (size_t i = 0; i < length; ++i) result[i] = endian_swap(data[i]); + } + +#ifdef PUGIXML_WCHAR_MODE + PUGI__FN void convert_wchar_endian_swap(wchar_t* result, const wchar_t* data, size_t length) + { + for (size_t i = 0; i < length; ++i) result[i] = static_cast(endian_swap(static_cast::type>(data[i]))); + } +#endif +PUGI__NS_END + +PUGI__NS_BEGIN + enum chartype_t + { + ct_parse_pcdata = 1, // \0, &, \r, < + ct_parse_attr = 2, // \0, &, \r, ', " + ct_parse_attr_ws = 4, // \0, &, \r, ', ", \n, tab + ct_space = 8, // \r, \n, space, tab + ct_parse_cdata = 16, // \0, ], >, \r + ct_parse_comment = 32, // \0, -, >, \r + ct_symbol = 64, // Any symbol > 127, a-z, A-Z, 0-9, _, :, -, . 
+ ct_start_symbol = 128 // Any symbol > 127, a-z, A-Z, _, : + }; + + static const unsigned char chartype_table[256] = + { + 55, 0, 0, 0, 0, 0, 0, 0, 0, 12, 12, 0, 0, 63, 0, 0, // 0-15 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 16-31 + 8, 0, 6, 0, 0, 0, 7, 6, 0, 0, 0, 0, 0, 96, 64, 0, // 32-47 + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 192, 0, 1, 0, 48, 0, // 48-63 + 0, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, // 64-79 + 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 0, 0, 16, 0, 192, // 80-95 + 0, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, // 96-111 + 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 0, 0, 0, 0, 0, // 112-127 + + 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, // 128+ + 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192 + }; + + enum chartypex_t + { + ctx_special_pcdata = 1, // Any symbol >= 0 and < 32 (except \t, \r, \n), &, <, > + ctx_special_attr = 2, // Any symbol >= 0 and < 32 (except \t), &, <, >, " + ctx_start_symbol = 4, // Any symbol > 127, a-z, A-Z, _ + ctx_digit = 8, // 0-9 + ctx_symbol = 16 // Any symbol > 127, a-z, A-Z, 0-9, _, -, . 
+ }; + + static const unsigned char chartypex_table[256] = + { + 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 2, 3, 3, 2, 3, 3, // 0-15 + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 16-31 + 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 16, 16, 0, // 32-47 + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 3, 0, 3, 0, // 48-63 + + 0, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, // 64-79 + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 0, 0, 0, 0, 20, // 80-95 + 0, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, // 96-111 + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 0, 0, 0, 0, 0, // 112-127 + + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, // 128+ + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20 + }; + +#ifdef PUGIXML_WCHAR_MODE + #define PUGI__IS_CHARTYPE_IMPL(c, ct, table) ((static_cast(c) < 128 ? table[static_cast(c)] : table[128]) & (ct)) +#else + #define PUGI__IS_CHARTYPE_IMPL(c, ct, table) (table[static_cast(c)] & (ct)) +#endif + + #define PUGI__IS_CHARTYPE(c, ct) PUGI__IS_CHARTYPE_IMPL(c, ct, chartype_table) + #define PUGI__IS_CHARTYPEX(c, ct) PUGI__IS_CHARTYPE_IMPL(c, ct, chartypex_table) + + PUGI__FN bool is_little_endian() + { + unsigned int ui = 1; + + return *reinterpret_cast(&ui) == 1; + } + + PUGI__FN xml_encoding get_wchar_encoding() + { + PUGI__STATIC_ASSERT(sizeof(wchar_t) == 2 || sizeof(wchar_t) == 4); + + if (sizeof(wchar_t) == 2) + return is_little_endian() ? encoding_utf16_le : encoding_utf16_be; + else + return is_little_endian() ? 
encoding_utf32_le : encoding_utf32_be; + } + + PUGI__FN xml_encoding guess_buffer_encoding(uint8_t d0, uint8_t d1, uint8_t d2, uint8_t d3) + { + // look for BOM in first few bytes + if (d0 == 0 && d1 == 0 && d2 == 0xfe && d3 == 0xff) return encoding_utf32_be; + if (d0 == 0xff && d1 == 0xfe && d2 == 0 && d3 == 0) return encoding_utf32_le; + if (d0 == 0xfe && d1 == 0xff) return encoding_utf16_be; + if (d0 == 0xff && d1 == 0xfe) return encoding_utf16_le; + if (d0 == 0xef && d1 == 0xbb && d2 == 0xbf) return encoding_utf8; + + // look for <, (contents); + + PUGI__DMC_VOLATILE uint8_t d0 = data[0], d1 = data[1], d2 = data[2], d3 = data[3]; + + return guess_buffer_encoding(d0, d1, d2, d3); + } + + PUGI__FN bool get_mutable_buffer(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, bool is_mutable) + { + size_t length = size / sizeof(char_t); + + if (is_mutable) + { + out_buffer = static_cast(const_cast(contents)); + out_length = length; + } + else + { + char_t* buffer = static_cast(xml_memory::allocate((length + 1) * sizeof(char_t))); + if (!buffer) return false; + + if (contents) + memcpy(buffer, contents, length * sizeof(char_t)); + else + assert(length == 0); + + buffer[length] = 0; + + out_buffer = buffer; + out_length = length + 1; + } + + return true; + } + +#ifdef PUGIXML_WCHAR_MODE + PUGI__FN bool need_endian_swap_utf(xml_encoding le, xml_encoding re) + { + return (le == encoding_utf16_be && re == encoding_utf16_le) || (le == encoding_utf16_le && re == encoding_utf16_be) || + (le == encoding_utf32_be && re == encoding_utf32_le) || (le == encoding_utf32_le && re == encoding_utf32_be); + } + + PUGI__FN bool convert_buffer_endian_swap(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, bool is_mutable) + { + const char_t* data = static_cast(contents); + size_t length = size / sizeof(char_t); + + if (is_mutable) + { + char_t* buffer = const_cast(data); + + convert_wchar_endian_swap(buffer, data, length); + + out_buffer = 
buffer; + out_length = length; + } + else + { + char_t* buffer = static_cast(xml_memory::allocate((length + 1) * sizeof(char_t))); + if (!buffer) return false; + + convert_wchar_endian_swap(buffer, data, length); + buffer[length] = 0; + + out_buffer = buffer; + out_length = length + 1; + } + + return true; + } + + PUGI__FN bool convert_buffer_utf8(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size) + { + const uint8_t* data = static_cast(contents); + size_t data_length = size; + + // first pass: get length in wchar_t units + size_t length = utf_decoder::decode_utf8_block(data, data_length, 0); + + // allocate buffer of suitable length + char_t* buffer = static_cast(xml_memory::allocate((length + 1) * sizeof(char_t))); + if (!buffer) return false; + + // second pass: convert utf8 input to wchar_t + wchar_writer::value_type obegin = reinterpret_cast(buffer); + wchar_writer::value_type oend = utf_decoder::decode_utf8_block(data, data_length, obegin); + + assert(oend == obegin + length); + *oend = 0; + + out_buffer = buffer; + out_length = length + 1; + + return true; + } + + template PUGI__FN bool convert_buffer_utf16(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, opt_swap) + { + const uint16_t* data = static_cast(contents); + size_t data_length = size / sizeof(uint16_t); + + // first pass: get length in wchar_t units + size_t length = utf_decoder::decode_utf16_block(data, data_length, 0); + + // allocate buffer of suitable length + char_t* buffer = static_cast(xml_memory::allocate((length + 1) * sizeof(char_t))); + if (!buffer) return false; + + // second pass: convert utf16 input to wchar_t + wchar_writer::value_type obegin = reinterpret_cast(buffer); + wchar_writer::value_type oend = utf_decoder::decode_utf16_block(data, data_length, obegin); + + assert(oend == obegin + length); + *oend = 0; + + out_buffer = buffer; + out_length = length + 1; + + return true; + } + + template PUGI__FN bool 
convert_buffer_utf32(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, opt_swap) + { + const uint32_t* data = static_cast(contents); + size_t data_length = size / sizeof(uint32_t); + + // first pass: get length in wchar_t units + size_t length = utf_decoder::decode_utf32_block(data, data_length, 0); + + // allocate buffer of suitable length + char_t* buffer = static_cast(xml_memory::allocate((length + 1) * sizeof(char_t))); + if (!buffer) return false; + + // second pass: convert utf32 input to wchar_t + wchar_writer::value_type obegin = reinterpret_cast(buffer); + wchar_writer::value_type oend = utf_decoder::decode_utf32_block(data, data_length, obegin); + + assert(oend == obegin + length); + *oend = 0; + + out_buffer = buffer; + out_length = length + 1; + + return true; + } + + PUGI__FN bool convert_buffer_latin1(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size) + { + const uint8_t* data = static_cast(contents); + size_t data_length = size; + + // get length in wchar_t units + size_t length = data_length; + + // allocate buffer of suitable length + char_t* buffer = static_cast(xml_memory::allocate((length + 1) * sizeof(char_t))); + if (!buffer) return false; + + // convert latin1 input to wchar_t + wchar_writer::value_type obegin = reinterpret_cast(buffer); + wchar_writer::value_type oend = utf_decoder::decode_latin1_block(data, data_length, obegin); + + assert(oend == obegin + length); + *oend = 0; + + out_buffer = buffer; + out_length = length + 1; + + return true; + } + + PUGI__FN bool convert_buffer(char_t*& out_buffer, size_t& out_length, xml_encoding encoding, const void* contents, size_t size, bool is_mutable) + { + // get native encoding + xml_encoding wchar_encoding = get_wchar_encoding(); + + // fast path: no conversion required + if (encoding == wchar_encoding) return get_mutable_buffer(out_buffer, out_length, contents, size, is_mutable); + + // only endian-swapping is required + if 
(need_endian_swap_utf(encoding, wchar_encoding)) return convert_buffer_endian_swap(out_buffer, out_length, contents, size, is_mutable); + + // source encoding is utf8 + if (encoding == encoding_utf8) return convert_buffer_utf8(out_buffer, out_length, contents, size); + + // source encoding is utf16 + if (encoding == encoding_utf16_be || encoding == encoding_utf16_le) + { + xml_encoding native_encoding = is_little_endian() ? encoding_utf16_le : encoding_utf16_be; + + return (native_encoding == encoding) ? + convert_buffer_utf16(out_buffer, out_length, contents, size, opt_false()) : + convert_buffer_utf16(out_buffer, out_length, contents, size, opt_true()); + } + + // source encoding is utf32 + if (encoding == encoding_utf32_be || encoding == encoding_utf32_le) + { + xml_encoding native_encoding = is_little_endian() ? encoding_utf32_le : encoding_utf32_be; + + return (native_encoding == encoding) ? + convert_buffer_utf32(out_buffer, out_length, contents, size, opt_false()) : + convert_buffer_utf32(out_buffer, out_length, contents, size, opt_true()); + } + + // source encoding is latin1 + if (encoding == encoding_latin1) return convert_buffer_latin1(out_buffer, out_length, contents, size); + + assert(!"Invalid encoding"); + return false; + } +#else + template PUGI__FN bool convert_buffer_utf16(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, opt_swap) + { + const uint16_t* data = static_cast(contents); + size_t data_length = size / sizeof(uint16_t); + + // first pass: get length in utf8 units + size_t length = utf_decoder::decode_utf16_block(data, data_length, 0); + + // allocate buffer of suitable length + char_t* buffer = static_cast(xml_memory::allocate((length + 1) * sizeof(char_t))); + if (!buffer) return false; + + // second pass: convert utf16 input to utf8 + uint8_t* obegin = reinterpret_cast(buffer); + uint8_t* oend = utf_decoder::decode_utf16_block(data, data_length, obegin); + + assert(oend == obegin + length); + *oend = 0; + + 
out_buffer = buffer; + out_length = length + 1; + + return true; + } + + template PUGI__FN bool convert_buffer_utf32(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, opt_swap) + { + const uint32_t* data = static_cast(contents); + size_t data_length = size / sizeof(uint32_t); + + // first pass: get length in utf8 units + size_t length = utf_decoder::decode_utf32_block(data, data_length, 0); + + // allocate buffer of suitable length + char_t* buffer = static_cast(xml_memory::allocate((length + 1) * sizeof(char_t))); + if (!buffer) return false; + + // second pass: convert utf32 input to utf8 + uint8_t* obegin = reinterpret_cast(buffer); + uint8_t* oend = utf_decoder::decode_utf32_block(data, data_length, obegin); + + assert(oend == obegin + length); + *oend = 0; + + out_buffer = buffer; + out_length = length + 1; + + return true; + } + + PUGI__FN size_t get_latin1_7bit_prefix_length(const uint8_t* data, size_t size) + { + for (size_t i = 0; i < size; ++i) + if (data[i] > 127) + return i; + + return size; + } + + PUGI__FN bool convert_buffer_latin1(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, bool is_mutable) + { + const uint8_t* data = static_cast(contents); + size_t data_length = size; + + // get size of prefix that does not need utf8 conversion + size_t prefix_length = get_latin1_7bit_prefix_length(data, data_length); + assert(prefix_length <= data_length); + + const uint8_t* postfix = data + prefix_length; + size_t postfix_length = data_length - prefix_length; + + // if no conversion is needed, just return the original buffer + if (postfix_length == 0) return get_mutable_buffer(out_buffer, out_length, contents, size, is_mutable); + + // first pass: get length in utf8 units + size_t length = prefix_length + utf_decoder::decode_latin1_block(postfix, postfix_length, 0); + + // allocate buffer of suitable length + char_t* buffer = static_cast(xml_memory::allocate((length + 1) * sizeof(char_t))); + if (!buffer) 
return false; + + // second pass: convert latin1 input to utf8 + memcpy(buffer, data, prefix_length); + + uint8_t* obegin = reinterpret_cast(buffer); + uint8_t* oend = utf_decoder::decode_latin1_block(postfix, postfix_length, obegin + prefix_length); + + assert(oend == obegin + length); + *oend = 0; + + out_buffer = buffer; + out_length = length + 1; + + return true; + } + + PUGI__FN bool convert_buffer(char_t*& out_buffer, size_t& out_length, xml_encoding encoding, const void* contents, size_t size, bool is_mutable) + { + // fast path: no conversion required + if (encoding == encoding_utf8) return get_mutable_buffer(out_buffer, out_length, contents, size, is_mutable); + + // source encoding is utf16 + if (encoding == encoding_utf16_be || encoding == encoding_utf16_le) + { + xml_encoding native_encoding = is_little_endian() ? encoding_utf16_le : encoding_utf16_be; + + return (native_encoding == encoding) ? + convert_buffer_utf16(out_buffer, out_length, contents, size, opt_false()) : + convert_buffer_utf16(out_buffer, out_length, contents, size, opt_true()); + } + + // source encoding is utf32 + if (encoding == encoding_utf32_be || encoding == encoding_utf32_le) + { + xml_encoding native_encoding = is_little_endian() ? encoding_utf32_le : encoding_utf32_be; + + return (native_encoding == encoding) ? 
+ convert_buffer_utf32(out_buffer, out_length, contents, size, opt_false()) : + convert_buffer_utf32(out_buffer, out_length, contents, size, opt_true()); + } + + // source encoding is latin1 + if (encoding == encoding_latin1) return convert_buffer_latin1(out_buffer, out_length, contents, size, is_mutable); + + assert(!"Invalid encoding"); + return false; + } +#endif + + PUGI__FN size_t as_utf8_begin(const wchar_t* str, size_t length) + { + // get length in utf8 characters + return utf_decoder::decode_wchar_block(str, length, 0); + } + + PUGI__FN void as_utf8_end(char* buffer, size_t size, const wchar_t* str, size_t length) + { + // convert to utf8 + uint8_t* begin = reinterpret_cast(buffer); + uint8_t* end = utf_decoder::decode_wchar_block(str, length, begin); + + assert(begin + size == end); + (void)!end; + + // zero-terminate + buffer[size] = 0; + } + +#ifndef PUGIXML_NO_STL + PUGI__FN std::string as_utf8_impl(const wchar_t* str, size_t length) + { + // first pass: get length in utf8 characters + size_t size = as_utf8_begin(str, length); + + // allocate resulting string + std::string result; + result.resize(size); + + // second pass: convert to utf8 + if (size > 0) as_utf8_end(&result[0], size, str, length); + + return result; + } + + PUGI__FN std::basic_string as_wide_impl(const char* str, size_t size) + { + const uint8_t* data = reinterpret_cast(str); + + // first pass: get length in wchar_t units + size_t length = utf_decoder::decode_utf8_block(data, size, 0); + + // allocate resulting string + std::basic_string result; + result.resize(length); + + // second pass: convert to wchar_t + if (length > 0) + { + wchar_writer::value_type begin = reinterpret_cast(&result[0]); + wchar_writer::value_type end = utf_decoder::decode_utf8_block(data, size, begin); + + assert(begin + length == end); + (void)!end; + } + + return result; + } +#endif + + inline bool strcpy_insitu_allow(size_t length, uintptr_t header, uintptr_t header_mask, char_t* target) + { + // never reuse 
shared memory + if (header & xml_memory_page_contents_shared_mask) return false; + + size_t target_length = strlength(target); + + // always reuse document buffer memory if possible + if ((header & header_mask) == 0) return target_length >= length; + + // reuse heap memory if waste is not too great + const size_t reuse_threshold = 32; + + return target_length >= length && (target_length < reuse_threshold || target_length - length < target_length / 2); + } + + PUGI__FN bool strcpy_insitu(char_t*& dest, uintptr_t& header, uintptr_t header_mask, const char_t* source) + { + assert(header); + + size_t source_length = strlength(source); + + if (source_length == 0) + { + // empty string and null pointer are equivalent, so just deallocate old memory + xml_allocator* alloc = reinterpret_cast(header & xml_memory_page_pointer_mask)->allocator; + + if (header & header_mask) alloc->deallocate_string(dest); + + // mark the string as not allocated + dest = 0; + header &= ~header_mask; + + return true; + } + else if (dest && strcpy_insitu_allow(source_length, header, header_mask, dest)) + { + // we can reuse old buffer, so just copy the new data (including zero terminator) + memcpy(dest, source, (source_length + 1) * sizeof(char_t)); + + return true; + } + else + { + xml_allocator* alloc = reinterpret_cast(header & xml_memory_page_pointer_mask)->allocator; + + // allocate new buffer + char_t* buf = alloc->allocate_string(source_length + 1); + if (!buf) return false; + + // copy the string (including zero terminator) + memcpy(buf, source, (source_length + 1) * sizeof(char_t)); + + // deallocate old buffer (*after* the above to protect against overlapping memory and/or allocation failures) + if (header & header_mask) alloc->deallocate_string(dest); + + // the string is now allocated, so set the flag + dest = buf; + header |= header_mask; + + return true; + } + } + + struct gap + { + char_t* end; + size_t size; + + gap(): end(0), size(0) + { + } + + // Push new gap, move s count 
bytes further (skipping the gap). + // Collapse previous gap. + void push(char_t*& s, size_t count) + { + if (end) // there was a gap already; collapse it + { + // Move [old_gap_end, new_gap_start) to [old_gap_start, ...) + assert(s >= end); + memmove(end - size, end, reinterpret_cast(s) - reinterpret_cast(end)); + } + + s += count; // end of current gap + + // "merge" two gaps + end = s; + size += count; + } + + // Collapse all gaps, return past-the-end pointer + char_t* flush(char_t* s) + { + if (end) + { + // Move [old_gap_end, current_pos) to [old_gap_start, ...) + assert(s >= end); + memmove(end - size, end, reinterpret_cast(s) - reinterpret_cast(end)); + + return s - size; + } + else return s; + } + }; + + PUGI__FN char_t* strconv_escape(char_t* s, gap& g) + { + char_t* stre = s + 1; + + switch (*stre) + { + case '#': // &#... + { + unsigned int ucsc = 0; + + if (stre[1] == 'x') // &#x... (hex code) + { + stre += 2; + + char_t ch = *stre; + + if (ch == ';') return stre; + + for (;;) + { + if (static_cast(ch - '0') <= 9) + ucsc = 16 * ucsc + (ch - '0'); + else if (static_cast((ch | ' ') - 'a') <= 5) + ucsc = 16 * ucsc + ((ch | ' ') - 'a' + 10); + else if (ch == ';') + break; + else // cancel + return stre; + + ch = *++stre; + } + + ++stre; + } + else // &#... 
(dec code) + { + char_t ch = *++stre; + + if (ch == ';') return stre; + + for (;;) + { + if (static_cast(static_cast(ch) - '0') <= 9) + ucsc = 10 * ucsc + (ch - '0'); + else if (ch == ';') + break; + else // cancel + return stre; + + ch = *++stre; + } + + ++stre; + } + + #ifdef PUGIXML_WCHAR_MODE + s = reinterpret_cast(wchar_writer::any(reinterpret_cast(s), ucsc)); + #else + s = reinterpret_cast(utf8_writer::any(reinterpret_cast(s), ucsc)); + #endif + + g.push(s, stre - s); + return stre; + } + + case 'a': // &a + { + ++stre; + + if (*stre == 'm') // &am + { + if (*++stre == 'p' && *++stre == ';') // & + { + *s++ = '&'; + ++stre; + + g.push(s, stre - s); + return stre; + } + } + else if (*stre == 'p') // &ap + { + if (*++stre == 'o' && *++stre == 's' && *++stre == ';') // ' + { + *s++ = '\''; + ++stre; + + g.push(s, stre - s); + return stre; + } + } + break; + } + + case 'g': // &g + { + if (*++stre == 't' && *++stre == ';') // > + { + *s++ = '>'; + ++stre; + + g.push(s, stre - s); + return stre; + } + break; + } + + case 'l': // &l + { + if (*++stre == 't' && *++stre == ';') // < + { + *s++ = '<'; + ++stre; + + g.push(s, stre - s); + return stre; + } + break; + } + + case 'q': // &q + { + if (*++stre == 'u' && *++stre == 'o' && *++stre == 't' && *++stre == ';') // " + { + *s++ = '"'; + ++stre; + + g.push(s, stre - s); + return stre; + } + break; + } + + default: + break; + } + + return stre; + } + + // Parser utilities + #define PUGI__ENDSWITH(c, e) ((c) == (e) || ((c) == 0 && endch == (e))) + #define PUGI__SKIPWS() { while (PUGI__IS_CHARTYPE(*s, ct_space)) ++s; } + #define PUGI__OPTSET(OPT) ( optmsk & (OPT) ) + #define PUGI__PUSHNODE(TYPE) { cursor = append_new_node(cursor, alloc, TYPE); if (!cursor) PUGI__THROW_ERROR(status_out_of_memory, s); } + #define PUGI__POPNODE() { cursor = cursor->parent; } + #define PUGI__SCANFOR(X) { while (*s != 0 && !(X)) ++s; } + #define PUGI__SCANWHILE(X) { while (X) ++s; } + #define PUGI__SCANWHILE_UNROLL(X) { for (;;) { char_t ss 
= s[0]; if (PUGI__UNLIKELY(!(X))) { break; } ss = s[1]; if (PUGI__UNLIKELY(!(X))) { s += 1; break; } ss = s[2]; if (PUGI__UNLIKELY(!(X))) { s += 2; break; } ss = s[3]; if (PUGI__UNLIKELY(!(X))) { s += 3; break; } s += 4; } } + #define PUGI__ENDSEG() { ch = *s; *s = 0; ++s; } + #define PUGI__THROW_ERROR(err, m) return error_offset = m, error_status = err, static_cast(0) + #define PUGI__CHECK_ERROR(err, m) { if (*s == 0) PUGI__THROW_ERROR(err, m); } + + PUGI__FN char_t* strconv_comment(char_t* s, char_t endch) + { + gap g; + + while (true) + { + PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPE(ss, ct_parse_comment)); + + if (*s == '\r') // Either a single 0x0d or 0x0d 0x0a pair + { + *s++ = '\n'; // replace first one with 0x0a + + if (*s == '\n') g.push(s, 1); + } + else if (s[0] == '-' && s[1] == '-' && PUGI__ENDSWITH(s[2], '>')) // comment ends here + { + *g.flush(s) = 0; + + return s + (s[2] == '>' ? 3 : 2); + } + else if (*s == 0) + { + return 0; + } + else ++s; + } + } + + PUGI__FN char_t* strconv_cdata(char_t* s, char_t endch) + { + gap g; + + while (true) + { + PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPE(ss, ct_parse_cdata)); + + if (*s == '\r') // Either a single 0x0d or 0x0d 0x0a pair + { + *s++ = '\n'; // replace first one with 0x0a + + if (*s == '\n') g.push(s, 1); + } + else if (s[0] == ']' && s[1] == ']' && PUGI__ENDSWITH(s[2], '>')) // CDATA ends here + { + *g.flush(s) = 0; + + return s + 1; + } + else if (*s == 0) + { + return 0; + } + else ++s; + } + } + + typedef char_t* (*strconv_pcdata_t)(char_t*); + + template struct strconv_pcdata_impl + { + static char_t* parse(char_t* s) + { + gap g; + + char_t* begin = s; + + while (true) + { + PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPE(ss, ct_parse_pcdata)); + + if (*s == '<') // PCDATA ends here + { + char_t* end = g.flush(s); + + if (opt_trim::value) + while (end > begin && PUGI__IS_CHARTYPE(end[-1], ct_space)) + --end; + + *end = 0; + + return s + 1; + } + else if (opt_eol::value && *s == '\r') // Either a single 
0x0d or 0x0d 0x0a pair + { + *s++ = '\n'; // replace first one with 0x0a + + if (*s == '\n') g.push(s, 1); + } + else if (opt_escape::value && *s == '&') + { + s = strconv_escape(s, g); + } + else if (*s == 0) + { + char_t* end = g.flush(s); + + if (opt_trim::value) + while (end > begin && PUGI__IS_CHARTYPE(end[-1], ct_space)) + --end; + + *end = 0; + + return s; + } + else ++s; + } + } + }; + + PUGI__FN strconv_pcdata_t get_strconv_pcdata(unsigned int optmask) + { + PUGI__STATIC_ASSERT(parse_escapes == 0x10 && parse_eol == 0x20 && parse_trim_pcdata == 0x0800); + + switch (((optmask >> 4) & 3) | ((optmask >> 9) & 4)) // get bitmask for flags (eol escapes trim) + { + case 0: return strconv_pcdata_impl::parse; + case 1: return strconv_pcdata_impl::parse; + case 2: return strconv_pcdata_impl::parse; + case 3: return strconv_pcdata_impl::parse; + case 4: return strconv_pcdata_impl::parse; + case 5: return strconv_pcdata_impl::parse; + case 6: return strconv_pcdata_impl::parse; + case 7: return strconv_pcdata_impl::parse; + default: assert(false); return 0; // should not get here + } + } + + typedef char_t* (*strconv_attribute_t)(char_t*, char_t); + + template struct strconv_attribute_impl + { + static char_t* parse_wnorm(char_t* s, char_t end_quote) + { + gap g; + + // trim leading whitespaces + if (PUGI__IS_CHARTYPE(*s, ct_space)) + { + char_t* str = s; + + do ++str; + while (PUGI__IS_CHARTYPE(*str, ct_space)); + + g.push(s, str - s); + } + + while (true) + { + PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPE(ss, ct_parse_attr_ws | ct_space)); + + if (*s == end_quote) + { + char_t* str = g.flush(s); + + do *str-- = 0; + while (PUGI__IS_CHARTYPE(*str, ct_space)); + + return s + 1; + } + else if (PUGI__IS_CHARTYPE(*s, ct_space)) + { + *s++ = ' '; + + if (PUGI__IS_CHARTYPE(*s, ct_space)) + { + char_t* str = s + 1; + while (PUGI__IS_CHARTYPE(*str, ct_space)) ++str; + + g.push(s, str - s); + } + } + else if (opt_escape::value && *s == '&') + { + s = strconv_escape(s, g); + } + 
else if (!*s) + { + return 0; + } + else ++s; + } + } + + static char_t* parse_wconv(char_t* s, char_t end_quote) + { + gap g; + + while (true) + { + PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPE(ss, ct_parse_attr_ws)); + + if (*s == end_quote) + { + *g.flush(s) = 0; + + return s + 1; + } + else if (PUGI__IS_CHARTYPE(*s, ct_space)) + { + if (*s == '\r') + { + *s++ = ' '; + + if (*s == '\n') g.push(s, 1); + } + else *s++ = ' '; + } + else if (opt_escape::value && *s == '&') + { + s = strconv_escape(s, g); + } + else if (!*s) + { + return 0; + } + else ++s; + } + } + + static char_t* parse_eol(char_t* s, char_t end_quote) + { + gap g; + + while (true) + { + PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPE(ss, ct_parse_attr)); + + if (*s == end_quote) + { + *g.flush(s) = 0; + + return s + 1; + } + else if (*s == '\r') + { + *s++ = '\n'; + + if (*s == '\n') g.push(s, 1); + } + else if (opt_escape::value && *s == '&') + { + s = strconv_escape(s, g); + } + else if (!*s) + { + return 0; + } + else ++s; + } + } + + static char_t* parse_simple(char_t* s, char_t end_quote) + { + gap g; + + while (true) + { + PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPE(ss, ct_parse_attr)); + + if (*s == end_quote) + { + *g.flush(s) = 0; + + return s + 1; + } + else if (opt_escape::value && *s == '&') + { + s = strconv_escape(s, g); + } + else if (!*s) + { + return 0; + } + else ++s; + } + } + }; + + PUGI__FN strconv_attribute_t get_strconv_attribute(unsigned int optmask) + { + PUGI__STATIC_ASSERT(parse_escapes == 0x10 && parse_eol == 0x20 && parse_wconv_attribute == 0x40 && parse_wnorm_attribute == 0x80); + + switch ((optmask >> 4) & 15) // get bitmask for flags (wconv wnorm eol escapes) + { + case 0: return strconv_attribute_impl::parse_simple; + case 1: return strconv_attribute_impl::parse_simple; + case 2: return strconv_attribute_impl::parse_eol; + case 3: return strconv_attribute_impl::parse_eol; + case 4: return strconv_attribute_impl::parse_wconv; + case 5: return 
strconv_attribute_impl::parse_wconv; + case 6: return strconv_attribute_impl::parse_wconv; + case 7: return strconv_attribute_impl::parse_wconv; + case 8: return strconv_attribute_impl::parse_wnorm; + case 9: return strconv_attribute_impl::parse_wnorm; + case 10: return strconv_attribute_impl::parse_wnorm; + case 11: return strconv_attribute_impl::parse_wnorm; + case 12: return strconv_attribute_impl::parse_wnorm; + case 13: return strconv_attribute_impl::parse_wnorm; + case 14: return strconv_attribute_impl::parse_wnorm; + case 15: return strconv_attribute_impl::parse_wnorm; + default: assert(false); return 0; // should not get here + } + } + + inline xml_parse_result make_parse_result(xml_parse_status status, ptrdiff_t offset = 0) + { + xml_parse_result result; + result.status = status; + result.offset = offset; + + return result; + } + + struct xml_parser + { + xml_allocator alloc; + char_t* error_offset; + xml_parse_status error_status; + + xml_parser(const xml_allocator& alloc_): alloc(alloc_), error_offset(0), error_status(status_ok) + { + } + + // DOCTYPE consists of nested sections of the following possible types: + // , , "...", '...' + // + // + // First group can not contain nested groups + // Second group can contain nested groups of the same type + // Third group can contain all other groups + char_t* parse_doctype_primitive(char_t* s) + { + if (*s == '"' || *s == '\'') + { + // quoted string + char_t ch = *s++; + PUGI__SCANFOR(*s == ch); + if (!*s) PUGI__THROW_ERROR(status_bad_doctype, s); + + s++; + } + else if (s[0] == '<' && s[1] == '?') + { + // + s += 2; + PUGI__SCANFOR(s[0] == '?' && s[1] == '>'); // no need for ENDSWITH because ?> can't terminate proper doctype + if (!*s) PUGI__THROW_ERROR(status_bad_doctype, s); + + s += 2; + } + else if (s[0] == '<' && s[1] == '!' 
&& s[2] == '-' && s[3] == '-') + { + s += 4; + PUGI__SCANFOR(s[0] == '-' && s[1] == '-' && s[2] == '>'); // no need for ENDSWITH because --> can't terminate proper doctype + if (!*s) PUGI__THROW_ERROR(status_bad_doctype, s); + + // step over the three characters of '-->' only: a fourth step would swallow the character after the comment, or move past the buffer's null terminator when the comment is the last thing in it + s += 3; + } + else PUGI__THROW_ERROR(status_bad_doctype, s); + + return s; + } + + char_t* parse_doctype_ignore(char_t* s) + { + size_t depth = 0; + + assert(s[0] == '<' && s[1] == '!' && s[2] == '['); + s += 3; + + while (*s) + { + if (s[0] == '<' && s[1] == '!' && s[2] == '[') + { + // nested ignore section + s += 3; + depth++; + } + else if (s[0] == ']' && s[1] == ']' && s[2] == '>') + { + // ignore section end + s += 3; + + if (depth == 0) + return s; + + depth--; + } + else s++; + } + + PUGI__THROW_ERROR(status_bad_doctype, s); + } + + char_t* parse_doctype_group(char_t* s, char_t endch) + { + size_t depth = 0; + + assert((s[0] == '<' || s[0] == 0) && s[1] == '!'); + s += 2; + + while (*s) + { + if (s[0] == '<' && s[1] == '!' && s[2] != '-') + { + if (s[2] == '[') + { + // ignore + s = parse_doctype_ignore(s); + if (!s) return s; + } + else + { + // some control group + s += 2; + depth++; + } + } + else if (s[0] == '<' || s[0] == '"' || s[0] == '\'') + { + // unknown tag (forbidden), or some primitive group + s = parse_doctype_primitive(s); + if (!s) return s; + } + else if (*s == '>') + { + if (depth == 0) + return s; + + depth--; + s++; + } + else s++; + } + + if (depth != 0 || endch != '>') PUGI__THROW_ERROR(status_bad_doctype, s); + + return s; + } + + char_t* parse_exclamation(char_t* s, xml_node_struct* cursor, unsigned int optmsk, char_t endch) + { + // parse node contents, starting with exclamation mark + ++s; + + if (*s == '-') // 'value = s; // Save the offset. + } + + if (PUGI__OPTSET(parse_eol) && PUGI__OPTSET(parse_comments)) + { + s = strconv_comment(s, endch); + + if (!s) PUGI__THROW_ERROR(status_bad_comment, cursor->value); + } + else + { + // Scan for terminating '-->'. 
+ PUGI__SCANFOR(s[0] == '-' && s[1] == '-' && PUGI__ENDSWITH(s[2], '>')); + PUGI__CHECK_ERROR(status_bad_comment, s); + + if (PUGI__OPTSET(parse_comments)) + *s = 0; // Zero-terminate this segment at the first terminating '-'. + + s += (s[2] == '>' ? 3 : 2); // Step over the '\0->'. + } + } + else PUGI__THROW_ERROR(status_bad_comment, s); + } + else if (*s == '[') + { + // 'value = s; // Save the offset. + + if (PUGI__OPTSET(parse_eol)) + { + s = strconv_cdata(s, endch); + + if (!s) PUGI__THROW_ERROR(status_bad_cdata, cursor->value); + } + else + { + // Scan for terminating ']]>'. + PUGI__SCANFOR(s[0] == ']' && s[1] == ']' && PUGI__ENDSWITH(s[2], '>')); + PUGI__CHECK_ERROR(status_bad_cdata, s); + + *s++ = 0; // Zero-terminate this segment. + } + } + else // Flagged for discard, but we still have to scan for the terminator. + { + // Scan for terminating ']]>'. + PUGI__SCANFOR(s[0] == ']' && s[1] == ']' && PUGI__ENDSWITH(s[2], '>')); + PUGI__CHECK_ERROR(status_bad_cdata, s); + + ++s; + } + + s += (s[1] == '>' ? 2 : 1); // Step over the last ']>'. 
+ } + else PUGI__THROW_ERROR(status_bad_cdata, s); + } + else if (s[0] == 'D' && s[1] == 'O' && s[2] == 'C' && s[3] == 'T' && s[4] == 'Y' && s[5] == 'P' && PUGI__ENDSWITH(s[6], 'E')) + { + s -= 2; + + if (cursor->parent) PUGI__THROW_ERROR(status_bad_doctype, s); + + char_t* mark = s + 9; + + s = parse_doctype_group(s, endch); + if (!s) return s; + + assert((*s == 0 && endch == '>') || *s == '>'); + if (*s) *s++ = 0; + + if (PUGI__OPTSET(parse_doctype)) + { + while (PUGI__IS_CHARTYPE(*mark, ct_space)) ++mark; + + PUGI__PUSHNODE(node_doctype); + + cursor->value = mark; + } + } + else if (*s == 0 && endch == '-') PUGI__THROW_ERROR(status_bad_comment, s); + else if (*s == 0 && endch == '[') PUGI__THROW_ERROR(status_bad_cdata, s); + else PUGI__THROW_ERROR(status_unrecognized_tag, s); + + return s; + } + + char_t* parse_question(char_t* s, xml_node_struct*& ref_cursor, unsigned int optmsk, char_t endch) + { + // load into registers + xml_node_struct* cursor = ref_cursor; + char_t ch = 0; + + // parse node contents, starting with question mark + ++s; + + // read PI target + char_t* target = s; + + if (!PUGI__IS_CHARTYPE(*s, ct_start_symbol)) PUGI__THROW_ERROR(status_bad_pi, s); + + PUGI__SCANWHILE(PUGI__IS_CHARTYPE(*s, ct_symbol)); + PUGI__CHECK_ERROR(status_bad_pi, s); + + // determine node type; stricmp / strcasecmp is not portable + bool declaration = (target[0] | ' ') == 'x' && (target[1] | ' ') == 'm' && (target[2] | ' ') == 'l' && target + 3 == s; + + if (declaration ? 
PUGI__OPTSET(parse_declaration) : PUGI__OPTSET(parse_pi)) + { + if (declaration) + { + // disallow non top-level declarations + if (cursor->parent) PUGI__THROW_ERROR(status_bad_pi, s); + + PUGI__PUSHNODE(node_declaration); + } + else + { + PUGI__PUSHNODE(node_pi); + } + + cursor->name = target; + + PUGI__ENDSEG(); + + // parse value/attributes + if (ch == '?') + { + // empty node + if (!PUGI__ENDSWITH(*s, '>')) PUGI__THROW_ERROR(status_bad_pi, s); + s += (*s == '>'); + + PUGI__POPNODE(); + } + else if (PUGI__IS_CHARTYPE(ch, ct_space)) + { + PUGI__SKIPWS(); + + // scan for tag end + char_t* value = s; + + PUGI__SCANFOR(s[0] == '?' && PUGI__ENDSWITH(s[1], '>')); + PUGI__CHECK_ERROR(status_bad_pi, s); + + if (declaration) + { + // replace ending ? with / so that 'element' terminates properly + *s = '/'; + + // we exit from this function with cursor at node_declaration, which is a signal to parse() to go to LOC_ATTRIBUTES + s = value; + } + else + { + // store value and step over > + cursor->value = value; + PUGI__POPNODE(); + + PUGI__ENDSEG(); + + s += (*s == '>'); + } + } + else PUGI__THROW_ERROR(status_bad_pi, s); + } + else + { + // scan for tag end + PUGI__SCANFOR(s[0] == '?' && PUGI__ENDSWITH(s[1], '>')); + PUGI__CHECK_ERROR(status_bad_pi, s); + + s += (s[1] == '>' ? 2 : 1); + } + + // store from registers + ref_cursor = cursor; + + return s; + } + + char_t* parse_tree(char_t* s, xml_node_struct* root, unsigned int optmsk, char_t endch) + { + strconv_attribute_t strconv_attribute = get_strconv_attribute(optmsk); + strconv_pcdata_t strconv_pcdata = get_strconv_pcdata(optmsk); + + char_t ch = 0; + xml_node_struct* cursor = root; + char_t* mark = s; + + while (*s != 0) + { + if (*s == '<') + { + ++s; + + LOC_TAG: + if (PUGI__IS_CHARTYPE(*s, ct_start_symbol)) // '<#...' + { + PUGI__PUSHNODE(node_element); // Append a new node to the tree. + + cursor->name = s; + + PUGI__SCANWHILE_UNROLL(PUGI__IS_CHARTYPE(ss, ct_symbol)); // Scan for a terminator. 
+ PUGI__ENDSEG(); // Save char in 'ch', terminate & step over. + + if (ch == '>') + { + // end of tag + } + else if (PUGI__IS_CHARTYPE(ch, ct_space)) + { + LOC_ATTRIBUTES: + while (true) + { + PUGI__SKIPWS(); // Eat any whitespace. + + if (PUGI__IS_CHARTYPE(*s, ct_start_symbol)) // <... #... + { + xml_attribute_struct* a = append_new_attribute(cursor, alloc); // Make space for this attribute. + if (!a) PUGI__THROW_ERROR(status_out_of_memory, s); + + a->name = s; // Save the offset. + + PUGI__SCANWHILE_UNROLL(PUGI__IS_CHARTYPE(ss, ct_symbol)); // Scan for a terminator. + PUGI__ENDSEG(); // Save char in 'ch', terminate & step over. + + if (PUGI__IS_CHARTYPE(ch, ct_space)) + { + PUGI__SKIPWS(); // Eat any whitespace. + + ch = *s; + ++s; + } + + if (ch == '=') // '<... #=...' + { + PUGI__SKIPWS(); // Eat any whitespace. + + if (*s == '"' || *s == '\'') // '<... #="...' + { + ch = *s; // Save quote char to avoid breaking on "''" -or- '""'. + ++s; // Step over the quote. + a->value = s; // Save the offset. + + s = strconv_attribute(s, ch); + + if (!s) PUGI__THROW_ERROR(status_bad_attribute, a->value); + + // After this line the loop continues from the start; + // Whitespaces, / and > are ok, symbols and EOF are wrong, + // everything else will be detected + if (PUGI__IS_CHARTYPE(*s, ct_start_symbol)) PUGI__THROW_ERROR(status_bad_attribute, s); + } + else PUGI__THROW_ERROR(status_bad_attribute, s); + } + else PUGI__THROW_ERROR(status_bad_attribute, s); + } + else if (*s == '/') + { + ++s; + + if (*s == '>') + { + PUGI__POPNODE(); + s++; + break; + } + else if (*s == 0 && endch == '>') + { + PUGI__POPNODE(); + break; + } + else PUGI__THROW_ERROR(status_bad_start_element, s); + } + else if (*s == '>') + { + ++s; + + break; + } + else if (*s == 0 && endch == '>') + { + break; + } + else PUGI__THROW_ERROR(status_bad_start_element, s); + } + + // !!! 
+ } + else if (ch == '/') // '<#.../' + { + if (!PUGI__ENDSWITH(*s, '>')) PUGI__THROW_ERROR(status_bad_start_element, s); + + PUGI__POPNODE(); // Pop. + + s += (*s == '>'); + } + else if (ch == 0) + { + // we stepped over null terminator, backtrack & handle closing tag + --s; + + if (endch != '>') PUGI__THROW_ERROR(status_bad_start_element, s); + } + else PUGI__THROW_ERROR(status_bad_start_element, s); + } + else if (*s == '/') + { + ++s; + + char_t* name = cursor->name; + if (!name) PUGI__THROW_ERROR(status_end_element_mismatch, s); + + while (PUGI__IS_CHARTYPE(*s, ct_symbol)) + { + if (*s++ != *name++) PUGI__THROW_ERROR(status_end_element_mismatch, s); + } + + if (*name) + { + if (*s == 0 && name[0] == endch && name[1] == 0) PUGI__THROW_ERROR(status_bad_end_element, s); + else PUGI__THROW_ERROR(status_end_element_mismatch, s); + } + + PUGI__POPNODE(); // Pop. + + PUGI__SKIPWS(); + + if (*s == 0) + { + if (endch != '>') PUGI__THROW_ERROR(status_bad_end_element, s); + } + else + { + if (*s != '>') PUGI__THROW_ERROR(status_bad_end_element, s); + ++s; + } + } + else if (*s == '?') // 'first_child) continue; + } + } + + if (!PUGI__OPTSET(parse_trim_pcdata)) + s = mark; + + if (cursor->parent || PUGI__OPTSET(parse_fragment)) + { + PUGI__PUSHNODE(node_pcdata); // Append a new node on the tree. + cursor->value = s; // Save the offset. + + s = strconv_pcdata(s); + + PUGI__POPNODE(); // Pop since this is a standalone. + + if (!*s) break; + } + else + { + PUGI__SCANFOR(*s == '<'); // '...<' + if (!*s) break; + + ++s; + } + + // We're after '<' + goto LOC_TAG; + } + } + + // check that last tag is closed + if (cursor != root) PUGI__THROW_ERROR(status_end_element_mismatch, s); + + return s; + } + + #ifdef PUGIXML_WCHAR_MODE + static char_t* parse_skip_bom(char_t* s) + { + unsigned int bom = 0xfeff; + return (s[0] == static_cast(bom)) ? s + 1 : s; + } + #else + static char_t* parse_skip_bom(char_t* s) + { + return (s[0] == '\xef' && s[1] == '\xbb' && s[2] == '\xbf') ? 
s + 3 : s; + } + #endif + + static bool has_element_node_siblings(xml_node_struct* node) + { + while (node) + { + if (PUGI__NODETYPE(node) == node_element) return true; + + node = node->next_sibling; + } + + return false; + } + + static xml_parse_result parse(char_t* buffer, size_t length, xml_document_struct* xmldoc, xml_node_struct* root, unsigned int optmsk) + { + // allocator object is a part of document object + xml_allocator& alloc_ = *static_cast(xmldoc); + + // early-out for empty documents + if (length == 0) + return make_parse_result(PUGI__OPTSET(parse_fragment) ? status_ok : status_no_document_element); + + // get last child of the root before parsing + xml_node_struct* last_root_child = root->first_child ? root->first_child->prev_sibling_c : 0; + + // create parser on stack + xml_parser parser(alloc_); + + // save last character and make buffer zero-terminated (speeds up parsing) + char_t endch = buffer[length - 1]; + buffer[length - 1] = 0; + + // skip BOM to make sure it does not end up as part of parse output + char_t* buffer_data = parse_skip_bom(buffer); + + // perform actual parsing + parser.parse_tree(buffer_data, root, optmsk, endch); + + // update allocator state + alloc_ = parser.alloc; + + xml_parse_result result = make_parse_result(parser.error_status, parser.error_offset ? parser.error_offset - buffer : 0); + assert(result.offset >= 0 && static_cast(result.offset) <= length); + + if (result) + { + // since we removed last character, we have to handle the only possible false positive (stray <) + if (endch == '<') + return make_parse_result(status_unrecognized_tag, length - 1); + + // check if there are any element nodes parsed + xml_node_struct* first_root_child_parsed = last_root_child ? 
last_root_child->next_sibling : root->first_child; + + if (!PUGI__OPTSET(parse_fragment) && !has_element_node_siblings(first_root_child_parsed)) + return make_parse_result(status_no_document_element, length - 1); + } + else + { + // roll back offset if it occurs on a null terminator in the source buffer + if (result.offset > 0 && static_cast(result.offset) == length - 1 && endch == 0) + result.offset--; + } + + return result; + } + }; + + // Output facilities + PUGI__FN xml_encoding get_write_native_encoding() + { + #ifdef PUGIXML_WCHAR_MODE + return get_wchar_encoding(); + #else + return encoding_utf8; + #endif + } + + PUGI__FN xml_encoding get_write_encoding(xml_encoding encoding) + { + // replace wchar encoding with utf implementation + if (encoding == encoding_wchar) return get_wchar_encoding(); + + // replace utf16 encoding with utf16 with specific endianness + if (encoding == encoding_utf16) return is_little_endian() ? encoding_utf16_le : encoding_utf16_be; + + // replace utf32 encoding with utf32 with specific endianness + if (encoding == encoding_utf32) return is_little_endian() ? encoding_utf32_le : encoding_utf32_be; + + // only do autodetection if no explicit encoding is requested + if (encoding != encoding_auto) return encoding; + + // assume utf8 encoding + return encoding_utf8; + } + +#ifdef PUGIXML_WCHAR_MODE + PUGI__FN size_t get_valid_length(const char_t* data, size_t length) + { + if (length < 1) return 0; + + // discard last character if it's the lead of a surrogate pair + return (sizeof(wchar_t) == 2 && static_cast(static_cast(data[length - 1]) - 0xD800) < 0x400) ? 
length - 1 : length; + } + + PUGI__FN size_t convert_buffer_output(char_t* r_char, uint8_t* r_u8, uint16_t* r_u16, uint32_t* r_u32, const char_t* data, size_t length, xml_encoding encoding) + { + // only endian-swapping is required + if (need_endian_swap_utf(encoding, get_wchar_encoding())) + { + convert_wchar_endian_swap(r_char, data, length); + + return length * sizeof(char_t); + } + + // convert to utf8 + if (encoding == encoding_utf8) + { + uint8_t* dest = r_u8; + uint8_t* end = utf_decoder::decode_wchar_block(data, length, dest); + + return static_cast(end - dest); + } + + // convert to utf16 + if (encoding == encoding_utf16_be || encoding == encoding_utf16_le) + { + uint16_t* dest = r_u16; + + // convert to native utf16 + uint16_t* end = utf_decoder::decode_wchar_block(data, length, dest); + + // swap if necessary + xml_encoding native_encoding = is_little_endian() ? encoding_utf16_le : encoding_utf16_be; + + if (native_encoding != encoding) convert_utf_endian_swap(dest, dest, static_cast(end - dest)); + + return static_cast(end - dest) * sizeof(uint16_t); + } + + // convert to utf32 + if (encoding == encoding_utf32_be || encoding == encoding_utf32_le) + { + uint32_t* dest = r_u32; + + // convert to native utf32 + uint32_t* end = utf_decoder::decode_wchar_block(data, length, dest); + + // swap if necessary + xml_encoding native_encoding = is_little_endian() ? 
encoding_utf32_le : encoding_utf32_be; + + if (native_encoding != encoding) convert_utf_endian_swap(dest, dest, static_cast(end - dest)); + + return static_cast(end - dest) * sizeof(uint32_t); + } + + // convert to latin1 + if (encoding == encoding_latin1) + { + uint8_t* dest = r_u8; + uint8_t* end = utf_decoder::decode_wchar_block(data, length, dest); + + return static_cast(end - dest); + } + + assert(!"Invalid encoding"); + return 0; + } +#else + PUGI__FN size_t get_valid_length(const char_t* data, size_t length) + { + if (length < 5) return 0; + + for (size_t i = 1; i <= 4; ++i) + { + uint8_t ch = static_cast(data[length - i]); + + // either a standalone character or a leading one + if ((ch & 0xc0) != 0x80) return length - i; + } + + // there are four non-leading characters at the end, sequence tail is broken so might as well process the whole chunk + return length; + } + + PUGI__FN size_t convert_buffer_output(char_t* /* r_char */, uint8_t* r_u8, uint16_t* r_u16, uint32_t* r_u32, const char_t* data, size_t length, xml_encoding encoding) + { + if (encoding == encoding_utf16_be || encoding == encoding_utf16_le) + { + uint16_t* dest = r_u16; + + // convert to native utf16 + uint16_t* end = utf_decoder::decode_utf8_block(reinterpret_cast(data), length, dest); + + // swap if necessary + xml_encoding native_encoding = is_little_endian() ? encoding_utf16_le : encoding_utf16_be; + + if (native_encoding != encoding) convert_utf_endian_swap(dest, dest, static_cast(end - dest)); + + return static_cast(end - dest) * sizeof(uint16_t); + } + + if (encoding == encoding_utf32_be || encoding == encoding_utf32_le) + { + uint32_t* dest = r_u32; + + // convert to native utf32 + uint32_t* end = utf_decoder::decode_utf8_block(reinterpret_cast(data), length, dest); + + // swap if necessary + xml_encoding native_encoding = is_little_endian() ? 
encoding_utf32_le : encoding_utf32_be; + + if (native_encoding != encoding) convert_utf_endian_swap(dest, dest, static_cast(end - dest)); + + return static_cast(end - dest) * sizeof(uint32_t); + } + + if (encoding == encoding_latin1) + { + uint8_t* dest = r_u8; + uint8_t* end = utf_decoder::decode_utf8_block(reinterpret_cast(data), length, dest); + + return static_cast(end - dest); + } + + assert(!"Invalid encoding"); + return 0; + } +#endif + + class xml_buffered_writer + { + xml_buffered_writer(const xml_buffered_writer&); + xml_buffered_writer& operator=(const xml_buffered_writer&); + + public: + xml_buffered_writer(xml_writer& writer_, xml_encoding user_encoding): writer(writer_), bufsize(0), encoding(get_write_encoding(user_encoding)) + { + PUGI__STATIC_ASSERT(bufcapacity >= 8); + } + + ~xml_buffered_writer() + { + flush(); + } + + size_t flush() + { + flush(buffer, bufsize); + bufsize = 0; + return 0; + } + + void flush(const char_t* data, size_t size) + { + if (size == 0) return; + + // fast path, just write data + if (encoding == get_write_native_encoding()) + writer.write(data, size * sizeof(char_t)); + else + { + // convert chunk + size_t result = convert_buffer_output(scratch.data_char, scratch.data_u8, scratch.data_u16, scratch.data_u32, data, size, encoding); + assert(result <= sizeof(scratch)); + + // write data + writer.write(scratch.data_u8, result); + } + } + + void write_direct(const char_t* data, size_t length) + { + // flush the remaining buffer contents + flush(); + + // handle large chunks + if (length > bufcapacity) + { + if (encoding == get_write_native_encoding()) + { + // fast path, can just write data chunk + writer.write(data, length * sizeof(char_t)); + return; + } + + // need to convert in suitable chunks + while (length > bufcapacity) + { + // get chunk size by selecting such number of characters that are guaranteed to fit into scratch buffer + // and form a complete codepoint sequence (i.e. 
discard start of last codepoint if necessary) + size_t chunk_size = get_valid_length(data, bufcapacity); + assert(chunk_size); + + // convert chunk and write + flush(data, chunk_size); + + // iterate + data += chunk_size; + length -= chunk_size; + } + + // small tail is copied below + bufsize = 0; + } + + memcpy(buffer + bufsize, data, length * sizeof(char_t)); + bufsize += length; + } + + void write_buffer(const char_t* data, size_t length) + { + size_t offset = bufsize; + + if (offset + length <= bufcapacity) + { + memcpy(buffer + offset, data, length * sizeof(char_t)); + bufsize = offset + length; + } + else + { + write_direct(data, length); + } + } + + void write_string(const char_t* data) + { + // write the part of the string that fits in the buffer + size_t offset = bufsize; + + while (*data && offset < bufcapacity) + buffer[offset++] = *data++; + + // write the rest + if (offset < bufcapacity) + { + bufsize = offset; + } + else + { + // backtrack a bit if we have split the codepoint + size_t length = offset - bufsize; + size_t extra = length - get_valid_length(data - length, length); + + bufsize = offset - extra; + + write_direct(data - extra, strlength(data) + extra); + } + } + + void write(char_t d0) + { + size_t offset = bufsize; + if (offset > bufcapacity - 1) offset = flush(); + + buffer[offset + 0] = d0; + bufsize = offset + 1; + } + + void write(char_t d0, char_t d1) + { + size_t offset = bufsize; + if (offset > bufcapacity - 2) offset = flush(); + + buffer[offset + 0] = d0; + buffer[offset + 1] = d1; + bufsize = offset + 2; + } + + void write(char_t d0, char_t d1, char_t d2) + { + size_t offset = bufsize; + if (offset > bufcapacity - 3) offset = flush(); + + buffer[offset + 0] = d0; + buffer[offset + 1] = d1; + buffer[offset + 2] = d2; + bufsize = offset + 3; + } + + void write(char_t d0, char_t d1, char_t d2, char_t d3) + { + size_t offset = bufsize; + if (offset > bufcapacity - 4) offset = flush(); + + buffer[offset + 0] = d0; + buffer[offset + 1] 
= d1; + buffer[offset + 2] = d2; + buffer[offset + 3] = d3; + bufsize = offset + 4; + } + + void write(char_t d0, char_t d1, char_t d2, char_t d3, char_t d4) + { + size_t offset = bufsize; + if (offset > bufcapacity - 5) offset = flush(); + + buffer[offset + 0] = d0; + buffer[offset + 1] = d1; + buffer[offset + 2] = d2; + buffer[offset + 3] = d3; + buffer[offset + 4] = d4; + bufsize = offset + 5; + } + + void write(char_t d0, char_t d1, char_t d2, char_t d3, char_t d4, char_t d5) + { + size_t offset = bufsize; + if (offset > bufcapacity - 6) offset = flush(); + + buffer[offset + 0] = d0; + buffer[offset + 1] = d1; + buffer[offset + 2] = d2; + buffer[offset + 3] = d3; + buffer[offset + 4] = d4; + buffer[offset + 5] = d5; + bufsize = offset + 6; + } + + // utf8 maximum expansion: x4 (-> utf32) + // utf16 maximum expansion: x2 (-> utf32) + // utf32 maximum expansion: x1 + enum + { + bufcapacitybytes = + #ifdef PUGIXML_MEMORY_OUTPUT_STACK + PUGIXML_MEMORY_OUTPUT_STACK + #else + 10240 + #endif + , + bufcapacity = bufcapacitybytes / (sizeof(char_t) + 4) + }; + + char_t buffer[bufcapacity]; + + union + { + uint8_t data_u8[4 * bufcapacity]; + uint16_t data_u16[2 * bufcapacity]; + uint32_t data_u32[bufcapacity]; + char_t data_char[bufcapacity]; + } scratch; + + xml_writer& writer; + size_t bufsize; + xml_encoding encoding; + }; + + PUGI__FN void text_output_escaped(xml_buffered_writer& writer, const char_t* s, chartypex_t type) + { + while (*s) + { + const char_t* prev = s; + + // While *s is a usual symbol + PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPEX(ss, type)); + + writer.write_buffer(prev, static_cast(s - prev)); + + switch (*s) + { + case 0: break; + case '&': + writer.write('&', 'a', 'm', 'p', ';'); + ++s; + break; + case '<': + writer.write('&', 'l', 't', ';'); + ++s; + break; + case '>': + writer.write('&', 'g', 't', ';'); + ++s; + break; + case '"': + writer.write('&', 'q', 'u', 'o', 't', ';'); + ++s; + break; + default: // s is not a usual symbol + { + unsigned int 
ch = static_cast(*s++); + assert(ch < 32); + + writer.write('&', '#', static_cast((ch / 10) + '0'), static_cast((ch % 10) + '0'), ';'); + } + } + } + } + + PUGI__FN void text_output(xml_buffered_writer& writer, const char_t* s, chartypex_t type, unsigned int flags) + { + if (flags & format_no_escapes) + writer.write_string(s); + else + text_output_escaped(writer, s, type); + } + + PUGI__FN void text_output_cdata(xml_buffered_writer& writer, const char_t* s) + { + do + { + writer.write('<', '!', '[', 'C', 'D'); + writer.write('A', 'T', 'A', '['); + + const char_t* prev = s; + + // look for ]]> sequence - we can't output it as is since it terminates CDATA + while (*s && !(s[0] == ']' && s[1] == ']' && s[2] == '>')) ++s; + + // skip ]] if we stopped at ]]>, > will go to the next CDATA section + if (*s) s += 2; + + writer.write_buffer(prev, static_cast(s - prev)); + + writer.write(']', ']', '>'); + } + while (*s); + } + + PUGI__FN void text_output_indent(xml_buffered_writer& writer, const char_t* indent, size_t indent_length, unsigned int depth) + { + switch (indent_length) + { + case 1: + { + for (unsigned int i = 0; i < depth; ++i) + writer.write(indent[0]); + break; + } + + case 2: + { + for (unsigned int i = 0; i < depth; ++i) + writer.write(indent[0], indent[1]); + break; + } + + case 3: + { + for (unsigned int i = 0; i < depth; ++i) + writer.write(indent[0], indent[1], indent[2]); + break; + } + + case 4: + { + for (unsigned int i = 0; i < depth; ++i) + writer.write(indent[0], indent[1], indent[2], indent[3]); + break; + } + + default: + { + for (unsigned int i = 0; i < depth; ++i) + writer.write_buffer(indent, indent_length); + } + } + } + + PUGI__FN void node_output_comment(xml_buffered_writer& writer, const char_t* s) + { + writer.write('<', '!', '-', '-'); + + while (*s) + { + const char_t* prev = s; + + // look for -\0 or -- sequence - we can't output it since -- is illegal in comment body + while (*s && !(s[0] == '-' && (s[1] == '-' || s[1] == 0))) ++s; + + 
writer.write_buffer(prev, static_cast(s - prev)); + + if (*s) + { + assert(*s == '-'); + + writer.write('-', ' '); + ++s; + } + } + + writer.write('-', '-', '>'); + } + + PUGI__FN void node_output_attributes(xml_buffered_writer& writer, xml_node_struct* node, unsigned int flags) + { + const char_t* default_name = PUGIXML_TEXT(":anonymous"); + + for (xml_attribute_struct* a = node->first_attribute; a; a = a->next_attribute) + { + writer.write(' '); + writer.write_string(a->name ? a->name : default_name); + writer.write('=', '"'); + + if (a->value) + text_output(writer, a->value, ctx_special_attr, flags); + + writer.write('"'); + } + } + + PUGI__FN bool node_output_start(xml_buffered_writer& writer, xml_node_struct* node, unsigned int flags) + { + const char_t* default_name = PUGIXML_TEXT(":anonymous"); + const char_t* name = node->name ? node->name : default_name; + + writer.write('<'); + writer.write_string(name); + + if (node->first_attribute) + node_output_attributes(writer, node, flags); + + if (flags & format_raw) + { + if (!node->first_child) + writer.write(' ', '/', '>'); + else + { + writer.write('>'); + + return true; + } + } + else + { + xml_node_struct* first = node->first_child; + + if (!first) + writer.write(' ', '/', '>', '\n'); + else if (!first->next_sibling && (PUGI__NODETYPE(first) == node_pcdata || PUGI__NODETYPE(first) == node_cdata)) + { + writer.write('>'); + + const char_t* value = first->value ? first->value : PUGIXML_TEXT(""); + + if (PUGI__NODETYPE(first) == node_pcdata) + text_output(writer, value, ctx_special_pcdata, flags); + else + text_output_cdata(writer, value); + + writer.write('<', '/'); + writer.write_string(name); + writer.write('>', '\n'); + } + else + { + writer.write('>', '\n'); + + return true; + } + } + + return false; + } + + PUGI__FN void node_output_end(xml_buffered_writer& writer, xml_node_struct* node, unsigned int flags) + { + const char_t* default_name = PUGIXML_TEXT(":anonymous"); + const char_t* name = node->name ? 
node->name : default_name; + + writer.write('<', '/'); + writer.write_string(name); + + if (flags & format_raw) + writer.write('>'); + else + writer.write('>', '\n'); + } + + PUGI__FN void node_output_simple(xml_buffered_writer& writer, xml_node_struct* node, unsigned int flags) + { + const char_t* default_name = PUGIXML_TEXT(":anonymous"); + + switch (PUGI__NODETYPE(node)) + { + case node_pcdata: + text_output(writer, node->value ? node->value : PUGIXML_TEXT(""), ctx_special_pcdata, flags); + if ((flags & format_raw) == 0) writer.write('\n'); + break; + + case node_cdata: + text_output_cdata(writer, node->value ? node->value : PUGIXML_TEXT("")); + if ((flags & format_raw) == 0) writer.write('\n'); + break; + + case node_comment: + node_output_comment(writer, node->value ? node->value : PUGIXML_TEXT("")); + if ((flags & format_raw) == 0) writer.write('\n'); + break; + + case node_pi: + writer.write('<', '?'); + writer.write_string(node->name ? node->name : default_name); + + if (node->value) + { + writer.write(' '); + writer.write_string(node->value); + } + + writer.write('?', '>'); + if ((flags & format_raw) == 0) writer.write('\n'); + break; + + case node_declaration: + writer.write('<', '?'); + writer.write_string(node->name ? node->name : default_name); + node_output_attributes(writer, node, flags); + writer.write('?', '>'); + if ((flags & format_raw) == 0) writer.write('\n'); + break; + + case node_doctype: + writer.write('<', '!', 'D', 'O', 'C'); + writer.write('T', 'Y', 'P', 'E'); + + if (node->value) + { + writer.write(' '); + writer.write_string(node->value); + } + + writer.write('>'); + if ((flags & format_raw) == 0) writer.write('\n'); + break; + + default: + assert(!"Invalid node type"); + } + } + + PUGI__FN void node_output(xml_buffered_writer& writer, xml_node_struct* root, const char_t* indent, unsigned int flags, unsigned int depth) + { + size_t indent_length = ((flags & (format_indent | format_raw)) == format_indent) ? 
strlength(indent) : 0; + + xml_node_struct* node = root; + + do + { + assert(node); + + // begin writing current node + if (indent_length) + text_output_indent(writer, indent, indent_length, depth); + + if (PUGI__NODETYPE(node) == node_element) + { + if (node_output_start(writer, node, flags)) + { + node = node->first_child; + depth++; + continue; + } + } + else if (PUGI__NODETYPE(node) == node_document) + { + if (node->first_child) + { + node = node->first_child; + continue; + } + } + else + { + node_output_simple(writer, node, flags); + } + + // continue to the next node + while (node != root) + { + if (node->next_sibling) + { + node = node->next_sibling; + break; + } + + node = node->parent; + + // write closing node + if (PUGI__NODETYPE(node) == node_element) + { + depth--; + + if (indent_length) + text_output_indent(writer, indent, indent_length, depth); + + node_output_end(writer, node, flags); + } + } + } + while (node != root); + } + + PUGI__FN bool has_declaration(xml_node_struct* node) + { + for (xml_node_struct* child = node->first_child; child; child = child->next_sibling) + { + xml_node_type type = PUGI__NODETYPE(child); + + if (type == node_declaration) return true; + if (type == node_element) return false; + } + + return false; + } + + PUGI__FN bool is_attribute_of(xml_attribute_struct* attr, xml_node_struct* node) + { + for (xml_attribute_struct* a = node->first_attribute; a; a = a->next_attribute) + if (a == attr) + return true; + + return false; + } + + PUGI__FN bool allow_insert_attribute(xml_node_type parent) + { + return parent == node_element || parent == node_declaration; + } + + PUGI__FN bool allow_insert_child(xml_node_type parent, xml_node_type child) + { + if (parent != node_document && parent != node_element) return false; + if (child == node_document || child == node_null) return false; + if (parent != node_document && (child == node_declaration || child == node_doctype)) return false; + + return true; + } + + PUGI__FN bool 
allow_move(xml_node parent, xml_node child) + { + // check that child can be a child of parent + if (!allow_insert_child(parent.type(), child.type())) + return false; + + // check that node is not moved between documents + if (parent.root() != child.root()) + return false; + + // check that new parent is not in the child subtree + xml_node cur = parent; + + while (cur) + { + if (cur == child) + return false; + + cur = cur.parent(); + } + + return true; + } + + PUGI__FN void node_copy_string(char_t*& dest, uintptr_t& header, uintptr_t header_mask, char_t* source, uintptr_t& source_header, xml_allocator* alloc) + { + assert(!dest && (header & header_mask) == 0); + + if (source) + { + if (alloc && (source_header & header_mask) == 0) + { + dest = source; + + // since strcpy_insitu can reuse document buffer memory we need to mark both source and dest as shared + header |= xml_memory_page_contents_shared_mask; + source_header |= xml_memory_page_contents_shared_mask; + } + else + strcpy_insitu(dest, header, header_mask, source); + } + } + + PUGI__FN void node_copy_contents(xml_node_struct* dn, xml_node_struct* sn, xml_allocator* shared_alloc) + { + node_copy_string(dn->name, dn->header, xml_memory_page_name_allocated_mask, sn->name, sn->header, shared_alloc); + node_copy_string(dn->value, dn->header, xml_memory_page_value_allocated_mask, sn->value, sn->header, shared_alloc); + + for (xml_attribute_struct* sa = sn->first_attribute; sa; sa = sa->next_attribute) + { + xml_attribute_struct* da = append_new_attribute(dn, get_allocator(dn)); + + if (da) + { + node_copy_string(da->name, da->header, xml_memory_page_name_allocated_mask, sa->name, sa->header, shared_alloc); + node_copy_string(da->value, da->header, xml_memory_page_value_allocated_mask, sa->value, sa->header, shared_alloc); + } + } + } + + PUGI__FN void node_copy_tree(xml_node_struct* dn, xml_node_struct* sn) + { + xml_allocator& alloc = get_allocator(dn); + xml_allocator* shared_alloc = (&alloc == 
&get_allocator(sn)) ? &alloc : 0; + + node_copy_contents(dn, sn, shared_alloc); + + xml_node_struct* dit = dn; + xml_node_struct* sit = sn->first_child; + + while (sit && sit != sn) + { + if (sit != dn) + { + xml_node_struct* copy = append_new_node(dit, alloc, PUGI__NODETYPE(sit)); + + if (copy) + { + node_copy_contents(copy, sit, shared_alloc); + + if (sit->first_child) + { + dit = copy; + sit = sit->first_child; + continue; + } + } + } + + // continue to the next node + do + { + if (sit->next_sibling) + { + sit = sit->next_sibling; + break; + } + + sit = sit->parent; + dit = dit->parent; + } + while (sit != sn); + } + } + + inline bool is_text_node(xml_node_struct* node) + { + xml_node_type type = PUGI__NODETYPE(node); + + return type == node_pcdata || type == node_cdata; + } + + // get value with conversion functions + PUGI__FN int get_integer_base(const char_t* value) + { + const char_t* s = value; + + while (PUGI__IS_CHARTYPE(*s, ct_space)) + s++; + + if (*s == '-') + s++; + + return (s[0] == '0' && (s[1] == 'x' || s[1] == 'X')) ? 
16 : 10; + } + + PUGI__FN int get_value_int(const char_t* value, int def) + { + if (!value) return def; + + int base = get_integer_base(value); + + #ifdef PUGIXML_WCHAR_MODE + return static_cast(wcstol(value, 0, base)); + #else + return static_cast(strtol(value, 0, base)); + #endif + } + + PUGI__FN unsigned int get_value_uint(const char_t* value, unsigned int def) + { + if (!value) return def; + + int base = get_integer_base(value); + + #ifdef PUGIXML_WCHAR_MODE + return static_cast(wcstoul(value, 0, base)); + #else + return static_cast(strtoul(value, 0, base)); + #endif + } + + PUGI__FN double get_value_double(const char_t* value, double def) + { + if (!value) return def; + + #ifdef PUGIXML_WCHAR_MODE + return wcstod(value, 0); + #else + return strtod(value, 0); + #endif + } + + PUGI__FN float get_value_float(const char_t* value, float def) + { + if (!value) return def; + + #ifdef PUGIXML_WCHAR_MODE + return static_cast(wcstod(value, 0)); + #else + return static_cast(strtod(value, 0)); + #endif + } + + PUGI__FN bool get_value_bool(const char_t* value, bool def) + { + if (!value) return def; + + // only look at first char + char_t first = *value; + + // 1*, t* (true), T* (True), y* (yes), Y* (YES) + return (first == '1' || first == 't' || first == 'T' || first == 'y' || first == 'Y'); + } + +#ifdef PUGIXML_HAS_LONG_LONG + PUGI__FN long long get_value_llong(const char_t* value, long long def) + { + if (!value) return def; + + int base = get_integer_base(value); + + #ifdef PUGIXML_WCHAR_MODE + #ifdef PUGI__MSVC_CRT_VERSION + return _wcstoi64(value, 0, base); + #else + return wcstoll(value, 0, base); + #endif + #else + #ifdef PUGI__MSVC_CRT_VERSION + return _strtoi64(value, 0, base); + #else + return strtoll(value, 0, base); + #endif + #endif + } + + PUGI__FN unsigned long long get_value_ullong(const char_t* value, unsigned long long def) + { + if (!value) return def; + + int base = get_integer_base(value); + + #ifdef PUGIXML_WCHAR_MODE + #ifdef PUGI__MSVC_CRT_VERSION 
+ return _wcstoui64(value, 0, base); + #else + return wcstoull(value, 0, base); + #endif + #else + #ifdef PUGI__MSVC_CRT_VERSION + return _strtoui64(value, 0, base); + #else + return strtoull(value, 0, base); + #endif + #endif + } +#endif + + // set value with conversion functions + PUGI__FN bool set_value_buffer(char_t*& dest, uintptr_t& header, uintptr_t header_mask, char (&buf)[128]) + { + #ifdef PUGIXML_WCHAR_MODE + char_t wbuf[128]; + impl::widen_ascii(wbuf, buf); + + return strcpy_insitu(dest, header, header_mask, wbuf); + #else + return strcpy_insitu(dest, header, header_mask, buf); + #endif + } + + PUGI__FN bool set_value_convert(char_t*& dest, uintptr_t& header, uintptr_t header_mask, int value) + { + char buf[128]; + sprintf(buf, "%d", value); + + return set_value_buffer(dest, header, header_mask, buf); + } + + PUGI__FN bool set_value_convert(char_t*& dest, uintptr_t& header, uintptr_t header_mask, unsigned int value) + { + char buf[128]; + sprintf(buf, "%u", value); + + return set_value_buffer(dest, header, header_mask, buf); + } + + PUGI__FN bool set_value_convert(char_t*& dest, uintptr_t& header, uintptr_t header_mask, float value) + { + char buf[128]; + sprintf(buf, "%.9g", value); + + return set_value_buffer(dest, header, header_mask, buf); + } + + PUGI__FN bool set_value_convert(char_t*& dest, uintptr_t& header, uintptr_t header_mask, double value) + { + char buf[128]; + sprintf(buf, "%.17g", value); + + return set_value_buffer(dest, header, header_mask, buf); + } + + PUGI__FN bool set_value_convert(char_t*& dest, uintptr_t& header, uintptr_t header_mask, bool value) + { + return strcpy_insitu(dest, header, header_mask, value ? 
PUGIXML_TEXT("true") : PUGIXML_TEXT("false")); + } + +#ifdef PUGIXML_HAS_LONG_LONG + PUGI__FN bool set_value_convert(char_t*& dest, uintptr_t& header, uintptr_t header_mask, long long value) + { + char buf[128]; + sprintf(buf, "%lld", value); + + return set_value_buffer(dest, header, header_mask, buf); + } + + PUGI__FN bool set_value_convert(char_t*& dest, uintptr_t& header, uintptr_t header_mask, unsigned long long value) + { + char buf[128]; + sprintf(buf, "%llu", value); + + return set_value_buffer(dest, header, header_mask, buf); + } +#endif + + // we need to get length of entire file to load it in memory; the only (relatively) sane way to do it is via seek/tell trick + PUGI__FN xml_parse_status get_file_size(FILE* file, size_t& out_result) + { + #if defined(PUGI__MSVC_CRT_VERSION) && PUGI__MSVC_CRT_VERSION >= 1400 && !defined(_WIN32_WCE) + // there are 64-bit versions of fseek/ftell, let's use them + typedef __int64 length_type; + + _fseeki64(file, 0, SEEK_END); + length_type length = _ftelli64(file); + _fseeki64(file, 0, SEEK_SET); + #elif defined(__MINGW32__) && !defined(__NO_MINGW_LFS) && (!defined(__STRICT_ANSI__) || defined(__MINGW64_VERSION_MAJOR)) + // there are 64-bit versions of fseek/ftell, let's use them + typedef off64_t length_type; + + fseeko64(file, 0, SEEK_END); + length_type length = ftello64(file); + fseeko64(file, 0, SEEK_SET); + #else + // if this is a 32-bit OS, long is enough; if this is a unix system, long is 64-bit, which is enough; otherwise we can't do anything anyway. 
+ typedef long length_type; + + fseek(file, 0, SEEK_END); + length_type length = ftell(file); + fseek(file, 0, SEEK_SET); + #endif + + // check for I/O errors + if (length < 0) return status_io_error; + + // check for overflow + size_t result = static_cast(length); + + if (static_cast(result) != length) return status_out_of_memory; + + // finalize + out_result = result; + + return status_ok; + } + + PUGI__FN size_t zero_terminate_buffer(void* buffer, size_t size, xml_encoding encoding) + { + // We only need to zero-terminate if encoding conversion does not do it for us + #ifdef PUGIXML_WCHAR_MODE + xml_encoding wchar_encoding = get_wchar_encoding(); + + if (encoding == wchar_encoding || need_endian_swap_utf(encoding, wchar_encoding)) + { + size_t length = size / sizeof(char_t); + + static_cast(buffer)[length] = 0; + return (length + 1) * sizeof(char_t); + } + #else + if (encoding == encoding_utf8) + { + static_cast(buffer)[size] = 0; + return size + 1; + } + #endif + + return size; + } + + PUGI__FN xml_parse_result load_file_impl(xml_document& doc, FILE* file, unsigned int options, xml_encoding encoding) + { + if (!file) return make_parse_result(status_file_not_found); + + // get file size (can result in I/O errors) + size_t size = 0; + xml_parse_status size_status = get_file_size(file, size); + + if (size_status != status_ok) + { + fclose(file); + return make_parse_result(size_status); + } + + size_t max_suffix_size = sizeof(char_t); + + // allocate buffer for the whole file + char* contents = static_cast(xml_memory::allocate(size + max_suffix_size)); + + if (!contents) + { + fclose(file); + return make_parse_result(status_out_of_memory); + } + + // read file in memory + size_t read_size = fread(contents, 1, size, file); + fclose(file); + + if (read_size != size) + { + xml_memory::deallocate(contents); + return make_parse_result(status_io_error); + } + + xml_encoding real_encoding = get_buffer_encoding(encoding, contents, size); + + return 
doc.load_buffer_inplace_own(contents, zero_terminate_buffer(contents, size, real_encoding), options, real_encoding); + } + +#ifndef PUGIXML_NO_STL + template struct xml_stream_chunk + { + static xml_stream_chunk* create() + { + void* memory = xml_memory::allocate(sizeof(xml_stream_chunk)); + + return new (memory) xml_stream_chunk(); + } + + static void destroy(void* ptr) + { + xml_stream_chunk* chunk = static_cast(ptr); + + // free chunk chain + while (chunk) + { + xml_stream_chunk* next_ = chunk->next; + + xml_memory::deallocate(chunk); + + chunk = next_; + } + } + + xml_stream_chunk(): next(0), size(0) + { + } + + xml_stream_chunk* next; + size_t size; + + T data[xml_memory_page_size / sizeof(T)]; + }; + + template PUGI__FN xml_parse_status load_stream_data_noseek(std::basic_istream& stream, void** out_buffer, size_t* out_size) + { + buffer_holder chunks(0, xml_stream_chunk::destroy); + + // read file to a chunk list + size_t total = 0; + xml_stream_chunk* last = 0; + + while (!stream.eof()) + { + // allocate new chunk + xml_stream_chunk* chunk = xml_stream_chunk::create(); + if (!chunk) return status_out_of_memory; + + // append chunk to list + if (last) last = last->next = chunk; + else chunks.data = last = chunk; + + // read data to chunk + stream.read(chunk->data, static_cast(sizeof(chunk->data) / sizeof(T))); + chunk->size = static_cast(stream.gcount()) * sizeof(T); + + // read may set failbit | eofbit in case gcount() is less than read length, so check for other I/O errors + if (stream.bad() || (!stream.eof() && stream.fail())) return status_io_error; + + // guard against huge files (chunk size is small enough to make this overflow check work) + if (total + chunk->size < total) return status_out_of_memory; + total += chunk->size; + } + + size_t max_suffix_size = sizeof(char_t); + + // copy chunk list to a contiguous buffer + char* buffer = static_cast(xml_memory::allocate(total + max_suffix_size)); + if (!buffer) return status_out_of_memory; + + char* write 
= buffer; + + for (xml_stream_chunk* chunk = static_cast*>(chunks.data); chunk; chunk = chunk->next) + { + assert(write + chunk->size <= buffer + total); + memcpy(write, chunk->data, chunk->size); + write += chunk->size; + } + + assert(write == buffer + total); + + // return buffer + *out_buffer = buffer; + *out_size = total; + + return status_ok; + } + + template PUGI__FN xml_parse_status load_stream_data_seek(std::basic_istream& stream, void** out_buffer, size_t* out_size) + { + // get length of remaining data in stream + typename std::basic_istream::pos_type pos = stream.tellg(); + stream.seekg(0, std::ios::end); + std::streamoff length = stream.tellg() - pos; + stream.seekg(pos); + + if (stream.fail() || pos < 0) return status_io_error; + + // guard against huge files + size_t read_length = static_cast(length); + + if (static_cast(read_length) != length || length < 0) return status_out_of_memory; + + size_t max_suffix_size = sizeof(char_t); + + // read stream data into memory (guard against stream exceptions with buffer holder) + buffer_holder buffer(xml_memory::allocate(read_length * sizeof(T) + max_suffix_size), xml_memory::deallocate); + if (!buffer.data) return status_out_of_memory; + + stream.read(static_cast(buffer.data), static_cast(read_length)); + + // read may set failbit | eofbit in case gcount() is less than read_length (i.e. 
line ending conversion), so check for other I/O errors + if (stream.bad() || (!stream.eof() && stream.fail())) return status_io_error; + + // return buffer + size_t actual_length = static_cast(stream.gcount()); + assert(actual_length <= read_length); + + *out_buffer = buffer.release(); + *out_size = actual_length * sizeof(T); + + return status_ok; + } + + template PUGI__FN xml_parse_result load_stream_impl(xml_document& doc, std::basic_istream& stream, unsigned int options, xml_encoding encoding) + { + void* buffer = 0; + size_t size = 0; + xml_parse_status status = status_ok; + + // if stream has an error bit set, bail out (otherwise tellg() can fail and we'll clear error bits) + if (stream.fail()) return make_parse_result(status_io_error); + + // load stream to memory (using seek-based implementation if possible, since it's faster and takes less memory) + if (stream.tellg() < 0) + { + stream.clear(); // clear error flags that could be set by a failing tellg + status = load_stream_data_noseek(stream, &buffer, &size); + } + else + status = load_stream_data_seek(stream, &buffer, &size); + + if (status != status_ok) return make_parse_result(status); + + xml_encoding real_encoding = get_buffer_encoding(encoding, buffer, size); + + return doc.load_buffer_inplace_own(buffer, zero_terminate_buffer(buffer, size, real_encoding), options, real_encoding); + } +#endif + +#if defined(PUGI__MSVC_CRT_VERSION) || defined(__BORLANDC__) || (defined(__MINGW32__) && (!defined(__STRICT_ANSI__) || defined(__MINGW64_VERSION_MAJOR))) + PUGI__FN FILE* open_file_wide(const wchar_t* path, const wchar_t* mode) + { + return _wfopen(path, mode); + } +#else + PUGI__FN char* convert_path_heap(const wchar_t* str) + { + assert(str); + + // first pass: get length in utf8 characters + size_t length = strlength_wide(str); + size_t size = as_utf8_begin(str, length); + + // allocate resulting string + char* result = static_cast(xml_memory::allocate(size + 1)); + if (!result) return 0; + + // second 
pass: convert to utf8 + as_utf8_end(result, size, str, length); + + return result; + } + + PUGI__FN FILE* open_file_wide(const wchar_t* path, const wchar_t* mode) + { + // there is no standard function to open wide paths, so our best bet is to try utf8 path + char* path_utf8 = convert_path_heap(path); + if (!path_utf8) return 0; + + // convert mode to ASCII (we mirror _wfopen interface) + char mode_ascii[4] = {0}; + for (size_t i = 0; mode[i]; ++i) mode_ascii[i] = static_cast(mode[i]); + + // try to open the utf8 path + FILE* result = fopen(path_utf8, mode_ascii); + + // free dummy buffer + xml_memory::deallocate(path_utf8); + + return result; + } +#endif + + PUGI__FN bool save_file_impl(const xml_document& doc, FILE* file, const char_t* indent, unsigned int flags, xml_encoding encoding) + { + if (!file) return false; + + xml_writer_file writer(file); + doc.save(writer, indent, flags, encoding); + + int result = ferror(file); + + fclose(file); + + return result == 0; + } + + PUGI__FN xml_parse_result load_buffer_impl(xml_document_struct* doc, xml_node_struct* root, void* contents, size_t size, unsigned int options, xml_encoding encoding, bool is_mutable, bool own, char_t** out_buffer) + { + // check input buffer + assert(contents || size == 0); + + // get actual encoding + xml_encoding buffer_encoding = impl::get_buffer_encoding(encoding, contents, size); + + // get private buffer + char_t* buffer = 0; + size_t length = 0; + + if (!impl::convert_buffer(buffer, length, buffer_encoding, contents, size, is_mutable)) return impl::make_parse_result(status_out_of_memory); + + // delete original buffer if we performed a conversion + if (own && buffer != contents && contents) impl::xml_memory::deallocate(contents); + + // store buffer for offset_debug + doc->buffer = buffer; + + // parse + xml_parse_result res = impl::xml_parser::parse(buffer, length, doc, root, options); + + // remember encoding + res.encoding = buffer_encoding; + + // grab onto buffer if it's our buffer, 
user is responsible for deallocating contents himself + if (own || buffer != contents) *out_buffer = buffer; + + return res; + } +PUGI__NS_END + +namespace pugi +{ + PUGI__FN xml_writer_file::xml_writer_file(void* file_): file(file_) + { + } + + PUGI__FN void xml_writer_file::write(const void* data, size_t size) + { + size_t result = fwrite(data, 1, size, static_cast(file)); + (void)!result; // unfortunately we can't do proper error handling here + } + +#ifndef PUGIXML_NO_STL + PUGI__FN xml_writer_stream::xml_writer_stream(std::basic_ostream >& stream): narrow_stream(&stream), wide_stream(0) + { + } + + PUGI__FN xml_writer_stream::xml_writer_stream(std::basic_ostream >& stream): narrow_stream(0), wide_stream(&stream) + { + } + + PUGI__FN void xml_writer_stream::write(const void* data, size_t size) + { + if (narrow_stream) + { + assert(!wide_stream); + narrow_stream->write(reinterpret_cast(data), static_cast(size)); + } + else + { + assert(wide_stream); + assert(size % sizeof(wchar_t) == 0); + + wide_stream->write(reinterpret_cast(data), static_cast(size / sizeof(wchar_t))); + } + } +#endif + + PUGI__FN xml_tree_walker::xml_tree_walker(): _depth(0) + { + } + + PUGI__FN xml_tree_walker::~xml_tree_walker() + { + } + + PUGI__FN int xml_tree_walker::depth() const + { + return _depth; + } + + PUGI__FN bool xml_tree_walker::begin(xml_node&) + { + return true; + } + + PUGI__FN bool xml_tree_walker::end(xml_node&) + { + return true; + } + + PUGI__FN xml_attribute::xml_attribute(): _attr(0) + { + } + + PUGI__FN xml_attribute::xml_attribute(xml_attribute_struct* attr): _attr(attr) + { + } + + PUGI__FN static void unspecified_bool_xml_attribute(xml_attribute***) + { + } + + PUGI__FN xml_attribute::operator xml_attribute::unspecified_bool_type() const + { + return _attr ? 
unspecified_bool_xml_attribute : 0; + } + + PUGI__FN bool xml_attribute::operator!() const + { + return !_attr; + } + + PUGI__FN bool xml_attribute::operator==(const xml_attribute& r) const + { + return (_attr == r._attr); + } + + PUGI__FN bool xml_attribute::operator!=(const xml_attribute& r) const + { + return (_attr != r._attr); + } + + PUGI__FN bool xml_attribute::operator<(const xml_attribute& r) const + { + return (_attr < r._attr); + } + + PUGI__FN bool xml_attribute::operator>(const xml_attribute& r) const + { + return (_attr > r._attr); + } + + PUGI__FN bool xml_attribute::operator<=(const xml_attribute& r) const + { + return (_attr <= r._attr); + } + + PUGI__FN bool xml_attribute::operator>=(const xml_attribute& r) const + { + return (_attr >= r._attr); + } + + PUGI__FN xml_attribute xml_attribute::next_attribute() const + { + return _attr ? xml_attribute(_attr->next_attribute) : xml_attribute(); + } + + PUGI__FN xml_attribute xml_attribute::previous_attribute() const + { + return _attr && _attr->prev_attribute_c->next_attribute ? xml_attribute(_attr->prev_attribute_c) : xml_attribute(); + } + + PUGI__FN const char_t* xml_attribute::as_string(const char_t* def) const + { + return (_attr && _attr->value) ? _attr->value : def; + } + + PUGI__FN int xml_attribute::as_int(int def) const + { + return impl::get_value_int(_attr ? _attr->value : 0, def); + } + + PUGI__FN unsigned int xml_attribute::as_uint(unsigned int def) const + { + return impl::get_value_uint(_attr ? _attr->value : 0, def); + } + + PUGI__FN double xml_attribute::as_double(double def) const + { + return impl::get_value_double(_attr ? _attr->value : 0, def); + } + + PUGI__FN float xml_attribute::as_float(float def) const + { + return impl::get_value_float(_attr ? _attr->value : 0, def); + } + + PUGI__FN bool xml_attribute::as_bool(bool def) const + { + return impl::get_value_bool(_attr ? 
_attr->value : 0, def); + } + +#ifdef PUGIXML_HAS_LONG_LONG + PUGI__FN long long xml_attribute::as_llong(long long def) const + { + return impl::get_value_llong(_attr ? _attr->value : 0, def); + } + + PUGI__FN unsigned long long xml_attribute::as_ullong(unsigned long long def) const + { + return impl::get_value_ullong(_attr ? _attr->value : 0, def); + } +#endif + + PUGI__FN bool xml_attribute::empty() const + { + return !_attr; + } + + PUGI__FN const char_t* xml_attribute::name() const + { + return (_attr && _attr->name) ? _attr->name : PUGIXML_TEXT(""); + } + + PUGI__FN const char_t* xml_attribute::value() const + { + return (_attr && _attr->value) ? _attr->value : PUGIXML_TEXT(""); + } + + PUGI__FN size_t xml_attribute::hash_value() const + { + return static_cast(reinterpret_cast(_attr) / sizeof(xml_attribute_struct)); + } + + PUGI__FN xml_attribute_struct* xml_attribute::internal_object() const + { + return _attr; + } + + PUGI__FN xml_attribute& xml_attribute::operator=(const char_t* rhs) + { + set_value(rhs); + return *this; + } + + PUGI__FN xml_attribute& xml_attribute::operator=(int rhs) + { + set_value(rhs); + return *this; + } + + PUGI__FN xml_attribute& xml_attribute::operator=(unsigned int rhs) + { + set_value(rhs); + return *this; + } + + PUGI__FN xml_attribute& xml_attribute::operator=(double rhs) + { + set_value(rhs); + return *this; + } + + PUGI__FN xml_attribute& xml_attribute::operator=(float rhs) + { + set_value(rhs); + return *this; + } + + PUGI__FN xml_attribute& xml_attribute::operator=(bool rhs) + { + set_value(rhs); + return *this; + } + +#ifdef PUGIXML_HAS_LONG_LONG + PUGI__FN xml_attribute& xml_attribute::operator=(long long rhs) + { + set_value(rhs); + return *this; + } + + PUGI__FN xml_attribute& xml_attribute::operator=(unsigned long long rhs) + { + set_value(rhs); + return *this; + } +#endif + + PUGI__FN bool xml_attribute::set_name(const char_t* rhs) + { + if (!_attr) return false; + + return impl::strcpy_insitu(_attr->name, 
_attr->header, impl::xml_memory_page_name_allocated_mask, rhs); + } + + PUGI__FN bool xml_attribute::set_value(const char_t* rhs) + { + if (!_attr) return false; + + return impl::strcpy_insitu(_attr->value, _attr->header, impl::xml_memory_page_value_allocated_mask, rhs); + } + + PUGI__FN bool xml_attribute::set_value(int rhs) + { + if (!_attr) return false; + + return impl::set_value_convert(_attr->value, _attr->header, impl::xml_memory_page_value_allocated_mask, rhs); + } + + PUGI__FN bool xml_attribute::set_value(unsigned int rhs) + { + if (!_attr) return false; + + return impl::set_value_convert(_attr->value, _attr->header, impl::xml_memory_page_value_allocated_mask, rhs); + } + + PUGI__FN bool xml_attribute::set_value(double rhs) + { + if (!_attr) return false; + + return impl::set_value_convert(_attr->value, _attr->header, impl::xml_memory_page_value_allocated_mask, rhs); + } + + PUGI__FN bool xml_attribute::set_value(float rhs) + { + if (!_attr) return false; + + return impl::set_value_convert(_attr->value, _attr->header, impl::xml_memory_page_value_allocated_mask, rhs); + } + + PUGI__FN bool xml_attribute::set_value(bool rhs) + { + if (!_attr) return false; + + return impl::set_value_convert(_attr->value, _attr->header, impl::xml_memory_page_value_allocated_mask, rhs); + } + +#ifdef PUGIXML_HAS_LONG_LONG + PUGI__FN bool xml_attribute::set_value(long long rhs) + { + if (!_attr) return false; + + return impl::set_value_convert(_attr->value, _attr->header, impl::xml_memory_page_value_allocated_mask, rhs); + } + + PUGI__FN bool xml_attribute::set_value(unsigned long long rhs) + { + if (!_attr) return false; + + return impl::set_value_convert(_attr->value, _attr->header, impl::xml_memory_page_value_allocated_mask, rhs); + } +#endif + +#ifdef __BORLANDC__ + PUGI__FN bool operator&&(const xml_attribute& lhs, bool rhs) + { + return (bool)lhs && rhs; + } + + PUGI__FN bool operator||(const xml_attribute& lhs, bool rhs) + { + return (bool)lhs || rhs; + } +#endif + + 
PUGI__FN xml_node::xml_node(): _root(0) + { + } + + PUGI__FN xml_node::xml_node(xml_node_struct* p): _root(p) + { + } + + PUGI__FN static void unspecified_bool_xml_node(xml_node***) + { + } + + PUGI__FN xml_node::operator xml_node::unspecified_bool_type() const + { + return _root ? unspecified_bool_xml_node : 0; + } + + PUGI__FN bool xml_node::operator!() const + { + return !_root; + } + + PUGI__FN xml_node::iterator xml_node::begin() const + { + return iterator(_root ? _root->first_child : 0, _root); + } + + PUGI__FN xml_node::iterator xml_node::end() const + { + return iterator(0, _root); + } + + PUGI__FN xml_node::attribute_iterator xml_node::attributes_begin() const + { + return attribute_iterator(_root ? _root->first_attribute : 0, _root); + } + + PUGI__FN xml_node::attribute_iterator xml_node::attributes_end() const + { + return attribute_iterator(0, _root); + } + + PUGI__FN xml_object_range xml_node::children() const + { + return xml_object_range(begin(), end()); + } + + PUGI__FN xml_object_range xml_node::children(const char_t* name_) const + { + return xml_object_range(xml_named_node_iterator(child(name_)._root, _root, name_), xml_named_node_iterator(0, _root, name_)); + } + + PUGI__FN xml_object_range xml_node::attributes() const + { + return xml_object_range(attributes_begin(), attributes_end()); + } + + PUGI__FN bool xml_node::operator==(const xml_node& r) const + { + return (_root == r._root); + } + + PUGI__FN bool xml_node::operator!=(const xml_node& r) const + { + return (_root != r._root); + } + + PUGI__FN bool xml_node::operator<(const xml_node& r) const + { + return (_root < r._root); + } + + PUGI__FN bool xml_node::operator>(const xml_node& r) const + { + return (_root > r._root); + } + + PUGI__FN bool xml_node::operator<=(const xml_node& r) const + { + return (_root <= r._root); + } + + PUGI__FN bool xml_node::operator>=(const xml_node& r) const + { + return (_root >= r._root); + } + + PUGI__FN bool xml_node::empty() const + { + return !_root; + 
} + + PUGI__FN const char_t* xml_node::name() const + { + return (_root && _root->name) ? _root->name : PUGIXML_TEXT(""); + } + + PUGI__FN xml_node_type xml_node::type() const + { + return _root ? PUGI__NODETYPE(_root) : node_null; + } + + PUGI__FN const char_t* xml_node::value() const + { + return (_root && _root->value) ? _root->value : PUGIXML_TEXT(""); + } + + PUGI__FN xml_node xml_node::child(const char_t* name_) const + { + if (!_root) return xml_node(); + + for (xml_node_struct* i = _root->first_child; i; i = i->next_sibling) + if (i->name && impl::strequal(name_, i->name)) return xml_node(i); + + return xml_node(); + } + + PUGI__FN xml_attribute xml_node::attribute(const char_t* name_) const + { + if (!_root) return xml_attribute(); + + for (xml_attribute_struct* i = _root->first_attribute; i; i = i->next_attribute) + if (i->name && impl::strequal(name_, i->name)) + return xml_attribute(i); + + return xml_attribute(); + } + + PUGI__FN xml_node xml_node::next_sibling(const char_t* name_) const + { + if (!_root) return xml_node(); + + for (xml_node_struct* i = _root->next_sibling; i; i = i->next_sibling) + if (i->name && impl::strequal(name_, i->name)) return xml_node(i); + + return xml_node(); + } + + PUGI__FN xml_node xml_node::next_sibling() const + { + return _root ? xml_node(_root->next_sibling) : xml_node(); + } + + PUGI__FN xml_node xml_node::previous_sibling(const char_t* name_) const + { + if (!_root) return xml_node(); + + for (xml_node_struct* i = _root->prev_sibling_c; i->next_sibling; i = i->prev_sibling_c) + if (i->name && impl::strequal(name_, i->name)) return xml_node(i); + + return xml_node(); + } + + PUGI__FN xml_node xml_node::previous_sibling() const + { + if (!_root) return xml_node(); + + if (_root->prev_sibling_c->next_sibling) return xml_node(_root->prev_sibling_c); + else return xml_node(); + } + + PUGI__FN xml_node xml_node::parent() const + { + return _root ? 
xml_node(_root->parent) : xml_node(); + } + + PUGI__FN xml_node xml_node::root() const + { + return _root ? xml_node(&impl::get_document(_root)) : xml_node(); + } + + PUGI__FN xml_text xml_node::text() const + { + return xml_text(_root); + } + + PUGI__FN const char_t* xml_node::child_value() const + { + if (!_root) return PUGIXML_TEXT(""); + + for (xml_node_struct* i = _root->first_child; i; i = i->next_sibling) + if (i->value && impl::is_text_node(i)) + return i->value; + + return PUGIXML_TEXT(""); + } + + PUGI__FN const char_t* xml_node::child_value(const char_t* name_) const + { + return child(name_).child_value(); + } + + PUGI__FN xml_attribute xml_node::first_attribute() const + { + return _root ? xml_attribute(_root->first_attribute) : xml_attribute(); + } + + PUGI__FN xml_attribute xml_node::last_attribute() const + { + return _root && _root->first_attribute ? xml_attribute(_root->first_attribute->prev_attribute_c) : xml_attribute(); + } + + PUGI__FN xml_node xml_node::first_child() const + { + return _root ? xml_node(_root->first_child) : xml_node(); + } + + PUGI__FN xml_node xml_node::last_child() const + { + return _root && _root->first_child ? 
xml_node(_root->first_child->prev_sibling_c) : xml_node(); + } + + PUGI__FN bool xml_node::set_name(const char_t* rhs) + { + switch (type()) + { + case node_pi: + case node_declaration: + case node_element: + return impl::strcpy_insitu(_root->name, _root->header, impl::xml_memory_page_name_allocated_mask, rhs); + + default: + return false; + } + } + + PUGI__FN bool xml_node::set_value(const char_t* rhs) + { + switch (type()) + { + case node_pi: + case node_cdata: + case node_pcdata: + case node_comment: + case node_doctype: + return impl::strcpy_insitu(_root->value, _root->header, impl::xml_memory_page_value_allocated_mask, rhs); + + default: + return false; + } + } + + PUGI__FN xml_attribute xml_node::append_attribute(const char_t* name_) + { + if (!impl::allow_insert_attribute(type())) return xml_attribute(); + + xml_attribute a(impl::allocate_attribute(impl::get_allocator(_root))); + if (!a) return xml_attribute(); + + impl::append_attribute(a._attr, _root); + + a.set_name(name_); + + return a; + } + + PUGI__FN xml_attribute xml_node::prepend_attribute(const char_t* name_) + { + if (!impl::allow_insert_attribute(type())) return xml_attribute(); + + xml_attribute a(impl::allocate_attribute(impl::get_allocator(_root))); + if (!a) return xml_attribute(); + + impl::prepend_attribute(a._attr, _root); + + a.set_name(name_); + + return a; + } + + PUGI__FN xml_attribute xml_node::insert_attribute_after(const char_t* name_, const xml_attribute& attr) + { + if (!impl::allow_insert_attribute(type())) return xml_attribute(); + if (!attr || !impl::is_attribute_of(attr._attr, _root)) return xml_attribute(); + + xml_attribute a(impl::allocate_attribute(impl::get_allocator(_root))); + if (!a) return xml_attribute(); + + impl::insert_attribute_after(a._attr, attr._attr, _root); + + a.set_name(name_); + + return a; + } + + PUGI__FN xml_attribute xml_node::insert_attribute_before(const char_t* name_, const xml_attribute& attr) + { + if (!impl::allow_insert_attribute(type())) 
return xml_attribute(); + if (!attr || !impl::is_attribute_of(attr._attr, _root)) return xml_attribute(); + + xml_attribute a(impl::allocate_attribute(impl::get_allocator(_root))); + if (!a) return xml_attribute(); + + impl::insert_attribute_before(a._attr, attr._attr, _root); + + a.set_name(name_); + + return a; + } + + PUGI__FN xml_attribute xml_node::append_copy(const xml_attribute& proto) + { + if (!proto) return xml_attribute(); + + xml_attribute result = append_attribute(proto.name()); + result.set_value(proto.value()); + + return result; + } + + PUGI__FN xml_attribute xml_node::prepend_copy(const xml_attribute& proto) + { + if (!proto) return xml_attribute(); + + xml_attribute result = prepend_attribute(proto.name()); + result.set_value(proto.value()); + + return result; + } + + PUGI__FN xml_attribute xml_node::insert_copy_after(const xml_attribute& proto, const xml_attribute& attr) + { + if (!proto) return xml_attribute(); + + xml_attribute result = insert_attribute_after(proto.name(), attr); + result.set_value(proto.value()); + + return result; + } + + PUGI__FN xml_attribute xml_node::insert_copy_before(const xml_attribute& proto, const xml_attribute& attr) + { + if (!proto) return xml_attribute(); + + xml_attribute result = insert_attribute_before(proto.name(), attr); + result.set_value(proto.value()); + + return result; + } + + PUGI__FN xml_node xml_node::append_child(xml_node_type type_) + { + if (!impl::allow_insert_child(type(), type_)) return xml_node(); + + xml_node n(impl::allocate_node(impl::get_allocator(_root), type_)); + if (!n) return xml_node(); + + impl::append_node(n._root, _root); + + if (type_ == node_declaration) n.set_name(PUGIXML_TEXT("xml")); + + return n; + } + + PUGI__FN xml_node xml_node::prepend_child(xml_node_type type_) + { + if (!impl::allow_insert_child(type(), type_)) return xml_node(); + + xml_node n(impl::allocate_node(impl::get_allocator(_root), type_)); + if (!n) return xml_node(); + + impl::prepend_node(n._root, _root); 
+ + if (type_ == node_declaration) n.set_name(PUGIXML_TEXT("xml")); + + return n; + } + + PUGI__FN xml_node xml_node::insert_child_before(xml_node_type type_, const xml_node& node) + { + if (!impl::allow_insert_child(type(), type_)) return xml_node(); + if (!node._root || node._root->parent != _root) return xml_node(); + + xml_node n(impl::allocate_node(impl::get_allocator(_root), type_)); + if (!n) return xml_node(); + + impl::insert_node_before(n._root, node._root); + + if (type_ == node_declaration) n.set_name(PUGIXML_TEXT("xml")); + + return n; + } + + PUGI__FN xml_node xml_node::insert_child_after(xml_node_type type_, const xml_node& node) + { + if (!impl::allow_insert_child(type(), type_)) return xml_node(); + if (!node._root || node._root->parent != _root) return xml_node(); + + xml_node n(impl::allocate_node(impl::get_allocator(_root), type_)); + if (!n) return xml_node(); + + impl::insert_node_after(n._root, node._root); + + if (type_ == node_declaration) n.set_name(PUGIXML_TEXT("xml")); + + return n; + } + + PUGI__FN xml_node xml_node::append_child(const char_t* name_) + { + xml_node result = append_child(node_element); + + result.set_name(name_); + + return result; + } + + PUGI__FN xml_node xml_node::prepend_child(const char_t* name_) + { + xml_node result = prepend_child(node_element); + + result.set_name(name_); + + return result; + } + + PUGI__FN xml_node xml_node::insert_child_after(const char_t* name_, const xml_node& node) + { + xml_node result = insert_child_after(node_element, node); + + result.set_name(name_); + + return result; + } + + PUGI__FN xml_node xml_node::insert_child_before(const char_t* name_, const xml_node& node) + { + xml_node result = insert_child_before(node_element, node); + + result.set_name(name_); + + return result; + } + + PUGI__FN xml_node xml_node::append_copy(const xml_node& proto) + { + xml_node_type type_ = proto.type(); + if (!impl::allow_insert_child(type(), type_)) return xml_node(); + + xml_node 
n(impl::allocate_node(impl::get_allocator(_root), type_)); + if (!n) return xml_node(); + + impl::append_node(n._root, _root); + impl::node_copy_tree(n._root, proto._root); + + return n; + } + + PUGI__FN xml_node xml_node::prepend_copy(const xml_node& proto) + { + xml_node_type type_ = proto.type(); + if (!impl::allow_insert_child(type(), type_)) return xml_node(); + + xml_node n(impl::allocate_node(impl::get_allocator(_root), type_)); + if (!n) return xml_node(); + + impl::prepend_node(n._root, _root); + impl::node_copy_tree(n._root, proto._root); + + return n; + } + + PUGI__FN xml_node xml_node::insert_copy_after(const xml_node& proto, const xml_node& node) + { + xml_node_type type_ = proto.type(); + if (!impl::allow_insert_child(type(), type_)) return xml_node(); + if (!node._root || node._root->parent != _root) return xml_node(); + + xml_node n(impl::allocate_node(impl::get_allocator(_root), type_)); + if (!n) return xml_node(); + + impl::insert_node_after(n._root, node._root); + impl::node_copy_tree(n._root, proto._root); + + return n; + } + + PUGI__FN xml_node xml_node::insert_copy_before(const xml_node& proto, const xml_node& node) + { + xml_node_type type_ = proto.type(); + if (!impl::allow_insert_child(type(), type_)) return xml_node(); + if (!node._root || node._root->parent != _root) return xml_node(); + + xml_node n(impl::allocate_node(impl::get_allocator(_root), type_)); + if (!n) return xml_node(); + + impl::insert_node_before(n._root, node._root); + impl::node_copy_tree(n._root, proto._root); + + return n; + } + + PUGI__FN xml_node xml_node::append_move(const xml_node& moved) + { + if (!impl::allow_move(*this, moved)) return xml_node(); + + // disable document_buffer_order optimization since moving nodes around changes document order without changing buffer pointers + impl::get_document(_root).header |= impl::xml_memory_page_contents_shared_mask; + + impl::remove_node(moved._root); + impl::append_node(moved._root, _root); + + return moved; + } + + 
PUGI__FN xml_node xml_node::prepend_move(const xml_node& moved) + { + if (!impl::allow_move(*this, moved)) return xml_node(); + + // disable document_buffer_order optimization since moving nodes around changes document order without changing buffer pointers + impl::get_document(_root).header |= impl::xml_memory_page_contents_shared_mask; + + impl::remove_node(moved._root); + impl::prepend_node(moved._root, _root); + + return moved; + } + + PUGI__FN xml_node xml_node::insert_move_after(const xml_node& moved, const xml_node& node) + { + if (!impl::allow_move(*this, moved)) return xml_node(); + if (!node._root || node._root->parent != _root) return xml_node(); + if (moved._root == node._root) return xml_node(); + + // disable document_buffer_order optimization since moving nodes around changes document order without changing buffer pointers + impl::get_document(_root).header |= impl::xml_memory_page_contents_shared_mask; + + impl::remove_node(moved._root); + impl::insert_node_after(moved._root, node._root); + + return moved; + } + + PUGI__FN xml_node xml_node::insert_move_before(const xml_node& moved, const xml_node& node) + { + if (!impl::allow_move(*this, moved)) return xml_node(); + if (!node._root || node._root->parent != _root) return xml_node(); + if (moved._root == node._root) return xml_node(); + + // disable document_buffer_order optimization since moving nodes around changes document order without changing buffer pointers + impl::get_document(_root).header |= impl::xml_memory_page_contents_shared_mask; + + impl::remove_node(moved._root); + impl::insert_node_before(moved._root, node._root); + + return moved; + } + + PUGI__FN bool xml_node::remove_attribute(const char_t* name_) + { + return remove_attribute(attribute(name_)); + } + + PUGI__FN bool xml_node::remove_attribute(const xml_attribute& a) + { + if (!_root || !a._attr) return false; + if (!impl::is_attribute_of(a._attr, _root)) return false; + + impl::remove_attribute(a._attr, _root); + 
impl::destroy_attribute(a._attr, impl::get_allocator(_root)); + + return true; + } + + PUGI__FN bool xml_node::remove_child(const char_t* name_) + { + return remove_child(child(name_)); + } + + PUGI__FN bool xml_node::remove_child(const xml_node& n) + { + if (!_root || !n._root || n._root->parent != _root) return false; + + impl::remove_node(n._root); + impl::destroy_node(n._root, impl::get_allocator(_root)); + + return true; + } + + PUGI__FN xml_parse_result xml_node::append_buffer(const void* contents, size_t size, unsigned int options, xml_encoding encoding) + { + // append_buffer is only valid for elements/documents + if (!impl::allow_insert_child(type(), node_element)) return impl::make_parse_result(status_append_invalid_root); + + // get document node + impl::xml_document_struct* doc = &impl::get_document(_root); + + // disable document_buffer_order optimization since in a document with multiple buffers comparing buffer pointers does not make sense + doc->header |= impl::xml_memory_page_contents_shared_mask; + + // get extra buffer element (we'll store the document fragment buffer there so that we can deallocate it later) + impl::xml_memory_page* page = 0; + impl::xml_extra_buffer* extra = static_cast(doc->allocate_memory(sizeof(impl::xml_extra_buffer), page)); + (void)page; + + if (!extra) return impl::make_parse_result(status_out_of_memory); + + // save name; name of the root has to be NULL before parsing - otherwise closing node mismatches will not be detected at the top level + char_t* rootname = _root->name; + _root->name = 0; + + // parse + char_t* buffer = 0; + xml_parse_result res = impl::load_buffer_impl(doc, _root, const_cast(contents), size, options, encoding, false, false, &buffer); + + // restore name + _root->name = rootname; + + // add extra buffer to the list + extra->buffer = buffer; + extra->next = doc->extra_buffers; + doc->extra_buffers = extra; + + return res; + } + + PUGI__FN xml_node xml_node::find_child_by_attribute(const char_t* 
name_, const char_t* attr_name, const char_t* attr_value) const + { + if (!_root) return xml_node(); + + for (xml_node_struct* i = _root->first_child; i; i = i->next_sibling) + if (i->name && impl::strequal(name_, i->name)) + { + for (xml_attribute_struct* a = i->first_attribute; a; a = a->next_attribute) + if (a->name && impl::strequal(attr_name, a->name) && impl::strequal(attr_value, a->value ? a->value : PUGIXML_TEXT(""))) + return xml_node(i); + } + + return xml_node(); + } + + PUGI__FN xml_node xml_node::find_child_by_attribute(const char_t* attr_name, const char_t* attr_value) const + { + if (!_root) return xml_node(); + + for (xml_node_struct* i = _root->first_child; i; i = i->next_sibling) + for (xml_attribute_struct* a = i->first_attribute; a; a = a->next_attribute) + if (a->name && impl::strequal(attr_name, a->name) && impl::strequal(attr_value, a->value ? a->value : PUGIXML_TEXT(""))) + return xml_node(i); + + return xml_node(); + } + +#ifndef PUGIXML_NO_STL + PUGI__FN string_t xml_node::path(char_t delimiter) const + { + xml_node cursor = *this; // Make a copy. + + string_t result = cursor.name(); + + while (cursor.parent()) + { + cursor = cursor.parent(); + + string_t temp = cursor.name(); + temp += delimiter; + temp += result; + result.swap(temp); + } + + return result; + } +#endif + + PUGI__FN xml_node xml_node::first_element_by_path(const char_t* path_, char_t delimiter) const + { + xml_node found = *this; // Current search context. + + if (!_root || !path_ || !path_[0]) return found; + + if (path_[0] == delimiter) + { + // Absolute path; e.g. 
'/foo/bar' + found = found.root(); + ++path_; + } + + const char_t* path_segment = path_; + + while (*path_segment == delimiter) ++path_segment; + + const char_t* path_segment_end = path_segment; + + while (*path_segment_end && *path_segment_end != delimiter) ++path_segment_end; + + if (path_segment == path_segment_end) return found; + + const char_t* next_segment = path_segment_end; + + while (*next_segment == delimiter) ++next_segment; + + if (*path_segment == '.' && path_segment + 1 == path_segment_end) + return found.first_element_by_path(next_segment, delimiter); + else if (*path_segment == '.' && *(path_segment+1) == '.' && path_segment + 2 == path_segment_end) + return found.parent().first_element_by_path(next_segment, delimiter); + else + { + for (xml_node_struct* j = found._root->first_child; j; j = j->next_sibling) + { + if (j->name && impl::strequalrange(j->name, path_segment, static_cast(path_segment_end - path_segment))) + { + xml_node subsearch = xml_node(j).first_element_by_path(next_segment, delimiter); + + if (subsearch) return subsearch; + } + } + + return xml_node(); + } + } + + PUGI__FN bool xml_node::traverse(xml_tree_walker& walker) + { + walker._depth = -1; + + xml_node arg_begin = *this; + if (!walker.begin(arg_begin)) return false; + + xml_node cur = first_child(); + + if (cur) + { + ++walker._depth; + + do + { + xml_node arg_for_each = cur; + if (!walker.for_each(arg_for_each)) + return false; + + if (cur.first_child()) + { + ++walker._depth; + cur = cur.first_child(); + } + else if (cur.next_sibling()) + cur = cur.next_sibling(); + else + { + // Borland C++ workaround + while (!cur.next_sibling() && cur != *this && !cur.parent().empty()) + { + --walker._depth; + cur = cur.parent(); + } + + if (cur != *this) + cur = cur.next_sibling(); + } + } + while (cur && cur != *this); + } + + assert(walker._depth == -1); + + xml_node arg_end = *this; + return walker.end(arg_end); + } + + PUGI__FN size_t xml_node::hash_value() const + { + return 
static_cast(reinterpret_cast(_root) / sizeof(xml_node_struct)); + } + + PUGI__FN xml_node_struct* xml_node::internal_object() const + { + return _root; + } + + PUGI__FN void xml_node::print(xml_writer& writer, const char_t* indent, unsigned int flags, xml_encoding encoding, unsigned int depth) const + { + if (!_root) return; + + impl::xml_buffered_writer buffered_writer(writer, encoding); + + impl::node_output(buffered_writer, _root, indent, flags, depth); + } + +#ifndef PUGIXML_NO_STL + PUGI__FN void xml_node::print(std::basic_ostream >& stream, const char_t* indent, unsigned int flags, xml_encoding encoding, unsigned int depth) const + { + xml_writer_stream writer(stream); + + print(writer, indent, flags, encoding, depth); + } + + PUGI__FN void xml_node::print(std::basic_ostream >& stream, const char_t* indent, unsigned int flags, unsigned int depth) const + { + xml_writer_stream writer(stream); + + print(writer, indent, flags, encoding_wchar, depth); + } +#endif + + PUGI__FN ptrdiff_t xml_node::offset_debug() const + { + if (!_root) return -1; + + impl::xml_document_struct& doc = impl::get_document(_root); + + // we can determine the offset reliably only if there is exactly once parse buffer + if (!doc.buffer || doc.extra_buffers) return -1; + + switch (type()) + { + case node_document: + return 0; + + case node_element: + case node_declaration: + case node_pi: + return _root->name && (_root->header & impl::xml_memory_page_name_allocated_or_shared_mask) == 0 ? _root->name - doc.buffer : -1; + + case node_pcdata: + case node_cdata: + case node_comment: + case node_doctype: + return _root->value && (_root->header & impl::xml_memory_page_value_allocated_or_shared_mask) == 0 ? 
_root->value - doc.buffer : -1; + + default: + return -1; + } + } + +#ifdef __BORLANDC__ + PUGI__FN bool operator&&(const xml_node& lhs, bool rhs) + { + return (bool)lhs && rhs; + } + + PUGI__FN bool operator||(const xml_node& lhs, bool rhs) + { + return (bool)lhs || rhs; + } +#endif + + PUGI__FN xml_text::xml_text(xml_node_struct* root): _root(root) + { + } + + PUGI__FN xml_node_struct* xml_text::_data() const + { + if (!_root || impl::is_text_node(_root)) return _root; + + for (xml_node_struct* node = _root->first_child; node; node = node->next_sibling) + if (impl::is_text_node(node)) + return node; + + return 0; + } + + PUGI__FN xml_node_struct* xml_text::_data_new() + { + xml_node_struct* d = _data(); + if (d) return d; + + return xml_node(_root).append_child(node_pcdata).internal_object(); + } + + PUGI__FN xml_text::xml_text(): _root(0) + { + } + + PUGI__FN static void unspecified_bool_xml_text(xml_text***) + { + } + + PUGI__FN xml_text::operator xml_text::unspecified_bool_type() const + { + return _data() ? unspecified_bool_xml_text : 0; + } + + PUGI__FN bool xml_text::operator!() const + { + return !_data(); + } + + PUGI__FN bool xml_text::empty() const + { + return _data() == 0; + } + + PUGI__FN const char_t* xml_text::get() const + { + xml_node_struct* d = _data(); + + return (d && d->value) ? d->value : PUGIXML_TEXT(""); + } + + PUGI__FN const char_t* xml_text::as_string(const char_t* def) const + { + xml_node_struct* d = _data(); + + return (d && d->value) ? d->value : def; + } + + PUGI__FN int xml_text::as_int(int def) const + { + xml_node_struct* d = _data(); + + return impl::get_value_int(d ? d->value : 0, def); + } + + PUGI__FN unsigned int xml_text::as_uint(unsigned int def) const + { + xml_node_struct* d = _data(); + + return impl::get_value_uint(d ? d->value : 0, def); + } + + PUGI__FN double xml_text::as_double(double def) const + { + xml_node_struct* d = _data(); + + return impl::get_value_double(d ? 
d->value : 0, def); + } + + PUGI__FN float xml_text::as_float(float def) const + { + xml_node_struct* d = _data(); + + return impl::get_value_float(d ? d->value : 0, def); + } + + PUGI__FN bool xml_text::as_bool(bool def) const + { + xml_node_struct* d = _data(); + + return impl::get_value_bool(d ? d->value : 0, def); + } + +#ifdef PUGIXML_HAS_LONG_LONG + PUGI__FN long long xml_text::as_llong(long long def) const + { + xml_node_struct* d = _data(); + + return impl::get_value_llong(d ? d->value : 0, def); + } + + PUGI__FN unsigned long long xml_text::as_ullong(unsigned long long def) const + { + xml_node_struct* d = _data(); + + return impl::get_value_ullong(d ? d->value : 0, def); + } +#endif + + PUGI__FN bool xml_text::set(const char_t* rhs) + { + xml_node_struct* dn = _data_new(); + + return dn ? impl::strcpy_insitu(dn->value, dn->header, impl::xml_memory_page_value_allocated_mask, rhs) : false; + } + + PUGI__FN bool xml_text::set(int rhs) + { + xml_node_struct* dn = _data_new(); + + return dn ? impl::set_value_convert(dn->value, dn->header, impl::xml_memory_page_value_allocated_mask, rhs) : false; + } + + PUGI__FN bool xml_text::set(unsigned int rhs) + { + xml_node_struct* dn = _data_new(); + + return dn ? impl::set_value_convert(dn->value, dn->header, impl::xml_memory_page_value_allocated_mask, rhs) : false; + } + + PUGI__FN bool xml_text::set(float rhs) + { + xml_node_struct* dn = _data_new(); + + return dn ? impl::set_value_convert(dn->value, dn->header, impl::xml_memory_page_value_allocated_mask, rhs) : false; + } + + PUGI__FN bool xml_text::set(double rhs) + { + xml_node_struct* dn = _data_new(); + + return dn ? impl::set_value_convert(dn->value, dn->header, impl::xml_memory_page_value_allocated_mask, rhs) : false; + } + + PUGI__FN bool xml_text::set(bool rhs) + { + xml_node_struct* dn = _data_new(); + + return dn ? 
impl::set_value_convert(dn->value, dn->header, impl::xml_memory_page_value_allocated_mask, rhs) : false; + } + +#ifdef PUGIXML_HAS_LONG_LONG + PUGI__FN bool xml_text::set(long long rhs) + { + xml_node_struct* dn = _data_new(); + + return dn ? impl::set_value_convert(dn->value, dn->header, impl::xml_memory_page_value_allocated_mask, rhs) : false; + } + + PUGI__FN bool xml_text::set(unsigned long long rhs) + { + xml_node_struct* dn = _data_new(); + + return dn ? impl::set_value_convert(dn->value, dn->header, impl::xml_memory_page_value_allocated_mask, rhs) : false; + } +#endif + + PUGI__FN xml_text& xml_text::operator=(const char_t* rhs) + { + set(rhs); + return *this; + } + + PUGI__FN xml_text& xml_text::operator=(int rhs) + { + set(rhs); + return *this; + } + + PUGI__FN xml_text& xml_text::operator=(unsigned int rhs) + { + set(rhs); + return *this; + } + + PUGI__FN xml_text& xml_text::operator=(double rhs) + { + set(rhs); + return *this; + } + + PUGI__FN xml_text& xml_text::operator=(float rhs) + { + set(rhs); + return *this; + } + + PUGI__FN xml_text& xml_text::operator=(bool rhs) + { + set(rhs); + return *this; + } + +#ifdef PUGIXML_HAS_LONG_LONG + PUGI__FN xml_text& xml_text::operator=(long long rhs) + { + set(rhs); + return *this; + } + + PUGI__FN xml_text& xml_text::operator=(unsigned long long rhs) + { + set(rhs); + return *this; + } +#endif + + PUGI__FN xml_node xml_text::data() const + { + return xml_node(_data()); + } + +#ifdef __BORLANDC__ + PUGI__FN bool operator&&(const xml_text& lhs, bool rhs) + { + return (bool)lhs && rhs; + } + + PUGI__FN bool operator||(const xml_text& lhs, bool rhs) + { + return (bool)lhs || rhs; + } +#endif + + PUGI__FN xml_node_iterator::xml_node_iterator() + { + } + + PUGI__FN xml_node_iterator::xml_node_iterator(const xml_node& node): _wrap(node), _parent(node.parent()) + { + } + + PUGI__FN xml_node_iterator::xml_node_iterator(xml_node_struct* ref, xml_node_struct* parent): _wrap(ref), _parent(parent) + { + } + + PUGI__FN bool 
xml_node_iterator::operator==(const xml_node_iterator& rhs) const + { + return _wrap._root == rhs._wrap._root && _parent._root == rhs._parent._root; + } + + PUGI__FN bool xml_node_iterator::operator!=(const xml_node_iterator& rhs) const + { + return _wrap._root != rhs._wrap._root || _parent._root != rhs._parent._root; + } + + PUGI__FN xml_node& xml_node_iterator::operator*() const + { + assert(_wrap._root); + return _wrap; + } + + PUGI__FN xml_node* xml_node_iterator::operator->() const + { + assert(_wrap._root); + return const_cast(&_wrap); // BCC32 workaround + } + + PUGI__FN const xml_node_iterator& xml_node_iterator::operator++() + { + assert(_wrap._root); + _wrap._root = _wrap._root->next_sibling; + return *this; + } + + PUGI__FN xml_node_iterator xml_node_iterator::operator++(int) + { + xml_node_iterator temp = *this; + ++*this; + return temp; + } + + PUGI__FN const xml_node_iterator& xml_node_iterator::operator--() + { + _wrap = _wrap._root ? _wrap.previous_sibling() : _parent.last_child(); + return *this; + } + + PUGI__FN xml_node_iterator xml_node_iterator::operator--(int) + { + xml_node_iterator temp = *this; + --*this; + return temp; + } + + PUGI__FN xml_attribute_iterator::xml_attribute_iterator() + { + } + + PUGI__FN xml_attribute_iterator::xml_attribute_iterator(const xml_attribute& attr, const xml_node& parent): _wrap(attr), _parent(parent) + { + } + + PUGI__FN xml_attribute_iterator::xml_attribute_iterator(xml_attribute_struct* ref, xml_node_struct* parent): _wrap(ref), _parent(parent) + { + } + + PUGI__FN bool xml_attribute_iterator::operator==(const xml_attribute_iterator& rhs) const + { + return _wrap._attr == rhs._wrap._attr && _parent._root == rhs._parent._root; + } + + PUGI__FN bool xml_attribute_iterator::operator!=(const xml_attribute_iterator& rhs) const + { + return _wrap._attr != rhs._wrap._attr || _parent._root != rhs._parent._root; + } + + PUGI__FN xml_attribute& xml_attribute_iterator::operator*() const + { + assert(_wrap._attr); + 
return _wrap; + } + + PUGI__FN xml_attribute* xml_attribute_iterator::operator->() const + { + assert(_wrap._attr); + return const_cast(&_wrap); // BCC32 workaround + } + + PUGI__FN const xml_attribute_iterator& xml_attribute_iterator::operator++() + { + assert(_wrap._attr); + _wrap._attr = _wrap._attr->next_attribute; + return *this; + } + + PUGI__FN xml_attribute_iterator xml_attribute_iterator::operator++(int) + { + xml_attribute_iterator temp = *this; + ++*this; + return temp; + } + + PUGI__FN const xml_attribute_iterator& xml_attribute_iterator::operator--() + { + _wrap = _wrap._attr ? _wrap.previous_attribute() : _parent.last_attribute(); + return *this; + } + + PUGI__FN xml_attribute_iterator xml_attribute_iterator::operator--(int) + { + xml_attribute_iterator temp = *this; + --*this; + return temp; + } + + PUGI__FN xml_named_node_iterator::xml_named_node_iterator(): _name(0) + { + } + + PUGI__FN xml_named_node_iterator::xml_named_node_iterator(const xml_node& node, const char_t* name): _wrap(node), _parent(node.parent()), _name(name) + { + } + + PUGI__FN xml_named_node_iterator::xml_named_node_iterator(xml_node_struct* ref, xml_node_struct* parent, const char_t* name): _wrap(ref), _parent(parent), _name(name) + { + } + + PUGI__FN bool xml_named_node_iterator::operator==(const xml_named_node_iterator& rhs) const + { + return _wrap._root == rhs._wrap._root && _parent._root == rhs._parent._root; + } + + PUGI__FN bool xml_named_node_iterator::operator!=(const xml_named_node_iterator& rhs) const + { + return _wrap._root != rhs._wrap._root || _parent._root != rhs._parent._root; + } + + PUGI__FN xml_node& xml_named_node_iterator::operator*() const + { + assert(_wrap._root); + return _wrap; + } + + PUGI__FN xml_node* xml_named_node_iterator::operator->() const + { + assert(_wrap._root); + return const_cast(&_wrap); // BCC32 workaround + } + + PUGI__FN const xml_named_node_iterator& xml_named_node_iterator::operator++() + { + assert(_wrap._root); + _wrap = 
_wrap.next_sibling(_name); + return *this; + } + + PUGI__FN xml_named_node_iterator xml_named_node_iterator::operator++(int) + { + xml_named_node_iterator temp = *this; + ++*this; + return temp; + } + + PUGI__FN const xml_named_node_iterator& xml_named_node_iterator::operator--() + { + if (_wrap._root) + _wrap = _wrap.previous_sibling(_name); + else + { + _wrap = _parent.last_child(); + + if (!impl::strequal(_wrap.name(), _name)) + _wrap = _wrap.previous_sibling(_name); + } + + return *this; + } + + PUGI__FN xml_named_node_iterator xml_named_node_iterator::operator--(int) + { + xml_named_node_iterator temp = *this; + --*this; + return temp; + } + + PUGI__FN xml_parse_result::xml_parse_result(): status(status_internal_error), offset(0), encoding(encoding_auto) + { + } + + PUGI__FN xml_parse_result::operator bool() const + { + return status == status_ok; + } + + PUGI__FN const char* xml_parse_result::description() const + { + switch (status) + { + case status_ok: return "No error"; + + case status_file_not_found: return "File was not found"; + case status_io_error: return "Error reading from file/stream"; + case status_out_of_memory: return "Could not allocate memory"; + case status_internal_error: return "Internal error occurred"; + + case status_unrecognized_tag: return "Could not determine tag type"; + + case status_bad_pi: return "Error parsing document declaration/processing instruction"; + case status_bad_comment: return "Error parsing comment"; + case status_bad_cdata: return "Error parsing CDATA section"; + case status_bad_doctype: return "Error parsing document type declaration"; + case status_bad_pcdata: return "Error parsing PCDATA section"; + case status_bad_start_element: return "Error parsing start element tag"; + case status_bad_attribute: return "Error parsing element attribute"; + case status_bad_end_element: return "Error parsing end element tag"; + case status_end_element_mismatch: return "Start-end tags mismatch"; + + case 
status_append_invalid_root: return "Unable to append nodes: root is not an element or document"; + + case status_no_document_element: return "No document element found"; + + default: return "Unknown error"; + } + } + + PUGI__FN xml_document::xml_document(): _buffer(0) + { + create(); + } + + PUGI__FN xml_document::~xml_document() + { + destroy(); + } + + PUGI__FN void xml_document::reset() + { + destroy(); + create(); + } + + PUGI__FN void xml_document::reset(const xml_document& proto) + { + reset(); + + for (xml_node cur = proto.first_child(); cur; cur = cur.next_sibling()) + append_copy(cur); + } + + PUGI__FN void xml_document::create() + { + assert(!_root); + + // initialize sentinel page + PUGI__STATIC_ASSERT(sizeof(impl::xml_memory_page) + sizeof(impl::xml_document_struct) + impl::xml_memory_page_alignment - sizeof(void*) <= sizeof(_memory)); + + // align upwards to page boundary + void* page_memory = reinterpret_cast((reinterpret_cast(_memory) + (impl::xml_memory_page_alignment - 1)) & ~(impl::xml_memory_page_alignment - 1)); + + // prepare page structure + impl::xml_memory_page* page = impl::xml_memory_page::construct(page_memory); + assert(page); + + page->busy_size = impl::xml_memory_page_size; + + // allocate new root + _root = new (reinterpret_cast(page) + sizeof(impl::xml_memory_page)) impl::xml_document_struct(page); + _root->prev_sibling_c = _root; + + // setup sentinel page + page->allocator = static_cast(_root); + + // verify the document allocation + assert(reinterpret_cast(_root) + sizeof(impl::xml_document_struct) <= _memory + sizeof(_memory)); + } + + PUGI__FN void xml_document::destroy() + { + assert(_root); + + // destroy static storage + if (_buffer) + { + impl::xml_memory::deallocate(_buffer); + _buffer = 0; + } + + // destroy extra buffers (note: no need to destroy linked list nodes, they're allocated using document allocator) + for (impl::xml_extra_buffer* extra = static_cast(_root)->extra_buffers; extra; extra = extra->next) + { + if 
(extra->buffer) impl::xml_memory::deallocate(extra->buffer); + } + + // destroy dynamic storage, leave sentinel page (it's in static memory) + impl::xml_memory_page* root_page = reinterpret_cast(_root->header & impl::xml_memory_page_pointer_mask); + assert(root_page && !root_page->prev); + assert(reinterpret_cast(root_page) >= _memory && reinterpret_cast(root_page) < _memory + sizeof(_memory)); + + for (impl::xml_memory_page* page = root_page->next; page; ) + { + impl::xml_memory_page* next = page->next; + + impl::xml_allocator::deallocate_page(page); + + page = next; + } + + _root = 0; + } + +#ifndef PUGIXML_NO_STL + PUGI__FN xml_parse_result xml_document::load(std::basic_istream >& stream, unsigned int options, xml_encoding encoding) + { + reset(); + + return impl::load_stream_impl(*this, stream, options, encoding); + } + + PUGI__FN xml_parse_result xml_document::load(std::basic_istream >& stream, unsigned int options) + { + reset(); + + return impl::load_stream_impl(*this, stream, options, encoding_wchar); + } +#endif + + PUGI__FN xml_parse_result xml_document::load_string(const char_t* contents, unsigned int options) + { + // Force native encoding (skip autodetection) + #ifdef PUGIXML_WCHAR_MODE + xml_encoding encoding = encoding_wchar; + #else + xml_encoding encoding = encoding_utf8; + #endif + + return load_buffer(contents, impl::strlength(contents) * sizeof(char_t), options, encoding); + } + + PUGI__FN xml_parse_result xml_document::load(const char_t* contents, unsigned int options) + { + return load_string(contents, options); + } + + PUGI__FN xml_parse_result xml_document::load_file(const char* path_, unsigned int options, xml_encoding encoding) + { + reset(); + + FILE* file = fopen(path_, "rb"); + + return impl::load_file_impl(*this, file, options, encoding); + } + + PUGI__FN xml_parse_result xml_document::load_file(const wchar_t* path_, unsigned int options, xml_encoding encoding) + { + reset(); + + FILE* file = impl::open_file_wide(path_, L"rb"); + + 
return impl::load_file_impl(*this, file, options, encoding); + } + + PUGI__FN xml_parse_result xml_document::load_buffer(const void* contents, size_t size, unsigned int options, xml_encoding encoding) + { + reset(); + + return impl::load_buffer_impl(static_cast(_root), _root, const_cast(contents), size, options, encoding, false, false, &_buffer); + } + + PUGI__FN xml_parse_result xml_document::load_buffer_inplace(void* contents, size_t size, unsigned int options, xml_encoding encoding) + { + reset(); + + return impl::load_buffer_impl(static_cast(_root), _root, contents, size, options, encoding, true, false, &_buffer); + } + + PUGI__FN xml_parse_result xml_document::load_buffer_inplace_own(void* contents, size_t size, unsigned int options, xml_encoding encoding) + { + reset(); + + return impl::load_buffer_impl(static_cast(_root), _root, contents, size, options, encoding, true, true, &_buffer); + } + + PUGI__FN void xml_document::save(xml_writer& writer, const char_t* indent, unsigned int flags, xml_encoding encoding) const + { + impl::xml_buffered_writer buffered_writer(writer, encoding); + + if ((flags & format_write_bom) && encoding != encoding_latin1) + { + // BOM always represents the codepoint U+FEFF, so just write it in native encoding + #ifdef PUGIXML_WCHAR_MODE + unsigned int bom = 0xfeff; + buffered_writer.write(static_cast(bom)); + #else + buffered_writer.write('\xef', '\xbb', '\xbf'); + #endif + } + + if (!(flags & format_no_declaration) && !impl::has_declaration(_root)) + { + buffered_writer.write_string(PUGIXML_TEXT("'); + if (!(flags & format_raw)) buffered_writer.write('\n'); + } + + impl::node_output(buffered_writer, _root, indent, flags, 0); + } + +#ifndef PUGIXML_NO_STL + PUGI__FN void xml_document::save(std::basic_ostream >& stream, const char_t* indent, unsigned int flags, xml_encoding encoding) const + { + xml_writer_stream writer(stream); + + save(writer, indent, flags, encoding); + } + + PUGI__FN void xml_document::save(std::basic_ostream >& 
stream, const char_t* indent, unsigned int flags) const + { + xml_writer_stream writer(stream); + + save(writer, indent, flags, encoding_wchar); + } +#endif + + PUGI__FN bool xml_document::save_file(const char* path_, const char_t* indent, unsigned int flags, xml_encoding encoding) const + { + FILE* file = fopen(path_, (flags & format_save_file_text) ? "w" : "wb"); + return impl::save_file_impl(*this, file, indent, flags, encoding); + } + + PUGI__FN bool xml_document::save_file(const wchar_t* path_, const char_t* indent, unsigned int flags, xml_encoding encoding) const + { + FILE* file = impl::open_file_wide(path_, (flags & format_save_file_text) ? L"w" : L"wb"); + return impl::save_file_impl(*this, file, indent, flags, encoding); + } + + PUGI__FN xml_node xml_document::document_element() const + { + assert(_root); + + for (xml_node_struct* i = _root->first_child; i; i = i->next_sibling) + if (PUGI__NODETYPE(i) == node_element) + return xml_node(i); + + return xml_node(); + } + +#ifndef PUGIXML_NO_STL + PUGI__FN std::string PUGIXML_FUNCTION as_utf8(const wchar_t* str) + { + assert(str); + + return impl::as_utf8_impl(str, impl::strlength_wide(str)); + } + + PUGI__FN std::string PUGIXML_FUNCTION as_utf8(const std::basic_string& str) + { + return impl::as_utf8_impl(str.c_str(), str.size()); + } + + PUGI__FN std::basic_string PUGIXML_FUNCTION as_wide(const char* str) + { + assert(str); + + return impl::as_wide_impl(str, strlen(str)); + } + + PUGI__FN std::basic_string PUGIXML_FUNCTION as_wide(const std::string& str) + { + return impl::as_wide_impl(str.c_str(), str.size()); + } +#endif + + PUGI__FN void PUGIXML_FUNCTION set_memory_management_functions(allocation_function allocate, deallocation_function deallocate) + { + impl::xml_memory::allocate = allocate; + impl::xml_memory::deallocate = deallocate; + } + + PUGI__FN allocation_function PUGIXML_FUNCTION get_memory_allocation_function() + { + return impl::xml_memory::allocate; + } + + PUGI__FN deallocation_function 
PUGIXML_FUNCTION get_memory_deallocation_function() + { + return impl::xml_memory::deallocate; + } +} + +#if !defined(PUGIXML_NO_STL) && (defined(_MSC_VER) || defined(__ICC)) +namespace std +{ + // Workarounds for (non-standard) iterator category detection for older versions (MSVC7/IC8 and earlier) + PUGI__FN std::bidirectional_iterator_tag _Iter_cat(const pugi::xml_node_iterator&) + { + return std::bidirectional_iterator_tag(); + } + + PUGI__FN std::bidirectional_iterator_tag _Iter_cat(const pugi::xml_attribute_iterator&) + { + return std::bidirectional_iterator_tag(); + } + + PUGI__FN std::bidirectional_iterator_tag _Iter_cat(const pugi::xml_named_node_iterator&) + { + return std::bidirectional_iterator_tag(); + } +} +#endif + +#if !defined(PUGIXML_NO_STL) && defined(__SUNPRO_CC) +namespace std +{ + // Workarounds for (non-standard) iterator category detection + PUGI__FN std::bidirectional_iterator_tag __iterator_category(const pugi::xml_node_iterator&) + { + return std::bidirectional_iterator_tag(); + } + + PUGI__FN std::bidirectional_iterator_tag __iterator_category(const pugi::xml_attribute_iterator&) + { + return std::bidirectional_iterator_tag(); + } + + PUGI__FN std::bidirectional_iterator_tag __iterator_category(const pugi::xml_named_node_iterator&) + { + return std::bidirectional_iterator_tag(); + } +} +#endif + +#ifndef PUGIXML_NO_XPATH +// STL replacements +PUGI__NS_BEGIN + struct equal_to + { + template bool operator()(const T& lhs, const T& rhs) const + { + return lhs == rhs; + } + }; + + struct not_equal_to + { + template bool operator()(const T& lhs, const T& rhs) const + { + return lhs != rhs; + } + }; + + struct less + { + template bool operator()(const T& lhs, const T& rhs) const + { + return lhs < rhs; + } + }; + + struct less_equal + { + template bool operator()(const T& lhs, const T& rhs) const + { + return lhs <= rhs; + } + }; + + template void swap(T& lhs, T& rhs) + { + T temp = lhs; + lhs = rhs; + rhs = temp; + } + + template I 
min_element(I begin, I end, const Pred& pred) + { + I result = begin; + + for (I it = begin + 1; it != end; ++it) + if (pred(*it, *result)) + result = it; + + return result; + } + + template void reverse(I begin, I end) + { + while (end - begin > 1) swap(*begin++, *--end); + } + + template I unique(I begin, I end) + { + // fast skip head + while (end - begin > 1 && *begin != *(begin + 1)) begin++; + + if (begin == end) return begin; + + // last written element + I write = begin++; + + // merge unique elements + while (begin != end) + { + if (*begin != *write) + *++write = *begin++; + else + begin++; + } + + // past-the-end (write points to live element) + return write + 1; + } + + template void copy_backwards(I begin, I end, I target) + { + while (begin != end) *--target = *--end; + } + + template void insertion_sort(I begin, I end, const Pred& pred, T*) + { + assert(begin != end); + + for (I it = begin + 1; it != end; ++it) + { + T val = *it; + + if (pred(val, *begin)) + { + // move to front + copy_backwards(begin, it, it + 1); + *begin = val; + } + else + { + I hole = it; + + // move hole backwards + while (pred(val, *(hole - 1))) + { + *hole = *(hole - 1); + hole--; + } + + // fill hole with element + *hole = val; + } + } + } + + // std variant for elements with == + template void partition(I begin, I middle, I end, const Pred& pred, I* out_eqbeg, I* out_eqend) + { + I eqbeg = middle, eqend = middle + 1; + + // expand equal range + while (eqbeg != begin && *(eqbeg - 1) == *eqbeg) --eqbeg; + while (eqend != end && *eqend == *eqbeg) ++eqend; + + // process outer elements + I ltend = eqbeg, gtbeg = eqend; + + for (;;) + { + // find the element from the right side that belongs to the left one + for (; gtbeg != end; ++gtbeg) + if (!pred(*eqbeg, *gtbeg)) + { + if (*gtbeg == *eqbeg) swap(*gtbeg, *eqend++); + else break; + } + + // find the element from the left side that belongs to the right one + for (; ltend != begin; --ltend) + if (!pred(*(ltend - 1), *eqbeg)) + { + 
if (*eqbeg == *(ltend - 1)) swap(*(ltend - 1), *--eqbeg); + else break; + } + + // scanned all elements + if (gtbeg == end && ltend == begin) + { + *out_eqbeg = eqbeg; + *out_eqend = eqend; + return; + } + + // make room for elements by moving equal area + if (gtbeg == end) + { + if (--ltend != --eqbeg) swap(*ltend, *eqbeg); + swap(*eqbeg, *--eqend); + } + else if (ltend == begin) + { + if (eqend != gtbeg) swap(*eqbeg, *eqend); + ++eqend; + swap(*gtbeg++, *eqbeg++); + } + else swap(*gtbeg++, *--ltend); + } + } + + template void median3(I first, I middle, I last, const Pred& pred) + { + if (pred(*middle, *first)) swap(*middle, *first); + if (pred(*last, *middle)) swap(*last, *middle); + if (pred(*middle, *first)) swap(*middle, *first); + } + + template void median(I first, I middle, I last, const Pred& pred) + { + if (last - first <= 40) + { + // median of three for small chunks + median3(first, middle, last, pred); + } + else + { + // median of nine + size_t step = (last - first + 1) / 8; + + median3(first, first + step, first + 2 * step, pred); + median3(middle - step, middle, middle + step, pred); + median3(last - 2 * step, last - step, last, pred); + median3(first + step, middle, last - step, pred); + } + } + + template void sort(I begin, I end, const Pred& pred) + { + // sort large chunks + while (end - begin > 32) + { + // find median element + I middle = begin + (end - begin) / 2; + median(begin, middle, end - 1, pred); + + // partition in three chunks (< = >) + I eqbeg, eqend; + partition(begin, middle, end, pred, &eqbeg, &eqend); + + // loop on larger half + if (eqbeg - begin > end - eqend) + { + sort(eqend, end, pred); + end = eqbeg; + } + else + { + sort(begin, eqbeg, pred); + begin = eqend; + } + } + + // insertion sort small chunk + if (begin != end) insertion_sort(begin, end, pred, &*begin); + } +PUGI__NS_END + +// Allocator used for AST and evaluation stacks +PUGI__NS_BEGIN + struct xpath_memory_block + { + xpath_memory_block* next; + size_t capacity; 
+ + char data[ + #ifdef PUGIXML_MEMORY_XPATH_PAGE_SIZE + PUGIXML_MEMORY_XPATH_PAGE_SIZE + #else + 4096 + #endif + ]; + }; + + class xpath_allocator + { + xpath_memory_block* _root; + size_t _root_size; + + public: + #ifdef PUGIXML_NO_EXCEPTIONS + jmp_buf* error_handler; + #endif + + xpath_allocator(xpath_memory_block* root, size_t root_size = 0): _root(root), _root_size(root_size) + { + #ifdef PUGIXML_NO_EXCEPTIONS + error_handler = 0; + #endif + } + + void* allocate_nothrow(size_t size) + { + // align size so that we're able to store pointers in subsequent blocks + size = (size + sizeof(void*) - 1) & ~(sizeof(void*) - 1); + + if (_root_size + size <= _root->capacity) + { + void* buf = _root->data + _root_size; + _root_size += size; + return buf; + } + else + { + // make sure we have at least 1/4th of the page free after allocation to satisfy subsequent allocation requests + size_t block_capacity_base = sizeof(_root->data); + size_t block_capacity_req = size + block_capacity_base / 4; + size_t block_capacity = (block_capacity_base > block_capacity_req) ? 
block_capacity_base : block_capacity_req; + + size_t block_size = block_capacity + offsetof(xpath_memory_block, data); + + xpath_memory_block* block = static_cast(xml_memory::allocate(block_size)); + if (!block) return 0; + + block->next = _root; + block->capacity = block_capacity; + + _root = block; + _root_size = size; + + return block->data; + } + } + + void* allocate(size_t size) + { + void* result = allocate_nothrow(size); + + if (!result) + { + #ifdef PUGIXML_NO_EXCEPTIONS + assert(error_handler); + longjmp(*error_handler, 1); + #else + throw std::bad_alloc(); + #endif + } + + return result; + } + + void* reallocate(void* ptr, size_t old_size, size_t new_size) + { + // align size so that we're able to store pointers in subsequent blocks + old_size = (old_size + sizeof(void*) - 1) & ~(sizeof(void*) - 1); + new_size = (new_size + sizeof(void*) - 1) & ~(sizeof(void*) - 1); + + // we can only reallocate the last object + assert(ptr == 0 || static_cast(ptr) + old_size == _root->data + _root_size); + + // adjust root size so that we have not allocated the object at all + bool only_object = (_root_size == old_size); + + if (ptr) _root_size -= old_size; + + // allocate a new version (this will obviously reuse the memory if possible) + void* result = allocate(new_size); + assert(result); + + // we have a new block + if (result != ptr && ptr) + { + // copy old data + assert(new_size >= old_size); + memcpy(result, ptr, old_size); + + // free the previous page if it had no other objects + if (only_object) + { + assert(_root->data == result); + assert(_root->next); + + xpath_memory_block* next = _root->next->next; + + if (next) + { + // deallocate the whole page, unless it was the first one + xml_memory::deallocate(_root->next); + _root->next = next; + } + } + } + + return result; + } + + void revert(const xpath_allocator& state) + { + // free all new pages + xpath_memory_block* cur = _root; + + while (cur != state._root) + { + xpath_memory_block* next = cur->next; + + 
xml_memory::deallocate(cur); + + cur = next; + } + + // restore state + _root = state._root; + _root_size = state._root_size; + } + + void release() + { + xpath_memory_block* cur = _root; + assert(cur); + + while (cur->next) + { + xpath_memory_block* next = cur->next; + + xml_memory::deallocate(cur); + + cur = next; + } + } + }; + + struct xpath_allocator_capture + { + xpath_allocator_capture(xpath_allocator* alloc): _target(alloc), _state(*alloc) + { + } + + ~xpath_allocator_capture() + { + _target->revert(_state); + } + + xpath_allocator* _target; + xpath_allocator _state; + }; + + struct xpath_stack + { + xpath_allocator* result; + xpath_allocator* temp; + }; + + struct xpath_stack_data + { + xpath_memory_block blocks[2]; + xpath_allocator result; + xpath_allocator temp; + xpath_stack stack; + + #ifdef PUGIXML_NO_EXCEPTIONS + jmp_buf error_handler; + #endif + + xpath_stack_data(): result(blocks + 0), temp(blocks + 1) + { + blocks[0].next = blocks[1].next = 0; + blocks[0].capacity = blocks[1].capacity = sizeof(blocks[0].data); + + stack.result = &result; + stack.temp = &temp; + + #ifdef PUGIXML_NO_EXCEPTIONS + result.error_handler = temp.error_handler = &error_handler; + #endif + } + + ~xpath_stack_data() + { + result.release(); + temp.release(); + } + }; +PUGI__NS_END + +// String class +PUGI__NS_BEGIN + class xpath_string + { + const char_t* _buffer; + bool _uses_heap; + size_t _length_heap; + + static char_t* duplicate_string(const char_t* string, size_t length, xpath_allocator* alloc) + { + char_t* result = static_cast(alloc->allocate((length + 1) * sizeof(char_t))); + assert(result); + + memcpy(result, string, length * sizeof(char_t)); + result[length] = 0; + + return result; + } + + xpath_string(const char_t* buffer, bool uses_heap_, size_t length_heap): _buffer(buffer), _uses_heap(uses_heap_), _length_heap(length_heap) + { + } + + public: + static xpath_string from_const(const char_t* str) + { + return xpath_string(str, false, 0); + } + + static 
xpath_string from_heap_preallocated(const char_t* begin, const char_t* end) + { + assert(begin <= end && *end == 0); + + return xpath_string(begin, true, static_cast(end - begin)); + } + + static xpath_string from_heap(const char_t* begin, const char_t* end, xpath_allocator* alloc) + { + assert(begin <= end); + + size_t length = static_cast(end - begin); + + return length == 0 ? xpath_string() : xpath_string(duplicate_string(begin, length, alloc), true, length); + } + + xpath_string(): _buffer(PUGIXML_TEXT("")), _uses_heap(false), _length_heap(0) + { + } + + void append(const xpath_string& o, xpath_allocator* alloc) + { + // skip empty sources + if (!*o._buffer) return; + + // fast append for constant empty target and constant source + if (!*_buffer && !_uses_heap && !o._uses_heap) + { + _buffer = o._buffer; + } + else + { + // need to make heap copy + size_t target_length = length(); + size_t source_length = o.length(); + size_t result_length = target_length + source_length; + + // allocate new buffer + char_t* result = static_cast(alloc->reallocate(_uses_heap ? const_cast(_buffer) : 0, (target_length + 1) * sizeof(char_t), (result_length + 1) * sizeof(char_t))); + assert(result); + + // append first string to the new buffer in case there was no reallocation + if (!_uses_heap) memcpy(result, _buffer, target_length * sizeof(char_t)); + + // append second string to the new buffer + memcpy(result + target_length, o._buffer, source_length * sizeof(char_t)); + result[result_length] = 0; + + // finalize + _buffer = result; + _uses_heap = true; + _length_heap = result_length; + } + } + + const char_t* c_str() const + { + return _buffer; + } + + size_t length() const + { + return _uses_heap ? 
_length_heap : strlength(_buffer); + } + + char_t* data(xpath_allocator* alloc) + { + // make private heap copy + if (!_uses_heap) + { + size_t length_ = strlength(_buffer); + + _buffer = duplicate_string(_buffer, length_, alloc); + _uses_heap = true; + _length_heap = length_; + } + + return const_cast(_buffer); + } + + bool empty() const + { + return *_buffer == 0; + } + + bool operator==(const xpath_string& o) const + { + return strequal(_buffer, o._buffer); + } + + bool operator!=(const xpath_string& o) const + { + return !strequal(_buffer, o._buffer); + } + + bool uses_heap() const + { + return _uses_heap; + } + }; +PUGI__NS_END + +PUGI__NS_BEGIN + PUGI__FN bool starts_with(const char_t* string, const char_t* pattern) + { + while (*pattern && *string == *pattern) + { + string++; + pattern++; + } + + return *pattern == 0; + } + + PUGI__FN const char_t* find_char(const char_t* s, char_t c) + { + #ifdef PUGIXML_WCHAR_MODE + return wcschr(s, c); + #else + return strchr(s, c); + #endif + } + + PUGI__FN const char_t* find_substring(const char_t* s, const char_t* p) + { + #ifdef PUGIXML_WCHAR_MODE + // MSVC6 wcsstr bug workaround (if s is empty it always returns 0) + return (*p == 0) ? s : wcsstr(s, p); + #else + return strstr(s, p); + #endif + } + + // Converts symbol to lower case, if it is an ASCII one + PUGI__FN char_t tolower_ascii(char_t ch) + { + return static_cast(ch - 'A') < 26 ? 
static_cast(ch | ' ') : ch; + } + + PUGI__FN xpath_string string_value(const xpath_node& na, xpath_allocator* alloc) + { + if (na.attribute()) + return xpath_string::from_const(na.attribute().value()); + else + { + xml_node n = na.node(); + + switch (n.type()) + { + case node_pcdata: + case node_cdata: + case node_comment: + case node_pi: + return xpath_string::from_const(n.value()); + + case node_document: + case node_element: + { + xpath_string result; + + xml_node cur = n.first_child(); + + while (cur && cur != n) + { + if (cur.type() == node_pcdata || cur.type() == node_cdata) + result.append(xpath_string::from_const(cur.value()), alloc); + + if (cur.first_child()) + cur = cur.first_child(); + else if (cur.next_sibling()) + cur = cur.next_sibling(); + else + { + while (!cur.next_sibling() && cur != n) + cur = cur.parent(); + + if (cur != n) cur = cur.next_sibling(); + } + } + + return result; + } + + default: + return xpath_string(); + } + } + } + + PUGI__FN bool node_is_before_sibling(xml_node_struct* ln, xml_node_struct* rn) + { + assert(ln->parent == rn->parent); + + // there is no common ancestor (the shared parent is null), nodes are from different documents + if (!ln->parent) return ln < rn; + + // determine sibling order + xml_node_struct* ls = ln; + xml_node_struct* rs = rn; + + while (ls && rs) + { + if (ls == rn) return true; + if (rs == ln) return false; + + ls = ls->next_sibling; + rs = rs->next_sibling; + } + + // if rn sibling chain ended ln must be before rn + return !rs; + } + + PUGI__FN bool node_is_before(xml_node_struct* ln, xml_node_struct* rn) + { + // find common ancestor at the same depth, if any + xml_node_struct* lp = ln; + xml_node_struct* rp = rn; + + while (lp && rp && lp->parent != rp->parent) + { + lp = lp->parent; + rp = rp->parent; + } + + // parents are the same! 
+ if (lp && rp) return node_is_before_sibling(lp, rp); + + // nodes are at different depths, need to normalize heights + bool left_higher = !lp; + + while (lp) + { + lp = lp->parent; + ln = ln->parent; + } + + while (rp) + { + rp = rp->parent; + rn = rn->parent; + } + + // one node is the ancestor of the other + if (ln == rn) return left_higher; + + // find common ancestor... again + while (ln->parent != rn->parent) + { + ln = ln->parent; + rn = rn->parent; + } + + return node_is_before_sibling(ln, rn); + } + + PUGI__FN bool node_is_ancestor(xml_node_struct* parent, xml_node_struct* node) + { + while (node && node != parent) node = node->parent; + + return parent && node == parent; + } + + PUGI__FN const void* document_buffer_order(const xpath_node& xnode) + { + xml_node_struct* node = xnode.node().internal_object(); + + if (node) + { + if ((get_document(node).header & xml_memory_page_contents_shared_mask) == 0) + { + if (node->name && (node->header & impl::xml_memory_page_name_allocated_or_shared_mask) == 0) return node->name; + if (node->value && (node->header & impl::xml_memory_page_value_allocated_or_shared_mask) == 0) return node->value; + } + + return 0; + } + + xml_attribute_struct* attr = xnode.attribute().internal_object(); + + if (attr) + { + if ((get_document(attr).header & xml_memory_page_contents_shared_mask) == 0) + { + if ((attr->header & impl::xml_memory_page_name_allocated_or_shared_mask) == 0) return attr->name; + if ((attr->header & impl::xml_memory_page_value_allocated_or_shared_mask) == 0) return attr->value; + } + + return 0; + } + + return 0; + } + + struct document_order_comparator + { + bool operator()(const xpath_node& lhs, const xpath_node& rhs) const + { + // optimized document order based check + const void* lo = document_buffer_order(lhs); + const void* ro = document_buffer_order(rhs); + + if (lo && ro) return lo < ro; + + // slow comparison + xml_node ln = lhs.node(), rn = rhs.node(); + + // compare attributes + if (lhs.attribute() && 
rhs.attribute()) + { + // shared parent + if (lhs.parent() == rhs.parent()) + { + // determine sibling order + for (xml_attribute a = lhs.attribute(); a; a = a.next_attribute()) + if (a == rhs.attribute()) + return true; + + return false; + } + + // compare attribute parents + ln = lhs.parent(); + rn = rhs.parent(); + } + else if (lhs.attribute()) + { + // attributes go after the parent element + if (lhs.parent() == rhs.node()) return false; + + ln = lhs.parent(); + } + else if (rhs.attribute()) + { + // attributes go after the parent element + if (rhs.parent() == lhs.node()) return true; + + rn = rhs.parent(); + } + + if (ln == rn) return false; + + if (!ln || !rn) return ln < rn; + + return node_is_before(ln.internal_object(), rn.internal_object()); + } + }; + + struct duplicate_comparator + { + bool operator()(const xpath_node& lhs, const xpath_node& rhs) const + { + if (lhs.attribute()) return rhs.attribute() ? lhs.attribute() < rhs.attribute() : true; + else return rhs.attribute() ? false : lhs.node() < rhs.node(); + } + }; + + PUGI__FN double gen_nan() + { + #if defined(__STDC_IEC_559__) || ((FLT_RADIX - 0 == 2) && (FLT_MAX_EXP - 0 == 128) && (FLT_MANT_DIG - 0 == 24)) + union { float f; uint32_t i; } u[sizeof(float) == sizeof(uint32_t) ? 1 : -1]; + u[0].i = 0x7fc00000; + return u[0].f; + #else + // fallback + const volatile double zero = 0.0; + return zero / zero; + #endif + } + + PUGI__FN bool is_nan(double value) + { + #if defined(PUGI__MSVC_CRT_VERSION) || defined(__BORLANDC__) + return !!_isnan(value); + #elif defined(fpclassify) && defined(FP_NAN) + return fpclassify(value) == FP_NAN; + #else + // fallback + const volatile double v = value; + return v != v; + #endif + } + + PUGI__FN const char_t* convert_number_to_string_special(double value) + { + #if defined(PUGI__MSVC_CRT_VERSION) || defined(__BORLANDC__) + if (_finite(value)) return (value == 0) ? PUGIXML_TEXT("0") : 0; + if (_isnan(value)) return PUGIXML_TEXT("NaN"); + return value > 0 ? 
PUGIXML_TEXT("Infinity") : PUGIXML_TEXT("-Infinity"); + #elif defined(fpclassify) && defined(FP_NAN) && defined(FP_INFINITE) && defined(FP_ZERO) + switch (fpclassify(value)) + { + case FP_NAN: + return PUGIXML_TEXT("NaN"); + + case FP_INFINITE: + return value > 0 ? PUGIXML_TEXT("Infinity") : PUGIXML_TEXT("-Infinity"); + + case FP_ZERO: + return PUGIXML_TEXT("0"); + + default: + return 0; + } + #else + // fallback + const volatile double v = value; + + if (v == 0) return PUGIXML_TEXT("0"); + if (v != v) return PUGIXML_TEXT("NaN"); + if (v * 2 == v) return value > 0 ? PUGIXML_TEXT("Infinity") : PUGIXML_TEXT("-Infinity"); + return 0; + #endif + } + + PUGI__FN bool convert_number_to_boolean(double value) + { + return (value != 0 && !is_nan(value)); + } + + PUGI__FN void truncate_zeros(char* begin, char* end) + { + while (begin != end && end[-1] == '0') end--; + + *end = 0; + } + + // gets mantissa digits in the form of 0.xxxxx with 0. implied and the exponent +#if defined(PUGI__MSVC_CRT_VERSION) && PUGI__MSVC_CRT_VERSION >= 1400 && !defined(_WIN32_WCE) + PUGI__FN void convert_number_to_mantissa_exponent(double value, char* buffer, size_t buffer_size, char** out_mantissa, int* out_exponent) + { + // get base values + int sign, exponent; + _ecvt_s(buffer, buffer_size, value, DBL_DIG + 1, &exponent, &sign); + + // truncate redundant zeros + truncate_zeros(buffer, buffer + strlen(buffer)); + + // fill results + *out_mantissa = buffer; + *out_exponent = exponent; + } +#else + PUGI__FN void convert_number_to_mantissa_exponent(double value, char* buffer, size_t buffer_size, char** out_mantissa, int* out_exponent) + { + // get a scientific notation value with IEEE DBL_DIG decimals + sprintf(buffer, "%.*e", DBL_DIG, value); + assert(strlen(buffer) < buffer_size); + (void)!buffer_size; + + // get the exponent (possibly negative) + char* exponent_string = strchr(buffer, 'e'); + assert(exponent_string); + + int exponent = atoi(exponent_string + 1); + + // extract mantissa string: 
skip sign + char* mantissa = buffer[0] == '-' ? buffer + 1 : buffer; + assert(mantissa[0] != '0' && mantissa[1] == '.'); + + // divide mantissa by 10 to eliminate integer part + mantissa[1] = mantissa[0]; + mantissa++; + exponent++; + + // remove extra mantissa digits and zero-terminate mantissa + truncate_zeros(mantissa, exponent_string); + + // fill results + *out_mantissa = mantissa; + *out_exponent = exponent; + } +#endif + + PUGI__FN xpath_string convert_number_to_string(double value, xpath_allocator* alloc) + { + // try special number conversion + const char_t* special = convert_number_to_string_special(value); + if (special) return xpath_string::from_const(special); + + // get mantissa + exponent form + char mantissa_buffer[32]; + + char* mantissa; + int exponent; + convert_number_to_mantissa_exponent(value, mantissa_buffer, sizeof(mantissa_buffer), &mantissa, &exponent); + + // allocate a buffer of suitable length for the number + size_t result_size = strlen(mantissa_buffer) + (exponent > 0 ? exponent : -exponent) + 4; + char_t* result = static_cast(alloc->allocate(sizeof(char_t) * result_size)); + assert(result); + + // make the number! + char_t* s = result; + + // sign + if (value < 0) *s++ = '-'; + + // integer part + if (exponent <= 0) + { + *s++ = '0'; + } + else + { + while (exponent > 0) + { + assert(*mantissa == 0 || static_cast(static_cast(*mantissa) - '0') <= 9); + *s++ = *mantissa ? 
*mantissa++ : '0'; + exponent--; + } + } + + // fractional part + if (*mantissa) + { + // decimal point + *s++ = '.'; + + // extra zeroes from negative exponent + while (exponent < 0) + { + *s++ = '0'; + exponent++; + } + + // extra mantissa digits + while (*mantissa) + { + assert(static_cast(*mantissa - '0') <= 9); + *s++ = *mantissa++; + } + } + + // zero-terminate + assert(s < result + result_size); + *s = 0; + + return xpath_string::from_heap_preallocated(result, s); + } + + PUGI__FN bool check_string_to_number_format(const char_t* string) + { + // parse leading whitespace + while (PUGI__IS_CHARTYPE(*string, ct_space)) ++string; + + // parse sign + if (*string == '-') ++string; + + if (!*string) return false; + + // if there is no integer part, there should be a decimal part with at least one digit + if (!PUGI__IS_CHARTYPEX(string[0], ctx_digit) && (string[0] != '.' || !PUGI__IS_CHARTYPEX(string[1], ctx_digit))) return false; + + // parse integer part + while (PUGI__IS_CHARTYPEX(*string, ctx_digit)) ++string; + + // parse decimal part + if (*string == '.') + { + ++string; + + while (PUGI__IS_CHARTYPEX(*string, ctx_digit)) ++string; + } + + // parse trailing whitespace + while (PUGI__IS_CHARTYPE(*string, ct_space)) ++string; + + return *string == 0; + } + + PUGI__FN double convert_string_to_number(const char_t* string) + { + // check string format + if (!check_string_to_number_format(string)) return gen_nan(); + + // parse string + #ifdef PUGIXML_WCHAR_MODE + return wcstod(string, 0); + #else + return atof(string); + #endif + } + + PUGI__FN bool convert_string_to_number_scratch(char_t (&buffer)[32], const char_t* begin, const char_t* end, double* out_result) + { + size_t length = static_cast(end - begin); + char_t* scratch = buffer; + + if (length >= sizeof(buffer) / sizeof(buffer[0])) + { + // need to make dummy on-heap copy + scratch = static_cast(xml_memory::allocate((length + 1) * sizeof(char_t))); + if (!scratch) return false; + } + + // copy string to 
zero-terminated buffer and perform conversion + memcpy(scratch, begin, length * sizeof(char_t)); + scratch[length] = 0; + + *out_result = convert_string_to_number(scratch); + + // free dummy buffer + if (scratch != buffer) xml_memory::deallocate(scratch); + + return true; + } + + PUGI__FN double round_nearest(double value) + { + return floor(value + 0.5); + } + + PUGI__FN double round_nearest_nzero(double value) + { + // same as round_nearest, but returns -0 for [-0.5, -0] + // ceil is used to differentiate between +0 and -0 (we return -0 for [-0.5, -0] and +0 for +0) + return (value >= -0.5 && value <= 0) ? ceil(value) : floor(value + 0.5); + } + + PUGI__FN const char_t* qualified_name(const xpath_node& node) + { + return node.attribute() ? node.attribute().name() : node.node().name(); + } + + PUGI__FN const char_t* local_name(const xpath_node& node) + { + const char_t* name = qualified_name(node); + const char_t* p = find_char(name, ':'); + + return p ? p + 1 : name; + } + + struct namespace_uri_predicate + { + const char_t* prefix; + size_t prefix_length; + + namespace_uri_predicate(const char_t* name) + { + const char_t* pos = find_char(name, ':'); + + prefix = pos ? name : 0; + prefix_length = pos ? static_cast(pos - name) : 0; + } + + bool operator()(xml_attribute a) const + { + const char_t* name = a.name(); + + if (!starts_with(name, PUGIXML_TEXT("xmlns"))) return false; + + return prefix ? 
name[5] == ':' && strequalrange(name + 6, prefix, prefix_length) : name[5] == 0; + } + }; + + PUGI__FN const char_t* namespace_uri(xml_node node) + { + namespace_uri_predicate pred = node.name(); + + xml_node p = node; + + while (p) + { + xml_attribute a = p.find_attribute(pred); + + if (a) return a.value(); + + p = p.parent(); + } + + return PUGIXML_TEXT(""); + } + + PUGI__FN const char_t* namespace_uri(xml_attribute attr, xml_node parent) + { + namespace_uri_predicate pred = attr.name(); + + // Default namespace does not apply to attributes + if (!pred.prefix) return PUGIXML_TEXT(""); + + xml_node p = parent; + + while (p) + { + xml_attribute a = p.find_attribute(pred); + + if (a) return a.value(); + + p = p.parent(); + } + + return PUGIXML_TEXT(""); + } + + PUGI__FN const char_t* namespace_uri(const xpath_node& node) + { + return node.attribute() ? namespace_uri(node.attribute(), node.parent()) : namespace_uri(node.node()); + } + + PUGI__FN void normalize_space(char_t* buffer) + { + char_t* write = buffer; + + for (char_t* it = buffer; *it; ) + { + char_t ch = *it++; + + if (PUGI__IS_CHARTYPE(ch, ct_space)) + { + // replace whitespace sequence with single space + while (PUGI__IS_CHARTYPE(*it, ct_space)) it++; + + // avoid leading spaces + if (write != buffer) *write++ = ' '; + } + else *write++ = ch; + } + + // remove trailing space + if (write != buffer && PUGI__IS_CHARTYPE(write[-1], ct_space)) write--; + + // zero-terminate + *write = 0; + } + + PUGI__FN void translate(char_t* buffer, const char_t* from, const char_t* to, size_t to_length) + { + char_t* write = buffer; + + while (*buffer) + { + PUGI__DMC_VOLATILE char_t ch = *buffer++; + + const char_t* pos = find_char(from, ch); + + if (!pos) + *write++ = ch; // do not process + else if (static_cast(pos - from) < to_length) + *write++ = to[pos - from]; // replace + } + + // zero-terminate + *write = 0; + } + + PUGI__FN unsigned char* translate_table_generate(xpath_allocator* alloc, const char_t* from, const 
char_t* to) + { + unsigned char table[128] = {0}; + + while (*from) + { + unsigned int fc = static_cast(*from); + unsigned int tc = static_cast(*to); + + if (fc >= 128 || tc >= 128) + return 0; + + // code=128 means "skip character" + if (!table[fc]) + table[fc] = static_cast(tc ? tc : 128); + + from++; + if (tc) to++; + } + + for (int i = 0; i < 128; ++i) + if (!table[i]) + table[i] = static_cast(i); + + void* result = alloc->allocate_nothrow(sizeof(table)); + + if (result) + { + memcpy(result, table, sizeof(table)); + } + + return static_cast(result); + } + + PUGI__FN void translate_table(char_t* buffer, const unsigned char* table) + { + char_t* write = buffer; + + while (*buffer) + { + char_t ch = *buffer++; + unsigned int index = static_cast(ch); + + if (index < 128) + { + unsigned char code = table[index]; + + // code=128 means "skip character" (table size is 128 so 128 can be a special value) + // this code skips these characters without extra branches + *write = static_cast(code); + write += 1 - (code >> 7); + } + else + { + *write++ = ch; + } + } + + // zero-terminate + *write = 0; + } + + inline bool is_xpath_attribute(const char_t* name) + { + return !(starts_with(name, PUGIXML_TEXT("xmlns")) && (name[5] == 0 || name[5] == ':')); + } + + struct xpath_variable_boolean: xpath_variable + { + xpath_variable_boolean(): value(false) + { + } + + bool value; + char_t name[1]; + }; + + struct xpath_variable_number: xpath_variable + { + xpath_variable_number(): value(0) + { + } + + double value; + char_t name[1]; + }; + + struct xpath_variable_string: xpath_variable + { + xpath_variable_string(): value(0) + { + } + + ~xpath_variable_string() + { + if (value) xml_memory::deallocate(value); + } + + char_t* value; + char_t name[1]; + }; + + struct xpath_variable_node_set: xpath_variable + { + xpath_node_set value; + char_t name[1]; + }; + + static const xpath_node_set dummy_node_set; + + PUGI__FN unsigned int hash_string(const char_t* str) + { + // Jenkins 
one-at-a-time hash (http://en.wikipedia.org/wiki/Jenkins_hash_function#one-at-a-time) + unsigned int result = 0; + + while (*str) + { + result += static_cast(*str++); + result += result << 10; + result ^= result >> 6; + } + + result += result << 3; + result ^= result >> 11; + result += result << 15; + + return result; + } + + template PUGI__FN T* new_xpath_variable(const char_t* name) + { + size_t length = strlength(name); + if (length == 0) return 0; // empty variable names are invalid + + // $$ we can't use offsetof(T, name) because T is non-POD, so we just allocate additional length characters + void* memory = xml_memory::allocate(sizeof(T) + length * sizeof(char_t)); + if (!memory) return 0; + + T* result = new (memory) T(); + + memcpy(result->name, name, (length + 1) * sizeof(char_t)); + + return result; + } + + PUGI__FN xpath_variable* new_xpath_variable(xpath_value_type type, const char_t* name) + { + switch (type) + { + case xpath_type_node_set: + return new_xpath_variable(name); + + case xpath_type_number: + return new_xpath_variable(name); + + case xpath_type_string: + return new_xpath_variable(name); + + case xpath_type_boolean: + return new_xpath_variable(name); + + default: + return 0; + } + } + + template PUGI__FN void delete_xpath_variable(T* var) + { + var->~T(); + xml_memory::deallocate(var); + } + + PUGI__FN void delete_xpath_variable(xpath_value_type type, xpath_variable* var) + { + switch (type) + { + case xpath_type_node_set: + delete_xpath_variable(static_cast(var)); + break; + + case xpath_type_number: + delete_xpath_variable(static_cast(var)); + break; + + case xpath_type_string: + delete_xpath_variable(static_cast(var)); + break; + + case xpath_type_boolean: + delete_xpath_variable(static_cast(var)); + break; + + default: + assert(!"Invalid variable type"); + } + } + + PUGI__FN xpath_variable* get_variable_scratch(char_t (&buffer)[32], xpath_variable_set* set, const char_t* begin, const char_t* end) + { + size_t length = static_cast(end - 
begin); + char_t* scratch = buffer; + + if (length >= sizeof(buffer) / sizeof(buffer[0])) + { + // need to make dummy on-heap copy + scratch = static_cast(xml_memory::allocate((length + 1) * sizeof(char_t))); + if (!scratch) return 0; + } + + // copy string to zero-terminated buffer and perform lookup + memcpy(scratch, begin, length * sizeof(char_t)); + scratch[length] = 0; + + xpath_variable* result = set->get(scratch); + + // free dummy buffer + if (scratch != buffer) xml_memory::deallocate(scratch); + + return result; + } +PUGI__NS_END + +// Internal node set class +PUGI__NS_BEGIN + PUGI__FN xpath_node_set::type_t xpath_get_order(const xpath_node* begin, const xpath_node* end) + { + if (end - begin < 2) + return xpath_node_set::type_sorted; + + document_order_comparator cmp; + + bool first = cmp(begin[0], begin[1]); + + for (const xpath_node* it = begin + 1; it + 1 < end; ++it) + if (cmp(it[0], it[1]) != first) + return xpath_node_set::type_unsorted; + + return first ? xpath_node_set::type_sorted : xpath_node_set::type_sorted_reverse; + } + + PUGI__FN xpath_node_set::type_t xpath_sort(xpath_node* begin, xpath_node* end, xpath_node_set::type_t type, bool rev) + { + xpath_node_set::type_t order = rev ? 
xpath_node_set::type_sorted_reverse : xpath_node_set::type_sorted; + + if (type == xpath_node_set::type_unsorted) + { + xpath_node_set::type_t sorted = xpath_get_order(begin, end); + + if (sorted == xpath_node_set::type_unsorted) + { + sort(begin, end, document_order_comparator()); + + type = xpath_node_set::type_sorted; + } + else + type = sorted; + } + + if (type != order) reverse(begin, end); + + return order; + } + + PUGI__FN xpath_node xpath_first(const xpath_node* begin, const xpath_node* end, xpath_node_set::type_t type) + { + if (begin == end) return xpath_node(); + + switch (type) + { + case xpath_node_set::type_sorted: + return *begin; + + case xpath_node_set::type_sorted_reverse: + return *(end - 1); + + case xpath_node_set::type_unsorted: + return *min_element(begin, end, document_order_comparator()); + + default: + assert(!"Invalid node set type"); + return xpath_node(); + } + } + + class xpath_node_set_raw + { + xpath_node_set::type_t _type; + + xpath_node* _begin; + xpath_node* _end; + xpath_node* _eos; + + public: + xpath_node_set_raw(): _type(xpath_node_set::type_unsorted), _begin(0), _end(0), _eos(0) + { + } + + xpath_node* begin() const + { + return _begin; + } + + xpath_node* end() const + { + return _end; + } + + bool empty() const + { + return _begin == _end; + } + + size_t size() const + { + return static_cast(_end - _begin); + } + + xpath_node first() const + { + return xpath_first(_begin, _end, _type); + } + + void push_back_grow(const xpath_node& node, xpath_allocator* alloc); + + void push_back(const xpath_node& node, xpath_allocator* alloc) + { + if (_end != _eos) + *_end++ = node; + else + push_back_grow(node, alloc); + } + + void append(const xpath_node* begin_, const xpath_node* end_, xpath_allocator* alloc) + { + if (begin_ == end_) return; + + size_t size_ = static_cast(_end - _begin); + size_t capacity = static_cast(_eos - _begin); + size_t count = static_cast(end_ - begin_); + + if (size_ + count > capacity) + { + // reallocate 
the old array or allocate a new one + xpath_node* data = static_cast(alloc->reallocate(_begin, capacity * sizeof(xpath_node), (size_ + count) * sizeof(xpath_node))); + assert(data); + + // finalize + _begin = data; + _end = data + size_; + _eos = data + size_ + count; + } + + memcpy(_end, begin_, count * sizeof(xpath_node)); + _end += count; + } + + void sort_do() + { + _type = xpath_sort(_begin, _end, _type, false); + } + + void truncate(xpath_node* pos) + { + assert(_begin <= pos && pos <= _end); + + _end = pos; + } + + void remove_duplicates() + { + if (_type == xpath_node_set::type_unsorted) + sort(_begin, _end, duplicate_comparator()); + + _end = unique(_begin, _end); + } + + xpath_node_set::type_t type() const + { + return _type; + } + + void set_type(xpath_node_set::type_t value) + { + _type = value; + } + }; + + PUGI__FN_NO_INLINE void xpath_node_set_raw::push_back_grow(const xpath_node& node, xpath_allocator* alloc) + { + size_t capacity = static_cast(_eos - _begin); + + // get new capacity (1.5x rule) + size_t new_capacity = capacity + capacity / 2 + 1; + + // reallocate the old array or allocate a new one + xpath_node* data = static_cast(alloc->reallocate(_begin, capacity * sizeof(xpath_node), new_capacity * sizeof(xpath_node))); + assert(data); + + // finalize + _begin = data; + _end = data + capacity; + _eos = data + new_capacity; + + // push + *_end++ = node; + } +PUGI__NS_END + +PUGI__NS_BEGIN + struct xpath_context + { + xpath_node n; + size_t position, size; + + xpath_context(const xpath_node& n_, size_t position_, size_t size_): n(n_), position(position_), size(size_) + { + } + }; + + enum lexeme_t + { + lex_none = 0, + lex_equal, + lex_not_equal, + lex_less, + lex_greater, + lex_less_or_equal, + lex_greater_or_equal, + lex_plus, + lex_minus, + lex_multiply, + lex_union, + lex_var_ref, + lex_open_brace, + lex_close_brace, + lex_quoted_string, + lex_number, + lex_slash, + lex_double_slash, + lex_open_square_brace, + lex_close_square_brace, + 
lex_string, + lex_comma, + lex_axis_attribute, + lex_dot, + lex_double_dot, + lex_double_colon, + lex_eof + }; + + struct xpath_lexer_string + { + const char_t* begin; + const char_t* end; + + xpath_lexer_string(): begin(0), end(0) + { + } + + bool operator==(const char_t* other) const + { + size_t length = static_cast(end - begin); + + return strequalrange(other, begin, length); + } + }; + + class xpath_lexer + { + const char_t* _cur; + const char_t* _cur_lexeme_pos; + xpath_lexer_string _cur_lexeme_contents; + + lexeme_t _cur_lexeme; + + public: + explicit xpath_lexer(const char_t* query): _cur(query) + { + next(); + } + + const char_t* state() const + { + return _cur; + } + + void next() + { + const char_t* cur = _cur; + + while (PUGI__IS_CHARTYPE(*cur, ct_space)) ++cur; + + // save lexeme position for error reporting + _cur_lexeme_pos = cur; + + switch (*cur) + { + case 0: + _cur_lexeme = lex_eof; + break; + + case '>': + if (*(cur+1) == '=') + { + cur += 2; + _cur_lexeme = lex_greater_or_equal; + } + else + { + cur += 1; + _cur_lexeme = lex_greater; + } + break; + + case '<': + if (*(cur+1) == '=') + { + cur += 2; + _cur_lexeme = lex_less_or_equal; + } + else + { + cur += 1; + _cur_lexeme = lex_less; + } + break; + + case '!': + if (*(cur+1) == '=') + { + cur += 2; + _cur_lexeme = lex_not_equal; + } + else + { + _cur_lexeme = lex_none; + } + break; + + case '=': + cur += 1; + _cur_lexeme = lex_equal; + + break; + + case '+': + cur += 1; + _cur_lexeme = lex_plus; + + break; + + case '-': + cur += 1; + _cur_lexeme = lex_minus; + + break; + + case '*': + cur += 1; + _cur_lexeme = lex_multiply; + + break; + + case '|': + cur += 1; + _cur_lexeme = lex_union; + + break; + + case '$': + cur += 1; + + if (PUGI__IS_CHARTYPEX(*cur, ctx_start_symbol)) + { + _cur_lexeme_contents.begin = cur; + + while (PUGI__IS_CHARTYPEX(*cur, ctx_symbol)) cur++; + + if (cur[0] == ':' && PUGI__IS_CHARTYPEX(cur[1], ctx_symbol)) // qname + { + cur++; // : + + while (PUGI__IS_CHARTYPEX(*cur, 
ctx_symbol)) cur++; + } + + _cur_lexeme_contents.end = cur; + + _cur_lexeme = lex_var_ref; + } + else + { + _cur_lexeme = lex_none; + } + + break; + + case '(': + cur += 1; + _cur_lexeme = lex_open_brace; + + break; + + case ')': + cur += 1; + _cur_lexeme = lex_close_brace; + + break; + + case '[': + cur += 1; + _cur_lexeme = lex_open_square_brace; + + break; + + case ']': + cur += 1; + _cur_lexeme = lex_close_square_brace; + + break; + + case ',': + cur += 1; + _cur_lexeme = lex_comma; + + break; + + case '/': + if (*(cur+1) == '/') + { + cur += 2; + _cur_lexeme = lex_double_slash; + } + else + { + cur += 1; + _cur_lexeme = lex_slash; + } + break; + + case '.': + if (*(cur+1) == '.') + { + cur += 2; + _cur_lexeme = lex_double_dot; + } + else if (PUGI__IS_CHARTYPEX(*(cur+1), ctx_digit)) + { + _cur_lexeme_contents.begin = cur; // . + + ++cur; + + while (PUGI__IS_CHARTYPEX(*cur, ctx_digit)) cur++; + + _cur_lexeme_contents.end = cur; + + _cur_lexeme = lex_number; + } + else + { + cur += 1; + _cur_lexeme = lex_dot; + } + break; + + case '@': + cur += 1; + _cur_lexeme = lex_axis_attribute; + + break; + + case '"': + case '\'': + { + char_t terminator = *cur; + + ++cur; + + _cur_lexeme_contents.begin = cur; + while (*cur && *cur != terminator) cur++; + _cur_lexeme_contents.end = cur; + + if (!*cur) + _cur_lexeme = lex_none; + else + { + cur += 1; + _cur_lexeme = lex_quoted_string; + } + + break; + } + + case ':': + if (*(cur+1) == ':') + { + cur += 2; + _cur_lexeme = lex_double_colon; + } + else + { + _cur_lexeme = lex_none; + } + break; + + default: + if (PUGI__IS_CHARTYPEX(*cur, ctx_digit)) + { + _cur_lexeme_contents.begin = cur; + + while (PUGI__IS_CHARTYPEX(*cur, ctx_digit)) cur++; + + if (*cur == '.') + { + cur++; + + while (PUGI__IS_CHARTYPEX(*cur, ctx_digit)) cur++; + } + + _cur_lexeme_contents.end = cur; + + _cur_lexeme = lex_number; + } + else if (PUGI__IS_CHARTYPEX(*cur, ctx_start_symbol)) + { + _cur_lexeme_contents.begin = cur; + + while 
(PUGI__IS_CHARTYPEX(*cur, ctx_symbol)) cur++; + + if (cur[0] == ':') + { + if (cur[1] == '*') // namespace test ncname:* + { + cur += 2; // :* + } + else if (PUGI__IS_CHARTYPEX(cur[1], ctx_symbol)) // namespace test qname + { + cur++; // : + + while (PUGI__IS_CHARTYPEX(*cur, ctx_symbol)) cur++; + } + } + + _cur_lexeme_contents.end = cur; + + _cur_lexeme = lex_string; + } + else + { + _cur_lexeme = lex_none; + } + } + + _cur = cur; + } + + lexeme_t current() const + { + return _cur_lexeme; + } + + const char_t* current_pos() const + { + return _cur_lexeme_pos; + } + + const xpath_lexer_string& contents() const + { + assert(_cur_lexeme == lex_var_ref || _cur_lexeme == lex_number || _cur_lexeme == lex_string || _cur_lexeme == lex_quoted_string); + + return _cur_lexeme_contents; + } + }; + + enum ast_type_t + { + ast_unknown, + ast_op_or, // left or right + ast_op_and, // left and right + ast_op_equal, // left = right + ast_op_not_equal, // left != right + ast_op_less, // left < right + ast_op_greater, // left > right + ast_op_less_or_equal, // left <= right + ast_op_greater_or_equal, // left >= right + ast_op_add, // left + right + ast_op_subtract, // left - right + ast_op_multiply, // left * right + ast_op_divide, // left / right + ast_op_mod, // left % right + ast_op_negate, // left - right + ast_op_union, // left | right + ast_predicate, // apply predicate to set; next points to next predicate + ast_filter, // select * from left where right + ast_string_constant, // string constant + ast_number_constant, // number constant + ast_variable, // variable + ast_func_last, // last() + ast_func_position, // position() + ast_func_count, // count(left) + ast_func_id, // id(left) + ast_func_local_name_0, // local-name() + ast_func_local_name_1, // local-name(left) + ast_func_namespace_uri_0, // namespace-uri() + ast_func_namespace_uri_1, // namespace-uri(left) + ast_func_name_0, // name() + ast_func_name_1, // name(left) + ast_func_string_0, // string() + ast_func_string_1, 
// string(left) + ast_func_concat, // concat(left, right, siblings) + ast_func_starts_with, // starts_with(left, right) + ast_func_contains, // contains(left, right) + ast_func_substring_before, // substring-before(left, right) + ast_func_substring_after, // substring-after(left, right) + ast_func_substring_2, // substring(left, right) + ast_func_substring_3, // substring(left, right, third) + ast_func_string_length_0, // string-length() + ast_func_string_length_1, // string-length(left) + ast_func_normalize_space_0, // normalize-space() + ast_func_normalize_space_1, // normalize-space(left) + ast_func_translate, // translate(left, right, third) + ast_func_boolean, // boolean(left) + ast_func_not, // not(left) + ast_func_true, // true() + ast_func_false, // false() + ast_func_lang, // lang(left) + ast_func_number_0, // number() + ast_func_number_1, // number(left) + ast_func_sum, // sum(left) + ast_func_floor, // floor(left) + ast_func_ceiling, // ceiling(left) + ast_func_round, // round(left) + ast_step, // process set left with step + ast_step_root, // select root node + + ast_opt_translate_table, // translate(left, right, third) where right/third are constants + ast_opt_compare_attribute // @name = 'string' + }; + + enum axis_t + { + axis_ancestor, + axis_ancestor_or_self, + axis_attribute, + axis_child, + axis_descendant, + axis_descendant_or_self, + axis_following, + axis_following_sibling, + axis_namespace, + axis_parent, + axis_preceding, + axis_preceding_sibling, + axis_self + }; + + enum nodetest_t + { + nodetest_none, + nodetest_name, + nodetest_type_node, + nodetest_type_comment, + nodetest_type_pi, + nodetest_type_text, + nodetest_pi, + nodetest_all, + nodetest_all_in_namespace + }; + + enum predicate_t + { + predicate_default, + predicate_posinv, + predicate_constant, + predicate_constant_one + }; + + enum nodeset_eval_t + { + nodeset_eval_all, + nodeset_eval_any, + nodeset_eval_first + }; + + template struct axis_to_type + { + static const axis_t 
axis; + }; + + template const axis_t axis_to_type::axis = N; + + class xpath_ast_node + { + private: + // node type + char _type; + char _rettype; + + // for ast_step + char _axis; + + // for ast_step/ast_predicate/ast_filter + char _test; + + // tree node structure + xpath_ast_node* _left; + xpath_ast_node* _right; + xpath_ast_node* _next; + + union + { + // value for ast_string_constant + const char_t* string; + // value for ast_number_constant + double number; + // variable for ast_variable + xpath_variable* variable; + // node test for ast_step (node name/namespace/node type/pi target) + const char_t* nodetest; + // table for ast_opt_translate_table + const unsigned char* table; + } _data; + + xpath_ast_node(const xpath_ast_node&); + xpath_ast_node& operator=(const xpath_ast_node&); + + template static bool compare_eq(xpath_ast_node* lhs, xpath_ast_node* rhs, const xpath_context& c, const xpath_stack& stack, const Comp& comp) + { + xpath_value_type lt = lhs->rettype(), rt = rhs->rettype(); + + if (lt != xpath_type_node_set && rt != xpath_type_node_set) + { + if (lt == xpath_type_boolean || rt == xpath_type_boolean) + return comp(lhs->eval_boolean(c, stack), rhs->eval_boolean(c, stack)); + else if (lt == xpath_type_number || rt == xpath_type_number) + return comp(lhs->eval_number(c, stack), rhs->eval_number(c, stack)); + else if (lt == xpath_type_string || rt == xpath_type_string) + { + xpath_allocator_capture cr(stack.result); + + xpath_string ls = lhs->eval_string(c, stack); + xpath_string rs = rhs->eval_string(c, stack); + + return comp(ls, rs); + } + } + else if (lt == xpath_type_node_set && rt == xpath_type_node_set) + { + xpath_allocator_capture cr(stack.result); + + xpath_node_set_raw ls = lhs->eval_node_set(c, stack, nodeset_eval_all); + xpath_node_set_raw rs = rhs->eval_node_set(c, stack, nodeset_eval_all); + + for (const xpath_node* li = ls.begin(); li != ls.end(); ++li) + for (const xpath_node* ri = rs.begin(); ri != rs.end(); ++ri) + { + 
xpath_allocator_capture cri(stack.result); + + if (comp(string_value(*li, stack.result), string_value(*ri, stack.result))) + return true; + } + + return false; + } + else + { + if (lt == xpath_type_node_set) + { + swap(lhs, rhs); + swap(lt, rt); + } + + if (lt == xpath_type_boolean) + return comp(lhs->eval_boolean(c, stack), rhs->eval_boolean(c, stack)); + else if (lt == xpath_type_number) + { + xpath_allocator_capture cr(stack.result); + + double l = lhs->eval_number(c, stack); + xpath_node_set_raw rs = rhs->eval_node_set(c, stack, nodeset_eval_all); + + for (const xpath_node* ri = rs.begin(); ri != rs.end(); ++ri) + { + xpath_allocator_capture cri(stack.result); + + if (comp(l, convert_string_to_number(string_value(*ri, stack.result).c_str()))) + return true; + } + + return false; + } + else if (lt == xpath_type_string) + { + xpath_allocator_capture cr(stack.result); + + xpath_string l = lhs->eval_string(c, stack); + xpath_node_set_raw rs = rhs->eval_node_set(c, stack, nodeset_eval_all); + + for (const xpath_node* ri = rs.begin(); ri != rs.end(); ++ri) + { + xpath_allocator_capture cri(stack.result); + + if (comp(l, string_value(*ri, stack.result))) + return true; + } + + return false; + } + } + + assert(!"Wrong types"); + return false; + } + + static bool eval_once(xpath_node_set::type_t type, nodeset_eval_t eval) + { + return type == xpath_node_set::type_sorted ? 
eval != nodeset_eval_all : eval == nodeset_eval_any; + } + + template static bool compare_rel(xpath_ast_node* lhs, xpath_ast_node* rhs, const xpath_context& c, const xpath_stack& stack, const Comp& comp) + { + xpath_value_type lt = lhs->rettype(), rt = rhs->rettype(); + + if (lt != xpath_type_node_set && rt != xpath_type_node_set) + return comp(lhs->eval_number(c, stack), rhs->eval_number(c, stack)); + else if (lt == xpath_type_node_set && rt == xpath_type_node_set) + { + xpath_allocator_capture cr(stack.result); + + xpath_node_set_raw ls = lhs->eval_node_set(c, stack, nodeset_eval_all); + xpath_node_set_raw rs = rhs->eval_node_set(c, stack, nodeset_eval_all); + + for (const xpath_node* li = ls.begin(); li != ls.end(); ++li) + { + xpath_allocator_capture cri(stack.result); + + double l = convert_string_to_number(string_value(*li, stack.result).c_str()); + + for (const xpath_node* ri = rs.begin(); ri != rs.end(); ++ri) + { + xpath_allocator_capture crii(stack.result); + + if (comp(l, convert_string_to_number(string_value(*ri, stack.result).c_str()))) + return true; + } + } + + return false; + } + else if (lt != xpath_type_node_set && rt == xpath_type_node_set) + { + xpath_allocator_capture cr(stack.result); + + double l = lhs->eval_number(c, stack); + xpath_node_set_raw rs = rhs->eval_node_set(c, stack, nodeset_eval_all); + + for (const xpath_node* ri = rs.begin(); ri != rs.end(); ++ri) + { + xpath_allocator_capture cri(stack.result); + + if (comp(l, convert_string_to_number(string_value(*ri, stack.result).c_str()))) + return true; + } + + return false; + } + else if (lt == xpath_type_node_set && rt != xpath_type_node_set) + { + xpath_allocator_capture cr(stack.result); + + xpath_node_set_raw ls = lhs->eval_node_set(c, stack, nodeset_eval_all); + double r = rhs->eval_number(c, stack); + + for (const xpath_node* li = ls.begin(); li != ls.end(); ++li) + { + xpath_allocator_capture cri(stack.result); + + if (comp(convert_string_to_number(string_value(*li, 
stack.result).c_str()), r)) + return true; + } + + return false; + } + else + { + assert(!"Wrong types"); + return false; + } + } + + static void apply_predicate_boolean(xpath_node_set_raw& ns, size_t first, xpath_ast_node* expr, const xpath_stack& stack, bool once) + { + assert(ns.size() >= first); + assert(expr->rettype() != xpath_type_number); + + size_t i = 1; + size_t size = ns.size() - first; + + xpath_node* last = ns.begin() + first; + + // remove_if... or well, sort of + for (xpath_node* it = last; it != ns.end(); ++it, ++i) + { + xpath_context c(*it, i, size); + + if (expr->eval_boolean(c, stack)) + { + *last++ = *it; + + if (once) break; + } + } + + ns.truncate(last); + } + + static void apply_predicate_number(xpath_node_set_raw& ns, size_t first, xpath_ast_node* expr, const xpath_stack& stack, bool once) + { + assert(ns.size() >= first); + assert(expr->rettype() == xpath_type_number); + + size_t i = 1; + size_t size = ns.size() - first; + + xpath_node* last = ns.begin() + first; + + // remove_if... 
or well, sort of + for (xpath_node* it = last; it != ns.end(); ++it, ++i) + { + xpath_context c(*it, i, size); + + if (expr->eval_number(c, stack) == i) + { + *last++ = *it; + + if (once) break; + } + } + + ns.truncate(last); + } + + static void apply_predicate_number_const(xpath_node_set_raw& ns, size_t first, xpath_ast_node* expr, const xpath_stack& stack) + { + assert(ns.size() >= first); + assert(expr->rettype() == xpath_type_number); + + size_t size = ns.size() - first; + + xpath_node* last = ns.begin() + first; + + xpath_context c(xpath_node(), 1, size); + + double er = expr->eval_number(c, stack); + + if (er >= 1.0 && er <= size) + { + size_t eri = static_cast(er); + + if (er == eri) + { + xpath_node r = last[eri - 1]; + + *last++ = r; + } + } + + ns.truncate(last); + } + + void apply_predicate(xpath_node_set_raw& ns, size_t first, const xpath_stack& stack, bool once) + { + if (ns.size() == first) return; + + assert(_type == ast_filter || _type == ast_predicate); + + if (_test == predicate_constant || _test == predicate_constant_one) + apply_predicate_number_const(ns, first, _right, stack); + else if (_right->rettype() == xpath_type_number) + apply_predicate_number(ns, first, _right, stack, once); + else + apply_predicate_boolean(ns, first, _right, stack, once); + } + + void apply_predicates(xpath_node_set_raw& ns, size_t first, const xpath_stack& stack, nodeset_eval_t eval) + { + if (ns.size() == first) return; + + bool last_once = eval_once(ns.type(), eval); + + for (xpath_ast_node* pred = _right; pred; pred = pred->_next) + pred->apply_predicate(ns, first, stack, !pred->_next && last_once); + } + + bool step_push(xpath_node_set_raw& ns, xml_attribute_struct* a, xml_node_struct* parent, xpath_allocator* alloc) + { + assert(a); + + const char_t* name = a->name ? 
a->name : PUGIXML_TEXT(""); + + switch (_test) + { + case nodetest_name: + if (strequal(name, _data.nodetest) && is_xpath_attribute(name)) + { + ns.push_back(xpath_node(xml_attribute(a), xml_node(parent)), alloc); + return true; + } + break; + + case nodetest_type_node: + case nodetest_all: + if (is_xpath_attribute(name)) + { + ns.push_back(xpath_node(xml_attribute(a), xml_node(parent)), alloc); + return true; + } + break; + + case nodetest_all_in_namespace: + if (starts_with(name, _data.nodetest) && is_xpath_attribute(name)) + { + ns.push_back(xpath_node(xml_attribute(a), xml_node(parent)), alloc); + return true; + } + break; + + default: + ; + } + + return false; + } + + bool step_push(xpath_node_set_raw& ns, xml_node_struct* n, xpath_allocator* alloc) + { + assert(n); + + xml_node_type type = PUGI__NODETYPE(n); + + switch (_test) + { + case nodetest_name: + if (type == node_element && n->name && strequal(n->name, _data.nodetest)) + { + ns.push_back(xml_node(n), alloc); + return true; + } + break; + + case nodetest_type_node: + ns.push_back(xml_node(n), alloc); + return true; + + case nodetest_type_comment: + if (type == node_comment) + { + ns.push_back(xml_node(n), alloc); + return true; + } + break; + + case nodetest_type_text: + if (type == node_pcdata || type == node_cdata) + { + ns.push_back(xml_node(n), alloc); + return true; + } + break; + + case nodetest_type_pi: + if (type == node_pi) + { + ns.push_back(xml_node(n), alloc); + return true; + } + break; + + case nodetest_pi: + if (type == node_pi && n->name && strequal(n->name, _data.nodetest)) + { + ns.push_back(xml_node(n), alloc); + return true; + } + break; + + case nodetest_all: + if (type == node_element) + { + ns.push_back(xml_node(n), alloc); + return true; + } + break; + + case nodetest_all_in_namespace: + if (type == node_element && n->name && starts_with(n->name, _data.nodetest)) + { + ns.push_back(xml_node(n), alloc); + return true; + } + break; + + default: + assert(!"Unknown axis"); + } + + 
return false; + } + + template void step_fill(xpath_node_set_raw& ns, xml_node_struct* n, xpath_allocator* alloc, bool once, T) + { + const axis_t axis = T::axis; + + switch (axis) + { + case axis_attribute: + { + for (xml_attribute_struct* a = n->first_attribute; a; a = a->next_attribute) + if (step_push(ns, a, n, alloc) & once) + return; + + break; + } + + case axis_child: + { + for (xml_node_struct* c = n->first_child; c; c = c->next_sibling) + if (step_push(ns, c, alloc) & once) + return; + + break; + } + + case axis_descendant: + case axis_descendant_or_self: + { + if (axis == axis_descendant_or_self) + if (step_push(ns, n, alloc) & once) + return; + + xml_node_struct* cur = n->first_child; + + while (cur) + { + if (step_push(ns, cur, alloc) & once) + return; + + if (cur->first_child) + cur = cur->first_child; + else + { + while (!cur->next_sibling) + { + cur = cur->parent; + + if (cur == n) return; + } + + cur = cur->next_sibling; + } + } + + break; + } + + case axis_following_sibling: + { + for (xml_node_struct* c = n->next_sibling; c; c = c->next_sibling) + if (step_push(ns, c, alloc) & once) + return; + + break; + } + + case axis_preceding_sibling: + { + for (xml_node_struct* c = n->prev_sibling_c; c->next_sibling; c = c->prev_sibling_c) + if (step_push(ns, c, alloc) & once) + return; + + break; + } + + case axis_following: + { + xml_node_struct* cur = n; + + // exit from this node so that we don't include descendants + while (!cur->next_sibling) + { + cur = cur->parent; + + if (!cur) return; + } + + cur = cur->next_sibling; + + while (cur) + { + if (step_push(ns, cur, alloc) & once) + return; + + if (cur->first_child) + cur = cur->first_child; + else + { + while (!cur->next_sibling) + { + cur = cur->parent; + + if (!cur) return; + } + + cur = cur->next_sibling; + } + } + + break; + } + + case axis_preceding: + { + xml_node_struct* cur = n; + + // exit from this node so that we don't include descendants + while (!cur->prev_sibling_c->next_sibling) + { + 
cur = cur->parent; + + if (!cur) return; + } + + cur = cur->prev_sibling_c; + + while (cur) + { + if (cur->first_child) + cur = cur->first_child->prev_sibling_c; + else + { + // leaf node, can't be ancestor + if (step_push(ns, cur, alloc) & once) + return; + + while (!cur->prev_sibling_c->next_sibling) + { + cur = cur->parent; + + if (!cur) return; + + if (!node_is_ancestor(cur, n)) + if (step_push(ns, cur, alloc) & once) + return; + } + + cur = cur->prev_sibling_c; + } + } + + break; + } + + case axis_ancestor: + case axis_ancestor_or_self: + { + if (axis == axis_ancestor_or_self) + if (step_push(ns, n, alloc) & once) + return; + + xml_node_struct* cur = n->parent; + + while (cur) + { + if (step_push(ns, cur, alloc) & once) + return; + + cur = cur->parent; + } + + break; + } + + case axis_self: + { + step_push(ns, n, alloc); + + break; + } + + case axis_parent: + { + if (n->parent) + step_push(ns, n->parent, alloc); + + break; + } + + default: + assert(!"Unimplemented axis"); + } + } + + template void step_fill(xpath_node_set_raw& ns, xml_attribute_struct* a, xml_node_struct* p, xpath_allocator* alloc, bool once, T v) + { + const axis_t axis = T::axis; + + switch (axis) + { + case axis_ancestor: + case axis_ancestor_or_self: + { + if (axis == axis_ancestor_or_self && _test == nodetest_type_node) // reject attributes based on principal node type test + if (step_push(ns, a, p, alloc) & once) + return; + + xml_node_struct* cur = p; + + while (cur) + { + if (step_push(ns, cur, alloc) & once) + return; + + cur = cur->parent; + } + + break; + } + + case axis_descendant_or_self: + case axis_self: + { + if (_test == nodetest_type_node) // reject attributes based on principal node type test + step_push(ns, a, p, alloc); + + break; + } + + case axis_following: + { + xml_node_struct* cur = p; + + while (cur) + { + if (cur->first_child) + cur = cur->first_child; + else + { + while (!cur->next_sibling) + { + cur = cur->parent; + + if (!cur) return; + } + + cur = 
cur->next_sibling; + } + + if (step_push(ns, cur, alloc) & once) + return; + } + + break; + } + + case axis_parent: + { + step_push(ns, p, alloc); + + break; + } + + case axis_preceding: + { + // preceding:: axis does not include attribute nodes and attribute ancestors (they are the same as parent's ancestors), so we can reuse node preceding + step_fill(ns, p, alloc, once, v); + break; + } + + default: + assert(!"Unimplemented axis"); + } + } + + template void step_fill(xpath_node_set_raw& ns, const xpath_node& xn, xpath_allocator* alloc, bool once, T v) + { + const axis_t axis = T::axis; + const bool axis_has_attributes = (axis == axis_ancestor || axis == axis_ancestor_or_self || axis == axis_descendant_or_self || axis == axis_following || axis == axis_parent || axis == axis_preceding || axis == axis_self); + + if (xn.node()) + step_fill(ns, xn.node().internal_object(), alloc, once, v); + else if (axis_has_attributes && xn.attribute() && xn.parent()) + step_fill(ns, xn.attribute().internal_object(), xn.parent().internal_object(), alloc, once, v); + } + + template xpath_node_set_raw step_do(const xpath_context& c, const xpath_stack& stack, nodeset_eval_t eval, T v) + { + const axis_t axis = T::axis; + const bool axis_reverse = (axis == axis_ancestor || axis == axis_ancestor_or_self || axis == axis_preceding || axis == axis_preceding_sibling); + const xpath_node_set::type_t axis_type = axis_reverse ? 
xpath_node_set::type_sorted_reverse : xpath_node_set::type_sorted; + + bool once = + (axis == axis_attribute && _test == nodetest_name) || + (!_right && eval_once(axis_type, eval)) || + (_right && !_right->_next && _right->_test == predicate_constant_one); + + xpath_node_set_raw ns; + ns.set_type(axis_type); + + if (_left) + { + xpath_node_set_raw s = _left->eval_node_set(c, stack, nodeset_eval_all); + + // self axis preserves the original order + if (axis == axis_self) ns.set_type(s.type()); + + for (const xpath_node* it = s.begin(); it != s.end(); ++it) + { + size_t size = ns.size(); + + // in general, all axes generate elements in a particular order, but there is no order guarantee if axis is applied to two nodes + if (axis != axis_self && size != 0) ns.set_type(xpath_node_set::type_unsorted); + + step_fill(ns, *it, stack.result, once, v); + if (_right) apply_predicates(ns, size, stack, eval); + } + } + else + { + step_fill(ns, c.n, stack.result, once, v); + if (_right) apply_predicates(ns, 0, stack, eval); + } + + // child, attribute and self axes always generate unique set of nodes + // for other axis, if the set stayed sorted, it stayed unique because the traversal algorithms do not visit the same node twice + if (axis != axis_child && axis != axis_attribute && axis != axis_self && ns.type() == xpath_node_set::type_unsorted) + ns.remove_duplicates(); + + return ns; + } + + public: + xpath_ast_node(ast_type_t type, xpath_value_type rettype_, const char_t* value): + _type(static_cast(type)), _rettype(static_cast(rettype_)), _axis(0), _test(0), _left(0), _right(0), _next(0) + { + assert(type == ast_string_constant); + _data.string = value; + } + + xpath_ast_node(ast_type_t type, xpath_value_type rettype_, double value): + _type(static_cast(type)), _rettype(static_cast(rettype_)), _axis(0), _test(0), _left(0), _right(0), _next(0) + { + assert(type == ast_number_constant); + _data.number = value; + } + + xpath_ast_node(ast_type_t type, xpath_value_type rettype_, 
xpath_variable* value): + _type(static_cast(type)), _rettype(static_cast(rettype_)), _axis(0), _test(0), _left(0), _right(0), _next(0) + { + assert(type == ast_variable); + _data.variable = value; + } + + xpath_ast_node(ast_type_t type, xpath_value_type rettype_, xpath_ast_node* left = 0, xpath_ast_node* right = 0): + _type(static_cast(type)), _rettype(static_cast(rettype_)), _axis(0), _test(0), _left(left), _right(right), _next(0) + { + } + + xpath_ast_node(ast_type_t type, xpath_ast_node* left, axis_t axis, nodetest_t test, const char_t* contents): + _type(static_cast(type)), _rettype(xpath_type_node_set), _axis(static_cast(axis)), _test(static_cast(test)), _left(left), _right(0), _next(0) + { + assert(type == ast_step); + _data.nodetest = contents; + } + + xpath_ast_node(ast_type_t type, xpath_ast_node* left, xpath_ast_node* right, predicate_t test): + _type(static_cast(type)), _rettype(xpath_type_node_set), _axis(0), _test(static_cast(test)), _left(left), _right(right), _next(0) + { + assert(type == ast_filter || type == ast_predicate); + } + + void set_next(xpath_ast_node* value) + { + _next = value; + } + + void set_right(xpath_ast_node* value) + { + _right = value; + } + + bool eval_boolean(const xpath_context& c, const xpath_stack& stack) + { + switch (_type) + { + case ast_op_or: + return _left->eval_boolean(c, stack) || _right->eval_boolean(c, stack); + + case ast_op_and: + return _left->eval_boolean(c, stack) && _right->eval_boolean(c, stack); + + case ast_op_equal: + return compare_eq(_left, _right, c, stack, equal_to()); + + case ast_op_not_equal: + return compare_eq(_left, _right, c, stack, not_equal_to()); + + case ast_op_less: + return compare_rel(_left, _right, c, stack, less()); + + case ast_op_greater: + return compare_rel(_right, _left, c, stack, less()); + + case ast_op_less_or_equal: + return compare_rel(_left, _right, c, stack, less_equal()); + + case ast_op_greater_or_equal: + return compare_rel(_right, _left, c, stack, less_equal()); + + 
case ast_func_starts_with: + { + xpath_allocator_capture cr(stack.result); + + xpath_string lr = _left->eval_string(c, stack); + xpath_string rr = _right->eval_string(c, stack); + + return starts_with(lr.c_str(), rr.c_str()); + } + + case ast_func_contains: + { + xpath_allocator_capture cr(stack.result); + + xpath_string lr = _left->eval_string(c, stack); + xpath_string rr = _right->eval_string(c, stack); + + return find_substring(lr.c_str(), rr.c_str()) != 0; + } + + case ast_func_boolean: + return _left->eval_boolean(c, stack); + + case ast_func_not: + return !_left->eval_boolean(c, stack); + + case ast_func_true: + return true; + + case ast_func_false: + return false; + + case ast_func_lang: + { + if (c.n.attribute()) return false; + + xpath_allocator_capture cr(stack.result); + + xpath_string lang = _left->eval_string(c, stack); + + for (xml_node n = c.n.node(); n; n = n.parent()) + { + xml_attribute a = n.attribute(PUGIXML_TEXT("xml:lang")); + + if (a) + { + const char_t* value = a.value(); + + // strnicmp / strncasecmp is not portable + for (const char_t* lit = lang.c_str(); *lit; ++lit) + { + if (tolower_ascii(*lit) != tolower_ascii(*value)) return false; + ++value; + } + + return *value == 0 || *value == '-'; + } + } + + return false; + } + + case ast_opt_compare_attribute: + { + const char_t* value = (_right->_type == ast_string_constant) ? 
_right->_data.string : _right->_data.variable->get_string(); + + xml_attribute attr = c.n.node().attribute(_left->_data.nodetest); + + return attr && strequal(attr.value(), value) && is_xpath_attribute(attr.name()); + } + + case ast_variable: + { + assert(_rettype == _data.variable->type()); + + if (_rettype == xpath_type_boolean) + return _data.variable->get_boolean(); + + // fallthrough to type conversion + } + + default: + { + switch (_rettype) + { + case xpath_type_number: + return convert_number_to_boolean(eval_number(c, stack)); + + case xpath_type_string: + { + xpath_allocator_capture cr(stack.result); + + return !eval_string(c, stack).empty(); + } + + case xpath_type_node_set: + { + xpath_allocator_capture cr(stack.result); + + return !eval_node_set(c, stack, nodeset_eval_any).empty(); + } + + default: + assert(!"Wrong expression for return type boolean"); + return false; + } + } + } + } + + double eval_number(const xpath_context& c, const xpath_stack& stack) + { + switch (_type) + { + case ast_op_add: + return _left->eval_number(c, stack) + _right->eval_number(c, stack); + + case ast_op_subtract: + return _left->eval_number(c, stack) - _right->eval_number(c, stack); + + case ast_op_multiply: + return _left->eval_number(c, stack) * _right->eval_number(c, stack); + + case ast_op_divide: + return _left->eval_number(c, stack) / _right->eval_number(c, stack); + + case ast_op_mod: + return fmod(_left->eval_number(c, stack), _right->eval_number(c, stack)); + + case ast_op_negate: + return -_left->eval_number(c, stack); + + case ast_number_constant: + return _data.number; + + case ast_func_last: + return static_cast(c.size); + + case ast_func_position: + return static_cast(c.position); + + case ast_func_count: + { + xpath_allocator_capture cr(stack.result); + + return static_cast(_left->eval_node_set(c, stack, nodeset_eval_all).size()); + } + + case ast_func_string_length_0: + { + xpath_allocator_capture cr(stack.result); + + return static_cast(string_value(c.n, 
stack.result).length()); + } + + case ast_func_string_length_1: + { + xpath_allocator_capture cr(stack.result); + + return static_cast(_left->eval_string(c, stack).length()); + } + + case ast_func_number_0: + { + xpath_allocator_capture cr(stack.result); + + return convert_string_to_number(string_value(c.n, stack.result).c_str()); + } + + case ast_func_number_1: + return _left->eval_number(c, stack); + + case ast_func_sum: + { + xpath_allocator_capture cr(stack.result); + + double r = 0; + + xpath_node_set_raw ns = _left->eval_node_set(c, stack, nodeset_eval_all); + + for (const xpath_node* it = ns.begin(); it != ns.end(); ++it) + { + xpath_allocator_capture cri(stack.result); + + r += convert_string_to_number(string_value(*it, stack.result).c_str()); + } + + return r; + } + + case ast_func_floor: + { + double r = _left->eval_number(c, stack); + + return r == r ? floor(r) : r; + } + + case ast_func_ceiling: + { + double r = _left->eval_number(c, stack); + + return r == r ? ceil(r) : r; + } + + case ast_func_round: + return round_nearest_nzero(_left->eval_number(c, stack)); + + case ast_variable: + { + assert(_rettype == _data.variable->type()); + + if (_rettype == xpath_type_number) + return _data.variable->get_number(); + + // fallthrough to type conversion + } + + default: + { + switch (_rettype) + { + case xpath_type_boolean: + return eval_boolean(c, stack) ? 
1 : 0; + + case xpath_type_string: + { + xpath_allocator_capture cr(stack.result); + + return convert_string_to_number(eval_string(c, stack).c_str()); + } + + case xpath_type_node_set: + { + xpath_allocator_capture cr(stack.result); + + return convert_string_to_number(eval_string(c, stack).c_str()); + } + + default: + assert(!"Wrong expression for return type number"); + return 0; + } + + } + } + } + + xpath_string eval_string_concat(const xpath_context& c, const xpath_stack& stack) + { + assert(_type == ast_func_concat); + + xpath_allocator_capture ct(stack.temp); + + // count the string number + size_t count = 1; + for (xpath_ast_node* nc = _right; nc; nc = nc->_next) count++; + + // gather all strings + xpath_string static_buffer[4]; + xpath_string* buffer = static_buffer; + + // allocate on-heap for large concats + if (count > sizeof(static_buffer) / sizeof(static_buffer[0])) + { + buffer = static_cast(stack.temp->allocate(count * sizeof(xpath_string))); + assert(buffer); + } + + // evaluate all strings to temporary stack + xpath_stack swapped_stack = {stack.temp, stack.result}; + + buffer[0] = _left->eval_string(c, swapped_stack); + + size_t pos = 1; + for (xpath_ast_node* n = _right; n; n = n->_next, ++pos) buffer[pos] = n->eval_string(c, swapped_stack); + assert(pos == count); + + // get total length + size_t length = 0; + for (size_t i = 0; i < count; ++i) length += buffer[i].length(); + + // create final string + char_t* result = static_cast(stack.result->allocate((length + 1) * sizeof(char_t))); + assert(result); + + char_t* ri = result; + + for (size_t j = 0; j < count; ++j) + for (const char_t* bi = buffer[j].c_str(); *bi; ++bi) + *ri++ = *bi; + + *ri = 0; + + return xpath_string::from_heap_preallocated(result, ri); + } + + xpath_string eval_string(const xpath_context& c, const xpath_stack& stack) + { + switch (_type) + { + case ast_string_constant: + return xpath_string::from_const(_data.string); + + case ast_func_local_name_0: + { + xpath_node na = 
c.n; + + return xpath_string::from_const(local_name(na)); + } + + case ast_func_local_name_1: + { + xpath_allocator_capture cr(stack.result); + + xpath_node_set_raw ns = _left->eval_node_set(c, stack, nodeset_eval_first); + xpath_node na = ns.first(); + + return xpath_string::from_const(local_name(na)); + } + + case ast_func_name_0: + { + xpath_node na = c.n; + + return xpath_string::from_const(qualified_name(na)); + } + + case ast_func_name_1: + { + xpath_allocator_capture cr(stack.result); + + xpath_node_set_raw ns = _left->eval_node_set(c, stack, nodeset_eval_first); + xpath_node na = ns.first(); + + return xpath_string::from_const(qualified_name(na)); + } + + case ast_func_namespace_uri_0: + { + xpath_node na = c.n; + + return xpath_string::from_const(namespace_uri(na)); + } + + case ast_func_namespace_uri_1: + { + xpath_allocator_capture cr(stack.result); + + xpath_node_set_raw ns = _left->eval_node_set(c, stack, nodeset_eval_first); + xpath_node na = ns.first(); + + return xpath_string::from_const(namespace_uri(na)); + } + + case ast_func_string_0: + return string_value(c.n, stack.result); + + case ast_func_string_1: + return _left->eval_string(c, stack); + + case ast_func_concat: + return eval_string_concat(c, stack); + + case ast_func_substring_before: + { + xpath_allocator_capture cr(stack.temp); + + xpath_stack swapped_stack = {stack.temp, stack.result}; + + xpath_string s = _left->eval_string(c, swapped_stack); + xpath_string p = _right->eval_string(c, swapped_stack); + + const char_t* pos = find_substring(s.c_str(), p.c_str()); + + return pos ? 
xpath_string::from_heap(s.c_str(), pos, stack.result) : xpath_string(); + } + + case ast_func_substring_after: + { + xpath_allocator_capture cr(stack.temp); + + xpath_stack swapped_stack = {stack.temp, stack.result}; + + xpath_string s = _left->eval_string(c, swapped_stack); + xpath_string p = _right->eval_string(c, swapped_stack); + + const char_t* pos = find_substring(s.c_str(), p.c_str()); + if (!pos) return xpath_string(); + + const char_t* rbegin = pos + p.length(); + const char_t* rend = s.c_str() + s.length(); + + return s.uses_heap() ? xpath_string::from_heap(rbegin, rend, stack.result) : xpath_string::from_const(rbegin); + } + + case ast_func_substring_2: + { + xpath_allocator_capture cr(stack.temp); + + xpath_stack swapped_stack = {stack.temp, stack.result}; + + xpath_string s = _left->eval_string(c, swapped_stack); + size_t s_length = s.length(); + + double first = round_nearest(_right->eval_number(c, stack)); + + if (is_nan(first)) return xpath_string(); // NaN + else if (first >= s_length + 1) return xpath_string(); + + size_t pos = first < 1 ? 1 : static_cast(first); + assert(1 <= pos && pos <= s_length + 1); + + const char_t* rbegin = s.c_str() + (pos - 1); + const char_t* rend = s.c_str() + s.length(); + + return s.uses_heap() ? xpath_string::from_heap(rbegin, rend, stack.result) : xpath_string::from_const(rbegin); + } + + case ast_func_substring_3: + { + xpath_allocator_capture cr(stack.temp); + + xpath_stack swapped_stack = {stack.temp, stack.result}; + + xpath_string s = _left->eval_string(c, swapped_stack); + size_t s_length = s.length(); + + double first = round_nearest(_right->eval_number(c, stack)); + double last = first + round_nearest(_right->_next->eval_number(c, stack)); + + if (is_nan(first) || is_nan(last)) return xpath_string(); + else if (first >= s_length + 1) return xpath_string(); + else if (first >= last) return xpath_string(); + else if (last < 1) return xpath_string(); + + size_t pos = first < 1 ? 
1 : static_cast(first); + size_t end = last >= s_length + 1 ? s_length + 1 : static_cast(last); + + assert(1 <= pos && pos <= end && end <= s_length + 1); + const char_t* rbegin = s.c_str() + (pos - 1); + const char_t* rend = s.c_str() + (end - 1); + + return (end == s_length + 1 && !s.uses_heap()) ? xpath_string::from_const(rbegin) : xpath_string::from_heap(rbegin, rend, stack.result); + } + + case ast_func_normalize_space_0: + { + xpath_string s = string_value(c.n, stack.result); + + normalize_space(s.data(stack.result)); + + return s; + } + + case ast_func_normalize_space_1: + { + xpath_string s = _left->eval_string(c, stack); + + normalize_space(s.data(stack.result)); + + return s; + } + + case ast_func_translate: + { + xpath_allocator_capture cr(stack.temp); + + xpath_stack swapped_stack = {stack.temp, stack.result}; + + xpath_string s = _left->eval_string(c, stack); + xpath_string from = _right->eval_string(c, swapped_stack); + xpath_string to = _right->_next->eval_string(c, swapped_stack); + + translate(s.data(stack.result), from.c_str(), to.c_str(), to.length()); + + return s; + } + + case ast_opt_translate_table: + { + xpath_string s = _left->eval_string(c, stack); + + translate_table(s.data(stack.result), _data.table); + + return s; + } + + case ast_variable: + { + assert(_rettype == _data.variable->type()); + + if (_rettype == xpath_type_string) + return xpath_string::from_const(_data.variable->get_string()); + + // fallthrough to type conversion + } + + default: + { + switch (_rettype) + { + case xpath_type_boolean: + return xpath_string::from_const(eval_boolean(c, stack) ? PUGIXML_TEXT("true") : PUGIXML_TEXT("false")); + + case xpath_type_number: + return convert_number_to_string(eval_number(c, stack), stack.result); + + case xpath_type_node_set: + { + xpath_allocator_capture cr(stack.temp); + + xpath_stack swapped_stack = {stack.temp, stack.result}; + + xpath_node_set_raw ns = eval_node_set(c, swapped_stack, nodeset_eval_first); + return ns.empty() ? 
xpath_string() : string_value(ns.first(), stack.result); + } + + default: + assert(!"Wrong expression for return type string"); + return xpath_string(); + } + } + } + } + + xpath_node_set_raw eval_node_set(const xpath_context& c, const xpath_stack& stack, nodeset_eval_t eval) + { + switch (_type) + { + case ast_op_union: + { + xpath_allocator_capture cr(stack.temp); + + xpath_stack swapped_stack = {stack.temp, stack.result}; + + xpath_node_set_raw ls = _left->eval_node_set(c, swapped_stack, eval); + xpath_node_set_raw rs = _right->eval_node_set(c, stack, eval); + + // we can optimize merging two sorted sets, but this is a very rare operation, so don't bother + rs.set_type(xpath_node_set::type_unsorted); + + rs.append(ls.begin(), ls.end(), stack.result); + rs.remove_duplicates(); + + return rs; + } + + case ast_filter: + { + xpath_node_set_raw set = _left->eval_node_set(c, stack, _test == predicate_constant_one ? nodeset_eval_first : nodeset_eval_all); + + // either expression is a number or it contains position() call; sort by document order + if (_test != predicate_posinv) set.sort_do(); + + bool once = eval_once(set.type(), eval); + + apply_predicate(set, 0, stack, once); + + return set; + } + + case ast_func_id: + return xpath_node_set_raw(); + + case ast_step: + { + switch (_axis) + { + case axis_ancestor: + return step_do(c, stack, eval, axis_to_type()); + + case axis_ancestor_or_self: + return step_do(c, stack, eval, axis_to_type()); + + case axis_attribute: + return step_do(c, stack, eval, axis_to_type()); + + case axis_child: + return step_do(c, stack, eval, axis_to_type()); + + case axis_descendant: + return step_do(c, stack, eval, axis_to_type()); + + case axis_descendant_or_self: + return step_do(c, stack, eval, axis_to_type()); + + case axis_following: + return step_do(c, stack, eval, axis_to_type()); + + case axis_following_sibling: + return step_do(c, stack, eval, axis_to_type()); + + case axis_namespace: + // namespaced axis is not supported + return 
xpath_node_set_raw(); + + case axis_parent: + return step_do(c, stack, eval, axis_to_type()); + + case axis_preceding: + return step_do(c, stack, eval, axis_to_type()); + + case axis_preceding_sibling: + return step_do(c, stack, eval, axis_to_type()); + + case axis_self: + return step_do(c, stack, eval, axis_to_type()); + + default: + assert(!"Unknown axis"); + return xpath_node_set_raw(); + } + } + + case ast_step_root: + { + assert(!_right); // root step can't have any predicates + + xpath_node_set_raw ns; + + ns.set_type(xpath_node_set::type_sorted); + + if (c.n.node()) ns.push_back(c.n.node().root(), stack.result); + else if (c.n.attribute()) ns.push_back(c.n.parent().root(), stack.result); + + return ns; + } + + case ast_variable: + { + assert(_rettype == _data.variable->type()); + + if (_rettype == xpath_type_node_set) + { + const xpath_node_set& s = _data.variable->get_node_set(); + + xpath_node_set_raw ns; + + ns.set_type(s.type()); + ns.append(s.begin(), s.end(), stack.result); + + return ns; + } + + // fallthrough to type conversion + } + + default: + assert(!"Wrong expression for return type node set"); + return xpath_node_set_raw(); + } + } + + void optimize(xpath_allocator* alloc) + { + if (_left) _left->optimize(alloc); + if (_right) _right->optimize(alloc); + if (_next) _next->optimize(alloc); + + // Rewrite [position()=expr] with [expr] + // Note that this step has to go before classification to recognize [position()=1] + if ((_type == ast_filter || _type == ast_predicate) && + _right->_type == ast_op_equal && _right->_left->_type == ast_func_position && _right->_right->_rettype == xpath_type_number) + { + _right = _right->_right; + } + + // Classify filter/predicate ops to perform various optimizations during evaluation + if (_type == ast_filter || _type == ast_predicate) + { + assert(_test == predicate_default); + + if (_right->_type == ast_number_constant && _right->_data.number == 1.0) + _test = predicate_constant_one; + else if 
(_right->_rettype == xpath_type_number && (_right->_type == ast_number_constant || _right->_type == ast_variable || _right->_type == ast_func_last)) + _test = predicate_constant; + else if (_right->_rettype != xpath_type_number && _right->is_posinv_expr()) + _test = predicate_posinv; + } + + // Rewrite descendant-or-self::node()/child::foo with descendant::foo + // The former is a full form of //foo, the latter is much faster since it executes the node test immediately + // Do a similar kind of rewrite for self/descendant/descendant-or-self axes + // Note that we only rewrite positionally invariant steps (//foo[1] != /descendant::foo[1]) + if (_type == ast_step && (_axis == axis_child || _axis == axis_self || _axis == axis_descendant || _axis == axis_descendant_or_self) && _left && + _left->_type == ast_step && _left->_axis == axis_descendant_or_self && _left->_test == nodetest_type_node && !_left->_right && + is_posinv_step()) + { + if (_axis == axis_child || _axis == axis_descendant) + _axis = axis_descendant; + else + _axis = axis_descendant_or_self; + + _left = _left->_left; + } + + // Use optimized lookup table implementation for translate() with constant arguments + if (_type == ast_func_translate && _right->_type == ast_string_constant && _right->_next->_type == ast_string_constant) + { + unsigned char* table = translate_table_generate(alloc, _right->_data.string, _right->_next->_data.string); + + if (table) + { + _type = ast_opt_translate_table; + _data.table = table; + } + } + + // Use optimized path for @attr = 'value' or @attr = $value + if (_type == ast_op_equal && + _left->_type == ast_step && _left->_axis == axis_attribute && _left->_test == nodetest_name && !_left->_left && !_left->_right && + (_right->_type == ast_string_constant || (_right->_type == ast_variable && _right->_rettype == xpath_type_string))) + { + _type = ast_opt_compare_attribute; + } + } + + bool is_posinv_expr() const + { + switch (_type) + { + case ast_func_position: + case 
ast_func_last: + return false; + + case ast_string_constant: + case ast_number_constant: + case ast_variable: + return true; + + case ast_step: + case ast_step_root: + return true; + + case ast_predicate: + case ast_filter: + return true; + + default: + if (_left && !_left->is_posinv_expr()) return false; + + for (xpath_ast_node* n = _right; n; n = n->_next) + if (!n->is_posinv_expr()) return false; + + return true; + } + } + + bool is_posinv_step() const + { + assert(_type == ast_step); + + for (xpath_ast_node* n = _right; n; n = n->_next) + { + assert(n->_type == ast_predicate); + + if (n->_test != predicate_posinv) + return false; + } + + return true; + } + + xpath_value_type rettype() const + { + return static_cast(_rettype); + } + }; + + struct xpath_parser + { + xpath_allocator* _alloc; + xpath_lexer _lexer; + + const char_t* _query; + xpath_variable_set* _variables; + + xpath_parse_result* _result; + + char_t _scratch[32]; + + #ifdef PUGIXML_NO_EXCEPTIONS + jmp_buf _error_handler; + #endif + + void throw_error(const char* message) + { + _result->error = message; + _result->offset = _lexer.current_pos() - _query; + + #ifdef PUGIXML_NO_EXCEPTIONS + longjmp(_error_handler, 1); + #else + throw xpath_exception(*_result); + #endif + } + + void throw_error_oom() + { + #ifdef PUGIXML_NO_EXCEPTIONS + throw_error("Out of memory"); + #else + throw std::bad_alloc(); + #endif + } + + void* alloc_node() + { + void* result = _alloc->allocate_nothrow(sizeof(xpath_ast_node)); + + if (!result) throw_error_oom(); + + return result; + } + + const char_t* alloc_string(const xpath_lexer_string& value) + { + if (value.begin) + { + size_t length = static_cast(value.end - value.begin); + + char_t* c = static_cast(_alloc->allocate_nothrow((length + 1) * sizeof(char_t))); + if (!c) throw_error_oom(); + assert(c); // workaround for clang static analysis + + memcpy(c, value.begin, length * sizeof(char_t)); + c[length] = 0; + + return c; + } + else return 0; + } + + xpath_ast_node* 
parse_function_helper(ast_type_t type0, ast_type_t type1, size_t argc, xpath_ast_node* args[2]) + { + assert(argc <= 1); + + if (argc == 1 && args[0]->rettype() != xpath_type_node_set) throw_error("Function has to be applied to node set"); + + return new (alloc_node()) xpath_ast_node(argc == 0 ? type0 : type1, xpath_type_string, args[0]); + } + + xpath_ast_node* parse_function(const xpath_lexer_string& name, size_t argc, xpath_ast_node* args[2]) + { + switch (name.begin[0]) + { + case 'b': + if (name == PUGIXML_TEXT("boolean") && argc == 1) + return new (alloc_node()) xpath_ast_node(ast_func_boolean, xpath_type_boolean, args[0]); + + break; + + case 'c': + if (name == PUGIXML_TEXT("count") && argc == 1) + { + if (args[0]->rettype() != xpath_type_node_set) throw_error("Function has to be applied to node set"); + return new (alloc_node()) xpath_ast_node(ast_func_count, xpath_type_number, args[0]); + } + else if (name == PUGIXML_TEXT("contains") && argc == 2) + return new (alloc_node()) xpath_ast_node(ast_func_contains, xpath_type_boolean, args[0], args[1]); + else if (name == PUGIXML_TEXT("concat") && argc >= 2) + return new (alloc_node()) xpath_ast_node(ast_func_concat, xpath_type_string, args[0], args[1]); + else if (name == PUGIXML_TEXT("ceiling") && argc == 1) + return new (alloc_node()) xpath_ast_node(ast_func_ceiling, xpath_type_number, args[0]); + + break; + + case 'f': + if (name == PUGIXML_TEXT("false") && argc == 0) + return new (alloc_node()) xpath_ast_node(ast_func_false, xpath_type_boolean); + else if (name == PUGIXML_TEXT("floor") && argc == 1) + return new (alloc_node()) xpath_ast_node(ast_func_floor, xpath_type_number, args[0]); + + break; + + case 'i': + if (name == PUGIXML_TEXT("id") && argc == 1) + return new (alloc_node()) xpath_ast_node(ast_func_id, xpath_type_node_set, args[0]); + + break; + + case 'l': + if (name == PUGIXML_TEXT("last") && argc == 0) + return new (alloc_node()) xpath_ast_node(ast_func_last, xpath_type_number); + else if (name 
== PUGIXML_TEXT("lang") && argc == 1) + return new (alloc_node()) xpath_ast_node(ast_func_lang, xpath_type_boolean, args[0]); + else if (name == PUGIXML_TEXT("local-name") && argc <= 1) + return parse_function_helper(ast_func_local_name_0, ast_func_local_name_1, argc, args); + + break; + + case 'n': + if (name == PUGIXML_TEXT("name") && argc <= 1) + return parse_function_helper(ast_func_name_0, ast_func_name_1, argc, args); + else if (name == PUGIXML_TEXT("namespace-uri") && argc <= 1) + return parse_function_helper(ast_func_namespace_uri_0, ast_func_namespace_uri_1, argc, args); + else if (name == PUGIXML_TEXT("normalize-space") && argc <= 1) + return new (alloc_node()) xpath_ast_node(argc == 0 ? ast_func_normalize_space_0 : ast_func_normalize_space_1, xpath_type_string, args[0], args[1]); + else if (name == PUGIXML_TEXT("not") && argc == 1) + return new (alloc_node()) xpath_ast_node(ast_func_not, xpath_type_boolean, args[0]); + else if (name == PUGIXML_TEXT("number") && argc <= 1) + return new (alloc_node()) xpath_ast_node(argc == 0 ? ast_func_number_0 : ast_func_number_1, xpath_type_number, args[0]); + + break; + + case 'p': + if (name == PUGIXML_TEXT("position") && argc == 0) + return new (alloc_node()) xpath_ast_node(ast_func_position, xpath_type_number); + + break; + + case 'r': + if (name == PUGIXML_TEXT("round") && argc == 1) + return new (alloc_node()) xpath_ast_node(ast_func_round, xpath_type_number, args[0]); + + break; + + case 's': + if (name == PUGIXML_TEXT("string") && argc <= 1) + return new (alloc_node()) xpath_ast_node(argc == 0 ? ast_func_string_0 : ast_func_string_1, xpath_type_string, args[0]); + else if (name == PUGIXML_TEXT("string-length") && argc <= 1) + return new (alloc_node()) xpath_ast_node(argc == 0 ? 
ast_func_string_length_0 : ast_func_string_length_1, xpath_type_number, args[0]); + else if (name == PUGIXML_TEXT("starts-with") && argc == 2) + return new (alloc_node()) xpath_ast_node(ast_func_starts_with, xpath_type_boolean, args[0], args[1]); + else if (name == PUGIXML_TEXT("substring-before") && argc == 2) + return new (alloc_node()) xpath_ast_node(ast_func_substring_before, xpath_type_string, args[0], args[1]); + else if (name == PUGIXML_TEXT("substring-after") && argc == 2) + return new (alloc_node()) xpath_ast_node(ast_func_substring_after, xpath_type_string, args[0], args[1]); + else if (name == PUGIXML_TEXT("substring") && (argc == 2 || argc == 3)) + return new (alloc_node()) xpath_ast_node(argc == 2 ? ast_func_substring_2 : ast_func_substring_3, xpath_type_string, args[0], args[1]); + else if (name == PUGIXML_TEXT("sum") && argc == 1) + { + if (args[0]->rettype() != xpath_type_node_set) throw_error("Function has to be applied to node set"); + return new (alloc_node()) xpath_ast_node(ast_func_sum, xpath_type_number, args[0]); + } + + break; + + case 't': + if (name == PUGIXML_TEXT("translate") && argc == 3) + return new (alloc_node()) xpath_ast_node(ast_func_translate, xpath_type_string, args[0], args[1]); + else if (name == PUGIXML_TEXT("true") && argc == 0) + return new (alloc_node()) xpath_ast_node(ast_func_true, xpath_type_boolean); + + break; + + default: + break; + } + + throw_error("Unrecognized function or wrong parameter count"); + + return 0; + } + + axis_t parse_axis_name(const xpath_lexer_string& name, bool& specified) + { + specified = true; + + switch (name.begin[0]) + { + case 'a': + if (name == PUGIXML_TEXT("ancestor")) + return axis_ancestor; + else if (name == PUGIXML_TEXT("ancestor-or-self")) + return axis_ancestor_or_self; + else if (name == PUGIXML_TEXT("attribute")) + return axis_attribute; + + break; + + case 'c': + if (name == PUGIXML_TEXT("child")) + return axis_child; + + break; + + case 'd': + if (name == 
PUGIXML_TEXT("descendant")) + return axis_descendant; + else if (name == PUGIXML_TEXT("descendant-or-self")) + return axis_descendant_or_self; + + break; + + case 'f': + if (name == PUGIXML_TEXT("following")) + return axis_following; + else if (name == PUGIXML_TEXT("following-sibling")) + return axis_following_sibling; + + break; + + case 'n': + if (name == PUGIXML_TEXT("namespace")) + return axis_namespace; + + break; + + case 'p': + if (name == PUGIXML_TEXT("parent")) + return axis_parent; + else if (name == PUGIXML_TEXT("preceding")) + return axis_preceding; + else if (name == PUGIXML_TEXT("preceding-sibling")) + return axis_preceding_sibling; + + break; + + case 's': + if (name == PUGIXML_TEXT("self")) + return axis_self; + + break; + + default: + break; + } + + specified = false; + return axis_child; + } + + nodetest_t parse_node_test_type(const xpath_lexer_string& name) + { + switch (name.begin[0]) + { + case 'c': + if (name == PUGIXML_TEXT("comment")) + return nodetest_type_comment; + + break; + + case 'n': + if (name == PUGIXML_TEXT("node")) + return nodetest_type_node; + + break; + + case 'p': + if (name == PUGIXML_TEXT("processing-instruction")) + return nodetest_type_pi; + + break; + + case 't': + if (name == PUGIXML_TEXT("text")) + return nodetest_type_text; + + break; + + default: + break; + } + + return nodetest_none; + } + + // PrimaryExpr ::= VariableReference | '(' Expr ')' | Literal | Number | FunctionCall + xpath_ast_node* parse_primary_expression() + { + switch (_lexer.current()) + { + case lex_var_ref: + { + xpath_lexer_string name = _lexer.contents(); + + if (!_variables) + throw_error("Unknown variable: variable set is not provided"); + + xpath_variable* var = get_variable_scratch(_scratch, _variables, name.begin, name.end); + + if (!var) + throw_error("Unknown variable: variable set does not contain the given name"); + + _lexer.next(); + + return new (alloc_node()) xpath_ast_node(ast_variable, var->type(), var); + } + + case lex_open_brace: 
+ { + _lexer.next(); + + xpath_ast_node* n = parse_expression(); + + if (_lexer.current() != lex_close_brace) + throw_error("Unmatched braces"); + + _lexer.next(); + + return n; + } + + case lex_quoted_string: + { + const char_t* value = alloc_string(_lexer.contents()); + + xpath_ast_node* n = new (alloc_node()) xpath_ast_node(ast_string_constant, xpath_type_string, value); + _lexer.next(); + + return n; + } + + case lex_number: + { + double value = 0; + + if (!convert_string_to_number_scratch(_scratch, _lexer.contents().begin, _lexer.contents().end, &value)) + throw_error_oom(); + + xpath_ast_node* n = new (alloc_node()) xpath_ast_node(ast_number_constant, xpath_type_number, value); + _lexer.next(); + + return n; + } + + case lex_string: + { + xpath_ast_node* args[2] = {0}; + size_t argc = 0; + + xpath_lexer_string function = _lexer.contents(); + _lexer.next(); + + xpath_ast_node* last_arg = 0; + + if (_lexer.current() != lex_open_brace) + throw_error("Unrecognized function call"); + _lexer.next(); + + if (_lexer.current() != lex_close_brace) + args[argc++] = parse_expression(); + + while (_lexer.current() != lex_close_brace) + { + if (_lexer.current() != lex_comma) + throw_error("No comma between function arguments"); + _lexer.next(); + + xpath_ast_node* n = parse_expression(); + + if (argc < 2) args[argc] = n; + else last_arg->set_next(n); + + argc++; + last_arg = n; + } + + _lexer.next(); + + return parse_function(function, argc, args); + } + + default: + throw_error("Unrecognizable primary expression"); + + return 0; + } + } + + // FilterExpr ::= PrimaryExpr | FilterExpr Predicate + // Predicate ::= '[' PredicateExpr ']' + // PredicateExpr ::= Expr + xpath_ast_node* parse_filter_expression() + { + xpath_ast_node* n = parse_primary_expression(); + + while (_lexer.current() == lex_open_square_brace) + { + _lexer.next(); + + xpath_ast_node* expr = parse_expression(); + + if (n->rettype() != xpath_type_node_set) throw_error("Predicate has to be applied to node 
set"); + + n = new (alloc_node()) xpath_ast_node(ast_filter, n, expr, predicate_default); + + if (_lexer.current() != lex_close_square_brace) + throw_error("Unmatched square brace"); + + _lexer.next(); + } + + return n; + } + + // Step ::= AxisSpecifier NodeTest Predicate* | AbbreviatedStep + // AxisSpecifier ::= AxisName '::' | '@'? + // NodeTest ::= NameTest | NodeType '(' ')' | 'processing-instruction' '(' Literal ')' + // NameTest ::= '*' | NCName ':' '*' | QName + // AbbreviatedStep ::= '.' | '..' + xpath_ast_node* parse_step(xpath_ast_node* set) + { + if (set && set->rettype() != xpath_type_node_set) + throw_error("Step has to be applied to node set"); + + bool axis_specified = false; + axis_t axis = axis_child; // implied child axis + + if (_lexer.current() == lex_axis_attribute) + { + axis = axis_attribute; + axis_specified = true; + + _lexer.next(); + } + else if (_lexer.current() == lex_dot) + { + _lexer.next(); + + return new (alloc_node()) xpath_ast_node(ast_step, set, axis_self, nodetest_type_node, 0); + } + else if (_lexer.current() == lex_double_dot) + { + _lexer.next(); + + return new (alloc_node()) xpath_ast_node(ast_step, set, axis_parent, nodetest_type_node, 0); + } + + nodetest_t nt_type = nodetest_none; + xpath_lexer_string nt_name; + + if (_lexer.current() == lex_string) + { + // node name test + nt_name = _lexer.contents(); + _lexer.next(); + + // was it an axis name? 
+ if (_lexer.current() == lex_double_colon) + { + // parse axis name + if (axis_specified) throw_error("Two axis specifiers in one step"); + + axis = parse_axis_name(nt_name, axis_specified); + + if (!axis_specified) throw_error("Unknown axis"); + + // read actual node test + _lexer.next(); + + if (_lexer.current() == lex_multiply) + { + nt_type = nodetest_all; + nt_name = xpath_lexer_string(); + _lexer.next(); + } + else if (_lexer.current() == lex_string) + { + nt_name = _lexer.contents(); + _lexer.next(); + } + else throw_error("Unrecognized node test"); + } + + if (nt_type == nodetest_none) + { + // node type test or processing-instruction + if (_lexer.current() == lex_open_brace) + { + _lexer.next(); + + if (_lexer.current() == lex_close_brace) + { + _lexer.next(); + + nt_type = parse_node_test_type(nt_name); + + if (nt_type == nodetest_none) throw_error("Unrecognized node type"); + + nt_name = xpath_lexer_string(); + } + else if (nt_name == PUGIXML_TEXT("processing-instruction")) + { + if (_lexer.current() != lex_quoted_string) + throw_error("Only literals are allowed as arguments to processing-instruction()"); + + nt_type = nodetest_pi; + nt_name = _lexer.contents(); + _lexer.next(); + + if (_lexer.current() != lex_close_brace) + throw_error("Unmatched brace near processing-instruction()"); + _lexer.next(); + } + else + throw_error("Unmatched brace near node type test"); + + } + // QName or NCName:* + else + { + if (nt_name.end - nt_name.begin > 2 && nt_name.end[-2] == ':' && nt_name.end[-1] == '*') // NCName:* + { + nt_name.end--; // erase * + + nt_type = nodetest_all_in_namespace; + } + else nt_type = nodetest_name; + } + } + } + else if (_lexer.current() == lex_multiply) + { + nt_type = nodetest_all; + _lexer.next(); + } + else throw_error("Unrecognized node test"); + + xpath_ast_node* n = new (alloc_node()) xpath_ast_node(ast_step, set, axis, nt_type, alloc_string(nt_name)); + + xpath_ast_node* last = 0; + + while (_lexer.current() == 
lex_open_square_brace) + { + _lexer.next(); + + xpath_ast_node* expr = parse_expression(); + + xpath_ast_node* pred = new (alloc_node()) xpath_ast_node(ast_predicate, 0, expr, predicate_default); + + if (_lexer.current() != lex_close_square_brace) + throw_error("Unmatched square brace"); + _lexer.next(); + + if (last) last->set_next(pred); + else n->set_right(pred); + + last = pred; + } + + return n; + } + + // RelativeLocationPath ::= Step | RelativeLocationPath '/' Step | RelativeLocationPath '//' Step + xpath_ast_node* parse_relative_location_path(xpath_ast_node* set) + { + xpath_ast_node* n = parse_step(set); + + while (_lexer.current() == lex_slash || _lexer.current() == lex_double_slash) + { + lexeme_t l = _lexer.current(); + _lexer.next(); + + if (l == lex_double_slash) + n = new (alloc_node()) xpath_ast_node(ast_step, n, axis_descendant_or_self, nodetest_type_node, 0); + + n = parse_step(n); + } + + return n; + } + + // LocationPath ::= RelativeLocationPath | AbsoluteLocationPath + // AbsoluteLocationPath ::= '/' RelativeLocationPath? 
| '//' RelativeLocationPath + xpath_ast_node* parse_location_path() + { + if (_lexer.current() == lex_slash) + { + _lexer.next(); + + xpath_ast_node* n = new (alloc_node()) xpath_ast_node(ast_step_root, xpath_type_node_set); + + // relative location path can start from axis_attribute, dot, double_dot, multiply and string lexemes; any other lexeme means standalone root path + lexeme_t l = _lexer.current(); + + if (l == lex_string || l == lex_axis_attribute || l == lex_dot || l == lex_double_dot || l == lex_multiply) + return parse_relative_location_path(n); + else + return n; + } + else if (_lexer.current() == lex_double_slash) + { + _lexer.next(); + + xpath_ast_node* n = new (alloc_node()) xpath_ast_node(ast_step_root, xpath_type_node_set); + n = new (alloc_node()) xpath_ast_node(ast_step, n, axis_descendant_or_self, nodetest_type_node, 0); + + return parse_relative_location_path(n); + } + + // else clause moved outside of if because of bogus warning 'control may reach end of non-void function being inlined' in gcc 4.0.1 + return parse_relative_location_path(0); + } + + // PathExpr ::= LocationPath + // | FilterExpr + // | FilterExpr '/' RelativeLocationPath + // | FilterExpr '//' RelativeLocationPath + // UnionExpr ::= PathExpr | UnionExpr '|' PathExpr + // UnaryExpr ::= UnionExpr | '-' UnaryExpr + xpath_ast_node* parse_path_or_unary_expression() + { + // Clarification. + // PathExpr begins with either LocationPath or FilterExpr. + // FilterExpr begins with PrimaryExpr + // PrimaryExpr begins with '$' in case of it being a variable reference, + // '(' in case of it being an expression, string literal, number constant or + // function call. 
+ + if (_lexer.current() == lex_var_ref || _lexer.current() == lex_open_brace || + _lexer.current() == lex_quoted_string || _lexer.current() == lex_number || + _lexer.current() == lex_string) + { + if (_lexer.current() == lex_string) + { + // This is either a function call, or not - if not, we shall proceed with location path + const char_t* state = _lexer.state(); + + while (PUGI__IS_CHARTYPE(*state, ct_space)) ++state; + + if (*state != '(') return parse_location_path(); + + // This looks like a function call; however this still can be a node-test. Check it. + if (parse_node_test_type(_lexer.contents()) != nodetest_none) return parse_location_path(); + } + + xpath_ast_node* n = parse_filter_expression(); + + if (_lexer.current() == lex_slash || _lexer.current() == lex_double_slash) + { + lexeme_t l = _lexer.current(); + _lexer.next(); + + if (l == lex_double_slash) + { + if (n->rettype() != xpath_type_node_set) throw_error("Step has to be applied to node set"); + + n = new (alloc_node()) xpath_ast_node(ast_step, n, axis_descendant_or_self, nodetest_type_node, 0); + } + + // select from location path + return parse_relative_location_path(n); + } + + return n; + } + else if (_lexer.current() == lex_minus) + { + _lexer.next(); + + // precedence 7+ - only parses union expressions + xpath_ast_node* expr = parse_expression_rec(parse_path_or_unary_expression(), 7); + + return new (alloc_node()) xpath_ast_node(ast_op_negate, xpath_type_number, expr); + } + else + return parse_location_path(); + } + + struct binary_op_t + { + ast_type_t asttype; + xpath_value_type rettype; + int precedence; + + binary_op_t(): asttype(ast_unknown), rettype(xpath_type_none), precedence(0) + { + } + + binary_op_t(ast_type_t asttype_, xpath_value_type rettype_, int precedence_): asttype(asttype_), rettype(rettype_), precedence(precedence_) + { + } + + static binary_op_t parse(xpath_lexer& lexer) + { + switch (lexer.current()) + { + case lex_string: + if (lexer.contents() == 
PUGIXML_TEXT("or")) + return binary_op_t(ast_op_or, xpath_type_boolean, 1); + else if (lexer.contents() == PUGIXML_TEXT("and")) + return binary_op_t(ast_op_and, xpath_type_boolean, 2); + else if (lexer.contents() == PUGIXML_TEXT("div")) + return binary_op_t(ast_op_divide, xpath_type_number, 6); + else if (lexer.contents() == PUGIXML_TEXT("mod")) + return binary_op_t(ast_op_mod, xpath_type_number, 6); + else + return binary_op_t(); + + case lex_equal: + return binary_op_t(ast_op_equal, xpath_type_boolean, 3); + + case lex_not_equal: + return binary_op_t(ast_op_not_equal, xpath_type_boolean, 3); + + case lex_less: + return binary_op_t(ast_op_less, xpath_type_boolean, 4); + + case lex_greater: + return binary_op_t(ast_op_greater, xpath_type_boolean, 4); + + case lex_less_or_equal: + return binary_op_t(ast_op_less_or_equal, xpath_type_boolean, 4); + + case lex_greater_or_equal: + return binary_op_t(ast_op_greater_or_equal, xpath_type_boolean, 4); + + case lex_plus: + return binary_op_t(ast_op_add, xpath_type_number, 5); + + case lex_minus: + return binary_op_t(ast_op_subtract, xpath_type_number, 5); + + case lex_multiply: + return binary_op_t(ast_op_multiply, xpath_type_number, 6); + + case lex_union: + return binary_op_t(ast_op_union, xpath_type_node_set, 7); + + default: + return binary_op_t(); + } + } + }; + + xpath_ast_node* parse_expression_rec(xpath_ast_node* lhs, int limit) + { + binary_op_t op = binary_op_t::parse(_lexer); + + while (op.asttype != ast_unknown && op.precedence >= limit) + { + _lexer.next(); + + xpath_ast_node* rhs = parse_path_or_unary_expression(); + + binary_op_t nextop = binary_op_t::parse(_lexer); + + while (nextop.asttype != ast_unknown && nextop.precedence > op.precedence) + { + rhs = parse_expression_rec(rhs, nextop.precedence); + + nextop = binary_op_t::parse(_lexer); + } + + if (op.asttype == ast_op_union && (lhs->rettype() != xpath_type_node_set || rhs->rettype() != xpath_type_node_set)) + throw_error("Union operator has to be applied 
to node sets"); + + lhs = new (alloc_node()) xpath_ast_node(op.asttype, op.rettype, lhs, rhs); + + op = binary_op_t::parse(_lexer); + } + + return lhs; + } + + // Expr ::= OrExpr + // OrExpr ::= AndExpr | OrExpr 'or' AndExpr + // AndExpr ::= EqualityExpr | AndExpr 'and' EqualityExpr + // EqualityExpr ::= RelationalExpr + // | EqualityExpr '=' RelationalExpr + // | EqualityExpr '!=' RelationalExpr + // RelationalExpr ::= AdditiveExpr + // | RelationalExpr '<' AdditiveExpr + // | RelationalExpr '>' AdditiveExpr + // | RelationalExpr '<=' AdditiveExpr + // | RelationalExpr '>=' AdditiveExpr + // AdditiveExpr ::= MultiplicativeExpr + // | AdditiveExpr '+' MultiplicativeExpr + // | AdditiveExpr '-' MultiplicativeExpr + // MultiplicativeExpr ::= UnaryExpr + // | MultiplicativeExpr '*' UnaryExpr + // | MultiplicativeExpr 'div' UnaryExpr + // | MultiplicativeExpr 'mod' UnaryExpr + xpath_ast_node* parse_expression() + { + return parse_expression_rec(parse_path_or_unary_expression(), 0); + } + + xpath_parser(const char_t* query, xpath_variable_set* variables, xpath_allocator* alloc, xpath_parse_result* result): _alloc(alloc), _lexer(query), _query(query), _variables(variables), _result(result) + { + } + + xpath_ast_node* parse() + { + xpath_ast_node* result = parse_expression(); + + if (_lexer.current() != lex_eof) + { + // there are still unparsed tokens left, error + throw_error("Incorrect query"); + } + + return result; + } + + static xpath_ast_node* parse(const char_t* query, xpath_variable_set* variables, xpath_allocator* alloc, xpath_parse_result* result) + { + xpath_parser parser(query, variables, alloc, result); + + #ifdef PUGIXML_NO_EXCEPTIONS + int error = setjmp(parser._error_handler); + + return (error == 0) ? 
parser.parse() : 0; + #else + return parser.parse(); + #endif + } + }; + + struct xpath_query_impl + { + static xpath_query_impl* create() + { + void* memory = xml_memory::allocate(sizeof(xpath_query_impl)); + + return new (memory) xpath_query_impl(); + } + + static void destroy(void* ptr) + { + if (!ptr) return; + + // free all allocated pages + static_cast(ptr)->alloc.release(); + + // free allocator memory (with the first page) + xml_memory::deallocate(ptr); + } + + xpath_query_impl(): root(0), alloc(&block) + { + block.next = 0; + block.capacity = sizeof(block.data); + } + + xpath_ast_node* root; + xpath_allocator alloc; + xpath_memory_block block; + }; + + PUGI__FN xpath_string evaluate_string_impl(xpath_query_impl* impl, const xpath_node& n, xpath_stack_data& sd) + { + if (!impl) return xpath_string(); + + #ifdef PUGIXML_NO_EXCEPTIONS + if (setjmp(sd.error_handler)) return xpath_string(); + #endif + + xpath_context c(n, 1, 1); + + return impl->root->eval_string(c, sd.stack); + } + + PUGI__FN impl::xpath_ast_node* evaluate_node_set_prepare(xpath_query_impl* impl) + { + if (!impl) return 0; + + if (impl->root->rettype() != xpath_type_node_set) + { + #ifdef PUGIXML_NO_EXCEPTIONS + return 0; + #else + xpath_parse_result res; + res.error = "Expression does not evaluate to node set"; + + throw xpath_exception(res); + #endif + } + + return impl->root; + } +PUGI__NS_END + +namespace pugi +{ +#ifndef PUGIXML_NO_EXCEPTIONS + PUGI__FN xpath_exception::xpath_exception(const xpath_parse_result& result_): _result(result_) + { + assert(_result.error); + } + + PUGI__FN const char* xpath_exception::what() const throw() + { + return _result.error; + } + + PUGI__FN const xpath_parse_result& xpath_exception::result() const + { + return _result; + } +#endif + + PUGI__FN xpath_node::xpath_node() + { + } + + PUGI__FN xpath_node::xpath_node(const xml_node& node_): _node(node_) + { + } + + PUGI__FN xpath_node::xpath_node(const xml_attribute& attribute_, const xml_node& parent_): 
_node(attribute_ ? parent_ : xml_node()), _attribute(attribute_) + { + } + + PUGI__FN xml_node xpath_node::node() const + { + return _attribute ? xml_node() : _node; + } + + PUGI__FN xml_attribute xpath_node::attribute() const + { + return _attribute; + } + + PUGI__FN xml_node xpath_node::parent() const + { + return _attribute ? _node : _node.parent(); + } + + PUGI__FN static void unspecified_bool_xpath_node(xpath_node***) + { + } + + PUGI__FN xpath_node::operator xpath_node::unspecified_bool_type() const + { + return (_node || _attribute) ? unspecified_bool_xpath_node : 0; + } + + PUGI__FN bool xpath_node::operator!() const + { + return !(_node || _attribute); + } + + PUGI__FN bool xpath_node::operator==(const xpath_node& n) const + { + return _node == n._node && _attribute == n._attribute; + } + + PUGI__FN bool xpath_node::operator!=(const xpath_node& n) const + { + return _node != n._node || _attribute != n._attribute; + } + +#ifdef __BORLANDC__ + PUGI__FN bool operator&&(const xpath_node& lhs, bool rhs) + { + return (bool)lhs && rhs; + } + + PUGI__FN bool operator||(const xpath_node& lhs, bool rhs) + { + return (bool)lhs || rhs; + } +#endif + + PUGI__FN void xpath_node_set::_assign(const_iterator begin_, const_iterator end_) + { + assert(begin_ <= end_); + + size_t size_ = static_cast(end_ - begin_); + + if (size_ <= 1) + { + // deallocate old buffer + if (_begin != &_storage) impl::xml_memory::deallocate(_begin); + + // use internal buffer + if (begin_ != end_) _storage = *begin_; + + _begin = &_storage; + _end = &_storage + size_; + } + else + { + // make heap copy + xpath_node* storage = static_cast(impl::xml_memory::allocate(size_ * sizeof(xpath_node))); + + if (!storage) + { + #ifdef PUGIXML_NO_EXCEPTIONS + return; + #else + throw std::bad_alloc(); + #endif + } + + memcpy(storage, begin_, size_ * sizeof(xpath_node)); + + // deallocate old buffer + if (_begin != &_storage) impl::xml_memory::deallocate(_begin); + + // finalize + _begin = storage; + _end = 
storage + size_; + } + } + + PUGI__FN xpath_node_set::xpath_node_set(): _type(type_unsorted), _begin(&_storage), _end(&_storage) + { + } + + PUGI__FN xpath_node_set::xpath_node_set(const_iterator begin_, const_iterator end_, type_t type_): _type(type_), _begin(&_storage), _end(&_storage) + { + _assign(begin_, end_); + } + + PUGI__FN xpath_node_set::~xpath_node_set() + { + if (_begin != &_storage) impl::xml_memory::deallocate(_begin); + } + + PUGI__FN xpath_node_set::xpath_node_set(const xpath_node_set& ns): _type(ns._type), _begin(&_storage), _end(&_storage) + { + _assign(ns._begin, ns._end); + } + + PUGI__FN xpath_node_set& xpath_node_set::operator=(const xpath_node_set& ns) + { + if (this == &ns) return *this; + + _type = ns._type; + _assign(ns._begin, ns._end); + + return *this; + } + + PUGI__FN xpath_node_set::type_t xpath_node_set::type() const + { + return _type; + } + + PUGI__FN size_t xpath_node_set::size() const + { + return _end - _begin; + } + + PUGI__FN bool xpath_node_set::empty() const + { + return _begin == _end; + } + + PUGI__FN const xpath_node& xpath_node_set::operator[](size_t index) const + { + assert(index < size()); + return _begin[index]; + } + + PUGI__FN xpath_node_set::const_iterator xpath_node_set::begin() const + { + return _begin; + } + + PUGI__FN xpath_node_set::const_iterator xpath_node_set::end() const + { + return _end; + } + + PUGI__FN void xpath_node_set::sort(bool reverse) + { + _type = impl::xpath_sort(_begin, _end, _type, reverse); + } + + PUGI__FN xpath_node xpath_node_set::first() const + { + return impl::xpath_first(_begin, _end, _type); + } + + PUGI__FN xpath_parse_result::xpath_parse_result(): error("Internal error"), offset(0) + { + } + + PUGI__FN xpath_parse_result::operator bool() const + { + return error == 0; + } + + PUGI__FN const char* xpath_parse_result::description() const + { + return error ? 
error : "No error"; + } + + PUGI__FN xpath_variable::xpath_variable(): _type(xpath_type_none), _next(0) + { + } + + PUGI__FN const char_t* xpath_variable::name() const + { + switch (_type) + { + case xpath_type_node_set: + return static_cast(this)->name; + + case xpath_type_number: + return static_cast(this)->name; + + case xpath_type_string: + return static_cast(this)->name; + + case xpath_type_boolean: + return static_cast(this)->name; + + default: + assert(!"Invalid variable type"); + return 0; + } + } + + PUGI__FN xpath_value_type xpath_variable::type() const + { + return _type; + } + + PUGI__FN bool xpath_variable::get_boolean() const + { + return (_type == xpath_type_boolean) ? static_cast(this)->value : false; + } + + PUGI__FN double xpath_variable::get_number() const + { + return (_type == xpath_type_number) ? static_cast(this)->value : impl::gen_nan(); + } + + PUGI__FN const char_t* xpath_variable::get_string() const + { + const char_t* value = (_type == xpath_type_string) ? static_cast(this)->value : 0; + return value ? value : PUGIXML_TEXT(""); + } + + PUGI__FN const xpath_node_set& xpath_variable::get_node_set() const + { + return (_type == xpath_type_node_set) ? 
static_cast(this)->value : impl::dummy_node_set; + } + + PUGI__FN bool xpath_variable::set(bool value) + { + if (_type != xpath_type_boolean) return false; + + static_cast(this)->value = value; + return true; + } + + PUGI__FN bool xpath_variable::set(double value) + { + if (_type != xpath_type_number) return false; + + static_cast(this)->value = value; + return true; + } + + PUGI__FN bool xpath_variable::set(const char_t* value) + { + if (_type != xpath_type_string) return false; + + impl::xpath_variable_string* var = static_cast(this); + + // duplicate string + size_t size = (impl::strlength(value) + 1) * sizeof(char_t); + + char_t* copy = static_cast(impl::xml_memory::allocate(size)); + if (!copy) return false; + + memcpy(copy, value, size); + + // replace old string + if (var->value) impl::xml_memory::deallocate(var->value); + var->value = copy; + + return true; + } + + PUGI__FN bool xpath_variable::set(const xpath_node_set& value) + { + if (_type != xpath_type_node_set) return false; + + static_cast(this)->value = value; + return true; + } + + PUGI__FN xpath_variable_set::xpath_variable_set() + { + for (size_t i = 0; i < sizeof(_data) / sizeof(_data[0]); ++i) _data[i] = 0; + } + + PUGI__FN xpath_variable_set::~xpath_variable_set() + { + for (size_t i = 0; i < sizeof(_data) / sizeof(_data[0]); ++i) + { + xpath_variable* var = _data[i]; + + while (var) + { + xpath_variable* next = var->_next; + + impl::delete_xpath_variable(var->_type, var); + + var = next; + } + } + } + + PUGI__FN xpath_variable* xpath_variable_set::find(const char_t* name) const + { + const size_t hash_size = sizeof(_data) / sizeof(_data[0]); + size_t hash = impl::hash_string(name) % hash_size; + + // look for existing variable + for (xpath_variable* var = _data[hash]; var; var = var->_next) + if (impl::strequal(var->name(), name)) + return var; + + return 0; + } + + PUGI__FN xpath_variable* xpath_variable_set::add(const char_t* name, xpath_value_type type) + { + const size_t hash_size = 
sizeof(_data) / sizeof(_data[0]); + size_t hash = impl::hash_string(name) % hash_size; + + // look for existing variable + for (xpath_variable* var = _data[hash]; var; var = var->_next) + if (impl::strequal(var->name(), name)) + return var->type() == type ? var : 0; + + // add new variable + xpath_variable* result = impl::new_xpath_variable(type, name); + + if (result) + { + result->_type = type; + result->_next = _data[hash]; + + _data[hash] = result; + } + + return result; + } + + PUGI__FN bool xpath_variable_set::set(const char_t* name, bool value) + { + xpath_variable* var = add(name, xpath_type_boolean); + return var ? var->set(value) : false; + } + + PUGI__FN bool xpath_variable_set::set(const char_t* name, double value) + { + xpath_variable* var = add(name, xpath_type_number); + return var ? var->set(value) : false; + } + + PUGI__FN bool xpath_variable_set::set(const char_t* name, const char_t* value) + { + xpath_variable* var = add(name, xpath_type_string); + return var ? var->set(value) : false; + } + + PUGI__FN bool xpath_variable_set::set(const char_t* name, const xpath_node_set& value) + { + xpath_variable* var = add(name, xpath_type_node_set); + return var ? 
var->set(value) : false; + } + + PUGI__FN xpath_variable* xpath_variable_set::get(const char_t* name) + { + return find(name); + } + + PUGI__FN const xpath_variable* xpath_variable_set::get(const char_t* name) const + { + return find(name); + } + + PUGI__FN xpath_query::xpath_query(const char_t* query, xpath_variable_set* variables): _impl(0) + { + impl::xpath_query_impl* qimpl = impl::xpath_query_impl::create(); + + if (!qimpl) + { + #ifdef PUGIXML_NO_EXCEPTIONS + _result.error = "Out of memory"; + #else + throw std::bad_alloc(); + #endif + } + else + { + impl::buffer_holder impl_holder(qimpl, impl::xpath_query_impl::destroy); + + qimpl->root = impl::xpath_parser::parse(query, variables, &qimpl->alloc, &_result); + + if (qimpl->root) + { + qimpl->root->optimize(&qimpl->alloc); + + _impl = static_cast(impl_holder.release()); + _result.error = 0; + } + } + } + + PUGI__FN xpath_query::~xpath_query() + { + impl::xpath_query_impl::destroy(_impl); + } + + PUGI__FN xpath_value_type xpath_query::return_type() const + { + if (!_impl) return xpath_type_none; + + return static_cast(_impl)->root->rettype(); + } + + PUGI__FN bool xpath_query::evaluate_boolean(const xpath_node& n) const + { + if (!_impl) return false; + + impl::xpath_context c(n, 1, 1); + impl::xpath_stack_data sd; + + #ifdef PUGIXML_NO_EXCEPTIONS + if (setjmp(sd.error_handler)) return false; + #endif + + return static_cast(_impl)->root->eval_boolean(c, sd.stack); + } + + PUGI__FN double xpath_query::evaluate_number(const xpath_node& n) const + { + if (!_impl) return impl::gen_nan(); + + impl::xpath_context c(n, 1, 1); + impl::xpath_stack_data sd; + + #ifdef PUGIXML_NO_EXCEPTIONS + if (setjmp(sd.error_handler)) return impl::gen_nan(); + #endif + + return static_cast(_impl)->root->eval_number(c, sd.stack); + } + +#ifndef PUGIXML_NO_STL + PUGI__FN string_t xpath_query::evaluate_string(const xpath_node& n) const + { + impl::xpath_stack_data sd; + + impl::xpath_string r = 
impl::evaluate_string_impl(static_cast(_impl), n, sd); + + return string_t(r.c_str(), r.length()); + } +#endif + + PUGI__FN size_t xpath_query::evaluate_string(char_t* buffer, size_t capacity, const xpath_node& n) const + { + impl::xpath_stack_data sd; + + impl::xpath_string r = impl::evaluate_string_impl(static_cast(_impl), n, sd); + + size_t full_size = r.length() + 1; + + if (capacity > 0) + { + size_t size = (full_size < capacity) ? full_size : capacity; + assert(size > 0); + + memcpy(buffer, r.c_str(), (size - 1) * sizeof(char_t)); + buffer[size - 1] = 0; + } + + return full_size; + } + + PUGI__FN xpath_node_set xpath_query::evaluate_node_set(const xpath_node& n) const + { + impl::xpath_ast_node* root = impl::evaluate_node_set_prepare(static_cast(_impl)); + if (!root) return xpath_node_set(); + + impl::xpath_context c(n, 1, 1); + impl::xpath_stack_data sd; + + #ifdef PUGIXML_NO_EXCEPTIONS + if (setjmp(sd.error_handler)) return xpath_node_set(); + #endif + + impl::xpath_node_set_raw r = root->eval_node_set(c, sd.stack, impl::nodeset_eval_all); + + return xpath_node_set(r.begin(), r.end(), r.type()); + } + + PUGI__FN xpath_node xpath_query::evaluate_node(const xpath_node& n) const + { + impl::xpath_ast_node* root = impl::evaluate_node_set_prepare(static_cast(_impl)); + if (!root) return xpath_node(); + + impl::xpath_context c(n, 1, 1); + impl::xpath_stack_data sd; + + #ifdef PUGIXML_NO_EXCEPTIONS + if (setjmp(sd.error_handler)) return xpath_node(); + #endif + + impl::xpath_node_set_raw r = root->eval_node_set(c, sd.stack, impl::nodeset_eval_first); + + return r.first(); + } + + PUGI__FN const xpath_parse_result& xpath_query::result() const + { + return _result; + } + + PUGI__FN static void unspecified_bool_xpath_query(xpath_query***) + { + } + + PUGI__FN xpath_query::operator xpath_query::unspecified_bool_type() const + { + return _impl ? 
unspecified_bool_xpath_query : 0; + } + + PUGI__FN bool xpath_query::operator!() const + { + return !_impl; + } + + PUGI__FN xpath_node xml_node::select_node(const char_t* query, xpath_variable_set* variables) const + { + xpath_query q(query, variables); + return select_node(q); + } + + PUGI__FN xpath_node xml_node::select_node(const xpath_query& query) const + { + return query.evaluate_node(*this); + } + + PUGI__FN xpath_node_set xml_node::select_nodes(const char_t* query, xpath_variable_set* variables) const + { + xpath_query q(query, variables); + return select_nodes(q); + } + + PUGI__FN xpath_node_set xml_node::select_nodes(const xpath_query& query) const + { + return query.evaluate_node_set(*this); + } + + PUGI__FN xpath_node xml_node::select_single_node(const char_t* query, xpath_variable_set* variables) const + { + xpath_query q(query, variables); + return select_single_node(q); + } + + PUGI__FN xpath_node xml_node::select_single_node(const xpath_query& query) const + { + return query.evaluate_node(*this); + } +} + +#endif + +#ifdef __BORLANDC__ +# pragma option pop +#endif + +// Intel C++ does not properly keep warning state for function templates, +// so popping warning state at the end of translation unit leads to warnings in the middle. 
+#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) +# pragma warning(pop) +#endif + +// Undefine all local macros (makes sure we're not leaking macros in header-only mode) +#undef PUGI__NO_INLINE +#undef PUGI__UNLIKELY +#undef PUGI__STATIC_ASSERT +#undef PUGI__DMC_VOLATILE +#undef PUGI__MSVC_CRT_VERSION +#undef PUGI__NS_BEGIN +#undef PUGI__NS_END +#undef PUGI__FN +#undef PUGI__FN_NO_INLINE +#undef PUGI__NODETYPE +#undef PUGI__IS_CHARTYPE_IMPL +#undef PUGI__IS_CHARTYPE +#undef PUGI__IS_CHARTYPEX +#undef PUGI__ENDSWITH +#undef PUGI__SKIPWS +#undef PUGI__OPTSET +#undef PUGI__PUSHNODE +#undef PUGI__POPNODE +#undef PUGI__SCANFOR +#undef PUGI__SCANWHILE +#undef PUGI__SCANWHILE_UNROLL +#undef PUGI__ENDSEG +#undef PUGI__THROW_ERROR +#undef PUGI__CHECK_ERROR + +#endif + +/** + * Copyright (c) 2006-2014 Arseny Kapoulkine + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ diff --git a/src/pugixml/pugixml.hpp b/src/pugixml/pugixml.hpp new file mode 100644 index 0000000..9798b46 --- /dev/null +++ b/src/pugixml/pugixml.hpp @@ -0,0 +1,1366 @@ +/** + * pugixml parser - version 1.5 + * -------------------------------------------------------- + * Copyright (C) 2006-2014, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com) + * Report bugs and download new versions at http://pugixml.org/ + * + * This library is distributed under the MIT License. See notice at the end + * of this file. + * + * This work is based on the pugxml parser, which is: + * Copyright (C) 2003, by Kristen Wegner (kristen@tima.net) + */ + +#ifndef PUGIXML_VERSION +// Define version macro; evaluates to major * 100 + minor so that it's safe to use in less-than comparisons +# define PUGIXML_VERSION 150 +#endif + +// Include user configuration file (this can define various configuration macros) +#include "pugiconfig.hpp" + +#ifndef HEADER_PUGIXML_HPP +#define HEADER_PUGIXML_HPP + +// Include stddef.h for size_t and ptrdiff_t +#include + +// Include exception header for XPath +#if !defined(PUGIXML_NO_XPATH) && !defined(PUGIXML_NO_EXCEPTIONS) +# include +#endif + +// Include STL headers +#ifndef PUGIXML_NO_STL +# include +# include +# include +#endif + +// Macro for deprecated features +#ifndef PUGIXML_DEPRECATED +# if defined(__GNUC__) +# define PUGIXML_DEPRECATED __attribute__((deprecated)) +# elif defined(_MSC_VER) && _MSC_VER >= 1300 +# define PUGIXML_DEPRECATED __declspec(deprecated) +# else +# define PUGIXML_DEPRECATED +# endif +#endif + +// If no API is defined, assume default +#ifndef PUGIXML_API +# define PUGIXML_API +#endif + +// If no API for classes is defined, assume default +#ifndef PUGIXML_CLASS +# define PUGIXML_CLASS PUGIXML_API +#endif + +// If no API for functions is defined, assume default +#ifndef PUGIXML_FUNCTION +# define PUGIXML_FUNCTION PUGIXML_API +#endif + +// If the platform is known to have long long support, enable long long functions +#ifndef 
PUGIXML_HAS_LONG_LONG +# if defined(__cplusplus) && __cplusplus >= 201103 +# define PUGIXML_HAS_LONG_LONG +# elif defined(_MSC_VER) && _MSC_VER >= 1400 +# define PUGIXML_HAS_LONG_LONG +# endif +#endif + +// Character interface macros +#ifdef PUGIXML_WCHAR_MODE +# define PUGIXML_TEXT(t) L ## t +# define PUGIXML_CHAR wchar_t +#else +# define PUGIXML_TEXT(t) t +# define PUGIXML_CHAR char +#endif + +namespace pugi +{ + // Character type used for all internal storage and operations; depends on PUGIXML_WCHAR_MODE + typedef PUGIXML_CHAR char_t; + +#ifndef PUGIXML_NO_STL + // String type used for operations that work with STL string; depends on PUGIXML_WCHAR_MODE + typedef std::basic_string, std::allocator > string_t; +#endif +} + +// The PugiXML namespace +namespace pugi +{ + // Tree node types + enum xml_node_type + { + node_null, // Empty (null) node handle + node_document, // A document tree's absolute root + node_element, // Element tag, i.e. '' + node_pcdata, // Plain character data, i.e. 'text' + node_cdata, // Character data, i.e. '' + node_comment, // Comment tag, i.e. '' + node_pi, // Processing instruction, i.e. '' + node_declaration, // Document declaration, i.e. '' + node_doctype // Document type declaration, i.e. '' + }; + + // Parsing options + + // Minimal parsing mode (equivalent to turning all other flags off). + // Only elements and PCDATA sections are added to the DOM tree, no text conversions are performed. + const unsigned int parse_minimal = 0x0000; + + // This flag determines if processing instructions (node_pi) are added to the DOM tree. This flag is off by default. + const unsigned int parse_pi = 0x0001; + + // This flag determines if comments (node_comment) are added to the DOM tree. This flag is off by default. + const unsigned int parse_comments = 0x0002; + + // This flag determines if CDATA sections (node_cdata) are added to the DOM tree. This flag is on by default. 
+ const unsigned int parse_cdata = 0x0004; + + // This flag determines if plain character data (node_pcdata) that consist only of whitespace are added to the DOM tree. + // This flag is off by default; turning it on usually results in slower parsing and more memory consumption. + const unsigned int parse_ws_pcdata = 0x0008; + + // This flag determines if character and entity references are expanded during parsing. This flag is on by default. + const unsigned int parse_escapes = 0x0010; + + // This flag determines if EOL characters are normalized (converted to #xA) during parsing. This flag is on by default. + const unsigned int parse_eol = 0x0020; + + // This flag determines if attribute values are normalized using CDATA normalization rules during parsing. This flag is on by default. + const unsigned int parse_wconv_attribute = 0x0040; + + // This flag determines if attribute values are normalized using NMTOKENS normalization rules during parsing. This flag is off by default. + const unsigned int parse_wnorm_attribute = 0x0080; + + // This flag determines if document declaration (node_declaration) is added to the DOM tree. This flag is off by default. + const unsigned int parse_declaration = 0x0100; + + // This flag determines if document type declaration (node_doctype) is added to the DOM tree. This flag is off by default. + const unsigned int parse_doctype = 0x0200; + + // This flag determines if plain character data (node_pcdata) that is the only child of the parent node and that consists only + // of whitespace is added to the DOM tree. + // This flag is off by default; turning it on may result in slower parsing and more memory consumption. + const unsigned int parse_ws_pcdata_single = 0x0400; + + // This flag determines if leading and trailing whitespace is to be removed from plain character data. This flag is off by default. 
+ const unsigned int parse_trim_pcdata = 0x0800; + + // This flag determines if plain character data that does not have a parent node is added to the DOM tree, and if an empty document + // is a valid document. This flag is off by default. + const unsigned int parse_fragment = 0x1000; + + // The default parsing mode. + // Elements, PCDATA and CDATA sections are added to the DOM tree, character/reference entities are expanded, + // End-of-Line characters are normalized, attribute values are normalized using CDATA normalization rules. + const unsigned int parse_default = parse_cdata | parse_escapes | parse_wconv_attribute | parse_eol; + + // The full parsing mode. + // Nodes of all types are added to the DOM tree, character/reference entities are expanded, + // End-of-Line characters are normalized, attribute values are normalized using CDATA normalization rules. + const unsigned int parse_full = parse_default | parse_pi | parse_comments | parse_declaration | parse_doctype; + + // These flags determine the encoding of input data for XML document + enum xml_encoding + { + encoding_auto, // Auto-detect input encoding using BOM or < / class xml_object_range + { + public: + typedef It const_iterator; + typedef It iterator; + + xml_object_range(It b, It e): _begin(b), _end(e) + { + } + + It begin() const { return _begin; } + It end() const { return _end; } + + private: + It _begin, _end; + }; + + // Writer interface for node printing (see xml_node::print) + class PUGIXML_CLASS xml_writer + { + public: + virtual ~xml_writer() {} + + // Write memory chunk into stream/file/whatever + virtual void write(const void* data, size_t size) = 0; + }; + + // xml_writer implementation for FILE* + class PUGIXML_CLASS xml_writer_file: public xml_writer + { + public: + // Construct writer from a FILE* object; void* is used to avoid header dependencies on stdio + xml_writer_file(void* file); + + virtual void write(const void* data, size_t size); + + private: + void* file; + }; + + #ifndef 
PUGIXML_NO_STL + // xml_writer implementation for streams + class PUGIXML_CLASS xml_writer_stream: public xml_writer + { + public: + // Construct writer from an output stream object + xml_writer_stream(std::basic_ostream >& stream); + xml_writer_stream(std::basic_ostream >& stream); + + virtual void write(const void* data, size_t size); + + private: + std::basic_ostream >* narrow_stream; + std::basic_ostream >* wide_stream; + }; + #endif + + // A light-weight handle for manipulating attributes in DOM tree + class PUGIXML_CLASS xml_attribute + { + friend class xml_attribute_iterator; + friend class xml_node; + + private: + xml_attribute_struct* _attr; + + typedef void (*unspecified_bool_type)(xml_attribute***); + + public: + // Default constructor. Constructs an empty attribute. + xml_attribute(); + + // Constructs attribute from internal pointer + explicit xml_attribute(xml_attribute_struct* attr); + + // Safe bool conversion operator + operator unspecified_bool_type() const; + + // Borland C++ workaround + bool operator!() const; + + // Comparison operators (compares wrapped attribute pointers) + bool operator==(const xml_attribute& r) const; + bool operator!=(const xml_attribute& r) const; + bool operator<(const xml_attribute& r) const; + bool operator>(const xml_attribute& r) const; + bool operator<=(const xml_attribute& r) const; + bool operator>=(const xml_attribute& r) const; + + // Check if attribute is empty + bool empty() const; + + // Get attribute name/value, or "" if attribute is empty + const char_t* name() const; + const char_t* value() const; + + // Get attribute value, or the default value if attribute is empty + const char_t* as_string(const char_t* def = PUGIXML_TEXT("")) const; + + // Get attribute value as a number, or the default value if conversion did not succeed or attribute is empty + int as_int(int def = 0) const; + unsigned int as_uint(unsigned int def = 0) const; + double as_double(double def = 0) const; + float as_float(float def = 0) 
const; + + #ifdef PUGIXML_HAS_LONG_LONG + long long as_llong(long long def = 0) const; + unsigned long long as_ullong(unsigned long long def = 0) const; + #endif + + // Get attribute value as bool (returns true if first character is in '1tTyY' set), or the default value if attribute is empty + bool as_bool(bool def = false) const; + + // Set attribute name/value (returns false if attribute is empty or there is not enough memory) + bool set_name(const char_t* rhs); + bool set_value(const char_t* rhs); + + // Set attribute value with type conversion (numbers are converted to strings, boolean is converted to "true"/"false") + bool set_value(int rhs); + bool set_value(unsigned int rhs); + bool set_value(double rhs); + bool set_value(float rhs); + bool set_value(bool rhs); + + #ifdef PUGIXML_HAS_LONG_LONG + bool set_value(long long rhs); + bool set_value(unsigned long long rhs); + #endif + + // Set attribute value (equivalent to set_value without error checking) + xml_attribute& operator=(const char_t* rhs); + xml_attribute& operator=(int rhs); + xml_attribute& operator=(unsigned int rhs); + xml_attribute& operator=(double rhs); + xml_attribute& operator=(float rhs); + xml_attribute& operator=(bool rhs); + + #ifdef PUGIXML_HAS_LONG_LONG + xml_attribute& operator=(long long rhs); + xml_attribute& operator=(unsigned long long rhs); + #endif + + // Get next/previous attribute in the attribute list of the parent node + xml_attribute next_attribute() const; + xml_attribute previous_attribute() const; + + // Get hash value (unique for handles to the same object) + size_t hash_value() const; + + // Get internal pointer + xml_attribute_struct* internal_object() const; + }; + +#ifdef __BORLANDC__ + // Borland C++ workaround + bool PUGIXML_FUNCTION operator&&(const xml_attribute& lhs, bool rhs); + bool PUGIXML_FUNCTION operator||(const xml_attribute& lhs, bool rhs); +#endif + + // A light-weight handle for manipulating nodes in DOM tree + class PUGIXML_CLASS xml_node + { + friend 
class xml_attribute_iterator; + friend class xml_node_iterator; + friend class xml_named_node_iterator; + + protected: + xml_node_struct* _root; + + typedef void (*unspecified_bool_type)(xml_node***); + + public: + // Default constructor. Constructs an empty node. + xml_node(); + + // Constructs node from internal pointer + explicit xml_node(xml_node_struct* p); + + // Safe bool conversion operator + operator unspecified_bool_type() const; + + // Borland C++ workaround + bool operator!() const; + + // Comparison operators (compares wrapped node pointers) + bool operator==(const xml_node& r) const; + bool operator!=(const xml_node& r) const; + bool operator<(const xml_node& r) const; + bool operator>(const xml_node& r) const; + bool operator<=(const xml_node& r) const; + bool operator>=(const xml_node& r) const; + + // Check if node is empty. + bool empty() const; + + // Get node type + xml_node_type type() const; + + // Get node name, or "" if node is empty or it has no name + const char_t* name() const; + + // Get node value, or "" if node is empty or it has no value + // Note: For text node.value() does not return "text"! Use child_value() or text() methods to access text inside nodes. 
+ const char_t* value() const; + + // Get attribute list + xml_attribute first_attribute() const; + xml_attribute last_attribute() const; + + // Get children list + xml_node first_child() const; + xml_node last_child() const; + + // Get next/previous sibling in the children list of the parent node + xml_node next_sibling() const; + xml_node previous_sibling() const; + + // Get parent node + xml_node parent() const; + + // Get root of DOM tree this node belongs to + xml_node root() const; + + // Get text object for the current node + xml_text text() const; + + // Get child, attribute or next/previous sibling with the specified name + xml_node child(const char_t* name) const; + xml_attribute attribute(const char_t* name) const; + xml_node next_sibling(const char_t* name) const; + xml_node previous_sibling(const char_t* name) const; + + // Get child value of current node; that is, value of the first child node of type PCDATA/CDATA + const char_t* child_value() const; + + // Get child value of child with specified name. Equivalent to child(name).child_value(). + const char_t* child_value(const char_t* name) const; + + // Set node name/value (returns false if node is empty, there is not enough memory, or node can not have name/value) + bool set_name(const char_t* rhs); + bool set_value(const char_t* rhs); + + // Add attribute with specified name. Returns added attribute, or empty attribute on errors. + xml_attribute append_attribute(const char_t* name); + xml_attribute prepend_attribute(const char_t* name); + xml_attribute insert_attribute_after(const char_t* name, const xml_attribute& attr); + xml_attribute insert_attribute_before(const char_t* name, const xml_attribute& attr); + + // Add a copy of the specified attribute. Returns added attribute, or empty attribute on errors. 
+ xml_attribute append_copy(const xml_attribute& proto); + xml_attribute prepend_copy(const xml_attribute& proto); + xml_attribute insert_copy_after(const xml_attribute& proto, const xml_attribute& attr); + xml_attribute insert_copy_before(const xml_attribute& proto, const xml_attribute& attr); + + // Add child node with specified type. Returns added node, or empty node on errors. + xml_node append_child(xml_node_type type = node_element); + xml_node prepend_child(xml_node_type type = node_element); + xml_node insert_child_after(xml_node_type type, const xml_node& node); + xml_node insert_child_before(xml_node_type type, const xml_node& node); + + // Add child element with specified name. Returns added node, or empty node on errors. + xml_node append_child(const char_t* name); + xml_node prepend_child(const char_t* name); + xml_node insert_child_after(const char_t* name, const xml_node& node); + xml_node insert_child_before(const char_t* name, const xml_node& node); + + // Add a copy of the specified node as a child. Returns added node, or empty node on errors. + xml_node append_copy(const xml_node& proto); + xml_node prepend_copy(const xml_node& proto); + xml_node insert_copy_after(const xml_node& proto, const xml_node& node); + xml_node insert_copy_before(const xml_node& proto, const xml_node& node); + + // Move the specified node to become a child of this node. Returns moved node, or empty node on errors. 
+ xml_node append_move(const xml_node& moved); + xml_node prepend_move(const xml_node& moved); + xml_node insert_move_after(const xml_node& moved, const xml_node& node); + xml_node insert_move_before(const xml_node& moved, const xml_node& node); + + // Remove specified attribute + bool remove_attribute(const xml_attribute& a); + bool remove_attribute(const char_t* name); + + // Remove specified child + bool remove_child(const xml_node& n); + bool remove_child(const char_t* name); + + // Parses buffer as an XML document fragment and appends all nodes as children of the current node. + // Copies/converts the buffer, so it may be deleted or changed after the function returns. + // Note: append_buffer allocates memory that has the lifetime of the owning document; removing the appended nodes does not immediately reclaim that memory. + xml_parse_result append_buffer(const void* contents, size_t size, unsigned int options = parse_default, xml_encoding encoding = encoding_auto); + + // Find attribute using predicate. Returns first attribute for which predicate returned true. + template xml_attribute find_attribute(Predicate pred) const + { + if (!_root) return xml_attribute(); + + for (xml_attribute attrib = first_attribute(); attrib; attrib = attrib.next_attribute()) + if (pred(attrib)) + return attrib; + + return xml_attribute(); + } + + // Find child node using predicate. Returns first child for which predicate returned true. + template xml_node find_child(Predicate pred) const + { + if (!_root) return xml_node(); + + for (xml_node node = first_child(); node; node = node.next_sibling()) + if (pred(node)) + return node; + + return xml_node(); + } + + // Find node from subtree using predicate. Returns first node from subtree (depth-first), for which predicate returned true. 
+ template xml_node find_node(Predicate pred) const + { + if (!_root) return xml_node(); + + xml_node cur = first_child(); + + while (cur._root && cur._root != _root) + { + if (pred(cur)) return cur; + + if (cur.first_child()) cur = cur.first_child(); + else if (cur.next_sibling()) cur = cur.next_sibling(); + else + { + while (!cur.next_sibling() && cur._root != _root) cur = cur.parent(); + + if (cur._root != _root) cur = cur.next_sibling(); + } + } + + return xml_node(); + } + + // Find child node by attribute name/value + xml_node find_child_by_attribute(const char_t* name, const char_t* attr_name, const char_t* attr_value) const; + xml_node find_child_by_attribute(const char_t* attr_name, const char_t* attr_value) const; + + #ifndef PUGIXML_NO_STL + // Get the absolute node path from root as a text string. + string_t path(char_t delimiter = '/') const; + #endif + + // Search for a node by path consisting of node names and . or .. elements. + xml_node first_element_by_path(const char_t* path, char_t delimiter = '/') const; + + // Recursively traverse subtree with xml_tree_walker + bool traverse(xml_tree_walker& walker); + + #ifndef PUGIXML_NO_XPATH + // Select single node by evaluating XPath query. Returns first node from the resulting node set. + xpath_node select_node(const char_t* query, xpath_variable_set* variables = 0) const; + xpath_node select_node(const xpath_query& query) const; + + // Select node set by evaluating XPath query + xpath_node_set select_nodes(const char_t* query, xpath_variable_set* variables = 0) const; + xpath_node_set select_nodes(const xpath_query& query) const; + + // (deprecated: use select_node instead) Select single node by evaluating XPath query. 
+ xpath_node select_single_node(const char_t* query, xpath_variable_set* variables = 0) const; + xpath_node select_single_node(const xpath_query& query) const; + + #endif + + // Print subtree using a writer object + void print(xml_writer& writer, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto, unsigned int depth = 0) const; + + #ifndef PUGIXML_NO_STL + // Print subtree to stream + void print(std::basic_ostream >& os, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto, unsigned int depth = 0) const; + void print(std::basic_ostream >& os, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, unsigned int depth = 0) const; + #endif + + // Child nodes iterators + typedef xml_node_iterator iterator; + + iterator begin() const; + iterator end() const; + + // Attribute iterators + typedef xml_attribute_iterator attribute_iterator; + + attribute_iterator attributes_begin() const; + attribute_iterator attributes_end() const; + + // Range-based for support + xml_object_range children() const; + xml_object_range children(const char_t* name) const; + xml_object_range attributes() const; + + // Get node offset in parsed file/string (in char_t units) for debugging purposes + ptrdiff_t offset_debug() const; + + // Get hash value (unique for handles to the same object) + size_t hash_value() const; + + // Get internal pointer + xml_node_struct* internal_object() const; + }; + +#ifdef __BORLANDC__ + // Borland C++ workaround + bool PUGIXML_FUNCTION operator&&(const xml_node& lhs, bool rhs); + bool PUGIXML_FUNCTION operator||(const xml_node& lhs, bool rhs); +#endif + + // A helper for working with text inside PCDATA nodes + class PUGIXML_CLASS xml_text + { + friend class xml_node; + + xml_node_struct* _root; + + typedef void (*unspecified_bool_type)(xml_text***); + + explicit xml_text(xml_node_struct* root); + + 
xml_node_struct* _data_new(); + xml_node_struct* _data() const; + + public: + // Default constructor. Constructs an empty object. + xml_text(); + + // Safe bool conversion operator + operator unspecified_bool_type() const; + + // Borland C++ workaround + bool operator!() const; + + // Check if text object is empty + bool empty() const; + + // Get text, or "" if object is empty + const char_t* get() const; + + // Get text, or the default value if object is empty + const char_t* as_string(const char_t* def = PUGIXML_TEXT("")) const; + + // Get text as a number, or the default value if conversion did not succeed or object is empty + int as_int(int def = 0) const; + unsigned int as_uint(unsigned int def = 0) const; + double as_double(double def = 0) const; + float as_float(float def = 0) const; + + #ifdef PUGIXML_HAS_LONG_LONG + long long as_llong(long long def = 0) const; + unsigned long long as_ullong(unsigned long long def = 0) const; + #endif + + // Get text as bool (returns true if first character is in '1tTyY' set), or the default value if object is empty + bool as_bool(bool def = false) const; + + // Set text (returns false if object is empty or there is not enough memory) + bool set(const char_t* rhs); + + // Set text with type conversion (numbers are converted to strings, boolean is converted to "true"/"false") + bool set(int rhs); + bool set(unsigned int rhs); + bool set(double rhs); + bool set(float rhs); + bool set(bool rhs); + + #ifdef PUGIXML_HAS_LONG_LONG + bool set(long long rhs); + bool set(unsigned long long rhs); + #endif + + // Set text (equivalent to set without error checking) + xml_text& operator=(const char_t* rhs); + xml_text& operator=(int rhs); + xml_text& operator=(unsigned int rhs); + xml_text& operator=(double rhs); + xml_text& operator=(float rhs); + xml_text& operator=(bool rhs); + + #ifdef PUGIXML_HAS_LONG_LONG + xml_text& operator=(long long rhs); + xml_text& operator=(unsigned long long rhs); + #endif + + // Get the data node 
(node_pcdata or node_cdata) for this object + xml_node data() const; + }; + +#ifdef __BORLANDC__ + // Borland C++ workaround + bool PUGIXML_FUNCTION operator&&(const xml_text& lhs, bool rhs); + bool PUGIXML_FUNCTION operator||(const xml_text& lhs, bool rhs); +#endif + + // Child node iterator (a bidirectional iterator over a collection of xml_node) + class PUGIXML_CLASS xml_node_iterator + { + friend class xml_node; + + private: + mutable xml_node _wrap; + xml_node _parent; + + xml_node_iterator(xml_node_struct* ref, xml_node_struct* parent); + + public: + // Iterator traits + typedef ptrdiff_t difference_type; + typedef xml_node value_type; + typedef xml_node* pointer; + typedef xml_node& reference; + + #ifndef PUGIXML_NO_STL + typedef std::bidirectional_iterator_tag iterator_category; + #endif + + // Default constructor + xml_node_iterator(); + + // Construct an iterator which points to the specified node + xml_node_iterator(const xml_node& node); + + // Iterator operators + bool operator==(const xml_node_iterator& rhs) const; + bool operator!=(const xml_node_iterator& rhs) const; + + xml_node& operator*() const; + xml_node* operator->() const; + + const xml_node_iterator& operator++(); + xml_node_iterator operator++(int); + + const xml_node_iterator& operator--(); + xml_node_iterator operator--(int); + }; + + // Attribute iterator (a bidirectional iterator over a collection of xml_attribute) + class PUGIXML_CLASS xml_attribute_iterator + { + friend class xml_node; + + private: + mutable xml_attribute _wrap; + xml_node _parent; + + xml_attribute_iterator(xml_attribute_struct* ref, xml_node_struct* parent); + + public: + // Iterator traits + typedef ptrdiff_t difference_type; + typedef xml_attribute value_type; + typedef xml_attribute* pointer; + typedef xml_attribute& reference; + + #ifndef PUGIXML_NO_STL + typedef std::bidirectional_iterator_tag iterator_category; + #endif + + // Default constructor + xml_attribute_iterator(); + + // Construct an iterator which 
points to the specified attribute + xml_attribute_iterator(const xml_attribute& attr, const xml_node& parent); + + // Iterator operators + bool operator==(const xml_attribute_iterator& rhs) const; + bool operator!=(const xml_attribute_iterator& rhs) const; + + xml_attribute& operator*() const; + xml_attribute* operator->() const; + + const xml_attribute_iterator& operator++(); + xml_attribute_iterator operator++(int); + + const xml_attribute_iterator& operator--(); + xml_attribute_iterator operator--(int); + }; + + // Named node range helper + class PUGIXML_CLASS xml_named_node_iterator + { + friend class xml_node; + + public: + // Iterator traits + typedef ptrdiff_t difference_type; + typedef xml_node value_type; + typedef xml_node* pointer; + typedef xml_node& reference; + + #ifndef PUGIXML_NO_STL + typedef std::bidirectional_iterator_tag iterator_category; + #endif + + // Default constructor + xml_named_node_iterator(); + + // Construct an iterator which points to the specified node + xml_named_node_iterator(const xml_node& node, const char_t* name); + + // Iterator operators + bool operator==(const xml_named_node_iterator& rhs) const; + bool operator!=(const xml_named_node_iterator& rhs) const; + + xml_node& operator*() const; + xml_node* operator->() const; + + const xml_named_node_iterator& operator++(); + xml_named_node_iterator operator++(int); + + const xml_named_node_iterator& operator--(); + xml_named_node_iterator operator--(int); + + private: + mutable xml_node _wrap; + xml_node _parent; + const char_t* _name; + + xml_named_node_iterator(xml_node_struct* ref, xml_node_struct* parent, const char_t* name); + }; + + // Abstract tree walker class (see xml_node::traverse) + class PUGIXML_CLASS xml_tree_walker + { + friend class xml_node; + + private: + int _depth; + + protected: + // Get current traversal depth + int depth() const; + + public: + xml_tree_walker(); + virtual ~xml_tree_walker(); + + // Callback that is called when traversal begins + virtual 
bool begin(xml_node& node); + + // Callback that is called for each node traversed + virtual bool for_each(xml_node& node) = 0; + + // Callback that is called when traversal ends + virtual bool end(xml_node& node); + }; + + // Parsing status, returned as part of xml_parse_result object + enum xml_parse_status + { + status_ok = 0, // No error + + status_file_not_found, // File was not found during load_file() + status_io_error, // Error reading from file/stream + status_out_of_memory, // Could not allocate memory + status_internal_error, // Internal error occurred + + status_unrecognized_tag, // Parser could not determine tag type + + status_bad_pi, // Parsing error occurred while parsing document declaration/processing instruction + status_bad_comment, // Parsing error occurred while parsing comment + status_bad_cdata, // Parsing error occurred while parsing CDATA section + status_bad_doctype, // Parsing error occurred while parsing document type declaration + status_bad_pcdata, // Parsing error occurred while parsing PCDATA section + status_bad_start_element, // Parsing error occurred while parsing start element tag + status_bad_attribute, // Parsing error occurred while parsing element attribute + status_bad_end_element, // Parsing error occurred while parsing end element tag + status_end_element_mismatch,// There was a mismatch of start-end tags (closing tag had incorrect name, some tag was not closed or there was an excessive closing tag) + + status_append_invalid_root, // Unable to append nodes since root type is not node_element or node_document (exclusive to xml_node::append_buffer) + + status_no_document_element // Parsing resulted in a document without element nodes + }; + + // Parsing result + struct PUGIXML_CLASS xml_parse_result + { + // Parsing status (see xml_parse_status) + xml_parse_status status; + + // Last parsed offset (in char_t units from start of input data) + ptrdiff_t offset; + + // Source document encoding + xml_encoding encoding; + + // 
Default constructor, initializes object to failed state + xml_parse_result(); + + // Cast to bool operator + operator bool() const; + + // Get error description + const char* description() const; + }; + + // Document class (DOM tree root) + class PUGIXML_CLASS xml_document: public xml_node + { + private: + char_t* _buffer; + + char _memory[192]; + + // Non-copyable semantics + xml_document(const xml_document&); + const xml_document& operator=(const xml_document&); + + void create(); + void destroy(); + + public: + // Default constructor, makes empty document + xml_document(); + + // Destructor, invalidates all node/attribute handles to this document + ~xml_document(); + + // Removes all nodes, leaving the empty document + void reset(); + + // Removes all nodes, then copies the entire contents of the specified document + void reset(const xml_document& proto); + + #ifndef PUGIXML_NO_STL + // Load document from stream. + xml_parse_result load(std::basic_istream >& stream, unsigned int options = parse_default, xml_encoding encoding = encoding_auto); + xml_parse_result load(std::basic_istream >& stream, unsigned int options = parse_default); + #endif + + // (deprecated: use load_string instead) Load document from zero-terminated string. No encoding conversions are applied. + xml_parse_result load(const char_t* contents, unsigned int options = parse_default); + + // Load document from zero-terminated string. No encoding conversions are applied. + xml_parse_result load_string(const char_t* contents, unsigned int options = parse_default); + + // Load document from file + xml_parse_result load_file(const char* path, unsigned int options = parse_default, xml_encoding encoding = encoding_auto); + xml_parse_result load_file(const wchar_t* path, unsigned int options = parse_default, xml_encoding encoding = encoding_auto); + + // Load document from buffer. Copies/converts the buffer, so it may be deleted or changed after the function returns. 
+ xml_parse_result load_buffer(const void* contents, size_t size, unsigned int options = parse_default, xml_encoding encoding = encoding_auto); + + // Load document from buffer, using the buffer for in-place parsing (the buffer is modified and used for storage of document data). + // You should ensure that buffer data will persist throughout the document's lifetime, and free the buffer memory manually once document is destroyed. + xml_parse_result load_buffer_inplace(void* contents, size_t size, unsigned int options = parse_default, xml_encoding encoding = encoding_auto); + + // Load document from buffer, using the buffer for in-place parsing (the buffer is modified and used for storage of document data). + // You should allocate the buffer with pugixml allocation function; document will free the buffer when it is no longer needed (you can't use it anymore). + xml_parse_result load_buffer_inplace_own(void* contents, size_t size, unsigned int options = parse_default, xml_encoding encoding = encoding_auto); + + // Save XML document to writer (semantics is slightly different from xml_node::print, see documentation for details). + void save(xml_writer& writer, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const; + + #ifndef PUGIXML_NO_STL + // Save XML document to stream (semantics is slightly different from xml_node::print, see documentation for details). 
+ void save(std::basic_ostream >& stream, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const; + void save(std::basic_ostream >& stream, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default) const; + #endif + + // Save XML to file + bool save_file(const char* path, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const; + bool save_file(const wchar_t* path, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const; + + // Get document element + xml_node document_element() const; + }; + +#ifndef PUGIXML_NO_XPATH + // XPath query return type + enum xpath_value_type + { + xpath_type_none, // Unknown type (query failed to compile) + xpath_type_node_set, // Node set (xpath_node_set) + xpath_type_number, // Number + xpath_type_string, // String + xpath_type_boolean // Boolean + }; + + // XPath parsing result + struct PUGIXML_CLASS xpath_parse_result + { + // Error message (0 if no error) + const char* error; + + // Last parsed offset (in char_t units from string start) + ptrdiff_t offset; + + // Default constructor, initializes object to failed state + xpath_parse_result(); + + // Cast to bool operator + operator bool() const; + + // Get error description + const char* description() const; + }; + + // A single XPath variable + class PUGIXML_CLASS xpath_variable + { + friend class xpath_variable_set; + + protected: + xpath_value_type _type; + xpath_variable* _next; + + xpath_variable(); + + // Non-copyable semantics + xpath_variable(const xpath_variable&); + xpath_variable& operator=(const xpath_variable&); + + public: + // Get variable name + const char_t* name() const; + + // Get variable type + xpath_value_type type() const; + + // Get variable value; no type conversion is performed, default value (false, NaN, empty string, empty node 
set) is returned on type mismatch error + bool get_boolean() const; + double get_number() const; + const char_t* get_string() const; + const xpath_node_set& get_node_set() const; + + // Set variable value; no type conversion is performed, false is returned on type mismatch error + bool set(bool value); + bool set(double value); + bool set(const char_t* value); + bool set(const xpath_node_set& value); + }; + + // A set of XPath variables + class PUGIXML_CLASS xpath_variable_set + { + private: + xpath_variable* _data[64]; + + // Non-copyable semantics + xpath_variable_set(const xpath_variable_set&); + xpath_variable_set& operator=(const xpath_variable_set&); + + xpath_variable* find(const char_t* name) const; + + public: + // Default constructor/destructor + xpath_variable_set(); + ~xpath_variable_set(); + + // Add a new variable or get the existing one, if the types match + xpath_variable* add(const char_t* name, xpath_value_type type); + + // Set value of an existing variable; no type conversion is performed, false is returned if there is no such variable or if types mismatch + bool set(const char_t* name, bool value); + bool set(const char_t* name, double value); + bool set(const char_t* name, const char_t* value); + bool set(const char_t* name, const xpath_node_set& value); + + // Get existing variable by name + xpath_variable* get(const char_t* name); + const xpath_variable* get(const char_t* name) const; + }; + + // A compiled XPath query object + class PUGIXML_CLASS xpath_query + { + private: + void* _impl; + xpath_parse_result _result; + + typedef void (*unspecified_bool_type)(xpath_query***); + + // Non-copyable semantics + xpath_query(const xpath_query&); + xpath_query& operator=(const xpath_query&); + + public: + // Construct a compiled object from XPath expression. + // If PUGIXML_NO_EXCEPTIONS is not defined, throws xpath_exception on compilation errors. 
+ explicit xpath_query(const char_t* query, xpath_variable_set* variables = 0); + + // Destructor + ~xpath_query(); + + // Get query expression return type + xpath_value_type return_type() const; + + // Evaluate expression as boolean value in the specified context; performs type conversion if necessary. + // If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors. + bool evaluate_boolean(const xpath_node& n) const; + + // Evaluate expression as double value in the specified context; performs type conversion if necessary. + // If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors. + double evaluate_number(const xpath_node& n) const; + + #ifndef PUGIXML_NO_STL + // Evaluate expression as string value in the specified context; performs type conversion if necessary. + // If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors. + string_t evaluate_string(const xpath_node& n) const; + #endif + + // Evaluate expression as string value in the specified context; performs type conversion if necessary. + // At most capacity characters are written to the destination buffer, full result size is returned (includes terminating zero). + // If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors. + // If PUGIXML_NO_EXCEPTIONS is defined, returns empty string instead. + size_t evaluate_string(char_t* buffer, size_t capacity, const xpath_node& n) const; + + // Evaluate expression as node set in the specified context. + // If PUGIXML_NO_EXCEPTIONS is not defined, throws xpath_exception on type mismatch and std::bad_alloc on out of memory errors. + // If PUGIXML_NO_EXCEPTIONS is defined, returns empty node set instead. + xpath_node_set evaluate_node_set(const xpath_node& n) const; + + // Evaluate expression as node set in the specified context. + // Return first node in document order, or empty node if node set is empty.
+ // If PUGIXML_NO_EXCEPTIONS is not defined, throws xpath_exception on type mismatch and std::bad_alloc on out of memory errors. + // If PUGIXML_NO_EXCEPTIONS is defined, returns empty node instead. + xpath_node evaluate_node(const xpath_node& n) const; + + // Get parsing result (used to get compilation errors in PUGIXML_NO_EXCEPTIONS mode) + const xpath_parse_result& result() const; + + // Safe bool conversion operator + operator unspecified_bool_type() const; + + // Borland C++ workaround + bool operator!() const; + }; + + #ifndef PUGIXML_NO_EXCEPTIONS + // XPath exception class + class PUGIXML_CLASS xpath_exception: public std::exception + { + private: + xpath_parse_result _result; + + public: + // Construct exception from parse result + explicit xpath_exception(const xpath_parse_result& result); + + // Get error message + virtual const char* what() const throw(); + + // Get parse result + const xpath_parse_result& result() const; + }; + #endif + + // XPath node class (either xml_node or xml_attribute) + class PUGIXML_CLASS xpath_node + { + private: + xml_node _node; + xml_attribute _attribute; + + typedef void (*unspecified_bool_type)(xpath_node***); + + public: + // Default constructor; constructs empty XPath node + xpath_node(); + + // Construct XPath node from XML node/attribute + xpath_node(const xml_node& node); + xpath_node(const xml_attribute& attribute, const xml_node& parent); + + // Get node/attribute, if any + xml_node node() const; + xml_attribute attribute() const; + + // Get parent of contained node/attribute + xml_node parent() const; + + // Safe bool conversion operator + operator unspecified_bool_type() const; + + // Borland C++ workaround + bool operator!() const; + + // Comparison operators + bool operator==(const xpath_node& n) const; + bool operator!=(const xpath_node& n) const; + }; + +#ifdef __BORLANDC__ + // Borland C++ workaround + bool PUGIXML_FUNCTION operator&&(const xpath_node& lhs, bool rhs); + bool PUGIXML_FUNCTION 
operator||(const xpath_node& lhs, bool rhs); +#endif + + // A fixed-size collection of XPath nodes + class PUGIXML_CLASS xpath_node_set + { + public: + // Collection type + enum type_t + { + type_unsorted, // Not ordered + type_sorted, // Sorted by document order (ascending) + type_sorted_reverse // Sorted by document order (descending) + }; + + // Constant iterator type + typedef const xpath_node* const_iterator; + + // We define non-constant iterator to be the same as constant iterator so that various generic algorithms (i.e. boost foreach) work + typedef const xpath_node* iterator; + + // Default constructor. Constructs empty set. + xpath_node_set(); + + // Constructs a set from iterator range; data is not checked for duplicates and is not sorted according to provided type, so be careful + xpath_node_set(const_iterator begin, const_iterator end, type_t type = type_unsorted); + + // Destructor + ~xpath_node_set(); + + // Copy constructor/assignment operator + xpath_node_set(const xpath_node_set& ns); + xpath_node_set& operator=(const xpath_node_set& ns); + + // Get collection type + type_t type() const; + + // Get collection size + size_t size() const; + + // Indexing operator + const xpath_node& operator[](size_t index) const; + + // Collection iterators + const_iterator begin() const; + const_iterator end() const; + + // Sort the collection in ascending/descending order by document order + void sort(bool reverse = false); + + // Get first node in the collection by document order + xpath_node first() const; + + // Check if collection is empty + bool empty() const; + + private: + type_t _type; + + xpath_node _storage; + + xpath_node* _begin; + xpath_node* _end; + + void _assign(const_iterator begin, const_iterator end); + }; +#endif + +#ifndef PUGIXML_NO_STL + // Convert wide string to UTF8 + std::basic_string, std::allocator > PUGIXML_FUNCTION as_utf8(const wchar_t* str); + std::basic_string, std::allocator > PUGIXML_FUNCTION as_utf8(const std::basic_string, 
std::allocator >& str); + + // Convert UTF8 to wide string + std::basic_string, std::allocator > PUGIXML_FUNCTION as_wide(const char* str); + std::basic_string, std::allocator > PUGIXML_FUNCTION as_wide(const std::basic_string, std::allocator >& str); +#endif + + // Memory allocation function interface; returns pointer to allocated memory or NULL on failure + typedef void* (*allocation_function)(size_t size); + + // Memory deallocation function interface + typedef void (*deallocation_function)(void* ptr); + + // Override default memory management functions. All subsequent allocations/deallocations will be performed via supplied functions. + void PUGIXML_FUNCTION set_memory_management_functions(allocation_function allocate, deallocation_function deallocate); + + // Get current memory management functions + allocation_function PUGIXML_FUNCTION get_memory_allocation_function(); + deallocation_function PUGIXML_FUNCTION get_memory_deallocation_function(); +} + +#if !defined(PUGIXML_NO_STL) && (defined(_MSC_VER) || defined(__ICC)) +namespace std +{ + // Workarounds for (non-standard) iterator category detection for older versions (MSVC7/IC8 and earlier) + std::bidirectional_iterator_tag PUGIXML_FUNCTION _Iter_cat(const pugi::xml_node_iterator&); + std::bidirectional_iterator_tag PUGIXML_FUNCTION _Iter_cat(const pugi::xml_attribute_iterator&); + std::bidirectional_iterator_tag PUGIXML_FUNCTION _Iter_cat(const pugi::xml_named_node_iterator&); +} +#endif + +#if !defined(PUGIXML_NO_STL) && defined(__SUNPRO_CC) +namespace std +{ + // Workarounds for (non-standard) iterator category detection + std::bidirectional_iterator_tag PUGIXML_FUNCTION __iterator_category(const pugi::xml_node_iterator&); + std::bidirectional_iterator_tag PUGIXML_FUNCTION __iterator_category(const pugi::xml_attribute_iterator&); + std::bidirectional_iterator_tag PUGIXML_FUNCTION __iterator_category(const pugi::xml_named_node_iterator&); +} +#endif + +#endif + +// Make sure implementation is included in 
header-only mode +// Use macro expansion in #include to work around QMake (QTBUG-11923) +#if defined(PUGIXML_HEADER_ONLY) && !defined(PUGIXML_SOURCE) +# define PUGIXML_SOURCE "pugixml.cpp" +# include PUGIXML_SOURCE +#endif + +/** + * Copyright (c) 2006-2014 Arseny Kapoulkine + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ diff --git a/src/swig/Accuracy.i b/src/swig/Accuracy.i new file mode 100644 index 0000000..1e6015c --- /dev/null +++ b/src/swig/Accuracy.i @@ -0,0 +1,17 @@ +/* Accuracy.i */ + +%module PacBioBam + +%{ +#include +using namespace PacBio; +using namespace PacBio::BAM; +%} + +#ifdef SWIGPYTHON +%rename(__float__) PacBio::BAM::Accuracy::operator float; +#else // C#, R +%rename(ToFloat) PacBio::BAM::Accuracy::operator float; +#endif + +%include \ No newline at end of file diff --git a/src/swig/AlignmentPrinter.i b/src/swig/AlignmentPrinter.i new file mode 100644 index 0000000..6c4fc69 --- /dev/null +++ b/src/swig/AlignmentPrinter.i @@ -0,0 +1,11 @@ +/* AlignmentPrinter.i */ + +%module PacBioBam + +%{ +#include +using namespace PacBio; +using namespace PacBio::BAM; +%} + +%include \ No newline at end of file diff --git a/src/swig/BamFile.i b/src/swig/BamFile.i new file mode 100644 index 0000000..4a429e9 --- /dev/null +++ b/src/swig/BamFile.i @@ -0,0 +1,18 @@ +/* BamFile.i */ + +%module PacBioBam + +%{ +#include +using namespace PacBio; +using namespace PacBio::BAM; +%} + +#ifdef SWIGR +%ignore PacBio::BAM::BamFile::BamFile(const BamFile&); +#endif + +%ignore PacBio::BAM::BamFile::BamFile(BamFile&&); +%ignore PacBio::BAM::BamFile::operator=; + +%include diff --git a/src/swig/BamHeader.i b/src/swig/BamHeader.i new file mode 100644 index 0000000..3572f04 --- /dev/null +++ b/src/swig/BamHeader.i @@ -0,0 +1,21 @@ +/* BamHeader.i */ + +%module PacBioBam + +%{ +#include +using namespace PacBio; +using namespace PacBio::BAM; +%} + +// Hide warnings about "internal" being a C# reserved word +%warnfilter(314) PacBio::BAM::internal; + +%ignore PacBio::BAM::BamHeader::BamHeader(BamHeader&&); // move ctors not used +%ignore PacBio::BAM::BamHeader::operator=; // assignment operators not used + +%template(ProgramInfoList) std::vector; +%template(ReadGroupInfoList) std::vector; +%template(SequenceInfoList) std::vector; + +%include diff --git a/src/swig/BamRecord.i 
b/src/swig/BamRecord.i new file mode 100644 index 0000000..4b8cee6 --- /dev/null +++ b/src/swig/BamRecord.i @@ -0,0 +1,33 @@ +/* BamRecord.i */ + +%module PacBioBam + +%{ +#include +using namespace PacBio; +using namespace PacBio::BAM; +%} + +// Hide warnings about "internal" being a C# reserved word +%warnfilter(314) PacBio::BAM::internal; + +// hide warnings about unused methods +%ignore PacBio::BAM::BamRecord::BamRecord(BamRecordImpl&&); +%ignore PacBio::BAM::BamRecord::BamRecord(BamRecord&&); +%ignore PacBio::BAM::BamRecord::operator=; + +// ignore static methods, to allow member +%ignore PacBio::BAM::BamRecord::Clipped(const BamRecord&, const ClipType, const PacBio::BAM::Position, const PacBio::BAM::Position); +%ignore PacBio::BAM::BamRecord::Mapped(const BamRecord&, const int32_t, const Position, const Strand, const Cigar&, const uint8_t); + +// C# gets confused by the const and nonconst overloads +%ignore PacBio::BAM::BamRecord::Impl() const; + +#if defined(SWIGR) || defined(SWIGPYTHON) +%rename("EncodedPkmean") PacBio::BAM::BamRecord::Pkmean(const std::vector&); +%rename("EncodedPkmid") PacBio::BAM::BamRecord::Pkmid(const std::vector&); +%rename("EncodedPkmean2") PacBio::BAM::BamRecord::Pkmean2(const std::vector&); +%rename("EncodedPkmid2") PacBio::BAM::BamRecord::Pkmid2(const std::vector&); +#endif + +%include diff --git a/src/swig/BamRecordBuilder.i b/src/swig/BamRecordBuilder.i new file mode 100644 index 0000000..52e7690 --- /dev/null +++ b/src/swig/BamRecordBuilder.i @@ -0,0 +1,18 @@ +/* BamRecordBuilder.i */ + +%module PacBioBam + +%{ +#include +using namespace PacBio; +using namespace PacBio::BAM; +%} + +%ignore PacBio::BAM::BamRecordBuilder::BamRecordBuilder(BamRecordBuilder&&); // move ctors not used +%ignore PacBio::BAM::BamRecordBuilder::operator=; + +%ignore PacBio::BAM::BamRecordBuilder::Reset(BamRecord&&); +%ignore PacBio::BAM::BamRecordBuilder::Cigar(PacBio::BAM::Cigar&&); +%ignore PacBio::BAM::BamRecordBuilder::Tags(TagCollection&&); + +%include
diff --git a/src/swig/BamRecordImpl.i b/src/swig/BamRecordImpl.i new file mode 100644 index 0000000..2c8a48f --- /dev/null +++ b/src/swig/BamRecordImpl.i @@ -0,0 +1,14 @@ +/* BamRecordImpl.i */ + +%module PacBioBam + +%{ +#include +using namespace PacBio; +using namespace PacBio::BAM; +%} + +%ignore PacBio::BAM::BamRecordImpl::BamRecordImpl(BamRecordImpl&&); +%ignore PacBio::BAM::BamRecordImpl::operator=; + +%include diff --git a/src/swig/BamRecordTag.i b/src/swig/BamRecordTag.i new file mode 100644 index 0000000..a34b661 --- /dev/null +++ b/src/swig/BamRecordTag.i @@ -0,0 +1,11 @@ +/* BamRecordTag.i */ + +%module PacBioBam + +%{ +#include +using namespace PacBio; +using namespace PacBio::BAM; +%} + +%include diff --git a/src/swig/BamTagCodec.i b/src/swig/BamTagCodec.i new file mode 100644 index 0000000..4a4326b --- /dev/null +++ b/src/swig/BamTagCodec.i @@ -0,0 +1,11 @@ +/* BamTagCodec.i */ + +%module PacBioBam + +%{ +#include +using namespace PacBio; +using namespace PacBio::BAM; +%} + +%include \ No newline at end of file diff --git a/src/swig/BamWriter.i b/src/swig/BamWriter.i new file mode 100644 index 0000000..dd23e5b --- /dev/null +++ b/src/swig/BamWriter.i @@ -0,0 +1,15 @@ +/* BamWriter.i */ + +%module PacBioBam + +%{ +#include +using namespace PacBio; +using namespace PacBio::BAM; +%} + +%ignore PacBio::BAM::BamWriter(const BamWriter&); // copy ctor not used +%ignore PacBio::BAM::BamWriter(BamWriter&&); // move ctor not used +%ignore PacBio::BAM::BamWriter::operator=; // assignment operators not used + +%include diff --git a/src/swig/CMakeLists.txt b/src/swig/CMakeLists.txt new file mode 100644 index 0000000..a8869c3 --- /dev/null +++ b/src/swig/CMakeLists.txt @@ -0,0 +1,68 @@ +# --------------------------------------------- @ +# SWIG +# --------------------------------------------- @ + +# general SWIG +if(${wrapping_swig}) + + find_package(SWIG 3.0.5 REQUIRED) + + include(${SWIG_USE_FILE}) + include_directories(${PacBioBAM_INCLUDE_DIRS} 
${CMAKE_CURRENT_SOURCE_DIR}) + + # + # quash compiler warnings from SWIG-generated code + # + check_cxx_compiler_flag("-Wno-unused-but-set-variable" HAS_NO_UNUSED_BUT_SET_VARIABLE) + if(HAS_NO_UNUSED_BUT_SET_VARIABLE) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-but-set-variable") + endif() + + check_cxx_compiler_flag("-Wno-dynamic-class-memaccess" HAS_NO_DYNAMIC_CLASS_MEMACCESS) + if (HAS_NO_DYNAMIC_CLASS_MEMACCESS) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-dynamic-class-memaccess") + endif() + + check_cxx_compiler_flag("-Wno-unused-parameter" HAS_NO_UNUSED_PARAMETER) + if (HAS_NO_UNUSED_PARAMETER) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-parameter") + endif() + + check_cxx_compiler_flag("-Wno-return-local-addr" HAS_NO_RETURN_LOCAL_ADDR) + if (HAS_NO_RETURN_LOCAL_ADDR) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-return-local-addr") + endif() + + check_cxx_compiler_flag("-Wno-return-type" HAS_NO_RETURN_TYPE) + if (HAS_NO_RETURN_TYPE) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-return-type") + endif() + + # + # SWIG source file properties + # + set_source_files_properties(PacBioBam.i PROPERTIES CPLUSPLUS ON) + + if (APPLE) + else() + if(CMAKE_SIZEOF_VOID_P EQUAL 8) + set_source_files_properties(PacBioBam.i PROPERTIES SWIG_FLAGS "-DSWIGWORDSIZE64") + endif() + endif() + +endif() + +# Python +if(PacBioBAM_wrap_python) + include(WrapPython.cmake) +endif() + +# R +if(PacBioBAM_wrap_r) + include(WrapR.cmake) +endif() + +# CSharp +if(PacBioBAM_wrap_csharp) + include(WrapCSharp.cmake) +endif() diff --git a/src/swig/Cigar.i b/src/swig/Cigar.i new file mode 100644 index 0000000..2c5cf8c --- /dev/null +++ b/src/swig/Cigar.i @@ -0,0 +1,16 @@ +/* Cigar.i */ + +%module PacBioBam + +%{ +#include +using namespace PacBio; +using namespace PacBio::BAM; +%} + +%template(CigarOpList) std::vector; + +%ignore PacBio::BAM::Cigar::Cigar(Cigar&&); +%ignore PacBio::BAM::Cigar::operator=; + +%include diff --git a/src/swig/CigarOperation.i
b/src/swig/CigarOperation.i new file mode 100644 index 0000000..0a23a17 --- /dev/null +++ b/src/swig/CigarOperation.i @@ -0,0 +1,47 @@ +/* CigarOperation.i */ + +%module PacBioBam + +%{ +#include +using namespace PacBio; +using namespace PacBio::BAM; +%} + +%ignore PacBio::BAM::CigarOperation::CigarOperation(CigarOperation&&); +%ignore PacBio::BAM::CigarOperation::operator=; + +#ifdef SWIGR +%ignore PacBio::BAM::CigarOperation::CigarOperation(CigarOperationType, uint32_t); +#endif + +%include + +// enums aren't always named consistently (at least between Mac/clang/swig & Linux/gcc/swig) +// so, keep this after the main %include so client source can be consistent +#ifdef SWIGPYTHON +%pythoncode %{ +try: + UNKNOWN_OP + ALIGNMENT_MATCH + INSERTION + DELETION + REFERENCE_SKIP + SOFT_CLIP + HARD_CLIP + PADDING + SEQUENCE_MATCH + SEQUENCE_MISMATCH +except NameError: + UNKNOWN_OP = CigarOperationType_UNKNOWN_OP + ALIGNMENT_MATCH = CigarOperationType_ALIGNMENT_MATCH + INSERTION = CigarOperationType_INSERTION + DELETION = CigarOperationType_DELETION + REFERENCE_SKIP = CigarOperationType_REFERENCE_SKIP + SOFT_CLIP = CigarOperationType_SOFT_CLIP + HARD_CLIP = CigarOperationType_HARD_CLIP + PADDING = CigarOperationType_PADDING + SEQUENCE_MATCH = CigarOperationType_SEQUENCE_MATCH + SEQUENCE_MISMATCH = CigarOperationType_SEQUENCE_MISMATCH +%} +#endif diff --git a/src/swig/ClipType.i b/src/swig/ClipType.i new file mode 100644 index 0000000..350416c --- /dev/null +++ b/src/swig/ClipType.i @@ -0,0 +1,11 @@ +/* ClipType.i */ + +%module PacBioBam + +%{ +#include +using namespace PacBio; +using namespace PacBio::BAM; +%} + +%include diff --git a/src/swig/DataSet.i b/src/swig/DataSet.i new file mode 100644 index 0000000..f8cba2b --- /dev/null +++ b/src/swig/DataSet.i @@ -0,0 +1,49 @@ +/* DataSet.i */ + +%module PacBioBam + +%{ +#include +using namespace PacBio; +using namespace PacBio::BAM; +%} + +// move ctors not used +%ignore PacBio::BAM::DataSet::DataSet(DataSet&&); + +// 
assignment operators not used +%ignore PacBio::BAM::DataSet::operator=; + +#ifdef SWIGCSHARP + +// ignore non-const accessors +%ignore PacBio::BAM::DataSet::Attribute(const std::string&); +%ignore PacBio::BAM::DataSet::CreatedAt(); +%ignore PacBio::BAM::DataSet::Extensions(); +%ignore PacBio::BAM::DataSet::ExternalResources(); +%ignore PacBio::BAM::DataSet::Filters(); +%ignore PacBio::BAM::DataSet::Format(); +%ignore PacBio::BAM::DataSet::Metadata(); +%ignore PacBio::BAM::DataSet::MetaType(); +%ignore PacBio::BAM::DataSet::ModifiedAt(); +%ignore PacBio::BAM::DataSet::Name(); +%ignore PacBio::BAM::DataSet::Namespaces(); +%ignore PacBio::BAM::DataSet::ResourceId(); +%ignore PacBio::BAM::DataSet::SubDataSets(); +%ignore PacBio::BAM::DataSet::Tags(); +%ignore PacBio::BAM::DataSet::TimeStampedName(); +%ignore PacBio::BAM::DataSet::UniqueId(); +%ignore PacBio::BAM::DataSet::Version(); + +// disable operator(s) +%ignore PacBio::BAM::DataSet::operator+=; + +#endif // C# + +#ifdef SWIGR +%ignore PacBio::BAM::DataSet::DataSet(const DataSet::TypeEnum type); +/*%ignore PacBio::BAM::DataSet::DataSet(const BamFile& bamFile);*/ +#endif // R + + +%include diff --git a/src/swig/DataSetTypes.i b/src/swig/DataSetTypes.i new file mode 100644 index 0000000..5644d3f --- /dev/null +++ b/src/swig/DataSetTypes.i @@ -0,0 +1,121 @@ +/* DataSetTypes.i */ + +%module PacBioBam + +%{ +#include +#include +#include +#include + +using namespace PacBio; +using namespace PacBio::BAM; +using namespace PacBio::BAM::internal; +%} + +%ignore PacBio::BAM::internal::DataSetElement::DataSetElement(DataSetElement&&); +%ignore PacBio::BAM::internal::DataSetElement::operator=; +%ignore PacBio::BAM::internal::DataSetElement::operator[]; +/*%rename(__getitem__) PacBio::BAM::internal::DataSetElement::operator[];*/ + +%ignore PacBio::BAM::internal::XmlName::XmlName(XmlName&&); +%ignore PacBio::BAM::internal::XmlName::operator=; + +#ifdef SWIGCSHARP + +// ignore non-const accessors +%ignore 
PacBio::BAM::DataSetBase::ExternalResources(); +%ignore PacBio::BAM::DataSetBase::Filters(); +%ignore PacBio::BAM::DataSetBase::Metadata(); +%ignore PacBio::BAM::DataSetBase::Namespaces(); +%ignore PacBio::BAM::DataSetBase::SubDataSets(); +%ignore PacBio::BAM::DataSetMetadata::NumRecords(); +%ignore PacBio::BAM::DataSetMetadata::Provenance(); +%ignore PacBio::BAM::DataSetMetadata::TotalLength(); +%ignore PacBio::BAM::ExternalResource::ExternalResources(); +%ignore PacBio::BAM::Filter::Properties(); +%ignore PacBio::BAM::Property::Name(); +%ignore PacBio::BAM::Property::Operator(); +%ignore PacBio::BAM::Property::Value(); +%ignore PacBio::BAM::Provenance::CreatedBy(); +%ignore PacBio::BAM::Provenance::CommonServicesInstanceId(); +%ignore PacBio::BAM::Provenance::CreatorUserId(); +%ignore PacBio::BAM::Provenance::ParentJobId(); +%ignore PacBio::BAM::Provenance::ParentTool(); +%ignore PacBio::BAM::internal::BaseEntityType::Description(); +%ignore PacBio::BAM::internal::BaseEntityType::Extensions(); +%ignore PacBio::BAM::internal::BaseEntityType::Format(); +%ignore PacBio::BAM::internal::BaseEntityType::ModifiedAt(); +%ignore PacBio::BAM::internal::BaseEntityType::Name(); +%ignore PacBio::BAM::internal::BaseEntityType::ResourceId(); +%ignore PacBio::BAM::internal::BaseEntityType::Tags(); +%ignore PacBio::BAM::internal::BaseEntityType::Version(); +%ignore PacBio::BAM::internal::DataEntityType::Checksum(); +%ignore PacBio::BAM::internal::DataEntityType::EncodedValue(); +%ignore PacBio::BAM::internal::DataEntityType::MetaType(); +%ignore PacBio::BAM::internal::DataEntityType::SimpleValue(); +%ignore PacBio::BAM::internal::DataEntityType::TimeStampedName(); +%ignore PacBio::BAM::internal::DataEntityType::UniqueId(); +%ignore PacBio::BAM::internal::DataEntityType::ValueDataType(); +%ignore PacBio::BAM::internal::DataSetElement::Attribute(const std::string&); +%ignore PacBio::BAM::internal::DataSetElement::Attributes(); +%ignore 
PacBio::BAM::internal::DataSetElement::Children(); +%ignore PacBio::BAM::internal::DataSetElement::ChildText(const std::string&); +%ignore PacBio::BAM::internal::DataSetElement::CreatedAt(); +%ignore PacBio::BAM::internal::DataSetElement::Text(); +%ignore PacBio::BAM::internal::IndexedDataType::FileIndices(); +%ignore PacBio::BAM::internal::StrictEntityType::MetaType(); +%ignore PacBio::BAM::internal::StrictEntityType::TimeStampedName(); +%ignore PacBio::BAM::internal::StrictEntityType::UniqueId(); + +// disable operator(s) +%ignore PacBio::BAM::DataSetMetadata::operator+=; +%ignore PacBio::BAM::ExternalResources::operator+=; +%ignore PacBio::BAM::Filters::operator+=; +%ignore PacBio::BAM::DataSetBase::operator+=; +%ignore PacBio::BAM::SubDataSets::operator+=; + +#endif // C# + +%include + +%ignore PacBio::BAM::internal::DataSetElementList::operator[]; +%ignore PacBio::BAM::internal::DataSetListIterator::operator++; +%ignore PacBio::BAM::internal::DataSetListConstIterator::operator++; + +%include + +%template(ExtensionListElement) PacBio::BAM::internal::DataSetListElement; +%template(ExternalResourceListElement) PacBio::BAM::internal::DataSetListElement; +%template(FileIndexListElement) PacBio::BAM::internal::DataSetListElement; +%template(FilterListElement) PacBio::BAM::internal::DataSetListElement; +%template(PropertyListElement) PacBio::BAM::internal::DataSetListElement; +%template(SubDataSetListElement) PacBio::BAM::internal::DataSetListElement; + +%extend PacBio::BAM::internal::DataSetListElement { + PacBio::BAM::ExtensionElement& __getitem__(unsigned int i) { return $self->Child(i); } + PacBio::BAM::ExtensionElement& __getitem__(const std::string& s) { return $self->Child(s); } +} +%extend PacBio::BAM::internal::DataSetListElement { + PacBio::BAM::ExternalResource& __getitem__(unsigned int i) { return $self->Child(i); } + PacBio::BAM::ExternalResource& __getitem__(const std::string& s) { return $self->Child(s); } +} +%extend 
PacBio::BAM::internal::DataSetListElement { + PacBio::BAM::FileIndex& __getitem__(unsigned int i) { return $self->Child(i);} + PacBio::BAM::FileIndex& __getitem__(const std::string& s) { return $self->Child(s);} +} +%extend PacBio::BAM::internal::DataSetListElement { + PacBio::BAM::Filter& __getitem__(unsigned int i) { return $self->Child(i); } + PacBio::BAM::Filter& __getitem__(const std::string& s) { return $self->Child(s); } +} +%extend PacBio::BAM::internal::DataSetListElement { + PacBio::BAM::Property& __getitem__(unsigned int i) { return $self->Child(i); } + PacBio::BAM::Property& __getitem__(const std::string& s) { return $self->Child(s); } +} +%extend PacBio::BAM::internal::DataSetListElement { + PacBio::BAM::DataSetBase& __getitem__(unsigned int i) { return $self->Child(i); } + PacBio::BAM::DataSetBase& __getitem__(const std::string& s) { return $self->Child(s); } +} + +%include +%include diff --git a/src/swig/EntireFileQuery.i b/src/swig/EntireFileQuery.i new file mode 100644 index 0000000..c7c0b06 --- /dev/null +++ b/src/swig/EntireFileQuery.i @@ -0,0 +1,15 @@ +/* EntireFileQuery.i */ + +%module PacBioBam + +%{ +#include +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; +%} + +%include +%include +%include diff --git a/src/swig/FrameEncodingType.i b/src/swig/FrameEncodingType.i new file mode 100644 index 0000000..1bf1552 --- /dev/null +++ b/src/swig/FrameEncodingType.i @@ -0,0 +1,11 @@ +/* FrameEncodingType.i */ + +%module PacBioBam + +%{ +#include +using namespace PacBio; +using namespace PacBio::BAM; +%} + +%include diff --git a/src/swig/Frames.i b/src/swig/Frames.i new file mode 100644 index 0000000..d3e001a --- /dev/null +++ b/src/swig/Frames.i @@ -0,0 +1,19 @@ +/* Frames.i */ + +%module PacBioBam + +%{ +#include +using namespace PacBio; +using namespace PacBio::BAM; +%} + +%ignore PacBio::BAM::Frames::Frames(Frames&&); +%ignore PacBio::BAM::Frames::Frames(std::vector&&); +%ignore PacBio::BAM::Frames::operator=; +%ignore 
PacBio::BAM::Frames::Data(std::vector&&); + +%template(UInt8List) std::vector; +%template(UInt16List) std::vector; + +%include diff --git a/src/swig/GenomicInterval.i b/src/swig/GenomicInterval.i new file mode 100644 index 0000000..199a3c3 --- /dev/null +++ b/src/swig/GenomicInterval.i @@ -0,0 +1,13 @@ +/* GenomicInterval.i */ + +%module PacBioBam + +%{ +#include +using namespace PacBio; +using namespace PacBio::BAM; +%} + +%ignore PacBio::BAM::GenomicInterval::operator=; + +%include diff --git a/src/swig/GenomicIntervalQuery.i b/src/swig/GenomicIntervalQuery.i new file mode 100644 index 0000000..d3f9fa7 --- /dev/null +++ b/src/swig/GenomicIntervalQuery.i @@ -0,0 +1,11 @@ +/* GenomicIntervalQuery.i */ + +%module PacBioBam + +%{ +#include +using namespace PacBio; +using namespace PacBio::BAM; +%} + +%include diff --git a/src/swig/GroupQuery.i b/src/swig/GroupQuery.i new file mode 100644 index 0000000..3128e15 --- /dev/null +++ b/src/swig/GroupQuery.i @@ -0,0 +1,11 @@ +/* GroupQuery.i */ + +%module PacBioBam + +%{ +#include +using namespace PacBio; +using namespace PacBio::BAM; +%} + +%include \ No newline at end of file diff --git a/src/swig/GroupQueryBase.i b/src/swig/GroupQueryBase.i new file mode 100644 index 0000000..ade4526 --- /dev/null +++ b/src/swig/GroupQueryBase.i @@ -0,0 +1,32 @@ +/* GroupQueryBase.i */ + +%module PacBioBam + +%{ +#include +using namespace PacBio; +using namespace PacBio::BAM; +%} + +%ignore PacBio::BAM::GroupQueryIterator::operator++; +%ignore PacBio::BAM::GroupQueryConstIterator::operator++; + +%include + +%extend PacBio::BAM::GroupQueryIterator +{ + PacBio::BAM::GroupQueryIterator& incr(void) + { return $self->operator++(); } + + std::vector* value(void) + { return $self->operator->(); } +} + +%extend PacBio::BAM::GroupQueryConstIterator +{ + PacBio::BAM::GroupQueryConstIterator& incr(void) + { return $self->operator++(); } + + const std::vector* value(void) const + { return $self->operator->(); } +} \ No newline at end of file diff 
--git a/src/swig/IRecordWriter.i b/src/swig/IRecordWriter.i new file mode 100644 index 0000000..a64b083 --- /dev/null +++ b/src/swig/IRecordWriter.i @@ -0,0 +1,9 @@ +/* IRecordWriter.i */ +%module PacBioBam +%{ +#include +using namespace PacBio; +using namespace PacBio::BAM; +%} + +%include diff --git a/src/swig/IndexedFastaReader.i b/src/swig/IndexedFastaReader.i new file mode 100644 index 0000000..25ef650 --- /dev/null +++ b/src/swig/IndexedFastaReader.i @@ -0,0 +1,13 @@ +/* IndexedFastaReader.i */ + +%module PacBioBam + +%{ +#include +using namespace PacBio; +using namespace PacBio::BAM; +%} + +%ignore PacBio::BAM::IndexedFastaReader::operator=; // assignment operators not used + +%include diff --git a/src/swig/Interval.i b/src/swig/Interval.i new file mode 100644 index 0000000..0867eb4 --- /dev/null +++ b/src/swig/Interval.i @@ -0,0 +1,12 @@ +/* Interval.i */ +%module PacBioBam +%{ +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; +%} + +%include + +%template(PositionInterval) PacBio::BAM::Interval; diff --git a/src/swig/LocalContextFlags.i b/src/swig/LocalContextFlags.i new file mode 100644 index 0000000..66ee990 --- /dev/null +++ b/src/swig/LocalContextFlags.i @@ -0,0 +1,15 @@ +/* LocalContextFlags.i */ + +%module PacBioBam + +%{ +#include +using namespace PacBio; +using namespace PacBio::BAM; +%} + +#ifdef SWIGCSHARP +%ignore operator|(const LocalContextFlags, const LocalContextFlags); +#endif + +%include diff --git a/src/swig/Orientation.i b/src/swig/Orientation.i new file mode 100644 index 0000000..2f10a7a --- /dev/null +++ b/src/swig/Orientation.i @@ -0,0 +1,11 @@ +/* Orientation.i */ + +%module PacBioBam + +%{ +#include +using namespace PacBio; +using namespace PacBio::BAM; +%} + +%include \ No newline at end of file diff --git a/src/swig/PacBio.BAM.csproj.in b/src/swig/PacBio.BAM.csproj.in new file mode 100644 index 0000000..61872db --- /dev/null +++ b/src/swig/PacBio.BAM.csproj.in @@ -0,0 +1,37 @@ + + + + Debug + AnyCPU + 
{6E414044-5469-48E4-BA14-1B9888875DD5} + Library + PacBio.BAM + PacBio.BAM + v4.5 + + + true + full + false + bin\Debug + DEBUG; + prompt + 4 + false + + + full + true + bin\Release + prompt + 4 + false + + + + + + + + + diff --git a/src/swig/PacBioBam.i b/src/swig/PacBioBam.i new file mode 100644 index 0000000..213387f --- /dev/null +++ b/src/swig/PacBioBam.i @@ -0,0 +1,161 @@ +/* pbbam.i */ +%module PacBioBam +%{ + +/*ifdef SWIGR + define SWIG_SHARED_PTR_NAMESPACE boost + define SWIG_SHARED_PTR_SUBNAMESPACE +endif*/ + +#include +#include +#include +%} + +#define SWIG_FILE_WITH_INIT +#define PBBAM_EXPORT + +#ifdef SWIGCSHARP +%rename(Equals) *::operator==; +%rename(ToBool) *::operator bool; +%rename(ToInt) *::operator int; +%rename(ToUint8) *::operator uint8_t; + +%ignore *::operator !=; + +// Iterator interfaces are not useful outside of C++ +%ignore *::begin; +%ignore *::end; + +%csmethodmodifiers *::ToString() const "public override"; + +#endif // SWIGCSHARP + +/********* SWIG includes ************/ + +%include "stdint.i" +%include "std_common.i" + +#ifdef SWIGR +%include "boost_shared_ptr.i" +#else +%include "std_shared_ptr.i" +#endif + +%include "std_map.i" +%include "std_pair.i" +%include "std_string.i" +%include "std_vector.i" + + // TODO: can we call these vectors!? +%template(StringList) std::vector; +%template(IntList) std::vector; +%template(UIntList) std::vector; +%template(FloatList) std::vector; +%template(ShortList) std::vector; +%template(CharList) std::vector; + +// exception handling +%include "exception.i" +%exception { + try { + $action + } catch (const std::exception& e) { + SWIG_exception(SWIG_RuntimeError, e.what()); + } +} + +/********* PacBioBAM includes ************/ + +#ifdef SWIGCSHARP + // Renames to play nice with C# + // (These are used in the dataset support code, where things like + // this happen in C++: + // + // void Extensions(Extensions x) { ... } + // + // and this poses problems for C#. 
Renaming should be fine + // as it is doubtful we will refer to these classes by name anyway.) + // +%rename(ExtensionsType) PacBio::BAM::Extensions; +%rename(ExternalResourcesType) PacBio::BAM::ExternalResources; +%rename(FiltersType) PacBio::BAM::Filters; +%rename(SubDataSetsType) PacBio::BAM::SubDataSets; +%rename(ProvenanceType) PacBio::BAM::Provenance; +%rename(PropertiesType) PacBio::BAM::Properties; +%rename(FileIndicesType) PacBio::BAM::FileIndices; +%rename(ParentToolType) PacBio::BAM::ParentTool; +%rename(CigarType) PacBio::BAM::Cigar; +#endif + +// Basic types +%include "Accuracy.i" +%include "BamRecordTag.i" +%include "CigarOperation.i" +%include "ClipType.i" +%include "FrameEncodingType.i" +%include "Interval.i" +%include "LocalContextFlags.i" +%include "Orientation.i" +%include "Position.i" +%include "PulseBehavior.i" +%include "QualityValue.i" +%include "RecordType.i" +%include "Strand.i" +%include "Tag.i" + +// Basic type aggregates +%include "Cigar.i" +%include "GenomicInterval.i" +%include "QualityValues.i" +%include "TagCollection.i" + +// keep this guy after the other basic types, hacky but works +%include "Frames.i" + +// Header API +%include "ProgramInfo.i" +%include "ReadGroupInfo.i" +%include "SequenceInfo.i" +%include "BamHeader.i" + +// SAM/BAM format +%include "IRecordWriter.i" +%include "BamFile.i" +%include "BamRecordImpl.i" +%include "BamRecord.i" +%include "BamRecordBuilder.i" +%include "BamTagCodec.i" +%include "BamWriter.i" +%include "SamTagCodec.i" +%include "SamWriter.i" + +// DataSet +%include "DataSetTypes.i" +%include "DataSet.i" + +// Query/iterator API +%include "QueryBase.i" +%include "EntireFileQuery.i" +%include "GenomicIntervalQuery.i" +%include "ZmwQuery.i" +%include "ZmwGroupQuery.i" + +// PBI +%include "PbiFile.i" +%include "PbiRawData.i" +%include "PbiIndex.i" + +// FASTA +%include "IndexedFastaReader.i" + +// Virtual record API +%include "VirtualRegion.i" +%include "VirtualZmwBamRecord.i" +%include 
"ZmwReadStitcher.i" +%include "WhitelistedZmwReadStitcher.i" + +// Virtual record API - deprecated +%include "VirtualPolymeraseBamRecord.i" +%include "VirtualPolymeraseReader.i" +%include "ZmwWhitelistVirtualReader.i" diff --git a/src/swig/PbiFile.i b/src/swig/PbiFile.i new file mode 100644 index 0000000..096ff10 --- /dev/null +++ b/src/swig/PbiFile.i @@ -0,0 +1,11 @@ +/* PbiFile.i */ + +%module PacBioBam + +%{ +#include +using namespace PacBio; +using namespace PacBio::BAM; +%} + +%include \ No newline at end of file diff --git a/src/swig/PbiIndex.i b/src/swig/PbiIndex.i new file mode 100644 index 0000000..d903f67 --- /dev/null +++ b/src/swig/PbiIndex.i @@ -0,0 +1,18 @@ +/* PbiIndex.i */ + +%module PacBioBam + +%{ +#include +using namespace PacBio; +using namespace PacBio::BAM; +%} + +/*%ignore PacBio::BAM::IndexResultBlock::IndexResultBlock();*/ +%ignore PacBio::BAM::IndexResultBlock::IndexResultBlock(size_t, size_t); + +%ignore PacBio::BAM::PbiIndex::PbiIndex(PbiIndex&&); // move ctors not used +%ignore PacBio::BAM::PbiIndex::operator=; // assignment operators not used +%ignore PacBio::BAM::PbiIndex::VirtualFileOffsets; + +%include diff --git a/src/swig/PbiRawData.i b/src/swig/PbiRawData.i new file mode 100644 index 0000000..3db9ece --- /dev/null +++ b/src/swig/PbiRawData.i @@ -0,0 +1,35 @@ +/* PbiRawData.i */ + +%module PacBioBam + +%{ +#include +using namespace PacBio; +using namespace PacBio::BAM; +%} + +// move ctors not used +%ignore PacBio::BAM::PbiRawBarcodeData::PbiRawBarcodeData(PbiRawBarcodeData&&); +%ignore PacBio::BAM::PbiRawMappedData::PbiRawMappedData(PbiRawMappedData&&); +%ignore PacBio::BAM::PbiReferenceEntry::PbiReferenceEntry(PbiReferenceEntry&&); +%ignore PacBio::BAM::PbiRawReferenceData::PbiRawReferenceData(PbiRawReferenceData&&); +%ignore PacBio::BAM::PbiRawBasicData::PbiRawBasicData(PbiRawBasicData&&); +%ignore PacBio::BAM::PbiRawData::PbiRawData(PbiRawData&&); + +// assignment operators not used +%ignore 
PacBio::BAM::PbiRawBarcodeData::operator=; +%ignore PacBio::BAM::PbiRawMappedData::operator=; +%ignore PacBio::BAM::PbiReferenceEntry::operator=; +%ignore PacBio::BAM::PbiRawReferenceData::operator=; +%ignore PacBio::BAM::PbiRawBasicData::operator=; +%ignore PacBio::BAM::PbiRawData::operator=; + +#ifdef SWIGCSHARP +// ignore non-const accessors +%ignore PacBio::BAM::PbiRawData::BarcodeData(); +%ignore PacBio::BAM::PbiRawData::MappedData(); +%ignore PacBio::BAM::PbiRawData::ReferenceData(); +%ignore PacBio::BAM::PbiRawData::BasicData(); +#endif // C# + +%include diff --git a/src/swig/Position.i b/src/swig/Position.i new file mode 100644 index 0000000..9917024 --- /dev/null +++ b/src/swig/Position.i @@ -0,0 +1,11 @@ +/* Position.i */ + +%module PacBioBam + +%{ +#include +using namespace PacBio; +using namespace PacBio::BAM; +%} + +%include diff --git a/src/swig/ProgramInfo.i b/src/swig/ProgramInfo.i new file mode 100644 index 0000000..9f2a2aa --- /dev/null +++ b/src/swig/ProgramInfo.i @@ -0,0 +1,15 @@ +/* ProgramInfo.i */ + +%module PacBioBam + +%{ +#include +using namespace PacBio; +using namespace PacBio::BAM; +%} + +%ignore PacBio::BAM::ProgramInfo::ProgramInfo(ProgramInfo&&); +%ignore PacBio::BAM::ProgramInfo::operator=; +%ignore PacBio::BAM::ProgramInfo::ToSam(const ProgramInfo&); // ignore static method, to allow member + +%include \ No newline at end of file diff --git a/src/swig/PulseBehavior.i b/src/swig/PulseBehavior.i new file mode 100644 index 0000000..65d90f0 --- /dev/null +++ b/src/swig/PulseBehavior.i @@ -0,0 +1,11 @@ +/* PulseBehavior.i */ + +%module PacBioBam + +%{ +#include +using namespace PacBio; +using namespace PacBio::BAM; +%} + +%include diff --git a/src/swig/QualityValue.i b/src/swig/QualityValue.i new file mode 100644 index 0000000..29874e1 --- /dev/null +++ b/src/swig/QualityValue.i @@ -0,0 +1,20 @@ +/* QualityValue.i */ + +%module PacBioBam + +%{ +#include +using namespace PacBio; +using namespace PacBio::BAM; +%} + +%ignore 
PacBio::BAM::QualityValue::operator=; + +#ifdef SWIGPYTHON +%rename(__int__) PacBio::BAM::QualityValue::operator uint8_t; +#else // R, C# +%rename(ToInt) PacBio::BAM::QualityValue::operator uint8_t; +#endif + +%include + diff --git a/src/swig/QualityValues.i b/src/swig/QualityValues.i new file mode 100644 index 0000000..815a1b2 --- /dev/null +++ b/src/swig/QualityValues.i @@ -0,0 +1,17 @@ +/* QualityValues.i */ + +%module PacBioBam + +%{ +#include +using namespace PacBio; +using namespace PacBio::BAM; +%} + +%template(QualityValueList) std::vector; + +%ignore PacBio::BAM::QualityValues::operator=; +%ignore PacBio::BAM::QualityValues::QualityValues(QualityValues&&); +%ignore PacBio::BAM::QualityValues::QualityValues(std::vector&&); + +%include diff --git a/src/swig/QueryBase.i b/src/swig/QueryBase.i new file mode 100644 index 0000000..f2220d9 --- /dev/null +++ b/src/swig/QueryBase.i @@ -0,0 +1,127 @@ +/* QueryBase.i */ + +%module PacBioBam + +%{ + +#include + +using namespace PacBio; +using namespace PacBio::BAM; +%} + + +%ignore PacBio::BAM::QueryIterator::operator++; +%ignore PacBio::BAM::QueryConstIterator::operator++; + +%ignore PacBio::BAM::internal::QueryIterator::operator++; +%ignore PacBio::BAM::internal::QueryConstIterator::operator++; + +%typemap(csinterfaces) PacBio::BAM::internal::QueryBase "global::System.Collections.IEnumerable\n, global::System.Collections.Generic.IEnumerable\n"; +%typemap(cscode) PacBio::BAM::internal::QueryBase +%{ + + public global::System.Collections.Generic.IEnumerator GetEnumerator() + { + var i = this.cbegin(); + var e = this.cend(); + while (!i.Equals(e)) + { + yield return i.value(); + i.incr(); + } + } + + global::System.Collections.IEnumerator System.Collections.IEnumerable.GetEnumerator() + { + return GetEnumerator(); + } + +%} + +namespace std { + %template(BamRecordList) std::vector; +} + +%typemap(csinterfaces) PacBio::BAM::internal::QueryBase > "global::System.Collections.IEnumerable\n, 
global::System.Collections.Generic.IEnumerable\n"; +%typemap(cscode) PacBio::BAM::internal::QueryBase > +%{ + + public global::System.Collections.Generic.IEnumerator GetEnumerator() + { + var i = this.cbegin(); + var e = this.cend(); + while (!i.Equals(e)) + { + yield return i.value(); + i.incr(); + } + } + + global::System.Collections.IEnumerator System.Collections.IEnumerable.GetEnumerator() + { + return GetEnumerator(); + } + +%} + +%include + +%template(IQuery) PacBio::BAM::internal::QueryBase; +%template(IGroupQuery) PacBio::BAM::internal::QueryBase >; + +// IEnumerable interfaces for Queries +%template(BamQueryIteratorBase) PacBio::BAM::internal::QueryIteratorBase; +%template(BamGroupQueryIteratorBase) PacBio::BAM::internal::QueryIteratorBase >; +%template(BamQueryIterator) PacBio::BAM::internal::QueryIterator; +%template(BamGroupQueryIterator) PacBio::BAM::internal::QueryIterator >; +%template(BamQueryConstIterator) PacBio::BAM::internal::QueryConstIterator; +%template(BamGroupQueryConstIterator) PacBio::BAM::internal::QueryConstIterator >; + +// Iterator API +#ifdef SWIGPYTHON +%pythoncode %{ +def Iterate(c): + i = c.begin() + e = c.end() + while i != e: + yield i.value() + i.incr() +%} +#endif + +%extend PacBio::BAM::internal::QueryIterator +{ + PacBio::BAM::internal::QueryIterator& incr(void) + { return $self->operator++(); } + + PacBio::BAM::BamRecord* value(void) + { return $self->operator->(); } +} + +%extend PacBio::BAM::internal::QueryConstIterator +{ + PacBio::BAM::internal::QueryConstIterator& incr(void) + { return $self->operator++(); } + + const PacBio::BAM::BamRecord* value(void) const + { return $self->operator->(); } +} + +%extend PacBio::BAM::internal::QueryIterator > +{ + PacBio::BAM::internal::QueryIterator >& incr(void) + { return $self->operator++(); } + + std::vector* value(void) + { return $self->operator->(); } +} + +%extend PacBio::BAM::internal::QueryConstIterator > +{ + PacBio::BAM::internal::QueryConstIterator >& incr(void) + { 
return $self->operator++(); } + + const std::vector* value(void) const + { return $self->operator->(); } +} diff --git a/src/swig/ReadGroupInfo.i b/src/swig/ReadGroupInfo.i new file mode 100644 index 0000000..a02deda --- /dev/null +++ b/src/swig/ReadGroupInfo.i @@ -0,0 +1,15 @@ +/* ReadGroupInfo.i */ + +%module PacBioBam + +%{ +#include +using namespace PacBio; +using namespace PacBio::BAM; +%} + +%ignore PacBio::BAM::ReadGroupInfo::ReadGroupInfo(ReadGroupInfo&&); +%ignore PacBio::BAM::ReadGroupInfo::operator=; +%ignore PacBio::BAM::ReadGroupInfo::ToSam(const ReadGroupInfo&); + +%include \ No newline at end of file diff --git a/src/swig/RecordType.i b/src/swig/RecordType.i new file mode 100644 index 0000000..ef7947e --- /dev/null +++ b/src/swig/RecordType.i @@ -0,0 +1,11 @@ +/* RecordType.i */ + +%module PacBioBam + +%{ +#include +using namespace PacBio; +using namespace PacBio::BAM; +%} + +%include diff --git a/src/swig/SamTagCodec.i b/src/swig/SamTagCodec.i new file mode 100644 index 0000000..320d488 --- /dev/null +++ b/src/swig/SamTagCodec.i @@ -0,0 +1,11 @@ +/* SamTagCodec.i */ + +%module PacBioBam + +%{ +#include +using namespace PacBio; +using namespace PacBio::BAM; +%} + +%include \ No newline at end of file diff --git a/src/swig/SamWriter.i b/src/swig/SamWriter.i new file mode 100644 index 0000000..f01cec4 --- /dev/null +++ b/src/swig/SamWriter.i @@ -0,0 +1,15 @@ +/* SamWriter.i */ + +%module PacBioBam + +%{ +#include +using namespace PacBio; +using namespace PacBio::BAM; +%} + +%ignore PacBio::BAM::SamWriter(const SamWriter&); // copy ctor not used +%ignore PacBio::BAM::SamWriter(SamWriter&&); // move ctor not used +%ignore PacBio::BAM::SamWriter::operator=; // assignment operators not used + +%include diff --git a/src/swig/SequenceInfo.i b/src/swig/SequenceInfo.i new file mode 100644 index 0000000..3b0ce67 --- /dev/null +++ b/src/swig/SequenceInfo.i @@ -0,0 +1,15 @@ +/* SequenceInfo.i */ + +%module PacBioBam + +%{ +#include +using namespace PacBio; +using 
namespace PacBio::BAM; +%} + +%ignore PacBio::BAM::SequenceInfo::SequenceInfo(SequenceInfo&&); +%ignore PacBio::BAM::SequenceInfo::operator=; +%ignore PacBio::BAM::SequenceInfo::ToSam(const SequenceInfo&); // ignore static method, to allow member + +%include \ No newline at end of file diff --git a/src/swig/Strand.i b/src/swig/Strand.i new file mode 100644 index 0000000..96f71f1 --- /dev/null +++ b/src/swig/Strand.i @@ -0,0 +1,11 @@ +/* Strand.i */ + +%module PacBioBam + +%{ +#include +using namespace PacBio; +using namespace PacBio::BAM; +%} + +%include \ No newline at end of file diff --git a/src/swig/Tag.i b/src/swig/Tag.i new file mode 100644 index 0000000..832c856 --- /dev/null +++ b/src/swig/Tag.i @@ -0,0 +1,114 @@ +/* Tag.i */ + +%module PacBioBam + +%{ +#include +using namespace PacBio; +using namespace PacBio::BAM; +%} + +%ignore PacBio::BAM::Tag::Tag(Tag&&); +%ignore PacBio::BAM::Tag::operator=; + +#if defined(SWIGR) || defined(SWIGPYTHON) + +%ignore PacBio::BAM::Tag::Tag(int8_t value); +%ignore PacBio::BAM::Tag::Tag(uint8_t value); +%ignore PacBio::BAM::Tag::Tag(int16_t value); +%ignore PacBio::BAM::Tag::Tag(uint16_t value); +%ignore PacBio::BAM::Tag::Tag(int32_t value); +%ignore PacBio::BAM::Tag::Tag(uint32_t value); +%ignore PacBio::BAM::Tag::Tag(float value); +%ignore PacBio::BAM::Tag::Tag(const std::string& value); +%ignore PacBio::BAM::Tag::Tag(const std::vector& value); +%ignore PacBio::BAM::Tag::Tag(const std::vector& value); +%ignore PacBio::BAM::Tag::Tag(const std::vector& value); +%ignore PacBio::BAM::Tag::Tag(const std::vector& value); +%ignore PacBio::BAM::Tag::Tag(const std::vector& value); +%ignore PacBio::BAM::Tag::Tag(const std::vector& value); +%ignore PacBio::BAM::Tag::Tag(const std::vector& value); + +%extend PacBio::BAM::Tag { + + PacBio::BAM::Tag FromInt8(int x) { return PacBio::BAM::Tag(static_cast(x)); } + PacBio::BAM::Tag FromUInt8(int x) { return PacBio::BAM::Tag(static_cast(x)); } + PacBio::BAM::Tag FromInt16(int x) { return 
PacBio::BAM::Tag(static_cast(x)); } + PacBio::BAM::Tag FromUInt16(int x) { return PacBio::BAM::Tag(static_cast(x)); } + PacBio::BAM::Tag FromInt32(int x) { return PacBio::BAM::Tag(static_cast(x)); } + PacBio::BAM::Tag FromUInt32(int x) { return PacBio::BAM::Tag(static_cast(x)); } + PacBio::BAM::Tag FromFloat(float x) { return PacBio::BAM::Tag(static_cast(x)); } + + PacBio::BAM::Tag FromInt8Array(const std::vector& v) + { + std::vector result; + const size_t numElements = v.size(); + result.reserve(numElements); + for (size_t i = 0; i < numElements; ++i) + result.push_back(static_cast(v.at(i))); + return PacBio::BAM::Tag(result); + } + + PacBio::BAM::Tag FromUInt8Array(const std::vector& v) + { + std::vector result; + const size_t numElements = v.size(); + result.reserve(numElements); + for (size_t i = 0; i < numElements; ++i) + result.push_back(static_cast(v.at(i))); + return PacBio::BAM::Tag(result); + } + + PacBio::BAM::Tag FromInt16Array(const std::vector& v) + { + std::vector result; + const size_t numElements = v.size(); + result.reserve(numElements); + for (size_t i = 0; i < numElements; ++i) + result.push_back(static_cast(v.at(i))); + return PacBio::BAM::Tag(result); + } + + PacBio::BAM::Tag FromUInt16Array(const std::vector& v) + { + std::vector result; + const size_t numElements = v.size(); + result.reserve(numElements); + for (size_t i = 0; i < numElements; ++i) + result.push_back(static_cast(v.at(i))); + return PacBio::BAM::Tag(result); + } + + PacBio::BAM::Tag FromInt32Array(const std::vector& v) + { + std::vector result; + const size_t numElements = v.size(); + result.reserve(numElements); + for (size_t i = 0; i < numElements; ++i) + result.push_back(static_cast(v.at(i))); + return PacBio::BAM::Tag(result); + } + + PacBio::BAM::Tag FromUInt32Array(const std::vector& v) + { + std::vector result; + const size_t numElements = v.size(); + result.reserve(numElements); + for (size_t i = 0; i < numElements; ++i) + result.push_back(static_cast(v.at(i))); + 
return PacBio::BAM::Tag(result); + } + + PacBio::BAM::Tag FromFloatArray(const std::vector& v) + { + std::vector result; + const size_t numElements = v.size(); + result.reserve(numElements); + for (size_t i = 0; i < numElements; ++i) + result.push_back(static_cast(v.at(i))); + return PacBio::BAM::Tag(result); + } +} +#endif // SWIGR || SWIGPYTHON + +%include diff --git a/src/swig/TagCollection.i b/src/swig/TagCollection.i new file mode 100644 index 0000000..bcfc707 --- /dev/null +++ b/src/swig/TagCollection.i @@ -0,0 +1,13 @@ +/* TagCollection.i */ + +%module PacBioBam + +%{ +#include +using namespace PacBio; +using namespace PacBio::BAM; +%} + +%template(TagCollectionType) std::map; + +%include \ No newline at end of file diff --git a/src/swig/VirtualPolymeraseBamRecord.i b/src/swig/VirtualPolymeraseBamRecord.i new file mode 100644 index 0000000..a74e673 --- /dev/null +++ b/src/swig/VirtualPolymeraseBamRecord.i @@ -0,0 +1,36 @@ +/* VirtualPolymeraseBamRecord.i */ + +%module PacBioBam + +%{ +#include +#include +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; +typedef PacBio::BAM::VirtualZmwBamRecord VirtualPolymeraseBamRecord; +%} + +///*%ignore PacBio::BAM::VirtualPolymeraseBamRecord::VirtualPolymeraseBamRecord(const VirtualPolymeraseBamRecord&);*/ +//%ignore PacBio::BAM::VirtualPolymeraseBamRecord::VirtualPolymeraseBamRecord(VirtualPolymeraseBamRecord&&); +//%ignore PacBio::BAM::VirtualPolymeraseBamRecord::operator=; + +//// disabled - can't get it to work right (at least in Python) +//// but the same info is available (& correct) from record.VirtualRegionsTable(regionType) +//%ignore PacBio::BAM::VirtualPolymeraseBamRecord::VirtualRegionsMap; + +//%template(VirtualRegionList) std::vector; +//%template(VirtualRegionsMap) std::map >; + +%include +%include +typedef PacBio::BAM::VirtualZmwBamRecord VirtualPolymeraseBamRecord; + +#ifdef SWIGPYTHON +%pythoncode %{ + +VirtualPolymeraseBamRecord = VirtualZmwBamRecord + +%} +#endif diff --git 
a/src/swig/VirtualPolymeraseReader.i b/src/swig/VirtualPolymeraseReader.i new file mode 100644 index 0000000..238768f --- /dev/null +++ b/src/swig/VirtualPolymeraseReader.i @@ -0,0 +1,23 @@ +/* VirtualPolymeraseReader.i */ + +%module PacBioBam + +%{ +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; +typedef PacBio::BAM::ZmwReadStitcher VirtualPolymeraseReader; +%} + +%include +%include +typedef PacBio::BAM::ZmwReadStitcher VirtualPolymeraseReader; + +#ifdef SWIGPYTHON +%pythoncode %{ + +VirtualPolymeraseReader = ZmwReadStitcher + +%} +#endif \ No newline at end of file diff --git a/src/swig/VirtualRegion.i b/src/swig/VirtualRegion.i new file mode 100644 index 0000000..2436de2 --- /dev/null +++ b/src/swig/VirtualRegion.i @@ -0,0 +1,18 @@ +/* VirtualRegion.i */ + +%module PacBioBam + +%{ +#include +#include +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; +%} + +%ignore PacBio::BAM::VirtualRegion::VirtualRegion(VirtualRegion&&); +%ignore PacBio::BAM::VirtualRegion::operator=; + +%include +%include diff --git a/src/swig/VirtualZmwBamRecord.i b/src/swig/VirtualZmwBamRecord.i new file mode 100644 index 0000000..edae5fe --- /dev/null +++ b/src/swig/VirtualZmwBamRecord.i @@ -0,0 +1,26 @@ +/* VirtualZmwBamRecord.i */ + +%module PacBioBam + +%{ +#include +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; +%} + +%feature("valuewrapper") PacBio::BAM::VirtualZmwBamRecord; + +/*%ignore PacBio::BAM::VirtualZmwBamRecord::VirtualZmwBamRecord(const VirtualZmwBamRecord&);*/ +%ignore PacBio::BAM::VirtualZmwBamRecord::VirtualZmwBamRecord(VirtualZmwBamRecord&&); +%ignore PacBio::BAM::VirtualZmwBamRecord::operator=; + +// disabled - can't get it to work right (at least in Python) +// but the same info is available (& correct) from record.VirtualRegionsTable(regionType) +%ignore PacBio::BAM::VirtualZmwBamRecord::VirtualRegionsMap; + +%template(VirtualRegionList) std::vector; +%template(VirtualRegionsMap) std::map 
>; + +%include \ No newline at end of file diff --git a/src/swig/WhitelistedZmwReadStitcher.i b/src/swig/WhitelistedZmwReadStitcher.i new file mode 100644 index 0000000..5ecd9d7 --- /dev/null +++ b/src/swig/WhitelistedZmwReadStitcher.i @@ -0,0 +1,11 @@ +/* WhitelistedZmwReadStitcher.i */ + +%module PacBioBam + +%{ +#include +using namespace PacBio; +using namespace PacBio::BAM; +%} + +%include \ No newline at end of file diff --git a/src/swig/WrapCSharp.cmake b/src/swig/WrapCSharp.cmake new file mode 100644 index 0000000..cb44d74 --- /dev/null +++ b/src/swig/WrapCSharp.cmake @@ -0,0 +1,46 @@ + +find_package(CSharp REQUIRED) +include (${CSHARP_USE_FILE}) + +set(PacBioBAM_CSharpLibDir ${PacBioBAM_LibDir}/csharp/PacBio.BAM) +set(PacBioBAM_CSharpDLL ${PacBioBAM_CSharpLibDir}/bin/Debug/PacBio.BAM.dll) +set(CSharpTestRootDir ${PacBioBAM_TestsDir}/src/CSharp) +set(NativeLibraryPaths ${PacBioBAM_CSharpLibDir}:${PacBioBAM_LibDir}:${Htslib_LibDir}) + +# +# Create SWIG wrapper +# +file(MAKE_DIRECTORY ${PacBioBAM_CSharpLibDir}) +set(CMAKE_SWIG_OUTDIR ${PacBioBAM_CSharpLibDir}) # ensure any swig files in lib/csharp +set_source_files_properties( + PacBioBam.i PROPERTIES + CPLUSPLUS ON + SWIG_FLAGS "-namespace;PacBio.BAM") +swig_add_module(PacBioBam csharp PacBioBam.i) +swig_link_libraries(PacBioBam ${PacBioBAM_LIBRARIES}) # add any C# libs you need from CSharp.cmake +set_target_properties( + ${SWIG_MODULE_PacBioBam_REAL_NAME} # ensure wrapper lib in lib/csharp + PROPERTIES + LIBRARY_OUTPUT_DIRECTORY ${PacBioBAM_CSharpLibDir} +) +add_dependencies(${SWIG_MODULE_PacBioBam_REAL_NAME} pbbam) + +# +# Write a csproj, then shell out to build and check the assembly--- +# can't get it working nicely in CMake yet +# +configure_file( + ${PacBioBAM_SwigSourceDir}/PacBio.BAM.csproj.in + ${PacBioBAM_CSharpLibDir}/PacBio.BAM.csproj) +configure_file( + ${CSharpTestRootDir}/TestPbbam.cs.in + ${CSharpTestRootDir}/TestPbbam.cs) +configure_file( + ${CSharpTestRootDir}/buildAssembly.sh.in + 
buildAssembly.sh) +add_custom_command( + OUTPUT ${PacBioBAM_CSharpDLL} + DEPENDS ${SWIG_MODULE_PacBioBam_REAL_NAME} + COMMAND bash ./buildAssembly.sh "${HTSLIB_LIBRARIES}" +) +add_custom_target(CSharpAssembly ALL DEPENDS ${PacBioBAM_CSharpDLL}) diff --git a/src/swig/WrapPython.cmake b/src/swig/WrapPython.cmake new file mode 100644 index 0000000..719c5c2 --- /dev/null +++ b/src/swig/WrapPython.cmake @@ -0,0 +1,55 @@ + +# setup +find_package(PythonLibs REQUIRED) +include_directories(${PYTHON_INCLUDE_PATH}) +set(PacBioBAM_PythonLibDir ${PacBioBAM_LibDir}/python) +set(PythonTestRootDir ${PacBioBAM_TestsDir}/src/python) + +# create wrapper +file(MAKE_DIRECTORY ${PacBioBAM_PythonLibDir}) +set(CMAKE_SWIG_OUTDIR ${PacBioBAM_PythonLibDir}) # put PacBioBam.py in lib/python + +swig_add_module(PacBioBam python PacBioBam.i) +swig_link_libraries(PacBioBam ${PacBioBAM_LIBRARIES} ${PYTHON_LIBRARIES}) +set_target_properties( + ${SWIG_MODULE_PacBioBam_REAL_NAME} # put _PacBioBam.so in lib/python + PROPERTIES + LIBRARY_OUTPUT_DIRECTORY ${PacBioBAM_PythonLibDir} +) +#add_dependencies(${SWIG_MODULE_PacBioBam_REAL_NAME} pbbam ${PacBioBAM_LIBRARIES}) +target_link_libraries(${SWIG_MODULE_PacBioBam_REAL_NAME} pbbam) + +# simple "wrapper worked" check +# this is run every build, to check importing from Python, but does NOT run full Python-side unit tests +add_custom_target( + check_swig_python + ALL + "PYTHONPATH=${PacBioBAM_PythonLibDir}" python check_swig.py + COMMENT "Checking Python wrapper" + WORKING_DIRECTORY ${PythonTestRootDir} +) +add_dependencies(check_swig_python ${SWIG_MODULE_PacBioBam_REAL_NAME}) + +# unit tests +if(PacBioBAM_build_tests) + + # configure data directory info + configure_file( + ${PythonTestRootDir}/test/config.py.in + ${PythonTestRootDir}/test/config.py + ) + + # test runner + add_test( + NAME PythonUnitTests + WORKING_DIRECTORY ${PythonTestRootDir} + COMMAND "python" test_pbbam.py + ) + set_tests_properties( + PythonUnitTests + PROPERTIES + ENVIRONMENT 
"PYTHONPATH=${PacBioBAM_PythonLibDir}" + ) + +endif() # unit tests + diff --git a/src/swig/WrapR.cmake b/src/swig/WrapR.cmake new file mode 100644 index 0000000..5dccedd --- /dev/null +++ b/src/swig/WrapR.cmake @@ -0,0 +1,71 @@ +# setup +set(R_INCLUDE_DIR_HINT /mnt/software/r/R/3.1.1/usr/share/R/include) # TODO: hard-coded hint for now, clean up later +find_package(R REQUIRED) +include_directories(${R_INCLUDE_DIR}) +set(PacBioBAM_RLibDir ${PacBioBAM_LibDir}/R) +set(RTestRootDir ${PacBioBAM_TestsDir}/src/R) + +# Suppress warnings from generated code +include(CheckCXXCompilerFlag) +check_cxx_compiler_flag("-Wno-unused-parameter" HAS_NO_UNUSED_PARAMETER) +if(HAS_NO_UNUSED_PARAMETER) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-parameter") +endif() + +# SWIG R does not support PBBAM_SHARED_PTR, but it does support boost::shared_ptr +# So force boost if we're wrapping for R. +add_definitions(-DPBBAM_USE_BOOST_SHARED_PTR) + +# create wrapper & library +file(MAKE_DIRECTORY ${PacBioBAM_RLibDir}) +set(CMAKE_SWIG_OUTDIR ${PacBioBAM_RLibDir}) # put PacBioBam.R wrapper in lib/R +swig_add_module(PacBioBam r PacBioBam.i) +swig_link_libraries(PacBioBam ${PacBioBAM_LIBRARIES}) +if(R_LIBRARIES) + swig_link_libraries(PacBioBam ${R_LIBRARIES}) +endif() + +# make sure the library is named "PacBioBam.so" explicitly +# no "lib" prefix... 
that gets in the way of the name lookups between SWIG/R +# and make sure library ends up in lib/R +set_target_properties( + ${SWIG_MODULE_PacBioBam_REAL_NAME} + PROPERTIES + LIBRARY_OUTPUT_DIRECTORY ${PacBioBAM_RLibDir} + RUNTIME_OUTPUT_DIRECTORY ${PacBioBAM_RLibDir} + SONAME PacBioBam.so + PREFIX "" +) +add_dependencies(${SWIG_MODULE_PacBioBam_REAL_NAME} pbbam) + +# simple "wrapper worked" check +configure_file( + ${RTestRootDir}/check_swig.R.in + ${RTestRootDir}/check_swig.R +) + +add_custom_target( + check_swig_R + ALL + "R" --slave --no-save < ${RTestRootDir}/check_swig.R + COMMENT "Checking R wrapper" + WORKING_DIRECTORY ${PacBioBAM_RLibDir} +) +add_dependencies(check_swig_R ${SWIG_MODULE_PacBioBam_REAL_NAME}) + +# unit tests +if(PacBioBAM_build_tests) + + # configure script + configure_file( + ${RTestRootDir}/test_pbbam.sh.in + ${RTestRootDir}/test_pbbam.sh + ) + + # test runner + add_test( + NAME RUnitTests + COMMAND "sh" ${RTestRootDir}/test_pbbam.sh + WORKING_DIRECTORY ${PacBioBAM_RLibDir} + ) +endif() diff --git a/src/swig/ZmwGroupQuery.i b/src/swig/ZmwGroupQuery.i new file mode 100644 index 0000000..c020eb5 --- /dev/null +++ b/src/swig/ZmwGroupQuery.i @@ -0,0 +1,14 @@ +/* ZmwGroupQuery.i */ + +%module PacBioBam + +%{ +#include +#include + +using namespace PacBio; +using namespace PacBio::BAM; +%} + +%include +%include diff --git a/src/swig/ZmwQuery.i b/src/swig/ZmwQuery.i new file mode 100644 index 0000000..8ad33d7 --- /dev/null +++ b/src/swig/ZmwQuery.i @@ -0,0 +1,11 @@ +/* ZmwQuery.i */ + +%module PacBioBam + +%{ +#include +using namespace PacBio; +using namespace PacBio::BAM; +%} + +%include \ No newline at end of file diff --git a/src/swig/ZmwReadStitcher.i b/src/swig/ZmwReadStitcher.i new file mode 100644 index 0000000..1eadcaf --- /dev/null +++ b/src/swig/ZmwReadStitcher.i @@ -0,0 +1,11 @@ +/* ZmwReadStitcher.i */ + +%module PacBioBam + +%{ +#include +using namespace PacBio; +using namespace PacBio::BAM; +%} + +%include \ No newline at end of file 
diff --git a/src/swig/ZmwWhitelistVirtualReader.i b/src/swig/ZmwWhitelistVirtualReader.i new file mode 100644 index 0000000..36f7e74 --- /dev/null +++ b/src/swig/ZmwWhitelistVirtualReader.i @@ -0,0 +1,23 @@ +/* ZmwWhitelistVirtualReader.i */ + +%module PacBioBam + +%{ +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; +typedef PacBio::BAM::WhitelistedZmwReadStitcher ZmwWhitelistVirtualReader; +%} + +%include +%include +typedef PacBio::BAM::WhitelistedZmwReadStitcher ZmwWhitelistVirtualReader; + +#ifdef SWIGPYTHON +%pythoncode %{ + +ZmwWhitelistVirtualReader = WhitelistedZmwReadStitcher + +%} +#endif diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt new file mode 100644 index 0000000..bbcc1e5 --- /dev/null +++ b/tests/CMakeLists.txt @@ -0,0 +1,80 @@ + +if(PacBioBAM_build_tests) + + # setup GoogleTest + if (NOT GTEST_SRC_DIR) + set(PREBUILT_GTEST_SRC ${PacBioBAM_RootDir}/../../../../prebuilt.tmpout/gtest/gtest_1.7.0/) + if(EXISTS ${PREBUILT_GTEST_SRC}) + set(GTEST_SRC_DIR ${PREBUILT_GTEST_SRC}) + else() + set(GTEST_SRC_DIR ${PacBioBAM_RootDir}/../gtest) # keep old fallback behavior for external builds, for now at least + endif() + endif() + add_subdirectory(${GTEST_SRC_DIR} external/gtest/build) + + # generate paths/values used by for unit tests + configure_file( + ${PacBioBAM_TestsDir}/src/TestData.h.in + ${CMAKE_BINARY_DIR}/generated/TestData.h + ) + configure_file( + ${PacBioBAM_TestsDir}/data/group/group.fofn.in + ${CMAKE_BINARY_DIR}/generated/group.fofn + ) + + # grab PacBioBAM unit test source files + include(files.cmake) + set(SOURCES + ${PacBioBAMTest_H} + ${PacBioBAMTest_CPP} + ) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${PacBioBAM_CXX_FLAGS}") + + # define unit test executable + add_definitions(-DPBBAM_TESTING) + if(MSVC) + # VS2012+ pooh-pooh's Derek's "#define private public" trick + add_definitions(-D_ALLOW_KEYWORD_MACROS) + endif() + + if(PacBioBAM_wrap_r) + # SWIG R does not support std::shared_ptr, but it does support 
boost::shared_ptr + # So force boost if we're wrapping for R. + add_definitions(-DPBBAM_USE_BOOST_SHARED_PTR) + endif() + + add_executable(test_pbbam ${SOURCES}) + set_target_properties(test_pbbam PROPERTIES + RUNTIME_OUTPUT_DIRECTORY ${PacBioBAM_BinDir} + ) + target_include_directories(test_pbbam + PUBLIC + ${CMAKE_BINARY_DIR}/generated + ${PacBioBAM_INCLUDE_DIRS} + ${gtest_SOURCE_DIR}/include + ${gtest_SOURCE_DIR} + ) + + # generate test data + add_custom_target( + generate_test_data + WORKING_DIRECTORY ${PacBioBAM_TestsDir}/scripts + COMMAND "python" generate_data.py + ${PacBioBAM_TestsDir}/data/ + ${GeneratedTestDataDir} + ) + + # add unit tests to test framework + add_test( + NAME UnitTests + WORKING_DIRECTORY ${PacBioBAM_BinDir} + COMMAND test_pbbam + ) + add_dependencies(test_pbbam generate_test_data) + target_link_libraries(test_pbbam + pbbam + ${CMAKE_THREAD_LIBS_INIT} # quirky pthreads + gtest + gtest_main + ) +endif() # PacBioBAM_build_tests diff --git a/tests/data/aligned.bam b/tests/data/aligned.bam new file mode 100644 index 0000000..34d81e5 Binary files /dev/null and b/tests/data/aligned.bam differ diff --git a/tests/data/aligned.bam.bai b/tests/data/aligned.bam.bai new file mode 100644 index 0000000..66ba855 Binary files /dev/null and b/tests/data/aligned.bam.bai differ diff --git a/tests/data/aligned.bam.pbi b/tests/data/aligned.bam.pbi new file mode 100644 index 0000000..f2cf207 Binary files /dev/null and b/tests/data/aligned.bam.pbi differ diff --git a/tests/data/aligned.sam b/tests/data/aligned.sam new file mode 100644 index 0000000..ad45e63 --- /dev/null +++ b/tests/data/aligned.sam @@ -0,0 +1,8 @@ +@HD VN:1.3.1 SO:coordinate pb:3.0.3 +@SQ SN:lambda_NEB3011 LN:48502 M5:a1319ff90e994c8190a4fe6569d0822a +@RG ID:0d7b28fa PL:PACBIO DS:READTYPE=SUBREAD;BINDINGKIT=100-619-300;SEQUENCINGKIT=100-619-400;BASECALLERVERSION=3.0;FRAMERATEHZ=100 PU:singleInsertion PM:SEQUEL +@PG ID:bwa PN:bwa VN:0.7.10-r1017-dirty CL:bwa mem lambdaNEB.fa 
singleInsertion.fasta +singleInsertion/100/0_49 2048 lambda_NEB3011 5211 60 3H8=1D19=1I21=59H * 0 0 GGCTGCAGGTACAGCGGTCAGGAGGCCAATTGATGCCGGACTGGCTGAT * NM:i:2 MD:Z:8^T40 AS:i:34 XS:i:0 RG:Z:0d7b28fa SA:Z:lambda_NEB3011,9378,+,52S37=2D10=1I11=,60,3; qe:i:49 qs:i:0 np:i:1 zm:i:100 rq:f:0.6 sn:B:f,0.2,0.2,0.2,0.2 +singleInsertion/200/0_49 2048 lambda_NEB3011 5211 60 3H8=1D19=1I21=59H * 0 0 GGCTGCAGGTACAGCGGTCAGGAGGCCAATTGATGCCGGACTGGCTGAT * NM:i:2 MD:Z:8^T40 AS:i:34 XS:i:0 RG:Z:0d7b28fa SA:Z:lambda_NEB3011,9378,-,37=2D10=1I11=52S,60,3; qe:i:49 qs:i:0 np:i:1 zm:i:200 rq:f:0.6 sn:B:f,0.2,0.2,0.2,0.2 +singleInsertion/100/0_111 0 lambda_NEB3011 9378 60 52S37=2D10=1I11= * 0 0 TTTGGCTGCAGGTACAGCGGTCAGGAGGCCAATTGATGCCGGACTGGCTGATAAGTCACCAATGTGGGACGTCCGTCGATGGCAGAAGAGCAGCACGGTAAACAGCGGCAA * NM:i:3 MD:Z:37^TC21 AS:i:43 XS:i:0 RG:Z:0d7b28fa SA:Z:lambda_NEB3011,5211,+,3S8=1D19=1I21=59S,60,2; qe:i:111 qs:i:0 np:i:1 zm:i:100 rq:f:0.6 sn:B:f,0.2,0.2,0.2,0.2 +singleInsertion/100/0_111 16 lambda_NEB3011 9378 60 37=2D10=1I11=52S * 0 0 AAGTCACCAATGTGGGACGTCCGTCGATGGCAGAAGAGCAGCACGGTAAACAGCGGCAAATCAGCCAGTCCGGCATCAATTGGCCTCCTGACCGCTGTACCTGCAGCCAAA * NM:i:3 MD:Z:37^TC21 AS:i:43 XS:i:0 RG:Z:0d7b28fa SA:Z:lambda_NEB3011,5211,+,3S8=1D19=1I21=59S,60,2; qe:i:111 qs:i:0 np:i:1 zm:i:100 rq:f:0.6 sn:B:f,0.2,0.2,0.2,0.2 diff --git a/tests/data/aligned2.bam b/tests/data/aligned2.bam new file mode 100644 index 0000000..672e5e5 Binary files /dev/null and b/tests/data/aligned2.bam differ diff --git a/tests/data/aligned2.bam.bai b/tests/data/aligned2.bam.bai new file mode 100644 index 0000000..f954ab0 Binary files /dev/null and b/tests/data/aligned2.bam.bai differ diff --git a/tests/data/aligned2.bam.pbi b/tests/data/aligned2.bam.pbi new file mode 100644 index 0000000..c1e82de Binary files /dev/null and b/tests/data/aligned2.bam.pbi differ diff --git a/tests/data/chunking/chunking.subreadset.xml b/tests/data/chunking/chunking.subreadset.xml new file mode 100644 index 0000000..6d15ff1 --- /dev/null +++ 
b/tests/data/chunking/chunking.subreadset.xml @@ -0,0 +1,65 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.1.subreads.bam b/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.1.subreads.bam new file mode 100644 index 0000000..c4ec7ea Binary files /dev/null and b/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.1.subreads.bam differ diff --git a/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.1.subreads.bam.pbi b/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.1.subreads.bam.pbi new file mode 100644 index 0000000..4af87e2 Binary files /dev/null and b/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.1.subreads.bam.pbi differ diff --git a/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.2.subreads.bam b/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.2.subreads.bam new file mode 100644 index 0000000..e623aca Binary files /dev/null and b/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.2.subreads.bam differ diff --git a/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.2.subreads.bam.pbi b/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.2.subreads.bam.pbi new file mode 100644 index 0000000..6479979 Binary files /dev/null and b/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.2.subreads.bam.pbi differ diff --git a/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.3.subreads.bam b/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.3.subreads.bam new file mode 100644 index 0000000..8544f6a Binary files /dev/null and 
b/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.3.subreads.bam differ diff --git a/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.3.subreads.bam.pbi b/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.3.subreads.bam.pbi new file mode 100644 index 0000000..a9f4edb Binary files /dev/null and b/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.3.subreads.bam.pbi differ diff --git a/tests/data/dataset/ali1.xml b/tests/data/dataset/ali1.xml new file mode 100644 index 0000000..ab0a82a --- /dev/null +++ b/tests/data/dataset/ali1.xml @@ -0,0 +1,35 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tests/data/dataset/ali2.xml b/tests/data/dataset/ali2.xml new file mode 100644 index 0000000..c35f9ec --- /dev/null +++ b/tests/data/dataset/ali2.xml @@ -0,0 +1,35 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tests/data/dataset/ali3.xml b/tests/data/dataset/ali3.xml new file mode 100644 index 0000000..f58d25f --- /dev/null +++ b/tests/data/dataset/ali3.xml @@ -0,0 +1,35 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tests/data/dataset/ali4.xml b/tests/data/dataset/ali4.xml new file mode 100644 index 0000000..ab0a82a --- /dev/null +++ b/tests/data/dataset/ali4.xml @@ -0,0 +1,35 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tests/data/dataset/bam_mapping.bam b/tests/data/dataset/bam_mapping.bam new file mode 100644 index 0000000..2d4ae7b Binary files /dev/null and b/tests/data/dataset/bam_mapping.bam differ diff --git a/tests/data/dataset/bam_mapping.bam.pbi b/tests/data/dataset/bam_mapping.bam.pbi new file mode 100644 index 0000000..fe7c3be Binary files /dev/null and b/tests/data/dataset/bam_mapping.bam.pbi differ diff --git a/tests/data/dataset/bam_mapping_1.bam 
b/tests/data/dataset/bam_mapping_1.bam new file mode 100644 index 0000000..1e9670e Binary files /dev/null and b/tests/data/dataset/bam_mapping_1.bam differ diff --git a/tests/data/dataset/bam_mapping_1.bam.pbi b/tests/data/dataset/bam_mapping_1.bam.pbi new file mode 100644 index 0000000..d99a174 Binary files /dev/null and b/tests/data/dataset/bam_mapping_1.bam.pbi differ diff --git a/tests/data/dataset/bam_mapping_2.bam b/tests/data/dataset/bam_mapping_2.bam new file mode 100644 index 0000000..09678ea Binary files /dev/null and b/tests/data/dataset/bam_mapping_2.bam differ diff --git a/tests/data/dataset/bam_mapping_2.bam.pbi b/tests/data/dataset/bam_mapping_2.bam.pbi new file mode 100644 index 0000000..d1765ef Binary files /dev/null and b/tests/data/dataset/bam_mapping_2.bam.pbi differ diff --git a/tests/data/dataset/bam_mapping_new.bam b/tests/data/dataset/bam_mapping_new.bam new file mode 100644 index 0000000..3039331 Binary files /dev/null and b/tests/data/dataset/bam_mapping_new.bam differ diff --git a/tests/data/dataset/bam_mapping_new.bam.pbi b/tests/data/dataset/bam_mapping_new.bam.pbi new file mode 100644 index 0000000..82d497c Binary files /dev/null and b/tests/data/dataset/bam_mapping_new.bam.pbi differ diff --git a/tests/data/dataset/bam_mapping_staggered.xml b/tests/data/dataset/bam_mapping_staggered.xml new file mode 100644 index 0000000..879c193 --- /dev/null +++ b/tests/data/dataset/bam_mapping_staggered.xml @@ -0,0 +1,35 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tests/data/dataset/barcode.dataset.xml b/tests/data/dataset/barcode.dataset.xml new file mode 100644 index 0000000..1fbbb18 --- /dev/null +++ b/tests/data/dataset/barcode.dataset.xml @@ -0,0 +1,11 @@ + + + + + + + 400 + 30 + paired + + diff --git a/tests/data/dataset/ccsread.dataset.xml b/tests/data/dataset/ccsread.dataset.xml new file mode 100644 index 0000000..97b5943 --- /dev/null +++ b/tests/data/dataset/ccsread.dataset.xml @@ -0,0 +1,15 @@ + 
+ + + + + + + + + + + + + + diff --git a/tests/data/dataset/lambda_contigs.xml b/tests/data/dataset/lambda_contigs.xml new file mode 100644 index 0000000..4abc8cc --- /dev/null +++ b/tests/data/dataset/lambda_contigs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/tests/data/dataset/malformed.xml b/tests/data/dataset/malformed.xml new file mode 100644 index 0000000..31e0942 --- /dev/null +++ b/tests/data/dataset/malformed.xml @@ -0,0 +1,85 @@ + + + + + + + 50000000 + 150000 + + + 2.3.0.1.142990 + NRT@172.31.128.10:8082, SwVer=2301.142990, HwVer=1.0 + + r000013_42267_150403 + Inst42267-040315-SAT-100pM-2kb-P6C4 + + + Inst42267-040315-SAT-100pM-2kb-P6C4 + Inst42267-040315-SAT-100pM-2kb-P6C4 + 0.0 + false + false + false + 1 + + 251acf71-9eb0-489e-9dd1-cdbd11432752 + + + + + + + + 7 + 4 + + + BasecallerV1 + 2-3-0_P6-C4.xml + + + Analysis_Results + rsy://mp-rsync/vol55//RS_DATA_STAGING/42267/Inst42267-040315-SAT-100pM-2kb-P6C4_13/A04_7/ + + Fasta + + Bases + Minimal + + + + + + + + + diff --git a/tests/data/dataset/pbalchemy10kbp.xml b/tests/data/dataset/pbalchemy10kbp.xml new file mode 100644 index 0000000..96189ad --- /dev/null +++ b/tests/data/dataset/pbalchemy10kbp.xml @@ -0,0 +1,17 @@ + + + + + + + + + + + + + + + + + diff --git a/tests/data/dataset/reference.dataset.xml b/tests/data/dataset/reference.dataset.xml new file mode 100644 index 0000000..3cfbe8c --- /dev/null +++ b/tests/data/dataset/reference.dataset.xml @@ -0,0 +1,20 @@ + + + + + + + + + + + + 5000000 + 500 + Tribble + Diploid + + + + + diff --git a/tests/data/dataset/subread_dataset1.xml b/tests/data/dataset/subread_dataset1.xml new file mode 100644 index 0000000..1d64e79 --- /dev/null +++ b/tests/data/dataset/subread_dataset1.xml @@ -0,0 +1,76 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + 500000 + 500 + + + 2.3.0.0.140640 + NRT@172.31.128.10:8082, SwVer=2300.140640, HwVer=1.0 + + e903682f-e502-465c-a2b6-9dd77c9f43fc + beta4_130726_biotin_DEV_vs_MFG_PB11K_9458p + 
+ + 2014-12-24_141_NGAT_Igor_bisPNA Enrichment_Mag Bead Elution Buffers + B01 + 10 + true + true + true + 0 + Lorem ipsum + + abc2df90-d44f-4a48-9f35-3b99473c68f5 + + + + + + + + 0 + 0 + + + BasecallerV1 + 1-3-0_Standard_C2.xml + + Analysis_Results + rsy://mp-rsync/vol56//RS_DATA_STAGING//2014-12-24_141_NGAT_Igor_bisPNA%20Enrichment_Mag%20Bead%20Elution%20Buffers_1094/B01_1 + + Bam + + + + + + + + + + diff --git a/tests/data/dataset/subread_dataset2.xml b/tests/data/dataset/subread_dataset2.xml new file mode 100644 index 0000000..a395330 --- /dev/null +++ b/tests/data/dataset/subread_dataset2.xml @@ -0,0 +1,76 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + 500000 + 500 + + + 2.3.0.0.140640 + NRT@172.31.128.10:8082, SwVer=2300.140640, HwVer=1.0 + + e903682f-e502-465c-a2b6-9dd77c9f43fc + beta4_130726_biotin_DEV_vs_MFG_PB11K_9458p + + + 2014-12-24_141_NGAT_Igor_bisPNA Enrichment_Mag Bead Elution Buffers + B01 + 10 + true + true + true + 0 + Lorem ipsum + + abc2df90-d44f-4a48-9f35-3b99473c68f5 + + + + + + + + 0 + 0 + + + BasecallerV1 + 1-3-0_Standard_C2.xml + + Analysis_Results + rsy://mp-rsync/vol56//RS_DATA_STAGING//2014-12-24_141_NGAT_Igor_bisPNA%20Enrichment_Mag%20Bead%20Elution%20Buffers_1094/B01_1 + + Bam + + + + + + + + + + diff --git a/tests/data/dataset/subread_dataset3.xml b/tests/data/dataset/subread_dataset3.xml new file mode 100644 index 0000000..91923a8 --- /dev/null +++ b/tests/data/dataset/subread_dataset3.xml @@ -0,0 +1,76 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + 500000 + 500 + + + 2.3.0.0.140640 + NRT@172.31.128.10:8082, SwVer=2300.140640, HwVer=1.0 + + e903682f-e502-465c-a2b6-9dd77c9f43fc + beta4_130726_biotin_DEV_vs_MFG_PB11K_9458p + + + 2014-12-24_141_NGAT_Igor_bisPNA Enrichment_Mag Bead Elution Buffers + B01 + 10 + true + true + true + 0 + Lorem ipsum + + abc2df90-d44f-4a48-9f35-3b99473c68f5 + + + + + + + + 0 + 0 + + + BasecallerV1 + 1-3-0_Standard_C2.xml + + Analysis_Results + 
rsy://mp-rsync/vol56//RS_DATA_STAGING//2014-12-24_141_NGAT_Igor_bisPNA%20Enrichment_Mag%20Bead%20Elution%20Buffers_1094/B01_1 + + Bam + + + + + + + + + + diff --git a/tests/data/dataset/transformed_rs_subread_dataset.xml b/tests/data/dataset/transformed_rs_subread_dataset.xml new file mode 100644 index 0000000..465d9a6 --- /dev/null +++ b/tests/data/dataset/transformed_rs_subread_dataset.xml @@ -0,0 +1,75 @@ + + + + + + + + + 50000000 + 150000 + + + 2.0.1.0.124174 + NRT@172.31.128.10:8082, SwVer=2010.124174, HwVer=1.0 + + r001173_42129_130607 + 2013-06-07_42129_10kb_Ecoli_201-validation_2 + + + 2013-06-07_42129_10kb_Ecoli_201-validation_2 + P4-C2_Ecoli_10kb_MBS_stageHS + 0 + false + true + + false + + 1 + P4-C2_Ecoli_10kb_MBS_stageHS + + abafd4ed-5cf7-4b83-a869-1a5d239d30e2 + + + MagBead Standard Seq v2 + 2 + 1 + + + BasecallerV1 + 2-0-0_P4-C2.xml + + Analysis_Results + rsy://mp-f030-io/vol54//RS_DATA_STAGING/42129/2013-06-07_42129_10kb_Ecoli_201-validation_2_1173/A01_2/ + + Fasta + + + + + + + + + diff --git a/tests/data/empty.bam b/tests/data/empty.bam new file mode 100644 index 0000000..1b22456 Binary files /dev/null and b/tests/data/empty.bam differ diff --git a/tests/data/empty.bam.pbi b/tests/data/empty.bam.pbi new file mode 100644 index 0000000..e398d79 Binary files /dev/null and b/tests/data/empty.bam.pbi differ diff --git a/tests/data/group/group.fofn.in b/tests/data/group/group.fofn.in new file mode 100644 index 0000000..c2621c5 --- /dev/null +++ b/tests/data/group/group.fofn.in @@ -0,0 +1,3 @@ +@PacBioBAM_TestsDir@/data/group/test1.bam +@PacBioBAM_TestsDir@/data/group/test2.bam +@PacBioBAM_TestsDir@/data/group/test3.bam diff --git a/tests/data/group/test1.bam b/tests/data/group/test1.bam new file mode 100644 index 0000000..2ba687b Binary files /dev/null and b/tests/data/group/test1.bam differ diff --git a/tests/data/group/test2.bam b/tests/data/group/test2.bam new file mode 100644 index 0000000..9e22b30 Binary files /dev/null and 
b/tests/data/group/test2.bam differ diff --git a/tests/data/group/test2.bam.pbi b/tests/data/group/test2.bam.pbi new file mode 100644 index 0000000..761600b Binary files /dev/null and b/tests/data/group/test2.bam.pbi differ diff --git a/tests/data/group/test3.bam b/tests/data/group/test3.bam new file mode 100644 index 0000000..093e93a Binary files /dev/null and b/tests/data/group/test3.bam differ diff --git a/tests/data/lambdaNEB.fa b/tests/data/lambdaNEB.fa new file mode 100644 index 0000000..33011e5 --- /dev/null +++ b/tests/data/lambdaNEB.fa @@ -0,0 +1,608 @@ +>lambda_NEB3011 +GGGCGGCGACCTCGCGGGTTTTCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTA +ATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGACAGGTGCTGAAAGCGAGCTTTTTGGCCTCTGTCGTTTCC +TTTCTCTGTTTTTGTCCGTGGAATGAACAATGGAAGTCAACAAAAAGCAGCTGGCTGACATTTTCGGTGCGAGTATCCGT +ACCATTCAGAACTGGCAGGAACAGGGAATGCCCGTTCTGCGAGGCGGTGGCAAGGGTAATGAGGTGCTTTATGACTCTGC +CGCCGTCATAAAATGGTATGCCGAAAGGGATGCTGAAATTGAGAACGAAAAGCTGCGCCGGGAGGTTGAAGAACTGCGGC +AGGCCAGCGAGGCAGATCTCCAGCCAGGAACTATTGAGTACGAACGCCATCGACTTACGCGTGCGCAGGCCGACGCACAG +GAACTGAAGAATGCCAGAGACTCCGCTGAAGTGGTGGAAACCGCATTCTGTACTTTCGTGCTGTCGCGGATCGCAGGTGA +AATTGCCAGTATTCTCGACGGGCTCCCCCTGTCGGTGCAGCGGCGTTTTCCGGAACTGGAAAACCGACATGTTGATTTCC +TGAAACGGGATATCATCAAAGCCATGAACAAAGCAGCCGCGCTGGATGAACTGATACCGGGGTTGCTGAGTGAATATATC +GAACAGTCAGGTTAACAGGCTGCGGCATTTTGTCCGCGCCGGGCTTCGCTCACTGTTCAGGCCGGAGCCACAGACCGCCG +TTGAATGGGCGGATGCTAATTACTATCTCCCGAAAGAATCCGCATACCAGGAAGGGCGCTGGGAAACACTGCCCTTTCAG +CGGGCCATCATGAATGCGATGGGCAGCGACTACATCCGTGAGGTGAATGTGGTGAAGTCTGCCCGTGTCGGTTATTCCAA +AATGCTGCTGGGTGTTTATGCCTACTTTATAGAGCATAAGCAGCGCAACACCCTTATCTGGTTGCCGACGGATGGTGATG +CCGAGAACTTTATGAAAACCCACGTTGAGCCGACTATTCGTGATATTCCGTCGCTGCTGGCGCTGGCCCCGTGGTATGGC +AAAAAGCACCGGGATAACACGCTCACCATGAAGCGTTTCACTAATGGGCGTGGCTTCTGGTGCCTGGGCGGTAAAGCGGC +AAAAAACTACCGTGAAAAGTCGGTGGATGTGGCGGGTTATGATGAACTTGCTGCTTTTGATGATGATATTGAACAGGAAG +GCTCTCCGACGTTCCTGGGTGACAAGCGTATTGAAGGCTCGGTCTGGCCAAAGTCCATCCGTGGCTCCACGCCAAAAGTG 
+AGAGGCACCTGTCAGATTGAGCGTGCAGCCAGTGAATCCCCGCATTTTATGCGTTTTCATGTTGCCTGCCCGCATTGCGG +GGAGGAGCAGTATCTTAAATTTGGCGACAAAGAGACGCCGTTTGGCCTCAAATGGACGCCGGATGACCCCTCCAGCGTGT +TTTATCTCTGCGAGCATAATGCCTGCGTCATCCGCCAGCAGGAGCTGGACTTTACTGATGCCCGTTATATCTGCGAAAAG +ACCGGGATCTGGACCCGTGATGGCATTCTCTGGTTTTCGTCATCCGGTGAAGAGATTGAGCCACCTGACAGTGTGACCTT +TCACATCTGGACAGCGTACAGCCCGTTCACCACCTGGGTGCAGATTGTCAAAGACTGGATGAAAACGAAAGGGGATACGG +GAAAACGTAAAACCTTCGTAAACACCACGCTCGGTGAGACGTGGGAGGCGAAAATTGGCGAACGTCCGGATGCTGAAGTG +ATGGCAGAGCGGAAAGAGCATTATTCAGCGCCCGTTCCTGACCGTGTGGCTTACCTGACCGCCGGTATCGACTCCCAGCT +GGACCGCTACGAAATGCGCGTATGGGGATGGGGGCCGGGTGAGGAAAGCTGGCTGATTGACCGGCAGATTATTATGGGCC +GCCACGACGATGAACAGACGCTGCTGCGTGTGGATGAGGCCATCAATAAAACCTATACCCGCCGGAATGGTGCAGAAATG +TCGATATCCCGTATCTGCTGGGATACTGGCGGGATTGACCCGACCATTGTGTATGAACGCTCGAAAAAACATGGGCTGTT +CCGGGTGATCCCCATTAAAGGGGCATCCGTCTACGGAAAGCCGGTGGCCAGCATGCCACGTAAGCGAAACAAAAACGGGG +TTTACCTTACCGAAATCGGTACGGATACCGCGAAAGAGCAGATTTATAACCGCTTCACACTGACGCCGGAAGGGGATGAA +CCGCTTCCCGGTGCCGTTCACTTCCCGAATAACCCGGATATTTTTGATCTGACCGAAGCGCAGCAGCTGACTGCTGAAGA +GCAGGTCGAAAAATGGGTGGATGGCAGGAAAAAAATACTGTGGGACAGCAAAAAGCGACGCAATGAGGCACTCGACTGCT +TCGTTTATGCGCTGGCGGCGCTGCGCATCAGTATTTCCCGCTGGCAGCTGGATCTCAGTGCGCTGCTGGCGAGCCTGCAG +GAAGAGGATGGTGCAGCAACCAACAAGAAAACACTGGCAGATTACGCCCGTGCCTTATCCGGAGAGGATGAATGACGCGA +CAGGAAGAACTTGCCGCTGCCCGTGCGGCACTGCATGACCTGATGACAGGTAAACGGGTGGCAACAGTACAGAAAGACGG +ACGAAGGGTGGAGTTTACGGCCACTTCCGTGTCTGACCTGAAAAAATATATTGCAGAGCTGGAAGTGCAGACCGGCATGA +CACAGCGACGCAGGGGACCTGCAGGATTTTATGTATGAAAACGCCCACCATTCCCACCCTTCTGGGGCCGGACGGCATGA +CATCGCTGCGCGAATATGCCGGTTATCACGGCGGTGGCAGCGGATTTGGAGGGCAGTTGCGGTCGTGGAACCCACCGAGT +GAAAGTGTGGATGCAGCCCTGTTGCCCAACTTTACCCGTGGCAATGCCCGCGCAGACGATCTGGTACGCAATAACGGCTA +TGCCGCCAACGCCATCCAGCTGCATCAGGATCATATCGTCGGGTCTTTTTTCCGGCTCAGTCATCGCCCAAGCTGGCGCT +ATCTGGGCATCGGGGAGGAAGAAGCCCGTGCCTTTTCCCGCGAGGTTGAAGCGGCATGGAAAGAGTTTGCCGAGGATGAC +TGCTGCTGCATTGACGTTGAGCGAAAACGCACGTTTACCATGATGATTCGGGAAGGTGTGGCCATGCACGCCTTTAACGG 
+TGAACTGTTCGTTCAGGCCACCTGGGATACCAGTTCGTCGCGGCTTTTCCGGACACAGTTCCGGATGGTCAGCCCGAAGC +GCATCAGCAACCCGAACAATACCGGCGACAGCCGGAACTGCCGTGCCGGTGTGCAGATTAATGACAGCGGTGCGGCGCTG +GGATATTACGTCAGCGAGGACGGGTATCCTGGCTGGATGCCGCAGAAATGGACATGGATACCCCGTGAGTTACCCGGCGG +GCGCGCCTCGTTCATTCACGTTTTTGAACCCGTGGAGGACGGGCAGACTCGCGGTGCAAATGTGTTTTACAGCGTGATGG +AGCAGATGAAGATGCTCGACACGCTGCAGAACACGCAGCTGCAGAGCGCCATTGTGAAGGCGATGTATGCCGCCACCATT +GAGAGTGAGCTGGATACGCAGTCAGCGATGGATTTTATTCTGGGCGCGAACAGTCAGGAGCAGCGGGAAAGGCTGACCGG +CTGGATTGGTGAAATTGCCGCGTATTACGCCGCAGCGCCGGTCCGGCTGGGAGGCGCAAAAGTACCGCACCTGATGCCGG +GTGACTCACTGAACCTGCAGACGGCTCAGGATACGGATAACGGCTACTCCGTGTTTGAGCAGTCACTGCTGCGGTATATC +GCTGCCGGGCTGGGTGTCTCGTATGAGCAGCTTTCCCGGAATTACGCCCAGATGAGCTACTCCACGGCACGGGCCAGTGC +GAACGAGTCGTGGGCGTACTTTATGGGGCGGCGAAAATTCGTCGCATCCCGTCAGGCGAGCCAGATGTTTCTGTGCTGGC +TGGAAGAGGCCATCGTTCGCCGCGTGGTGACGTTACCTTCAAAAGCGCGCTTCAGTTTTCAGGAAGCCCGCAGTGCCTGG +GGGAACTGCGACTGGATAGGCTCCGGTCGTATGGCCATCGATGGTCTGAAAGAAGTTCAGGAAGCGGTGATGCTGATAGA +AGCCGGACTGAGTACCTACGAGAAAGAGTGCGCAAAACGCGGTGACGACTATCAGGAAATTTTTGCCCAGCAGGTCCGTG +AAACGATGGAGCGCCGTGCAGCCGGTCTTAAACCGCCCGCCTGGGCGGCTGCAGCATTTGAATCCGGGCTGCGACAATCA +ACAGAGGAGGAGAAGAGTGACAGCAGAGCTGCGTAATCTCCCGCATATTGCCAGCATGGCCTTTAATGAGCCGCTGATGC +TTGAACCCGCCTATGCGCGGGTTTTCTTTTGTGCGCTTGCAGGCCAGCTTGGGATCAGCAGCCTGACGGATGCGGTGTCC +GGCGACAGCCTGACTGCCCAGGAGGCACTCGCGACGCTGGCATTATCCGGTGATGATGACGGACCACGACAGGCCCGCAG +TTATCAGGTCATGAACGGCATCGCCGTGCTGCCGGTGTCCGGCACGCTGGTCAGCCGGACGCGGGCGCTGCAGCCGTACT +CGGGGATGACCGGTTACAACGGCATTATCGCCCGTCTGCAACAGGCTGCCAGCGATCCGATGGTGGACGGCATTCTGCTC +GATATGGACACGCCCGGCGGGATGGTGGCGGGGGCATTTGACTGCGCTGACATCATCGCCCGTGTGCGTGACATAAAACC +GGTATGGGCGCTTGCCAACGACATGAACTGCAGTGCAGGTCAGTTGCTTGCCAGTGCCGCCTCCCGGCGTCTGGTCACGC +AGACCGCCCGGACAGGCTCCATCGGCGTCATGATGGCTCACAGTAATTACGGTGCTGCGCTGGAGAAACAGGGTGTGGAA +ATCACGCTGATTTACAGCGGCAGCCATAAGGTGGATGGCAACCCCTACAGCCATCTTCCGGATGACGTCCGGGAGACACT +GCAGTCCCGGATGGACGCAACCCGCCAGATGTTTGCGCAGAAGGTGTCGGCATATACCGGCCTGTCCGTGCAGGTTGTGC 
+TGGATACCGAGGCTGCAGTGTACAGCGGTCAGGAGGCCATTGATGCCGGACTGGCTGATGAACTTGTTAACAGCACCGAT +GCGATCACCGTCATGCGTGATGCACTGGATGCACGTAAATCCCGTCTCTCAGGAGGGCGAATGACCAAAGAGACTCAATC +AACAACTGTTTCAGCCACTGCTTCGCAGGCTGACGTTACTGACGTGGTGCCAGCGACGGAGGGCGAGAACGCCAGCGCGG +CGCAGCCGGACGTGAACGCGCAGATCACCGCAGCGGTTGCGGCAGAAAACAGCCGCATTATGGGGATCCTCAACTGTGAG +GAGGCTCACGGACGCGAAGAACAGGCACGCGTGCTGGCAGAAACCCCCGGTATGACCGTGAAAACGGCCCGCCGCATTCT +GGCCGCAGCACCACAGAGTGCACAGGCGCGCAGTGACACTGCGCTGGATCGTCTGATGCAGGGGGCACCGGCACCGCTGG +CTGCAGGTAACCCGGCATCTGATGCCGTTAACGATTTGCTGAACACACCAGTGTAAGGGATGTTTATGACGAGCAAAGAA +ACCTTTACCCATTACCAGCCGCAGGGCAACAGTGACCCGGCTCATACCGCAACCGCGCCCGGCGGATTGAGTGCGAAAGC +GCCTGCAATGACCCCGCTGATGCTGGACACCTCCAGCCGTAAGCTGGTTGCGTGGGATGGCACCACCGACGGTGCTGCCG +TTGGCATTCTTGCGGTTGCTGCTGACCAGACCAGCACCACGCTGACGTTCTACAAGTCCGGCACGTTCCGTTATGAGGAT +GTGCTCTGGCCGGAGGCTGCCAGCGACGAGACGAAAAAACGGACCGCGTTTGCCGGAACGGCAATCAGCATCGTTTAACT +TTACCCTTCATCACTAAAGGCCGCCTGTGCGGCTTTTTTTACGGGATTTTTTTATGTCGATGTACACAACCGCCCAACTG +CTGGCGGCAAATGAGCAGAAATTTAAGTTTGATCCGCTGTTTCTGCGTCTCTTTTTCCGTGAGAGCTATCCCTTCACCAC +GGAGAAAGTCTATCTCTCACAAATTCCGGGACTGGTAAACATGGCGCTGTACGTTTCGCCGATTGTTTCCGGTGAGGTTA +TCCGTTCCCGTGGCGGCTCCACCTCTGAATTTACGCCGGGATATGTCAAGCCGAAGCATGAAGTGAATCCGCAGATGACC +CTGCGTCGCCTGCCGGATGAAGATCCGCAGAATCTGGCGGACCCGGCTTACCGCCGCCGTCGCATCATCATGCAGAACAT +GCGTGACGAAGAGCTGGCCATTGCTCAGGTCGAAGAGATGCAGGCAGTTTCTGCCGTGCTTAAGGGCAAATACACCATGA +CCGGTGAAGCCTTCGATCCGGTTGAGGTGGATATGGGCCGCAGTGAGGAGAATAACATCACGCAGTCCGGCGGCACGGAG +TGGAGCAAGCGTGACAAGTCCACGTATGACCCGACCGACGATATCGAAGCCTACGCGCTGAACGCCAGCGGTGTGGTGAA +TATCATCGTGTTCGATCCGAAAGGCTGGGCGCTGTTCCGTTCCTTCAAAGCCGTCAAGGAGAAGCTGGATACCCGTCGTG +GCTCTAATTCCGAGCTGGAGACAGCGGTGAAAGACCTGGGCAAAGCGGTGTCCTATAAGGGGATGTATGGCGATGTGGCC +ATCGTCGTGTATTCCGGACAGTACGTGGAAAACGGCGTCAAAAAGAACTTCCTGCCGGACAACACGATGGTGCTGGGGAA +CACTCAGGCACGCGGTCTGCGCACCTATGGCTGCATTCAGGATGCGGACGCACAGCGCGAAGGCATTAACGCCTCTGCCC +GTTACCCGAAAAACTGGGTGACCACCGGCGATCCGGCGCGTGAGTTCACCATGATTCAGTCAGCACCGCTGATGCTGCTG 
+GCTGACCCTGATGAGTTCGTGTCCGTACAACTGGCGTAATCATGGCCCTTCGGGGCCATTGTTTCTCTGTGGAGGAGTCC +ATGACGAAAGATGAACTGATTGCCCGTCTCCGCTCGCTGGGTGAACAACTGAACCGTGATGTCAGCCTGACGGGGACGAA +AGAAGAACTGGCGCTCCGTGTGGCAGAGCTGAAAGAGGAGCTTGATGACACGGATGAAACTGCCGGTCAGGACACCCCTC +TCAGCCGGGAAAATGTGCTGACCGGACATGAAAATGAGGTGGGATCAGCGCAGCCGGATACCGTGATTCTGGATACGTCT +GAACTGGTCACGGTCGTGGCACTGGTGAAGCTGCATACTGATGCACTTCACGCCACGCGGGATGAACCTGTGGCATTTGT +GCTGCCGGGAACGGCGTTTCGTGTCTCTGCCGGTGTGGCAGCCGAAATGACAGAGCGCGGCCTGGCCAGAATGCAATAAC +GGGAGGCGCTGTGGCTGATTTCGATAACCTGTTCGATGCTGCCATTGCCCGCGCCGATGAAACGATACGCGGGTACATGG +GAACGTCAGCCACCATTACATCCGGTGAGCAGTCAGGTGCGGTGATACGTGGTGTTTTTGATGACCCTGAAAATATCAGC +TATGCCGGACAGGGCGTGCGCGTTGAAGGCTCCAGCCCGTCCCTGTTTGTCCGGACTGATGAGGTGCGGCAGCTGCGGCG +TGGAGACACGCTGACCATCGGTGAGGAAAATTTCTGGGTAGATCGGGTTTCGCCGGATGATGGCGGAAGTTGTCATCTCT +GGCTTGGACGGGGCGTACCGCCTGCCGTTAACCGTCGCCGCTGAAAGGGGGATGTATGGCCATAAAAGGTCTTGAGCAGG +CCGTTGAAAACCTCAGCCGTATCAGCAAAACGGCGGTGCCTGGTGCCGCCGCAATGGCCATTAACCGCGTTGCTTCATCC +GCGATATCGCAGTCGGCGTCACAGGTTGCCCGTGAGACAAAGGTACGCCGGAAACTGGTAAAGGAAAGGGCCAGGCTGAA +AAGGGCCACGGTCAAAAATCCGCAGGCCAGAATCAAAGTTAACCGGGGGGATTTGCCCGTAATCAAGCTGGGTAATGCGC +GGGTTGTCCTTTCGCGCCGCAGGCGTCGTAAAAAGGGGCAGCGTTCATCCCTGAAAGGTGGCGGCAGCGTGCTTGTGGTG +GGTAACCGTCGTATTCCCGGCGCGTTTATTCAGCAACTGAAAAATGGCCGGTGGCATGTCATGCAGCGTGTGGCTGGGAA +AAACCGTTACCCCATTGATGTGGTGAAAATCCCGATGGCGGTGCCGCTGACCACGGCGTTTAAACAAAATATTGAGCGGA +TACGGCGTGAACGTCTTCCGAAAGAGCTGGGCTATGCGCTGCAGCATCAACTGAGGATGGTAATAAAGCGATGAAACATA +CTGAACTCCGTGCAGCCGTACTGGATGCACTGGAGAAGCATGACACCGGGGCGACGTTTTTTGATGGTCGCCCCGCTGTT +TTTGATGAGGCGGATTTTCCGGCAGTTGCCGTTTATCTCACCGGCGCTGAATACACGGGCGAAGAGCTGGACAGCGATAC +CTGGCAGGCGGAGCTGCATATCGAAGTTTTCCTGCCTGCTCAGGTGCCGGATTCAGAGCTGGATGCGTGGATGGAGTCCC +GGATTTATCCGGTGATGAGCGATATCCCGGCACTGTCAGATTTGATCACCAGTATGGTGGCCAGCGGCTATGACTACCGG +CGCGACGATGATGCGGGCTTGTGGAGTTCAGCCGATCTGACTTATGTCATTACCTATGAAATGTGAGGACGCTATGCCTG +TACCAAATCCTACAATGCCGGTGAAAGGTGCCGGGACCACCCTGTGGGTTTATAAGGGGAGCGGTGACCCTTACGCGAAT 
+CCGCTTTCAGACGTTGACTGGTCGCGTCTGGCAAAAGTTAAAGACCTGACGCCCGGCGAACTGACCGCTGAGTCCTATGA +CGACAGCTATCTCGATGATGAAGATGCAGACTGGACTGCGACCGGGCAGGGGCAGAAATCTGCCGGAGATACCAGCTTCA +CGCTGGCGTGGATGCCCGGAGAGCAGGGGCAGCAGGCGCTGCTGGCGTGGTTTAATGAAGGCGATACCCGTGCCTATAAA +ATCCGCTTCCCGAACGGCACGGTCGATGTGTTCCGTGGCTGGGTCAGCAGTATCGGTAAGGCGGTGACGGCGAAGGAAGT +GATCACCCGCACGGTGAAAGTCACCAATGTGGGACGTCCGTCGATGGCAGAAGATCGCAGCACGGTAACAGCGGCAACCG +GCATGACCGTGACGCCTGCCAGCACCTCGGTGGTGAAAGGGCAGAGCACCACGCTGACCGTGGCCTTCCAGCCGGAGGGC +GTAACCGACAAGAGCTTTCGTGCGGTGTCTGCGGATAAAACAAAAGCCACCGTGTCGGTCAGTGGTATGACCATCACCGT +GAACGGCGTTGCTGCAGGCAAGGTCAACATTCCGGTTGTATCCGGTAATGGTGAGTTTGCTGCGGTTGCAGAAATTACCG +TCACCGCCAGTTAATCCGGAGAGTCAGCGATGTTCCTGAAAACCGAATCATTTGAACATAACGGTGTGACCGTCACGCTT +TCTGAACTGTCAGCCCTGCAGCGCATTGAGCATCTCGCCCTGATGAAACGGCAGGCAGAACAGGCGGAGTCAGACAGCAA +CCGGAAGTTTACTGTGGAAGACGCCATCAGAACCGGCGCGTTTCTGGTGGCGATGTCCCTGTGGCATAACCATCCGCAGA +AGACGCAGATGCCGTCCATGAATGAAGCCGTTAAACAGATTGAGCAGGAAGTGCTTACCACCTGGCCCACGGAGGCAATT +TCTCATGCTGAAAACGTGGTGTACCGGCTGTCTGGTATGTATGAGTTTGTGGTGAATAATGCCCCTGAACAGACAGAGGA +CGCCGGGCCCGCAGAGCCTGTTTCTGCGGGAAAGTGTTCGACGGTGAGCTGAGTTTTGCCCTGAAACTGGCGCGTGAGAT +GGGGCGACCCGACTGGCGTGCCATGCTTGCCGGGATGTCATCCACGGAGTATGCCGACTGGCACCGCTTTTACAGTACCC +ATTATTTTCATGATGTTCTGCTGGATATGCACTTTTCCGGGCTGACGTACACCGTGCTCAGCCTGTTTTTCAGCGATCCG +GATATGCATCCGCTGGATTTCAGTCTGCTGAACCGGCGCGAGGCTGACGAAGAGCCTGAAGATGATGTGCTGATGCAGAA +AGCGGCAGGGCTTGCCGGAGGTGTCCGCTTTGGCCCGGACGGGAATGAAGTTATCCCCGCTTCCCCGGATGTGGCGGACA +TGACGGAGGATGACGTAATGCTGATGACAGTATCAGAAGGGATCGCAGGAGGAGTCCGGTATGGCTGAACCGGTAGGCGA +TCTGGTCGTTGATTTGAGTCTGGATGCGGCCAGATTTGACGAGCAGATGGCCAGAGTCAGGCGTCATTTTTCTGGTACGG +AAAGTGATGCGAAAAAAACAGCGGCAGTCGTTGAACAGTCGCTGAGCCGACAGGCGCTGGCTGCACAGAAAGCGGGGATT +TCCGTCGGGCAGTATAAAGCCGCCATGCGTATGCTGCCTGCACAGTTCACCGACGTGGCCACGCAGCTTGCAGGCGGGCA +AAGTCCGTGGCTGATCCTGCTGCAACAGGGGGGGCAGGTGAAGGACTCCTTCGGCGGGATGATCCCCATGTTCAGGGGGC +TTGCCGGTGCGATCACCCTGCCGATGGTGGGGGCCACCTCGCTGGCGGTGGCGACCGGTGCGCTGGCGTATGCCTGGTAT 
+CAGGGCAACTCAACCCTGTCCGATTTCAACAAAACGCTGGTCCTTTCCGGCAATCAGGCGGGACTGACGGCAGATCGTAT +GCTGGTCCTGTCCAGAGCCGGGCAGGCGGCAGGGCTGACGTTTAACCAGACCAGCGAGTCACTCAGCGCACTGGTTAAGG +CGGGGGTAAGCGGTGAGGCTCAGATTGCGTCCATCAGCCAGAGTGTGGCGCGTTTCTCCTCTGCATCCGGCGTGGAGGTG +GACAAGGTCGCTGAAGCCTTCGGGAAGCTGACCACAGACCCGACGTCGGGGCTGACGGCGATGGCTCGCCAGTTCCATAA +CGTGTCGGCGGAGCAGATTGCGTATGTTGCTCAGTTGCAGCGTTCCGGCGATGAAGCCGGGGCATTGCAGGCGGCGAACG +AGGCCGCAACGAAAGGGTTTGATGACCAGACCCGCCGCCTGAAAGAGAACATGGGCACGCTGGAGACCTGGGCAGACAGG +ACTGCGCGGGCATTCAAATCCATGTGGGATGCGGTGCTGGATATTGGTCGTCCTGATACCGCGCAGGAGATGCTGATTAA +GGCAGAGGCTGCGTATAAGAAAGCAGACGACATCTGGAATCTGCGCAAGGATGATTATTTTGTTAACGATGAAGCGCGGG +CGCGTTACTGGGATGATCGTGAAAAGGCCCGTCTTGCGCTTGAAGCCGCCCGAAAGAAGGCTGAGCAGCAGACTCAACAG +GACAAAAATGCGCAGCAGCAGAGCGATACCGAAGCGTCACGGCTGAAATATACCGAAGAGGCGCAGAAGGCTTACGAACG +GCTGCAGACGCCGCTGGAGAAATATACCGCCCGTCAGGAAGAACTGAACAAGGCACTGAAAGACGGGAAAATCCTGCAGG +CGGATTACAACACGCTGATGGCGGCGGCGAAAAAGGATTATGAAGCGACGCTGAAAAAGCCGAAACAGTCCAGCGTGAAG +GTGTCTGCGGGCGATCGTCAGGAAGACAGTGCTCATGCTGCCCTGCTGACGCTTCAGGCAGAACTCCGGACGCTGGAGAA +GCATGCCGGAGCAAATGAGAAAATCAGCCAGCAGCGCCGGGATTTGTGGAAGGCGGAGAGTCAGTTCGCGGTACTGGAGG +AGGCGGCGCAACGTCGCCAGCTGTCTGCACAGGAGAAATCCCTGCTGGCGCATAAAGATGAGACGCTGGAGTACAAACGC +CAGCTGGCTGCACTTGGCGACAAGGTTACGTATCAGGAGCGCCTGAACGCGCTGGCGCAGCAGGCGGATAAATTCGCACA +GCAGCAACGGGCAAAACGGGCCGCCATTGATGCGAAAAGCCGGGGGCTGACTGACCGGCAGGCAGAACGGGAAGCCACGG +AACAGCGCCTGAAGGAACAGTATGGCGATAATCCGCTGGCGCTGAATAACGTCATGTCAGAGCAGAAAAAGACCTGGGCG +GCTGAAGACCAGCTTCGCGGGAACTGGATGGCAGGCCTGAAGTCCGGCTGGAGTGAGTGGGAAGAGAGCGCCACGGACAG +TATGTCGCAGGTAAAAAGTGCAGCCACGCAGACCTTTGATGGTATTGCACAGAATATGGCGGCGATGCTGACCGGCAGTG +AGCAGAACTGGCGCAGCTTCACCCGTTCCGTGCTGTCCATGATGACAGAAATTCTGCTTAAGCAGGCAATGGTGGGGATT +GTCGGGAGTATCGGCAGCGCCATTGGCGGGGCTGTTGGTGGCGGCGCATCCGCGTCAGGCGGTACAGCCATTCAGGCCGC +TGCGGCGAAATTCCATTTTGCAACCGGAGGATTTACGGGAACCGGCGGCAAATATGAGCCAGCGGGGATTGTTCACCGTG +GTGAGTTTGTCTTCACGAAGGAGGCAACCAGCCGGATTGGCGTGGGGAATCTTTACCGGCTGATGCGCGGCTATGCCACC 
+GGCGGTTATGTCGGTACACCGGGCAGCATGGCAGACAGCCGGTCGCAGGCGTCCGGGACGTTTGAGCAGAATAACCATGT +GGTGATTAACAACGACGGCACGAACGGGCAGATAGGTCCGGCTGCTCTGAAGGCGGTGTATGACATGGCCCGCAAGGGTG +CCCGTGATGAAATTCAGACACAGATGCGTGATGGTGGCCTGTTCTCCGGAGGTGGACGATGAAGACCTTCCGCTGGAAAG +TGAAACCCGGTATGGATGTGGCTTCGGTCCCTTCTGTAAGAAAGGTGCGCTTTGGTGATGGCTATTCTCAGCGAGCGCCT +GCCGGGCTGAATGCCAACCTGAAAACGTACAGCGTGACGCTTTCTGTCCCCCGTGAGGAGGCCACGGTACTGGAGTCGTT +TCTGGAAGAGCACGGGGGCTGGAAATCCTTTCTGTGGACGCCGCCTTATGAGTGGCGGCAGATAAAGGTGACCTGCGCAA +AATGGTCGTCGCGGGTCAGTATGCTGCGTGTTGAGTTCAGCGCAGAGTTTGAACAGGTGGTGAACTGATGCAGGATATCC +GGCAGGAAACACTGAATGAATGCACCCGTGCGGAGCAGTCGGCCAGCGTGGTGCTCTGGGAAATCGACCTGACAGAGGTC +GGTGGAGAACGTTATTTTTTCTGTAATGAGCAGAACGAAAAAGGTGAGCCGGTCACCTGGCAGGGGCGACAGTATCAGCC +GTATCCCATTCAGGGGAGCGGTTTTGAACTGAATGGCAAAGGCACCAGTACGCGCCCCACGCTGACGGTTTCTAACCTGT +ACGGTATGGTCACCGGGATGGCGGAAGATATGCAGAGTCTGGTCGGCGGAACGGTGGTCCGGCGTAAGGTTTACGCCCGT +TTTCTGGATGCGGTGAACTTCGTCAACGGAAACAGTTACGCCGATCCGGAGCAGGAGGTGATCAGCCGCTGGCGCATTGA +GCAGTGCAGCGAACTGAGCGCGGTGAGTGCCTCCTTTGTACTGTCCACGCCGACGGAAACGGATGGCGCTGTTTTTCCGG +GACGTATCATGCTGGCCAACACCTGCACCTGGACCTATCGCGGTGACGAGTGCGGTTATAGCGGTCCGGCTGTCGCGGAT +GAATATGACCAGCCAACGTCCGATATCACGAAGGATAAATGCAGCAAATGCCTGAGCGGTTGTAAGTTCCGCAATAACGT +CGGCAACTTTGGCGGCTTCCTTTCCATTAACAAACTTTCGCAGTAAATCCCATGACACAGACAGAATCAGCGATTCTGGC +GCACGCCCGGCGATGTGCGCCAGCGGAGTCGTGCGGCTTCGTGGTAAGCACGCCGGAGGGGGAAAGATATTTCCCCTGCG +TGAATATCTCCGGTGAGCCGGAGGCGTATTTCCGTATGTCGCCGGAAGACTGGCTGCAGGCAGAAATGCAGGGTGAGATT +GTGGCGCTGGTCCACAGCCACCCCGGTGGTCTGCCCTGGCTGAGTGAGGCCGACCGGCGGCTGCAGGTGCAGAGTGATTT +GCCGTGGTGGCTGGTCTGCCGGGGGACGATTCATAAGTTCCGCTGTGTGCCGCATCTCACCGGGCGGCGCTTTGAGCACG +GTGTGACGGACTGTTACACACTGTTCCGGGATGCTTATCATCTGGCGGGGATTGAGATGCCGGACTTTCATCGTGAGGAT +GACTGGTGGCGTAACGGCCAGAATCTCTATCTGGATAATCTGGAGGCGACGGGGCTGTATCAGGTGCCGTTGTCAGCGGC +ACAGCCGGGCGATGTGCTGCTGTGCTGTTTTGGTTCATCAGTGCCGAATCACGCCGCAATTTACTGCGGCGACGGCGAGC +TGCTGCACCATATTCCTGAACAACTGAGCAAACGAGAGAGGTACACCGACAAATGGCAGCGACGCACACACTCCCTCTGG 
+CGTCACCGGGCATGGCGCGCATCTGCCTTTACGGGGATTTACAACGATTTGGTCGCCGCATCGACCTTCGTGTGAAAACG +GGGGCTGAAGCCATCCGGGCACTGGCCACACAGCTCCCGGCGTTTCGTCAGAAACTGAGCGACGGCTGGTATCAGGTACG +GATTGCCGGGCGGGACGTCAGCACGTCCGGGTTAACGGCGCAGTTACATGAGACTCTGCCTGATGGCGCTGTAATTCATA +TTGTTCCCAGAGTCGCCGGGGCCAAGTCAGGTGGCGTATTCCAGATTGTCCTGGGGGCTGCCGCCATTGCCGGATCATTC +TTTACCGCCGGAGCCACCCTTGCAGCATGGGGGGCAGCCATTGGGGCCGGTGGTATGACCGGCATCCTGTTTTCTCTCGG +TGCCAGTATGGTGCTCGGTGGTGTGGCGCAGATGCTGGCACCGAAAGCCAGAACTCCCCGTATACAGACAACGGATAACG +GTAAGCAGAACACCTATTTCTCCTCACTGGATAACATGGTTGCCCAGGGCAATGTTCTGCCTGTTCTGTACGGGGAAATG +CGCGTGGGGTCACGCGTGGTTTCTCAGGAGATCAGCACGGCAGACGAAGGGGACGGTGGTCAGGTTGTGGTGATTGGTCG +CTGATGCAAAATGTTTTATGTGAAACCGCCTGCGGGCGGTTTTGTCATTTATGGAGCGTGAGGAATGGGTAAAGGAAGCA +GTAAGGGGCATACCCCGCGCGAAGCGAAGGACAACCTGAAGTCCACGCAGTTGCTGAGTGTGATCGATGCCATCAGCGAA +GGGCCGATTGAAGGTCCGGTGGATGGCTTAAAAAGCGTGCTGCTGAACAGTACGCCGGTGCTGGACACTGAGGGGAATAC +CAACATATCCGGTGTCACGGTGGTGTTCCGGGCTGGTGAGCAGGAGCAGACTCCGCCGGAGGGATTTGAATCCTCCGGCT +CCGAGACGGTGCTGGGTACGGAAGTGAAATATGACACGCCGATCACCCGCACCATTACGTCTGCAAACATCGACCGTCTG +CGCTTTACCTTCGGTGTACAGGCACTGGTGGAAACCACCTCAAAGGGTGACAGGAATCCGTCGGAAGTCCGCCTGCTGGT +TCAGATACAACGTAACGGTGGCTGGGTGACGGAAAAAGACATCACCATTAAGGGCAAAACCACCTCGCAGTATCTGGCCT +CGGTGGTGATGGGTAACCTGCCGCCGCGCCCGTTTAATATCCGGATGCGCAGGATGACGCCGGACAGCACCACAGACCAG +CTGCAGAACAAAACGCTCTGGTCGTCATACACTGAAATCATCGATGTGAAACAGTGCTACCCGAACACGGCACTGGTCGG +CGTGCAGGTGGACTCGGAGCAGTTCGGCAGCCAGCAGGTGAGCCGTAATTATCATCTGCGCGGGCGTATTCTGCAGGTGC +CGTCGAACTATAACCCGCAGACGCGGCAATACAGCGGTATCTGGGACGGAACGTTTAAACCGGCATACAGCAACAACATG +GCCTGGTGTCTGTGGGATATGCTGACCCATCCGCGCTACGGCATGGGGAAACGTCTTGGTGCGGCGGATGTGGATAAATG +GGCGCTGTATGTCATCGGCCAGTACTGCGACCAGTCAGTGCCGGACGGCTTTGGCGGCACGGAGCCGCGCATCACCTGTA +ATGCGTACCTGACCACACAGCGTAAGGCGTGGGATGTGCTCAGCGATTTCTGCTCGGCGATGCGCTGTATGCCGGTATGG +AACGGGCAGACGCTGACGTTCGTGCAGGACCGACCGTCGGATAAGACGTGGACCTATAACCGCAGTAATGTGGTGATGCC +GGATGATGGCGCGCCGTTCCGCTACAGCTTCAGCGCCCTGAAGGACCGCCATAATGCCGTTGAGGTGAACTGGATTGACC 
+CGAACAACGGCTGGGAGACGGCGACAGAGCTTGTTGAAGATACGCAGGCCATTGCCCGTTACGGTCGTAATGTTACGAAG +ATGGATGCCTTTGGCTGTACCAGCCGGGGGCAGGCACACCGCGCCGGGCTGTGGCTGATTAAAACAGAACTGCTGGAAAC +GCAGACCGTGGATTTCAGCGTCGGCGCAGAAGGGCTTCGCCATGTACCGGGCGATGTTATTGAAATCTGCGATGATGACT +ATGCCGGTATCAGCACCGGTGGTCGTGTGCTGGCGGTGAACAGCCAGACCCGGACGCTGACGCTCGACCGTGAAATCACG +CTGCCATCCTCCGGTACCGCGCTGATAAGCCTGGTTGACGGAAGTGGCAATCCGGTCAGCGTGGAGGTTCAGTCCGTCAC +CGACGGCGTGAAGGTAAAAGTGAGCCGTGTTCCTGACGGTGTTGCTGAATACAGCGTATGGGAGCTGAAGCTGCCGACGC +TGCGCCAGCGACTGTTCCGCTGCGTGAGTATCCGTGAGAACGACGACGGCACGTATGCCATCACCGCCGTGCAGCATGTG +CCGGAAAAAGAGGCCATCGTGGATAACGGGGCGCACTTTGACGGCGAACAGAGTGGCACGGTGAATGGTGTCACGCCGCC +AGCGGTGCAGCACCTGACCGCAGAAGTCACTGCAGACAGCGGGGAATATCAGGTGCTGGCGCGATGGGACACACCGAAGG +TGGTGAAGGGCGTGAGTTTCCTGCTCCGTCTGACCGTAACAGCGGACGACGGCAGTGAGCGGCTGGTCAGCACGGCCCGG +ACGACGGAAACCACATACCGCTTCACGCAACTGGCGCTGGGGAACTACAGGCTGACAGTCCGGGCGGTAAATGCGTGGGG +GCAGCAGGGCGATCCGGCGTCGGTATCGTTCCGGATTGCCGCACCGGCAGCACCGTCGAGGATTGAGCTGACGCCGGGCT +ATTTTCAGATAACCGCCACGCCGCATCTTGCCGTTTATGACCCGACGGTACAGTTTGAGTTCTGGTTCTCGGAAAAGCAG +ATTGCGGATATCAGACAGGTTGAAACCAGCACGCGTTATCTTGGTACGGCGCTGTACTGGATAGCCGCCAGTATCAATAT +CAAACCGGGCCATGATTATTACTTTTATATCCGCAGTGTGAACACCGTTGGCAAATCGGCATTCGTGGAGGCCGTCGGTC +GGGCGAGCGATGATGCGGAAGGTTACCTGGATTTTTTCAAAGGCAAGATAACCGAATCCCATCTCGGCAAGGAGCTGCTG +GAAAAAGTCGAGCTGACGGAGGATAACGCCAGCAGACTGGAGGAGTTTTCGAAAGAGTGGAAGGATGCCAGTGATAAGTG +GAATGCCATGTGGGCTGTCAAAATTGAGCAGACCAAAGACGGCAAACATTATGTCGCGGGTATTGGCCTCAGCATGGAGG +ACACGGAGGAAGGCAAACTGAGCCAGTTTCTGGTTGCCGCCAATCGTATCGCATTTATTGACCCGGCAAACGGGAATGAA +ACGCCGATGTTTGTGGCGCAGGGCAACCAGATATTCATGAACGACGTGTTCCTGAAGCGCCTGACGGCCCCCACCATTAC +CAGCGGCGGCAATCCTCCGGCCTTTTCCCTGACACCGGACGGAAAGCTGACCGCTAAAAATGCGGATATCAGTGGCAGTG +TGAATGCGAACTCCGGGACGCTCAGTAATGTGACGATAGCTGAAAACTGTACGATAAACGGTACGCTGAGGGCGGAAAAA +ATCGTCGGGGACATTGTAAAGGCGGCGAGCGCGGCTTTTCCGCGCCAGCGTGAAAGCAGTGTGGACTGGCCGTCAGGTAC +CCGTACTGTCACCGTGACCGATGACCATCCTTTTGATCGCCAGATAGTGGTGCTTCCGCTGACGTTTCGCGGAAGTAAGC 
+GTACTGTCAGCGGCAGGACAACGTATTCGATGTGTTATCTGAAAGTACTGATGAACGGTGCGGTGATTTATGATGGCGCG +GCGAACGAGGCGGTACAGGTGTTCTCCCGTATTGTTGACATGCCAGCGGGTCGGGGAAACGTGATCCTGACGTTCACGCT +TACGTCCACACGGCATTCGGCAGATATTCCGCCGTATACGTTTGCCAGCGATGTGCAGGTTATGGTGATTAAGAAACAGG +CGCTGGGCATCAGCGTGGTCTGAGTGTGTTACAGAGGTTCGTCCGGGAACGGGCGTTTTATTATAAAACAGTGAGAGGTG +AACGATGCGTAATGTGTGTATTGCCGTTGCTGTCTTTGCCGCACTTGCGGTGACAGTCACTCCGGCCCGTGCGGAAGGTG +GACATGGTACGTTTACGGTGGGCTATTTTCAAGTGAAACCGGGTACATTGCCGTCGTTGTCGGGCGGGGATACCGGTGTG +AGTCATCTGAAAGGGATTAACGTGAAGTACCGTTATGAGCTGACGGACAGTGTGGGGGTGATGGCTTCCCTGGGGTTCGC +CGCGTCGAAAAAGAGCAGCACAGTGATGACCGGGGAGGATACGTTTCACTATGAGAGCCTGCGTGGACGTTATGTGAGCG +TGATGGCCGGACCGGTTTTACAAATCAGTAAGCAGGTCAGTGCGTACGCCATGGCCGGAGTGGCTCACAGTCGGTGGTCC +GGCAGTACAATGGATTACCGTAAGACGGAAATCACTCCCGGGTATATGAAAGAGACGACCACTGCCAGGGACGAAAGTGC +AATGCGGCATACCTCAGTGGCGTGGAGTGCAGGTATACAGATTAATCCGGCAGCGTCCGTCGTTGTTGATATTGCTTATG +AAGGCTCCGGCAGTGGCGACTGGCGTACTGACGGATTCATCGTTGGGGTCGGTTATAAATTCTGATTAGCCAGGTAACAC +AGTGTTATGACAGCCCGCCGGAACCGGTGGGCTTTTTTGTGGGGTGAATATGGCAGTAAAGATTTCAGGAGTCCTGAAAG +ACGGCACAGGAAAACCGGTACAGAACTGCACCATTCAGCTGAAAGCCAGACGTAACAGCACCACGGTGGTGGTGAACACG +GTGGGCTCAGAGAATCCGGATGAAGCCGGGCGTTACAGCATGGATGTGGAGTACGGTCAGTACAGTGTCATCCTGCAGGT +TGACGGTTTTCCACCATCGCACGCCGGGACCATCACCGTGTATGAAGATTCACAACCGGGGACGCTGAATGATTTTCTCT +GTGCCATGACGGAGGATGATGCCCGGCCGGAGGTGCTGCGTCGTCTTGAACTGATGGTGGAAGAGGTGGCGCGTAACGCG +TCCGTGGTGGCACAGAGTACGGCAGACGCGAAGAAATCAGCCGGCGATGCCAGTGCATCAGCTGCTCAGGTCGCGGCCCT +TGTGACTGATGCAACTGACTCAGCACGCGCCGCCAGCACGTCCGCCGGACAGGCTGCATCGTCAGCTCAGGAAGCGTCCT +CCGGCGCAGAAGCGGCATCAGCAAAGGCCACTGAAGCGGAAAAAAGTGCCGCAGCCGCAGAGTCCTCAAAAAACGCGGCG +GCCACCAGTGCCGGTGCGGCGAAAACGTCAGAAACGAATGCTGCAGCGTCACAACAATCAGCCGCCACGTCTGCCTCCAC +CGCGGCCACGAAAGCGTCAGAGGCCGCCACTTCAGCACGAGATGCGGTGGCCTCAAAAGAGGCAGCAAAATCATCAGAAA +CGAACGCATCATCAAGTGCCGGTCGTGCAGCTTCCTCGGCAACGGCGGCAGAAAATTCTGCCAGGGCGGCAAAAACGTCC +GAGACGAATGCCAGGTCATCTGAAACAGCAGCGGAACGGAGCGCCTCTGCCGCGGCAGACGCAAAAACAGCGGCGGCGGG 
+GAGTGCGTCAACGGCATCCACGAAGGCGACAGAGGCTGCGGGAAGTGCGGTATCAGCATCGCAGAGCAAAAGTGCGGCAG +AAGCGGCGGCAATACGTGCAAAAAATTCGGCAAAACGTGCAGAAGATATAGCTTCAGCTGTCGCGCTTGAGGATGCGGAC +ACAACGAGAAAGGGGATAGTGCAGCTCAGCAGTGCAACCAACAGCACGTCTGAAACGCTTGCTGCAACGCCAAAGGCGGT +TAAGGTGGTAATGGATGAAACGAACAGAAAAGCCCACTGGACAGTCCGGCACTGACCGGAACGCCAACAGCACCAACCGC +GCTCAGGGGAACAAACAATACCCAGATTGCGAACACCGCTTTTGTACTGGCCGCGATTGCAGATGTTATCGACGCGTCAC +CTGACGCACTGAATACGCTGAATGAACTGGCCGCAGCGCTCGGGAATGATCCAGATTTTGCTACCACCATGACTAACGCG +CTTGCGGGTAAACAACCGAAGAATGCGACACTGACGGCGCTGGCAGGGCTTTCCACGGCGAAAAATAAATTACCGTATTT +TGCGGAAAATGATGCCGCCAGCCTGACTGAACTGACTCAGGTTGGCAGGGATATTCTGGCAAAAAATTCCGTTGCAGATG +TTCTTGAATACCTTGGGGCCGGTGAGAATTCGGCCTTTCCGGCAGGTGCGCCGATCCCGTGGCCATCAGATATCGTTCCG +TCTGGCTACGTCCTGATGCAGGGGCAGGCGTTTGACAAATCAGCCTACCCAAAACTTGCTGTCGCGTATCCATCGGGTGT +GCTTCCTGATATGCGAGGCTGGACAATCAAGGGGAAACCCGCCAGCGGTCGTGCTGTATTGTCTCAGGAACAGGATGGAA +TTAAGTCGCACACCCACAGTGCCAGTGCATCCGGTACGGATTTGGGGACGAAAACCACATCGTCGTTTGATTACGGGACG +AAAACAACAGGCAGTTTCGATTACGGCACCAAATCGACGAATAACACGGGGGCTCATGCTCACAGTCTGAGCGGTTCAAC +AGGGGCCGCGGGTGCTCATGCCCACACAAGTGGTTTAAGGATGAACAGTTCTGGCTGGAGTCAGTATGGAACAGCAACCA +TTACAGGAAGTTTATCCACAGTTAAAGGAACCAGCACACAGGGTATTGCTTATTTATCGAAAACGGACAGTCAGGGCAGC +CACAGTCACTCATTGTCCGGTACAGCCGTGAGTGCCGGTGCACATGCGCATACAGTTGGTATTGGTGCGCACCAGCATCC +GGTTGTTATCGGTGCTCATGCCCATTCTTTCAGTATTGGTTCACACGGACACACCATCACCGTTAACGCTGCGGGTAACG +CGGAAAACACCGTCAAAAACATTGCATTTAACTATATTGTGAGGCTTGCATAATGGCATTCAGAATGAGTGAACAACCAC +GGACCATAAAAATTTATAATCTGCTGGCCGGAACTAATGAATTTATTGGTGAAGGTGACGCATATATTCCGCCTCATACC +GGTCTGCCTGCAAACAGTACCGATATTGCACCGCCAGATATTCCGGCTGGCTTTGTGGCTGTTTTCAACAGTGATGAGGC +ATCGTGGCATCTCGTTGAAGACCATCGGGGTAAAACCGTCTATGACGTGGCTTCCGGCGACGCGTTATTTATTTCTGAAC +TCGGTCCGTTACCGGAAAATTTTACCTGGTTATCGCCGGGAGGGGAATATCAGAAGTGGAACGGCACAGCCTGGGTGAAG +GATACGGAAGCAGAAAAACTGTTCCGGATCCGGGAGGCGGAAGAAACAAAAAAAAGCCTGATGCAGGTAGCCAGTGAGCA +TATTGCGCCGCTTCAGGATGCTGCAGATCTGGAAATTGCAACGAAGGAAGAAACCTCGTTGCTGGAAGCCTGGAAGAAGT 
+ATCGGGTGTTGCTGAACCGTGTTGATACATCAACTGCACCTGATATTGAGTGGCCTGCTGTCCCTGTTATGGAGTAATCG +TTTTGTGATATGCCGCAGAAACGTTGTATGAAATAACGTTCTGCGGTTAGTTAGTATATTGTAAAGCTGAGTATTGGTTT +ATTTGGCGATTATTATCTTCAGGAGAATAATGGAAGTTCTATGACTCAATTGTTCATAGTGTTTACATCACCGCCAATTG +CTTTTAAGACTGAACGCATGAAATATGGTTTTTCGTCATGTTTTGAGTCTGCTGTTGATATTTCTAAAGTCGGTTTTTTT +TCTTCGTTTTCTCTAACTATTTTCCATGAAATACATTTTTGATTATTATTTGAATCAATTCCAATTACCTGAAGTCTTTC +ATCTATAATTGGCATTGTATGTATTGGTTTATTGGAGTAGATGCTTGCTTTTCTGAGCCATAGCTCTGATATCCAAATGA +AGCCATAGGCATTTGTTATTTTGGCTCTGTCAGCTGCATAACGCCAAAAAATATATTTATCTGCTTGATCTTCAAATGTT +GTATTGATTAAATCAATTGGATGGAATTGTTTATCATAAAAAATTAATGTTTGAATGTGATAACCGTCCTTTAAAAAAGT +CGTTTCTGCAAGCTTGGCTGTATAGTCAACTAACTCTTCTGTCGAAGTGATATTTTTAGGCTTATCTACCAGTTTTAGAC +GCTCTTTAATATCTTCAGGAATTATTTTATTGTCATATTGTATCATGCTAAATGACAATTTGCTTATGGAGTAATCTTTT +AATTTTAAATAAGTTATTCTCCTGGCTTCATCAAATAAAGAGTCGAATGATGTTGGCGAAATCACATCGTCACCCATTGG +ATTGTTTATTTGTATGCCAAGAGAGTTACAGCAGTTATACATTCTGCCATAGATTATAGCTAAGGCATGTAATAATTCGT +AATCTTTTAGCGTATTAGCGACCCATCGTCTTTCTGATTTAATAATAGATGATTCAGTTAAATATGAAGGTAATTTCTTT +TGTGCAAGTCTGACTAACTTTTTTATACCAATGTTTAACATACTTTCATTTGTAATAAACTCAATGTCATTTTCTTCAAT +GTAAGATGAAATAAGAGTAGCCTTTGCCTCGCTATACATTTCTAAATCGCCTTGTTTTTCTATCGTATTGCGAGAATTTT +TAGCCCAAGCCATTAATGGATCATTTTTCCATTTTTCAATAACATTATTGTTATACCAAATGTCATATCCTATAATCTGG +TTTTTGTTTTTTTGAATAATAAATGTTACTGTTCTTGCGGTTTGGAGGAATTGATTCAAATTCAAGCGAAATAATTCAGG +GTCAAAATATGTATCAATGCAGCATTTGAGCAAGTGCGATAAATCTTTAAGTCTTCTTTCCCATGGTTTTTTAGTCATAA +AACTCTCCATTTTGATAGGTTGCATGCTAGATGCTGATATATTTTAGAGGTGATAAAATTAACTGCTTAACTGTCAATGT +AATACAAGTTGTTTGATCTTTGCAATGATTCTTATCAGAAACCATATAGTAAATTAGTTACACAGGAAATTTTTAATATT +ATTATTATCATTCATTATGTATTAAAATTAGAGTTGTGGCTTGGCTCTGCTAACACGTTGCTCATAGGAGATATGGTAGA +GCCGCAGACACGTCGTATGCAGGAACGTGCTGCGGCTGGCTGGTGAACTTCCGATAGTGCGGGTGTTGAATGATTTCCAG +TTGCTACCGATTTTACATATTTTTTGCATGAGAGAATTTGTACCACCTCCCACCGACCATCTATGACTGTACGCCACTGT +CCCTAGGACTGCTATGTGCCGGAGCGGACATTACAAACGTCCTTCTCGGTGCATGCCACTGTTGCCAATGACCTGCCTAG 
+GAATTGGTTAGCAAGTTACTACCGGATTTTGTAAAAACAGCCCTCCTCATATAAAAAGTATTCGTTCACTTCCGATAAGC +GTCGTAATTTTCTATCTTTCATCATATTCTAGATCCCTCTGAAAAAATCTTCCGAGTTTGCTAGGCACTGATACATAACT +CTTTTCCAATAATTGGGGAAGTCATTCAAATCTATAATAGGTTTCAGATTTGCTTCAATAAATTCTGACTGTAGCTGCTG +AAACGTTGCGGTTGAACTATATTTCCTTATAACTTTTACGAAAGAGTTTCTTTGAGTAATCACTTCACTCAAGTGCTTCC +CTGCCTCCAAACGATACCTGTTAGCAATATTTAATAGCTTGAAATGATGAAGAGCTCTGTGTTTGTCTTCCTGCCTCCAG +TTCGCCGGGCATTCAACATAAAAACTGATAGCACCCGGAGTTCCGGAAACGAAATTTGCATATACCCATTGCTCACGAAA +AAAAATGTCCTTGTCGATATAGGGATGAATCGCTTGGTGTACCTCATCTACTGCGAAAACTTGACCTTTCTCTCCCATAT +TGCAGTCGCGGCACGATGGAACTAAATTAATAGGCATCACCGAAAATTCAGGATAATGTGCAATAGGAAGAAAATGATCT +ATATTTTTTGTCTGTCCTATATCACCACAAAATGGACATTTTTCACCTGATGAAACAAGCATGTCATCGTAATATGTTCT +AGCGGGTTTGTTTTTATCTCGGAGATTATTTTCATAAAGCTTTTCTAATTTAACCTTTGTCAGGTTACCAACTACTAAGG +TTGTAGGCTCAAGAGGGTGTGTCCTGTCGTAGGTAAATAACTGACCTGTCGAGCTTAATATTCTATATTGTTGTTCTTTC +TGCAAAAAAGTGGGGAAGTGAGTAATGAAATTATTTCTAACATTTATCTGCATCATACCTTCCGAGCATTTATTAAGCAT +TTCGCTATAAGTTCTCGCTGGAAGAGGTAGTTTTTTCATTGTACTTTACCTTCATCTCTGTTCATTATCATCGCTTTTAA +AACGGTTCGACCTTCTAATCCTATCTGACCATTATAATTTTTTAGAATGGTTTCATAAGAAAGCTCTGAATCAACGGACT +GCGATAATAAGTGGTGGTATCCAGAATTTGTCACTTCAAGTAAAAACACCTCACGAGTTAAAACACCTAAGTTCTCACCG +AATGTCTCAATATCCGGACGGATAATATTTATTGCTTCTCTTGACCGTAGGACTTTCCACATGCAGGATTTTGGAACCTC +TTGCAGTACTACTGGGGAATGAGTTGCAATTATTGCTACACCATTGCGTGCATCGAGTAAGTCGCTTAATGTTCGTAAAA +AAGCAGAGAGCAAAGGTGGATGCAGATGAACCTCTGGTTCATCGAATAAAACTAATGACTTTTCGCCAACGACATCTACT +AATCTTGTGATAGTAAATAAAACAATTGCATGTCCAGAGCTCATTCGAAGCAGATATTTCTGGATATTGTCATAAAACAA +TTTAGTGAATTTATCATCGTCCACTTGAATCTGTGGTTCATTACGTCTTAACTCTTCATATTTAGAAATGAGGCTGATGA +GTTCCATATTTGAAAAGTTTTCATCACTACTTAGTTTTTTGATAGCTTCAAGCCAGAGTTGTCTTTTTCTATCTACTCTC +ATACAACCAATAAATGCTGAAATGAATTCTAAGCGGAGATCGCCTAGTGATTTTAAACTATTGCTGGCAGCATTCTTGAG +TCCAATATAAAAGTATTGTGTACCTTTTGCTGGGTCAGGTTGTTCTTTAGGAGGAGTAAAAGGATCAAATGCACTAAACG +AAACTGAAACAAGCGATCGAAAATATCCCTTTGGGATTCTTGACTCGATAAGTCTATTATTTTCAGAGAAAAAATATTCA 
+TTGTTTTCTGGGTTGGTGATTGCACCAATCATTCCATTCAAAATTGTTGTTTTACCACACCCATTCCGCCCGATAAAAGC +ATGAATGTTCGTGCTGGGCATAGAATTAACCGTCACCTCAAAAGGTATAGTTAAATCACTGAATCCGGGAGCACTTTTTC +TATTAAATGAAAAGTGGAAATCTGACAATTCTGGCAAACCATTTAACACACGTGCGAACTGTCCATGAATTTCTGAAAGA +GTTACCCCTCTAAGTAATGAGGTGTTAAGGACGCTTTCATTTTCAATGTCGGCTAATCGATTTGGCCATACTACTAAATC +CTGAATAGCTTTAAGAAGGTTATGTTTAAAACCATCGCTTAATTTGCTGAGATTAACATAGTAGTCAATGCTTTCACCTA +AGGAAAAAAACATTTCAGGGAGTTGACTGAATTTTTTATCTATTAATGAATAAGTGCTTACTTCTTCTTTTTGACCTACA +AAACCAATTTTAACATTTCCGATATCGCATTTTTCACCATGCTCATCAAAGACAGTAAGATAAAACATTGTAACAAAGGA +ATAGTCATTCCAACCATCTGCTCGTAGGAATGCCTTATTTTTTTCTACTGCAGGAATATACCCGCCTCTTTCAATAACAC +TAAACTCCAACATATAGTAACCCTTAATTTTATTAAAATAACCGCAATTTATTTGGCGGCAACACAGGATCTCTCTTTTA +AGTTACTCTCTATTACATACGTTTTCCATCTAAAAATTAGTAGTATTGAACTTAACGGGGCATCGTATTGTAGTTTTCCA +TATTTAGCTTTCTGCTTCCTTTTGGATAACCCACTGTTATTCATGTTGCATGGTGCACTGTTTATACCAACGATATAGTC +TATTAATGCATATATAGTATCGCCGAACGATTAGCTCTTCAGGCTTCTGAAGAAGCGTTTCAAGTACTAATAAGCCGATA +GATAGCCACGGACTTCGTAGCCATTTTTCATAAGTGTTAACTTCCGCTCCTCGCTCATAACAGACATTCACTACAGTTAT +GGCGGAAAGGTATGCATGCTGGGTGTGGGGAAGTCGTGAAAGAAAAGAAGTCAGCTGCGTCGTTTGACATCACTGCTATC +TTCTTACTGGTTATGCAGGTCGTAGTGGGTGGCACACAAAGCTTTGCACTGGATTGCGAGGCTTTGTGCTTCTCTGGAGT +GCGACAGGTTTGATGACAAAAAATTAGCGCAAGAAGACAAAAATCACCTTGCGCTAATGCTCTGTTACAGGTCACTAATA +CCATCTAAGTAGTTGATTCATAGTGACTGCATATGTTGTGTTTTACAGTATTATGTAGTCTGTTTTTTATGCAAAATCTA +ATTTAATATATTGATATTTATATCATTTTACGTTTCTCGTTCAGCTTTTTTATACTAAGTTGGCATTATAAAAAAGCATT +GCTTATCAATTTGTTGCAACGAACAGGTCACTATCAGTCAAAATAAAATCATTATTTGATTTCAATTTTGTCCCACTCCC +TGCCTCTGTCATCACGATACTGTGATGCCATGGTGTCCGACTTATGCCCGAGAAGATGTTGAGCAAACTTATCGCTTATC +TGCTTCTCATAGAGTCTTGCAGACAAACTGCGCAACTCGTGAAAGGTAGGCGGATCCCCTTCGAAGGAAAGACCTGATGC +TTTTCGTGCGCGCATAAAATACCTTGATACTGTGCCGGATGAAAGCGGTTCGCGACGAGTAGATGCAATTATGGTTTCTC +CGCCAAGAATCTCTTTGCATTTATCAAGTGTTTCCTTCATTGATATTCCGAGAGCATCAATATGCAATGCTGTTGGGATG +GCAATTTTTACGCCTGTTTTGCTTTGCTCGACATAAAGATATCCATCTACGATATCAGACCACTTCATTTCGCATAAATC 
+ACCAACTCGTTGCCCGGTAACAACAGCCAGTTCCATTGCAAGTCTGAGCCAACATGGTGATGATTCTGCTGCTTGATAAA +TTTTCAGGTATTCGTCAGCCGTAAGTCTTGATCTCCTTACCTCTGATTTTGCTGCGCGAGTGGCAGCGACATGGTTTGTT +GTTATATGGCCTTCAGCTATTGCCTCTCGGAATGCATCGCTCAGTGTTGATCTGATTAACTTGGCTGACGCCGCCTTGCC +CTCGTCTATGTATCCATTGAGCATTGCCGCAATTTCTTTTGTGGTGATGTCTTCAAGTGGAGCATCAGGCAGACCCCTCC +TTATTGCTTTAATTTTGCTCATGTAATTTATGAGTGTCTTCTGCTTGATTCCTCTGCTGGCCAGGATTTTTTCGTAGCGA +TCAAGCCATGAATGTAACGTAACGGAATTATCACTGTTGATTCTCGCTGTCAGAGGCTTGTGTTTGTGTCCTGAAAATAA +CTCAATGTTGGCCTGTATAGCTTCAGTGATTGCGATTCGCCTGTCTCTGCCTAATCCAAACTCTTTACCCGTCCTTGGGT +CCCTGTAGCAGTAATATCCATTGTTTCTTATATAAAGGTTAGGGGGTAAATCCCGGCGCTCATGACTTCGCCTTCTTCCC +ATTTCTGATCCTCTTCAAAAGGCCACCTGTTACTGGTCGATTTAAGTCAACCTTTACCGCTGATTCGTGGAACAGATACT +CTCTTCCATCCTTAACCGGAGGTGGGAATATCCTGCATTCCCGAACCCATCGACGAACTGTTTCAAGGCTTCTTGGACGT +CGCTGGCGTGCGTTCCACTCCTGAAGTGTCAAGTACATCGCAAAGTCTCCGCAATTACACGCAAGAAAAAACCGCCATCA +GGCGGCTTGGTGTTCTTTCAGTTCTTCAATTCGAATATTGGTTACGTCTGCATGTGCTATCTGCGCCCATATCATCCAGT +GGTCGTAGCAGTCGTTGATGTTCTCCGCTTCGATAACTCTGTTGAATGGCTCTCCATTCCATTCTCCTGTGACTCGGAAG +TGCATTTATCATCTCCATAAAACAAAACCCGCCGTAGCGAGTTCAGATAAAATAAATCCCCGCGAGTGCGAGGATTGTTA +TGTAATATTGGGTTTAATCATCTATATGTTTTGTACAGAGAGGGCAAGTATCGTTTCCACCGTACTCGTGATAATAATTT +TGCACGGTATCAGTCATTTCTCGCACATTGCAGAATGGGGATTTGTCTTCATTAGACTTATAAACCTTCATGGAATATTT +GTATGCCGACTCTATATCTATACCTTCATCTACATAAACACCTTCGTGATGTCTGCATGGAGACAAGACACCGGATCTGC +ACAACATTGATAACGCCCAATCTTTTTGCTCAGACTCTAACTCATTGATACTCATTTATAAACTCCTTGCAATGTATGTC +GTTTCAGCTAAACGGTATCAGCAATGTTTATGTAAAGAAACAGTAAGATAATACTCAACCCGATGTTTGAGTACGGTCAT +CATCTGACACTACAGACTCTGGCATCGCTGTGAAGACGACGCGAAATTCAGCATTTTCACAAGCGTTATCTTTTACAAAA +CCGATCTCACTCTCCTTTGATGCGAATGCCAGCGTCAGACATCATATGCAGATACTCACCTGCATCCTGAACCCATTGAC +CTCCAACCCCGTAATAGCGATGCGTAATGATGTCGATAGTTACTAACGGGTCTTGTTCGATTAACTGCCGCAGAAACTCT +TCCAGGTCACCAGTGCAGTGCTTGATAACAGGAGTCTTCCCAGGATGGCGAACAACAAGAAACTGGTTTCCGTCTTCACG +GACTTCGTTGCTTTCCAGTTTAGCAATACGCTTACTCCCATCCGAGATAACACCTTCGTAATACTCACGCTGCTCGTTGA 
+GTTTTGATTTTGCTGTTTCAAGCTCAACACGCAGTTTCCCTACTGTTAGCGCAATATCCTCGTTCTCCTGGTCGCGGCGT +TTGATGTATTGCTGGTTTCTTTCCCGTTCATCCAGCAGTTCCAGCACAATCGATGGTGTTACCAATTCATGGAAAAGGTC +TGCGTCAAATCCCCAGTCGTCATGCATTGCCTGCTCTGCCGCTTCACGCAGTGCCTGAGAGTTAATTTCGCTCACTTCGA +ACCTCTCTGTTTACTGATAAGTTCCAGATCCTCCTGGCAACTTGCACAAGTCCGACAACCCTGAACGACCAGGCGTCTTC +GTTCATCTATCGGATCGCCACACTCACAACAATGAGTGGCAGATATAGCCTGGTGGTTCAGGCGGCGCATTTTTATTGCT +GTGTTGCGCTGTAATTCTTCTATTTCTGATGCTGAATCAATGATGTCTGCCATCTTTCATTAATCCCTGAACTGTTGGTT +AATACGCTTGAGGGTGAATGCGAATAATAAAAAAGGAGCCTGTAGCTCCCTGATGATTTTGCTTTTCATGTTCATCGTTC +CTTAAAGACGCCGTTTAACATGCCGATTGCCAGGCTTAAATGAGTCGGTGTGAATCCCATCAGCGTTACCGTTTCGCGGT +GCTTCTTCAGTACGCTACGGCAAATGTCATCGACGTTTTTATCCGGAAACTGCTGTCTGGCTTTTTTTGATTTCAGAATT +AGCCTGACGGGCAATGCTGCGAAGGGCGTTTTCCTGCTGAGGTGTCATTGAACAAGTCCCATGTCGGCAAGCATAAGCAC +ACAGAATATGAAGCCCGCTGCCAGAAAAATGCATTCCGTGGTTGTCATACCTGGTTTCTCTCATCTGCTTCTGCTTTCGC +CACCATCATTTCCAGCTTTTGTGAAAGGGATGCGGCTAACGTATGAAATTCTTCGTCTGTTTCTACTGGTATTGGCACAA +ACCTGATTCCAATTTGAGCAAGGCTATGTGCCATCTCGATACTCGTTCTTAACTCAACAGAAGATGCTTTGTGCATACAG +CCCCTCGTTTATTATTTATCTCCTCAGCCAGCCGCTGTGCTTTCAGTGGATTTCGGATAACAGAAAGGCCGGGAAATACC +CAGCCTCGCTTTGTAACGGAGTAGACGAAAGTGATTGCGCCTACCCGGATATTATCGTGAGGATGCGTCATCGCCATTGC +TCCCCAAATACAAAACCAATTTCAGCCAGTGCCTCGTCCATTTTTTCGATGAACTCCGGCACGATCTCGTCAAAACTCGC +CATGTACTTTTCATCCCGCTCAATCACGACATAATGCAGGCCTTCACGCTTCATACGCGGGTCATAGTTGGCAAAGTACC +AGGCATTTTTTCGCGTCACCCACATGCTGTACTGCACCTGGGCCATGTAAGCTGACTTTATGGCCTCGAAACCACCGAGC +CGGAACTTCATGAAATCCCGGGAGGTAAACGGGCATTTCAGTTCAAGGCCGTTGCCGTCACTGCATAAACCATCGGGAGA +GCAGGCGGTACGCATACTTTCGTCGCGATAGATGATCGGGGATTCAGTAACATTCACGCCGGAAGTGAATTCAAACAGGG +TTCTGGCGTCGTTCTCGTACTGTTTTCCCCAGGCCAGTGCTTTAGCGTTAACTTCCGGAGCCACACCGGTGCAAACCTCA +GCAAGCAGGGTGTGGAAGTAGGACATTTTCATGTCAGGCCACTTCTTTCCGGAGCGGGGTTTTGCTATCACGTTGTGAAC +TTCTGAAGCGGTGATGACGCCGAGCCGTAATTTGTGCCACGCATCATCCCCCTGTTCGACAGCTCTCACATCGATCCCGG +TACGCTGCAGGATAATGTCCGGTGTCATGCTGCCACCTTCTGCTCTGCGGCTTTCTGTTTCAGGAATCCAAGAGCTTTTA 
+CTGCTTCGGCCTGTGTCAGTTCTGACGATGCACGAATGTCGCGGCGAAATATCTGGGAACAGAGCGGCAATAAGTCGTCA +TCCCATGTTTTATCCAGGGCGATCAGCAGAGTGTTAATCTCCTGCATGGTTTCATCGTTAACCGGAGTGATGTCGCGTTC +CGGCTGACGTTCTGCAGTGTATGCAGTATTTTCGACAATGCGCTCGGCTTCATCCTTGTCATAGATACCAGCAAATCCGA +AGGCCAGACGGGCACACTGAATCATGGCTTTATGACGTAACATCCGTTTGGGATGCGACTGCCACGGCCCCGTGATTTCT +CTGCCTTCGCGAGTTTTGAATGGTTCGCGGCGGCATTCATCCATCCATTCGGTAACGCAGATCGGATGATTACGGTCCTT +GCGGTAAATCCGGCATGTACAGGATTCATTGTCCTGCTCAAAGTCCATGCCATCAAACTGCTGGTTTTCATTGATGATGC +GGGACCAGCCATCAACGCCCACCACCGGAACGATGCCATTCTGCTTATCAGGAAAGGCGTAAATTTCTTTCGTCCACGGA +TTAAGGCCGTACTGGTTGGCAACGATCAGTAATGCGATGAACTGCGCATCGCTGGCATCACCTTTAAATGCCGTCTGGCG +AAGAGTGGTGATCAGTTCCTGTGGGTCGACAGAATCCATGCCGACACGTTCAGCCAGCTTCCCAGCCAGCGTTGCGAGTG +CAGTACTCATTCGTTTTATACCTCTGAATCAATATCAACCTGGTGGTGAGCAATGGTTTCAACCATGTACCGGATGTGTT +CTGCCATGCGCTCCTGAAACTCAACATCGTCATCAAACGCACGGGTAATGGATTTTTTGCTGGCCCCGTGGCGTTGCAAA +TGATCGATGCATAGCGATTCAAACAGGTGCTGGGGCAGGCCTTTTTCCATGTCGTCTGCCAGTTCTGCCTCTTTCTCTTC +ACGGGCGAGCTGCTGGTAGTGACGCGCCCAGCTCTGAGCCTCAAGACGATCCTGAATGTAATAAGCGTTCATGGCTGAAC +TCCTGAAATAGCTGTGAAAATATCGCCCGCGAAATGCCGGGCTGATTAGGAAAACAGGAAAGGGGGTTAGTGAATGCTTT +TGCTTGATCTCAGTTTCAGTATTAATATCCATTTTTTATAAGCGTCGACGGCTTCACGAAACATCTTTTCATCGCCAATA +AAAGTGGCGATAGTGAATTTAGTCTGGATAGCCATAAGTGTTTGATCCATTCTTTGGGACTCCTGGCTGATTAAGTATGT +CGATAAGGCGTTTCCATCCGTCACGTAATTTACGGGTGATTCGTTCAAGTAAAGATTCGGAAGGGCAGCCAGCAACAGGC +CACCCTGCAATGGCATATTGCATGGTGTGCTCCTTATTTATACATAACGAAAAACGCCTCGAGTGAAGCGTTATTGGTAT +GCGGTAAAACCGCACTCAGGCGGCCTTGATAGTCATATCATCTGAATCAAATATTCCTGATGTATCGATATCGGTAATTC +TTATTCCTTCGCTACCATCCATTGGAGGCCATCCTTCCTGACCATTTCCATCATTCCAGTCGAACTCACACACAACACCA +TATGCATTTAAGTCGCTTGAAATTGCTATAAGCAGAGCATGTTGCGCCAGCATGATTAATACAGCATTTAATACAGAGCC +GTGTTTATTGAGTCGGTATTCAGAGTCTGACCAGAAATTATTAATCTGGTGAAGTTTTTCCTCTGTCATTACGTCATGGT +CGATTTCAATTTCTATTGATGCTTTCCAGTCGTAATCAATGATGTATTTTTTGATGTTTGACATCTGTTCATATCCTCAC +AGATAAAAAATCGCCCTCACACTGGAGGGCAAAGAAGATTTCCAATAATCAGAACAAGTCGGCTCCTGTTTAGTTACGAG 
+CGACATTGCTCCGTGTATTCACTCGTTGGAATGAATACACAGTGCAGTGTTTATTCTGTTATTTATGCCAAAAATAAAGG +CCACTATCAGGCAGCTTTGTTGTTCTGTTTACCAAGTTCTCTGGCAATCATTGCCGTCGTTCGTATTGCCCATTTATCGA +CATATTTCCCATCTTCCATTACAGGAAACATTTCTTCAGGCTTAACCATGCATTCCGATTGCAGCTTGCATCCATTGCAT +CGCTTGAATTGTCCACACCATTGATTTTTATCAATAGTCGTAGTCATACGGATAGTCCTGGTATTGTTCCATCACATCCT +GAGGATGCTCTTCGAACTCTTCAAATTCTTCTTCCATATATCACCTTAAATAGTGGATTGCGGTAGTAAAGATTGTGCCT +GTCTTTTAACCACATCAGGCTCGGTGGTTCTCGTGTACCCCTACAGCGAGAAATCGGATAAACTATTACAACCCCTACAG +TTTGATGAGTATAGAAATGGATCCACTCGTTATTCTCGGACGAGTGTTCAGTAATGAACCTCTGGAGAGAACCATGTATA +TGATCGTTATCTGGGTTGGACTTCTGCTTTTAAGCCCAGATAACTGGCCTGAATATGTTAATGAGAGAATCGGTATTCCT +CATGTGTGGCATGTTTTCGTCTTTGCTCTTGCATTTTCGCTAGCAATTAATGTGCATCGATTATCAGCTATTGCCAGCGC +CAGATATAAGCGATTTAAGCTAAGAAAACGCATTAAGATGCAAAACGATAAAGTGCGATCAGTAATTCAAAACCTTACAG +AAGAGCAATCTATGGTTTTGTGCGCAGCCCTTAATGAAGGCAGGAAGTATGTGGTTACATCAAAACAATTCCCATACATT +AGTGAGTTGATTGAGCTTGGTGTGTTGAACAAAACTTTTTCCCGATGGAATGGAAAGCATATATTATTCCCTATTGAGGA +TATTTACTGGACTGAATTAGTTGCCAGCTATGATCCATATAATATTGAGATAAAGCCAAGGCCAATATCTAAGTAACTAG +ATAAGAGGAATCGATTTTCCCTTAATTTTCTGGCGTCCACTGCATGTTATGCCGCGTTCGCCAGGCTTGCTGTACCATGT +GCGCTGATTCTTGCGCTCAATACGTTGCAGGTTGCTTTCAATCTGTTTGTGGTATTCAGCCAGCACTGTAAGGTCTATCG +GATTTAGTGCGCTTTCTACTCGTGATTTCGGTTTGCGATTCAGCGAGAGAATAGGGCGGTTAACTGGTTTTGCGCTTACC +CCAACCAACAGGGGATTTGCTGCTTTCCATTGAGCCTGTTTCTCTGCGCGACGTTCGCGGCGGCGTGTTTGTGCATCCAT +CTGGATTCTCCTGTCAGTTAGCTTTGGTGGTGTGTGGCAGTTGTAGTCCTGAACGAAAACCCCCCGCGATTGGCACATTG +GCAGCTAATCCGGAATCGCACTTACGGCCAATGCTTCGTTTCGTATCACACACCCCAAAGCCTTCTGCTTTGAATGCTGC +CCTTCTTCAGGGCTTAATTTTTAAGAGCGTCACCTTCATGGTGGTCAGTGCGTCCTGCTGATGTGCTCAGTATCACCGCC +AGTGGTATTTATGTCAACACCGCCAGAGATAATTTATCACCGCAGATGGTTATCTGTATGTTTTTTATATGAATTTATTT +TTTGCAGGGGGGCATTGTTTGGTAGGTGAGAGATCTGAATTGCTATGTTTAGTGAGTTGTATCTATTTATTTTTCAATAA +ATACAATTGGTTATGTGTTTTGGGGGCGATCGTGAGGCAAAGAAAACCCGGCGCTGAGGCCGGGTTATTCTTGTTCTCTG +GTCAAATTATATAGTTGGAAAACAAGGATGCATATATGAATGAACGATGCAGAGGCAATGCCGATGGCGATAGTGGGTAT 
+CATGTAGCCGCTTATGCTGGAAAGAAGCAATAACCCGCAGAAAAACAAAGCTCCAAGCTCAACAAAACTAAGGGCATAGA +CAATAACTACCGATGTCATATACCCATACTCTCTAATCTTGGCCAGTCGGCGCGTTCTGCTTCCGATTAGAAACGTCAAG +GCAGCAATCAGGATTGCAATCATGGTTCCTGCATATGATGACAATGTCGCCCCAAGACCATCTCTATGAGCTGAAAAAGA +AACACCAGGAATGTAGTGGCGGAAAAGGAGATAGCAAATGCTTACGATAACGTAAGGAATTATTACTATGTAAACACCAG +GCATGATTCTGTTCCGCATAATTACTCCTGATAATTAATCCTTAACTTTGCCCACCTGCCTTTTAAAACATTCCAGTATA +TCACTTTTCATTCTTGCGTAGCAATATGCCATCTCTTCAGCTATCTCAGCATTGGTGACCTTGTTCAGAGGCGCTGAGAG +ATGGCCTTTTTCTGATAGATAATGTTCTGTTAAAATATCTCCGGCCTCATCTTTTGCCCGCAGGCTAATGTCTGAAAATT +GAGGTGACGGGTTAAAAATAATATCCTTGGCAACCTTTTTTATATCCCTTTTAAATTTTGGCTTAATGACTATATCCAAT +GAGTCAAAAAGCTCCCCTTCAATATCTGTTGCCCCTAAGACCTTTAATATATCGCCAAATACAGGTAGCTTGGCTTCTAC +CTTCACCGTTGTTCGGCCGATGAAATGCATATGCATAACATCGTCTTTGGTGGTTCCCCTCATCAGTGGCTCTATCTGAA +CGCGCTCTCCACTGCTTAATGACATTCCTTTCCCGATTAAAAAATCTGTCAGATCGGATGTGGTCGGCCCGAAAACAGTT +CTGGCAAAACCAATGGTGTCGCCTTCAACAAACAAAAAAGATGGGAATCCCAATGATTCGTCATCTGCGAGGCTGTTCTT +AATATCTTCAACTGAAGCTTTAGAGCGATTTATCTTCTGAACCAGACTCTTGTCATTTGTTTTGGTAAAGAGAAAAGTTT +TTCCATCGATTTTATGAATATACAAATAATTGGAGCCAACCTGCAGGTGATGATTATCAGCCAGCAGAGAATTAAGGAAA +ACAGACAGGTTTATTGAGCGCTTATCTTTCCCTTTATTTTTGCTGCGGTAAGTCGCATAAAAACCATTCTTCATAATTCA +ATCCATTTACTATGTTATGTTCTGAGGGGAGTGAAAATTCCCCTAATTCGATGAAGATTCTTGCTCAATTGTTATCAGCT +ATGCGCCGACCAGAACACCTTGCCGATCAGCCAAACGTCTCTTCAGGCCACTGACTAGCGATAACTTTCCCCACAACGGA +ACAACTCTCATTGCATGGGATCATTGGGTACTGTGGGTTTAGTGGTTGTAAAAACACCTGACCGCTATCCCTGATCAGTT +TCTTGAAGGTAAACTCATCACCCCCAAGTCTGGCTATGCAGAAATCACCTGGCTCAACAGCCTGCTCAGGGTCAACGAGA +ATTAACATTCCGTCAGGAAAGCTTGGCTTGGAGCCTGTTGGTGCGGTCATGGAATTACCTTCAACCTCAAGCCAGAATGC +AGAATCACTGGCTTTTTTGGTTGTGCTTACCCATCTCTCCGCATCACCTTTGGTAAAGGTTCTAAGCTTAGGTGAGAACA +TCCCTGCCTGAACATGAGAAAAAACAGGGTACTCATACTCACTTCTAAGTGACGGCTGCATACTAACCGCTTCATACATC +TCGTAGATTTCTCTGGCGATTGAAGGGCTAAATTCTTCAACGCTAACTTTGAGAATTTTTGTAAGCAATGCGGCGTTATA +AGCATTTAATGCATTGATGCCATTAAATAAAGCACCAACGCCTGACTGCCCCATCCCCATCTTGTCTGCGACAGATTCCT 
+GGGATAAGCCAAGTTCATTTTTCTTTTTTTCATAAATTGCTTTAAGGCGACGTGCGTCCTCAAGCTGCTCTTGTGTTAAT +GGTTTCTTTTTTGTGCTCATACGTTAAATCTATCACCGCAAGGGATAAATATCTAACACCGTGCGTGTTGACTATTTTAC +CTCTGGCGGTGATAATGGTTGCATGTACTAAGGAGGTTGTATGGAACAACGCATAACCCTGAAAGATTATGCAATGCGCT +TTGGGCAAACCAAGACAGCTAAAGATCTCGGCGTATATCAAAGCGCGATCAACAAGGCCATTCATGCAGGCCGAAAGATT +TTTTTAACTATAAACGCTGATGGAAGCGTTTATGCGGAAGAGGTAAAGCCCTTCCCGAGTAACAAAAAAACAACAGCATA +AATAACCCCGCTCTTACACATTCCAGCCCTGAAAAAGGGCATCAAATTAAACCACACCTATGGTGTATGCATTTATTTGC +ATACATTCAATCAATTGTTATCTAAGGAAATACTTACATATGGTTCGTGCAAACAAACGCAACGAGGCTCTACGAATCGA +GAGTGCGTTGCTTAACAAAATCGCAATGCTTGGAACTGAGAAGACAGCGGAAGCTGTGGGCGTTGATAAGTCGCAGATCA +GCAGGTGGAAGAGGGACTGGATTCCAAAGTTCTCAATGCTGCTTGCTGTTCTTGAATGGGGGGTCGTTGACGACGACATG +GCTCGATTGGCGCGACAAGTTGCTGCGATTCTCACCAATAAAAAACGCCCGGCGGCAACCGAGCGTTCTGAACAAATCCA +GATGGAGTTCTGAGGTCATTACTGGATCTATCAACAGGAGTCATTATGACAAATACAGCAAAAATACTCAACTTCGGCAG +AGGTAACTTTGCCGGACAGGAGCGTAATGTGGCAGATCTCGATGATGGTTACGCCAGACTATCAAATATGCTGCTTGAGG +CTTATTCGGGCGCAGATCTGACCAAGCGACAGTTTAAAGTGCTGCTTGCCATTCTGCGTAAAACCTATGGGTGGAATAAA +CCAATGGACAGAATCACCGATTCTCAACTTAGCGAGATTACAAAGTTACCTGTCAAACGGTGCAATGAAGCCAAGTTAGA +ACTCGTCAGAATGAATATTATCAAGCAGCAAGGCGGCATGTTTGGACCAAATAAAAACATCTCAGAATGGTGCATCCCTC +AAAACGAGGGAAAATCCCCTAAAACGAGGGATAAAACATCCCTCAAATTGGGGGATTGCTATCCCTCAAAACAGGGGGAC +ACAAAAGACACTATTACAAAAGAAAAAAGAAAAGATTATTCGTCAGAGAATTCTGGCGAATCCTCTGACCAGCCAGAAAA +CGACCTTTCTGTGGTGAAACCGGATGCTGCAATTCAGAGCGGCAGCAAGTGGGGGACAGCAGAAGACCTGACCGCCGCAG +AGTGGATGTTTGACATGGTGAAGACTATCGCACCATCAGCCAGAAAACCGAATTTTGCTGGGTGGGCTAACGATATCCGC +CTGATGCGTGAACGTGACGGACGTAACCACCGCGACATGTGTGTGCTGTTCCGCTGGGCATGCCAGGACAACTTCTGGTC +CGGTAACGTGCTGAGCCCGGCCAAACTCCGCGATAAGTGGACCCAACTCGAAATCAACCGTAACAAGCAACAGGCAGGCG +TGACAGCCAGCAAACCAAAACTCGACCTGACAAACACAGACTGGATTTACGGGGTGGATCTATGAAAAACATCGCCGCAC +AGATGGTTAACTTTGACCGTGAGCAGATGCGTCGGATCGCCAACAACATGCCGGAACAGTACGACGAAAAGCCGCAGGTA +CAGCAGGTAGCGCAGATCATCAACGGTGTGTTCAGCCAGTTACTGGCAACTTTCCCGGCGAGCCTGGCTAACCGTGACCA 
+GAACGAAGTGAACGAAATCCGTCGCCAGTGGGTTCTGGCTTTTCGGGAAAACGGGATCACCACGATGGAACAGGTTAACG +CAGGAATGCGCGTAGCCCGTCGGCAGAATCGACCATTTCTGCCATCACCCGGGCAGTTTGTTGCATGGTGCCGGGAAGAA +GCATCCGTTACCGCCGGACTGCCAAACGTCAGCGAGCTGGTTGATATGGTTTACGAGTATTGCCGGAAGCGAGGCCTGTA +TCCGGATGCGGAGTCTTATCCGTGGAAATCAAACGCGCACTACTGGCTGGTTACCAACCTGTATCAGAACATGCGGGCCA +ATGCGCTTACTGATGCGGAATTACGCCGTAAGGCCGCAGATGAGCTTGTCCATATGACTGCGAGAATTAACCGTGGTGAG +GCGATCCCTGAACCAGTAAAACAACTTCCTGTCATGGGCGGTAGACCTCTAAATCGTGCACAGGCTCTGGCGAAGATCGC +AGAAATCAAAGCTAAGTTCGGACTGAAAGGAGCAAGTGTATGACGGGCAAAGAGGCAATTATTCATTACCTGGGGACGCA +TAATAGCTTCTGTGCGCCGGACGTTGCCGCGCTAACAGGCGCAACAGTAACCAGCATAAATCAGGCCGCGGCTAAAATGG +CACGGGCAGGTCTTCTGGTTATCGAAGGTAAGGTCTGGCGAACGGTGTATTACCGGTTTGCTACCAGGGAAGAACGGGAA +GGAAAGATGAGCACGAACCTGGTTTTTAAGGAGTGTCGCCAGAGTGCCGCGATGAAACGGGTATTGGCGGTATATGGAGT +TAAAAGATGACCATCTACATTACTGAGCTAATAACAGGCCTGCTGGTAATCGCAGGCCTTTTTATTTGGGGGAGAGGGAA +GTCATGAAAAAACTAACCTTTGAAATTCGATCTCCAGCACATCAGCAAAACGCTATTCACGCAGTACAGCAAATCCTTCC +AGACCCAACCAAACCAATCGTAGTAACCATTCAGGAACGCAACCGCAGCTTAGACCAAAACAGGAAGCTATGGGCCTGCT +TAGGTGACGTCTCTCGTCAGGTTGAATGGCATGGTCGCTGGCTGGATGCAGAAAGCTGGAAGTGTGTGTTTACCGCAGCA +TTAAAGCAGCAGGATGTTGTTCCTAACCTTGCCGGGAATGGCTTTGTGGTAATAGGCCAGTCAACCAGCAGGATGCGTGT +AGGCGAATTTGCGGAGCTATTAGAGCTTATACAGGCATTCGGTACAGAGCGTGGCGTTAAGTGGTCAGACGAAGCGAGAC +TGGCTCTGGAGTGGAAAGCGAGATGGGGAGACAGGGCTGCATGATAAATGTCGTTAGTTTCTCCGGTGGCAGGACGTCAG +CATATTTGCTCTGGCTAATGGAGCAAAAGCGACGGGCAGGTAAAGACGTGCATTACGTTTTCATGGATACAGGTTGTGAA +CATCCAATGACATATCGGTTTGTCAGGGAAGTTGTGAAGTTCTGGGATATACCGCTCACCGTATTGCAGGTTGATATCAA +CCCGGAGCTTGGACAGCCAAATGGTTATACGGTATGGGAACCAAAGGATATTCAGACGCGAATGCCTGTTCTGAAGCCAT +TTATCGATATGGTAAAGAAATATGGCACTCCATACGTCGGCGGCGCGTTCTGCACTGACAGATTAAAACTCGTTCCCTTC +ACCAAATACTGTGATGACCATTTCGGGCGAGGGAATTACACCACGTGGATTGGCATCAGAGCTGATGAACCGAAGCGGCT +AAAGCCAAAGCCTGGAATCAGATATCTTGCTGAACTGTCAGACTTTGAGAAGGAAGATATCCTCGCATGGTGGAAGCAAC +AACCATTCGATTTGCAAATACCGGAACATCTCGGTAACTGCATATTCTGCATTAAAAAATCAACGCAAAAAATCGGACTT 
+GCCTGCAAAGATGAGGAGGGATTGCAGCGTGTTTTTAATGAGGTCATCACGGGATCCCATGTGCGTGACGGACATCGGGA +AACGCCAAAGGAGATTATGTACCGAGGAAGAATGTCGCTGGACGGTATCGCGAAAATGTATTCAGAAAATGATTATCAAG +CCCTGTATCAGGACATGGTACGAGCTAAAAGATTCGATACCGGCTCTTGTTCTGAGTCATGCGAAATATTTGGAGGGCAG +CTTGATTTCGACTTCGGGAGGGAAGCTGCATGATGCGATGTTATCGGTGCGGTGAATGCAAAGAAGATAACCGCTTCCGA +CCAAATCAACCTTACTGGAATCGATGGTGTCTCCGGTGTGAAAGAACACCAACAGGGGTGTTACCACTACCGCAGGAAAA +GGAGGACGTGTGGCGAGACAGCGACGAAGTATCACCGACATAATCTGCGAAAACTGCAAATACCTTCCAACGAAACGCAC +CAGAAATAAACCCAAGCCAATCCCAAAAGAATCTGACGTAAAAACCTTCAACTACACGGCTCACCTGTGGGATATCCGGT +GGCTAAGACGTCGTGCGAGGAAAACAAGGTGATTGACCAAAATCGAAGTTACGAACAAGAAAGCGTCGAGCGAGCTTTAA +CGTGCGCTAACTGCGGTCAGAAGCTGCATGTGCTGGAAGTTCACGTGTGTGAGCACTGCTGCGCAGAACTGATGAGCGAT +CCGAATAGCTCGATGCACGAGGAAGAAGATGATGGCTAAACCAGCGCGAAGACGATGTAAAAACGATGAATGCCGGGAAT +GGTTTCACCCTGCATTCGCTAATCAGTGGTGGTGCTCTCCAGAGTGTGGAACCAAGATAGCACTCGAACGACGAAGTAAA +GAACGCGAAAAAGCGGAAAAAGCAGCAGAGAAGAAACGACGACGAGAGGAGCAGAAACAGAAAGATAAACTTAAGATTCG +AAAACTCGCCTTAAAGCCCCGCAGTTACTGGATTAAACAAGCCCAACAAGCCGTAAACGCCTTCATCAGAGAAAGAGACC +GCGACTTACCATGTATCTCGTGCGGAACGCTCACGTCTGCTCAGTGGGATGCCGGACATTACCGGACAACTGCTGCGGCA +CCTCAACTCCGATTTAATGAACGCAATATTCACAAGCAATGCGTGGTGTGCAACCAGCACAAAAGCGGAAATCTCGTTCC +GTATCGCGTCGAACTGATTAGCCGCATCGGGCAGGAAGCAGTAGACGAAATCGAATCAAACCATAACCGCCATCGCTGGA +CTATCGAAGAGTGCAAGGCGATCAAGGCAGAGTACCAACAGAAACTCAAAGACCTGCGAAATAGCAGAAGTGAGGCCGCA +TGACGTTCTCAGTAAAAACCATTCCAGACATGCTCGTTGAAACATACGGAAATCAGACAGAAGTAGCACGCAGACTGAAA +TGTAGTCGCGGTACGGTCAGAAAATACGTTGATGATAAAGACGGGAAAATGCACGCCATCGTCAACGACGTTCTCATGGT +TCATCGCGGATGGAGTGAAAGAGATGCGCTATTACGAAAAAATTGATGGCAGCAAATACCGAAATATTTGGGTAGTTGGC +GATCTGCACGGATGCTACACGAACCTGATGAACAAACTGGATACGATTGGATTCGACAACAAAAAAGACCTGCTTATCTC +GGTGGGCGATTTGGTTGATCGTGGTGCAGAGAACGTTGAATGCCTGGAATTAATCACATTCCCCTGGTTCAGAGCTGTAC +GTGGAAACCATGAGCAAATGATGATTGATGGCTTATCAGAGCGTGGAAACGTTAATCACTGGCTGCTTAATGGCGGTGGC +TGGTTCTTTAATCTCGATTACGACAAAGAAATTCTGGCTAAAGCTCTTGCCCATAAAGCAGATGAACTTCCGTTAATCAT 
+CGAACTGGTGAGCAAAGATAAAAAATATGTTATCTGCCACGCCGATTATCCCTTTGACGAATACGAGTTTGGAAAGCCAG +TTGATCATCAGCAGGTAATCTGGAACCGCGAACGAATCAGCAACTCACAAAACGGGATCGTGAAAGAAATCAAAGGCGCG +GACACGTTCATCTTTGGTCATACGCCAGCAGTGAAACCACTCAAGTTTGCCAACCAAATGTATATCGATACCGGCGCAGT +GTTCTGCGGAAACCTAACATTGATTCAGGTACAGGGAGAAGGCGCATGAGACTCGAAAGCGTAGCTAAATTTCATTCGCC +AAAAAGCCCGATGATGAGCGACTCACCACGGGCCACGGCTTCTGACTCTCTTTCCGGTACTGATGTGATGGCTGCTATGG +GGATGGCGCAATCACAAGCCGGATTCGGTATGGCTGCATTCTGCGGTAAGCACGAACTCAGCCAGAACGACAAACAAAAG +GCTATCAACTATCTGATGCAATTTGCACACAAGGTATCGGGGAAATACCGTGGTGTGGCAAAGCTTGAAGGAAATACTAA +GGCAAAGGTACTGCAAGTGCTCGCAACATTCGCTTATGCGGATTATTGCCGTAGTGCCGCGACGCCGGGGGCAAGATGCA +GAGATTGCCATGGTACAGGCCGTGCGGTTGATATTGCCAAAACAGAGCTGTGGGGGAGAGTTGTCGAGAAAGAGTGCGGA +AGATGCAAAGGCGTCGGCTATTCAAGGATGCCAGCAAGCGCAGCATATCGCGCTGTGACGATGCTAATCCCAAACCTTAC +CCAACCCACCTGGTCACGCACTGTTAAGCCGCTGTATGACGCTCTGGTGGTGCAATGCCACAAAGAAGAGTCAATCGCAG +ACAACATTTTGAATGCGGTCACACGTTAGCAGCATGATTGCCACGGATGGCAACATATTAACGGCATGATATTGACTTAT +TGAATAAAATTGGGTAAATTTGACTCAACGATGGGTTAATTCGCTCGTTGTGGTAGTGAGATGAAAAGAGGCGGCGCTTA +CTACCGATTCCGCCTAGTTGGTCACTTCGACGTATCGTCTGGAACTCCAACCATCGCAGGCAGAGAGGTCTGCAAAATGC +AATCCCGAAACAGTTCGCAGGTAATAGTTAGAGCCTGCATAACGGTTTCGGGATTTTTTATATCTGCACAACAGGTAAGA +GCATTGAGTCGATAATCGTGAAGAGTCGGCGAGCCTGGTTAGCCAGTGCTCTTTCCGTTGTGCTGAATTAAGCGAATACC +GGAAGCAGAACCGGATCACCAAATGCGTACAGGCGTCATCGCCGCCCAGCAACAGCACAACCCAAACTGAGCCGTAGCCA +CTGTCTGTCCTGAATTCATTAGTAATAGTTACGCTGCGGCCTTTTACACATGACCTTCGTGAAAGCGGGTGGCAGGAGGT +CGCGCTAACAACCTCCTGCCGTTTTGCCCGTGCATATCGGTCACGAACAAATCTGATTACTAAACACAGTAGCCTGGATT +TGTTCTATCAGTAATCGACCTTATTCCTAATTAAATAGAGCAAATCCCCTTATTGGGGGTAAGACATGAAGATGCCAGAA +AAACATGACCTGTTGGCCGCCATTCTCGCGGCAAAGGAACAAGGCATCGGGGCAATCCTTGCGTTTGCAATGGCGTACCT +TCGCGGCAGATATAATGGCGGTGCGTTTACAAAAACAGTAATCGACGCAACGATGTGCGCCATTATCGCCTAGTTCATTC +GTGACCTTCTCGACTTCGCCGGACTAAGTAGCAATCTCGCTTATATAACGAGCGTGTTTATCGGCTACATCGGTACTGAC +TCGATTGGTTCGCTTATCAAACGCTTCGCTGCTAAAAAAGCCGGAGTAGAAGATGGTAGAAATCAATAATCAACGTAAGG 
+CGTTCCTCGATATGCTGGCGTGGTCGGAGGGAACTGATAACGGACGTCAGAAAACCAGAAATCATGGTTATGACGTCATT +GTAGGCGGAGAGCTATTTACTGATTACTCCGATCACCCTCGCAAACTTGTCACGCTAAACCCAAAACTCAAATCAACAGG +CGCCGGACGCTACCAGCTTCTTTCCCGTTGGTGGGATGCCTACCGCAAGCAGCTTGGCCTGAAAGACTTCTCTCCGAAAA +GTCAGGACGCTGTGGCATTGCAGCAGATTAAGGAGCGTGGCGCTTTACCTATGATTGATCGTGGTGATATCCGTCAGGCA +ATCGACCGTTGCAGCAATATCTGGGCTTCACTGCCGGGCGCTGGTTATGGTCAGTTCGAGCATAAGGCTGACAGCCTGAT +TGCAAAATTCAAAGAAGCGGGCGGAACGGTCAGAGAGATTGATGTATGAGCAGAGTCACCGCGATTATCTCCGCTCTGGT +TATCTGCATCATCGTCTGCCTGTCATGGGCTGTTAATCATTACCGTGATAACGCCATTACCTACAAAGCCCAGCGCGACA +AAAATGCCAGAGAACTGAAGCTGGCGAACGCGGCAATTACTGACATGCAGATGCGTCAGCGTGATGTTGCTGCGCTCGAT +GCAAAATACACGAAGGAGTTAGCTGATGCTAAAGCTGAAAATGATGCTCTGCGTGATGATGTTGCCGCTGGTCGTCGTCG +GTTGCACATCAAAGCAGTCTGTCAGTCAGTGCGTGAAGCCACCACCGCCTCCGGCGTGGATAATGCAGCCTCCCCCCGAC +TGGCAGACACCGCTGAACGGGATTATTTCACCCTCAGAGAGAGGCTGATCACTATGCAAAAACAACTGGAAGGAACCCAG +AAGTATATTAATGAGCAGTGCAGATAGAGTTGCCCATATCGATGGGCAACTCATGCAATTATTGTGAGCAATACACACGC +GCTTCCAGCGGAGTATAAATGCCTAAAGTAATAAAACCGAGCAATCCATTTACGAATGTTTGCTGGGTTTCTGTTTTAAC +AACATTTTCTGCGCCGCCACAAATTTTGGCTGCATCGACAGTTTTCTTCTGCCCAATTCCAGAAACGAAGAAATGATGGG +TGATGGTTTCCTTTGGTGCTACTGCTGCCGGTTTGTTTTGAACAGTAAACGTCTGTTGAGCACATCCTGTAATAAGCAGG +GCCAGCGCAGTAGCGAGTAGCATTTTTTTCATGGTGTTATTCCCGATGCTTTTTGAAGTTCGCAGAATCGTATGTGTAGA +AAATTAAACAAACCCTAAACAATGAGTTGAAATTTCATATTGTTAATATTTATTAATGTATGTCAGGTGCGATGAATCGT +CATTGTATTCCCGGATTAACTATGTCCACAGCCCTGACGGGGAACTTCTCTGCGGGAGTGTCCGGGAATAATTAAAACGA +TGCACACAGGGTTTAGCGCGTACACGTATTGCATTATGCCAACGCCCCGGTGCTGACACGGAAGAAACCGGACGTTATGA +TTTAGCGTGGAAAGATTTGTGTAGTGTTCTGAATGCTCTCAGTAAATAGTAATGAATTATCAAAGGTATAGTAATATCTT +TTATGTTCATGGATATTTGTAACCCATCGGAAAACTCCTGCTTTAGCAAGATTTTCCCTGTATTGCTGAAATGTGATTTC +TCTTGATTTCAACCTATCATAGGACGTTTCTATAAGATGCGTGTTTCTTGAGAATTTAACATTTACAACCTTTTTAAGTC +CTTTTATTAACACGGTGTTATCGTTTTCTAACACGATGTGAATATTATCTGTGGCTAGATAGTAAATATAATGTGAGACG +TTGTGACGTTTTAGTTCAGAATAAAACAATTCACAGTCTAAATCTTTTCGCACTTGATCGAATATTTCTTTAAAAATGGC 
+AACCTGAGCCATTGGTAAAACCTTCCATGTGATACGAGGGCGCGTAGTTTGCATTATCGTTTTTATCGTTTCAATCTGGT +CTGACCTCCTTGTGTTTTGTTGATGATTTATGTCAAATATTAGGAATGTTTTCACTTAATAGTATTGGTTGCGTAACAAA +GTGCGGTCCTGCTGGCATTCTGGAGGGAAATACAACCGACAGATGTATGTAAGGCCAACGTGCTCAAATCTTCATACAGA +AAGATTTGAAGTAATATTTTAACCGCTAGATGAAGAGCAAGCGCATGGAGCGACAAAATGAATAAAGAACAATCTGCTGA +TGATCCCTCCGTGGATCTGATTCGTGTAAAAAATATGCTTAATAGCACCATTTCTATGAGTTACCCTGATGTTGTAATTG +CATGTATAGAACATAAGGTGTCTCTGGAAGCATTCAGAGCAATTGAGGCAGCGTTGGTGAAGCACGATAATAATATGAAG +GATTATTCCCTGGTGGTTGACTGATCACCATAACTGCTAATCATTCAAACTATTTAGTCTGTGACAGAGCCAACACGCAG +TCTGTCACTGTCAGGAAAGTGGTAAAACTGCAACTCAATTACTGCAATGCCCTCGTAATTAAGTGAATTTACAATATCGT +CCTGTTCGGAGGGAAGAACGCGGGATGTTCATTCTTCATCACTTTTAATTGATGTATATGCTCTCTTTTCTGACGTTAGT +CTCCGACGGCAGGCTTCAATGACCCAGGCTGAGAAATTCCCGGACCCTTTTTGCTCAAGAGCGATGTTAATTTGTTCAAT +CATTTGGTTAGGAAAGCGGATGTTGCGGGTTGTTGTTCTGCGGGTTCTGTTCTTCGTTGACATGAGGTTGCCCCGTATTC +AGTGTCGCTGATTTGTATTGTCTGAAGTTGTTTTTACGTTAAGTTGATGCAGATCAATTAATACGATACCTGCGTCATAA +TTGATTATTTGACGTGGTTTGATGGCCTCCACGCACGTTGTGATATGTAGATGATAATCATTATCACTTTACGGGTCCTT +TCCGGTGATCCGACAGGTTACG diff --git a/tests/data/lambdaNEB.fa.fai b/tests/data/lambdaNEB.fa.fai new file mode 100644 index 0000000..064af36 --- /dev/null +++ b/tests/data/lambdaNEB.fa.fai @@ -0,0 +1 @@ +lambda_NEB3011 48502 16 80 81 diff --git a/tests/data/merge.fofn b/tests/data/merge.fofn new file mode 100644 index 0000000..8a79dff --- /dev/null +++ b/tests/data/merge.fofn @@ -0,0 +1,2 @@ +aligned.bam +aligned2.bam diff --git a/tests/data/phi29.bam b/tests/data/phi29.bam new file mode 100644 index 0000000..46176b6 Binary files /dev/null and b/tests/data/phi29.bam differ diff --git a/tests/data/phi29.bam.pbi b/tests/data/phi29.bam.pbi new file mode 100644 index 0000000..5282b94 Binary files /dev/null and b/tests/data/phi29.bam.pbi differ diff --git a/tests/data/polymerase/consolidate.subread.dataset.xml b/tests/data/polymerase/consolidate.subread.dataset.xml new file mode 100644 index 0000000..ca85a7a --- /dev/null +++ 
b/tests/data/polymerase/consolidate.subread.dataset.xml @@ -0,0 +1,38 @@ + + + + + + + + + + + + + + + + + diff --git a/tests/data/polymerase/filtered_resources.subread.dataset.xml b/tests/data/polymerase/filtered_resources.subread.dataset.xml new file mode 100644 index 0000000..e414e00 --- /dev/null +++ b/tests/data/polymerase/filtered_resources.subread.dataset.xml @@ -0,0 +1,67 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/tests/data/polymerase/internal.hqregions.bam b/tests/data/polymerase/internal.hqregions.bam new file mode 100644 index 0000000..8e31e6b Binary files /dev/null and b/tests/data/polymerase/internal.hqregions.bam differ diff --git a/tests/data/polymerase/internal.hqregions.bam.pbi b/tests/data/polymerase/internal.hqregions.bam.pbi new file mode 100644 index 0000000..b79e661 Binary files /dev/null and b/tests/data/polymerase/internal.hqregions.bam.pbi differ diff --git a/tests/data/polymerase/internal.lqregions.bam b/tests/data/polymerase/internal.lqregions.bam new file mode 100644 index 0000000..96878a3 Binary files /dev/null and b/tests/data/polymerase/internal.lqregions.bam differ diff --git a/tests/data/polymerase/internal.lqregions.bam.pbi b/tests/data/polymerase/internal.lqregions.bam.pbi new file mode 100644 index 0000000..a4b7237 Binary files /dev/null and b/tests/data/polymerase/internal.lqregions.bam.pbi differ diff --git a/tests/data/polymerase/internal.polymerase.bam b/tests/data/polymerase/internal.polymerase.bam new file mode 100644 index 0000000..8f293c1 Binary files /dev/null and b/tests/data/polymerase/internal.polymerase.bam differ diff --git a/tests/data/polymerase/internal.polymerase.bam.pbi b/tests/data/polymerase/internal.polymerase.bam.pbi new file mode 100644 index 0000000..c423905 Binary files /dev/null and b/tests/data/polymerase/internal.polymerase.bam.pbi differ diff --git a/tests/data/polymerase/internal.scraps.bam b/tests/data/polymerase/internal.scraps.bam new 
file mode 100644 index 0000000..47c1689 Binary files /dev/null and b/tests/data/polymerase/internal.scraps.bam differ diff --git a/tests/data/polymerase/internal.scraps.bam.pbi b/tests/data/polymerase/internal.scraps.bam.pbi new file mode 100644 index 0000000..9db21f2 Binary files /dev/null and b/tests/data/polymerase/internal.scraps.bam.pbi differ diff --git a/tests/data/polymerase/internal.subreads.bam b/tests/data/polymerase/internal.subreads.bam new file mode 100644 index 0000000..00ad171 Binary files /dev/null and b/tests/data/polymerase/internal.subreads.bam differ diff --git a/tests/data/polymerase/internal.subreads.bam.pbi b/tests/data/polymerase/internal.subreads.bam.pbi new file mode 100644 index 0000000..b0d7e28 Binary files /dev/null and b/tests/data/polymerase/internal.subreads.bam.pbi differ diff --git a/tests/data/polymerase/multiple_resources.subread.dataset.xml b/tests/data/polymerase/multiple_resources.subread.dataset.xml new file mode 100644 index 0000000..109535d --- /dev/null +++ b/tests/data/polymerase/multiple_resources.subread.dataset.xml @@ -0,0 +1,46 @@ + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/tests/data/polymerase/production.polymerase.bam b/tests/data/polymerase/production.polymerase.bam new file mode 100644 index 0000000..4c84b23 Binary files /dev/null and b/tests/data/polymerase/production.polymerase.bam differ diff --git a/tests/data/polymerase/production.scraps.bam b/tests/data/polymerase/production.scraps.bam new file mode 100644 index 0000000..a32bdfb Binary files /dev/null and b/tests/data/polymerase/production.scraps.bam differ diff --git a/tests/data/polymerase/production.scraps.bam.pbi b/tests/data/polymerase/production.scraps.bam.pbi new file mode 100644 index 0000000..5ef119d Binary files /dev/null and b/tests/data/polymerase/production.scraps.bam.pbi differ diff --git a/tests/data/polymerase/production.subreads.bam b/tests/data/polymerase/production.subreads.bam new file mode 100644 index 
0000000..452aad5 Binary files /dev/null and b/tests/data/polymerase/production.subreads.bam differ diff --git a/tests/data/polymerase/production.subreads.bam.pbi b/tests/data/polymerase/production.subreads.bam.pbi new file mode 100644 index 0000000..7ff2fcc Binary files /dev/null and b/tests/data/polymerase/production.subreads.bam.pbi differ diff --git a/tests/data/polymerase/production_hq.hqregion.bam b/tests/data/polymerase/production_hq.hqregion.bam new file mode 100644 index 0000000..66d436b Binary files /dev/null and b/tests/data/polymerase/production_hq.hqregion.bam differ diff --git a/tests/data/polymerase/production_hq.hqregion.bam.pbi b/tests/data/polymerase/production_hq.hqregion.bam.pbi new file mode 100644 index 0000000..ec8f166 Binary files /dev/null and b/tests/data/polymerase/production_hq.hqregion.bam.pbi differ diff --git a/tests/data/polymerase/production_hq.scraps.bam b/tests/data/polymerase/production_hq.scraps.bam new file mode 100644 index 0000000..716e098 Binary files /dev/null and b/tests/data/polymerase/production_hq.scraps.bam differ diff --git a/tests/data/polymerase/production_hq.scraps.bam.pbi b/tests/data/polymerase/production_hq.scraps.bam.pbi new file mode 100644 index 0000000..1017562 Binary files /dev/null and b/tests/data/polymerase/production_hq.scraps.bam.pbi differ diff --git a/tests/data/polymerase/qnameFiltered.subreads.dataset.xml b/tests/data/polymerase/qnameFiltered.subreads.dataset.xml new file mode 100644 index 0000000..c200ded --- /dev/null +++ b/tests/data/polymerase/qnameFiltered.subreads.dataset.xml @@ -0,0 +1,37 @@ + + + + + + + + + + + + + + + + diff --git a/tests/data/polymerase/qname_whitelist.txt b/tests/data/polymerase/qname_whitelist.txt new file mode 100644 index 0000000..0004061 --- /dev/null +++ b/tests/data/polymerase/qname_whitelist.txt @@ -0,0 +1,3 @@ +ArminsFakeMovie/0/3116_3628 +ArminsFakeMovie/0/3722_4267 +ArminsFakeMovie/0/6812_7034 diff --git a/tests/data/polymerase/scrapless.scraps.bam 
b/tests/data/polymerase/scrapless.scraps.bam new file mode 100644 index 0000000..7b989c4 Binary files /dev/null and b/tests/data/polymerase/scrapless.scraps.bam differ diff --git a/tests/data/polymerase/scrapless.scraps.bam.pbi b/tests/data/polymerase/scrapless.scraps.bam.pbi new file mode 100644 index 0000000..140af8a Binary files /dev/null and b/tests/data/polymerase/scrapless.scraps.bam.pbi differ diff --git a/tests/data/polymerase/scrapless.subreads.bam b/tests/data/polymerase/scrapless.subreads.bam new file mode 100644 index 0000000..739b3b4 Binary files /dev/null and b/tests/data/polymerase/scrapless.subreads.bam differ diff --git a/tests/data/polymerase/scrapless.subreads.bam.pbi b/tests/data/polymerase/scrapless.subreads.bam.pbi new file mode 100644 index 0000000..a20a00f Binary files /dev/null and b/tests/data/polymerase/scrapless.subreads.bam.pbi differ diff --git a/tests/data/relative/a/test.bam b/tests/data/relative/a/test.bam new file mode 100644 index 0000000..bd06b8a Binary files /dev/null and b/tests/data/relative/a/test.bam differ diff --git a/tests/data/relative/b/test1.bam b/tests/data/relative/b/test1.bam new file mode 100644 index 0000000..bd06b8a Binary files /dev/null and b/tests/data/relative/b/test1.bam differ diff --git a/tests/data/relative/b/test2.bam b/tests/data/relative/b/test2.bam new file mode 100644 index 0000000..bd06b8a Binary files /dev/null and b/tests/data/relative/b/test2.bam differ diff --git a/tests/data/relative/relative.fofn b/tests/data/relative/relative.fofn new file mode 100644 index 0000000..755c589 --- /dev/null +++ b/tests/data/relative/relative.fofn @@ -0,0 +1,3 @@ +a/test.bam +b/test1.bam +b/test2.bam diff --git a/tests/data/relative/relative.xml b/tests/data/relative/relative.xml new file mode 100644 index 0000000..0e78fe4 --- /dev/null +++ b/tests/data/relative/relative.xml @@ -0,0 +1,8 @@ + + + + + + + + diff --git a/tests/data/relative/relative2.fofn b/tests/data/relative/relative2.fofn new file mode 100644 
index 0000000..f1969ac --- /dev/null +++ b/tests/data/relative/relative2.fofn @@ -0,0 +1,4 @@ +a/test.bam +b/test1.bam +b/test2.bam +relative.xml diff --git a/tests/data/segfault.bam b/tests/data/segfault.bam new file mode 100644 index 0000000..755c7eb Binary files /dev/null and b/tests/data/segfault.bam differ diff --git a/tests/data/test_group_query/group.fofn.in b/tests/data/test_group_query/group.fofn.in new file mode 100644 index 0000000..4af9e82 --- /dev/null +++ b/tests/data/test_group_query/group.fofn.in @@ -0,0 +1,3 @@ +@PacBioBAM_TestsDir@/data/test_group_query/test1.bam +@PacBioBAM_TestsDir@/data/test_group_query/test2.bam +@PacBioBAM_TestsDir@/data/test_group_query/test3.bam diff --git a/tests/data/test_group_query/test1.bam b/tests/data/test_group_query/test1.bam new file mode 100644 index 0000000..5673abc Binary files /dev/null and b/tests/data/test_group_query/test1.bam differ diff --git a/tests/data/test_group_query/test2.bam b/tests/data/test_group_query/test2.bam new file mode 100644 index 0000000..565b224 Binary files /dev/null and b/tests/data/test_group_query/test2.bam differ diff --git a/tests/data/test_group_query/test2.bam.pbi b/tests/data/test_group_query/test2.bam.pbi new file mode 100644 index 0000000..384ad28 Binary files /dev/null and b/tests/data/test_group_query/test2.bam.pbi differ diff --git a/tests/data/test_group_query/test3.bam b/tests/data/test_group_query/test3.bam new file mode 100644 index 0000000..3b1e21b Binary files /dev/null and b/tests/data/test_group_query/test3.bam differ diff --git a/tests/data/unmap1.bam b/tests/data/unmap1.bam new file mode 100644 index 0000000..3fe2af5 Binary files /dev/null and b/tests/data/unmap1.bam differ diff --git a/tests/data/unmap1.bam.bai b/tests/data/unmap1.bam.bai new file mode 100644 index 0000000..dd19971 Binary files /dev/null and b/tests/data/unmap1.bam.bai differ diff --git a/tests/data/unmap2.bam b/tests/data/unmap2.bam new file mode 100644 index 0000000..8feed79 Binary files 
/dev/null and b/tests/data/unmap2.bam differ diff --git a/tests/data/unmap2.bam.bai b/tests/data/unmap2.bam.bai new file mode 100644 index 0000000..f495714 Binary files /dev/null and b/tests/data/unmap2.bam.bai differ diff --git a/tests/files.cmake b/tests/files.cmake new file mode 100644 index 0000000..61370ac --- /dev/null +++ b/tests/files.cmake @@ -0,0 +1,56 @@ +# test case headers +set( PacBioBAMTest_H + +) + +# test case sources +set( PacBioBAMTest_CPP + + ${PacBioBAM_TestsDir}/src/test_Accuracy.cpp + ${PacBioBAM_TestsDir}/src/test_AlignmentPrinter.cpp + ${PacBioBAM_TestsDir}/src/test_BamFile.cpp + ${PacBioBAM_TestsDir}/src/test_BamHeader.cpp + ${PacBioBAM_TestsDir}/src/test_BamRecord.cpp + ${PacBioBAM_TestsDir}/src/test_BamRecordBuilder.cpp + ${PacBioBAM_TestsDir}/src/test_BamRecordClipping.cpp + ${PacBioBAM_TestsDir}/src/test_BamRecordImplCore.cpp + ${PacBioBAM_TestsDir}/src/test_BamRecordImplTags.cpp + ${PacBioBAM_TestsDir}/src/test_BamRecordImplVariableData.cpp + ${PacBioBAM_TestsDir}/src/test_BamRecordMapping.cpp + ${PacBioBAM_TestsDir}/src/test_BamWriter.cpp + ${PacBioBAM_TestsDir}/src/test_BarcodeQuery.cpp + ${PacBioBAM_TestsDir}/src/test_Cigar.cpp + ${PacBioBAM_TestsDir}/src/test_Compare.cpp + ${PacBioBAM_TestsDir}/src/test_DataSetCore.cpp + ${PacBioBAM_TestsDir}/src/test_DataSetIO.cpp + ${PacBioBAM_TestsDir}/src/test_DataSetQuery.cpp + ${PacBioBAM_TestsDir}/src/test_DataSetXsd.cpp + ${PacBioBAM_TestsDir}/src/test_EndToEnd.cpp + ${PacBioBAM_TestsDir}/src/test_EntireFileQuery.cpp + ${PacBioBAM_TestsDir}/src/test_Fasta.cpp + ${PacBioBAM_TestsDir}/src/test_FileUtils.cpp + ${PacBioBAM_TestsDir}/src/test_Frames.cpp + ${PacBioBAM_TestsDir}/src/test_GenomicIntervalQuery.cpp + ${PacBioBAM_TestsDir}/src/test_IndexedFastaReader.cpp + ${PacBioBAM_TestsDir}/src/test_Intervals.cpp + ${PacBioBAM_TestsDir}/src/test_PacBioIndex.cpp + ${PacBioBAM_TestsDir}/src/test_PbiFilter.cpp + ${PacBioBAM_TestsDir}/src/test_PbiFilterQuery.cpp + 
${PacBioBAM_TestsDir}/src/test_QNameQuery.cpp + ${PacBioBAM_TestsDir}/src/test_QualityValues.cpp + ${PacBioBAM_TestsDir}/src/test_Pulse2BaseCache.cpp + ${PacBioBAM_TestsDir}/src/test_ReadAccuracyQuery.cpp + ${PacBioBAM_TestsDir}/src/test_ReadGroupInfo.cpp + ${PacBioBAM_TestsDir}/src/test_SamWriter.cpp + ${PacBioBAM_TestsDir}/src/test_SequenceUtils.cpp + ${PacBioBAM_TestsDir}/src/test_StringUtils.cpp + ${PacBioBAM_TestsDir}/src/test_SubreadLengthQuery.cpp + ${PacBioBAM_TestsDir}/src/test_Tags.cpp + ${PacBioBAM_TestsDir}/src/test_TimeUtils.cpp + # ${PacBioBAM_TestsDir}/src/test_UnmappedReadsQuery.cpp + ${PacBioBAM_TestsDir}/src/test_Validator.cpp + ${PacBioBAM_TestsDir}/src/test_Version.cpp + ${PacBioBAM_TestsDir}/src/test_WhitelistedZmwReadStitcher.cpp + ${PacBioBAM_TestsDir}/src/test_ZmwReadStitcher.cpp + ${PacBioBAM_TestsDir}/src/test_ZmwQuery.cpp +) diff --git a/tests/scripts/cram.py b/tests/scripts/cram.py new file mode 100755 index 0000000..20c4681 --- /dev/null +++ b/tests/scripts/cram.py @@ -0,0 +1,516 @@ +#!/usr/bin/env python +"""Functional testing framework for command line applications""" + +import difflib +import itertools +import optparse +import os +import re +import signal +import subprocess +import sys +import shutil +import time +import tempfile + +try: + import configparser +except ImportError: + import ConfigParser as configparser + +__all__ = ['main', 'test'] + +def findtests(paths): + """Yield tests in paths in sorted order""" + for p in paths: + if os.path.isdir(p): + for root, dirs, files in os.walk(p): + if os.path.basename(root).startswith('.'): + continue + for f in sorted(files): + if not f.startswith('.') and f.endswith('.t'): + yield os.path.normpath(os.path.join(root, f)) + else: + yield os.path.normpath(p) + +def regex(pattern, s): + """Match a regular expression or return False if invalid. 
+ + >>> [bool(regex(r, 'foobar')) for r in ('foo.*', '***')] + [True, False] + """ + try: + return re.match(pattern + r'\Z', s) + except re.error: + return False + +def glob(el, l): + r"""Match a glob-like pattern. + + The only supported special characters are * and ?. Escaping is + supported. + + >>> bool(glob(r'\* \\ \? fo?b*', '* \\ ? foobar')) + True + """ + i, n = 0, len(el) + res = '' + while i < n: + c = el[i] + i += 1 + if c == '\\' and el[i] in '*?\\': + res += el[i - 1:i + 1] + i += 1 + elif c == '*': + res += '.*' + elif c == '?': + res += '.' + else: + res += re.escape(c) + return regex(res, l) + +annotations = {'glob': glob, 're': regex} + +def match(el, l): + """Match patterns based on annotations""" + for k in annotations: + ann = ' (%s)\n' % k + if el.endswith(ann) and annotations[k](el[:-len(ann)], l[:-1]): + return True + return False + +class SequenceMatcher(difflib.SequenceMatcher, object): + """Like difflib.SequenceMatcher, but matches globs and regexes""" + + def find_longest_match(self, alo, ahi, blo, bhi): + """Find longest matching block in a[alo:ahi] and b[blo:bhi]""" + # SequenceMatcher uses find_longest_match() to slowly whittle down + # the differences between a and b until it has each matching block. + # Because of this, we can end up doing the same matches many times. + matches = [] + for n, (el, line) in enumerate(zip(self.a[alo:ahi], self.b[blo:bhi])): + if el != line and match(el, line): + # This fools the superclass's method into thinking that the + # regex/glob in a is identical to b by replacing a's line (the + # expected output) with b's line (the actual output). + self.a[alo + n] = line + matches.append((n, el)) + ret = super(SequenceMatcher, self).find_longest_match(alo, ahi, + blo, bhi) + # Restore the lines replaced above. Otherwise, the diff output + # would seem to imply that the tests never had any regexes/globs. 
+ for n, el in matches: + self.a[alo + n] = el + return ret + +def unified_diff(a, b, fromfile='', tofile='', fromfiledate='', + tofiledate='', n=3, lineterm='\n', matcher=SequenceMatcher): + """Compare two sequences of lines; generate the delta as a unified diff. + + This is like difflib.unified_diff(), but allows custom matchers. + """ + started = False + for group in matcher(None, a, b).get_grouped_opcodes(n): + if not started: + fromdate = fromfiledate and '\t%s' % fromfiledate or '' + todate = fromfiledate and '\t%s' % tofiledate or '' + yield '--- %s%s%s' % (fromfile, fromdate, lineterm) + yield '+++ %s%s%s' % (tofile, todate, lineterm) + started = True + i1, i2, j1, j2 = group[0][1], group[-1][2], group[0][3], group[-1][4] + yield "@@ -%d,%d +%d,%d @@%s" % (i1 + 1, i2 - i1, j1 + 1, j2 - j1, + lineterm) + for tag, i1, i2, j1, j2 in group: + if tag == 'equal': + for line in a[i1:i2]: + yield ' ' + line + continue + if tag == 'replace' or tag == 'delete': + for line in a[i1:i2]: + yield '-' + line + if tag == 'replace' or tag == 'insert': + for line in b[j1:j2]: + yield '+' + line + +needescape = re.compile(r'[\x00-\x09\x0b-\x1f\x7f-\xff]').search +escapesub = re.compile(r'[\x00-\x09\x0b-\x1f\\\x7f-\xff]').sub +escapemap = dict((chr(i), r'\x%02x' % i) for i in range(256)) +escapemap.update({'\\': '\\\\', '\r': r'\r', '\t': r'\t'}) + +def escape(s): + """Like the string-escape codec, but doesn't escape quotes""" + return escapesub(lambda m: escapemap[m.group(0)], s[:-1]) + ' (esc)\n' + +def makeresetsigpipe(): + """Make a function to reset SIGPIPE to SIG_DFL (for use in subprocesses). + + Doing subprocess.Popen(..., preexec_fn=makeresetsigpipe()) will prevent + Python's SIGPIPE handler (SIG_IGN) from being inherited by the + child process. 
+ """ + if sys.platform == 'win32' or getattr(signal, 'SIGPIPE', None) is None: + return None + return lambda: signal.signal(signal.SIGPIPE, signal.SIG_DFL) + +def test(path, shell, indent=2): + """Run test at path and return input, output, and diff. + + This returns a 3-tuple containing the following: + + (list of lines in test, same list with actual output, diff) + + diff is a generator that yields the diff between the two lists. + + If a test exits with return code 80, the actual output is set to + None and diff is set to []. + """ + indent = ' ' * indent + cmdline = '%s$ ' % indent + conline = '%s> ' % indent + + f = open(path) + abspath = os.path.abspath(path) + env = os.environ.copy() + env['TESTDIR'] = os.path.dirname(abspath) + env['TESTFILE'] = os.path.basename(abspath) + p = subprocess.Popen([shell, '-'], bufsize=-1, stdin=subprocess.PIPE, + stdout=subprocess.PIPE, stderr=subprocess.STDOUT, + universal_newlines=True, env=env, + preexec_fn=makeresetsigpipe(), + close_fds=os.name == 'posix') + salt = 'CRAM%s' % time.time() + + after = {} + refout, postout = [], [] + i = pos = prepos = -1 + stdin = [] + for i, line in enumerate(f): + refout.append(line) + if line.startswith(cmdline): + after.setdefault(pos, []).append(line) + prepos = pos + pos = i + stdin.append('echo "\n%s %s $?"\n' % (salt, i)) + stdin.append(line[len(cmdline):]) + elif line.startswith(conline): + after.setdefault(prepos, []).append(line) + stdin.append(line[len(conline):]) + elif not line.startswith(indent): + after.setdefault(pos, []).append(line) + stdin.append('echo "\n%s %s $?"\n' % (salt, i + 1)) + + output = p.communicate(input=''.join(stdin))[0] + if p.returncode == 80: + return (refout, None, []) + + # Add a trailing newline to the input script if it's missing. + if refout and not refout[-1].endswith('\n'): + refout[-1] += '\n' + + # We use str.split instead of splitlines to get consistent + # behavior between Python 2 and 3. 
In 3, we use unicode strings, + # which has more line breaks than \n and \r. + pos = -1 + ret = 0 + for i, line in enumerate(output[:-1].split('\n')): + line += '\n' + if line.startswith(salt): + presalt = postout.pop() + if presalt != '%s\n' % indent: + postout.append(presalt[:-1] + ' (no-eol)\n') + ret = int(line.split()[2]) + if ret != 0: + postout.append('%s[%s]\n' % (indent, ret)) + postout += after.pop(pos, []) + pos = int(line.split()[1]) + else: + if needescape(line): + line = escape(line) + postout.append(indent + line) + postout += after.pop(pos, []) + + diffpath = os.path.basename(abspath) + diff = unified_diff(refout, postout, diffpath, diffpath + '.err') + for firstline in diff: + return refout, postout, itertools.chain([firstline], diff) + return refout, postout, [] + +def prompt(question, answers, auto=None): + """Write a prompt to stdout and ask for answer in stdin. + + answers should be a string, with each character a single + answer. An uppercase letter is considered the default answer. + + If an invalid answer is given, this asks again until it gets a + valid one. + + If auto is set, the question is answered automatically with the + specified value. + """ + default = [c for c in answers if c.isupper()] + while True: + sys.stdout.write('%s [%s] ' % (question, answers)) + sys.stdout.flush() + if auto is not None: + sys.stdout.write(auto + '\n') + sys.stdout.flush() + return auto + + answer = sys.stdin.readline().strip().lower() + if not answer and default: + return default[0] + elif answer and answer in answers.lower(): + return answer + +def log(msg=None, verbosemsg=None, verbose=False): + """Write msg to standard out and flush. + + If verbose is True, write verbosemsg instead. 
+ """ + if verbose: + msg = verbosemsg + if msg: + sys.stdout.write(msg) + sys.stdout.flush() + +def patch(cmd, diff, path): + """Run echo [lines from diff] | cmd -p0""" + p = subprocess.Popen([cmd, '-p0'], bufsize=-1, stdin=subprocess.PIPE, + universal_newlines=True, + preexec_fn=makeresetsigpipe(), + cwd=path, + close_fds=os.name == 'posix') + p.communicate(''.join(diff)) + return p.returncode == 0 + +def run(paths, tmpdir, shell, quiet=False, verbose=False, patchcmd=None, + answer=None, indent=2): + """Run tests in paths in tmpdir. + + If quiet is True, diffs aren't printed. If verbose is True, + filenames and status information are printed. + + If patchcmd is set, a prompt is written to stdout asking if + changed output should be merged back into the original test. The + answer is read from stdin. If 'y', the test is patched using patch + based on the changed output. + """ + cwd = os.getcwd() + seen = set() + basenames = set() + skipped = failed = 0 + for i, path in enumerate(findtests(paths)): + abspath = os.path.abspath(path) + if abspath in seen: + continue + seen.add(abspath) + + log(None, '%s: ' % path, verbose) + if not os.stat(abspath).st_size: + skipped += 1 + log('s', 'empty\n', verbose) + else: + basename = os.path.basename(path) + if basename in basenames: + basename = '%s-%s' % (basename, i) + else: + basenames.add(basename) + testdir = os.path.join(tmpdir, basename) + os.mkdir(testdir) + try: + os.chdir(testdir) + refout, postout, diff = test(abspath, shell, indent) + finally: + os.chdir(cwd) + + errpath = abspath + '.err' + if postout is None: + skipped += 1 + log('s', 'skipped\n', verbose) + elif not diff: + log('.', 'passed\n', verbose) + if os.path.exists(errpath): + os.remove(errpath) + else: + failed += 1 + log('!', 'failed\n', verbose) + if not quiet: + log('\n', None, verbose) + errfile = open(errpath, 'w') + try: + for line in postout: + errfile.write(line) + finally: + errfile.close() + if not quiet: + if patchcmd: + diff = list(diff) + 
for line in diff: + log(line) + if (patchcmd and + prompt('Accept this change?', 'yN', answer) == 'y'): + if patch(patchcmd, diff, os.path.dirname(abspath)): + log(None, '%s: merged output\n' % path, verbose) + os.remove(errpath) + else: + log('%s: merge failed\n' % path) + log('\n', None, verbose) + log('# Ran %s tests, %s skipped, %s failed.\n' + % (len(seen), skipped, failed)) + return bool(failed) + +def which(cmd): + """Return the patch to cmd or None if not found""" + for p in os.environ['PATH'].split(os.pathsep): + path = os.path.join(p, cmd) + if os.path.isfile(path) and os.access(path, os.X_OK): + return os.path.abspath(path) + return None + +def expandpath(path): + """Expands ~ and environment variables in path""" + return os.path.expanduser(os.path.expandvars(path)) + +class OptionParser(optparse.OptionParser): + """Like optparse.OptionParser, but supports setting values through + CRAM= and .cramrc.""" + + def __init__(self, *args, **kwargs): + self._config_opts = {} + optparse.OptionParser.__init__(self, *args, **kwargs) + + def add_option(self, *args, **kwargs): + option = optparse.OptionParser.add_option(self, *args, **kwargs) + if option.dest and option.dest != 'version': + key = option.dest.replace('_', '-') + self._config_opts[key] = option.action == 'store_true' + return option + + def parse_args(self, args=None, values=None): + config = configparser.RawConfigParser() + config.read(expandpath(os.environ.get('CRAMRC', '.cramrc'))) + defaults = {} + for key, isbool in self._config_opts.items(): + try: + if isbool: + try: + value = config.getboolean('cram', key) + except ValueError: + value = config.get('cram', key) + self.error('--%s: invalid boolean value: %r' + % (key, value)) + else: + value = config.get('cram', key) + except (configparser.NoSectionError, configparser.NoOptionError): + pass + else: + defaults[key] = value + self.set_defaults(**defaults) + + eargs = os.environ.get('CRAM', '').strip() + if eargs: + import shlex + args = args or [] 
+ args += shlex.split(eargs) + + try: + return optparse.OptionParser.parse_args(self, args, values) + except optparse.OptionValueError: + self.error(str(sys.exc_info()[1])) + +def main(args): + """Main entry point. + + args should not contain the script name. + """ + p = OptionParser(usage='cram [OPTIONS] TESTS...', prog='cram') + p.add_option('-V', '--version', action='store_true', + help='show version information and exit') + p.add_option('-q', '--quiet', action='store_true', + help="don't print diffs") + p.add_option('-v', '--verbose', action='store_true', + help='show filenames and test status') + p.add_option('-i', '--interactive', action='store_true', + help='interactively merge changed test output') + p.add_option('-y', '--yes', action='store_true', + help='answer yes to all questions') + p.add_option('-n', '--no', action='store_true', + help='answer no to all questions') + p.add_option('-E', '--preserve-env', action='store_true', + help="don't reset common environment variables") + p.add_option('--keep-tmpdir', action='store_true', + help='keep temporary directories') + p.add_option('--shell', action='store', default='/bin/sh', metavar='PATH', + help='shell to use for running tests') + p.add_option('--indent', action='store', default=2, metavar='NUM', + type='int', help='number of spaces to use for indentation') + opts, paths = p.parse_args(args) + + if opts.version: + sys.stdout.write("""Cram CLI testing framework (version 0.6) + +Copyright (C) 2010-2011 Brodie Rao and others +This is free software; see the source for copying conditions. There is NO +warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+""") + return + + conflicts = [('-y', opts.yes, '-n', opts.no), + ('-q', opts.quiet, '-i', opts.interactive)] + for s1, o1, s2, o2 in conflicts: + if o1 and o2: + sys.stderr.write('options %s and %s are mutually exclusive\n' + % (s1, s2)) + return 2 + + patchcmd = None + if opts.interactive: + patchcmd = which('patch') + if not patchcmd: + sys.stderr.write('patch(1) required for -i\n') + return 2 + + if not paths: + sys.stdout.write(p.get_usage()) + return 2 + + badpaths = [path for path in paths if not os.path.exists(path)] + if badpaths: + sys.stderr.write('no such file: %s\n' % badpaths[0]) + return 2 + + tmpdir = os.environ['CRAMTMP'] = tempfile.mkdtemp('', 'cramtests-') + proctmp = os.path.join(tmpdir, 'tmp') + os.mkdir(proctmp) + for s in ('TMPDIR', 'TEMP', 'TMP'): + os.environ[s] = proctmp + + if not opts.preserve_env: + for s in ('LANG', 'LC_ALL', 'LANGUAGE'): + os.environ[s] = 'C' + os.environ['TZ'] = 'GMT' + os.environ['CDPATH'] = '' + os.environ['COLUMNS'] = '80' + os.environ['GREP_OPTIONS'] = '' + + if opts.yes: + answer = 'y' + elif opts.no: + answer = 'n' + else: + answer = None + + try: + return run(paths, tmpdir, opts.shell, opts.quiet, opts.verbose, + patchcmd, answer, opts.indent) + finally: + if opts.keep_tmpdir: + log('# Kept temporary directory: %s\n' % tmpdir) + else: + shutil.rmtree(tmpdir) + +if __name__ == '__main__': + try: + sys.exit(main(sys.argv[1:])) + except KeyboardInterrupt: + pass diff --git a/tests/scripts/generate_data.py b/tests/scripts/generate_data.py new file mode 100755 index 0000000..ac28dbb --- /dev/null +++ b/tests/scripts/generate_data.py @@ -0,0 +1,164 @@ +#!/usr/bin/python + +from __future__ import print_function + +import os, shutil, sys +import StringIO + +fastaSeq_1 = """TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC +AACGCAGCTCCGCCCTCGCGGTGCTCTCCGGGTCTGTGCTGAGGAGAACGCAACTCCGCCGGCGCAGGCG""" + +fastaSeq_2 = """TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC 
+AACGCAGCTCCGCCCTCGCGGTGCTCTCCGGGTCTGTGCTGAGGAGAACGCAAC""" + +fastaSeq_3 = """TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC +ACCCTAACCCCAACCCCAACCCCAACCCCAACCCCAACCCCAACCCTAACCCCTAACCCTAACCCT""" + +# file creation decorator +def fileMaker(func): + def inner(*args, **kwargs): + print(" - Creating file: %s..." % args[1], end='') + sys.stdout.flush() + retval = func(*args) + print("done.") + sys.stdout.flush() + return retval + return inner + +# symlink creation decorator +def fileLinker(func): + def inner(*args, **kwargs): + print(" - Creating symlink: %s..." % args[1], end='') + sys.stdout.flush() + retval = func(*args) + print("done.") + sys.stdout.flush() + return retval + return inner + +# return a copy of original, minus any lines that contain an entry in blacklist +def trimXmlElements(original, blacklist): + out = StringIO.StringIO() + for line in original.splitlines(): + if all(x not in line for x in blacklist): + out.write(line + '\n') + result = out.getvalue() + out.close() + return result + +class TestDataGenerator: + + def __init__(self, source, dest): + + # source/destination directories + self.testDataDir = source + self.generatedDataDir = dest + + # generated output files/symlinks & 'maker' functions + self.outputFiles = { + 'truncated.bam' : self.makeTruncatedBam, + 'chunking_emptyfilters.subreadset.xml' : self.makeChunkingXml, + 'chunking_missingfilters.subreadset.xml' : self.makeChunkingXml, + 'normal.fa' : self.makeNormalFasta + } + self.outputSymlinks = { + 'aligned.bam' : self.makeAlignedBamCopy, + 'aligned.bam.bai' : self.makeAlignedBamCopy, + 'aligned.bam.pbi' : self.makeAlignedBamCopy, + 'aligned2.bam' : self.makeAlignedBamCopy, + 'aligned2.bam.bai' : self.makeAlignedBamCopy, + 'aligned2.bam.pbi' : self.makeAlignedBamCopy, + 'm150404_101626_42267_c100807920800000001823174110291514_s1_p0.1.subreads.bam' : self.makeChunkingSymlink, + 'm150404_101626_42267_c100807920800000001823174110291514_s1_p0.1.subreads.bam.pbi' : 
self.makeChunkingSymlink, + 'm150404_101626_42267_c100807920800000001823174110291514_s1_p0.2.subreads.bam' : self.makeChunkingSymlink, + 'm150404_101626_42267_c100807920800000001823174110291514_s1_p0.2.subreads.bam.pbi' : self.makeChunkingSymlink, + 'm150404_101626_42267_c100807920800000001823174110291514_s1_p0.3.subreads.bam' : self.makeChunkingSymlink, + 'm150404_101626_42267_c100807920800000001823174110291514_s1_p0.3.subreads.bam.pbi' : self.makeChunkingSymlink, + 'missing_pbi.bam' : self.makeMissingPbiBam, + } + + def editChunkingXml(self, outputFn, removeFiltersNode): + inputXmlFn = os.path.join(self.testDataDir,'chunking','chunking.subreadset.xml') + outputXmlFn = os.path.join(self.generatedDataDir,outputFn) + + blacklist = ['pbds:Filter>', 'pbbase:Properties>', '') + + inputXml = '' + with open(inputXmlFn, 'r') as xml_infile: + inputXml = xml_infile.read() + outputXml = trimXmlElements(inputXml, blacklist) + with open(outputXmlFn, 'w') as xml_outfile: + xml_outfile.write(outputXml) + + @fileLinker + def makeAlignedBamCopy(self, outputFn): + source = os.path.join(self.testDataDir,outputFn) + dest = os.path.join(self.generatedDataDir, outputFn) + os.symlink(source, dest) + + @fileLinker + def makeChunkingSymlink(self, outputFn): + source = os.path.join(self.testDataDir,'chunking', outputFn) + dest = os.path.join(self.generatedDataDir, outputFn) + os.symlink(source, dest) + + @fileLinker + def makeMissingPbiBam(self, outputFn): + source = os.path.join(self.testDataDir, 'phi29.bam') + dest = os.path.join(self.generatedDataDir, outputFn) + os.symlink(source, dest) + + @fileMaker + def makeChunkingXml(self, outputFn): + if outputFn == 'chunking_emptyfilters.subreadset.xml': + removeFiltersNode = False + else: + removeFiltersNode = True + self.editChunkingXml(outputFn, removeFiltersNode) + + @fileMaker + def makeNormalFasta(self, outputFn): + content = ">1\n" + fastaSeq_1 + "\n>2\n" + fastaSeq_2 + "\n>3\n" + fastaSeq_3 + dest = os.path.join(self.generatedDataDir, 
outputFn) + with open(outputFn, 'w') as fasta_out: + fasta_out.write(content) + + @fileMaker + def makeTruncatedBam(self, outputFn): + source = os.path.join(self.testDataDir, 'phi29.bam') + dest = os.path.join(self.generatedDataDir, outputFn) + shutil.copyfile(source, dest) + with open(dest, 'r+b') as in_file: + in_file.truncate(200) + + # main entry point + def generate(self): + + # skip file if it exists + os.chdir(self.generatedDataDir) + filenames = self.outputFiles.keys() + for file in filenames: + if os.path.exists(file) : + del self.outputFiles[file] + + # skip symlink if it exists + symlinks = self.outputSymlinks.keys() + for link in symlinks: + if os.path.lexists(link): + del self.outputSymlinks[link] + + # only print message & run makers, if any files/symlinks to be created + # else silent success + if self.outputFiles or self.outputSymlinks: + print('Generating test data in %s ' % self.generatedDataDir) + for file, func in self.outputFiles.iteritems(): + func(file) + for link, func in self.outputSymlinks.iteritems(): + func(link) + +# script entry point +if __name__ == '__main__': + g = TestDataGenerator(sys.argv[1], sys.argv[2]) + g.generate() diff --git a/tests/src/CSharp/CheckSWIG.cs b/tests/src/CSharp/CheckSWIG.cs new file mode 100644 index 0000000..d748774 --- /dev/null +++ b/tests/src/CSharp/CheckSWIG.cs @@ -0,0 +1,15 @@ + + +using PacBio.BAM; + +public class CheckSWIG +{ + public static void Main() + { + var header = new BamHeader(); + header.ToSam(); + System.Console.WriteLine(""); + System.Console.WriteLine("pbbam SWIG binding to C# worked!"); + System.Console.WriteLine(""); + } +} diff --git a/tests/src/CSharp/TestPbbam.cs.in b/tests/src/CSharp/TestPbbam.cs.in new file mode 100644 index 0000000..2913fc6 --- /dev/null +++ b/tests/src/CSharp/TestPbbam.cs.in @@ -0,0 +1,134 @@ +#pragma warning disable 168, 219 + +using System; +using System.IO; +using System.Linq; +using System.Collections; +using System.Collections.Generic; +using 
System.Reflection; + +using PacBio.BAM; + +namespace TestStuff +{ + // + // This approach is the best we can do for now, without requiring nunit. + // + public class TestPbbam + { + public static readonly string DATA_DIR = Path.GetDirectoryName(Assembly.GetExecutingAssembly().Location) + "/TestData"; + public static readonly string BAM_FILENAME_1 = Path.Combine(DATA_DIR, "test_group_query", "test1.bam"); + public static readonly string BAM_FILENAME_2 = Path.Combine(DATA_DIR, "test_group_query", "test2.bam"); + public static readonly string STITCHING_FN_1 = Path.Combine(DATA_DIR, "polymerase", "production.subreads.bam"); + public static readonly string STITCHING_FN_2 = Path.Combine(DATA_DIR, "polymerase", "production.scraps.bam"); + public static readonly string FASTA_FILENAME = Path.Combine(DATA_DIR, "lambdaNEB.fa"); + + public TestPbbam () + { + } + + public static void TestExceptions() + { + try + { + var badFile = new BamFile("nonexistent.bam"); + } + catch (Exception e) + { + //Console.Write(e.ToString()); + Console.WriteLine("Exceptions - OK!"); + return; + } + throw new Exception("doh!"); + } + + + public static void TestCigar() + { + string s = "3=3I3D"; + var c = new CigarType(s); + string cs = c.ToStdString(); + if (s != cs) + { + throw new Exception("Cigar not working!"); + } + + // This used to crash + var c2 = CigarType.FromStdString("3=3I3D"); + + Console.WriteLine("TestCigar - OK!"); + } + + public static void TestBamFileEnumeration() + { + var bf = new BamFile(BAM_FILENAME_2); + var q = new EntireFileQuery(new DataSet(bf)); + + if (q.Count() != 4) + { + throw new Exception("Enumeration not working!"); + } + + Console.WriteLine("TesBamFileEnumeration - OK!"); + } + + public static void TestIndexedFasta() + { + var f = new IndexedFastaReader(FASTA_FILENAME); + bool check = (f.NumSequences() == 1 && + f.HasSequence("lambda_NEB3011") && + f.Subsequence("lambda_NEB3011:0-10") == "GGGCGGCGAC"); + if (!check) + { + throw new Exception("Indexed FASTA files 
not working"); + } + + var b = new BamRecord(); + var x = b.Impl(); + + Console.WriteLine("TestIndexedFasta - OK!"); + } + + public static void TestZmwQuery() + { + var d = new DataSet(BAM_FILENAME_2); + var q = new ZmwQuery(new IntList {1, 2, 3}, d); + var q2 = new ZmwQuery(new IntList { 14743 }, d); + + if (0 != q.Count() || 4 != q2.Count()) + { + throw new Exception("ZmwQuery not working"); + } + Console.WriteLine("TestZmwQuery - OK!"); + } + + public static void TestStitching() + { + var stitcher = new ZmwReadStitcher(STITCHING_FN_1, STITCHING_FN_2); + if (!stitcher.HasNext()) + { + throw new Exception("Error stitching via ZmwReadStitcher"); + } + var zmwRecord = stitcher.Next(); + + Console.WriteLine("TestStitching - OK!"); + } + + public void RunAllTests() + { + TestExceptions(); + TestCigar(); + TestBamFileEnumeration(); + TestIndexedFasta(); + TestZmwQuery(); + TestStitching(); + } + + public static void Main() + { + var t = new TestPbbam(); + t.RunAllTests(); + } + + } +} diff --git a/tests/src/CSharp/buildAssembly.sh.in b/tests/src/CSharp/buildAssembly.sh.in new file mode 100644 index 0000000..7e667b3 --- /dev/null +++ b/tests/src/CSharp/buildAssembly.sh.in @@ -0,0 +1,95 @@ +#!/bin/bash +set -euo pipefail +set -x + + +# This is a temporary hack to build windows C# bindings, while I work on getting it +# to work nicely through CMake --DHA + +# UNIX setup (before doing this!) 
+# - install swig, cmake, and mono +# +# Windows setup (before doing this!): +# - install msys2 (64-bit) from https://msys2.github.io/ +# - install: (pacman -S [package-name]); in all cases use the mingw-w64 64bit version when +# available: +# + gcc, g++, make +# + zlib-dev +# + swig +# + cmake +# - for cmake, generate MSYS makefiles + +if [ ${WIN32} ]; then + PLATFORM="Windows" +else + PLATFORM="Unix" +fi + +MSBUILD=${CSHARP_PROJECT_BUILDER} +CSC=${CSHARP_COMPILER} + +CSPROJ_ROOT=${PacBioBAM_CSharpLibDir} +CSPROJ=${PacBioBAM_CSharpLibDir}/PacBio.BAM.csproj +ASSEMBLY_ROOT=${PacBioBAM_CSharpLibDir}/bin/Debug + +# get expanded cmake generator expression ( $ or externally defined -DHTSLIB_LIBRARIES="") +# from cmd line +EXPANDED_HTSLIB_LIBRARIES=("$@") + +# +# Make the managed DLL +# +( cd $CSPROJ_ROOT; $MSBUILD $CSPROJ ) || { echo "Failed to build managed DLL" && exit -1; } + +# +# Copy the dependency libs +# +cp ${PacBioBAM_LibDir}/libpbbam${CMAKE_SHARED_LIBRARY_SUFFIX} $ASSEMBLY_ROOT +cp "$EXPANDED_HTSLIB_LIBRARIES" $ASSEMBLY_ROOT # Need "libhts*.dylib" + +if [ "$PLATFORM" == "Windows" ] +then + # stuff we need to bundle on windows + cp /mingw64/bin/zlib1.dll $ASSEMBLY_ROOT + cp /mingw64/bin/libwinpthread-1.dll $ASSEMBLY_ROOT + cp ${PacBioBAM_CSharpLibDir}/libPacBioBam.dll $ASSEMBLY_ROOT/PacBioBam.dll +else + # For UNIX this is .so, even Mac. Not sure why. + cp ${PacBioBAM_CSharpLibDir}/libPacBioBam.so $ASSEMBLY_ROOT +# cp ${HTSLIB_LIBRARIES_VERSIONED_LINK} $ASSEMBLY_ROOT # Need "libhts*.dylib" +fi + +# Bundle test data +mkdir -p $ASSEMBLY_ROOT/TestData +cp -rf ${PacBioBAM_TestsDir}/data/* $ASSEMBLY_ROOT/TestData + +# +# Make the "check" program, which we need to put next to the assembly to +# allow it to be resolved. 
+# + +CHECK_SRC=${CSharpTestRootDir}/CheckSWIG.cs +CHECK_BIN=$ASSEMBLY_ROOT/CheckSWIG.exe + + +if [ "$PLATFORM" == "Windows" ] +then + ( $CSC /lib:$ASSEMBLY_ROOT //r:PacBio.BAM.dll /out:$CHECK_BIN $(cygpath -w $CHECK_SRC) && $CHECK_BIN) +else + ( $CSC /lib:$ASSEMBLY_ROOT /r:PacBio.BAM.dll /out:$CHECK_BIN $CHECK_SRC && cd $ASSEMBLY_ROOT && mono CheckSWIG.exe ) +fi + + +# +# Build the fuller test suite, and bundle +# +TEST_SRC=${CSharpTestRootDir}/TestPbbam.cs +TEST_BIN=$ASSEMBLY_ROOT/TestPbbam.exe + + +if [ "$PLATFORM" == "Windows" ] +then + ( $CSC /lib:$ASSEMBLY_ROOT //r:PacBio.BAM.dll /out:$TEST_BIN $(cygpath -w $TEST_SRC) && $TEST_BIN ) +else + ( $CSC /lib:$ASSEMBLY_ROOT /r:PacBio.BAM.dll /out:$TEST_BIN $TEST_SRC && cd $ASSEMBLY_ROOT && mono TestPbbam.exe ) +fi diff --git a/tests/src/CSharp/check_swig.sh.in b/tests/src/CSharp/check_swig.sh.in new file mode 100755 index 0000000..ba07b75 --- /dev/null +++ b/tests/src/CSharp/check_swig.sh.in @@ -0,0 +1,18 @@ +#!/bin/sh +(cd ${PacBioBAM_CSharpLibDir}; xbuild PacBio.BAM.csproj) + +${CSHARP_COMPILER} -lib:${PacBioBAM_CSharpLibDir}/bin/Debug -r:PacBio.BAM.dll CheckSWIG.cs + +# +# For deployment these all need to be installed somewhere more sensible. +# This is just a hack for testing if the build works. +# +LIBRARY_PATHS=\ +${PacBioBAM_CSharpLibDir}:\ +${PacBioBAM_LibDir}:\ +${Htslib_LibDir} + +DYLD_LIBRARY_PATH=$LIBRARY_PATHS \ +LD_LIBRARY_PATH=$LIBRARY_PATHS \ +MONO_PATH=${PacBioBAM_CSharpLibDir}/bin/Debug \ +mono CheckSWIG.exe diff --git a/tests/src/R/check_swig.R.in b/tests/src/R/check_swig.R.in new file mode 100644 index 0000000..13b07c4 --- /dev/null +++ b/tests/src/R/check_swig.R.in @@ -0,0 +1,58 @@ +# Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +# +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted (subject to the limitations in the +# disclaimer below) provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# +# * Neither the name of Pacific Biosciences nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. 
+# +# Author: Derek Barnett + +# load PacBioBAM lib & wrapper script + +# htslib_libname <- "@Htslib_Libraries@" +# dyn.load(htslib_libname) +# # htslib_libname <- paste(htslib_libpath, "libhts", sep="/") +# # dyn.load(paste(htslib_libname, ".dylib", sep="")) + +pbbam_libpath <- "@PacBioBAM_RLibDir@" +pbbam_libname <- paste(pbbam_libpath, "PacBioBam", sep="/") +dyn.load(paste(pbbam_libname, .Platform$dynlib.ext, sep="")) + +pbbam_wrapper <- paste(pbbam_libpath, "PacBioBam.R", sep="/") +source(pbbam_wrapper) + +cacheMetaData(1) + +h <- BamHeader() + +message = "\nR Wrapper OK.\n" +cat(message) +cat("\n") diff --git a/tests/src/R/test_pbbam.R b/tests/src/R/test_pbbam.R new file mode 100644 index 0000000..55c71a8 --- /dev/null +++ b/tests/src/R/test_pbbam.R @@ -0,0 +1,23 @@ + +# usage: R [args] < test_pbbam.R --args +args <- commandArgs(TRUE) +tests_path <- args[1] +lib_path <- args[2] +test_data_path <- args[3] + +# load PacBioBAM lib & wrapper script +pbbam_libname <- paste(lib_path, "PacBioBam", sep="/") +pbbam_wrapper <- paste(lib_path, "PacBioBam.R", sep="/") +dyn.load(paste(pbbam_libname, .Platform$dynlib.ext, sep="")) +source(pbbam_wrapper) +cacheMetaData(1) + +# init test utils & run test cases +source(paste(tests_path, "utils.R", sep="/")) +run_test_suite(tests_path) + +# print results & exit +results <- test_suite_results() +results$print_summary() +if (results$any_failed()) + quit(status=1) diff --git a/tests/src/R/test_pbbam.sh.in b/tests/src/R/test_pbbam.sh.in new file mode 100644 index 0000000..458b149 --- /dev/null +++ b/tests/src/R/test_pbbam.sh.in @@ -0,0 +1,54 @@ +# Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +# +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted (subject to the limitations in the +# disclaimer below) provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# +# * Neither the name of Pacific Biosciences nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# +# Author: Derek Barnett + +#! /usr/bin/sh + +GENERATED_BAM=@PacBioBAM_TestsDir@/data/generated.bam + +touch $GENERATED_BAM +chmod 644 $GENERATED_BAM + +R --slave --no-save < @RTestRootDir@/test_pbbam.R --args \ + @RTestRootDir@/tests \ + @PacBioBAM_RLibDir@ \ + @PacBioBAM_TestsDir@/data + +STATUS=$? 
+ +rm $GENERATED_BAM + +exit $STATUS \ No newline at end of file diff --git a/tests/src/R/tests/test_Accuracy.R b/tests/src/R/tests/test_Accuracy.R new file mode 100644 index 0000000..e7e98e6 --- /dev/null +++ b/tests/src/R/tests/test_Accuracy.R @@ -0,0 +1,62 @@ +# Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +# +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted (subject to the limitations in the +# disclaimer below) provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# +# * Neither the name of Pacific Biosciences nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# +# Author: Derek Barnett + +test_case("Accuracy_Clamp", { + + a_zero <- Accuracy(0.0) + a_neg <- Accuracy(-0.5) + a_min <- Accuracy(0.0) + a_normal <- Accuracy(0.9) + a_max <- Accuracy(1.0) + a_tooLarge <- Accuracy(1.1) + + tolerance = 1e-5 + + assertTrue( abs(0.0 - a_zero$ToFloat()) <= tolerance ) + assertTrue( abs(0.0 - a_neg$ToFloat()) <= tolerance ) + assertTrue( abs(0.0 - a_min$ToFloat()) <= tolerance ) + assertTrue( abs(0.9 - a_normal$ToFloat()) <= tolerance ) + assertTrue( abs(1.0 - a_max$ToFloat()) <= tolerance ) + assertTrue( abs(1.0 - a_tooLarge$ToFloat()) <= tolerance ) + + # assertEqual(0.0, a_zero$ToFloat()) + # assertEqual(0.0, a_neg$ToFloat()) + # assertEqual(0.0, a_min$ToFloat()) + # assertEqual(0.9, a_normal$ToFloat()) + # assertEqual(1.0, a_max$ToFloat()) + # assertEqual(1.0, a_tooLarge$ToFloat()) +}) diff --git a/tests/src/R/tests/test_BamFile.R b/tests/src/R/tests/test_BamFile.R new file mode 100644 index 0000000..a53b243 --- /dev/null +++ b/tests/src/R/tests/test_BamFile.R @@ -0,0 +1,76 @@ +# Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +# +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted (subject to the limitations in the +# disclaimer below) provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# +# * Neither the name of Pacific Biosciences nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. 
+# +# Author: Derek Barnett + +test_case("BamFile_NonExistentFile", { + result <- tryCatch( + { + f <- BamFile("does_not_exist.bam") + assertTrue(FALSE) # should have thrown + invisible() + }, + warning = function(w) { + assertTrue(TRUE) + invisible() + }, + error = function(e) { + assertTrue(TRUE) + invisible() + } + ) + return(result) +}) + +test_case("BamFile_Ctor", { + + fn <- paste(test_data_path, "aligned.bam", sep="/") + + result <- tryCatch( + { + f <- BamFile(fn) + invisible() + }, + warning = function(w) { + assertTrue(FALSE) + invisible() + }, + error = function(e) { + assertTrue(FALSE) # should not throw + invisible() + } + ) + return(result) +}) diff --git a/tests/src/R/tests/test_BamHeader.R b/tests/src/R/tests/test_BamHeader.R new file mode 100644 index 0000000..cd2716c --- /dev/null +++ b/tests/src/R/tests/test_BamHeader.R @@ -0,0 +1,193 @@ +# Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +# +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted (subject to the limitations in the +# disclaimer below) provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# +# * Neither the name of Pacific Biosciences nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +# GRANTED BY THIS LICENSE. 
THIS SOFTWARE IS PROVIDED BY PACIFIC +# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# +# Author: Derek Barnett + +empty_program <- function(header) { + result <- tryCatch( + { + pg <- header$Program("foo") + assertTrue(FALSE) # should have thrown + invisible() + }, + warning = function(w) { + assertTrue(TRUE) + invisible() + } + ) + return(result) +} + +empty_readgroup <- function(header) { + result <- tryCatch( + { + pg <- header$ReadGroup("foo") + assertTrue(FALSE) # should have thrown + invisible() + }, + warning = function(w) { + assertTrue(TRUE) + invisible() + } + ) + return(result) +} + +empty_sequenceid <- function(header) { + result <- tryCatch( + { + pg <- header$SequenceId("foo") + assertTrue(FALSE) # should have thrown + invisible() + }, + warning = function(w) { + assertTrue(TRUE) + invisible() + } + ) + return(result) +} + +test_case("BamHeader_Defaults", { + + header <- BamHeader() + + assertEqual(0L, nchar(header$Version())) + assertEqual(0L, nchar(header$SortOrder())) + assertTrue(header$ReadGroups()$empty()) + assertTrue(header$Sequences()$empty()) + assertTrue(header$Programs()$empty()) + + pg <- empty_program(header) + rg <- empty_readgroup(header) + id <- empty_sequenceid(header) + + # TODO: get comment fetching working + #assertEqual(1, 
length(header$Comments())) +}) + +test_case("BamHeader_Decode", { + + text <- paste("@HD\tVN:1.1\tSO:queryname\tpb:3.0.1", + "@SQ\tSN:chr1\tLN:2038\tSP:chocobo", + "@SQ\tSN:chr2\tLN:3042\tSP:chocobo", + "@RG\tID:rg1\tSM:control", + "@RG\tID:rg2\tSM:condition1", + "@RG\tID:rg3\tSM:condition1", + "@PG\tID:_foo_\tPN:ide", + "@CO\tipsum and so on", + "@CO\tcitation needed", + sep="\n" + ) + + header <- BamHeader(text) + + assertEqual("1.1", header$Version()) + assertEqual("queryname", header$SortOrder()) + assertEqual("3.0.1", header$PacBioBamVersion()) + + assertEqual(3L, header$ReadGroups()$size()) + assertTrue(header$HasReadGroup("rg1")) + assertTrue(header$HasReadGroup("rg2")) + assertTrue(header$HasReadGroup("rg3")) + assertEqual("control", header$ReadGroup("rg1")$Sample()) + assertEqual("condition1", header$ReadGroup("rg2")$Sample()) + assertEqual("condition1", header$ReadGroup("rg3")$Sample()) + + assertEqual(2L, header$Sequences()$size()) + assertTrue(header$HasSequence("chr1")) + assertTrue(header$HasSequence("chr2")) + assertEqual("chocobo", header$Sequence("chr1")$Species()) + assertEqual("chocobo", header$Sequence("chr2")$Species()) + assertEqual("2038", header$Sequence("chr1")$Length()) + assertEqual("3042", header$Sequence("chr2")$Length()) + + assertEqual(1L, header$Programs()$size()) + assertTrue(header$HasProgram("_foo_")) + assertEqual("ide", header$Program("_foo_")$Name()) + + # TODO: get comment fetching working + # assertEqual(2, header$Comments()$size()) + # assertEqual("ipsum and so on", header$Comments()[1]) + # assertEqual("citation needed", header$Comments()[2]) +}) + +test_case("BamHeader_Encode", { + + expectedText <- paste("@HD\tVN:1.1\tSO:queryname\tpb:3.0.1", + "@SQ\tSN:chr1\tLN:2038\tSP:chocobo", + "@SQ\tSN:chr2\tLN:3042\tSP:chocobo", + "@RG\tID:rg1\tPL:PACBIO\tDS:READTYPE=UNKNOWN\tSM:control\tPM:SEQUEL", + "@RG\tID:rg2\tPL:PACBIO\tDS:READTYPE=UNKNOWN\tSM:condition1\tPM:SEQUEL", + 
"@RG\tID:rg3\tPL:PACBIO\tDS:READTYPE=UNKNOWN\tSM:condition1\tPM:SEQUEL", + "@PG\tID:_foo_\tPN:ide", + "@CO\tipsum and so on", + "@CO\tcitation needed", + "", + sep="\n" + ) + + rg1 <- ReadGroupInfo("rg1") + rg1$Sample("control") + rg2 <- ReadGroupInfo("rg2") + rg2$Sample("condition1") + rg3 <- ReadGroupInfo("rg3") + rg3$Sample("condition1") + + seq1 <- SequenceInfo("chr1") + seq1$Length("2038") + seq1$Species("chocobo") + seq2 <- SequenceInfo("chr2") + seq2$Length("3042") + seq2$Species("chocobo") + + prog1 <- ProgramInfo("_foo_") + prog1$Name("ide") + + header <- BamHeader() + header$Version("1.1") + header$SortOrder("queryname") + header$PacBioBamVersion("3.0.1") + header$AddReadGroup(rg1) + header$AddReadGroup(rg2) + header$AddReadGroup(rg3) + header$AddSequence(seq1) + header$AddSequence(seq2) + header$AddProgram(prog1) + header$AddComment("ipsum and so on") + header$AddComment("citation needed") + + assertEqual(expectedText, header$ToSam()) +}) diff --git a/tests/src/R/tests/test_Cigar.R b/tests/src/R/tests/test_Cigar.R new file mode 100644 index 0000000..dd3e544 --- /dev/null +++ b/tests/src/R/tests/test_Cigar.R @@ -0,0 +1,219 @@ +# Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +# +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted (subject to the limitations in the +# disclaimer below) provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. 
+# +# * Neither the name of Pacific Biosciences nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. 
+# +# Author: Derek Barnett + +test_case("Cigar_TypeToChar", { + assertEqual('M', CigarOperation_TypeToChar('ALIGNMENT_MATCH')) + assertEqual('I', CigarOperation_TypeToChar('INSERTION')) + assertEqual('D', CigarOperation_TypeToChar('DELETION')) + assertEqual('N', CigarOperation_TypeToChar('REFERENCE_SKIP')) + assertEqual('S', CigarOperation_TypeToChar('SOFT_CLIP')) + assertEqual('H', CigarOperation_TypeToChar('HARD_CLIP')) + assertEqual('P', CigarOperation_TypeToChar('PADDING')) + assertEqual('=', CigarOperation_TypeToChar('SEQUENCE_MATCH')) + assertEqual('X', CigarOperation_TypeToChar('SEQUENCE_MISMATCH')) +}) + +test_case("Cigar_CharToType", { + + assertEqual('ALIGNMENT_MATCH', CigarOperation_CharToType('M')) + assertEqual('INSERTION', CigarOperation_CharToType('I')) + assertEqual('DELETION', CigarOperation_CharToType('D')) + assertEqual('REFERENCE_SKIP', CigarOperation_CharToType('N')) + assertEqual('SOFT_CLIP', CigarOperation_CharToType('S')) + assertEqual('HARD_CLIP', CigarOperation_CharToType('H')) + assertEqual('PADDING', CigarOperation_CharToType('P')) + assertEqual('SEQUENCE_MATCH', CigarOperation_CharToType('=')) + assertEqual('SEQUENCE_MISMATCH', CigarOperation_CharToType('X')) +}) + +test_case("Cigar_SetType", { + + m = CigarOperation() + i = CigarOperation() + d = CigarOperation() + n = CigarOperation() + s = CigarOperation() + h = CigarOperation() + p = CigarOperation() + e = CigarOperation() + x = CigarOperation() + + m$Type('ALIGNMENT_MATCH') + i$Type('INSERTION') + d$Type('DELETION') + n$Type('REFERENCE_SKIP') + s$Type('SOFT_CLIP') + h$Type('HARD_CLIP') + p$Type('PADDING') + e$Type('SEQUENCE_MATCH') + x$Type('SEQUENCE_MISMATCH') + + assertEqual('M', m$Char()) + assertEqual('I', i$Char()) + assertEqual('D', d$Char()) + assertEqual('N', n$Char()) + assertEqual('S', s$Char()) + assertEqual('H', h$Char()) + assertEqual('P', p$Char()) + assertEqual('=', e$Char()) + assertEqual('X', x$Char()) +}) + +test_case("Cigar_SetChar", { + + m = CigarOperation() + 
i = CigarOperation() + d = CigarOperation() + n = CigarOperation() + s = CigarOperation() + h = CigarOperation() + p = CigarOperation() + e = CigarOperation() + x = CigarOperation() + + m$Char('M') + i$Char('I') + d$Char('D') + n$Char('N') + s$Char('S') + h$Char('H') + p$Char('P') + e$Char('=') + x$Char('X') + + assertEqual('ALIGNMENT_MATCH', m$Type()) + assertEqual('INSERTION', i$Type()) + assertEqual('DELETION', d$Type()) + assertEqual('REFERENCE_SKIP', n$Type()) + assertEqual('SOFT_CLIP', s$Type()) + assertEqual('HARD_CLIP', h$Type()) + assertEqual('PADDING', p$Type()) + assertEqual('SEQUENCE_MATCH', e$Type()) + assertEqual('SEQUENCE_MISMATCH', x$Type()) +}) + +test_case("Cigar_CigarOpCtors", { + + c1 <- CigarOperation("S", 10) + c2 <- CigarOperation(CigarOperation_TypeToChar('SOFT_CLIP'), 10) + + assertEqual('S', c1$Char()) + assertEqual('S', c2$Char()) + assertEqual('SOFT_CLIP', c1$Type()) + assertEqual('SOFT_CLIP', c2$Type()) + assertEqual(10L, c1$Length()) + assertEqual(10L, c2$Length()) +}) + +test_case("Cigar_FromEmptyString", { + + s <- "" + cigar <- Cigar(s) + assertEqual(0L, cigar$size()) +}) + +test_case("Cigar_FromString", { + + singleCigarString <- "100=" + multiCigarString <- "100=2D34I6=6X6=" + + singleCigar <- Cigar(singleCigarString) + multiCigar <- Cigar(multiCigarString) + + assertEqual(1L, singleCigar$size()) + + c <- singleCigar$front() + assertEqual('=', c$Char()) + assertEqual('SEQUENCE_MATCH', c$Type()) + assertEqual(100L, c$Length()) + + assertEqual(6L, multiCigar$size()) + + # haven't quite figured out [ ] accessors via SWIG, + # but this method does work w/ !ZERO!-based indices + op0 <- multiCigar$'__getitem__'(0) + op1 <- multiCigar$'__getitem__'(1) + op2 <- multiCigar$'__getitem__'(2) + op3 <- multiCigar$'__getitem__'(3) + op4 <- multiCigar$'__getitem__'(4) + op5 <- multiCigar$'__getitem__'(5) + + assertEqual('=', op0$Char()) + assertEqual('D', op1$Char()) + assertEqual('I', op2$Char()) + assertEqual('=', op3$Char()) + 
assertEqual('X', op4$Char()) + assertEqual('=', op5$Char()) + assertEqual('SEQUENCE_MATCH', op0$Type()) + assertEqual('DELETION', op1$Type()) + assertEqual('INSERTION', op2$Type()) + assertEqual('SEQUENCE_MATCH', op3$Type()) + assertEqual('SEQUENCE_MISMATCH', op4$Type()) + assertEqual('SEQUENCE_MATCH', op5$Type()) + assertEqual(100L, op0$Length()) + assertEqual(2L, op1$Length()) + assertEqual(34L, op2$Length()) + assertEqual(6L, op3$Length()) + assertEqual(6L, op4$Length()) + assertEqual(6L, op5$Length()) +}) + +test_case("Cigar_ToEmptyString", { + + cigar <- Cigar() + assertEqual(0L, nchar(cigar$ToStdString())) # empty string is 1 +}) + +test_case("Cigar_ToString", { + + singleCigarString <- "100=" + multiCigarString <- "100=2D34I6=6X6=" + + singleCigar <- Cigar() + singleCigar$push_back( CigarOperation(CigarOperation_TypeToChar('SEQUENCE_MATCH'), 100) ) + + multiCigar <- Cigar() + multiCigar$push_back(CigarOperation('=', 100)) + multiCigar$push_back(CigarOperation('D', 2)) + multiCigar$push_back(CigarOperation('I', 34)) + multiCigar$push_back(CigarOperation('=', 6)) + multiCigar$push_back(CigarOperation('X', 6)) + multiCigar$push_back(CigarOperation('=', 6)) + + assertEqual(singleCigarString, singleCigar$ToStdString()) + assertEqual(multiCigarString, multiCigar$ToStdString()) +}) diff --git a/tests/src/R/tests/test_EndToEnd.R b/tests/src/R/tests/test_EndToEnd.R new file mode 100644 index 0000000..65f76b2 --- /dev/null +++ b/tests/src/R/tests/test_EndToEnd.R @@ -0,0 +1,105 @@ +# Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +# +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted (subject to the limitations in the +# disclaimer below) provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# +# * Neither the name of Pacific Biosciences nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. 
+# +# Author: Derek Barnett + +originalNames <- function(inputFn, generatedFn) { + # Writes every record of inputFn to a new BAM at generatedFn (reusing the input header) and returns the list of record full names in read order; an error is reported through the assertion below and yields an empty list. + result <- tryCatch( + { + file <- BamFile(inputFn) + writer <- BamWriter(generatedFn, file$Header()) + + ds <- DataSet(inputFn) + entireFile <- EntireFileQuery(ds) + + names_in <- list() + iter <- entireFile$begin() + end <- entireFile$end() + while ( iter$'__ne__'(end) ) { + record <- iter$value() + names_in <- c(names_in, record$FullName()) + writer$Write(record) + iter$incr() + } + writer$'delete_BamWriter'() + return(names_in) + }, + error = function(e) { + assertEqual("why:", e$message) # should not throw + return(list()) + } + ) + return(result) +} + +generatedNames <- function(generatedFn) { + # Reads generatedFn back through an EntireFileQuery and returns the list of record full names in read order; an error is reported through the assertion below and yields an empty list. + result <- tryCatch( + { + ds <- DataSet(generatedFn) + entireFile <- EntireFileQuery(ds) + + names_out <- list() + iter <- entireFile$begin() + end <- entireFile$end() + while ( iter$'__ne__'(end) ) { + names_out <- c(names_out, iter$value()$FullName()) # dereference via value(), as originalNames() does + iter$incr() + } + return(names_out) + }, + error = function(e) { + assertEqual("why:", e$message) # should not throw + return(list()) + } + ) + return(result) +} + +#test_case("EndToEnd_CopyFileAndReadBack", { +# +# inputFn <- paste(test_data_path, "aligned.bam", sep="/") +# generatedFn <- paste(test_data_path, "generated.bam", sep="/") +# +# # loop over original file, store names, write to generated file +# names_in <- originalNames(inputFn, generatedFn) +# +# # read names from new file +# names_out <- generatedNames(generatedFn) +# +# # ensure equal +# assertEqual(names_in, names_out) +#}) diff --git a/tests/src/R/tests/test_Frames.R b/tests/src/R/tests/test_Frames.R new file mode 100644 index 0000000..f9d6eb5 --- /dev/null +++ b/tests/src/R/tests/test_Frames.R @@ -0,0 +1,95 @@ +# Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +# +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted (subject to the limitations in the +# disclaimer below) provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# +# * Neither the name of Pacific Biosciences nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. 
+# +# Author: Derek Barnett + +testFrames_rawData <- list( + 0, 8, 140, 0, 0, 7, 4, 0, 85, 2, + 1, 3, 2, 10, 1, 20, 47, 10, 9, 60, + 20, 3, 12, 5, 13, 165, 6, 14, 22, 12, + 2, 4, 9, 218, 27, 3, 15, 2, 17, 2, + 45, 24, 89, 10, 7, 1, 11, 15, 0, 7, + 0, 28, 17, 12, 6, 10, 37, 0, 12, 52, + 0, 7, 1, 14, 3, 26, 12, 0, 20, 17, + 2, 13, 2, 9, 13, 7, 15, 29, 3, 6, + 2, 1, 28, 10, 3, 14, 7, 1, 22, 1, + 6, 6, 0, 19, 31, 6, 2, 14, 0, 0, + 1000, 947, 948 +) + +encodedFrames_rawData <- list( + 0, 8, 102, 0, 0, 7, 4, 0, 75, 2, 1, 3, 2, + 10, 1, 20, 47, 10, 9, 60, 20, 3, 12, 5, 13, 115, + 6, 14, 22, 12, 2, 4, 9, 135, 27, 3, 15, 2, 17, + 2, 45, 24, 77, 10, 7, 1, 11, 15, 0, 7, 0, 28, + 17, 12, 6, 10, 37, 0, 12, 52, 0, 7, 1, 14, 3, + 26, 12, 0, 20, 17, 2, 13, 2, 9, 13, 7, 15, 29, + 3, 6, 2, 1, 28, 10, 3, 14, 7, 1, 22, 1, 6, + 6, 0, 19, 31, 6, 2, 14, 0, 0, + 255, 254, 255 +) + +testFrames <- UInt16List() +for (x in testFrames_rawData) + testFrames$push_back(x) + +encodedFrames <- UInt16List() +for (x in encodedFrames_rawData) + encodedFrames$push_back(x) + +test_case("Frames_Basic", { + # A default Frames holds no data; a Frames built from testFrames must hand the same values back from Data(). + f <- Frames() + assertEqual(0L, f$Data()$size()) + + f2 <- Frames(testFrames) + d <- f2$Data() + assertEqual(length(testFrames), length(d)) + + numFrames <- length(testFrames) + for (i in seq_len(numFrames)) # seq_len() is safe for a zero count (1:0 would visit 1 and 0) + assertEqual(testFrames[i], d[i]) +}) + +test_case("Frames_Downsample", { + # NOTE(review): no encode/downsample call is made here - Data() of Frames(testFrames) is compared directly against encodedFrames; confirm this is intended. + f <- Frames(testFrames) + d <- f$Data() + assertEqual(length(encodedFrames), length(d)) + + numFrames <- length(encodedFrames) + for (i in seq_len(numFrames)) # seq_len() is safe for a zero count (1:0 would visit 1 and 0) + assertEqual(encodedFrames[i], d[i]) +}) diff --git a/tests/src/R/tests/test_Intervals.R b/tests/src/R/tests/test_Intervals.R new file mode 100644 index 0000000..0071750 --- /dev/null +++ b/tests/src/R/tests/test_Intervals.R @@ -0,0 +1,340 @@ +# Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +# +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted (subject to the limitations in the +# disclaimer below) provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# +# * Neither the name of Pacific Biosciences nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. 
+# +# Author: Derek Barnett + +test_case("Intervals_UnmappedPosition", { + assertEqual(-1L, UnmappedPosition()) +}) + +test_case("Intervals_Ctors", { + + empty <- PositionInterval() + single <- PositionInterval(4) + normal <- PositionInterval(5, 8) + + assertEqual(0L, empty$Start()) + assertEqual(0L, empty$Stop()) + assertEqual(4L, single$Start()) + assertEqual(5L, single$Stop()) + assertEqual(5L, normal$Start()) + assertEqual(8L, normal$Stop()) +}) + +test_case("Intervals_Equality", { + + empty <- PositionInterval() + empty2 <- PositionInterval() + singleton <- PositionInterval(4) + sameAsSingleton <- PositionInterval(4, 5) + normal <- PositionInterval(5, 8) + sameAsNormal <- PositionInterval(5, 8) + different <- PositionInterval(20, 40) + + # self-equality + assertEqual(empty, empty) + assertEqual(singleton, singleton) + assertEqual(normal, normal) + assertEqual(different, different) + + # same values + # TODO: fix this to work with == or *anything* cleaner + assertTrue(empty$'__eq__'(empty2)) + assertTrue(singleton$'__eq__'(sameAsSingleton)) + assertTrue(normal$'__eq__'(sameAsNormal)) + + # different values + assertNotEqual(empty, singleton) + assertNotEqual(empty, normal) + assertNotEqual(empty, different) + assertNotEqual(singleton, normal) + assertNotEqual(normal, different) +}) + +test_case("Intervals_Copy", { + + interval1 <- PositionInterval(5,8) + interval2 <- PositionInterval(interval1) + interval3 <- interval1 + + # TODO: fix this to work with == or *anything* cleaner + assertTrue(interval1$'__eq__'(interval1)) + assertTrue(interval1$'__eq__'(interval2)) + assertTrue(interval1$'__eq__'(interval3)) +}) + +test_case("Intervals_Modifiers", { + + interval1 <- PositionInterval(5,8) + interval2 <- PositionInterval(interval1) + interval2$Start(2) + interval2$Stop(10) + + assertNotEqual(interval1, interval2) + assertEqual(2L, interval2$Start()) + assertEqual(10L, interval2$Stop()) +}) + +test_case("Intervals_Cover", { + + a <- PositionInterval(2,4) + b <- 
PositionInterval(3,5) + c <- PositionInterval(6,8) + d <- PositionInterval(1,7) + e <- PositionInterval(5,8) + + # 0123456789 + # a -- + # b -- + # c -- + # d ------ + # e --- + + # self-cover + assertTrue(a$Covers(a)) + assertTrue(a$CoveredBy(a)) + + # basic covers/covered + assertTrue(b$CoveredBy(d)) + assertTrue(d$Covers(b)) + assertNotEqual(b, d) + assertFalse(b$Covers(d)) + + # completely disjoint + assertFalse(b$Covers(c)) + assertFalse(c$Covers(b)) + assertFalse(b$CoveredBy(c)) + assertFalse(c$CoveredBy(b)) + + # b.stop == e.start + assertFalse(b$Covers(e)) + assertFalse(b$CoveredBy(e)) + + # shared endpoint, start contained + assertTrue(e$Covers(c)) + assertTrue(c$CoveredBy(e)) +}) + +test_case("Intervals_Intersect", { + + a <- PositionInterval(2,4) + b <- PositionInterval(3,5) + c <- PositionInterval(6,8) + d <- PositionInterval(1,7) + e <- PositionInterval(5,8) + + # 0123456789 + # a -- + # b -- + # c -- + # d ------ + # e --- + + # self-intersection + assertTrue(a$Intersects(a)) + + # intersection is commutative + assertTrue(a$Intersects(b)) + assertTrue(b$Intersects(a)) + + # covered implies intersection + assertTrue(d$Covers(a)) + assertTrue(a$Intersects(d)) + assertTrue(d$Intersects(a)) + + # c.start > b.stop (obvious disjoint) + assertFalse(b$Intersects(c)) + + # b.stop == e.start (intervals are right-open, so disjoint) + assertFalse(b$Intersects(e)) +}) + +test_case("Intervals_Validity", { + + a <- PositionInterval() # default ctor + b <- PositionInterval(0,0) # start == stop (zero) + c <- PositionInterval(4,4) # start == stop (nonzero) + d <- PositionInterval(0,1) # start < stop (start is zero) + e <- PositionInterval(4,5) # start < stop (start is nonzero) + f <- PositionInterval(5,4) # start > stop + + assertFalse(a$IsValid()) + assertFalse(b$IsValid()) + assertFalse(c$IsValid()) + assertTrue(d$IsValid()) + assertTrue(e$IsValid()) + assertFalse(f$IsValid()) +}) + +test_case("Intervals_Length",{ + + a <- PositionInterval(2,4) + b <- 
PositionInterval(3,5) + c <- PositionInterval(6,8) + d <- PositionInterval(1,7) + e <- PositionInterval(5,8) + + assertEqual(2L, a$Length()) + assertEqual(2L, b$Length()) + assertEqual(2L, c$Length()) + assertEqual(6L, d$Length()) + assertEqual(3L, e$Length()) +}) + +test_case("GenomicIntervals_Ctors", { + + empty <- GenomicInterval() + normal <- GenomicInterval("seq1", 100, 200) + + assertEqual("", empty$Name()) + assertEqual(0L, empty$Start()) + assertEqual(0L, empty$Stop()) + + assertEqual("seq1", normal$Name()) + assertEqual(100L, normal$Start()) + assertEqual(200L, normal$Stop()) +}) + +test_case("GenomicIntervals_Copy", { + + a <- GenomicInterval("seq1", 10, 20) + b <- GenomicInterval(a) + c <- a + + # TODO: fix this to work with == or *anything* cleaner + assertTrue(a$'__eq__'(a)) + assertTrue(a$'__eq__'(b)) + assertTrue(a$'__eq__'(c)) +}) + +test_case("GenomicIntervals_Modifiers", { + + a <- GenomicInterval("seq1", 10, 20) + + b <- GenomicInterval(a) + b$Name("seq5") + b$Start(2) + b$Stop(10) + + c <- GenomicInterval(a) + c$Interval(b$Interval()) + + assertNotEqual(a, b) + + assertEqual("seq5", b$Name()) + assertEqual(2L, b$Start()) + assertEqual(10L, b$Stop()) + + assertEqual(a$Name(), c$Name()) + + # TODO: fix this to work with == or *anything* cleaner + assertTrue(b$Interval()$'__eq__'(c$Interval())) +}) + +test_case("GenomicIntervals_Cover", { + + a <- GenomicInterval("seq1",2,4) + b <- GenomicInterval("seq1",3,5) + c <- GenomicInterval("seq1",6,8) + d <- GenomicInterval("seq1",1,7) + e <- GenomicInterval("seq1",5,8) + f <- GenomicInterval("seq2",3,5) # same as b, different ref + + # 0123456789 + # a -- + # b -- + # c -- + # d ------ + # e --- + + # self-cover + assertTrue(a$Covers(a)) + assertTrue(a$CoveredBy(a)) + + # basic covers/covered + assertTrue(b$CoveredBy(d)) + assertTrue(d$Covers(b)) + assertNotEqual(b, d) + assertFalse(b$Covers(d)) + + # same coords as b, but different ref + assertFalse(f$CoveredBy(d)) + assertFalse(d$Covers(f)) + 
assertNotEqual(f, d) + assertFalse(f$Covers(d)) + + # obvious disjoint + assertFalse(b$Covers(c)) + assertFalse(c$Covers(b)) + assertFalse(b$CoveredBy(c)) + assertFalse(c$CoveredBy(b)) + + # b.stop == e.start (intervals are right-open, so disjoint) + assertFalse(b$Covers(e)) + assertFalse(b$CoveredBy(e)) + + # shared endpoint, start contained + assertTrue(e$Covers(c)) + assertTrue(c$CoveredBy(e)) + + # assertTrue(FALSE) +}) + +test_case("GenomicIntervals_Validity", { + + a <- GenomicInterval() # default + b <- GenomicInterval("seq1",0,0) # valid id, start == stop (zero) + c <- GenomicInterval("seq1",4,4) # valid id, start == stop (non-zero) + d <- GenomicInterval("seq",0,1) # valid id, start < stop (start == zero) OK + e <- GenomicInterval("seq1",4,5) # valid id, start < stop (start > zero) OK + f <- GenomicInterval("seq1",5,4) # valid id, start > stop + g <- GenomicInterval("",0,0) # invalid id, start == stop (zero) + h <- GenomicInterval("",4,4) # invalid id, start == stop (non-zero) + i <- GenomicInterval("",0,1) # invalid id, start < stop (start == zero) + j <- GenomicInterval("",4,5) # invalid id, start < stop (start > zero) + k <- GenomicInterval("",5,4) # invalid id, start > stop + + assertTrue(d$IsValid()) + assertTrue(e$IsValid()) + assertFalse(a$IsValid()) + assertFalse(b$IsValid()) + assertFalse(c$IsValid()) + assertFalse(f$IsValid()) + assertFalse(g$IsValid()) + assertFalse(h$IsValid()) + assertFalse(i$IsValid()) + assertFalse(j$IsValid()) + assertFalse(k$IsValid()) +}) diff --git a/tests/src/R/tests/test_PolymeraseStitching.R b/tests/src/R/tests/test_PolymeraseStitching.R new file mode 100644 index 0000000..401ecce --- /dev/null +++ b/tests/src/R/tests/test_PolymeraseStitching.R @@ -0,0 +1,415 @@ +# Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +# +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted (subject to the limitations in the +# disclaimer below) provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# +# * Neither the name of Pacific Biosciences nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. 
+# +# Author: Derek Barnett + +compareContainers <- function(c1, c2) { + # Asserts that c1 and c2 have the same length and equal elements at every index. + assertEqual(length(c1), length(c2)) + + numElements <- length(c1) + for (i in seq_len(numElements)) # seq_len() gives zero iterations for empty input (1:0 would visit 1 and 0) + assertEqual(c1[i], c2[i]) +} + +compareFrames <- function(f1, f2) { + # Compares the Data() contents of two frame objects element by element via compareContainers(). + d1 <- f1$Data() + d2 <- f2$Data() + compareContainers(d1, d2) +} + +compareRecords <- function(b1, b2) { + + assertTrue(b1$HasDeletionQV()) + assertTrue(b1$HasDeletionTag()) + assertTrue(b1$HasInsertionQV()) + assertTrue(b1$HasMergeQV()) + assertTrue(b1$HasSubstitutionQV()) + assertTrue(b1$HasSubstitutionTag()) + assertTrue(b1$HasLabelQV()) + assertTrue(b1$HasAltLabelQV()) + assertTrue(b1$HasAltLabelTag()) + assertTrue(b1$HasPkmean()) + assertTrue(b1$HasPkmid()) + assertTrue(b1$HasPulseCall()) + assertTrue(b1$HasIPD()) + assertTrue(b1$HasPulseWidth()) + assertTrue(b1$HasPrePulseFrames()) + assertTrue(b1$HasPulseCallWidth()) + assertTrue(b1$HasPulseMergeQV()) + + assertTrue(b2$HasDeletionQV()) + assertTrue(b2$HasDeletionTag()) + assertTrue(b2$HasInsertionQV()) + assertTrue(b2$HasMergeQV()) + assertTrue(b2$HasSubstitutionQV()) + assertTrue(b2$HasSubstitutionTag()) + assertTrue(b2$HasLabelQV()) + assertTrue(b2$HasAltLabelQV()) + assertTrue(b2$HasAltLabelTag()) + assertTrue(b2$HasPkmean()) + assertTrue(b2$HasPkmid()) + assertTrue(b2$HasPulseCall()) + assertTrue(b2$HasIPD()) + assertTrue(b2$HasPulseWidth()) + assertTrue(b2$HasPrePulseFrames()) + assertTrue(b2$HasPulseCallWidth()) + assertTrue(b2$HasPulseMergeQV()) + + assertEqual(b1$FullName(), b2$FullName()) + assertEqual(b1$HoleNumber(), b2$HoleNumber()) + assertEqual(b1$NumPasses(), b2$NumPasses()) + assertEqual(b1$Sequence(), b2$Sequence()) + assertEqual(b1$DeletionTag(), b2$DeletionTag()) + assertEqual(b1$SubstitutionTag(), b2$SubstitutionTag()) + assertEqual(b1$AltLabelTag(), b2$AltLabelTag()) + assertEqual(b1$PulseCall(), b2$PulseCall()) + + # compareContainers(b1$Pkmean(), b2$Pkmean()) + # compareContainers(b1$Pkmid(), b2$Pkmid()) + # + # compareFrames(b1$IPD(), b2$IPD()) + 
# compareFrames(b1$PulseWidth(), b2$PulseWidth()) + # compareFrames(b1$PrePulseFrames(), b2$PrePulseFrames()) + # compareFrames(b1$PulseCallWidth(), b2$PulseCallWidth()) + + assertEqual(b1$ReadGroup()$Id(), b2$ReadGroup()$Id()) + + assertEqual(b1$Qualities()$Fastq(), b2$Qualities()$Fastq()) + assertEqual(b1$DeletionQV()$Fastq(), b2$DeletionQV()$Fastq()) + assertEqual(b1$InsertionQV()$Fastq(), b2$InsertionQV()$Fastq()) + assertEqual(b1$MergeQV()$Fastq(), b2$MergeQV()$Fastq()) + assertEqual(b1$SubstitutionQV()$Fastq(), b2$SubstitutionQV()$Fastq()) + assertEqual(b1$LabelQV()$Fastq(), b2$LabelQV()$Fastq()) + assertEqual(b1$AltLabelQV()$Fastq(), b2$AltLabelQV()$Fastq()) + assertEqual(b1$PulseMergeQV()$Fastq(), b2$PulseMergeQV()$Fastq()) + + return +} + +getVirtualRecord <- function(fn1, fn2) { + + result <- tryCatch( + { + vpr <- ZmwReadStitcher(fn1, fn2) + assertTrue(vpr$HasNext()) + virtualRecord <- vpr$Next() + return(virtualRecord) + }, + error = function(e) { + print(paste('e:',e)) + assertTrue(FALSE) # should not throw + return + } + ) + return(result) +} + +getPolymeraseRecord <- function(fn) { + + result <- tryCatch( + { + ds <- DataSet(fn) + entireFile <- EntireFileQuery(ds) + polyIter <- entireFile$begin() + polyEnd <- entireFile$end() + assertTrue(polyIter$'__ne__'(polyEnd)) + polyRecord <- polyIter$value() + return(polyRecord) + }, + error = function(e) { + print(paste('e:',e)) + assertTrue(FALSE) # should not throw + return + } + ) + return(result) +} + +test_case("PolymeraseStitching_VirtualRegions", { + + subreadsFn <- paste(test_data_path, "polymerase/internal.subreads.bam", sep="/") + scrapsFn <- paste(test_data_path, "polymerase/internal.scraps.bam", sep="/") + virtualRecord <- getVirtualRecord(subreadsFn, scrapsFn) + + # -- ADAPTER -- # + + adapter <- virtualRecord$VirtualRegionsTable('ADAPTER') + assertEqual(7L, adapter$size()) + + region <- adapter$'__getitem__'(0) + assertEqual(3047L, region$beginPos) + assertEqual(3095L, region$endPos) + + region 
<- adapter$'__getitem__'(1) + assertEqual(3650L, region$beginPos) + assertEqual(3700L, region$endPos) + + region <- adapter$'__getitem__'(2) + assertEqual(4289L, region$beginPos) + assertEqual(4335L, region$endPos) + + region <- adapter$'__getitem__'(3) + assertEqual(4888L, region$beginPos) + assertEqual(4939L, region$endPos) + + region <- adapter$'__getitem__'(4) + assertEqual(5498L, region$beginPos) + assertEqual(5546L, region$endPos) + + region <- adapter$'__getitem__'(5) + assertEqual(6116L, region$beginPos) + assertEqual(6173L, region$endPos) + + region <- adapter$'__getitem__'(6) + assertEqual(6740L, region$beginPos) + assertEqual(6790L, region$endPos) + + # -- BARCODE -- # + + barcode = virtualRecord$VirtualRegionsTable('BARCODE') + assertEqual(14L, barcode$size()) + + region <- barcode$'__getitem__'(0) + assertEqual(3025L, region$beginPos) + assertEqual(3047L, region$endPos) + + region <- barcode$'__getitem__'(1) + assertEqual(3095L, region$beginPos) + assertEqual(3116L, region$endPos) + + region <- barcode$'__getitem__'(2) + assertEqual(3628L, region$beginPos) + assertEqual(3650L, region$endPos) + + region <- barcode$'__getitem__'(3) + assertEqual(3700L, region$beginPos) + assertEqual(3722L, region$endPos) + + region <- barcode$'__getitem__'(4) + assertEqual(4267L, region$beginPos) + assertEqual(4289L, region$endPos) + + region <- barcode$'__getitem__'(5) + assertEqual(4335L, region$beginPos) + assertEqual(4356L, region$endPos) + + region <- barcode$'__getitem__'(6) + assertEqual(4864L, region$beginPos) + assertEqual(4888L, region$endPos) + + region <- barcode$'__getitem__'(7) + assertEqual(4939L, region$beginPos) + assertEqual(4960L, region$endPos) + + region <- barcode$'__getitem__'(8) + assertEqual(5477L, region$beginPos) + assertEqual(5498L, region$endPos) + + region <- barcode$'__getitem__'(9) + assertEqual(5546L, region$beginPos) + assertEqual(5571L, region$endPos) + + region <- barcode$'__getitem__'(10) + assertEqual(6087L, region$beginPos) + 
assertEqual(6116L, region$endPos) + + region <- barcode$'__getitem__'(11) + assertEqual(6173L, region$beginPos) + assertEqual(6199L, region$endPos) + + region <- barcode$'__getitem__'(12) + assertEqual(6719L, region$beginPos) + assertEqual(6740L, region$endPos) + + region <- barcode$'__getitem__'(13) + assertEqual(6790L, region$beginPos) + assertEqual(6812L, region$endPos) + + # -- LQREGION -- # + + lqregion = virtualRecord$VirtualRegionsTable('LQREGION') + assertEqual(2L, lqregion$size()) + + region <- lqregion$'__getitem__'(0) + assertEqual(0L, region$beginPos) + assertEqual(2659L, region$endPos) + + region <- lqregion$'__getitem__'(1) + assertEqual(7034L, region$beginPos) + assertEqual(7035L, region$endPos) + + # -- HQREGION -- # + + hqregion = virtualRecord$VirtualRegionsTable('HQREGION') + assertEqual(1L, hqregion$size()) + + region <- hqregion$'__getitem__'(0) + assertEqual(2659L, region$beginPos) + assertEqual(7034L, region$endPos) +}) + +test_case("PolymeraseStitching_InternalSubreadsToOriginal", { + + # stitch virtual polymerase record + subreadsFn <- paste(test_data_path, "polymerase/internal.subreads.bam", sep="/") + scrapsFn <- paste(test_data_path, "polymerase/internal.scraps.bam", sep="/") + virtualRecord <- getVirtualRecord(subreadsFn, scrapsFn) + + # fetch original polymerase record + polyFn <- paste(test_data_path, "polymerase/internal.polymerase.bam", sep="/") + polyRecord <- getPolymeraseRecord(polyFn) + + # check + compareRecords(polyRecord, virtualRecord) +}) + +test_case("PolymeraseStitching_InternalHQToOriginal", { + + # stitch virtual polymerase record + hqRegionFn <- paste(test_data_path, "polymerase/internal.hqregions.bam", sep="/") + lqRegionFn <- paste(test_data_path, "polymerase/internal.lqregions.bam", sep="/") + virtualRecord <- getVirtualRecord(hqRegionFn, lqRegionFn) + + # fetch original polymerase record + polyFn <- paste(test_data_path, "polymerase/internal.polymerase.bam", sep="/") + polyRecord <- getPolymeraseRecord(polyFn) + + 
# check + compareRecords(polyRecord, virtualRecord) +}) + +test_case("PolymeraseStitching_ProductionSubreadsToOriginal", { + + # stitch virtual polymerase record + subreadsFn <- paste(test_data_path, "polymerase/production.subreads.bam", sep="/") + scrapsFn <- paste(test_data_path, "polymerase/production.scraps.bam", sep="/") + virtualRecord <- getVirtualRecord(subreadsFn, scrapsFn) + + # fetch original polymerase record + polyFn <- paste(test_data_path, "polymerase/production.polymerase.bam", sep="/") + polyRecord <- getPolymeraseRecord(polyFn) + + # compare + assertEqual(polyRecord$FullName(), virtualRecord$FullName()) + assertEqual(polyRecord$HoleNumber(), virtualRecord$HoleNumber()) + assertEqual(polyRecord$NumPasses(), virtualRecord$NumPasses()) + assertEqual(polyRecord$Sequence(), virtualRecord$Sequence()) + assertEqual(polyRecord$DeletionTag(), virtualRecord$DeletionTag()) + assertEqual(polyRecord$SubstitutionTag(), virtualRecord$SubstitutionTag()) + + compareFrames(polyRecord$IPD(), virtualRecord$IPDV1Frames()) + assertEqual(polyRecord$ReadGroup()$Id(), virtualRecord$ReadGroup()$Id()) + + tolerance = 1e-5 + assertTrue( abs(polyRecord$ReadAccuracy()$ToFloat() - virtualRecord$ReadAccuracy()$ToFloat()) <= tolerance ) + # assertEqual(polyRecord$ReadAccuracy()$ToFloat(), virtualRecord$ReadAccuracy()$ToFloat()) + + assertEqual(polyRecord$Qualities()$Fastq(), virtualRecord$Qualities()$Fastq()) + assertEqual(polyRecord$DeletionQV()$Fastq(), virtualRecord$DeletionQV()$Fastq()) + assertEqual(polyRecord$InsertionQV()$Fastq(), virtualRecord$InsertionQV()$Fastq()) + assertEqual(polyRecord$MergeQV()$Fastq(), virtualRecord$MergeQV()$Fastq()) + assertEqual(polyRecord$SubstitutionQV()$Fastq(), virtualRecord$SubstitutionQV()$Fastq()) +}) + +test_case("PolymeraseStitching_ProductionHQToOriginal", { + + # stitch virtual polymerase record + hqRegionFn <- paste(test_data_path, "polymerase/production_hq.hqregion.bam", sep="/") + lqRegionFn <- paste(test_data_path, 
"polymerase/production_hq.scraps.bam", sep="/") + virtualRecord <- getVirtualRecord(hqRegionFn, lqRegionFn) + + # fetch original polymerase record + polyFn <- paste(test_data_path, "polymerase/production.polymerase.bam", sep="/") + polyRecord <- getPolymeraseRecord(polyFn) + + # compare + assertEqual(polyRecord$FullName(), virtualRecord$FullName()) + assertEqual(polyRecord$HoleNumber(), virtualRecord$HoleNumber()) + assertEqual(polyRecord$NumPasses(), virtualRecord$NumPasses()) + assertEqual(polyRecord$Sequence(), virtualRecord$Sequence()) + assertEqual(polyRecord$DeletionTag(), virtualRecord$DeletionTag()) + assertEqual(polyRecord$SubstitutionTag(), virtualRecord$SubstitutionTag()) + + compareFrames(polyRecord$IPD(), virtualRecord$IPDV1Frames()) + assertEqual(polyRecord$ReadGroup()$Id(), virtualRecord$ReadGroup()$Id()) + + tolerance = 1e-5 + assertTrue( abs(polyRecord$ReadAccuracy()$ToFloat() - virtualRecord$ReadAccuracy()$ToFloat()) <= tolerance ) + # assertEqual(polyRecord$ReadAccuracy()$ToInt(), virtualRecord$ReadAccuracy()$ToInt()) + + assertEqual(polyRecord$Qualities()$Fastq(), virtualRecord$Qualities()$Fastq()) + assertEqual(polyRecord$DeletionQV()$Fastq(), virtualRecord$DeletionQV()$Fastq()) + assertEqual(polyRecord$InsertionQV()$Fastq(), virtualRecord$InsertionQV()$Fastq()) + assertEqual(polyRecord$MergeQV()$Fastq(), virtualRecord$MergeQV()$Fastq()) + assertEqual(polyRecord$SubstitutionQV()$Fastq(), virtualRecord$SubstitutionQV()$Fastq()) + + assertTrue(polyRecord$HasDeletionQV()) + assertTrue(polyRecord$HasDeletionTag()) + assertTrue(polyRecord$HasInsertionQV()) + assertTrue(polyRecord$HasMergeQV()) + assertTrue(polyRecord$HasSubstitutionQV()) + assertTrue(polyRecord$HasSubstitutionTag()) + assertTrue(polyRecord$HasIPD()) + assertFalse(polyRecord$HasLabelQV()) + assertFalse(polyRecord$HasAltLabelQV()) + assertFalse(polyRecord$HasAltLabelTag()) + assertFalse(polyRecord$HasPkmean()) + assertFalse(polyRecord$HasPkmid()) + 
assertFalse(polyRecord$HasPulseCall()) + assertFalse(polyRecord$HasPulseWidth()) + assertFalse(polyRecord$HasPrePulseFrames()) + assertFalse(polyRecord$HasPulseCallWidth()) + assertFalse(polyRecord$HasPulseCall()) + + assertTrue(virtualRecord$HasDeletionQV()) + assertTrue(virtualRecord$HasDeletionTag()) + assertTrue(virtualRecord$HasInsertionQV()) + assertTrue(virtualRecord$HasMergeQV()) + assertTrue(virtualRecord$HasSubstitutionQV()) + assertTrue(virtualRecord$HasSubstitutionTag()) + assertTrue(virtualRecord$HasIPD()) + assertFalse(virtualRecord$HasLabelQV()) + assertFalse(virtualRecord$HasAltLabelQV()) + assertFalse(virtualRecord$HasAltLabelTag()) + assertFalse(virtualRecord$HasPkmean()) + assertFalse(virtualRecord$HasPkmid()) + assertFalse(virtualRecord$HasPulseCall()) + assertFalse(virtualRecord$HasPulseWidth()) + assertFalse(virtualRecord$HasPrePulseFrames()) + assertFalse(virtualRecord$HasPulseCallWidth()) + assertFalse(virtualRecord$HasPulseCall()) +}) diff --git a/tests/src/R/tests/test_QualityValues.R b/tests/src/R/tests/test_QualityValues.R new file mode 100644 index 0000000..b0671d0 --- /dev/null +++ b/tests/src/R/tests/test_QualityValues.R @@ -0,0 +1,113 @@ +# Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +# +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted (subject to the limitations in the +# disclaimer below) provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. 
+# +# * Neither the name of Pacific Biosciences nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. 
+# +# Author: Derek Barnett + +test_case("QualityValue_Defaults", { + + value <- QualityValue() + assertEqual(0L, value$ToInt()) + assertEqual('!', value$Fastq()) +}) + +test_case("QualityValue_FromNumber", { + + zero <- QualityValue(0) + thirtythree <- QualityValue(33) + normal <- QualityValue(42) + maxQV <- QualityValue(93) + tooHigh <- QualityValue(94) + max8bit <- QualityValue(126) + + assertEqual(0L, zero$ToInt()) + assertEqual(33L, thirtythree$ToInt()) + assertEqual(42L, normal$ToInt()) + assertEqual(93L, maxQV$ToInt()) + assertEqual(93L, tooHigh$ToInt()) + assertEqual(93L, max8bit$ToInt()) + + assertEqual('!', zero$Fastq()) + assertEqual('B', thirtythree$Fastq()) + assertEqual('K', normal$Fastq()) + assertEqual('~', maxQV$Fastq()) + assertEqual('~', tooHigh$Fastq()) + assertEqual('~', max8bit$Fastq()) +}) + +test_case("QualityValue_FromFastq", { + + zero <- QualityValue_FromFastq('!') + thirtythree <- QualityValue_FromFastq('B') + normal <- QualityValue_FromFastq('K') + maxQV <- QualityValue_FromFastq('~') + + assertEqual(0L, zero$ToInt()) + assertEqual(33L, thirtythree$ToInt()) + assertEqual(42L, normal$ToInt()) + assertEqual(93L, maxQV$ToInt()) +}) + +test_case("QualityValues_Defaults", { + values <- QualityValues() + assertEqual(0L, nchar(values$Fastq())) +}) + +test_case("QualityValues_FromNumbers", { + + fastqString <- "~~~KKBB!!" + values <- c(93, 93, 93, 42, 42, 33, 33, 0, 0) + + qvs <- QualityValues() + for (v in values) + qvs$push_back(QualityValue(v)) + + assertEqual(fastqString, qvs$Fastq()) +}) + +test_case("QualityValues_FromFastq", { + + fastqString <- "~~~KKBB!!" 
+ values <- c(93L, 93L, 93L, 42L, 42L, 33L, 33L, 0L, 0L) + + qvs <- QualityValues(fastqString) + assertEqual(nchar(fastqString), qvs$size()) + assertEqual(length(values), qvs$size()) + + numValues <- length(values) + for ( i in 1:numValues ) { + qv <- qvs$'__getitem__'(i-1) + assertEqual(values[i], qv$ToInt()) + } +}) diff --git a/tests/src/R/tests/utils.R b/tests/src/R/tests/utils.R new file mode 100644 index 0000000..ff793a6 --- /dev/null +++ b/tests/src/R/tests/utils.R @@ -0,0 +1,307 @@ +# Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +# +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted (subject to the limitations in the +# disclaimer below) provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# +# * Neither the name of Pacific Biosciences nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# +# Author: Derek Barnett + +# main test suite runner +run_test_suite <- function(path) { + test_files <- dir(path, "^test.*\\.[rR]$", full.names = TRUE) + lapply(test_files, run_test_file) + invisible() +} + +run_test_file <- function(filename) { + source(filename) + invisible() +} + +# main test case definition +# +# Example: +# test_case("name", { +# ...tested code here... +# }) +# +test_case <- function(name, code) { + test_case_runner(name, substitute(code)) + invisible() +} + +# main assert definitions +# +# assertEqual(expected, actual) +# assertNotEqual(expected, actual) +# assertTrue(expr) +# assertFalse(expr) +# + +assertEqual <- function(expected, actual) { + assertHelper(identical(expected, actual), + TRUE, + expression_string("expected"), + deparse(expected), + expression_string("actual"), + deparse(actual), + "==") +} + +assertAllEqual <- function(expected, actual) { + assertHelper(all.equal(expected, actual), + TRUE, + expression_string("expected"), + deparse(expected), + expression_string("actual"), + deparse(actual), + "==") +} + +assertNotEqual <- function(expected, actual) { + assertHelper(identical(expected, actual), + FALSE, + expression_string("expected"), + deparse(expected), + expression_string("actual"), + deparse(actual), + "!=") +} + +assertTrue <- function(expr) { + assertHelper(as.vector(expr), + TRUE, + "TRUE", + "TRUE", + expression_string("expr"), + deparse(expr), + "==") +} + +assertFalse <- 
function(expr) { + assertHelper(as.vector(expr), + FALSE, + "FALSE", + "FALSE", + expression_string("expr"), + deparse(expr), + "==") +} + +# TODO: (if needed) assertLessThan, assertGreaterThan, assertNull, etc + +# ------------------------------- # +# internals +# ------------------------------- # + +expression_string <- function(name, env = parent.frame()) { + subs <- do.call("substitute", list(as.name(name), env)) + paste0(deparse(subs, width.cutoff = 500), collapse = "\n") +} + +assertHelper <- function(compare, + to, + expected_expr, + expected_value, + actual_expr, + actual_value, + compare_type) +{ + success <- identical(compare, to) + + result <- make_assert_result(success, + expected_expr, + expected_value, + actual_expr, + actual_value, + compare_type) + + + # record result with testCaseCollector + testCaseResults <- test_case_results() + testCaseResults$add_result(result) + invisible() +} + +make_assert_result <- function(success, + expected_expr, + expected_value, + actual_expr, + actual_value, + compare_type) +{ + structure(list( + success = success, + expectedExpression = expected_expr, + expectedValue = expected_value, + actualExpression = actual_expr, + actualValue = actual_value, + compareType = compare_type + )) +} + +TestCaseResults <- setRefClass("TestCaseResults", + + fields = list( + test = "character", + anyFailed = "logical", + results = "list" + ), + methods = list( + initialize = function(...) { + test <<- "" + anyFailed <<- FALSE + + initFields(...) 
+ }, + + start_test = function(name) { + test <<- name + results <<- list() + }, + + add_result = function(result) { + if (!result$success) + anyFailed <<- TRUE + results <<- c(results, list(result)) + }, + + end_test = function() { + + # summarize test case results + testOutput <- format_results() + + # report to test collector + suiteResults <- test_suite_results() + suiteResults$add_test_case_result(anyFailed, testOutput) + + # reset + test <<- "" + anyFailed <<- FALSE + results <<- list() + }, + + format_results = function() { + + lines <- list() + + status <- "OK" + if (anyFailed) + status <- "FAILED" + + headerLine <- paste("TestCase:", test, "...", status, sep=" ") + lines <- c(lines, list(headerLine)) + + for (result in results) { + if (!result$success) { + valueOfLabel <- paste(result$actualExpression, result$compareType, result$expectedExpression, sep=" ") + valueOf <- paste(" Value of:", valueOfLabel, sep=" ") + actual <- paste(" Actual:", result$actualValue, sep=" ") + expected <- paste(" Expected:", result$expectedValue, sep=" ") + lines <- c(lines, valueOf, actual, expected, "") + } + } + invisible(lines) + } + ) +) + +TestSuiteResults <- setRefClass("TestSuiteResults", + + fields = list( + numTests = "integer", + numFailed = "integer", + results = "list" + ), + methods = list( + initialize = function(...) { + numTests <<- 0L + numFailed <<- 0L + results <<- list() + + initFields(...) + }, + + add_test_case_result = function(testHasFailures, testOutput) { #(results) + numTests <<- numTests + 1L + if (testHasFailures) + numFailed <<- numFailed + 1L + results <<- c(results, testOutput) + }, + + any_failed = function() { + return (numFailed != 0L) + }, + + print_summary = function(...) 
{ + + cat("\n") + cat("-------------------------\n") + cat("Tests Complete\n") + cat("-------------------------\n") + cat("\n") + + for (result in results) { + cat(result) + cat("\n") + } + cat("-------------------------\n") + + if (numFailed == 1L) { + footer <- paste(numFailed, "test failed out of", numTests, sep=" ") + } else { + footer <- paste(numFailed, "tests failed out of", numTests, sep=" ") + } + cat(footer) + cat("\n\n") + } + ) +) + +test_env = new.env() +test_env$testSuiteResults <- TestSuiteResults$new() +test_env$testCaseResults <- TestCaseResults$new() + +test_suite_results <- function() { + test_env$testSuiteResults +} + +test_case_results <- function() { + test_env$testCaseResults +} + +test_case_runner <- function(name, code) { + testCaseResults <- test_case_results() + testCaseResults$start_test(name) + eval(code, test_env) + testCaseResults$end_test() +} diff --git a/tests/src/TestData.h.in b/tests/src/TestData.h.in new file mode 100644 index 0000000..b25d262 --- /dev/null +++ b/tests/src/TestData.h.in @@ -0,0 +1,58 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. 
+// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. + +// Author: Derek Barnett + +#ifndef TESTDATA_H +#define TESTDATA_H + +#include + +namespace PacBio { +namespace BAM { +namespace tests { + +const std::string Source_Dir = std::string("@PacBioBAM_TestsDir@"); +const std::string Bin_Dir = std::string("@CMAKE_CURRENT_BINARY_DIR@"); +const std::string Data_Dir = std::string("@PacBioBAM_TestsDir@/data"); +const std::string Generated_Dir = std::string("@GeneratedDir@"); +const std::string GeneratedData_Dir = std::string("@GeneratedTestDataDir@"); +const std::string Bam2Sam = std::string("@PacBioBAM_BinDir@/bam2sam"); + +} // namespace tests +} // namespace BAM +} // namespace PacBio + +#endif // TESTDATA_H diff --git a/tests/src/cram/bam2sam.t.in b/tests/src/cram/bam2sam.t.in new file mode 100644 index 0000000..66645c4 --- /dev/null +++ b/tests/src/cram/bam2sam.t.in @@ -0,0 +1,63 @@ +Setup: + + $ BAM2SAM="@PacBioBAM_BinDir@/bam2sam" && export BAM2SAM + + $ DATADIR="@PacBioBAM_TestsDir@/data" && export DATADIR + +Normal: + + $ $BAM2SAM < $DATADIR/phi29.bam | head -n 5 + @HD\tVN:3.0.0\tSO:unknown\tpb:3.0.1 (esc) + 
@RG\tID:a955def6\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;Ipd:CodecV1=ip;BINDINGKIT=100356300;SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3.0.0.140018;FRAMERATEHZ=75.000000\tPU:m140918_150013_42139_c100697631700000001823144703261565_s1_p0 (esc) + @PG\tID:bam2bam-0.20.0\tPN:bam2bam\tVN:0.20.0 (esc) + @PG\tID:bax2bam-0.0.2\tPN:bax2bam\tVN:0.0.2 (esc) + m140918_150013_42139_c100697631700000001823144703261565_s1_p0/30422/2067_4072\t4\t*\t0\t255\t*\t*\t0\t0\tAAGTCATGTATAGAGTTATTGGCTCAGCGGTGGCAAGCAGCCAACTCAGCTCCTTTCGGGCTTGTTAGCAGCCGGATCCACACTCTGAAATTCCTGCAGCTCGAGTTATTTGATAGTAAAAGTGGGTCATCAAACCGCAACTACGCCACCCCGGTACCTGAACAGGCTTCGGTTTCATTTTGAGACGAGAAAAACCCACTTTGAAGTTTTCGAAAATCACTTCCTTTTTTGATTTTGTCCGTCATGCCTGCGCATTTCACAAGAGAACTTGATGTCAGTGTTAGTCGTCAGGAGAGCCTCTACCAGTTTGCCGTCTACTTCTTTCATGTAAATATCCTGGATGTAAGGTTTTCTGACGCAGATATTGTTGGCAACGCTTAAAAAGTGGGATTCGTGTGGCCAGTAGCCCAGGTTATCTTTCGGGTCCACGATGTCTTTGATAACGTCCAGAATTTCAGTACCAGTTCAGGTGAATAGAACGGTATCGCAGTTTCAGATAATACGATCATGACAGCCTGCGCTGGCTGTGATGGTCGTATAGCGTGCCCAGGCCGTGATAAAGACACCCATCCGGGGTATATACAGGGTCTTTCGTTTCCTCCCTAGCCTAAGAAAGAAAACCCAGAGCACCGTTCTCTTCAAGGTATGGCACTTTACCGTAACGTCCGGGTTGGAAAGCGAAATTTACCGTACAGGCCTGGTTCAGCATCAGCTTCCCGCCAGCTGTTTGATGGCGTCCCTTCAGAGGTAGTTTTGATATACGTCCATTTTGTCGATAAAGTCCTTGAACAGGCAAGTGGTTCCTTTGAAACTTCAGACCAAGAGGATAATATTCAACATTGTACAGGTCGTAGTGTCTTTCATCAGTTCCAGATCAAACATTAGAACAGCCACAAGGTCAGCAAATTTCACCGCCGGAAGATTTCAGGTAATTCGTTACGCTTGTAAAGAAATTGGGACGCTTGATCTGGGATGGTCGGGATCTAGCCTCCTTCAGTTCAAATTCACTTAACGAATGTGCTGAATGTGCCAGAGGGTAATCCTCGTCCAAAGACGGTATTTACCCTCGAAACGATCGGCTCGCCGTACGGCAACGCAGGCGAGAGTACATTCTGCAGCTGGGTACAGGGAATTAAACATCAAAAACCATACCTTCGCCGATCTCTTTTTCCTTTAAAACGTCATTCAGCCACGGTGAAGACCAACCACGGTAAGGCATAACGAACTTCCTTGTCCAGACCCAGGCTCAGGGTCCGGGAAAACCTTTTTAAACTTCTTGTGGTGATAATGTCTTTGAAGCCTTTCAGAAGTCAGAACCTGCCGTCATACGATCCAAGACCACTGCTTGAACCTGGATCAGCAGACTTCTGCGATAATCTGAATATCGTTTTTAATGTAGGCAATATTTCTTCCGGGTGATTTTGTAACCGACCGGGCGCTCATTATGATAAGTCGATGTCGCCTTTCAGTAGCCGTCA
GTTAAAAGTCTTTCGCAATTTTCTTAACGCGGAAAGGCAAGTTTCTAGAGAGTCGTAAGATCACGTGTGGAATCTTGCGTTTACCCTTGTAACCCAGGCAATACAATCATATAACCACTGGCCCAATGCGAGAAATGATGGTGTTGTAGGTATTTTGGCAGACCATCTGCGGACCGCATTTAAAACGTTACGTTCCAGCCAGTTGATGATGAAATGCGCCATCAAAATTTCAGATTTGTGGAAGTACAGGTCCCAGCCTGAAAACTTTGCAGAAACCCAAGCCATAAAATTCATCCAGGGAGTTACCAATCTTATACTCGGAGTGGTCTTCGATGTTCATTGTAACCGTATGCCCATACGCGGCAATCTTCAAGCCTTGGTGTAGTCTCAAAGTCGCAGGAATACATTTACGTGGCATGTGTTTCATATGTCATTCCTTCTTAAAGTTAAACAAAATTATTTCTAGAGGGGAATTGTTTATCCGCCTCACATTTCCCTATAGTGAGTCGTATTAATTTCGCGGGATC\t*\tRG:Z:a955def6\tbc:B:S,1,1\tbq:i:1\tcx:i:31\tnp:i:1\tqe:i:4072\tqs:i:2067\trq:f:0.88458\tsn:B:f,22.8448,13.8689,14.6461,14.3552\tzm:i:30422 (esc) + +Explicit Filename (not stdin): + + $ $BAM2SAM $DATADIR/phi29.bam | head -n 5 + @HD\tVN:3.0.0\tSO:unknown\tpb:3.0.1 (esc) + @RG\tID:a955def6\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;Ipd:CodecV1=ip;BINDINGKIT=100356300;SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3.0.0.140018;FRAMERATEHZ=75.000000\tPU:m140918_150013_42139_c100697631700000001823144703261565_s1_p0 (esc) + @PG\tID:bam2bam-0.20.0\tPN:bam2bam\tVN:0.20.0 (esc) + @PG\tID:bax2bam-0.0.2\tPN:bax2bam\tVN:0.0.2 (esc) + 
m140918_150013_42139_c100697631700000001823144703261565_s1_p0/30422/2067_4072\t4\t*\t0\t255\t*\t*\t0\t0\tAAGTCATGTATAGAGTTATTGGCTCAGCGGTGGCAAGCAGCCAACTCAGCTCCTTTCGGGCTTGTTAGCAGCCGGATCCACACTCTGAAATTCCTGCAGCTCGAGTTATTTGATAGTAAAAGTGGGTCATCAAACCGCAACTACGCCACCCCGGTACCTGAACAGGCTTCGGTTTCATTTTGAGACGAGAAAAACCCACTTTGAAGTTTTCGAAAATCACTTCCTTTTTTGATTTTGTCCGTCATGCCTGCGCATTTCACAAGAGAACTTGATGTCAGTGTTAGTCGTCAGGAGAGCCTCTACCAGTTTGCCGTCTACTTCTTTCATGTAAATATCCTGGATGTAAGGTTTTCTGACGCAGATATTGTTGGCAACGCTTAAAAAGTGGGATTCGTGTGGCCAGTAGCCCAGGTTATCTTTCGGGTCCACGATGTCTTTGATAACGTCCAGAATTTCAGTACCAGTTCAGGTGAATAGAACGGTATCGCAGTTTCAGATAATACGATCATGACAGCCTGCGCTGGCTGTGATGGTCGTATAGCGTGCCCAGGCCGTGATAAAGACACCCATCCGGGGTATATACAGGGTCTTTCGTTTCCTCCCTAGCCTAAGAAAGAAAACCCAGAGCACCGTTCTCTTCAAGGTATGGCACTTTACCGTAACGTCCGGGTTGGAAAGCGAAATTTACCGTACAGGCCTGGTTCAGCATCAGCTTCCCGCCAGCTGTTTGATGGCGTCCCTTCAGAGGTAGTTTTGATATACGTCCATTTTGTCGATAAAGTCCTTGAACAGGCAAGTGGTTCCTTTGAAACTTCAGACCAAGAGGATAATATTCAACATTGTACAGGTCGTAGTGTCTTTCATCAGTTCCAGATCAAACATTAGAACAGCCACAAGGTCAGCAAATTTCACCGCCGGAAGATTTCAGGTAATTCGTTACGCTTGTAAAGAAATTGGGACGCTTGATCTGGGATGGTCGGGATCTAGCCTCCTTCAGTTCAAATTCACTTAACGAATGTGCTGAATGTGCCAGAGGGTAATCCTCGTCCAAAGACGGTATTTACCCTCGAAACGATCGGCTCGCCGTACGGCAACGCAGGCGAGAGTACATTCTGCAGCTGGGTACAGGGAATTAAACATCAAAAACCATACCTTCGCCGATCTCTTTTTCCTTTAAAACGTCATTCAGCCACGGTGAAGACCAACCACGGTAAGGCATAACGAACTTCCTTGTCCAGACCCAGGCTCAGGGTCCGGGAAAACCTTTTTAAACTTCTTGTGGTGATAATGTCTTTGAAGCCTTTCAGAAGTCAGAACCTGCCGTCATACGATCCAAGACCACTGCTTGAACCTGGATCAGCAGACTTCTGCGATAATCTGAATATCGTTTTTAATGTAGGCAATATTTCTTCCGGGTGATTTTGTAACCGACCGGGCGCTCATTATGATAAGTCGATGTCGCCTTTCAGTAGCCGTCAGTTAAAAGTCTTTCGCAATTTTCTTAACGCGGAAAGGCAAGTTTCTAGAGAGTCGTAAGATCACGTGTGGAATCTTGCGTTTACCCTTGTAACCCAGGCAATACAATCATATAACCACTGGCCCAATGCGAGAAATGATGGTGTTGTAGGTATTTTGGCAGACCATCTGCGGACCGCATTTAAAACGTTACGTTCCAGCCAGTTGATGATGAAATGCGCCATCAAAATTTCAGATTTGTGGAAGTACAGGTCCCAGCCTGAAAACTTTGCAGAAACCCAAGCCATAAAATTCATCCAGGGAGTTACCAATCTTATACTCGGAGTGGTCTTCGATGTTCATTGTAACCGTATGCCCATACGCGGCAATCTTCAAGCCTTGGTGTAGTCTCAAAGTCGCAGGAATACATTTACGTGGCA
TGTGTTTCATATGTCATTCCTTCTTAAAGTTAAACAAAATTATTTCTAGAGGGGAATTGTTTATCCGCCTCACATTTCCCTATAGTGAGTCGTATTAATTTCGCGGGATC\t*\tRG:Z:a955def6\tbc:B:S,1,1\tbq:i:1\tcx:i:31\tnp:i:1\tqe:i:4072\tqs:i:2067\trq:f:0.88458\tsn:B:f,22.8448,13.8689,14.6461,14.3552\tzm:i:30422 (esc) + +Header-Only: + + $ $BAM2SAM --header-only < $DATADIR/phi29.bam | head -n 5 + @HD\tVN:3.0.0\tSO:unknown\tpb:3.0.1 (esc) + @RG\tID:a955def6\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;Ipd:CodecV1=ip;BINDINGKIT=100356300;SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3.0.0.140018;FRAMERATEHZ=75.000000\tPU:m140918_150013_42139_c100697631700000001823144703261565_s1_p0 (esc) + @PG\tID:bam2bam-0.20.0\tPN:bam2bam\tVN:0.20.0 (esc) + @PG\tID:bax2bam-0.0.2\tPN:bax2bam\tVN:0.0.2 (esc) + +No-Header: + + $ $BAM2SAM --no-header < $DATADIR/phi29.bam | head -n 5 + m140918_150013_42139_c100697631700000001823144703261565_s1_p0/30422/2067_4072\t4\t*\t0\t255\t*\t*\t0\t0\tAAGTCATGTATAGAGTTATTGGCTCAGCGGTGGCAAGCAGCCAACTCAGCTCCTTTCGGGCTTGTTAGCAGCCGGATCCACACTCTGAAATTCCTGCAGCTCGAGTTATTTGATAGTAAAAGTGGGTCATCAAACCGCAACTACGCCACCCCGGTACCTGAACAGGCTTCGGTTTCATTTTGAGACGAGAAAAACCCACTTTGAAGTTTTCGAAAATCACTTCCTTTTTTGATTTTGTCCGTCATGCCTGCGCATTTCACAAGAGAACTTGATGTCAGTGTTAGTCGTCAGGAGAGCCTCTACCAGTTTGCCGTCTACTTCTTTCATGTAAATATCCTGGATGTAAGGTTTTCTGACGCAGATATTGTTGGCAACGCTTAAAAAGTGGGATTCGTGTGGCCAGTAGCCCAGGTTATCTTTCGGGTCCACGATGTCTTTGATAACGTCCAGAATTTCAGTACCAGTTCAGGTGAATAGAACGGTATCGCAGTTTCAGATAATACGATCATGACAGCCTGCGCTGGCTGTGATGGTCGTATAGCGTGCCCAGGCCGTGATAAAGACACCCATCCGGGGTATATACAGGGTCTTTCGTTTCCTCCCTAGCCTAAGAAAGAAAACCCAGAGCACCGTTCTCTTCAAGGTATGGCACTTTACCGTAACGTCCGGGTTGGAAAGCGAAATTTACCGTACAGGCCTGGTTCAGCATCAGCTTCCCGCCAGCTGTTTGATGGCGTCCCTTCAGAGGTAGTTTTGATATACGTCCATTTTGTCGATAAAGTCCTTGAACAGGCAAGTGGTTCCTTTGAAACTTCAGACCAAGAGGATAATATTCAACATTGTACAGGTCGTAGTGTCTTTCATCAGTTCCAGATCAAACATTAGAACAGCCACAAGGTCAGCAAATTTCACCGCCGGAAGATTTCAGGTAATTCGTTACGCTTGTAAAGAAATTGGGACGCTTGATCTGGGATGGTCGGGATCTAGCCTCCTTCAGTTCAAATTCACTTAACGAATGTG
CTGAATGTGCCAGAGGGTAATCCTCGTCCAAAGACGGTATTTACCCTCGAAACGATCGGCTCGCCGTACGGCAACGCAGGCGAGAGTACATTCTGCAGCTGGGTACAGGGAATTAAACATCAAAAACCATACCTTCGCCGATCTCTTTTTCCTTTAAAACGTCATTCAGCCACGGTGAAGACCAACCACGGTAAGGCATAACGAACTTCCTTGTCCAGACCCAGGCTCAGGGTCCGGGAAAACCTTTTTAAACTTCTTGTGGTGATAATGTCTTTGAAGCCTTTCAGAAGTCAGAACCTGCCGTCATACGATCCAAGACCACTGCTTGAACCTGGATCAGCAGACTTCTGCGATAATCTGAATATCGTTTTTAATGTAGGCAATATTTCTTCCGGGTGATTTTGTAACCGACCGGGCGCTCATTATGATAAGTCGATGTCGCCTTTCAGTAGCCGTCAGTTAAAAGTCTTTCGCAATTTTCTTAACGCGGAAAGGCAAGTTTCTAGAGAGTCGTAAGATCACGTGTGGAATCTTGCGTTTACCCTTGTAACCCAGGCAATACAATCATATAACCACTGGCCCAATGCGAGAAATGATGGTGTTGTAGGTATTTTGGCAGACCATCTGCGGACCGCATTTAAAACGTTACGTTCCAGCCAGTTGATGATGAAATGCGCCATCAAAATTTCAGATTTGTGGAAGTACAGGTCCCAGCCTGAAAACTTTGCAGAAACCCAAGCCATAAAATTCATCCAGGGAGTTACCAATCTTATACTCGGAGTGGTCTTCGATGTTCATTGTAACCGTATGCCCATACGCGGCAATCTTCAAGCCTTGGTGTAGTCTCAAAGTCGCAGGAATACATTTACGTGGCATGTGTTTCATATGTCATTCCTTCTTAAAGTTAAACAAAATTATTTCTAGAGGGGAATTGTTTATCCGCCTCACATTTCCCTATAGTGAGTCGTATTAATTTCGCGGGATC\t*\tRG:Z:a955def6\tbc:B:S,1,1\tbq:i:1\tcx:i:31\tnp:i:1\tqe:i:4072\tqs:i:2067\trq:f:0.88458\tsn:B:f,22.8448,13.8689,14.6461,14.3552\tzm:i:30422 (esc) + 
m140918_150013_42139_c100697631700000001823144703261565_s1_p0/30422/4151_6183\t4\t*\t0\t255\t*\t*\t0\t0\tGATCCCGCGAATTAATTACGACTCACTATAGGGGAATTGTGAGCGGATAACAATTCCCGCCTCTAGAAATAATTTTGTTTAAACTTTTAAGAAAGGAGATATTACATATGAAACACAGCCACGTAAAATGTATTCCTGCGACTTGGAGACTACCACCAAGGTGAAGATTTGCCGCGTAATGGGCATACGGTTTACATGAAACATCGAAGAACAAACTCGAGTATAAGATTGGTAACTCCCCTGGATGAATTATGGCTTGGGTTACTGAAAGTTCGAGGTCTGACCTGTACTTCGCACAAATCTGAAAATTTGATGGCCGCAAATTTCAATTCATCACTGGCTGGAACGTAAACGGTTTTAAATGGTCCGCAGATCGGTCTGTGCCAAATACCCTGATCAACACATCATTTCTTCGCAATGCGGCCAGTGTAATGATTGATATCTTGCCCTGGGTTGACAAGGGGTAAACGCAAGATCCACACCTGTGATCTACGACCTCTCTGAAGAAAACTGCGTTTCCGGTTAAGAAAATTGCGAAAGACTTTAAGCTGAACGGTACTGAAAAGCGACATGACTATCATAATGAGCGCCCGGTCGTTACAAAATCACCCCGGAAAGAATATGCCTACATTTAAAAACGATATTCAGATTATCGCAAGAACTCTGCTGATCAGTTCAAGCAAGGGTCTGGATCGTAAATGACGGCAGGTTCTGACTCTCCTGAAAGGCTTCAAAAGACATTATCACCACCTAAAAGAAGTTTAAAAAGGTTTTTTCACCGACCCTGAGCCTAGGGCTGGACAAGGAAAGTTGTTAATGCCCATACCGTGGTGGTTTCACCTGGCTGAAAGACCGTTTTAAAGAAAAAGAGATCGGCGAAGGTATGGTTTTTGATGTTAATTCCCTGTAACCAAGCCTTCAATGTACTCTCGCCTGCTTGCCGTCACACGGGCGAGCGACGTATTCGAAAGGGTAAAATACGTTCTGGGACGGAGGATTTACCCTCTGCAATTCGGCACATTCCGTTGTGAATTTGGAACTGAAAGGAAGGCTTAGATCCCGACCATCCCAGATCAAGCGTTCCCATTTCTAACAAAGGGTAACGAATACCTGAAATCTTCCAGGCGGTGAAATTGCTGACCTGTGGCTGTCTAAATGTTTGATCTTGGAAACTGATGAAAGAGCACTACGACCTGGTACAATGTTGAATATATCTCTGGTCTGAAGTTCAAAGCAACCACTGGCCTGTTCAAGGACTTTATCGACAATGGACGTATATCAAAAGACTACCTCTGAAGACGCCATCAAACAGCTGGCGAAGCTGATGCTGACAAGCCTGTACGGTAAATTCGCGTCCCACCCGGACGTTTACCGGGTAAAGTGCCATATGCTGAAAGAGAAAGCGGTGCTCTGGTTTTTCGTCTAGGTGGAAGGAGGAAACGAAGACACTGTATATACCGCCGAATGGGTGTCTTTATCCAAGCGGCCTGGCACGCTATACGACCATCACAGGCAAGCGCAGGCTTTTGTTAATGATCGTATTATCTACTGCGATTACCGATTCTACTTCACTGACTGGTACTGAAATCTGGACGTTATCAAAGACATCGTAGACCCGAAGAAACTGGGCTACTGGCACACGAATTCCACTTTAAGCGTGCAAAATATCTGCGTCAGAAAACCTACATCCCAGGATATTTACATGAAAGAAGTAGACGGCAAACTGGTAGAGGGCTCTCCGTGACGACTACACTGACATCAAGTTCTCTGTGAAATGCGCAGGCAAATGACGGCACAAAATCCAAAAAAGGAAGTGACTTTCGAAAACTTCAAAAGTGGGTTCTCGTAAAATGAAACCGAAAGCTGTTCAGGTTTAAACCCGGGTGGCGTAGTGC
CTGGTTGATGAACACTTTTTACTATCAAAATAACTTCGAAAGCTGCAGGAATTCAAGCTGATCCGGCTGCTAACAAAGCCCGAAGGAAGCTGAGTTGGCTGCTGCCACCGTGAGCAATACTCTAAATACATGACTCT\t*\tRG:Z:a955def6\tbc:B:S,1,1\tbq:i:1\tcx:i:31\tnp:i:1\tqe:i:6183\tqs:i:4151\trq:f:0.88458\tsn:B:f,22.8448,13.8689,14.6461,14.3552\tzm:i:30422 (esc) + m140918_150013_42139_c100697631700000001823144703261565_s1_p0/30422/6234_8214\t4\t*\t0\t255\t*\t*\t0\t0\tAGAGTCATGTATAAGAGTTATTGCTCAGCGGTGGCAGCAGACAACTCAGCTTCCTTTCGGGCCTTTGTTAGCAGCCGGATCCAAGCTTGAATTCCTGCAAGCTCGAGTTATTTGATAGTAAAAGTGTCATCAAACCAGCACTACGGCCGAACCCGGTACCTGAACAGATTCGTTTCATTTTACGAGAAAAACCCACTTTGAAGTTTTGCCGAAAGTCACTTCTTTTTGATTTGTCCGTCATGCTGCGCATTTCACAGAGACTTGAATGTCAGTGTAGTCGTCATCGGGGGGGGGAAGAGCCCTCTACCAGTTTTGCCGTCTACTTCTTTCATGTAAATATCTGGATGTAGGTTTTCTGAACGCAGATATTTGCAGCTTAAAAGTGGATTCGTGTGCCCAGTAGCCCGTTTTCTTCGGGTCCTACGATGTCTTTTGATAACGTCCAGAATTTCAGTACCAGTCAGGTGAATAGAATCGGTATCGCAGTAGATAAATACGATCATAACAAGCCTGCGCTGCCTGTTGATGGTCGTATAGCGTGCCCAGGCCCGTGATAAAGAACCATCGGGGTATATAACAGGGTCTTTCGTTCCTCCTCACCTAGACGAAAAACCCAGAGCACCGTTCTCTTTCAAGGTATGGCCTTTACCGGTAACGTCCGGGTTGGACGCGAATTTAAGCCGTAACAGGCTGTCTCAGCATACAGCTTTCGCCAGCCTGTTTGATGGCCGTCTTCAGAGGTAGTTTTGATATACGTTCCATTTGTCGATAAAGTCCCTTGAGCAGGCCCAGTGGGTTGGCTTTGAACTCAGACGCAGAATATATTCAACATTGTAACAGGTCGTAGTGCTCTTTCATCAGTTCAGATTCAACATTAGACAAGCCACAGGTCAGCATTTCACCGCCGGGAAGAATTTCAAGGTATTCGTTTACCCTTGGTAGAAATGGAACGCTTGTAATCTGGATGGTCGGGATCTAGCCTTTTCAGTTCAAAATTCACACGAATGTGCTGAATGTGCAGAGGGTAATACCTCGTCCCAGACGTATTTACCCTCGAATAAGCGAATCGGCTCGCCGTATCGCAGCAGGCGAGAGTAAACATTTGAGCTGGGTAACAGGGAATTACATCCAAAACCATACCCTTTCGCACGATCTCTTTTTCTTTAAAACGTCATTCAGCCAGGTGAAACCACCAGGTAGGCATAACGAACTTCCTGTCCAGACCCAAGGCTCAGGTCGGGAAAACTTTTTAAACTTCCTTGTGGTGATAATGTCTTTTGAAGCTTTCAGAGAGTCAGAACCTGCGTCATACGATCCAGACCGCTGCTTGAAGCTGGATCAGCAAGCTTCTGCGATAATCTGAATATCGTTTATTAATTAGGCATATTCTTCCGGGTGATTTTGTAACCGACCGGGCGCTCATTATGATAGTCGATGTCGCCCCTTTCAAGTACCGTCAGCTAAAGTCTTTCGCATTTTCTTACCGGAAACGGCAGTTTCTTCAGAGAGTCGTAAGATCACGTGTGGATCTTGCGTTTACCGCTTGTAACCCAGGCAAATATCAATCATATACCACTGGCCATGCGAGAAATGATGGTGTTGTAGG
TTATTGGCAGACCATCTGCGGACCATTTTAAAACCGTTACGTTCAGCCAGTTGATGAATGAATGCGCCATGCAAATTTCAGATTGTGGAAGTACAAGGTCAGCCTGACTTTCAGAACCCAAAGCCATAAATTCATCCAGGGAGTTACCATCTTATACTCCGGAGTTGGTCTTCGATGTTCATGTAACCGTATGCCCATACGCGGCAAATCTTCAACCTTGGTGTGTAGTCCTCAAAGTCGCAGGAATACATTTACGTGGCATGTGTTTTCATAATGTATATCTCCTTCTTAAAGTTAAACAAAATTATTTCTAGAGGGGAATTGTTATCCCGCTCACAATCCCCTATCAGTGAGTCGTATTAATTTCGCGGATC\t*\tRG:Z:a955def6\tbc:B:S,1,1\tbq:i:1\tcx:i:31\tnp:i:1\tqe:i:8214\tqs:i:6234\trq:f:0.88458\tsn:B:f,22.8448,13.8689,14.6461,14.3552\tzm:i:30422 (esc) + m140918_150013_42139_c100697631700000001823144703261565_s1_p0/30422/8294_10277\t4\t*\t0\t255\t*\t*\t0\t0\tGATTCCCGCGAAATTAATACGAATCACTATAAGGGGAATTGTGAGCGGATAACAATTCCCCTCTAGAAATAATTTTGTTTAACTTTAAGAGGGACGATATACATATGAACACATGCCTACGTAAAATGTATTCCTGCGAACTGTTGAGACTACCACCAAGGTTGAAGATTTGCCGCGTAATGGGCATACGGTTACATGAACATCGAAGACCACTCCGATATGAAGATTGGTTAACCCCTGGATGAATTTATGGCTTGGGTTCTGAAAGTTCAGGCTGACCTGTACTTCACAATCTGAAATTTGATGGCCGCATTCATCAATCACTGGCTGGAACGTAAAACGGTTTAAAAATGGTCCCGCAGATGGTCTGACAAATTAACTACAACACCATCATTTCTCGCATGGGCCCAGTGGTATATGAAATTGATATTTGCCTGGGTTACAAGGAGGTAAACGCAAGATCCACACGTGGATCTACGACTCTTCTGAAGAAACCTGGCCGTTTCCGTTAAGAAAATGCGAAAGAACTTAAGCTGACGGTAACTGAAAGGCGACATCGACTATCATATAATGAAGCGCCCGTCGTTACAAAATCACCCCGGAAGAATATGCCTTACATTAAAAAACGATATTCAGATTTCGCAGAAGCTCTGCTGATCCAGTTCAAAGCAGGGTCCTGGATCGTAATGACGGCAGGTTCTGACTCTCTGAAAGGCTTCAAAGAACATTATCACCCACCAAGAAGTTTAAAAAGGTTTTCCCGACAACTGAGCCTGGGTCTGGACAAGGAAGTTTCGTTTGCCTACCGTGGTGGTTTTCAACCTGCTGACTGAACCGTTTTAAAAGAAAATAGAGATCGGCGGAAAGGTATGGTTTTTGATGTTAATTCCTGTAACCAGCCTCAAAATGTACTCTCGCCTGCTGCCGTACGGCGGCCGATCGTATTCGAAGGGTAAATACGTCTGGGACCGAGGATAGCCCTCTGCACATTCAGCACATTCGTTGTGAAATTTGAACTGAAGGAAGCTGATCCCGACGCATCCAGATCAAGCGTTCCCATTTTCTACAAGGTAACGAATACCTGAAATCTTCCCGGCGGTGAAATTGCTGCCTGTGGCTGTCTAATGTTGATCTGGAAACTGATGAAAGAGCACTACGAGACCTGTACAATGTTGAATATATCTCTGGTCTGAAGTTCAAAGCAACCACTGGCCTGTTCAAGGACTTTATCGACAAATGGCGTATTATCAAAACTACCTCTGAAGACGCCATCAAACAGCTGGCGAAGCTGATGCTGACAGCCTGTACGGTAAATTCGCGTCGCAACCCGGACGTTTCCGTAAAGTGCCCATACCTGAAAGAGAAACG
GTGCTCTGGGTTTTCGTCTAGGTGAGGAGGAAACGAAAGACCCTGTAATATACCCGATGGTGTCTTTTATCACGGCCTGGGCACGCTAGTACGACCAATCACAGCAGCGCAGGCTTGTTATGATCGTATTTCTACTGCGGATACCGATTCTATTCCACCTGACTGGTACTGAAATTCTGGAACGTTATCAAAGACATCGTAGACCCGAAGAAACTGGGCTACTGGGGCACCACGAATCCACTTTTAAGCGTGGCAAAATATCTGACGTCAGAAAACCTACATCCAGGATATTTACATGAAAGAAGTAGACGGCAACTGTAGAGGGCTCTTCCTGACGAACCTACACTGACATCAAGTTCTCTGTGAAATGCGCAGGCATGACGGACCAAAATCAAAAAGGAAGTGAACTTTTCGAAAACTTCAAAGTGGGTTTTCTCGTAAAATGAAACCGAAGCCTGTCAGGTACCGGGTGGCGTAGTGCTGGTTGATCGGACACTTTACTATCAATAACTCGAGCTGCAGAATTCCAAGCTTGGATTCCGGCTGCTAACAAAGCCCGAAAGGAAGCTGAGTTGGCTGCTGCACCGCTGAGCAATAACTCTATACATGACTCAT\t*\tRG:Z:a955def6\tbc:B:S,1,1\tbq:i:1\tcx:i:31\tnp:i:1\tqe:i:10277\tqs:i:8294\trq:f:0.88458\tsn:B:f,22.8448,13.8689,14.6461,14.3552\tzm:i:30422 (esc) + m140918_150013_42139_c100697631700000001823144703261565_s1_p0/30422/10327_12283\t4\t*\t0\t255\t*\t*\t0\t0\tAGAGTCATGTATAGAGTTATTGCTCAGCGGTGGCAGCACCAACTCAGCTTCCTTTCGGCTTTGTTAGCAGCCGATCCAAGCTTGAATTCCTGCAGCTCGGAGTTATTTGATAGTAAAAGTTGTCATCCAAACGCAGCACTACGCCCACCCGTACCTGAACAGGCTTTCGGTTTCATTTTACGAGAAAAACACTTTTGAAAGTTTTCGAAAGTCACTTCCTTTTTTGATTTTGTCCGTCATGCCTGCGCATTTCACAGAGAACTTGATGTCAGTGTAGTCGTCAGGAGAGCCCTCTACCAGTTTGCCGTCTACTTCTTTCATGTAAATATCCTGGAATGTAGGTTTTTCTGACGCAGATTATTTTGCACGCTTAAAAGTGGATTCGTGTGGCCCCAGTAGCCCAGTTTCTTCGGTCTACGATGTCTTTGATACGTCCAGAATTTCAGTAAACAGTCAGGTGAATAGAAATCCGGTATCGCAGTAGAATAATACGATCATAACAACCTGCGCTGCTGTGTGGTCGTATAGCGTGCCCAGGCCGTGATAACAGACACCTCGGGGTAATATACAGGGTCTTTCCGTTCCTCCTCAACCTAGACGAAACCCAGAGCACCGTTCTCTTTTCAGGTATGGCACTTTAACCGGTACGTCCGGGTTGGACGCGAATTTACCGTAGCAGGCTGTTCAGCATCAGCTTTCGCCAGCCTGTTTGATGGCGCTCTTCAGAGGTAGTTTGAATATACGTCCATTTGTCGAATAAAGTCCTTGGAACAGGCCCAGTGGTTGCTTTGAACTTCCAGACCAGAGATATATTTCAACATTGTACAGGTCGTAGTGCTCTTTCCACTCAGTTCCAGATCAACATTAAGACAGCCACAGGTCAGATTTCCCCGCCGGAAGATTCAGGTAATTCTAGTTACCCTTGTAGAAATGGCGACGCTTGATCTGGATGGTCGGGATCCTAGCTTCCCTTCAGTTCAAATTCACAACGAATGTTGCTGAATCTGTGCAGAGGGTAATCCTCGGTCCAGACGTATTTACCCTCGAATACGATGCTCGCCGTACGGCAGCAGCGAGAGTACATTTGAGCTGGTACAGGGAATTAACATCAAAAAACATACTTCG
CCGATCTCTTTTTCTTTAAAACGGTCATTCAGCCAGGTGAAACCACCACGGTAGGCATAACGAAACTTCCTGTCCAGACCCAGGCTCAGGTCGGAAAACTTGTTAAACTTCTTGGTGGTGATAATGTCTTTGAAAGCCTTTCAGGAAGTCAGAACCATGCCGTCATCCGATCCAGACCCCTGCTTTGAACTGGAATCAGCAGAGGCTCTGCGATAATCGAATATCGTTTTTAAATGTAGGCATATTTTCTTCGGGGTGATTTGTAACGCGACCGGGCGCTCATTATGATAGTCGATGTCGCCTTTCAGTACCGTCAGCTTAAAGTCTTTCGCAATTTTCTTAACCGACGGCAGTTTCTTCAGAGAGGTCGTAGATCACGGTGTGGATCTTGCGTTTACCCTTGTAACCAGGCAAATATCAATCATATACCACTGGCCCATGCGAGAATGATGGTGTTGTAGGTATTTGGCAGACGCATCTGCGGACCATTTAAACCGTTACGTTCCAGCCAGTTGATGATGAATGCGCCCATCATTTCAGATTTGTGGAAGGTACAGGTCAGCCTGAACTTGTCAGAAACCCAAGCCATAAATTCATCCAGGGAGTACATCTTATAATCTCGAAGTGGTCTTCGATGTTCATGTAACCGTATGCCCATACGCGCAATCTTCACCTTGGTGGTAGTCTGCAGTCGCAGAATAATTTTACGTGGCATGTGTTTCATATGTTATTAGTCTCCTTCTTAAAGTTAAACAAAATTATTTTTAGAAGGGGAATTGTTATCCGCTCACAATTCCCCTATAGTGGAGTCGTATTAATTTCGCGGGTATC\t*\tRG:Z:a955def6\tbc:B:S,1,1\tbq:i:1\tcx:i:31\tnp:i:1\tqe:i:12283\tqs:i:10327\trq:f:0.88458\tsn:B:f,22.8448,13.8689,14.6461,14.3552\tzm:i:30422 (esc) + +Invalid-Args: + + $ $BAM2SAM --header-only --no-header < $DATADIR/phi29.bam + + ERROR: conflicting arguments requested: --no-header and --header-only + + Usage: bam2sam [options] [input] + + bam2sam converts a BAM file to SAM. It is essentially a stripped-down 'samtools + view', mostly useful for testing/debugging without requiring samtools. Input BAM + file is read from a file or stdin, and SAM output is written to stdout. + + Options: + -h, --help show this help message and exit + --version show program's version number and exit + + Options: + input Input BAM file. If not provided, stdin will be used as input. + --no-header Omit header from output. + --header-only Print only the header (no records). 
+ [1] + diff --git a/tests/src/cram/pbindexdump_cpp.t.in b/tests/src/cram/pbindexdump_cpp.t.in new file mode 100644 index 0000000..18a210c --- /dev/null +++ b/tests/src/cram/pbindexdump_cpp.t.in @@ -0,0 +1,39 @@ +Setup: + + $ PBINDEXDUMP="@PacBioBAM_BinDir@/pbindexdump" && export PBINDEXDUMP + + $ DATADIR="@PacBioBAM_TestsDir@/data" && export DATADIR + +Normal C++: + + $ $PBINDEXDUMP --format=cpp $DATADIR/polymerase/production_hq.hqregion.bam.pbi + PbiRawData rawData; + rawData.Version(PbiFile::Version_3_0_1); + rawData.FileSections(PbiFile::BASIC); + rawData.NumReads(1); + + PbiRawBasicData& basicData = rawData.BasicData(); + basicData.rgId_ = {-898246524}; + basicData.qStart_ = {2659}; + basicData.qEnd_ = {7034}; + basicData.holeNumber_ = {0}; + basicData.readQual_ = {0.01}; + basicData.ctxtFlag_ = {0}; + basicData.fileOffset_ = {20054016}; + + +--(leave the blank lines above this)-- + +Request C++, with JSON options (stdout includes usage/help, so we just want to check stderr): + + $ $PBINDEXDUMP --format=cpp --json-indent-level=2 $DATADIR/polymerase/production_hq.hqregion.bam.pbi > /dev/null + + ERROR: JSON formatting options not valid on non-JSON output + + [1] + + $ $PBINDEXDUMP --format=cpp --json-raw $DATADIR/polymerase/production_hq.hqregion.bam.pbi > /dev/null + + ERROR: JSON formatting options not valid on non-JSON output + + [1] diff --git a/tests/src/cram/pbindexdump_json.t.in b/tests/src/cram/pbindexdump_json.t.in new file mode 100644 index 0000000..0c1cbcd --- /dev/null +++ b/tests/src/cram/pbindexdump_json.t.in @@ -0,0 +1,83 @@ +Setup: + + $ PBINDEXDUMP="@PacBioBAM_BinDir@/pbindexdump" && export PBINDEXDUMP + + $ DATADIR="@PacBioBAM_TestsDir@/data" && export DATADIR + +Default settings (JSON): + + $ $PBINDEXDUMP $DATADIR/polymerase/production_hq.hqregion.bam.pbi + { + "fileSections": [ + "BasicData" + ], + "numReads": 1, + "reads": [ + { + "contextFlag": 0, + "fileOffset": 20054016, + "holeNumber": 0, + "qEnd": 7034, + "qStart": 2659, + 
"readQuality": 0.00999999977648258, + "rgId": -898246524 + } + ], + "version": "3.0.1" + } + +JSON indent level(2): + + $ $PBINDEXDUMP --json-indent-level=2 $DATADIR/polymerase/production_hq.hqregion.bam.pbi + { + "fileSections": [ + "BasicData" + ], + "numReads": 1, + "reads": [ + { + "contextFlag": 0, + "fileOffset": 20054016, + "holeNumber": 0, + "qEnd": 7034, + "qStart": 2659, + "readQuality": 0.00999999977648258, + "rgId": -898246524 + } + ], + "version": "3.0.1" + } + +JSON raw: + + $ $PBINDEXDUMP --json-raw $DATADIR/polymerase/production_hq.hqregion.bam.pbi + { + "basicData": { + "ctxtFlag": [ + 0 + ], + "fileOffset": [ + 20054016 + ], + "holeNumber": [ + 0 + ], + "qEnd": [ + 7034 + ], + "qStart": [ + 2659 + ], + "readQual": [ + 0.00999999977648258 + ], + "rgId": [ + -898246524 + ] + }, + "fileSections": [ + "BasicData" + ], + "numReads": 1, + "version": "3.0.1" + } diff --git a/tests/src/cram/pbmerge_aligned_ordering.t.in b/tests/src/cram/pbmerge_aligned_ordering.t.in new file mode 100644 index 0000000..58171bb --- /dev/null +++ b/tests/src/cram/pbmerge_aligned_ordering.t.in @@ -0,0 +1,197 @@ +Setup: + + $ TOOLS_BIN="@PacBioBAM_BinDir@" && export TOOLS_BIN + $ PBMERGE="$TOOLS_BIN/pbmerge" && export PBMERGE + $ BAM2SAM="$TOOLS_BIN/bam2sam" && export BAM2SAM + + $ DATADIR="@PacBioBAM_TestsDir@/data" && export DATADIR + $ INPUT_1="$DATADIR/dataset/bam_mapping_1.bam" && export INPUT_1 + $ INPUT_2="$DATADIR/dataset/bam_mapping_2.bam" && export INPUT_2 + + $ MERGED_BAM="@GeneratedTestDataDir@/aligned_ordering_merged.bam" && export MERGED_BAM + $ MERGED_BAM_PBI="@GeneratedTestDataDir@/aligned_ordering_merged.bam.pbi" && export MERGED_BAM_PBI + +Sanity Check: + + $ $BAM2SAM --header-only $INPUT_1 + @HD\tVN:1.3.1\tSO:coordinate\tpb:3.0.1 (esc) + @SQ\tSN:lambda_NEB3011\tLN:48502\tM5:a1319ff90e994c8190a4fe6569d0822a (esc) + 
@RG\tID:a9a22406c5\tDS:READTYPE=SUBREAD;BINDINGKIT=100356300;SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3;InsertionQV=iq;DeletionQV=dq;SubstitutionQV=sq;MergeQV=mq;SubstitutionTag=st;DeletionTag=dt\tPL:PACBIO\tPU:m140905_042212_sidney_c100564852550000001823085912221377_s1_X0\tSM:c100564852550000001823085912221377 (esc) + @PG\tID:BLASR\tVN:1.3.1.141565\tCL:/home/UNIXHOME/yli/for_the_people/blasr_bam_out/blasr m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.1.bax.h5 lambdaNEB.fa -out tmp.bam -bam -bestn 10 -minMatch 12 -nproc 8 -minSubreadLength 50 -minReadLength 50 -randomSeed 1 -clipping subread (esc) + + $ $BAM2SAM --no-header $INPUT_1 | cut -f 1,3,4 | head -n 10 + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/49050/48_1132\tlambda_NEB3011\t1 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/32328/0_344\tlambda_NEB3011\t676 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/9936_10187\tlambda_NEB3011\t2171 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/10232_10394\tlambda_NEB3011\t2204 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7468_8906\tlambda_NEB3011\t3573 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/5557_7235\tlambda_NEB3011\t4507 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/7285_8657\tlambda_NEB3011\t4508 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/426_1045\tlambda_NEB3011\t4593 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7064_7421\tlambda_NEB3011\t4670 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/0_382\tlambda_NEB3011\t4843 (esc) + + $ $BAM2SAM --header-only $INPUT_2 + @HD\tVN:1.3.1\tSO:coordinate\tpb:3.0.1 (esc) + @SQ\tSN:lambda_NEB3011\tLN:48502\tM5:a1319ff90e994c8190a4fe6569d0822a (esc) + 
@RG\tID:a9a22406c5\tDS:READTYPE=SUBREAD;BINDINGKIT=100356300;SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3;InsertionQV=iq;DeletionQV=dq;SubstitutionQV=sq;MergeQV=mq;SubstitutionTag=st;DeletionTag=dt\tPL:PACBIO\tPU:m140905_042212_sidney_c100564852550000001823085912221377_s1_X0\tSM:c100564852550000001823085912221377 (esc) + @PG\tID:BLASR\tVN:1.3.1.141565\tCL:/home/UNIXHOME/yli/for_the_people/blasr_bam_out/blasr m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.1.bax.h5 lambdaNEB.fa -out tmp.bam -bam -bestn 10 -minMatch 12 -nproc 8 -minSubreadLength 50 -minReadLength 50 -randomSeed 1 -clipping subread (esc) + + $ $BAM2SAM --no-header $INPUT_2 | cut -f 1,3,4 | head -n 10 + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/32328/387_1134\tlambda_NEB3011\t303 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/9936_10187\tlambda_NEB3011\t2171 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/10232_10394\tlambda_NEB3011\t2204 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7468_8906\tlambda_NEB3011\t3573 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/5557_7235\tlambda_NEB3011\t4507 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/7285_8657\tlambda_NEB3011\t4508 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/426_1045\tlambda_NEB3011\t4593 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7064_7421\tlambda_NEB3011\t4670 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/0_382\tlambda_NEB3011\t4843 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/7247/7338_7831\tlambda_NEB3011\t4904 (esc) + +Normal Merge: + + $ $PBMERGE $INPUT_1 $INPUT_2 > $MERGED_BAM + + $ $BAM2SAM --header-only $MERGED_BAM + @HD\tVN:1.3.1\tSO:coordinate\tpb:3.0.1 (esc) + @SQ\tSN:lambda_NEB3011\tLN:48502\tM5:a1319ff90e994c8190a4fe6569d0822a 
(esc) + @RG\tID:a9a22406c5\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;BINDINGKIT=100356300;SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3\tPU:m140905_042212_sidney_c100564852550000001823085912221377_s1_X0\tSM:c100564852550000001823085912221377\tPM:SEQUEL (esc) + @PG\tID:BLASR\tVN:1.3.1.141565\tCL:/home/UNIXHOME/yli/for_the_people/blasr_bam_out/blasr m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.1.bax.h5 lambdaNEB.fa -out tmp.bam -bam -bestn 10 -minMatch 12 -nproc 8 -minSubreadLength 50 -minReadLength 50 -randomSeed 1 -clipping subread (esc) + @PG\tID:pbmerge-@PacBioBAM_VERSION@\tPN:pbmerge\tVN:@PacBioBAM_VERSION@ (esc) + + $ $BAM2SAM --no-header $MERGED_BAM | cut -f 1,3,4 | head -n 20 + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/49050/48_1132\tlambda_NEB3011\t1 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/32328/387_1134\tlambda_NEB3011\t303 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/32328/0_344\tlambda_NEB3011\t676 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/9936_10187\tlambda_NEB3011\t2171 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/9936_10187\tlambda_NEB3011\t2171 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/10232_10394\tlambda_NEB3011\t2204 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/10232_10394\tlambda_NEB3011\t2204 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7468_8906\tlambda_NEB3011\t3573 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7468_8906\tlambda_NEB3011\t3573 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/5557_7235\tlambda_NEB3011\t4507 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/5557_7235\tlambda_NEB3011\t4507 (esc) + 
m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/7285_8657\tlambda_NEB3011\t4508 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/7285_8657\tlambda_NEB3011\t4508 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/426_1045\tlambda_NEB3011\t4593 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/426_1045\tlambda_NEB3011\t4593 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7064_7421\tlambda_NEB3011\t4670 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7064_7421\tlambda_NEB3011\t4670 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/0_382\tlambda_NEB3011\t4843 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/0_382\tlambda_NEB3011\t4843 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/7247/7338_7831\tlambda_NEB3011\t4904 (esc) + + $ rm $MERGED_BAM + +Shuffle Input: + + $ $PBMERGE $INPUT_2 $INPUT_2 > $MERGED_BAM + + $ $BAM2SAM --header-only $MERGED_BAM + @HD\tVN:1.3.1\tSO:coordinate\tpb:3.0.1 (esc) + @SQ\tSN:lambda_NEB3011\tLN:48502\tM5:a1319ff90e994c8190a4fe6569d0822a (esc) + @RG\tID:a9a22406c5\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;BINDINGKIT=100356300;SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3\tPU:m140905_042212_sidney_c100564852550000001823085912221377_s1_X0\tSM:c100564852550000001823085912221377\tPM:SEQUEL (esc) + @PG\tID:BLASR\tVN:1.3.1.141565\tCL:/home/UNIXHOME/yli/for_the_people/blasr_bam_out/blasr m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.1.bax.h5 lambdaNEB.fa -out tmp.bam -bam -bestn 10 -minMatch 12 -nproc 8 -minSubreadLength 50 -minReadLength 50 -randomSeed 1 -clipping subread (esc) + @PG\tID:pbmerge-@PacBioBAM_VERSION@\tPN:pbmerge\tVN:@PacBioBAM_VERSION@ (esc) + + $ $BAM2SAM --no-header $MERGED_BAM | cut -f 1,3,4 | head 
-n 20 + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/32328/387_1134\tlambda_NEB3011\t303 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/9936_10187\tlambda_NEB3011\t2171 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/10232_10394\tlambda_NEB3011\t2204 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7468_8906\tlambda_NEB3011\t3573 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/5557_7235\tlambda_NEB3011\t4507 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/7285_8657\tlambda_NEB3011\t4508 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/426_1045\tlambda_NEB3011\t4593 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7064_7421\tlambda_NEB3011\t4670 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/0_382\tlambda_NEB3011\t4843 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/7247/7338_7831\tlambda_NEB3011\t4904 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/7247/7046_7293\tlambda_NEB3011\t5136 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/38025/6255_7894\tlambda_NEB3011\t5427 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/5311_5508\tlambda_NEB3011\t5943 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/36363/899_1197\tlambda_NEB3011\t6258 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/36363/605_853\tlambda_NEB3011\t6312 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/31174/0_1029\tlambda_NEB3011\t6487 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/31174/1075_1271\tlambda_NEB3011\t6499 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/38025/5743_6211\tlambda_NEB3011\t6606 (esc) + 
m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/50257/6944_7361\tlambda_NEB3011\t6942 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/50257/6546_6903\tlambda_NEB3011\t7010 (esc) + + $ rm $MERGED_BAM + +Explicit Output Filename (also enables PBI): + + $ $PBMERGE -o $MERGED_BAM $INPUT_1 $INPUT_2 + + $ $BAM2SAM --header-only $MERGED_BAM + @HD\tVN:1.3.1\tSO:coordinate\tpb:3.0.1 (esc) + @SQ\tSN:lambda_NEB3011\tLN:48502\tM5:a1319ff90e994c8190a4fe6569d0822a (esc) + @RG\tID:a9a22406c5\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;BINDINGKIT=100356300;SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3\tPU:m140905_042212_sidney_c100564852550000001823085912221377_s1_X0\tSM:c100564852550000001823085912221377\tPM:SEQUEL (esc) + @PG\tID:BLASR\tVN:1.3.1.141565\tCL:/home/UNIXHOME/yli/for_the_people/blasr_bam_out/blasr m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.1.bax.h5 lambdaNEB.fa -out tmp.bam -bam -bestn 10 -minMatch 12 -nproc 8 -minSubreadLength 50 -minReadLength 50 -randomSeed 1 -clipping subread (esc) + @PG\tID:pbmerge-@PacBioBAM_VERSION@\tPN:pbmerge\tVN:@PacBioBAM_VERSION@ (esc) + + $ $BAM2SAM --no-header $MERGED_BAM | cut -f 1,3,4 | head -n 20 + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/49050/48_1132\tlambda_NEB3011\t1 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/32328/387_1134\tlambda_NEB3011\t303 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/32328/0_344\tlambda_NEB3011\t676 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/9936_10187\tlambda_NEB3011\t2171 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/9936_10187\tlambda_NEB3011\t2171 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/10232_10394\tlambda_NEB3011\t2204 (esc) + 
m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/10232_10394\tlambda_NEB3011\t2204 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7468_8906\tlambda_NEB3011\t3573 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7468_8906\tlambda_NEB3011\t3573 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/5557_7235\tlambda_NEB3011\t4507 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/5557_7235\tlambda_NEB3011\t4507 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/7285_8657\tlambda_NEB3011\t4508 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/7285_8657\tlambda_NEB3011\t4508 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/426_1045\tlambda_NEB3011\t4593 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/426_1045\tlambda_NEB3011\t4593 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7064_7421\tlambda_NEB3011\t4670 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7064_7421\tlambda_NEB3011\t4670 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/0_382\tlambda_NEB3011\t4843 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/0_382\tlambda_NEB3011\t4843 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/7247/7338_7831\tlambda_NEB3011\t4904 (esc) + + $ [ -f $MERGED_BAM_PBI ] && echo "Found" || echo "Not found" + Found + + $ rm $MERGED_BAM + $ rm $MERGED_BAM_PBI + +Explicit Output Filename (with disabled PBI): + + $ $PBMERGE -o $MERGED_BAM --no-pbi $INPUT_1 $INPUT_2 + + $ $BAM2SAM --header-only $MERGED_BAM + @HD\tVN:1.3.1\tSO:coordinate\tpb:3.0.1 (esc) + @SQ\tSN:lambda_NEB3011\tLN:48502\tM5:a1319ff90e994c8190a4fe6569d0822a (esc) + 
@RG\tID:a9a22406c5\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;BINDINGKIT=100356300;SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3\tPU:m140905_042212_sidney_c100564852550000001823085912221377_s1_X0\tSM:c100564852550000001823085912221377\tPM:SEQUEL (esc) + @PG\tID:BLASR\tVN:1.3.1.141565\tCL:/home/UNIXHOME/yli/for_the_people/blasr_bam_out/blasr m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.1.bax.h5 lambdaNEB.fa -out tmp.bam -bam -bestn 10 -minMatch 12 -nproc 8 -minSubreadLength 50 -minReadLength 50 -randomSeed 1 -clipping subread (esc) + @PG\tID:pbmerge-@PacBioBAM_VERSION@\tPN:pbmerge\tVN:@PacBioBAM_VERSION@ (esc) + + $ $BAM2SAM --no-header $MERGED_BAM | cut -f 1,3,4 | head -n 20 + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/49050/48_1132\tlambda_NEB3011\t1 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/32328/387_1134\tlambda_NEB3011\t303 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/32328/0_344\tlambda_NEB3011\t676 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/9936_10187\tlambda_NEB3011\t2171 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/9936_10187\tlambda_NEB3011\t2171 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/10232_10394\tlambda_NEB3011\t2204 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/10232_10394\tlambda_NEB3011\t2204 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7468_8906\tlambda_NEB3011\t3573 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7468_8906\tlambda_NEB3011\t3573 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/5557_7235\tlambda_NEB3011\t4507 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/5557_7235\tlambda_NEB3011\t4507 (esc) + 
m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/7285_8657\tlambda_NEB3011\t4508 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/7285_8657\tlambda_NEB3011\t4508 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/426_1045\tlambda_NEB3011\t4593 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/426_1045\tlambda_NEB3011\t4593 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7064_7421\tlambda_NEB3011\t4670 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7064_7421\tlambda_NEB3011\t4670 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/0_382\tlambda_NEB3011\t4843 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/0_382\tlambda_NEB3011\t4843 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/7247/7338_7831\tlambda_NEB3011\t4904 (esc) + + $ [ -f $MERGED_BAM_PBI ] && echo "Found" || echo "Not found" + Not found + + $ rm $MERGED_BAM diff --git a/tests/src/cram/pbmerge_dataset.t.in b/tests/src/cram/pbmerge_dataset.t.in new file mode 100644 index 0000000..1c7cb7a --- /dev/null +++ b/tests/src/cram/pbmerge_dataset.t.in @@ -0,0 +1,144 @@ +Setup: + + $ TOOLS_BIN="@PacBioBAM_BinDir@" && export TOOLS_BIN + $ PBMERGE="$TOOLS_BIN/pbmerge" && export PBMERGE + $ BAM2SAM="$TOOLS_BIN/bam2sam" && export BAM2SAM + + $ DATADIR="@PacBioBAM_TestsDir@/data" && export DATADIR + $ INPUT_XML="$DATADIR/polymerase/consolidate.subread.dataset.xml" && export INPUT_XML + $ BAM_1="$DATADIR/polymerase/production.subreads.bam" && export BAM_1 + $ BAM_2="$DATADIR/polymerase/production.scraps.bam" && export BAM_2 + + $ MERGED_BAM="@GeneratedTestDataDir@/merged.bam" && export MERGED_BAM + $ MERGED_BAM_PBI="@GeneratedTestDataDir@/merged.bam.pbi" && export MERGED_BAM_PBI + +Sanity Check: + + $ $BAM2SAM --no-header $BAM_1 | cut -f 1 + ArminsFakeMovie/0/2659_3025 + 
ArminsFakeMovie/0/3116_3628 + ArminsFakeMovie/0/3722_4267 + ArminsFakeMovie/0/4356_4864 + ArminsFakeMovie/0/4960_5477 + ArminsFakeMovie/0/5571_6087 + ArminsFakeMovie/0/6199_6719 + ArminsFakeMovie/0/6812_7034 + + $ $BAM2SAM --no-header $BAM_2 | cut -f 1 + ArminsFakeMovie/0/0_2659 + ArminsFakeMovie/0/3025_3047 + ArminsFakeMovie/0/3047_3095 + ArminsFakeMovie/0/3095_3116 + ArminsFakeMovie/0/3628_3650 + ArminsFakeMovie/0/3650_3700 + ArminsFakeMovie/0/3700_3722 + ArminsFakeMovie/0/4267_4289 + ArminsFakeMovie/0/4289_4335 + ArminsFakeMovie/0/4335_4356 + ArminsFakeMovie/0/4864_4888 + ArminsFakeMovie/0/4888_4939 + ArminsFakeMovie/0/4939_4960 + ArminsFakeMovie/0/5477_5498 + ArminsFakeMovie/0/5498_5546 + ArminsFakeMovie/0/5546_5571 + ArminsFakeMovie/0/6087_6116 + ArminsFakeMovie/0/6116_6173 + ArminsFakeMovie/0/6173_6199 + ArminsFakeMovie/0/6719_6740 + ArminsFakeMovie/0/6740_6790 + ArminsFakeMovie/0/6790_6812 + ArminsFakeMovie/0/7034_7035 + +Normal Merge from XML: + + $ $PBMERGE -o $MERGED_BAM $INPUT_XML + + $ [ -f $MERGED_BAM ] && echo "Found" || echo "Not found" + Found + + $ [ -f $MERGED_BAM_PBI ] && echo "Found" || echo "Not found" + Found + + $ $BAM2SAM --header-only $MERGED_BAM + @HD\tVN:1.1\tSO:unknown\tpb:3.0.1 (esc) + @RG\tID:8aaede36\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:CodecV1=ip;BINDINGKIT=FakeBindKit;SEQUENCINGKIT=FakeSeqKit;BASECALLERVERSION=0.2.0;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\tPM:SEQUEL (esc) + @RG\tID:e83fc9c6\tPL:PACBIO\tDS:READTYPE=SCRAP;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:CodecV1=ip;BINDINGKIT=FakeBindKit;SEQUENCINGKIT=FakeSeqKit;BASECALLERVERSION=0.2.0;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\tPM:SEQUEL (esc) + @PG\tID:BAZ_FORMAT\tVN:0.3.0 (esc) + @PG\tID:PPA-BAZ2BAM\tVN:0.1.0 (esc) + @PG\tID:PPA-BAZWRITER\tVN:0.2.0 (esc) + @PG\tID:pbmerge-@PacBioBAM_VERSION@\tPN:pbmerge\tVN:@PacBioBAM_VERSION@ 
(esc) + + $ $BAM2SAM --no-header $MERGED_BAM | cut -f 1 + ArminsFakeMovie/0/4267_4289 + ArminsFakeMovie/0/4289_4335 + ArminsFakeMovie/0/4335_4356 + ArminsFakeMovie/0/4356_4864 + ArminsFakeMovie/0/4864_4888 + ArminsFakeMovie/0/4888_4939 + ArminsFakeMovie/0/4939_4960 + ArminsFakeMovie/0/4960_5477 + + $ rm $MERGED_BAM + $ rm $MERGED_BAM_PBI + +Normal Merge from XML (disabled PBI): + + $ $PBMERGE --no-pbi -o $MERGED_BAM $INPUT_XML + + $ [ -f $MERGED_BAM ] && echo "Found" || echo "Not found" + Found + + $ [ -f $MERGED_BAM_PBI ] && echo "Found" || echo "Not found" + Not found + + $ $BAM2SAM --header-only $MERGED_BAM + @HD\tVN:1.1\tSO:unknown\tpb:3.0.1 (esc) + @RG\tID:8aaede36\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:CodecV1=ip;BINDINGKIT=FakeBindKit;SEQUENCINGKIT=FakeSeqKit;BASECALLERVERSION=0.2.0;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\tPM:SEQUEL (esc) + @RG\tID:e83fc9c6\tPL:PACBIO\tDS:READTYPE=SCRAP;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:CodecV1=ip;BINDINGKIT=FakeBindKit;SEQUENCINGKIT=FakeSeqKit;BASECALLERVERSION=0.2.0;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\tPM:SEQUEL (esc) + @PG\tID:BAZ_FORMAT\tVN:0.3.0 (esc) + @PG\tID:PPA-BAZ2BAM\tVN:0.1.0 (esc) + @PG\tID:PPA-BAZWRITER\tVN:0.2.0 (esc) + @PG\tID:pbmerge-@PacBioBAM_VERSION@\tPN:pbmerge\tVN:@PacBioBAM_VERSION@ (esc) + + $ $BAM2SAM --no-header $MERGED_BAM | cut -f 1 + ArminsFakeMovie/0/4267_4289 + ArminsFakeMovie/0/4289_4335 + ArminsFakeMovie/0/4335_4356 + ArminsFakeMovie/0/4356_4864 + ArminsFakeMovie/0/4864_4888 + ArminsFakeMovie/0/4888_4939 + ArminsFakeMovie/0/4939_4960 + ArminsFakeMovie/0/4960_5477 + + $ rm $MERGED_BAM + +Write to stdout: + + $ $PBMERGE --no-pbi $INPUT_XML > $MERGED_BAM + + $ [ -f $MERGED_BAM ] && echo "Found" || echo "Not found" + Found + + $ [ -f $MERGED_BAM_PBI ] && echo "Found" || echo "Not found" + Not found + + $ $BAM2SAM --header-only $MERGED_BAM 
+ @HD\tVN:1.1\tSO:unknown\tpb:3.0.1 (esc) + @RG\tID:8aaede36\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:CodecV1=ip;BINDINGKIT=FakeBindKit;SEQUENCINGKIT=FakeSeqKit;BASECALLERVERSION=0.2.0;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\tPM:SEQUEL (esc) + @RG\tID:e83fc9c6\tPL:PACBIO\tDS:READTYPE=SCRAP;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:CodecV1=ip;BINDINGKIT=FakeBindKit;SEQUENCINGKIT=FakeSeqKit;BASECALLERVERSION=0.2.0;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\tPM:SEQUEL (esc) + @PG\tID:BAZ_FORMAT\tVN:0.3.0 (esc) + @PG\tID:PPA-BAZ2BAM\tVN:0.1.0 (esc) + @PG\tID:PPA-BAZWRITER\tVN:0.2.0 (esc) + @PG\tID:pbmerge-@PacBioBAM_VERSION@\tPN:pbmerge\tVN:@PacBioBAM_VERSION@ (esc) + + $ $BAM2SAM --no-header $MERGED_BAM | cut -f 1 + ArminsFakeMovie/0/4267_4289 + ArminsFakeMovie/0/4289_4335 + ArminsFakeMovie/0/4335_4356 + ArminsFakeMovie/0/4356_4864 + ArminsFakeMovie/0/4864_4888 + ArminsFakeMovie/0/4888_4939 + ArminsFakeMovie/0/4939_4960 + ArminsFakeMovie/0/4960_5477 + + $ rm $MERGED_BAM diff --git a/tests/src/cram/pbmerge_fofn.t.in b/tests/src/cram/pbmerge_fofn.t.in new file mode 100644 index 0000000..34e9af6 --- /dev/null +++ b/tests/src/cram/pbmerge_fofn.t.in @@ -0,0 +1,120 @@ +Setup: + + $ TOOLS_BIN="@PacBioBAM_BinDir@" && export TOOLS_BIN + $ PBMERGE="$TOOLS_BIN/pbmerge" && export PBMERGE + $ BAM2SAM="$TOOLS_BIN/bam2sam" && export BAM2SAM + + $ DATADIR="@PacBioBAM_TestsDir@/data" && export DATADIR + $ INPUT_FOFN="$DATADIR/merge.fofn" && export INPUT_FOFN + $ INPUT_1="$DATADIR/aligned.bam" && export INPUT_1 + $ INPUT_2="$DATADIR/aligned2.bam" && export INPUT_2 + + $ MERGED_BAM="@GeneratedTestDataDir@/aligned_ordering_merged.bam" && export MERGED_BAM + $ MERGED_BAM_PBI="@GeneratedTestDataDir@/aligned_ordering_merged.bam.pbi" && export MERGED_BAM_PBI + +Sanity Check: + + $ $BAM2SAM --header-only $INPUT_1 + 
@HD\tVN:1.3.1\tSO:coordinate\tpb:3.0.3 (esc) + @SQ\tSN:lambda_NEB3011\tLN:48502\tM5:a1319ff90e994c8190a4fe6569d0822a (esc) + @RG\tID:0d7b28fa\tPL:PACBIO\tDS:READTYPE=SUBREAD;BINDINGKIT=100-619-300;SEQUENCINGKIT=100-619-400;BASECALLERVERSION=3.0;FRAMERATEHZ=100\tPU:singleInsertion\tPM:SEQUEL (esc) + @PG\tID:bwa\tPN:bwa\tVN:0.7.10-r1017-dirty\tCL:bwa mem lambdaNEB.fa singleInsertion.fasta (esc) + + $ $BAM2SAM --no-header $INPUT_1 | cut -f 1,3,4 | head -n 10 + singleInsertion/100/0_49\tlambda_NEB3011\t5211 (esc) + singleInsertion/200/0_49\tlambda_NEB3011\t5211 (esc) + singleInsertion/100/0_111\tlambda_NEB3011\t9378 (esc) + singleInsertion/100/0_111\tlambda_NEB3011\t9378 (esc) + + $ $BAM2SAM --header-only $INPUT_2 + @HD\tVN:1.3.1\tSO:coordinate\tpb:3.0.3 (esc) + @SQ\tSN:lambda_NEB3011\tLN:48502\tM5:a1319ff90e994c8190a4fe6569d0822a (esc) + @RG\tID:b89a4406\tDS:READTYPE=SUBREAD;BINDINGKIT=100356300;SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3;FRAMERATEHZ=100;InsertionQV=iq;DeletionQV=dq;SubstitutionQV=sq;MergeQV=mq;SubstitutionTag=st;DeletionTag=dt\tPL:PACBIO\tPU:m140905_042212_sidney_c100564852550000001823085912221377_s1_X0\tSM:c100564852550000001823085912221377\tPM:SEQUEL (esc) + @PG\tID:BLASR\tVN:1.3.1.141565\tCL:/home/UNIXHOME/yli/for_the_people/blasr_bam_out/blasr m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.1.bax.h5 lambdaNEB.fa -out tmp.bam -bam -bestn 10 -minMatch 12 -nproc 8 -minSubreadLength 50 -minReadLength 50 -randomSeed 1 -clipping subread (esc) + + $ $BAM2SAM --no-header $INPUT_2 | cut -f 1,3,4 | head -n 10 + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/49050/48_1132\tlambda_NEB3011\t1 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/32328/387_1134\tlambda_NEB3011\t303 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/32328/0_344\tlambda_NEB3011\t676 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/9936_10187\tlambda_NEB3011\t2171 (esc) + 
m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/10232_10394\tlambda_NEB3011\t2204 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7468_8906\tlambda_NEB3011\t3573 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/5557_7235\tlambda_NEB3011\t4507 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/7285_8657\tlambda_NEB3011\t4508 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/426_1045\tlambda_NEB3011\t4593 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7064_7421\tlambda_NEB3011\t4670 (esc) + +Normal Merge from FOFN: + + $ $PBMERGE -o $MERGED_BAM $INPUT_FOFN + + $ [ -f $MERGED_BAM ] && echo "Found" || echo "Not found" + Found + + $ [ -f $MERGED_BAM_PBI ] && echo "Found" || echo "Not found" + Found + + $ $BAM2SAM --header-only $MERGED_BAM + @HD\tVN:1.3.1\tSO:coordinate\tpb:3.0.3 (esc) + @SQ\tSN:lambda_NEB3011\tLN:48502\tM5:a1319ff90e994c8190a4fe6569d0822a (esc) + @RG\tID:0d7b28fa\tPL:PACBIO\tDS:READTYPE=SUBREAD;BINDINGKIT=100-619-300;SEQUENCINGKIT=100-619-400;BASECALLERVERSION=3.0;FRAMERATEHZ=100\tPU:singleInsertion\tPM:SEQUEL (esc) + @RG\tID:b89a4406\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;BINDINGKIT=100356300;SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3;FRAMERATEHZ=100\tPU:m140905_042212_sidney_c100564852550000001823085912221377_s1_X0\tSM:c100564852550000001823085912221377\tPM:SEQUEL (esc) + @PG\tID:BLASR\tVN:1.3.1.141565\tCL:/home/UNIXHOME/yli/for_the_people/blasr_bam_out/blasr m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.1.bax.h5 lambdaNEB.fa -out tmp.bam -bam -bestn 10 -minMatch 12 -nproc 8 -minSubreadLength 50 -minReadLength 50 -randomSeed 1 -clipping subread (esc) + @PG\tID:bwa\tPN:bwa\tVN:0.7.10-r1017-dirty\tCL:bwa mem lambdaNEB.fa singleInsertion.fasta (esc) + 
@PG\tID:pbmerge-@PacBioBAM_VERSION@\tPN:pbmerge\tVN:@PacBioBAM_VERSION@ (esc) + + $ $BAM2SAM --no-header $MERGED_BAM | cut -f 1,3,4 | head -n 20 + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/49050/48_1132\tlambda_NEB3011\t1 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/32328/387_1134\tlambda_NEB3011\t303 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/32328/0_344\tlambda_NEB3011\t676 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/9936_10187\tlambda_NEB3011\t2171 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/10232_10394\tlambda_NEB3011\t2204 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7468_8906\tlambda_NEB3011\t3573 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/5557_7235\tlambda_NEB3011\t4507 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/7285_8657\tlambda_NEB3011\t4508 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/426_1045\tlambda_NEB3011\t4593 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7064_7421\tlambda_NEB3011\t4670 (esc) + singleInsertion/100/0_49\tlambda_NEB3011\t5211 (esc) + singleInsertion/200/0_49\tlambda_NEB3011\t5211 (esc) + singleInsertion/100/0_111\tlambda_NEB3011\t9378 (esc) + singleInsertion/100/0_111\tlambda_NEB3011\t9378 (esc) + + $ rm $MERGED_BAM + $ rm $MERGED_BAM_PBI + +Normal Merge from FOFN (disabled PBI): + + $ $PBMERGE --no-pbi -o $MERGED_BAM $INPUT_FOFN + + $ [ -f $MERGED_BAM ] && echo "Found" || echo "Not found" + Found + + $ [ -f $MERGED_BAM_PBI ] && echo "Found" || echo "Not found" + Not found + + $ $BAM2SAM --header-only $MERGED_BAM + @HD\tVN:1.3.1\tSO:coordinate\tpb:3.0.3 (esc) + @SQ\tSN:lambda_NEB3011\tLN:48502\tM5:a1319ff90e994c8190a4fe6569d0822a (esc) + 
@RG\tID:0d7b28fa\tPL:PACBIO\tDS:READTYPE=SUBREAD;BINDINGKIT=100-619-300;SEQUENCINGKIT=100-619-400;BASECALLERVERSION=3.0;FRAMERATEHZ=100\tPU:singleInsertion\tPM:SEQUEL (esc) + @RG\tID:b89a4406\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;BINDINGKIT=100356300;SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3;FRAMERATEHZ=100\tPU:m140905_042212_sidney_c100564852550000001823085912221377_s1_X0\tSM:c100564852550000001823085912221377\tPM:SEQUEL (esc) + @PG\tID:BLASR\tVN:1.3.1.141565\tCL:/home/UNIXHOME/yli/for_the_people/blasr_bam_out/blasr m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.1.bax.h5 lambdaNEB.fa -out tmp.bam -bam -bestn 10 -minMatch 12 -nproc 8 -minSubreadLength 50 -minReadLength 50 -randomSeed 1 -clipping subread (esc) + @PG\tID:bwa\tPN:bwa\tVN:0.7.10-r1017-dirty\tCL:bwa mem lambdaNEB.fa singleInsertion.fasta (esc) + @PG\tID:pbmerge-@PacBioBAM_VERSION@\tPN:pbmerge\tVN:@PacBioBAM_VERSION@ (esc) + + $ $BAM2SAM --no-header $MERGED_BAM | cut -f 1,3,4 | head -n 20 + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/49050/48_1132\tlambda_NEB3011\t1 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/32328/387_1134\tlambda_NEB3011\t303 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/32328/0_344\tlambda_NEB3011\t676 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/9936_10187\tlambda_NEB3011\t2171 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/10232_10394\tlambda_NEB3011\t2204 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7468_8906\tlambda_NEB3011\t3573 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/5557_7235\tlambda_NEB3011\t4507 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/7285_8657\tlambda_NEB3011\t4508 (esc) + 
m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/426_1045\tlambda_NEB3011\t4593 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7064_7421\tlambda_NEB3011\t4670 (esc) + singleInsertion/100/0_49\tlambda_NEB3011\t5211 (esc) + singleInsertion/200/0_49\tlambda_NEB3011\t5211 (esc) + singleInsertion/100/0_111\tlambda_NEB3011\t9378 (esc) + singleInsertion/100/0_111\tlambda_NEB3011\t9378 (esc) + + $ rm $MERGED_BAM diff --git a/tests/src/cram/pbmerge_mixed_ordering.t.in b/tests/src/cram/pbmerge_mixed_ordering.t.in new file mode 100644 index 0000000..6f1f3f9 --- /dev/null +++ b/tests/src/cram/pbmerge_mixed_ordering.t.in @@ -0,0 +1,57 @@ +Setup: + + $ TOOLS_BIN="@PacBioBAM_BinDir@" && export TOOLS_BIN + $ PBMERGE="$TOOLS_BIN/pbmerge" && export PBMERGE + $ BAM2SAM="$TOOLS_BIN/bam2sam" && export BAM2SAM + + $ DATADIR="@PacBioBAM_TestsDir@/data" && export DATADIR + $ UNALIGNED_BAM="$DATADIR/polymerase/internal.hqregions.bam" && export UNALIGNED_BAM + $ ALIGNED_BAM="$DATADIR/dataset/bam_mapping_1.bam" && export ALIGNED_BAM + + $ MERGED_BAM="@GeneratedTestDataDir@/mixed_ordering_merged.bam" && export MERGED_BAM + +Sanity Check: + + $ $BAM2SAM --header-only $UNALIGNED_BAM + @HD\tVN:1.1\tSO:unknown\tpb:3.0.1 (esc) + @RG\tID:ca75d884\tPL:PACBIO\tDS:READTYPE=HQREGION;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:Frames=ip;PulseWidth:Frames=pw;PkMid=pm;PkMean=pa;LabelQV=pq;AltLabel=pt;AltLabelQV=pv;PulseMergeQV=pg;PulseCall=pc;PrePulseFrames=pd;PulseCallWidth=px;BINDINGKIT=100372700;SEQUENCINGKIT=100356200;BASECALLERVERSION=0.1;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie (esc) + @PG\tID:baz2bam-0.15.0\tPN:baz2bam\tVN:0.15.0 (esc) + @PG\tID:bazFormat-0.3.0\tPN:bazFormat\tVN:0.3.0 (esc) + @PG\tID:bazwriter-0.15.0\tPN:bazwriter\tVN:0.15.0 (esc) + + $ $BAM2SAM --no-header $UNALIGNED_BAM | cut -f 1 + ArminsFakeMovie/100000/2659_7034 + + $ $BAM2SAM --header-only $ALIGNED_BAM + 
@HD\tVN:1.3.1\tSO:coordinate\tpb:3.0.1 (esc) + @SQ\tSN:lambda_NEB3011\tLN:48502\tM5:a1319ff90e994c8190a4fe6569d0822a (esc) + @RG\tID:a9a22406c5\tDS:READTYPE=SUBREAD;BINDINGKIT=100356300;SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3;InsertionQV=iq;DeletionQV=dq;SubstitutionQV=sq;MergeQV=mq;SubstitutionTag=st;DeletionTag=dt\tPL:PACBIO\tPU:m140905_042212_sidney_c100564852550000001823085912221377_s1_X0\tSM:c100564852550000001823085912221377 (esc) + @PG\tID:BLASR\tVN:1.3.1.141565\tCL:/home/UNIXHOME/yli/for_the_people/blasr_bam_out/blasr m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.1.bax.h5 lambdaNEB.fa -out tmp.bam -bam -bestn 10 -minMatch 12 -nproc 8 -minSubreadLength 50 -minReadLength 50 -randomSeed 1 -clipping subread (esc) + + $ $BAM2SAM --no-header $ALIGNED_BAM | cut -f 1,3,4 | head -n 10 + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/49050/48_1132\tlambda_NEB3011\t1 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/32328/0_344\tlambda_NEB3011\t676 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/9936_10187\tlambda_NEB3011\t2171 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/10232_10394\tlambda_NEB3011\t2204 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7468_8906\tlambda_NEB3011\t3573 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/5557_7235\tlambda_NEB3011\t4507 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/7285_8657\tlambda_NEB3011\t4508 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/426_1045\tlambda_NEB3011\t4593 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7064_7421\tlambda_NEB3011\t4670 (esc) + m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/0_382\tlambda_NEB3011\t4843 (esc) + +Normal Merge - should fail: + + $ $PBMERGE $UNALIGNED_BAM $ALIGNED_BAM > $MERGED_BAM + ERROR: BAM file 
sort orders do not match, aborting merge + [1] + +Shuffle Input - should fail: + + $ $PBMERGE $ALIGNED_BAM $UNALIGNED_BAM > $MERGED_BAM + ERROR: BAM file sort orders do not match, aborting merge + [1] + +Cleanup: + + $ rm $MERGED_BAM diff --git a/tests/src/cram/pbmerge_pacbio_ordering.t.in b/tests/src/cram/pbmerge_pacbio_ordering.t.in new file mode 100644 index 0000000..f52759f --- /dev/null +++ b/tests/src/cram/pbmerge_pacbio_ordering.t.in @@ -0,0 +1,457 @@ +Setup: + + $ TOOLS_BIN="@PacBioBAM_BinDir@" && export TOOLS_BIN + $ PBMERGE="$TOOLS_BIN/pbmerge" && export PBMERGE + $ BAM2SAM="$TOOLS_BIN/bam2sam" && export BAM2SAM + + $ DATADIR="@PacBioBAM_TestsDir@/data" && export DATADIR + $ HQREGION_BAM="$DATADIR/polymerase/internal.hqregions.bam" && export HQREGION_BAM + $ SCRAPS_BAM="$DATADIR/polymerase/internal.scraps.bam" && export SCRAPS_BAM + + $ MERGED_BAM="@GeneratedTestDataDir@/pacbio_ordering_merged.bam" && export MERGED_BAM + $ MERGED_BAM_PBI="@GeneratedTestDataDir@/pacbio_ordering_merged.bam.pbi" && export MERGED_BAM_PBI + +Sanity Check: + + $ $BAM2SAM --header-only $HQREGION_BAM + @HD\tVN:1.1\tSO:unknown\tpb:3.0.1 (esc) + @RG\tID:ca75d884\tPL:PACBIO\tDS:READTYPE=HQREGION;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:Frames=ip;PulseWidth:Frames=pw;PkMid=pm;PkMean=pa;LabelQV=pq;AltLabel=pt;AltLabelQV=pv;PulseMergeQV=pg;PulseCall=pc;PrePulseFrames=pd;PulseCallWidth=px;BINDINGKIT=100372700;SEQUENCINGKIT=100356200;BASECALLERVERSION=0.1;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie (esc) + @PG\tID:baz2bam-0.15.0\tPN:baz2bam\tVN:0.15.0 (esc) + @PG\tID:bazFormat-0.3.0\tPN:bazFormat\tVN:0.3.0 (esc) + @PG\tID:bazwriter-0.15.0\tPN:bazwriter\tVN:0.15.0 (esc) + + $ $BAM2SAM --no-header $HQREGION_BAM | cut -f 1 + ArminsFakeMovie/100000/2659_7034 + + $ $BAM2SAM --header-only $SCRAPS_BAM + @HD\tVN:1.1\tSO:unknown\tpb:3.0.1 (esc) + 
@RG\tID:e83fc9c6\tPL:PACBIO\tDS:READTYPE=SCRAP;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:Frames=ip;PulseWidth:Frames=pw;PkMid=pm;PkMean=pa;LabelQV=pq;AltLabel=pt;AltLabelQV=pv;PulseMergeQV=pg;PulseCall=pc;PrePulseFrames=pd;PulseCallWidth=px;BINDINGKIT=100372700;SEQUENCINGKIT=100356200;BASECALLERVERSION=0.1;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie (esc) + @PG\tID:baz2bam-0.15.0\tPN:baz2bam\tVN:0.15.0 (esc) + @PG\tID:bazFormat-0.3.0\tPN:bazFormat\tVN:0.3.0 (esc) + @PG\tID:bazwriter-0.15.0\tPN:bazwriter\tVN:0.15.0 (esc) + + $ $BAM2SAM --no-header $SCRAPS_BAM | cut -f 1 + ArminsFakeMovie/100000/0_2659 + ArminsFakeMovie/100000/3025_3047 + ArminsFakeMovie/100000/3047_3095 + ArminsFakeMovie/100000/3095_3116 + ArminsFakeMovie/100000/3628_3650 + ArminsFakeMovie/100000/3650_3700 + ArminsFakeMovie/100000/3700_3722 + ArminsFakeMovie/100000/4267_4289 + ArminsFakeMovie/100000/4289_4335 + ArminsFakeMovie/100000/4335_4356 + ArminsFakeMovie/100000/4864_4888 + ArminsFakeMovie/100000/4888_4939 + ArminsFakeMovie/100000/4939_4960 + ArminsFakeMovie/100000/5477_5498 + ArminsFakeMovie/100000/5498_5546 + ArminsFakeMovie/100000/5546_5571 + ArminsFakeMovie/100000/6087_6116 + ArminsFakeMovie/100000/6116_6173 + ArminsFakeMovie/100000/6173_6199 + ArminsFakeMovie/100000/6719_6740 + ArminsFakeMovie/100000/6740_6790 + ArminsFakeMovie/100000/6790_6812 + ArminsFakeMovie/100000/7034_7035 + ArminsFakeMovie/200000/0_2659 + ArminsFakeMovie/200000/3025_3047 + ArminsFakeMovie/200000/3047_3095 + ArminsFakeMovie/200000/3095_3116 + ArminsFakeMovie/200000/3628_3650 + ArminsFakeMovie/200000/3650_3700 + ArminsFakeMovie/200000/3700_3722 + ArminsFakeMovie/200000/4267_4289 + ArminsFakeMovie/200000/4289_4335 + ArminsFakeMovie/200000/4335_4356 + ArminsFakeMovie/200000/4864_4888 + ArminsFakeMovie/200000/4888_4939 + ArminsFakeMovie/200000/4939_4960 + ArminsFakeMovie/200000/5477_5498 + ArminsFakeMovie/200000/5498_5546 + ArminsFakeMovie/200000/5546_5571 + 
ArminsFakeMovie/200000/6087_6116 + ArminsFakeMovie/200000/6116_6173 + ArminsFakeMovie/200000/6173_6199 + ArminsFakeMovie/200000/6719_6740 + ArminsFakeMovie/200000/6740_6790 + ArminsFakeMovie/200000/6790_6812 + ArminsFakeMovie/200000/7034_7035 + ArminsFakeMovie/300000/0_2659 + ArminsFakeMovie/300000/3025_3047 + ArminsFakeMovie/300000/3047_3095 + ArminsFakeMovie/300000/3095_3116 + ArminsFakeMovie/300000/3628_3650 + ArminsFakeMovie/300000/3650_3700 + ArminsFakeMovie/300000/3700_3722 + ArminsFakeMovie/300000/4267_4289 + ArminsFakeMovie/300000/4289_4335 + ArminsFakeMovie/300000/4335_4356 + ArminsFakeMovie/300000/4864_4888 + ArminsFakeMovie/300000/4888_4939 + ArminsFakeMovie/300000/4939_4960 + ArminsFakeMovie/300000/5477_5498 + ArminsFakeMovie/300000/5498_5546 + ArminsFakeMovie/300000/5546_5571 + ArminsFakeMovie/300000/6087_6116 + ArminsFakeMovie/300000/6116_6173 + ArminsFakeMovie/300000/6173_6199 + ArminsFakeMovie/300000/6719_6740 + ArminsFakeMovie/300000/6740_6790 + ArminsFakeMovie/300000/6790_6812 + ArminsFakeMovie/300000/7034_7035 + +Normal Merge: + + $ $PBMERGE $HQREGION_BAM $SCRAPS_BAM > $MERGED_BAM + + $ $BAM2SAM --header-only $MERGED_BAM + @HD\tVN:1.1\tSO:unknown\tpb:3.0.1 (esc) + @RG\tID:ca75d884\tPL:PACBIO\tDS:READTYPE=HQREGION;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:Frames=ip;PulseWidth:Frames=pw;PkMid=pm;PkMean=pa;LabelQV=pq;AltLabel=pt;AltLabelQV=pv;PulseMergeQV=pg;PulseCall=pc;PrePulseFrames=pd;PulseCallWidth=px;BINDINGKIT=100372700;SEQUENCINGKIT=100356200;BASECALLERVERSION=0.1;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\tPM:SEQUEL (esc) + 
@RG\tID:e83fc9c6\tPL:PACBIO\tDS:READTYPE=SCRAP;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:Frames=ip;PulseWidth:Frames=pw;PkMid=pm;PkMean=pa;LabelQV=pq;AltLabel=pt;AltLabelQV=pv;PulseMergeQV=pg;PulseCall=pc;PrePulseFrames=pd;PulseCallWidth=px;BINDINGKIT=100372700;SEQUENCINGKIT=100356200;BASECALLERVERSION=0.1;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\tPM:SEQUEL (esc) + @PG\tID:baz2bam-0.15.0\tPN:baz2bam\tVN:0.15.0 (esc) + @PG\tID:bazFormat-0.3.0\tPN:bazFormat\tVN:0.3.0 (esc) + @PG\tID:bazwriter-0.15.0\tPN:bazwriter\tVN:0.15.0 (esc) + @PG\tID:pbmerge-@PacBioBAM_VERSION@\tPN:pbmerge\tVN:@PacBioBAM_VERSION@ (esc) + + $ $BAM2SAM --no-header $MERGED_BAM | cut -f 1 + ArminsFakeMovie/100000/0_2659 + ArminsFakeMovie/100000/2659_7034 + ArminsFakeMovie/100000/3025_3047 + ArminsFakeMovie/100000/3047_3095 + ArminsFakeMovie/100000/3095_3116 + ArminsFakeMovie/100000/3628_3650 + ArminsFakeMovie/100000/3650_3700 + ArminsFakeMovie/100000/3700_3722 + ArminsFakeMovie/100000/4267_4289 + ArminsFakeMovie/100000/4289_4335 + ArminsFakeMovie/100000/4335_4356 + ArminsFakeMovie/100000/4864_4888 + ArminsFakeMovie/100000/4888_4939 + ArminsFakeMovie/100000/4939_4960 + ArminsFakeMovie/100000/5477_5498 + ArminsFakeMovie/100000/5498_5546 + ArminsFakeMovie/100000/5546_5571 + ArminsFakeMovie/100000/6087_6116 + ArminsFakeMovie/100000/6116_6173 + ArminsFakeMovie/100000/6173_6199 + ArminsFakeMovie/100000/6719_6740 + ArminsFakeMovie/100000/6740_6790 + ArminsFakeMovie/100000/6790_6812 + ArminsFakeMovie/100000/7034_7035 + ArminsFakeMovie/200000/0_2659 + ArminsFakeMovie/200000/3025_3047 + ArminsFakeMovie/200000/3047_3095 + ArminsFakeMovie/200000/3095_3116 + ArminsFakeMovie/200000/3628_3650 + ArminsFakeMovie/200000/3650_3700 + ArminsFakeMovie/200000/3700_3722 + ArminsFakeMovie/200000/4267_4289 + ArminsFakeMovie/200000/4289_4335 + ArminsFakeMovie/200000/4335_4356 + ArminsFakeMovie/200000/4864_4888 + ArminsFakeMovie/200000/4888_4939 + 
ArminsFakeMovie/200000/4939_4960 + ArminsFakeMovie/200000/5477_5498 + ArminsFakeMovie/200000/5498_5546 + ArminsFakeMovie/200000/5546_5571 + ArminsFakeMovie/200000/6087_6116 + ArminsFakeMovie/200000/6116_6173 + ArminsFakeMovie/200000/6173_6199 + ArminsFakeMovie/200000/6719_6740 + ArminsFakeMovie/200000/6740_6790 + ArminsFakeMovie/200000/6790_6812 + ArminsFakeMovie/200000/7034_7035 + ArminsFakeMovie/300000/0_2659 + ArminsFakeMovie/300000/3025_3047 + ArminsFakeMovie/300000/3047_3095 + ArminsFakeMovie/300000/3095_3116 + ArminsFakeMovie/300000/3628_3650 + ArminsFakeMovie/300000/3650_3700 + ArminsFakeMovie/300000/3700_3722 + ArminsFakeMovie/300000/4267_4289 + ArminsFakeMovie/300000/4289_4335 + ArminsFakeMovie/300000/4335_4356 + ArminsFakeMovie/300000/4864_4888 + ArminsFakeMovie/300000/4888_4939 + ArminsFakeMovie/300000/4939_4960 + ArminsFakeMovie/300000/5477_5498 + ArminsFakeMovie/300000/5498_5546 + ArminsFakeMovie/300000/5546_5571 + ArminsFakeMovie/300000/6087_6116 + ArminsFakeMovie/300000/6116_6173 + ArminsFakeMovie/300000/6173_6199 + ArminsFakeMovie/300000/6719_6740 + ArminsFakeMovie/300000/6740_6790 + ArminsFakeMovie/300000/6790_6812 + ArminsFakeMovie/300000/7034_7035 + + $ rm $MERGED_BAM + +Shuffle Input: + + $ $PBMERGE $SCRAPS_BAM $HQREGION_BAM > $MERGED_BAM + + $ $BAM2SAM --header-only $MERGED_BAM + @HD\tVN:1.1\tSO:unknown\tpb:3.0.1 (esc) + @RG\tID:ca75d884\tPL:PACBIO\tDS:READTYPE=HQREGION;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:Frames=ip;PulseWidth:Frames=pw;PkMid=pm;PkMean=pa;LabelQV=pq;AltLabel=pt;AltLabelQV=pv;PulseMergeQV=pg;PulseCall=pc;PrePulseFrames=pd;PulseCallWidth=px;BINDINGKIT=100372700;SEQUENCINGKIT=100356200;BASECALLERVERSION=0.1;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\tPM:SEQUEL (esc) + 
@RG\tID:e83fc9c6\tPL:PACBIO\tDS:READTYPE=SCRAP;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:Frames=ip;PulseWidth:Frames=pw;PkMid=pm;PkMean=pa;LabelQV=pq;AltLabel=pt;AltLabelQV=pv;PulseMergeQV=pg;PulseCall=pc;PrePulseFrames=pd;PulseCallWidth=px;BINDINGKIT=100372700;SEQUENCINGKIT=100356200;BASECALLERVERSION=0.1;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\tPM:SEQUEL (esc) + @PG\tID:baz2bam-0.15.0\tPN:baz2bam\tVN:0.15.0 (esc) + @PG\tID:bazFormat-0.3.0\tPN:bazFormat\tVN:0.3.0 (esc) + @PG\tID:bazwriter-0.15.0\tPN:bazwriter\tVN:0.15.0 (esc) + @PG\tID:pbmerge-@PacBioBAM_VERSION@\tPN:pbmerge\tVN:@PacBioBAM_VERSION@ (esc) + + $ $BAM2SAM --no-header $MERGED_BAM | cut -f 1 + ArminsFakeMovie/100000/0_2659 + ArminsFakeMovie/100000/2659_7034 + ArminsFakeMovie/100000/3025_3047 + ArminsFakeMovie/100000/3047_3095 + ArminsFakeMovie/100000/3095_3116 + ArminsFakeMovie/100000/3628_3650 + ArminsFakeMovie/100000/3650_3700 + ArminsFakeMovie/100000/3700_3722 + ArminsFakeMovie/100000/4267_4289 + ArminsFakeMovie/100000/4289_4335 + ArminsFakeMovie/100000/4335_4356 + ArminsFakeMovie/100000/4864_4888 + ArminsFakeMovie/100000/4888_4939 + ArminsFakeMovie/100000/4939_4960 + ArminsFakeMovie/100000/5477_5498 + ArminsFakeMovie/100000/5498_5546 + ArminsFakeMovie/100000/5546_5571 + ArminsFakeMovie/100000/6087_6116 + ArminsFakeMovie/100000/6116_6173 + ArminsFakeMovie/100000/6173_6199 + ArminsFakeMovie/100000/6719_6740 + ArminsFakeMovie/100000/6740_6790 + ArminsFakeMovie/100000/6790_6812 + ArminsFakeMovie/100000/7034_7035 + ArminsFakeMovie/200000/0_2659 + ArminsFakeMovie/200000/3025_3047 + ArminsFakeMovie/200000/3047_3095 + ArminsFakeMovie/200000/3095_3116 + ArminsFakeMovie/200000/3628_3650 + ArminsFakeMovie/200000/3650_3700 + ArminsFakeMovie/200000/3700_3722 + ArminsFakeMovie/200000/4267_4289 + ArminsFakeMovie/200000/4289_4335 + ArminsFakeMovie/200000/4335_4356 + ArminsFakeMovie/200000/4864_4888 + ArminsFakeMovie/200000/4888_4939 + 
ArminsFakeMovie/200000/4939_4960 + ArminsFakeMovie/200000/5477_5498 + ArminsFakeMovie/200000/5498_5546 + ArminsFakeMovie/200000/5546_5571 + ArminsFakeMovie/200000/6087_6116 + ArminsFakeMovie/200000/6116_6173 + ArminsFakeMovie/200000/6173_6199 + ArminsFakeMovie/200000/6719_6740 + ArminsFakeMovie/200000/6740_6790 + ArminsFakeMovie/200000/6790_6812 + ArminsFakeMovie/200000/7034_7035 + ArminsFakeMovie/300000/0_2659 + ArminsFakeMovie/300000/3025_3047 + ArminsFakeMovie/300000/3047_3095 + ArminsFakeMovie/300000/3095_3116 + ArminsFakeMovie/300000/3628_3650 + ArminsFakeMovie/300000/3650_3700 + ArminsFakeMovie/300000/3700_3722 + ArminsFakeMovie/300000/4267_4289 + ArminsFakeMovie/300000/4289_4335 + ArminsFakeMovie/300000/4335_4356 + ArminsFakeMovie/300000/4864_4888 + ArminsFakeMovie/300000/4888_4939 + ArminsFakeMovie/300000/4939_4960 + ArminsFakeMovie/300000/5477_5498 + ArminsFakeMovie/300000/5498_5546 + ArminsFakeMovie/300000/5546_5571 + ArminsFakeMovie/300000/6087_6116 + ArminsFakeMovie/300000/6116_6173 + ArminsFakeMovie/300000/6173_6199 + ArminsFakeMovie/300000/6719_6740 + ArminsFakeMovie/300000/6740_6790 + ArminsFakeMovie/300000/6790_6812 + ArminsFakeMovie/300000/7034_7035 + + $ rm $MERGED_BAM + +Explicit Output Filename (also enables PBI): + + $ $PBMERGE -o $MERGED_BAM $HQREGION_BAM $SCRAPS_BAM + + $ $BAM2SAM --header-only $MERGED_BAM + @HD\tVN:1.1\tSO:unknown\tpb:3.0.1 (esc) + @RG\tID:ca75d884\tPL:PACBIO\tDS:READTYPE=HQREGION;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:Frames=ip;PulseWidth:Frames=pw;PkMid=pm;PkMean=pa;LabelQV=pq;AltLabel=pt;AltLabelQV=pv;PulseMergeQV=pg;PulseCall=pc;PrePulseFrames=pd;PulseCallWidth=px;BINDINGKIT=100372700;SEQUENCINGKIT=100356200;BASECALLERVERSION=0.1;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\tPM:SEQUEL (esc) + 
@RG\tID:e83fc9c6\tPL:PACBIO\tDS:READTYPE=SCRAP;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:Frames=ip;PulseWidth:Frames=pw;PkMid=pm;PkMean=pa;LabelQV=pq;AltLabel=pt;AltLabelQV=pv;PulseMergeQV=pg;PulseCall=pc;PrePulseFrames=pd;PulseCallWidth=px;BINDINGKIT=100372700;SEQUENCINGKIT=100356200;BASECALLERVERSION=0.1;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\tPM:SEQUEL (esc) + @PG\tID:baz2bam-0.15.0\tPN:baz2bam\tVN:0.15.0 (esc) + @PG\tID:bazFormat-0.3.0\tPN:bazFormat\tVN:0.3.0 (esc) + @PG\tID:bazwriter-0.15.0\tPN:bazwriter\tVN:0.15.0 (esc) + @PG\tID:pbmerge-@PacBioBAM_VERSION@\tPN:pbmerge\tVN:@PacBioBAM_VERSION@ (esc) + + $ $BAM2SAM --no-header $MERGED_BAM | cut -f 1 + ArminsFakeMovie/100000/0_2659 + ArminsFakeMovie/100000/2659_7034 + ArminsFakeMovie/100000/3025_3047 + ArminsFakeMovie/100000/3047_3095 + ArminsFakeMovie/100000/3095_3116 + ArminsFakeMovie/100000/3628_3650 + ArminsFakeMovie/100000/3650_3700 + ArminsFakeMovie/100000/3700_3722 + ArminsFakeMovie/100000/4267_4289 + ArminsFakeMovie/100000/4289_4335 + ArminsFakeMovie/100000/4335_4356 + ArminsFakeMovie/100000/4864_4888 + ArminsFakeMovie/100000/4888_4939 + ArminsFakeMovie/100000/4939_4960 + ArminsFakeMovie/100000/5477_5498 + ArminsFakeMovie/100000/5498_5546 + ArminsFakeMovie/100000/5546_5571 + ArminsFakeMovie/100000/6087_6116 + ArminsFakeMovie/100000/6116_6173 + ArminsFakeMovie/100000/6173_6199 + ArminsFakeMovie/100000/6719_6740 + ArminsFakeMovie/100000/6740_6790 + ArminsFakeMovie/100000/6790_6812 + ArminsFakeMovie/100000/7034_7035 + ArminsFakeMovie/200000/0_2659 + ArminsFakeMovie/200000/3025_3047 + ArminsFakeMovie/200000/3047_3095 + ArminsFakeMovie/200000/3095_3116 + ArminsFakeMovie/200000/3628_3650 + ArminsFakeMovie/200000/3650_3700 + ArminsFakeMovie/200000/3700_3722 + ArminsFakeMovie/200000/4267_4289 + ArminsFakeMovie/200000/4289_4335 + ArminsFakeMovie/200000/4335_4356 + ArminsFakeMovie/200000/4864_4888 + ArminsFakeMovie/200000/4888_4939 + 
ArminsFakeMovie/200000/4939_4960 + ArminsFakeMovie/200000/5477_5498 + ArminsFakeMovie/200000/5498_5546 + ArminsFakeMovie/200000/5546_5571 + ArminsFakeMovie/200000/6087_6116 + ArminsFakeMovie/200000/6116_6173 + ArminsFakeMovie/200000/6173_6199 + ArminsFakeMovie/200000/6719_6740 + ArminsFakeMovie/200000/6740_6790 + ArminsFakeMovie/200000/6790_6812 + ArminsFakeMovie/200000/7034_7035 + ArminsFakeMovie/300000/0_2659 + ArminsFakeMovie/300000/3025_3047 + ArminsFakeMovie/300000/3047_3095 + ArminsFakeMovie/300000/3095_3116 + ArminsFakeMovie/300000/3628_3650 + ArminsFakeMovie/300000/3650_3700 + ArminsFakeMovie/300000/3700_3722 + ArminsFakeMovie/300000/4267_4289 + ArminsFakeMovie/300000/4289_4335 + ArminsFakeMovie/300000/4335_4356 + ArminsFakeMovie/300000/4864_4888 + ArminsFakeMovie/300000/4888_4939 + ArminsFakeMovie/300000/4939_4960 + ArminsFakeMovie/300000/5477_5498 + ArminsFakeMovie/300000/5498_5546 + ArminsFakeMovie/300000/5546_5571 + ArminsFakeMovie/300000/6087_6116 + ArminsFakeMovie/300000/6116_6173 + ArminsFakeMovie/300000/6173_6199 + ArminsFakeMovie/300000/6719_6740 + ArminsFakeMovie/300000/6740_6790 + ArminsFakeMovie/300000/6790_6812 + ArminsFakeMovie/300000/7034_7035 + + $ [ -f $MERGED_BAM_PBI ] && echo "Found" || echo "Not found" + Found + + $ rm $MERGED_BAM + $ rm $MERGED_BAM_PBI + +Explicit Output Filename (with disabled PBI): + + $ $PBMERGE -o $MERGED_BAM --no-pbi $HQREGION_BAM $SCRAPS_BAM + + $ $BAM2SAM --header-only $MERGED_BAM + @HD\tVN:1.1\tSO:unknown\tpb:3.0.1 (esc) + @RG\tID:ca75d884\tPL:PACBIO\tDS:READTYPE=HQREGION;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:Frames=ip;PulseWidth:Frames=pw;PkMid=pm;PkMean=pa;LabelQV=pq;AltLabel=pt;AltLabelQV=pv;PulseMergeQV=pg;PulseCall=pc;PrePulseFrames=pd;PulseCallWidth=px;BINDINGKIT=100372700;SEQUENCINGKIT=100356200;BASECALLERVERSION=0.1;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\tPM:SEQUEL (esc) + 
@RG\tID:e83fc9c6\tPL:PACBIO\tDS:READTYPE=SCRAP;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:Frames=ip;PulseWidth:Frames=pw;PkMid=pm;PkMean=pa;LabelQV=pq;AltLabel=pt;AltLabelQV=pv;PulseMergeQV=pg;PulseCall=pc;PrePulseFrames=pd;PulseCallWidth=px;BINDINGKIT=100372700;SEQUENCINGKIT=100356200;BASECALLERVERSION=0.1;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\tPM:SEQUEL (esc) + @PG\tID:baz2bam-0.15.0\tPN:baz2bam\tVN:0.15.0 (esc) + @PG\tID:bazFormat-0.3.0\tPN:bazFormat\tVN:0.3.0 (esc) + @PG\tID:bazwriter-0.15.0\tPN:bazwriter\tVN:0.15.0 (esc) + @PG\tID:pbmerge-@PacBioBAM_VERSION@\tPN:pbmerge\tVN:@PacBioBAM_VERSION@ (esc) + + $ $BAM2SAM --no-header $MERGED_BAM | cut -f 1 + ArminsFakeMovie/100000/0_2659 + ArminsFakeMovie/100000/2659_7034 + ArminsFakeMovie/100000/3025_3047 + ArminsFakeMovie/100000/3047_3095 + ArminsFakeMovie/100000/3095_3116 + ArminsFakeMovie/100000/3628_3650 + ArminsFakeMovie/100000/3650_3700 + ArminsFakeMovie/100000/3700_3722 + ArminsFakeMovie/100000/4267_4289 + ArminsFakeMovie/100000/4289_4335 + ArminsFakeMovie/100000/4335_4356 + ArminsFakeMovie/100000/4864_4888 + ArminsFakeMovie/100000/4888_4939 + ArminsFakeMovie/100000/4939_4960 + ArminsFakeMovie/100000/5477_5498 + ArminsFakeMovie/100000/5498_5546 + ArminsFakeMovie/100000/5546_5571 + ArminsFakeMovie/100000/6087_6116 + ArminsFakeMovie/100000/6116_6173 + ArminsFakeMovie/100000/6173_6199 + ArminsFakeMovie/100000/6719_6740 + ArminsFakeMovie/100000/6740_6790 + ArminsFakeMovie/100000/6790_6812 + ArminsFakeMovie/100000/7034_7035 + ArminsFakeMovie/200000/0_2659 + ArminsFakeMovie/200000/3025_3047 + ArminsFakeMovie/200000/3047_3095 + ArminsFakeMovie/200000/3095_3116 + ArminsFakeMovie/200000/3628_3650 + ArminsFakeMovie/200000/3650_3700 + ArminsFakeMovie/200000/3700_3722 + ArminsFakeMovie/200000/4267_4289 + ArminsFakeMovie/200000/4289_4335 + ArminsFakeMovie/200000/4335_4356 + ArminsFakeMovie/200000/4864_4888 + ArminsFakeMovie/200000/4888_4939 + 
ArminsFakeMovie/200000/4939_4960 + ArminsFakeMovie/200000/5477_5498 + ArminsFakeMovie/200000/5498_5546 + ArminsFakeMovie/200000/5546_5571 + ArminsFakeMovie/200000/6087_6116 + ArminsFakeMovie/200000/6116_6173 + ArminsFakeMovie/200000/6173_6199 + ArminsFakeMovie/200000/6719_6740 + ArminsFakeMovie/200000/6740_6790 + ArminsFakeMovie/200000/6790_6812 + ArminsFakeMovie/200000/7034_7035 + ArminsFakeMovie/300000/0_2659 + ArminsFakeMovie/300000/3025_3047 + ArminsFakeMovie/300000/3047_3095 + ArminsFakeMovie/300000/3095_3116 + ArminsFakeMovie/300000/3628_3650 + ArminsFakeMovie/300000/3650_3700 + ArminsFakeMovie/300000/3700_3722 + ArminsFakeMovie/300000/4267_4289 + ArminsFakeMovie/300000/4289_4335 + ArminsFakeMovie/300000/4335_4356 + ArminsFakeMovie/300000/4864_4888 + ArminsFakeMovie/300000/4888_4939 + ArminsFakeMovie/300000/4939_4960 + ArminsFakeMovie/300000/5477_5498 + ArminsFakeMovie/300000/5498_5546 + ArminsFakeMovie/300000/5546_5571 + ArminsFakeMovie/300000/6087_6116 + ArminsFakeMovie/300000/6116_6173 + ArminsFakeMovie/300000/6173_6199 + ArminsFakeMovie/300000/6719_6740 + ArminsFakeMovie/300000/6740_6790 + ArminsFakeMovie/300000/6790_6812 + ArminsFakeMovie/300000/7034_7035 + + $ [ -f $MERGED_BAM_PBI ] && echo "Found" || echo "Not found" + Not found + + $ rm $MERGED_BAM diff --git a/tests/src/python/check_swig.py b/tests/src/python/check_swig.py new file mode 100755 index 0000000..9a17f7e --- /dev/null +++ b/tests/src/python/check_swig.py @@ -0,0 +1,44 @@ +# Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +# +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted (subject to the limitations in the +# disclaimer below) provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# +# * Neither the name of Pacific Biosciences nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# +# Author: Derek Barnett + +try: + import PacBioBam as bam + header = bam.BamHeader() + print "\nPython wrapper OK.\n" +except ImportError: + print "\nPython wrapper failed!\n" + diff --git a/tests/src/python/test/__init__.py b/tests/src/python/test/__init__.py new file mode 100755 index 0000000..fb74df1 --- /dev/null +++ b/tests/src/python/test/__init__.py @@ -0,0 +1,39 @@ +# Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +# +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted (subject to the limitations in the +# disclaimer below) provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# +# * Neither the name of Pacific Biosciences nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# +# Author: Derek Barnett + +# EMPTY ON PURPOSE. +# This file just needs to exist for unit test discovery. 
diff --git a/tests/src/python/test/config.py.in b/tests/src/python/test/config.py.in new file mode 100644 index 0000000..c2b900e --- /dev/null +++ b/tests/src/python/test/config.py.in @@ -0,0 +1,43 @@ +# Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +# +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted (subject to the limitations in the +# disclaimer below) provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# +# * Neither the name of Pacific Biosciences nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. 
+# +# Author: Derek Barnett + +class TestData(object): + def __init__(self): + + # Main test data directory + self.directory = "@PacBioBAM_TestsDir@/data" + \ No newline at end of file diff --git a/tests/src/python/test/test_Accuracy.py b/tests/src/python/test/test_Accuracy.py new file mode 100755 index 0000000..a8b2112 --- /dev/null +++ b/tests/src/python/test/test_Accuracy.py @@ -0,0 +1,65 @@ +# Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +# +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted (subject to the limitations in the +# disclaimer below) provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# +# * Neither the name of Pacific Biosciences nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# +# Author: Derek Barnett + +import PacBioBam +import config +import unittest + +class AccuracyTest(unittest.TestCase): + + # ------------ SETUP -------------- + + def runTest(self): + self.test_clamp() + + # ------------ TESTS -------------- + + def test_clamp(self): + a_zero = PacBioBam.Accuracy(0.0) + a_neg = PacBioBam.Accuracy(-0.5) + a_min = PacBioBam.Accuracy(0.0) + a_normal = PacBioBam.Accuracy(0.9) + a_max = PacBioBam.Accuracy(1.0) + a_tooLarge = PacBioBam.Accuracy(1.1) + + self.assertAlmostEqual(float(0.0), float(a_zero)) + self.assertAlmostEqual(float(0.0), float(a_neg)) + self.assertAlmostEqual(float(0.0), float(a_min)) + self.assertAlmostEqual(float(0.9), float(a_normal)) + self.assertAlmostEqual(float(1.0), float(a_max)) + self.assertAlmostEqual(float(1.0), float(a_tooLarge)) + \ No newline at end of file diff --git a/tests/src/python/test/test_BamFile.py b/tests/src/python/test/test_BamFile.py new file mode 100755 index 0000000..26062c6 --- /dev/null +++ b/tests/src/python/test/test_BamFile.py @@ -0,0 +1,62 @@ +# Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +# +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted (subject to the limitations in the +# disclaimer below) provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# +# * Neither the name of Pacific Biosciences nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. 
+# +# Author: Derek Barnett + +import PacBioBam +import config +import unittest + +class BamFileTest(unittest.TestCase): + + # ------------ SETUP -------------- + + def setUp(self): + self.data = config.TestData() + self.bamFn = self.data.directory + "/aligned.bam" + + def runTest(self): + self.test_ctor() + self.test_nonExistentFile() + + # ------------ TESTS -------------- + + def test_ctor(self): + f = PacBioBam.BamFile(self.bamFn) + + def test_nonExistentFile(self): + with self.assertRaises(RuntimeError): + f = PacBioBam.BamFile("non_existent_file.bam") + diff --git a/tests/src/python/test/test_BamHeader.py b/tests/src/python/test/test_BamHeader.py new file mode 100755 index 0000000..065eee3 --- /dev/null +++ b/tests/src/python/test/test_BamHeader.py @@ -0,0 +1,157 @@ +# Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +# +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted (subject to the limitations in the +# disclaimer below) provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# +# * Neither the name of Pacific Biosciences nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# +# Author: Derek Barnett + +import PacBioBam +import config +import unittest + +class BamHeaderTest(unittest.TestCase): + + # ------------ SETUP -------------- + + def runTest(self): + self.test_defaultCtor() + self.test_decode() + self.test_encode() + + # ------------ TESTS -------------- + + def test_defaultCtor(self): + + header = PacBioBam.BamHeader() + + self.assertFalse(header.Version()) + self.assertFalse(header.SortOrder()) + self.assertEqual(0, len(header.ReadGroups())) + self.assertEqual(0, len(header.Sequences())) + self.assertEqual(0, len(header.Programs())) + self.assertEqual(0, len(header.Comments())) + + with self.assertRaises(RuntimeError): + pg = header.Program("foo") + rg = header.ReadGroup("foo") + sq = header.SequenceId("foo") + sl = header.SequenceLength(42) + sn = header.SequenceName(42) + + + def test_decode(self): + + text = ("@HD\tVN:1.1\tSO:queryname\tpb:3.0.1\n" + "@SQ\tSN:chr1\tLN:2038\tSP:chocobo\n" + "@SQ\tSN:chr2\tLN:3042\tSP:chocobo\n" + "@RG\tID:rg1\tSM:control\n" + "@RG\tID:rg2\tSM:condition1\n" + "@RG\tID:rg3\tSM:condition1\n" + "@PG\tID:_foo_\tPN:ide\n" + "@CO\tipsum and so on\n" + "@CO\tcitation needed\n") + + header = PacBioBam.BamHeader(text) + + self.assertEqual("1.1", header.Version()) + self.assertEqual("queryname", header.SortOrder()) + self.assertEqual("3.0.1", header.PacBioBamVersion()) + + self.assertEqual(3, len(header.ReadGroups())) + self.assertTrue(header.HasReadGroup("rg1")) + 
self.assertTrue(header.HasReadGroup("rg2")) + self.assertTrue(header.HasReadGroup("rg3")) + self.assertEqual("control", header.ReadGroup("rg1").Sample()) + self.assertEqual("condition1", header.ReadGroup("rg2").Sample()) + self.assertEqual("condition1", header.ReadGroup("rg3").Sample()) + + self.assertEqual(2, len(header.Sequences())) + self.assertTrue(header.HasSequence("chr1")) + self.assertTrue(header.HasSequence("chr2")) + self.assertEqual("chocobo", header.Sequence("chr1").Species()) + self.assertEqual("chocobo", header.Sequence("chr2").Species()) + self.assertEqual("2038", header.Sequence("chr1").Length()) + self.assertEqual("3042", header.Sequence("chr2").Length()) + + self.assertEqual(1, len(header.Programs())) + self.assertTrue(header.HasProgram("_foo_")) + self.assertEqual("ide", header.Program("_foo_").Name()) + + self.assertEqual(2, len(header.Comments())) + self.assertEqual("ipsum and so on", header.Comments()[0]) + self.assertEqual("citation needed", header.Comments()[1]) + + def test_encode(self): + + expectedText = ("@HD\tVN:1.1\tSO:queryname\tpb:3.0.1\n" + "@SQ\tSN:chr1\tLN:2038\tSP:chocobo\n" + "@SQ\tSN:chr2\tLN:3042\tSP:chocobo\n" + "@RG\tID:rg1\tPL:PACBIO\tDS:READTYPE=UNKNOWN\tSM:control\tPM:SEQUEL\n" + "@RG\tID:rg2\tPL:PACBIO\tDS:READTYPE=UNKNOWN\tSM:condition1\tPM:SEQUEL\n" + "@RG\tID:rg3\tPL:PACBIO\tDS:READTYPE=UNKNOWN\tSM:condition1\tPM:SEQUEL\n" + "@PG\tID:_foo_\tPN:ide\n" + "@CO\tipsum and so on\n" + "@CO\tcitation needed\n") + + rg1 = PacBioBam.ReadGroupInfo("rg1") + rg1.Sample("control") + rg2 = PacBioBam.ReadGroupInfo("rg2") + rg2.Sample("condition1") + rg3 = PacBioBam.ReadGroupInfo("rg3") + rg3.Sample("condition1") + + seq1 = PacBioBam.SequenceInfo("chr1") + seq1.Length("2038") + seq1.Species("chocobo") + seq2 = PacBioBam.SequenceInfo("chr2") + seq2.Length("3042") + seq2.Species("chocobo") + + prog1 = PacBioBam.ProgramInfo("_foo_") + prog1.Name("ide") + + header = PacBioBam.BamHeader() + header.Version("1.1") + 
header.SortOrder("queryname") + header.PacBioBamVersion("3.0.1") + header.AddReadGroup(rg1) + header.AddReadGroup(rg2) + header.AddReadGroup(rg3) + header.AddSequence(seq1) + header.AddSequence(seq2) + header.AddProgram(prog1) + header.AddComment("ipsum and so on") + header.AddComment("citation needed") + + self.assertEqual(expectedText, header.ToSam()) + diff --git a/tests/src/python/test/test_Cigar.py b/tests/src/python/test/test_Cigar.py new file mode 100755 index 0000000..7b3f1df --- /dev/null +++ b/tests/src/python/test/test_Cigar.py @@ -0,0 +1,214 @@ +# Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +# +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted (subject to the limitations in the +# disclaimer below) provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# +# * Neither the name of Pacific Biosciences nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# +# Author: Derek Barnett + +import PacBioBam +import config +import unittest + +class CigarTest(unittest.TestCase): + + # ------------ SETUP -------------- + + def runTest(self): + self.test_typeToChar() + self.test_charToType() + self.test_setType() + self.test_setChar() + self.test_cigarOpCtors() + self.test_fromEmptyString() + self.test_fromString() + self.test_toEmptyString() + self.test_toString() + + # ------------ TESTS -------------- + + def test_typeToChar(self): + self.assertEqual('M', PacBioBam.CigarOperation.TypeToChar(PacBioBam.ALIGNMENT_MATCH)) + self.assertEqual('I', PacBioBam.CigarOperation.TypeToChar(PacBioBam.INSERTION)) + self.assertEqual('D', PacBioBam.CigarOperation.TypeToChar(PacBioBam.DELETION)) + self.assertEqual('N', PacBioBam.CigarOperation.TypeToChar(PacBioBam.REFERENCE_SKIP)) + self.assertEqual('S', PacBioBam.CigarOperation.TypeToChar(PacBioBam.SOFT_CLIP)) + self.assertEqual('H', PacBioBam.CigarOperation.TypeToChar(PacBioBam.HARD_CLIP)) + self.assertEqual('P', PacBioBam.CigarOperation.TypeToChar(PacBioBam.PADDING)) + self.assertEqual('=', PacBioBam.CigarOperation.TypeToChar(PacBioBam.SEQUENCE_MATCH)) + self.assertEqual('X', PacBioBam.CigarOperation.TypeToChar(PacBioBam.SEQUENCE_MISMATCH)) + + def test_charToType(self): + self.assertEqual(PacBioBam.ALIGNMENT_MATCH, PacBioBam.CigarOperation.CharToType('M')) + self.assertEqual(PacBioBam.INSERTION, PacBioBam.CigarOperation.CharToType('I')) + 
self.assertEqual(PacBioBam.DELETION, PacBioBam.CigarOperation.CharToType('D')) + self.assertEqual(PacBioBam.REFERENCE_SKIP, PacBioBam.CigarOperation.CharToType('N')) + self.assertEqual(PacBioBam.SOFT_CLIP, PacBioBam.CigarOperation.CharToType('S')) + self.assertEqual(PacBioBam.HARD_CLIP, PacBioBam.CigarOperation.CharToType('H')) + self.assertEqual(PacBioBam.PADDING, PacBioBam.CigarOperation.CharToType('P')) + self.assertEqual(PacBioBam.SEQUENCE_MATCH, PacBioBam.CigarOperation.CharToType('=')) + self.assertEqual(PacBioBam.SEQUENCE_MISMATCH, PacBioBam.CigarOperation.CharToType('X')) + + def test_setType(self): + m = PacBioBam.CigarOperation() + i = PacBioBam.CigarOperation() + d = PacBioBam.CigarOperation() + n = PacBioBam.CigarOperation() + s = PacBioBam.CigarOperation() + h = PacBioBam.CigarOperation() + p = PacBioBam.CigarOperation() + e = PacBioBam.CigarOperation() + x = PacBioBam.CigarOperation() + + m.Type(PacBioBam.ALIGNMENT_MATCH) + i.Type(PacBioBam.INSERTION) + d.Type(PacBioBam.DELETION) + n.Type(PacBioBam.REFERENCE_SKIP) + s.Type(PacBioBam.SOFT_CLIP) + h.Type(PacBioBam.HARD_CLIP) + p.Type(PacBioBam.PADDING) + e.Type(PacBioBam.SEQUENCE_MATCH) + x.Type(PacBioBam.SEQUENCE_MISMATCH) + + self.assertEqual('M', m.Char()) + self.assertEqual('I', i.Char()) + self.assertEqual('D', d.Char()) + self.assertEqual('N', n.Char()) + self.assertEqual('S', s.Char()) + self.assertEqual('H', h.Char()) + self.assertEqual('P', p.Char()) + self.assertEqual('=', e.Char()) + self.assertEqual('X', x.Char()) + + def test_setChar(self): + m = PacBioBam.CigarOperation() + i = PacBioBam.CigarOperation() + d = PacBioBam.CigarOperation() + n = PacBioBam.CigarOperation() + s = PacBioBam.CigarOperation() + h = PacBioBam.CigarOperation() + p = PacBioBam.CigarOperation() + e = PacBioBam.CigarOperation() + x = PacBioBam.CigarOperation() + + m.Char('M') + i.Char('I') + d.Char('D') + n.Char('N') + s.Char('S') + h.Char('H') + p.Char('P') + e.Char('=') + x.Char('X') + + 
self.assertEqual(PacBioBam.ALIGNMENT_MATCH, m.Type()) + self.assertEqual(PacBioBam.INSERTION, i.Type()) + self.assertEqual(PacBioBam.DELETION, d.Type()) + self.assertEqual(PacBioBam.REFERENCE_SKIP, n.Type()) + self.assertEqual(PacBioBam.SOFT_CLIP, s.Type()) + self.assertEqual(PacBioBam.HARD_CLIP, h.Type()) + self.assertEqual(PacBioBam.PADDING, p.Type()) + self.assertEqual(PacBioBam.SEQUENCE_MATCH, e.Type()) + self.assertEqual(PacBioBam.SEQUENCE_MISMATCH, x.Type()) + + def test_cigarOpCtors(self): + c1 = PacBioBam.CigarOperation('S', 10) + c2 = PacBioBam.CigarOperation(PacBioBam.SOFT_CLIP, 10) + + self.assertEqual('S', c1.Char()) + self.assertEqual('S', c2.Char()) + self.assertEqual(PacBioBam.SOFT_CLIP, c1.Type()) + self.assertEqual(PacBioBam.SOFT_CLIP, c2.Type()) + self.assertEqual(10, c1.Length()) + self.assertEqual(10, c2.Length()) + + def test_fromEmptyString(self): + s = "" + cigar = PacBioBam.Cigar(s) + self.assertEqual(0, len(cigar)) + + def test_fromString(self): + singleCigarString = "100=" + multiCigarString = "100=2D34I6=6X6=" + + singleCigar = PacBioBam.Cigar(singleCigarString) + multiCigar = PacBioBam.Cigar(multiCigarString) + + self.assertEqual(1, len(singleCigar)) + c = singleCigar[0] + self.assertEqual('=', c.Char()) + self.assertEqual(100, c.Length()) + + self.assertEqual(6, len(multiCigar)) + op0 = multiCigar[0] + op1 = multiCigar[1] + op2 = multiCigar[2] + op3 = multiCigar[3] + op4 = multiCigar[4] + op5 = multiCigar[5] + + self.assertEqual('=', op0.Char()) + self.assertEqual('D', op1.Char()) + self.assertEqual('I', op2.Char()) + self.assertEqual('=', op3.Char()) + self.assertEqual('X', op4.Char()) + self.assertEqual('=', op5.Char()) + self.assertEqual(100, op0.Length()) + self.assertEqual(2, op1.Length()) + self.assertEqual(34, op2.Length()) + self.assertEqual(6, op3.Length()) + self.assertEqual(6, op4.Length()) + self.assertEqual(6, op5.Length()) + + def test_toEmptyString(self): + cigar = PacBioBam.Cigar() + self.assertFalse(cigar.ToStdString()) 
+ + def test_toString(self): + + singleCigarString = "100=" + multiCigarString = "100=2D34I6=6X6=" + + singleCigar = PacBioBam.Cigar() + singleCigar.append(PacBioBam.CigarOperation(PacBioBam.SEQUENCE_MATCH, 100)) + + multiCigar = PacBioBam.Cigar() + multiCigar.append(PacBioBam.CigarOperation(PacBioBam.SEQUENCE_MATCH, 100)) + multiCigar.append(PacBioBam.CigarOperation(PacBioBam.DELETION, 2)) + multiCigar.append(PacBioBam.CigarOperation(PacBioBam.INSERTION, 34)) + multiCigar.append(PacBioBam.CigarOperation(PacBioBam.SEQUENCE_MATCH, 6)) + multiCigar.append(PacBioBam.CigarOperation(PacBioBam.SEQUENCE_MISMATCH,6)) + multiCigar.append(PacBioBam.CigarOperation(PacBioBam.SEQUENCE_MATCH, 6)) + + self.assertEqual(singleCigarString, singleCigar.ToStdString()) + self.assertEqual(multiCigarString, multiCigar.ToStdString()) + diff --git a/tests/src/python/test/test_EndToEnd.py b/tests/src/python/test/test_EndToEnd.py new file mode 100755 index 0000000..90a76c2 --- /dev/null +++ b/tests/src/python/test/test_EndToEnd.py @@ -0,0 +1,89 @@ +# Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +# +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted (subject to the limitations in the +# disclaimer below) provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# +# * Neither the name of Pacific Biosciences nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +# GRANTED BY THIS LICENSE. 
THIS SOFTWARE IS PROVIDED BY PACIFIC +# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# +# Author: Derek Barnett + +import PacBioBam +import config + +import os +import unittest + +class EndToEndTest(unittest.TestCase): + + def originalNames(self): + # loop over original file, store names, write to generated file + try: + file = PacBioBam.BamFile(self.ex2BamFn) + writer = PacBioBam.BamWriter(self.generatedBamFn, file.Header()) + + dataset = PacBioBam.DataSet(self.ex2BamFn) + entireFile = PacBioBam.EntireFileQuery(dataset) + + names_in = [] + for record in PacBioBam.Iterate(entireFile): + names_in.append(record.FullName()) + writer.Write(record) + return names_in + + except RuntimeError: + self.assertTrue(False) # should not throw + + def generatedNames(self): + try: + # open dataset on generated BAM file, read in names + dataset = PacBioBam.DataSet(self.generatedBamFn) + entireFile = PacBioBam.EntireFileQuery(dataset) + names_out = [] + for record in PacBioBam.Iterate(entireFile): + names_out.append(record.FullName()) + return names_out + + except RuntimeError: + self.assertTrue(False) # should not throw + + def runTest(self): + + self.testData = config.TestData() + self.ex2BamFn = self.testData.directory + "/aligned.bam" + self.generatedBamFn = 
self.testData.directory + "/generated.bam" + + # compare input records to generated copy's records + names_in = self.originalNames() + names_out = self.generatedNames() + self.assertEqual(names_in, names_out) + + # clean up + os.remove(self.generatedBamFn) diff --git a/tests/src/python/test/test_Frames.py b/tests/src/python/test/test_Frames.py new file mode 100755 index 0000000..7855bbb --- /dev/null +++ b/tests/src/python/test/test_Frames.py @@ -0,0 +1,95 @@ +# Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +# +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted (subject to the limitations in the +# disclaimer below) provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# +# * Neither the name of Pacific Biosciences nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# +# Author: Derek Barnett + +import PacBioBam +import config +import unittest + +class FramesTest(unittest.TestCase): + + # ------------ SETUP -------------- + + def setUp(self): + self.testframes = [ + 0, 8, 140, 0, 0, 7, 4, 0, 85, 2, + 1, 3, 2, 10, 1, 20, 47, 10, 9, 60, + 20, 3, 12, 5, 13, 165, 6, 14, 22, 12, + 2, 4, 9, 218, 27, 3, 15, 2, 17, 2, + 45, 24, 89, 10, 7, 1, 11, 15, 0, 7, + 0, 28, 17, 12, 6, 10, 37, 0, 12, 52, + 0, 7, 1, 14, 3, 26, 12, 0, 20, 17, + 2, 13, 2, 9, 13, 7, 15, 29, 3, 6, + 2, 1, 28, 10, 3, 14, 7, 1, 22, 1, + 6, 6, 0, 19, 31, 6, 2, 14, 0, 0, + 1000, 947, 948 + ] + + self.encoded = [ + 0, 8, 102, 0, 0, 7, 4, 0, 75, 2, 1, 3, 2, + 10, 1, 20, 47, 10, 9, 60, 20, 3, 12, 5, 13, 115, + 6, 14, 22, 12, 2, 4, 9, 135, 27, 3, 15, 2, 17, + 2, 45, 24, 77, 10, 7, 1, 11, 15, 0, 7, 0, 28, + 17, 12, 6, 10, 37, 0, 12, 52, 0, 7, 1, 14, 3, + 26, 12, 0, 20, 17, 2, 13, 2, 9, 13, 7, 15, 29, + 3, 6, 2, 1, 28, 10, 3, 14, 7, 1, 22, 1, 6, + 6, 0, 19, 31, 6, 2, 14, 0, 0, + 255, 254, 255 + ] + + def runTest(self): + self.test_ctors() + self.test_encode() + + # ------------ TESTS -------------- + + def test_ctors(self): + f = PacBioBam.Frames() + self.assertEqual(0, len(f.Data())) + + f2 = PacBioBam.Frames(self.testframes) + d = f2.Data() + self.assertEqual(len(self.testframes), len(d)) + for i, v in enumerate(d): + self.assertEqual(int(self.testframes[i]), int(v)) + + def test_encode(self): + f = PacBioBam.Frames(self.testframes) + e = 
f.Encode() + self.assertEqual(len(self.encoded), len(e)) + for i, v in enumerate(e): + self.assertEqual(int(self.encoded[i]), int(v)) + \ No newline at end of file diff --git a/tests/src/python/test/test_Intervals.py b/tests/src/python/test/test_Intervals.py new file mode 100755 index 0000000..9e484eb --- /dev/null +++ b/tests/src/python/test/test_Intervals.py @@ -0,0 +1,348 @@ +# Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +# +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted (subject to the limitations in the +# disclaimer below) provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# +# * Neither the name of Pacific Biosciences nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# +# Author: Derek Barnett + +import PacBioBam +import config +import unittest + +class IntervalsTest(unittest.TestCase): + + # ------------ SETUP -------------- + + def runTest(self): + self.test_unmappedPosition() + self.test_ctors() + self.test_equality() + self.test_copy() + self.test_modifiers() + self.test_cover() + self.test_intersect() + self.test_validity() + self.test_length() + + # ------------ TESTS -------------- + + def test_unmappedPosition(self): + self.assertEqual(-1, PacBioBam.UnmappedPosition) + + def test_ctors(self): + empty = PacBioBam.PositionInterval() + single = PacBioBam.PositionInterval(4) + normal = PacBioBam.PositionInterval(5, 8) + + self.assertEqual(0, empty.Start()) + self.assertEqual(0, empty.Stop()) + self.assertEqual(4, single.Start()) + self.assertEqual(5, single.Stop()) + self.assertEqual(5, normal.Start()) + self.assertEqual(8, normal.Stop()) + + def test_equality(self): + + empty = PacBioBam.PositionInterval() + empty2 = PacBioBam.PositionInterval() + singleton = PacBioBam.PositionInterval(4) + sameAsSingleton = PacBioBam.PositionInterval(4, 5) + normal = PacBioBam.PositionInterval(5, 8) + sameAsNormal = PacBioBam.PositionInterval(5, 8) + different = PacBioBam.PositionInterval(20, 40) + + # self-equality + self.assertEqual(empty, empty) + self.assertEqual(singleton, singleton) + self.assertEqual(normal, normal) + self.assertEqual(different, different) + + # same values + self.assertEqual(empty, 
empty2) + self.assertEqual(singleton, sameAsSingleton) + self.assertEqual(normal, sameAsNormal) + + # different values + self.assertNotEqual(empty, singleton) + self.assertNotEqual(empty, normal) + self.assertNotEqual(empty, different) + self.assertNotEqual(singleton, normal) + self.assertNotEqual(normal, different) + + def test_copy(self): + interval1 = PacBioBam.PositionInterval(5,8) + interval2 = PacBioBam.PositionInterval(interval1) + interval3 = interval1 + + self.assertEqual(interval1, interval1) + self.assertEqual(interval1, interval2) + self.assertEqual(interval1, interval3) + + def test_modifiers(self): + + interval1 = PacBioBam.PositionInterval(5,8) + interval2 = PacBioBam.PositionInterval(interval1) + interval2.Start(2) + interval2.Stop(10) + + self.assertNotEqual(interval1, interval2) + self.assertEqual(2, interval2.Start()) + self.assertEqual(10, interval2.Stop()) + + def test_cover(self): + + a = PacBioBam.PositionInterval(2,4) + b = PacBioBam.PositionInterval(3,5) + c = PacBioBam.PositionInterval(6,8) + d = PacBioBam.PositionInterval(1,7) + e = PacBioBam.PositionInterval(5,8) + + # 0123456789 + # a -- + # b -- + # c -- + # d ------ + # e --- + + # self-cover + self.assertTrue(a.Covers(a)) + self.assertTrue(a.CoveredBy(a)) + + # basic covers/covered + self.assertTrue(b.CoveredBy(d)) + self.assertTrue(d.Covers(b)) + self.assertNotEqual(b, d) + self.assertFalse(b.Covers(d)) + + # completely disjoint + self.assertFalse(b.Covers(c)) + self.assertFalse(c.Covers(b)) + self.assertFalse(b.CoveredBy(c)) + self.assertFalse(c.CoveredBy(b)) + + # b.stop == e.start + self.assertFalse(b.Covers(e)) + self.assertFalse(b.CoveredBy(e)) + + # shared endpoint, start contained + self.assertTrue(e.Covers(c)) + self.assertTrue(c.CoveredBy(e)) + + def test_intersect(self): + + a = PacBioBam.PositionInterval(2,4) + b = PacBioBam.PositionInterval(3,5) + c = PacBioBam.PositionInterval(6,8) + d = PacBioBam.PositionInterval(1,7) + e = PacBioBam.PositionInterval(5,8) + + # 
0123456789 + # a -- + # b -- + # c -- + # d ------ + # e --- + + # self-intersection + self.assertTrue(a.Intersects(a)) + + # intersection is commutative + self.assertTrue(a.Intersects(b)) + self.assertTrue(b.Intersects(a)) + + # covered implies intersection + self.assertTrue(d.Covers(a)) + self.assertTrue(a.Intersects(d)) + self.assertTrue(d.Intersects(a)) + + # c.start > b.stop (obvious disjoint) + self.assertFalse(b.Intersects(c)) + + # b.stop == e.start (intervals are right-open, so disjoint) + self.assertFalse(b.Intersects(e)) + + def test_validity(self): + + a = PacBioBam.PositionInterval() # default ctor + b = PacBioBam.PositionInterval(0,0) # start == stop (zero) + c = PacBioBam.PositionInterval(4,4) # start == stop (nonzero) + d = PacBioBam.PositionInterval(0,1) # start < stop (start is zero) + e = PacBioBam.PositionInterval(4,5) # start < stop (start is nonzero) + f = PacBioBam.PositionInterval(5,4) # start > stop + + self.assertFalse(a.IsValid()) + self.assertFalse(b.IsValid()) + self.assertFalse(c.IsValid()) + self.assertTrue(d.IsValid()) + self.assertTrue(e.IsValid()) + self.assertFalse(f.IsValid()) + + def test_length(self): + + a = PacBioBam.PositionInterval(2,4) + b = PacBioBam.PositionInterval(3,5) + c = PacBioBam.PositionInterval(6,8) + d = PacBioBam.PositionInterval(1,7) + e = PacBioBam.PositionInterval(5,8) + + self.assertEqual(2, a.Length()) + self.assertEqual(2, b.Length()) + self.assertEqual(2, c.Length()) + self.assertEqual(6, d.Length()) + self.assertEqual(3, e.Length()) + +class GenomicIntervalsTest(unittest.TestCase): + + # ------------ SETUP -------------- + + def runTest(self): + self.test_ctors() + self.test_copy() + self.test_modifiers() + self.test_cover() + self.test_validity() + + # ------------ TESTS -------------- + + def test_ctors(self): + + empty = PacBioBam.GenomicInterval() + normal = PacBioBam.GenomicInterval("foo", 100, 200) + + self.assertEqual("", empty.Name()) + self.assertEqual(0, empty.Start()) + self.assertEqual(0, 
empty.Stop()) + + self.assertEqual("foo", normal.Name()) + self.assertEqual(100, normal.Start()) + self.assertEqual(200, normal.Stop()) + + + def test_copy(self): + + a = PacBioBam.GenomicInterval("foo", 10, 20) + b = PacBioBam.GenomicInterval(a) + c = a + + self.assertEqual(a, a) + self.assertEqual(a, b) + self.assertEqual(a, c) + + def test_modifiers(self): + + a = PacBioBam.GenomicInterval("foo", 10, 20) + + b = PacBioBam.GenomicInterval(a) + b.Name("bar").Start(2).Stop(10) + + c = PacBioBam.GenomicInterval(a) + c.Interval(b.Interval()) + + self.assertNotEqual(a, b) + self.assertEqual("bar", b.Name()) + self.assertEqual(2, b.Start()) + self.assertEqual(10, b.Stop()) + self.assertEqual(a.Name(), c.Name()) + self.assertEqual(b.Interval(), c.Interval()) + + def test_cover(self): + + a = PacBioBam.GenomicInterval("foo",2,4) + b = PacBioBam.GenomicInterval("foo",3,5) + c = PacBioBam.GenomicInterval("foo",6,8) + d = PacBioBam.GenomicInterval("foo",1,7) + e = PacBioBam.GenomicInterval("foo",5,8) + f = PacBioBam.GenomicInterval("bar",3,5) # same as b, different ref + + # 0123456789 + # a -- + # b -- + # c -- + # d ------ + # e --- + + # self-cover + self.assertTrue(a.Covers(a)) + self.assertTrue(a.CoveredBy(a)) + + # basic covers/covered + self.assertTrue(b.CoveredBy(d)) + self.assertTrue(d.Covers(b)) + self.assertNotEqual(b, d) + self.assertFalse(b.Covers(d)) + + # same coords as b, but different ref + self.assertFalse(f.CoveredBy(d)) + self.assertFalse(d.Covers(f)) + self.assertNotEqual(f, d) + self.assertFalse(f.Covers(d)) + + # obvious disjoint + self.assertFalse(b.Covers(c)) + self.assertFalse(c.Covers(b)) + self.assertFalse(b.CoveredBy(c)) + self.assertFalse(c.CoveredBy(b)) + + # b.stop == e.start (intervals are right-open, so disjoint) + self.assertFalse(b.Covers(e)) + self.assertFalse(b.CoveredBy(e)) + + # shared endpoint, start contained + self.assertTrue(e.Covers(c)) + self.assertTrue(c.CoveredBy(e)) + + def test_validity(self): + + a = 
PacBioBam.GenomicInterval() # default + b = PacBioBam.GenomicInterval("foo",0,0) # valid id, start == stop (zero) + c = PacBioBam.GenomicInterval("foo",4,4) # valid id, start == stop (non-zero) + d = PacBioBam.GenomicInterval("foo",0,1) # valid id, start < stop (start == zero) OK + e = PacBioBam.GenomicInterval("foo",4,5) # valid id, start < stop (start > zero) OK + f = PacBioBam.GenomicInterval("foo",5,4) # valid id, start > stop + g = PacBioBam.GenomicInterval("",0,0) # invalid id, start == stop (zero) + h = PacBioBam.GenomicInterval("",4,4) # invalid id, start == stop (non-zero) + i = PacBioBam.GenomicInterval("",0,1) # invalid id, start < stop (start == zero) + j = PacBioBam.GenomicInterval("",4,5) # invalid id, start < stop (start > zero) + k = PacBioBam.GenomicInterval("",5,4) # invalid id, start > stop + + self.assertTrue(d.IsValid()) + self.assertTrue(e.IsValid()) + self.assertFalse(a.IsValid()) + self.assertFalse(b.IsValid()) + self.assertFalse(c.IsValid()) + self.assertFalse(f.IsValid()) + self.assertFalse(g.IsValid()) + self.assertFalse(h.IsValid()) + self.assertFalse(i.IsValid()) + self.assertFalse(j.IsValid()) + self.assertFalse(k.IsValid()) diff --git a/tests/src/python/test/test_PolymeraseStitching.py b/tests/src/python/test/test_PolymeraseStitching.py new file mode 100755 index 0000000..13ee448 --- /dev/null +++ b/tests/src/python/test/test_PolymeraseStitching.py @@ -0,0 +1,358 @@ +# Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +# +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted (subject to the limitations in the +# disclaimer below) provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# +# * Neither the name of Pacific Biosciences nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. 
+# +# Author: Derek Barnett + +import PacBioBam +import config +import unittest + +class PolymeraseStitchingTest(unittest.TestCase): + + # ------------ SETUP -------------- + + def setUp(self): + self.data = config.TestData() + + def runTest(self): + self.test_virtualRegions() + self.test_internalSubreadsToOriginal() + self.test_internalHqToOriginal() + self.test_productionSubreadsToOriginal() + self.test_productionHqToOriginal() + + # ------------ TESTS -------------- + + def test_virtualRegions(self): + + subreadBam = self.data.directory + "/polymerase/internal.subreads.bam" + scrapsBam = self.data.directory + "/polymerase/internal.scraps.bam" + vpr = PacBioBam.VirtualPolymeraseReader(subreadBam, scrapsBam) + + virtualRecord = vpr.Next() + + # NOTE: this method is disabled + # + # Any attempt to retrieve this value resulted in several + # "swig/python detected a memory leak of type 'unknown', no destructor found." + # errors (& an empty dictionary result). The same info is available via the + # VirtualRegionsTable(regionType) method, though a bit clunkier if you just want + # to iterate. But access to region info for specific types is available & correct, + # so I'm just going to leave this one out for now. 
- DB + # + # regionMap = virtualRecord.VirtualRegionsMap(); + + # ADAPTER + adapter = virtualRecord.VirtualRegionsTable(PacBioBam.VirtualRegionType_ADAPTER) + self.assertEqual(7, len(adapter)) + self.assertEqual(3047, adapter[0].beginPos); + self.assertEqual(3095, adapter[0].endPos); + self.assertEqual(3650, adapter[1].beginPos); + self.assertEqual(3700, adapter[1].endPos); + self.assertEqual(4289, adapter[2].beginPos); + self.assertEqual(4335, adapter[2].endPos); + self.assertEqual(4888, adapter[3].beginPos); + self.assertEqual(4939, adapter[3].endPos); + self.assertEqual(5498, adapter[4].beginPos); + self.assertEqual(5546, adapter[4].endPos); + self.assertEqual(6116, adapter[5].beginPos); + self.assertEqual(6173, adapter[5].endPos); + self.assertEqual(6740, adapter[6].beginPos); + self.assertEqual(6790, adapter[6].endPos); + + # BARCODE + barcode = virtualRecord.VirtualRegionsTable(PacBioBam.VirtualRegionType_BARCODE) + self.assertEqual(14, len(barcode)) + self.assertEqual(3025, barcode[0].beginPos); + self.assertEqual(3047, barcode[0].endPos); + self.assertEqual(3095, barcode[1].beginPos); + self.assertEqual(3116, barcode[1].endPos); + self.assertEqual(3628, barcode[2].beginPos); + self.assertEqual(3650, barcode[2].endPos); + self.assertEqual(3700, barcode[3].beginPos); + self.assertEqual(3722, barcode[3].endPos); + self.assertEqual(4267, barcode[4].beginPos); + self.assertEqual(4289, barcode[4].endPos); + self.assertEqual(4335, barcode[5].beginPos); + self.assertEqual(4356, barcode[5].endPos); + self.assertEqual(4864, barcode[6].beginPos); + self.assertEqual(4888, barcode[6].endPos); + self.assertEqual(4939, barcode[7].beginPos); + self.assertEqual(4960, barcode[7].endPos); + self.assertEqual(5477, barcode[8].beginPos); + self.assertEqual(5498, barcode[8].endPos); + self.assertEqual(5546, barcode[9].beginPos); + self.assertEqual(5571, barcode[9].endPos); + self.assertEqual(6087, barcode[10].beginPos); + self.assertEqual(6116, barcode[10].endPos); + 
self.assertEqual(6173, barcode[11].beginPos); + self.assertEqual(6199, barcode[11].endPos); + self.assertEqual(6719, barcode[12].beginPos); + self.assertEqual(6740, barcode[12].endPos); + self.assertEqual(6790, barcode[13].beginPos); + self.assertEqual(6812, barcode[13].endPos); + + # HQREGION + hqregion = virtualRecord.VirtualRegionsTable(PacBioBam.VirtualRegionType_HQREGION) + self.assertEqual(1, len(hqregion)) + + self.assertEqual(2659, hqregion[0].beginPos); + self.assertEqual(7034, hqregion[0].endPos); + + # LQREGION + lqregion = virtualRecord.VirtualRegionsTable(PacBioBam.VirtualRegionType_LQREGION) + self.assertEqual(2, len(lqregion)) + + self.assertEqual(0, lqregion[0].beginPos); + self.assertEqual(2659, lqregion[0].endPos); + self.assertEqual(7034, lqregion[1].beginPos); + self.assertEqual(7035, lqregion[1].endPos); + + # SUBREAD + subread = virtualRecord.VirtualRegionsTable(PacBioBam.VirtualRegionType_SUBREAD) + self.assertEqual(8, len(subread)) + + def test_internalSubreadsToOriginal(self): + + # stitch virtual polymerase record + subreadsBam = self.data.directory + "/polymerase/internal.subreads.bam" + scrapsBam = self.data.directory + "/polymerase/internal.scraps.bam" + vpr = PacBioBam.VirtualPolymeraseReader(subreadsBam, scrapsBam) + self.assertTrue(vpr.HasNext()) + virtualRecord = vpr.Next() + + # fetch original polymerase record + polyBam = PacBioBam.DataSet(self.data.directory + "/polymerase/internal.polymerase.bam") + polyQuery = PacBioBam.EntireFileQuery(polyBam) + polyIter = polyQuery.begin() + polyEnd = polyQuery.end() + self.assertTrue(polyIter != polyEnd) + polyRecord = polyIter.value() + + # compare + self.compare(polyRecord, virtualRecord) + + def test_internalHqToOriginal(self): + + # stitch virtual polymerase record + hqRegionsBam = self.data.directory + "/polymerase/internal.hqregions.bam" + lqRegionsBam = self.data.directory + "/polymerase/internal.lqregions.bam" + vpr = PacBioBam.VirtualPolymeraseReader(hqRegionsBam, lqRegionsBam) + 
self.assertTrue(vpr.HasNext()) + virtualRecord = vpr.Next() + + # fetch original polymerase record + polyBam = PacBioBam.DataSet(self.data.directory + "/polymerase/internal.polymerase.bam") + polyQuery = PacBioBam.EntireFileQuery(polyBam) + polyIter = polyQuery.begin() + polyEnd = polyQuery.end() + self.assertTrue(polyIter != polyEnd) + polyRecord = polyIter.value() + + # # compare + self.compare(polyRecord, virtualRecord) + + def test_productionSubreadsToOriginal(self): + + # stitch virtual polymerase record + subreadsBam = self.data.directory + "/polymerase/production.subreads.bam" + scrapsBam = self.data.directory + "/polymerase/production.scraps.bam" + vpr = PacBioBam.VirtualPolymeraseReader(subreadsBam, scrapsBam) + self.assertTrue(vpr.HasNext()) + virtualRecord = vpr.Next() + + # fetch original polymerase record + polyBam = PacBioBam.DataSet(self.data.directory + "/polymerase/production.polymerase.bam") + polyQuery = PacBioBam.EntireFileQuery(polyBam) + polyIter = polyQuery.begin() + polyEnd = polyQuery.end() + self.assertTrue(polyIter != polyEnd) + polyRecord = polyIter.value() + + # compare + self.assertEqual(polyRecord.FullName(), virtualRecord.FullName()); + self.assertEqual(polyRecord.HoleNumber(), virtualRecord.HoleNumber()); + self.assertEqual(polyRecord.NumPasses(), virtualRecord.NumPasses()); + self.assertEqual(polyRecord.Sequence(), virtualRecord.Sequence()); + self.assertEqual(polyRecord.DeletionTag(), virtualRecord.DeletionTag()); + self.assertEqual(polyRecord.SubstitutionTag(), virtualRecord.SubstitutionTag()); + self.assertEqual(polyRecord.IPD(), virtualRecord.IPDV1Frames()); + self.assertEqual(polyRecord.ReadGroup(), virtualRecord.ReadGroup()); + + self.assertAlmostEqual(float(polyRecord.ReadAccuracy()), float(virtualRecord.ReadAccuracy())); + + self.assertEqual(polyRecord.Qualities().Fastq(), virtualRecord.Qualities().Fastq()); + self.assertEqual(polyRecord.DeletionQV().Fastq(), virtualRecord.DeletionQV().Fastq()); + 
self.assertEqual(polyRecord.InsertionQV().Fastq(), virtualRecord.InsertionQV().Fastq()); + self.assertEqual(polyRecord.MergeQV().Fastq(), virtualRecord.MergeQV().Fastq()); + self.assertEqual(polyRecord.SubstitutionQV().Fastq(), virtualRecord.SubstitutionQV().Fastq()); + + def test_productionHqToOriginal(self): + + # stitch virtual polymerase record + hqRegionsBam = self.data.directory + "/polymerase/production_hq.hqregion.bam" + lqRegionsBam = self.data.directory + "/polymerase/production_hq.scraps.bam" + vpr = PacBioBam.VirtualPolymeraseReader(hqRegionsBam, lqRegionsBam) + self.assertTrue(vpr.HasNext()) + virtualRecord = vpr.Next() + + # fetch original polymerase record + polyBam = PacBioBam.DataSet(self.data.directory + "/polymerase/production.polymerase.bam") + polyQuery = PacBioBam.EntireFileQuery(polyBam) + polyIter = polyQuery.begin() + polyEnd = polyQuery.end() + self.assertTrue(polyIter != polyEnd) + polyRecord = polyIter.value() + + # compare + self.assertFalse(polyRecord.HasPulseCall()); + self.assertFalse(virtualRecord.HasPulseCall()); + + self.assertEqual(polyRecord.FullName(), virtualRecord.FullName()); + self.assertEqual(polyRecord.HoleNumber(), virtualRecord.HoleNumber()); + self.assertEqual(polyRecord.NumPasses(), virtualRecord.NumPasses()); + self.assertEqual(polyRecord.Sequence(), virtualRecord.Sequence()); + self.assertEqual(polyRecord.DeletionTag(), virtualRecord.DeletionTag()); + self.assertEqual(polyRecord.SubstitutionTag(), virtualRecord.SubstitutionTag()); + self.assertEqual(polyRecord.IPD(), virtualRecord.IPDV1Frames()); + self.assertEqual(polyRecord.ReadGroup(), virtualRecord.ReadGroup()); + + self.assertAlmostEqual(float(polyRecord.ReadAccuracy()), float(virtualRecord.ReadAccuracy())); + + self.assertEqual(polyRecord.Qualities().Fastq(), virtualRecord.Qualities().Fastq()); + self.assertEqual(polyRecord.DeletionQV().Fastq(), virtualRecord.DeletionQV().Fastq()); + self.assertEqual(polyRecord.InsertionQV().Fastq(), 
virtualRecord.InsertionQV().Fastq()); + self.assertEqual(polyRecord.MergeQV().Fastq(), virtualRecord.MergeQV().Fastq()); + self.assertEqual(polyRecord.SubstitutionQV().Fastq(), virtualRecord.SubstitutionQV().Fastq()); + + self.assertTrue(polyRecord.HasDeletionQV()); + self.assertTrue(polyRecord.HasDeletionTag()); + self.assertTrue(polyRecord.HasInsertionQV()); + self.assertTrue(polyRecord.HasMergeQV()); + self.assertTrue(polyRecord.HasSubstitutionQV()); + self.assertTrue(polyRecord.HasSubstitutionTag()); + self.assertTrue(polyRecord.HasIPD()); + self.assertFalse(polyRecord.HasLabelQV()); + self.assertFalse(polyRecord.HasAltLabelQV()); + self.assertFalse(polyRecord.HasAltLabelTag()); + self.assertFalse(polyRecord.HasPkmean()); + self.assertFalse(polyRecord.HasPkmid()); + self.assertFalse(polyRecord.HasPulseCall()); + self.assertFalse(polyRecord.HasPulseWidth()); + self.assertFalse(polyRecord.HasPrePulseFrames()); + self.assertFalse(polyRecord.HasPulseCallWidth()); + + self.assertTrue(virtualRecord.HasDeletionQV()); + self.assertTrue(virtualRecord.HasDeletionTag()); + self.assertTrue(virtualRecord.HasInsertionQV()); + self.assertTrue(virtualRecord.HasMergeQV()); + self.assertTrue(virtualRecord.HasSubstitutionQV()); + self.assertTrue(virtualRecord.HasSubstitutionTag()); + self.assertTrue(virtualRecord.HasIPD()); + self.assertFalse(virtualRecord.HasLabelQV()); + self.assertFalse(virtualRecord.HasAltLabelQV()); + self.assertFalse(virtualRecord.HasAltLabelTag()); + self.assertFalse(virtualRecord.HasPkmean()); + self.assertFalse(virtualRecord.HasPkmid()); + self.assertFalse(virtualRecord.HasPulseCall()); + self.assertFalse(virtualRecord.HasPulseWidth()); + self.assertFalse(virtualRecord.HasPrePulseFrames()); + self.assertFalse(virtualRecord.HasPulseCallWidth()); + + # ------------ HELPERS -------------- + + def compare(self, b1, b2): + + self.assertTrue(b1.HasDeletionQV()); + self.assertTrue(b1.HasDeletionTag()); + self.assertTrue(b1.HasInsertionQV()); + 
self.assertTrue(b1.HasMergeQV()); + self.assertTrue(b1.HasSubstitutionQV()); + self.assertTrue(b1.HasSubstitutionTag()); + self.assertTrue(b1.HasLabelQV()); + self.assertTrue(b1.HasAltLabelQV()); + self.assertTrue(b1.HasAltLabelTag()); + self.assertTrue(b1.HasPkmean()); + self.assertTrue(b1.HasPkmid()); + self.assertTrue(b1.HasPulseCall()); + self.assertTrue(b1.HasIPD()); + self.assertTrue(b1.HasPulseWidth()); + self.assertTrue(b1.HasPrePulseFrames()); + self.assertTrue(b1.HasPulseCallWidth()); + self.assertTrue(b1.HasPulseMergeQV()); + + self.assertTrue(b2.HasDeletionQV()); + self.assertTrue(b2.HasDeletionTag()); + self.assertTrue(b2.HasInsertionQV()); + self.assertTrue(b2.HasMergeQV()); + self.assertTrue(b2.HasSubstitutionQV()); + self.assertTrue(b2.HasSubstitutionTag()); + self.assertTrue(b2.HasLabelQV()); + self.assertTrue(b2.HasAltLabelQV()); + self.assertTrue(b2.HasAltLabelTag()); + self.assertTrue(b2.HasPkmean()); + self.assertTrue(b2.HasPkmid()); + self.assertTrue(b2.HasPulseCall()); + self.assertTrue(b2.HasIPD()); + self.assertTrue(b2.HasPulseWidth()); + self.assertTrue(b2.HasPrePulseFrames()); + self.assertTrue(b2.HasPulseCallWidth()); + self.assertTrue(b2.HasPulseMergeQV()); + + self.assertEqual(b1.FullName(), b2.FullName()); + self.assertEqual(b1.HoleNumber(), b2.HoleNumber()); + self.assertEqual(b1.NumPasses(), b2.NumPasses()); + self.assertEqual(b1.Sequence(), b2.Sequence()); + self.assertEqual(b1.DeletionTag(), b2.DeletionTag()); + self.assertEqual(b1.SubstitutionTag(), b2.SubstitutionTag()); + self.assertEqual(b1.AltLabelTag(), b2.AltLabelTag()); + self.assertEqual(b1.Pkmean(), b2.Pkmean()); + self.assertEqual(b1.Pkmid(), b2.Pkmid()); + self.assertEqual(b1.PulseCall(), b2.PulseCall()); + self.assertEqual(b1.IPD(), b2.IPD()); + self.assertEqual(b1.PulseWidth(), b2.PulseWidth()); + self.assertEqual(b1.PrePulseFrames(), b2.PrePulseFrames()); + self.assertEqual(b1.PulseCallWidth(), b2.PulseCallWidth()); + self.assertEqual(b1.ReadGroup(), 
b2.ReadGroup()); + + self.assertEqual(b1.Qualities().Fastq(), b2.Qualities().Fastq()); + self.assertEqual(b1.DeletionQV().Fastq(), b2.DeletionQV().Fastq()); + self.assertEqual(b1.InsertionQV().Fastq(), b2.InsertionQV().Fastq()); + self.assertEqual(b1.MergeQV().Fastq(), b2.MergeQV().Fastq()); + self.assertEqual(b1.SubstitutionQV().Fastq(), b2.SubstitutionQV().Fastq()); + self.assertEqual(b1.PulseMergeQV().Fastq(), b2.PulseMergeQV().Fastq()); + self.assertEqual(b1.LabelQV().Fastq(), b2.LabelQV().Fastq()); + self.assertEqual(b1.AltLabelQV().Fastq(), b2.AltLabelQV().Fastq()); + diff --git a/tests/src/python/test/test_QualityValues.py b/tests/src/python/test/test_QualityValues.py new file mode 100755 index 0000000..41bf8c3 --- /dev/null +++ b/tests/src/python/test/test_QualityValues.py @@ -0,0 +1,131 @@ +# Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +# +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted (subject to the limitations in the +# disclaimer below) provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# +# * Neither the name of Pacific Biosciences nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +# GRANTED BY THIS LICENSE. 
THIS SOFTWARE IS PROVIDED BY PACIFIC +# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# +# Author: Derek Barnett + +import PacBioBam +import config +import unittest + +class QualityValueTest(unittest.TestCase): + + # ------------ SETUP -------------- + + def runTest(self): + self.test_defaults() + self.test_fromNumber() + self.test_fromFastq() + + # ------------ TESTS -------------- + + def test_defaults(self): + value = PacBioBam.QualityValue() + self.assertEqual(0, int(value)) + self.assertEqual('!', value.Fastq()) + + def test_fromNumber(self): + + zero = PacBioBam.QualityValue(0) + thirtythree = PacBioBam.QualityValue(33) + normal = PacBioBam.QualityValue(42) + maxQV = PacBioBam.QualityValue(93) + tooHigh = PacBioBam.QualityValue(94) + max8bit = PacBioBam.QualityValue(126) + + self.assertEqual(0, int(zero)) + self.assertEqual(33, int(thirtythree)) + self.assertEqual(42, int(normal)) + self.assertEqual(93, int(maxQV)) + self.assertEqual(93, int(tooHigh)) + self.assertEqual(93, int(max8bit)) + + self.assertEqual('!', zero.Fastq()) + self.assertEqual('B', thirtythree.Fastq()) + self.assertEqual('K', normal.Fastq()) + self.assertEqual('~', maxQV.Fastq()) + self.assertEqual('~', tooHigh.Fastq()) + self.assertEqual('~', max8bit.Fastq()) + + def test_fromFastq(self): + 
+ zero = PacBioBam.QualityValue.FromFastq('!') + thirtythree = PacBioBam.QualityValue.FromFastq('B') + normal = PacBioBam.QualityValue.FromFastq('K') + maxQV = PacBioBam.QualityValue.FromFastq('~') + + self.assertEqual(0, int(zero)) + self.assertEqual(33, int(thirtythree)) + self.assertEqual(42, int(normal)) + self.assertEqual(93, int(maxQV)) + +class QualityValuesTest(unittest.TestCase): + + # ------------ SETUP -------------- + + def runTest(self): + self.test_defaults() + self.test_fromNumbers() + self.test_fromFastq() + + # ------------ TESTS -------------- + + def test_defaults(self): + values = PacBioBam.QualityValues() + self.assertFalse(values.Fastq()) + + def test_fromNumbers(self): + + fastqString = "~~~KKBB!!" + values = [ 93, 93, 93, 42, 42, 33, 33, 0, 0 ] + + qvs = PacBioBam.QualityValues() + for value in values: + qvs.append(PacBioBam.QualityValue(value)) + + self.assertEqual(fastqString, qvs.Fastq()) + + def test_fromFastq(self): + + fastqString = "~~~KKBB!!" + values = [ 93, 93, 93, 42, 42, 33, 33, 0, 0 ] + + qvs = PacBioBam.QualityValues.FromFastq(fastqString) + + self.assertEqual(len(fastqString), len(qvs)) + self.assertEqual(len(values), len(qvs)) + + for i, v in enumerate(values): + self.assertEqual(v, int(qvs[i])) + \ No newline at end of file diff --git a/tests/src/python/test_pbbam.py b/tests/src/python/test_pbbam.py new file mode 100755 index 0000000..8ca3c90 --- /dev/null +++ b/tests/src/python/test_pbbam.py @@ -0,0 +1,48 @@ +# Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +# +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted (subject to the limitations in the +# disclaimer below) provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# +# * Neither the name of Pacific Biosciences nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# +# Author: Derek Barnett + +import sys +import unittest + +if __name__ == "__main__": + suite = unittest.TestLoader().discover('.', pattern = "test_*.py") + result = unittest.TextTestRunner(verbosity=2).run(suite) + if result.wasSuccessful(): + sys.exit(0) + else: + sys.exit(1) + \ No newline at end of file diff --git a/tests/src/test_Accuracy.cpp b/tests/src/test_Accuracy.cpp new file mode 100644 index 0000000..9750dd4 --- /dev/null +++ b/tests/src/test_Accuracy.cpp @@ -0,0 +1,63 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. 
+ +// Author: Derek Barnett + +#ifdef PBBAM_TESTING +#define private public +#endif + +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace std; + +TEST(AccuracyTest, ClampValues) +{ + Accuracy a_zero(0.0); + Accuracy a_neg(-0.5); + Accuracy a_min(0.0); + Accuracy a_normal(0.9); + Accuracy a_max(1.0); + Accuracy a_tooLarge(1.1); + + EXPECT_FLOAT_EQ(0.0, a_zero); + EXPECT_FLOAT_EQ(0.0, a_neg); + EXPECT_FLOAT_EQ(0.0, a_min); + EXPECT_FLOAT_EQ(0.9, a_normal); + EXPECT_FLOAT_EQ(1.0, a_max); + EXPECT_FLOAT_EQ(1.0, a_tooLarge); +} diff --git a/tests/src/test_AlignmentPrinter.cpp b/tests/src/test_AlignmentPrinter.cpp new file mode 100644 index 0000000..89ec98a --- /dev/null +++ b/tests/src/test_AlignmentPrinter.cpp @@ -0,0 +1,152 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. 
THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. + +// Author: Derek Barnett + +#ifdef PBBAM_TESTING +#define private public +#endif + +#include "TestData.h" +#include + +#include +#include +#include +#include +#include +#include + +using namespace PacBio; +using namespace PacBio::BAM; +using namespace std; + +const string lambdaFasta = tests::Data_Dir + "/lambdaNEB.fa"; +const string singleInsertionBam = tests::Data_Dir + "/aligned.bam"; + +TEST(AlignmentPrinterTest, Print) +{ + IndexedFastaReader r(lambdaFasta); + AlignmentPrinter pretty(r); + + BamFile bamFile(singleInsertionBam); + EntireFileQuery bamQuery(bamFile); + auto it = bamQuery.begin(); + + // funky formatting used to format alignments + auto expected = string + { + "Read : singleInsertion/100/0_49\n" + "Reference : lambda_NEB3011\n" + "\n" + "Read-length : 49\n" + "Concordance : 0.96\n" + "\n" + "5210 : GGCTGCAGTGTACAGCGGTCAGGAGGCC-ATTGATGCCGG : 5249\n" + " \x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m||||||| |\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m|||||||||\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m||||||| ||\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m||||||||\n" + " 0 : GGCTGCAG-GTACAGCGGTCAGGAGGCCAATTGATGCCGG : 39\n" + "\n" + "5249 : ACTGGCTGAT : 5259\n" + " 
|\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m||||||||\n" + " 39 : ACTGGCTGAT : 49\n" + "\n" + }; + + auto record = *it++; + EXPECT_EQ(expected, pretty.Print(record, Orientation::GENOMIC)); + + expected = string + { + "Read : singleInsertion/200/0_49\n" + "Reference : lambda_NEB3011\n" + "\n" + "Read-length : 49\n" + "Concordance : 0.96\n" + "\n" + "5210 : GGCTGCAGTGTACAGCGGTCAGGAGGCC-ATTGATGCCGG : 5249\n" + " \x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m||||||| |\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m|||||||||\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m||||||| ||\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m||||||||\n" + " 0 : GGCTGCAG-GTACAGCGGTCAGGAGGCCAATTGATGCCGG : 39\n" + "\n" + "5249 : ACTGGCTGAT : 5259\n" + " |\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m||||||||\n" + " 39 : ACTGGCTGAT : 49\n" + "\n" + }; + + record = *it++; + EXPECT_EQ(expected, pretty.Print(record, Orientation::GENOMIC)); + + expected = string + { + "Read : singleInsertion/100/0_111\n" + "Reference : lambda_NEB3011\n" + "\n" + "Read-length : 59\n" + "Concordance : 0.951\n" + "\n" + "9377 : AAGTCACCAATGTGGGACGTCCGTCGATGGCAGAAGATCG : 9417\n" + " |||\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m|||||||||\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m|||||||||\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m|||||||||\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m||| |\n" + " 0 : AAGTCACCAATGTGGGACGTCCGTCGATGGCAGAAGA--G : 38\n" + "\n" + "9417 : CAGCACGGT-AACAGCGGCAA : 9437\n" + " |||\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m||||| ||||\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m||||||\n" + " 38 : CAGCACGGTAAACAGCGGCAA : 59\n" + "\n" + }; + + record = *it++; + EXPECT_EQ(expected, pretty.Print(record, Orientation::GENOMIC)); + + expected = string + { + "Read : singleInsertion/100/0_111\n" + "Reference : lambda_NEB3011\n" + "\n" + "Read-length : 59\n" + "Concordance : 0.951\n" + "\n" + "9377 : AAGTCACCAATGTGGGACGTCCGTCGATGGCAGAAGATCG : 9417\n" + " |||\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m|||||||||\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m|||||||||\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m|||||||||\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m||| |\n" + " 0 
: AAGTCACCAATGTGGGACGTCCGTCGATGGCAGAAGA--G : 38\n" + "\n" + "9417 : CAGCACGGT-AACAGCGGCAA : 9437\n" + " |||\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m||||| ||||\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m||||||\n" + " 38 : CAGCACGGTAAACAGCGGCAA : 59\n" + "\n" + }; + + record = *it++; + EXPECT_EQ(expected, pretty.Print(record, Orientation::GENOMIC)); +} diff --git a/tests/src/test_BamFile.cpp b/tests/src/test_BamFile.cpp new file mode 100644 index 0000000..674a471 --- /dev/null +++ b/tests/src/test_BamFile.cpp @@ -0,0 +1,142 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. + +// Author: Derek Barnett + +#ifdef PBBAM_TESTING +#define private public +#endif + +#include "TestData.h" +#include +#include +#include +#include +#include +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace std; + +namespace PacBio { +namespace BAM { +namespace tests { + +template +void CheckFile(const T& input, const size_t expectedCount) +{ + size_t observedCount = 0; + EntireFileQuery entireFile(input); + for (const BamRecord& r : entireFile) { + (void)r; + ++observedCount; + } + EXPECT_EQ(expectedCount, observedCount); +} + +} // namespace tests +} // namespace BAM +} // namespace PacBio + +TEST(BamFileTest, NonExistentFileThrows) +{ + EXPECT_THROW(BamFile{ "does_not_exist.bam" }, std::runtime_error); +} + +TEST(BamFileTest, NonBamFileThrows) +{ + EXPECT_THROW(BamFile { tests::Data_Dir + "/lambdaNEB.fa.fai" }, std::runtime_error); +} + +TEST(BamFileTest, RelativePathBamOk) +{ + // cache current working directory, then drill down so we can point to + // BAMs using relative path + const string cwd = internal::FileUtils::CurrentWorkingDirectory(); + ASSERT_EQ(0, chdir(tests::Data_Dir.c_str())); + ASSERT_EQ(0, chdir("relative/a")); + + // BamFile from relative BAM fn + tests::CheckFile(BamFile{ "../b/test1.bam" }, 3); + + // dataset from relative BAM fn + tests::CheckFile(DataSet{ "../b/test1.bam" }, 3); + + // dataset from BamFile object (itself from relative BAM fn) + { + auto 
file = BamFile{"../b/test1.bam"}; + tests::CheckFile(DataSet{ file }, 3); + } + + // restore working directory + ASSERT_EQ(0, chdir(cwd.c_str())); +} + +TEST(BamFileTest, RelativePathXmlOk) +{ + // cache current working directory, then drill down so we can point to + // BAMs using relative path + const string cwd = internal::FileUtils::CurrentWorkingDirectory(); + ASSERT_EQ(0, chdir(tests::Data_Dir.c_str())); + + // dataset from XML containing relative paths + tests::CheckFile(DataSet{ "relative/relative.xml" }, 9); + + // restore working directory + ASSERT_EQ(0, chdir(cwd.c_str())); +} + +TEST(BamFileTest, RelativePathFofnOk) +{ + // cache current working directory, then drill down so we can point to + // BAMs using relative path + const string cwd = internal::FileUtils::CurrentWorkingDirectory(); + ASSERT_EQ(0, chdir(tests::Data_Dir.c_str())); + + // dataset from FOFN containing relative paths + tests::CheckFile(DataSet{ "relative/relative.fofn" }, 9); + + // NOTE: doesn't yet support a FOFN containing an XML with relative paths +// tests::CheckFile(DataSet{ "relative/relative2.fofn" }, 60); + + // restore working directory + ASSERT_EQ(0, chdir(cwd.c_str())); +} + +TEST(BamFileTest, TruncatedFileThrowsOk) +{ + EXPECT_THROW(BamFile{ tests::GeneratedData_Dir + "/truncated.bam" }, std::runtime_error); +} diff --git a/tests/src/test_BamHeader.cpp b/tests/src/test_BamHeader.cpp new file mode 100644 index 0000000..4c49b95 --- /dev/null +++ b/tests/src/test_BamHeader.cpp @@ -0,0 +1,436 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. + +// Author: Derek Barnett + +#ifdef PBBAM_TESTING +#define private public +#endif + +#include +#include +#include +#include +#include +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace std; + +namespace PacBio { +namespace BAM { +namespace tests { + +struct BamHdrDeleter +{ + void operator()(bam_hdr_t* hdr) { + if (hdr) + bam_hdr_destroy(hdr); + hdr = nullptr; + } +}; + +} // namespace tests +} // namespace BAM +} // namespace PacBio + +TEST(BamHeaderTest, DefaultConstruction) +{ + BamHeader header; + EXPECT_TRUE(header.Version().empty()); + EXPECT_TRUE(header.SortOrder().empty()); // default to unknown ? 
+ EXPECT_TRUE(header.ReadGroups().empty()); + EXPECT_TRUE(header.Sequences().empty()); + EXPECT_TRUE(header.Programs().empty()); + EXPECT_TRUE(header.Comments().empty()); + + EXPECT_THROW(header.Program("foo"), std::exception); + EXPECT_THROW(header.ReadGroup("foo"), std::exception); + EXPECT_THROW(header.SequenceId("foo"), std::exception); + EXPECT_THROW(header.SequenceLength(42), std::exception); + EXPECT_THROW(header.SequenceName(42), std::exception); +} + +TEST(BamHeaderTest, DecodeTest) +{ + const string& text = "@HD\tVN:1.1\tSO:queryname\tpb:3.0.1\n" + "@SQ\tSN:chr1\tLN:2038\tSP:chocobo\n" + "@SQ\tSN:chr2\tLN:3042\tSP:chocobo\n" + "@RG\tID:rg1\tSM:control\n" + "@RG\tID:rg2\tSM:condition1\n" + "@RG\tID:rg3\tSM:condition1\n" + "@PG\tID:_foo_\tPN:ide\n" + "@CO\tipsum and so on\n" + "@CO\tcitation needed\n"; + + BamHeader header = BamHeader(text); + + EXPECT_EQ(string("1.1"), header.Version()); + EXPECT_EQ(string("queryname"), header.SortOrder()); + EXPECT_EQ(string("3.0.1"), header.PacBioBamVersion()); + + EXPECT_EQ(3, header.ReadGroups().size()); + EXPECT_TRUE(header.HasReadGroup("rg1")); + EXPECT_TRUE(header.HasReadGroup("rg2")); + EXPECT_TRUE(header.HasReadGroup("rg3")); + + EXPECT_EQ(string("control"), header.ReadGroup("rg1").Sample()); + EXPECT_EQ(string("condition1"), header.ReadGroup("rg2").Sample()); + EXPECT_EQ(string("condition1"), header.ReadGroup("rg3").Sample()); + + EXPECT_EQ(2, header.Sequences().size()); + EXPECT_TRUE(header.HasSequence("chr1")); + EXPECT_TRUE(header.HasSequence("chr2")); + EXPECT_EQ(string("chocobo"), header.Sequence("chr1").Species()); + EXPECT_EQ(string("chocobo"), header.Sequence("chr2").Species()); + EXPECT_EQ(string("2038"), header.Sequence("chr1").Length()); + EXPECT_EQ(string("3042"), header.Sequence("chr2").Length()); + + EXPECT_EQ(1, header.Programs().size()); + EXPECT_TRUE(header.HasProgram("_foo_")); + EXPECT_EQ(string("ide"), header.Program("_foo_").Name()); + + EXPECT_EQ(2, header.Comments().size()); + 
EXPECT_EQ(string("ipsum and so on"), header.Comments().at(0)); + EXPECT_EQ(string("citation needed"), header.Comments().at(1)); +} + +TEST(BamHeaderTest, VersionCheckOk) +{ + auto expectFail = [](string&& label, string&& text) + { + SCOPED_TRACE(label); + EXPECT_THROW(BamHeader{ text }, std::runtime_error); + }; + expectFail("empty version", "@HD\tVN:1.1\tSO:queryname\tpb:\n"); + expectFail("old beta version", "@HD\tVN:1.1\tSO:queryname\tpb:3.0b3\n"); + expectFail("old beta version", "@HD\tVN:1.1\tSO:queryname\tpb:3.0b7\n"); + expectFail("invalid value", "@HD\tVN:1.1\tSO:queryname\tpb:3.0.should_not_work\n"); + expectFail("earlier than minimum", "@HD\tVN:1.1\tSO:queryname\tpb:3.0.0\n"); + + // correct version syntax, number + EXPECT_NO_THROW(BamHeader{ "@HD\tVN:1.1\tSO:queryname\tpb:3.0.1\n" }); +} + +TEST(BamHeaderTest, EncodeTest) +{ + ReadGroupInfo rg1("rg1"); + rg1.Sample("control"); + ReadGroupInfo rg2("rg2"); + rg2.Sample("condition1"); + ReadGroupInfo rg3("rg3"); + rg3.Sample("condition1"); + + SequenceInfo seq1("chr1"); + seq1.Length("2038").Species("chocobo"); + SequenceInfo seq2("chr2"); + seq2.Length("3042").Species("chocobo"); + + ProgramInfo prog1("_foo_"); + prog1.Name("ide"); + + BamHeader header; + header.Version("1.1") + .SortOrder("queryname") + .PacBioBamVersion("3.0.1") + .AddReadGroup(rg1) + .AddReadGroup(rg2) + .AddReadGroup(rg3) + .AddSequence(seq1) + .AddSequence(seq2) + .AddProgram(prog1) + .AddComment("ipsum and so on") + .AddComment("citation needed"); + + const string& expectedText = "@HD\tVN:1.1\tSO:queryname\tpb:3.0.1\n" + "@SQ\tSN:chr1\tLN:2038\tSP:chocobo\n" + "@SQ\tSN:chr2\tLN:3042\tSP:chocobo\n" + "@RG\tID:rg1\tPL:PACBIO\tDS:READTYPE=UNKNOWN\tSM:control\tPM:SEQUEL\n" + "@RG\tID:rg2\tPL:PACBIO\tDS:READTYPE=UNKNOWN\tSM:condition1\tPM:SEQUEL\n" + "@RG\tID:rg3\tPL:PACBIO\tDS:READTYPE=UNKNOWN\tSM:condition1\tPM:SEQUEL\n" + "@PG\tID:_foo_\tPN:ide\n" + "@CO\tipsum and so on\n" + "@CO\tcitation needed\n"; + + const string& text = 
header.ToSam(); + EXPECT_EQ(expectedText, text); +} + +TEST(BamHeaderTest, ConvertToRawDataOk) +{ + ReadGroupInfo rg1("rg1"); + rg1.Sample("control"); + ReadGroupInfo rg2("rg2"); + rg2.Sample("condition1"); + ReadGroupInfo rg3("rg3"); + rg3.Sample("condition1"); + + SequenceInfo seq1("chr1"); + seq1.Length("2038").Species("chocobo"); + SequenceInfo seq2("chr2"); + seq2.Length("3042").Species("chocobo"); + + ProgramInfo prog1("_foo_"); + prog1.Name("ide"); + + BamHeader header; + header.Version("1.1") + .SortOrder("queryname") + .PacBioBamVersion("3.0.1") + .AddReadGroup(rg1) + .AddReadGroup(rg2) + .AddReadGroup(rg3) + .AddSequence(seq1) + .AddSequence(seq2) + .AddProgram(prog1) + .AddComment("ipsum and so on") + .AddComment("citation needed"); + + const string& expectedText = "@HD\tVN:1.1\tSO:queryname\tpb:3.0.1\n" + "@SQ\tSN:chr1\tLN:2038\tSP:chocobo\n" + "@SQ\tSN:chr2\tLN:3042\tSP:chocobo\n" + "@RG\tID:rg1\tPL:PACBIO\tDS:READTYPE=UNKNOWN\tSM:control\tPM:SEQUEL\n" + "@RG\tID:rg2\tPL:PACBIO\tDS:READTYPE=UNKNOWN\tSM:condition1\tPM:SEQUEL\n" + "@RG\tID:rg3\tPL:PACBIO\tDS:READTYPE=UNKNOWN\tSM:condition1\tPM:SEQUEL\n" + "@PG\tID:_foo_\tPN:ide\n" + "@CO\tipsum and so on\n" + "@CO\tcitation needed\n"; + + + const string& text = header.ToSam(); + PBBAM_SHARED_PTR rawData(sam_hdr_parse(text.size(), text.c_str()), tests::BamHdrDeleter()); + rawData->ignore_sam_err = 0; + rawData->cigar_tab = NULL; + rawData->l_text = text.size(); + rawData->text = (char*)calloc(rawData->l_text + 1, 1); + memcpy(rawData->text, text.c_str(), rawData->l_text); + + const string& rawText = string(rawData->text, rawData->l_text); + EXPECT_EQ(expectedText, rawText); +} + +TEST(BamHeaderTest, ExtractFromRawDataOk) +{ + ReadGroupInfo rg1("rg1"); + rg1.Sample("control"); + ReadGroupInfo rg2("rg2"); + rg2.Sample("condition1"); + ReadGroupInfo rg3("rg3"); + rg3.Sample("condition1"); + + SequenceInfo seq1("chr1"); + seq1.Length("2038").Species("chocobo"); + SequenceInfo seq2("chr2"); + 
seq2.Length("3042").Species("chocobo"); + + ProgramInfo prog1("_foo_"); + prog1.Name("ide"); + + BamHeader header; + header.Version("1.1") + .SortOrder("queryname") + .PacBioBamVersion("3.0.1") + .AddReadGroup(rg1) + .AddReadGroup(rg2) + .AddReadGroup(rg3) + .AddSequence(seq1) + .AddSequence(seq2) + .AddProgram(prog1) + .AddComment("ipsum and so on") + .AddComment("citation needed"); + + const string& expectedText = "@HD\tVN:1.1\tSO:queryname\tpb:3.0.1\n" + "@SQ\tSN:chr1\tLN:2038\tSP:chocobo\n" + "@SQ\tSN:chr2\tLN:3042\tSP:chocobo\n" + "@RG\tID:rg1\tPL:PACBIO\tDS:READTYPE=UNKNOWN\tSM:control\tPM:SEQUEL\n" + "@RG\tID:rg2\tPL:PACBIO\tDS:READTYPE=UNKNOWN\tSM:condition1\tPM:SEQUEL\n" + "@RG\tID:rg3\tPL:PACBIO\tDS:READTYPE=UNKNOWN\tSM:condition1\tPM:SEQUEL\n" + "@PG\tID:_foo_\tPN:ide\n" + "@CO\tipsum and so on\n" + "@CO\tcitation needed\n"; + + + string text = header.ToSam(); + PBBAM_SHARED_PTR rawData(sam_hdr_parse(text.size(), text.c_str()), tests::BamHdrDeleter()); + rawData->ignore_sam_err = 0; + rawData->cigar_tab = NULL; + rawData->l_text = text.size(); + rawData->text = (char*)calloc(rawData->l_text + 1, 1); + memcpy(rawData->text, text.c_str(), rawData->l_text); + + const BamHeader newHeader = BamHeader(string(rawData->text, rawData->l_text)); + + EXPECT_EQ(header.Version(), newHeader.Version()); + EXPECT_EQ(header.SortOrder(), newHeader.SortOrder()); + EXPECT_EQ(header.PacBioBamVersion(), newHeader.PacBioBamVersion()); + + text = newHeader.ToSam(); + EXPECT_EQ(expectedText, text); +} + +TEST(BamHeaderTest, MergeOk) +{ + const string hdrText1 = { + "@HD\tVN:1.1\tSO:unknown\tpb:3.0.1\n" + "@RG\tID:a955def6\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;" + "InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;Ipd:CodecV1=ip;BINDINGKIT=100356300;" + "SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3.0.0.140018;FRAMERATEHZ=75.000000\t" + "PU:m140918_150013_42139_c100697631700000001823144703261565_s1_p0\t" + "PM:SEQUEL\n" + 
"@PG\tID:bam2bam-0.20.0\tPN:bam2bam\tVN:0.20.0\n" + "@PG\tID:bax2bam-0.0.2\tPN:bax2bam\tVN:0.0.2\n" + "@CO\tcomment1\n" + }; + + const string hdrText2 = { + "@HD\tVN:1.1\tSO:unknown\tpb:3.0.1\n" + "@RG\tID:e83fc9c6\tPL:PACBIO\tDS:READTYPE=SCRAP;DeletionQV=dq;DeletionTag=dt;" + "InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:Frames=ip;" + "PulseWidth:Frames=pw;PkMid=pm;PkMean=pa;LabelQV=pq;AltLabel=pt;AltLabelQV=pv;" + "PulseMergeQV=pg;PulseCall=pc;PrePulseFrames=pd;PulseCallWidth=px;" + "BINDINGKIT=100372700;SEQUENCINGKIT=100356200;BASECALLERVERSION=0.1;" + "FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\t" + "PM:SEQUEL\n" + "@PG\tID:baz2bam-0.15.0\tPN:baz2bam\tVN:0.15.0\n" + "@PG\tID:bazFormat-0.3.0\tPN:bazFormat\tVN:0.3.0\n" + "@PG\tID:bazwriter-0.15.0\tPN:bazwriter\tVN:0.15.0\n" + "@CO\tcomment2\n" + }; + + const string mergedText = { + "@HD\tVN:1.1\tSO:unknown\tpb:3.0.1\n" + "@RG\tID:a955def6\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;" + "InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;Ipd:CodecV1=ip;BINDINGKIT=100356300;" + "SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3.0.0.140018;FRAMERATEHZ=75.000000\t" + "PU:m140918_150013_42139_c100697631700000001823144703261565_s1_p0\t" + "PM:SEQUEL\n" + "@RG\tID:e83fc9c6\tPL:PACBIO\tDS:READTYPE=SCRAP;DeletionQV=dq;DeletionTag=dt;" + "InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:Frames=ip;" + "PulseWidth:Frames=pw;PkMid=pm;PkMean=pa;LabelQV=pq;AltLabel=pt;AltLabelQV=pv;" + "PulseMergeQV=pg;PulseCall=pc;PrePulseFrames=pd;PulseCallWidth=px;" + "BINDINGKIT=100372700;SEQUENCINGKIT=100356200;BASECALLERVERSION=0.1;" + "FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\t" + "PM:SEQUEL\n" + "@PG\tID:bam2bam-0.20.0\tPN:bam2bam\tVN:0.20.0\n" + "@PG\tID:bax2bam-0.0.2\tPN:bax2bam\tVN:0.0.2\n" + "@PG\tID:baz2bam-0.15.0\tPN:baz2bam\tVN:0.15.0\n" + "@PG\tID:bazFormat-0.3.0\tPN:bazFormat\tVN:0.3.0\n" + "@PG\tID:bazwriter-0.15.0\tPN:bazwriter\tVN:0.15.0\n" + "@CO\tcomment1\n" + 
"@CO\tcomment2\n" + }; + + { // operator+ + + const BamHeader header1(hdrText1); + const BamHeader header2(hdrText2); + const BamHeader merged = header1 + header2; + EXPECT_EQ(mergedText, merged.ToSam()); + + // also make sure inputs not changed + EXPECT_EQ(hdrText1, header1.ToSam()); + EXPECT_EQ(hdrText2, header2.ToSam()); + } + + { // operator+= + + BamHeader header1(hdrText1); + header1 += BamHeader(hdrText2); + EXPECT_EQ(mergedText, header1.ToSam()); + } +} + +TEST(BamHeaderTest, MergeHandlesDuplicateReadGroups) +{ + const string hdrText = { + "@HD\tVN:1.1\tSO:unknown\tpb:3.0.1\n" + "@RG\tID:a955def6\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;" + "InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;Ipd:CodecV1=ip;BINDINGKIT=100356300;" + "SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3.0.0.140018;FRAMERATEHZ=75.000000\t" + "PU:m140918_150013_42139_c100697631700000001823144703261565_s1_p0\tPM:SEQUEL\n" + "@PG\tID:bam2bam-0.20.0\tPN:bam2bam\tVN:0.20.0\n" + "@PG\tID:bax2bam-0.0.2\tPN:bax2bam\tVN:0.0.2\n" + }; + + // duplicate @RG:IDs handled ok (i.e. 
not duplicated in output) + const BamHeader header1(hdrText); + const BamHeader header2(hdrText); + const BamHeader merged = header1 + header2; + EXPECT_EQ(hdrText, merged.ToSam()); +} + +TEST(BamHeaderTest, MergeCompatibilityOk) +{ + { // different @HD:VN - this IS allowed (as of SAT-465, pbbam v0.7.2) + const string hdrText1 = { "@HD\tVN:1.1\tSO:unknown\tpb:3.0.1\n" }; + const string hdrText2 = { "@HD\tVN:1.0\tSO:unknown\tpb:3.0.1\n" }; + const BamHeader header1(hdrText1); + const BamHeader header2(hdrText2); + EXPECT_NO_THROW(header1 + header2); + } + + { // different @HD:SO + const string hdrText1 = { "@HD\tVN:1.1\tSO:unknown\tpb:3.0.1\n" }; + const string hdrText2 = { "@HD\tVN:1.1\tSO:coordinate\tpb:3.0.1\n" }; + const BamHeader header1(hdrText1); + const BamHeader header2(hdrText2); + EXPECT_THROW(header1 + header2, std::runtime_error); + } + + { // different @HD:pb - this IS allowed (as of SAT-529, pbbam 0.7.4) + const string hdrText1 = { "@HD\tVN:1.1\tSO:unknown\tpb:3.0.1\n" }; + const string hdrText2 = { "@HD\tVN:1.1\tSO:unknown\tpb:3.0.3\n" }; + const BamHeader header1(hdrText1); + const BamHeader header2(hdrText2); + EXPECT_NO_THROW(header1 + header2); + } + + { // @SQ list clash + const string hdrText1 = { + "@HD\tVN:1.1\tSO:coordinate\tpb:3.0.1\n" + "@SQ\tSN:foo\tLN:42\n" + "@SQ\tSN:bar\tLN:24\n" + }; + const string hdrText2 = { + "@HD\tVN:1.1\tSO:coordinate\tpb:3.0.1\n" + "@SQ\tSN:foo\tLN:42\n" + "@SQ\tSN:baz\tLN:99\n" + }; + const BamHeader header1(hdrText1); + const BamHeader header2(hdrText2); + EXPECT_THROW(header1 + header2, std::runtime_error); + } +} diff --git a/tests/src/test_BamRecord.cpp b/tests/src/test_BamRecord.cpp new file mode 100644 index 0000000..753ab6b --- /dev/null +++ b/tests/src/test_BamRecord.cpp @@ -0,0 +1,2708 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. 
+ +// Author: Derek Barnett + +#ifdef PBBAM_TESTING +#define private public +#endif + +#include +#include +#include +#include +#include +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace std; + +namespace tests { + +static +BamRecordImpl CreateBamImpl(void) +{ + TagCollection tags; + tags["HX"] = string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = vector({34, 5, 125}); + tags["XY"] = static_cast(-42); + + BamRecordImpl bam; + bam.Bin(42); + bam.Flag(42); + bam.InsertSize(42); + bam.MapQuality(42); + bam.MatePosition(42); + bam.MateReferenceId(42); + bam.Position(42); + bam.ReferenceId(42); + bam.Tags(tags); + return bam; +} + +static inline +BamRecord CreateBam(void) +{ return BamRecord{ CreateBamImpl() }; } + +static +void CheckRawData(const BamRecordImpl& bam) +{ + // ensure raw data (lengths at least) matches API-facing data + const uint32_t expectedNameLength = bam.Name().size() + 1; + const uint32_t expectedNumCigarOps = bam.CigarData().size(); + const int32_t expectedSeqLength = bam.Sequence().length(); + const size_t expectedTagsLength = BamTagCodec::Encode(bam.Tags()).size(); + + // Name CIGAR Sequence Quals Tags + // l_qname + (n_cigar * 4) + (l_qseq+1)/2 + l_qseq + << TAGS >> + const int expectedTotalDataLength = expectedNameLength + + (expectedNumCigarOps * 4) + + (expectedSeqLength+1)/2 + + expectedSeqLength + + expectedTagsLength; + EXPECT_TRUE((bool)bam.d_); + EXPECT_EQ(expectedNameLength, bam.d_->core.l_qname); + EXPECT_EQ(expectedNumCigarOps, bam.d_->core.n_cigar); + EXPECT_EQ(expectedSeqLength, bam.d_->core.l_qseq); + EXPECT_EQ(expectedTotalDataLength, bam.d_->l_data); +} + +static inline +void CheckRawData(const BamRecord& bam) +{ CheckRawData(bam.impl_); } + +static +BamRecordImpl MakeCigaredImpl(const string& seq, + const string& cigar, + const Strand strand) +{ + BamRecordImpl impl; + impl.SetMapped(true).ReferenceId(0).Position(0).MapQuality(0); + 
impl.CigarData(Cigar::FromStdString(cigar)); + impl.MateReferenceId(-1).MatePosition(-1).InsertSize(0); + impl.SetSequenceAndQualities(seq, string(seq.size(), '*')); + impl.SetReverseStrand(strand == Strand::REVERSE); + return impl; +} + +static inline +BamRecord MakeCigaredRecord(const string& seq, + const string& cigar, + const Strand strand) +{ return BamRecord{ MakeCigaredImpl(seq, cigar, strand) }; } + +static +BamRecord MakeCigaredBaseRecord(const string& bases, + const string& cigar, + const Strand strand) +{ + TagCollection tags; + tags["dt"] = bases; + tags["st"] = bases; + + const string seq = string(bases.size(), 'N'); + BamRecordImpl impl = MakeCigaredImpl(seq, cigar, strand); + impl.Tags(tags); + return BamRecord(std::move(impl)); +} + +static +BamRecord MakeCigaredFrameRecord(const vector& frames, + const string& cigar, + const Strand strand) +{ + TagCollection tags; + tags["ip"] = frames; + tags["pw"] = frames; + + const string seq = string(frames.size(), 'N'); + BamRecordImpl impl = MakeCigaredImpl(seq, cigar, strand); + impl.Tags(tags); + return BamRecord(std::move(impl)); +} + +static +BamRecord MakeCigaredQualRecord(const string& quals, + const string& cigar, + const Strand strand) +{ + TagCollection tags; + tags["dq"] = quals; + tags["iq"] = quals; + tags["mq"] = quals; + tags["sq"] = quals; + + const string seq = string(quals.size(), 'N'); + BamRecordImpl impl = MakeCigaredImpl(seq, cigar, strand); + impl.Tags(tags); + return BamRecord(std::move(impl)); +} + +static +BamRecord MakeCigaredPulseBaseRecord(const string& seqBases, + const string& pulseCalls, + const string& pulseBases, + const string& cigar, + const Strand strand) +{ + TagCollection tags; + tags["pc"] = pulseCalls; // PulseCall + tags["pt"] = pulseBases; // AltLabelTag + + BamRecordImpl impl = MakeCigaredImpl(seqBases, cigar, strand); + impl.Tags(tags); + return BamRecord(std::move(impl)); +} + +static +BamRecord MakeCigaredPulseQualRecord(const string& seqBases, + const string& 
pulseCalls, + const string& pulseQuals, + const string& cigar, + const Strand strand) +{ + TagCollection tags; + tags["pc"] = pulseCalls; + tags["pv"] = pulseQuals; // AltLabelQV + tags["pq"] = pulseQuals; // LabelQV + tags["pg"] = pulseQuals; // PulseMergeQV + + BamRecordImpl impl = MakeCigaredImpl(seqBases, cigar, strand); + impl.Tags(tags); + return BamRecord(std::move(impl)); +} + +static +BamRecord MakeCigaredPulseFrameRecord(const string& seqBases, + const string& pulseCalls, + const vector& pulseFrames, + const string& cigar, + const Strand strand) +{ + TagCollection tags; + tags["pc"] = pulseCalls; + tags["pd"] = pulseFrames; // PrePulseFrames + tags["px"] = pulseFrames; // PulseCallWidth + + BamRecordImpl impl = MakeCigaredImpl(seqBases, cigar, strand); + impl.Tags(tags); + return BamRecord(std::move(impl)); +} + +static +BamRecord MakeCigaredPulseUIntRecord(const string& seqBases, + const string& pulseCalls, + const vector& pulseUInts, + const string& cigar, + const Strand strand) +{ + TagCollection tags; + tags["pc"] = pulseCalls; + tags["sf"] = pulseUInts; // StartFrame + + BamRecordImpl impl = MakeCigaredImpl(seqBases, cigar, strand); + impl.Tags(tags); + return BamRecord(std::move(impl)); +} + +// ---------------------------------------------------------- +// helper structs and methods for checking combinations of: +// aligned strand, orientation requested, alignment, clipping +// ---------------------------------------------------------- + +// generic result holder for various requested states +template +struct ExpectedResult +{ +public: + ExpectedResult(std::initializer_list init) + : d_(init) + { + assert(12 == init.size()); + } + + T ForwardGenomic(void) const { return d_.at(0); } + T ForwardNative(void) const { return d_.at(1); } + T ForwardGenomicAligned(void) const { return d_.at(2); } + T ForwardNativeAligned(void) const { return d_.at(3); } + T ForwardGenomicAlignedClipped(void) const { return d_.at(4); } + T ForwardNativeAlignedClipped(void) 
const { return d_.at(5); } + T ReverseGenomic(void) const { return d_.at(6); } + T ReverseNative(void) const { return d_.at(7); } + T ReverseGenomicAligned(void) const { return d_.at(8); } + T ReverseNativeAligned(void) const { return d_.at(9); } + T ReverseGenomicAlignedClipped(void) const { return d_.at(10); } + T ReverseNativeAlignedClipped(void) const { return d_.at(11); } + +private: + vector d_; +}; + +// generic data type checker on the various requested states +template +void CheckAlignAndClip(const string& cigar, + const DataType& input, + const tests::ExpectedResult& e, + const MakeRecordType& makeRecord, + const FetchDataType& fetchData) +{ + { // map to forward strand + const BamRecord b = makeRecord(input, cigar, Strand::FORWARD); + EXPECT_EQ(e.ForwardGenomic(), fetchData(b, Orientation::GENOMIC, false, false)); + EXPECT_EQ(e.ForwardNative(), fetchData(b, Orientation::NATIVE, false, false)); + EXPECT_EQ(e.ForwardGenomicAligned(), fetchData(b, Orientation::GENOMIC, true, false)); + EXPECT_EQ(e.ForwardNativeAligned(), fetchData(b, Orientation::NATIVE, true, false)); + EXPECT_EQ(e.ForwardGenomicAlignedClipped(), fetchData(b, Orientation::GENOMIC, true, true)); + EXPECT_EQ(e.ForwardNativeAlignedClipped(), fetchData(b, Orientation::NATIVE, true, true)); + } + { // map to reverse strand + const BamRecord b = makeRecord(input, cigar, Strand::REVERSE); + EXPECT_EQ(e.ReverseGenomic(), fetchData(b, Orientation::GENOMIC, false, false)); + EXPECT_EQ(e.ReverseNative(), fetchData(b, Orientation::NATIVE, false, false)); + EXPECT_EQ(e.ReverseGenomicAligned(), fetchData(b, Orientation::GENOMIC, true, false)); + EXPECT_EQ(e.ReverseNativeAligned(), fetchData(b, Orientation::NATIVE, true, false)); + EXPECT_EQ(e.ReverseGenomicAlignedClipped(), fetchData(b, Orientation::GENOMIC, true, true)); + EXPECT_EQ(e.ReverseNativeAlignedClipped(), fetchData(b, Orientation::NATIVE, true, true)); + } +} + +template +void CheckPulseDataAlignAndClip(const string& cigar, + const string& 
seqBases, + const string& pulseCalls, + const DataType& input, + const tests::ExpectedResult& allPulses, + const tests::ExpectedResult& basecallsOnly, + const MakeRecordType& makeRecord, + const FetchDataType& fetchData) +{ + { // map to forward strand + const BamRecord b = makeRecord(seqBases, pulseCalls, input, cigar, Strand::FORWARD); + + EXPECT_EQ(allPulses.ForwardGenomic(), fetchData(b, Orientation::GENOMIC, false, false, PulseBehavior::ALL)); + EXPECT_EQ(allPulses.ForwardNative(), fetchData(b, Orientation::NATIVE, false, false, PulseBehavior::ALL)); + // no align/clipping operations available on ALL pulses + + EXPECT_EQ(basecallsOnly.ForwardGenomic(), fetchData(b, Orientation::GENOMIC, false, false, PulseBehavior::BASECALLS_ONLY)); + EXPECT_EQ(basecallsOnly.ForwardNative(), fetchData(b, Orientation::NATIVE, false, false, PulseBehavior::BASECALLS_ONLY)); + EXPECT_EQ(basecallsOnly.ForwardGenomicAligned(), fetchData(b, Orientation::GENOMIC, true, false, PulseBehavior::BASECALLS_ONLY)); + EXPECT_EQ(basecallsOnly.ForwardNativeAligned(), fetchData(b, Orientation::NATIVE, true, false, PulseBehavior::BASECALLS_ONLY)); + EXPECT_EQ(basecallsOnly.ForwardGenomicAlignedClipped(), fetchData(b, Orientation::GENOMIC, true, true, PulseBehavior::BASECALLS_ONLY)); + EXPECT_EQ(basecallsOnly.ForwardNativeAlignedClipped(), fetchData(b, Orientation::NATIVE, true, true, PulseBehavior::BASECALLS_ONLY)); + } + { // map to reverse strand + const BamRecord b = makeRecord(seqBases, pulseCalls, input, cigar, Strand::REVERSE); + + EXPECT_EQ(allPulses.ReverseGenomic(), fetchData(b, Orientation::GENOMIC, false, false, PulseBehavior::ALL)); + EXPECT_EQ(allPulses.ReverseNative(), fetchData(b, Orientation::NATIVE, false, false, PulseBehavior::ALL)); + // no align/clipping operations available on ALL pulses + + EXPECT_EQ(basecallsOnly.ReverseGenomic(), fetchData(b, Orientation::GENOMIC, false, false, PulseBehavior::BASECALLS_ONLY)); + EXPECT_EQ(basecallsOnly.ReverseNative(), fetchData(b, 
Orientation::NATIVE, false, false, PulseBehavior::BASECALLS_ONLY)); + EXPECT_EQ(basecallsOnly.ReverseGenomicAligned(), fetchData(b, Orientation::GENOMIC, true, false, PulseBehavior::BASECALLS_ONLY)); + EXPECT_EQ(basecallsOnly.ReverseNativeAligned(), fetchData(b, Orientation::NATIVE, true, false, PulseBehavior::BASECALLS_ONLY)); + EXPECT_EQ(basecallsOnly.ReverseGenomicAlignedClipped(), fetchData(b, Orientation::GENOMIC, true, true, PulseBehavior::BASECALLS_ONLY)); + EXPECT_EQ(basecallsOnly.ReverseNativeAlignedClipped(), fetchData(b, Orientation::NATIVE, true, true, PulseBehavior::BASECALLS_ONLY)); + } +} + +static +void CheckBaseTagsClippedAndAligned(const string& cigar, + const string& input, + const ExpectedResult& e) +{ + // aligned record + DeletionTag, SubstitutionTag + auto makeRecord = [](const string& bases, + const string& cigar, + const Strand strand) + { return MakeCigaredBaseRecord(bases, cigar, strand); }; + + // DeletionTag + CheckAlignAndClip(cigar, input, e, makeRecord, + [](const BamRecord& b, + Orientation orientation, + bool aligned, + bool exciseSoftClips) + { return b.DeletionTag(orientation, aligned, exciseSoftClips); } + ); + + // SubstitutionTag + CheckAlignAndClip(cigar, input, e, makeRecord, + [](const BamRecord& b, + Orientation orientation, + bool aligned, + bool exciseSoftClips) + { return b.SubstitutionTag(orientation, aligned, exciseSoftClips); } + ); +} + +static +void CheckFrameTagsClippedAndAligned(const string& cigar, + const vector& input, + const ExpectedResult >& e) +{ + + // aligned record + IPD, PulseWidth + auto makeRecord = [](const vector& frames, + const string& cigar, + const Strand strand) + { return tests::MakeCigaredFrameRecord(frames, cigar, strand); }; + + // IPD + CheckAlignAndClip(cigar, input, e, makeRecord, + [](const BamRecord& b, + Orientation orientation, + bool aligned, + bool exciseSoftClips) + { return b.IPD(orientation, aligned, exciseSoftClips).Data(); } + ); + + // PulseWidth + CheckAlignAndClip(cigar, 
input, e, makeRecord, + [](const BamRecord& b, + Orientation orientation, + bool aligned, + bool exciseSoftClips) + { return b.PulseWidth(orientation, aligned, exciseSoftClips).Data(); } + ); +} + +static +void CheckQualityTagsClippedAndAligned(const string& cigar, + const string& input, + const ExpectedResult& e) +{ + // aligned record + DeletionQV, InsertionQV, MergeQV, SubstitutionQV + auto makeRecord = [](const string& quals, + const string& cigar, + const Strand strand) + { return tests::MakeCigaredQualRecord(quals, cigar, strand); }; + + // DeletionQV + CheckAlignAndClip(cigar, input, e, makeRecord, + [](const BamRecord& b, + Orientation orientation, + bool aligned, + bool exciseSoftClips) + { return b.DeletionQV(orientation, aligned, exciseSoftClips).Fastq(); } + ); + + // InsertionQV + CheckAlignAndClip(cigar, input, e, makeRecord, + [](const BamRecord& b, + Orientation orientation, + bool aligned, + bool exciseSoftClips) + { return b.InsertionQV(orientation, aligned, exciseSoftClips).Fastq(); } + ); + + // MergeQV + CheckAlignAndClip(cigar, input, e, makeRecord, + [](const BamRecord& b, + Orientation orientation, + bool aligned, + bool exciseSoftClips) + { return b.MergeQV(orientation, aligned, exciseSoftClips).Fastq(); } + ); + + // SubstitutionQV + CheckAlignAndClip(cigar, input, e, makeRecord, + [](const BamRecord& b, + Orientation orientation, + bool aligned, + bool exciseSoftClips) + { return b.SubstitutionQV(orientation, aligned, exciseSoftClips).Fastq(); } + ); +} + +static +void CheckQualitiesClippedAndAligned(const string& cigar, + const string& input, + const ExpectedResult& e) +{ + // aligned record w/ dummy SEQ & QUALs under test + auto makeRecord = [](const string& quals, + const string& cigar, + const Strand strand) + { + const string seq = string(quals.size(), 'N'); + auto record = tests::MakeCigaredRecord(seq, cigar, strand); + record.Impl().SetSequenceAndQualities(seq, quals); + return record; + }; + + // QUAL + CheckAlignAndClip(cigar, 
input, e, makeRecord, + [](const BamRecord& b, + Orientation orientation, + bool aligned, + bool exciseSoftClips) + { return b.Qualities(orientation, aligned, exciseSoftClips).Fastq(); } + ); +} + +static +void CheckSequenceClippedAndAligned(const string& cigar, + const string& input, + const ExpectedResult& e) +{ + // aligned record w/ SEQ + auto makeRecord = [](const string& seq, + const string& cigar, + const Strand strand) + { return tests::MakeCigaredRecord(seq, cigar, strand); }; + + // SEQ + CheckAlignAndClip(cigar, input, e, makeRecord, + [](const BamRecord& b, + Orientation orientation, + bool aligned, + bool exciseSoftClips) + { return b.Sequence(orientation, aligned, exciseSoftClips); } + ); +} + +static +void CheckPulseBaseTags(const string& cigar, + const string& seqBases, + const string& pulseCalls, + const string& pulseBases, + const ExpectedResult& allPulses, + const ExpectedResult& basecallsOnly) +{ + // aligned record + AltLabelTag + auto makeRecord = [](const string& seqBases, + const string& pulseCalls, + const string& pulseBases, + const string& cigar, + const Strand strand) + { return MakeCigaredPulseBaseRecord(seqBases, pulseCalls, pulseBases, cigar, strand); }; + + // AltLabelTag + CheckPulseDataAlignAndClip(cigar, seqBases, pulseCalls, pulseBases, allPulses, basecallsOnly, makeRecord, + [](const BamRecord& b, + Orientation orientation, + bool aligned, + bool exciseSoftClips, + PulseBehavior pulseBehavior) + { return b.AltLabelTag(orientation, aligned, exciseSoftClips, pulseBehavior); } + ); + // PulseCall + CheckPulseDataAlignAndClip(cigar, seqBases, pulseCalls, pulseBases, allPulses, basecallsOnly, makeRecord, + [](const BamRecord& b, + Orientation orientation, + bool aligned, + bool exciseSoftClips, + PulseBehavior pulseBehavior) + { return b.PulseCall(orientation, aligned, exciseSoftClips, pulseBehavior); } + ); +} + +static +void CheckPulseFrameTags(const string& cigar, + const string& seqBases, + const string& pulseCalls, + const 
vector& pulseFrames, + const ExpectedResult>& allPulses, + const ExpectedResult>& basecallsOnly) +{ + // aligned record + PrePulseFrames + auto makeRecord = [](const string& seqBases, + const string& pulseCalls, + const vector& pulseFrames, + const string& cigar, + const Strand strand) + { return MakeCigaredPulseFrameRecord(seqBases, pulseCalls, pulseFrames, cigar, strand); }; + + // PrePulseFrame + CheckPulseDataAlignAndClip(cigar, seqBases, pulseCalls, pulseFrames, allPulses, basecallsOnly, makeRecord, + [](const BamRecord& b, + Orientation orientation, + bool aligned, + bool exciseSoftClips, + PulseBehavior pulseBehavior) + { return b.PrePulseFrames(orientation, aligned, exciseSoftClips, pulseBehavior).Data(); } + ); + // PulseCallWidth + CheckPulseDataAlignAndClip(cigar, seqBases, pulseCalls, pulseFrames, allPulses, basecallsOnly, makeRecord, + [](const BamRecord& b, + Orientation orientation, + bool aligned, + bool exciseSoftClips, + PulseBehavior pulseBehavior) + { return b.PulseCallWidth(orientation, aligned, exciseSoftClips, pulseBehavior).Data(); } + ); +} + +/* + + { BamRecordTag::PKMEAN, {"pa", true} }, photons (vector + { BamRecordTag::PKMEAN_2, {"ps", true} }, photons + { BamRecordTag::PKMID, {"pm", true} }, photons + { BamRecordTag::PKMID_2, {"pi", true} }, photons +*/ + +static +void CheckPulseQualityTags(const string& cigar, + const string& seqBases, + const string& pulseCalls, + const string& pulseQuals, + const ExpectedResult& allPulses, + const ExpectedResult& basecallsOnly) +{ + // aligned record + AltLabelQV + auto makeRecord = [](const string& seqBases, + const string& pulseCalls, + const string& pulseQuals, + const string& cigar, + const Strand strand) + { return MakeCigaredPulseQualRecord(seqBases, pulseCalls, pulseQuals, cigar, strand); }; + + // AltLabelQV + CheckPulseDataAlignAndClip(cigar, seqBases, pulseCalls, pulseQuals, allPulses, basecallsOnly, makeRecord, + [](const BamRecord& b, + Orientation orientation, + bool aligned, + bool 
exciseSoftClips, + PulseBehavior pulseBehavior) + { return b.AltLabelQV(orientation, aligned, exciseSoftClips, pulseBehavior).Fastq(); } + ); + // LabelQV + CheckPulseDataAlignAndClip(cigar, seqBases, pulseCalls, pulseQuals, allPulses, basecallsOnly, makeRecord, + [](const BamRecord& b, + Orientation orientation, + bool aligned, + bool exciseSoftClips, + PulseBehavior pulseBehavior) + { return b.LabelQV(orientation, aligned, exciseSoftClips, pulseBehavior).Fastq(); } + ); + // PulseMergeQV + CheckPulseDataAlignAndClip(cigar, seqBases, pulseCalls, pulseQuals, allPulses, basecallsOnly, makeRecord, + [](const BamRecord& b, + Orientation orientation, + bool aligned, + bool exciseSoftClips, + PulseBehavior pulseBehavior) + { return b.PulseMergeQV(orientation, aligned, exciseSoftClips, pulseBehavior).Fastq(); } + ); +} + +static +void CheckPulseUIntTags(const string& cigar, + const string& seqBases, + const string& pulseCalls, + const vector& startFrames, + const ExpectedResult>& allPulses, + const ExpectedResult>& basecallsOnly) +{ + // aligned record + StartFrame + auto makeRecord = [](const string& seqBases, + const string& pulseCalls, + const vector& startFrames, + const string& cigar, + const Strand strand) + { return MakeCigaredPulseUIntRecord(seqBases, pulseCalls, startFrames, cigar, strand); }; + + // StartFrame + CheckPulseDataAlignAndClip(cigar, seqBases, pulseCalls, startFrames, allPulses, basecallsOnly, makeRecord, + [](const BamRecord& b, + Orientation orientation, + bool aligned, + bool exciseSoftClips, + PulseBehavior pulseBehavior) + { return b.StartFrame(orientation, aligned, exciseSoftClips, pulseBehavior); } + ); +} + + + +} // namespace tests + +TEST(BamRecordTest, DefaultValues) +{ + BamRecord bam; + const string emptyString; + + // BamRecordImpl data + EXPECT_EQ(0, bam.impl_.Bin()); + EXPECT_EQ(BamRecordImpl::UNMAPPED, bam.impl_.Flag()); // forced init unmapped + EXPECT_EQ(0, bam.impl_.InsertSize()); + EXPECT_EQ(255, bam.impl_.MapQuality()); + 
EXPECT_EQ(-1, bam.impl_.MateReferenceId()); + EXPECT_EQ(-1, bam.impl_.MatePosition()); + EXPECT_EQ(-1, bam.impl_.Position()); + EXPECT_EQ(-1, bam.impl_.ReferenceId()); + EXPECT_EQ(0, bam.impl_.Tags().size()); + + EXPECT_FALSE(bam.impl_.IsDuplicate()); + EXPECT_FALSE(bam.impl_.IsFailedQC()); + EXPECT_FALSE(bam.impl_.IsFirstMate()); + EXPECT_FALSE(bam.impl_.IsMapped()); // forced init unmapped + EXPECT_TRUE(bam.impl_.IsMateMapped()); + EXPECT_FALSE(bam.impl_.IsMateReverseStrand()); + EXPECT_FALSE(bam.impl_.IsPaired()); + EXPECT_TRUE(bam.impl_.IsPrimaryAlignment()); + EXPECT_FALSE(bam.impl_.IsProperPair()); + EXPECT_FALSE(bam.impl_.IsReverseStrand()); + EXPECT_FALSE(bam.impl_.IsSecondMate()); + EXPECT_FALSE(bam.impl_.IsSupplementaryAlignment()); + + EXPECT_EQ(emptyString, bam.impl_.Name()); + EXPECT_EQ(emptyString, bam.impl_.CigarData().ToStdString()); + EXPECT_EQ(emptyString, bam.impl_.Sequence()); + EXPECT_EQ(emptyString, bam.impl_.Qualities().Fastq()); + + // PacBio data + EXPECT_EQ(-1, bam.AlignedStart()); + EXPECT_EQ(-1, bam.AlignedEnd()); + + EXPECT_FALSE(bam.HasHoleNumber()); + EXPECT_FALSE(bam.HasNumPasses()); + EXPECT_FALSE(bam.HasQueryEnd()); + EXPECT_FALSE(bam.HasQueryStart()); + EXPECT_FALSE(bam.HasReadAccuracy()); + + EXPECT_THROW(bam.HoleNumber(), std::exception); + EXPECT_THROW(bam.NumPasses(), std::exception); + EXPECT_EQ(Position(0), bam.QueryEnd()); + EXPECT_EQ(Position(0), bam.QueryStart()); + EXPECT_THROW(bam.ReadAccuracy(), std::exception); + + EXPECT_FALSE(bam.HasDeletionQV()); + EXPECT_FALSE(bam.HasDeletionTag()); + EXPECT_FALSE(bam.HasInsertionQV()); + EXPECT_FALSE(bam.HasMergeQV()); + EXPECT_FALSE(bam.HasSubstitutionQV()); + EXPECT_FALSE(bam.HasSubstitutionTag()); + + EXPECT_THROW(bam.DeletionQV(), std::exception); + EXPECT_THROW(bam.DeletionTag(), std::exception); + EXPECT_THROW(bam.InsertionQV(), std::exception); + EXPECT_THROW(bam.MergeQV(), std::exception); + EXPECT_THROW(bam.SubstitutionQV(), std::exception); + 
EXPECT_THROW(bam.SubstitutionTag(), std::exception); + + // raw data + tests::CheckRawData(bam); +} + +TEST(BamRecordTest, FromBamRecordImpl) +{ + // check generic data + BamRecordImpl genericBam = tests::CreateBamImpl(); + + EXPECT_EQ(42, genericBam.Bin()); + EXPECT_EQ(42, genericBam.Flag()); + EXPECT_EQ(42, genericBam.InsertSize()); + EXPECT_EQ(42, genericBam.MapQuality()); + EXPECT_EQ(42, genericBam.MateReferenceId()); + EXPECT_EQ(42, genericBam.MatePosition()); + EXPECT_EQ(42, genericBam.Position()); + EXPECT_EQ(42, genericBam.ReferenceId()); + + const TagCollection& genericTags = genericBam.Tags(); + EXPECT_TRUE(genericTags.at("HX").HasModifier(TagModifier::HEX_STRING)); + EXPECT_EQ(string("1abc75"), genericTags.at("HX").ToString()); + EXPECT_EQ(static_cast(-42), genericTags.at("XY").ToInt32()); + EXPECT_EQ(vector({34, 5, 125}), genericTags.at("CA").ToUInt8Array()); + + // copy ctor + BamRecord bam1(genericBam); + + EXPECT_EQ(42, bam1.impl_.Bin()); + EXPECT_EQ(42, bam1.impl_.Flag()); + EXPECT_EQ(42, bam1.impl_.InsertSize()); + EXPECT_EQ(42, bam1.impl_.MapQuality()); + EXPECT_EQ(42, bam1.impl_.MateReferenceId()); + EXPECT_EQ(42, bam1.impl_.MatePosition()); + EXPECT_EQ(42, bam1.impl_.Position()); + EXPECT_EQ(42, bam1.impl_.ReferenceId()); + + const TagCollection& bam1Tags = bam1.impl_.Tags(); + EXPECT_TRUE(bam1Tags.at("HX").HasModifier(TagModifier::HEX_STRING)); + EXPECT_EQ(string("1abc75"), bam1Tags.at("HX").ToString()); + EXPECT_EQ(static_cast(-42), bam1Tags.at("XY").ToInt32()); + EXPECT_EQ(vector({34, 5, 125}), bam1Tags.at("CA").ToUInt8Array()); + + // copy assignment + BamRecord bam2; + bam2 = genericBam; + + EXPECT_EQ(42, bam2.impl_.Bin()); + EXPECT_EQ(42, bam2.impl_.Flag()); + EXPECT_EQ(42, bam2.impl_.InsertSize()); + EXPECT_EQ(42, bam2.impl_.MapQuality()); + EXPECT_EQ(42, bam2.impl_.MateReferenceId()); + EXPECT_EQ(42, bam2.impl_.MatePosition()); + EXPECT_EQ(42, bam2.impl_.Position()); + EXPECT_EQ(42, bam2.impl_.ReferenceId()); + + const TagCollection& 
bam2Tags = bam2.impl_.Tags(); + EXPECT_TRUE(bam2Tags.at("HX").HasModifier(TagModifier::HEX_STRING)); + EXPECT_EQ(string("1abc75"), bam2Tags.at("HX").ToString()); + EXPECT_EQ(static_cast(-42), bam2Tags.at("XY").ToInt32()); + EXPECT_EQ(vector({34, 5, 125}), bam2Tags.at("CA").ToUInt8Array()); + + // change genericBam, make sure we deep copied bam1 & bam2 + genericBam.Position(2000); + + EXPECT_EQ(2000, genericBam.Position()); + EXPECT_EQ(42, bam1.impl_.Position()); + EXPECT_EQ(42, bam2.impl_.Position()); + + // move ctor +#ifdef __clang__ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wpessimizing-move" +#endif + BamRecord bam3(move(tests::CreateBamImpl())); +#ifdef __clang__ +#pragma clang diagnostic pop +#endif + + EXPECT_EQ(42, bam3.impl_.Bin()); + EXPECT_EQ(42, bam3.impl_.Flag()); + EXPECT_EQ(42, bam3.impl_.InsertSize()); + EXPECT_EQ(42, bam3.impl_.MapQuality()); + EXPECT_EQ(42, bam3.impl_.MateReferenceId()); + EXPECT_EQ(42, bam3.impl_.MatePosition()); + EXPECT_EQ(42, bam3.impl_.Position()); + EXPECT_EQ(42, bam3.impl_.ReferenceId()); + + const TagCollection& bam3Tags = bam3.impl_.Tags(); + EXPECT_TRUE(bam3Tags.at("HX").HasModifier(TagModifier::HEX_STRING)); + EXPECT_EQ(string("1abc75"), bam3Tags.at("HX").ToString()); + EXPECT_EQ(static_cast(-42), bam3Tags.at("XY").ToInt32()); + EXPECT_EQ(vector({34, 5, 125}), bam3Tags.at("CA").ToUInt8Array()); + + // move assignment + BamRecord bam4; +#ifdef __clang__ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wpessimizing-move" +#endif + bam4 = move(tests::CreateBamImpl()); +#ifdef __clang__ +#pragma clang diagnostic pop +#endif + + EXPECT_EQ(42, bam4.impl_.Bin()); + EXPECT_EQ(42, bam4.impl_.Flag()); + EXPECT_EQ(42, bam4.impl_.InsertSize()); + EXPECT_EQ(42, bam4.impl_.MapQuality()); + EXPECT_EQ(42, bam4.impl_.MateReferenceId()); + EXPECT_EQ(42, bam4.impl_.MatePosition()); + EXPECT_EQ(42, bam4.impl_.Position()); + EXPECT_EQ(42, bam4.impl_.ReferenceId()); + + const TagCollection& bam4Tags 
= bam4.impl_.Tags(); + EXPECT_TRUE(bam4Tags.at("HX").HasModifier(TagModifier::HEX_STRING)); + EXPECT_EQ(string("1abc75"), bam4Tags.at("HX").ToString()); + EXPECT_EQ(static_cast(-42), bam4Tags.at("XY").ToInt32()); + EXPECT_EQ(vector({34, 5, 125}), bam4Tags.at("CA").ToUInt8Array()); +} + +TEST(BamRecordTest, SelfAssignmentTolerated) +{ + BamRecord bam1; + bam1.impl_.Bin(42); + bam1.impl_.Flag(42); + bam1.impl_.InsertSize(42); + bam1.impl_.MapQuality(42); + bam1.impl_.MatePosition(42); + bam1.impl_.MateReferenceId(42); + bam1.impl_.Position(42); + bam1.impl_.ReferenceId(42); + + TagCollection tags; + tags["HX"] = string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = vector({34, 5, 125}); + tags["XY"] = static_cast(-42); + bam1.impl_.Tags(tags); + + bam1 = bam1; + + EXPECT_EQ(42, bam1.impl_.Bin()); + EXPECT_EQ(42, bam1.impl_.Flag()); + EXPECT_EQ(42, bam1.impl_.InsertSize()); + EXPECT_EQ(42, bam1.impl_.MapQuality()); + EXPECT_EQ(42, bam1.impl_.MateReferenceId()); + EXPECT_EQ(42, bam1.impl_.MatePosition()); + EXPECT_EQ(42, bam1.impl_.Position()); + EXPECT_EQ(42, bam1.impl_.ReferenceId()); + + const TagCollection& fetchedTags1 = bam1.impl_.Tags(); + EXPECT_TRUE(fetchedTags1.at("HX").HasModifier(TagModifier::HEX_STRING)); + EXPECT_EQ(string("1abc75"), fetchedTags1.at("HX").ToString()); + EXPECT_EQ(static_cast(-42), fetchedTags1.at("XY").ToInt32()); + EXPECT_EQ(vector({34, 5, 125}), fetchedTags1.at("CA").ToUInt8Array()); + + tests::CheckRawData(bam1); +} + +TEST(BamRecordTest, CoreSetters) +{ + // create basic BAM with (generic) data + BamRecord bam = tests::CreateBam(); + + QualityValues testQVs; + testQVs.push_back(0); + testQVs.push_back(1); + + const string testTags = "GATTACA"; + + // now set PacBio data +// bam.AlignedStart(42); +// bam.AlignedEnd(42); +// bam.DeletionQVs(testQVs); +// bam.DeletionTags(testTags); +// bam.HoleNumber(42); +// bam.InsertionQVs(testQVs); +// bam.MergeQVs(testQVs); +// bam.NumPasses(42); +// bam.QueryEnd(42); +// 
bam.QueryStart(42); +// bam.ReadAccuracy(42); +// bam.ReferenceEnd(42); +// bam.ReferenceStart(42); +// bam.SubstitutionQVs(testQVs); +// bam.SubstitutionTags(testTags); + + // check generic data + EXPECT_EQ(42, bam.impl_.Bin()); + EXPECT_EQ(42, bam.impl_.Flag()); + EXPECT_EQ(42, bam.impl_.InsertSize()); + EXPECT_EQ(42, bam.impl_.MapQuality()); + EXPECT_EQ(42, bam.impl_.MateReferenceId()); + EXPECT_EQ(42, bam.impl_.MatePosition()); + EXPECT_EQ(42, bam.impl_.Position()); + EXPECT_EQ(42, bam.impl_.ReferenceId()); + + // check PacBio data +// EXPECT_EQ(42, bam.AlignedStart()); +// EXPECT_EQ(42, bam.AlignedEnd()); +// EXPECT_EQ(testQVs, bam.DeletionQVs()); +// EXPECT_EQ(testTags, bam.DeletionTags()); +// EXPECT_EQ(42, bam.HoleNumber()); +// EXPECT_EQ(testQVs, bam.InsertionQVs()); +// EXPECT_EQ(testQVs, bam.MergeQVs()); + +// EXPECT_EQ(42, bam.NumPasses()); +// EXPECT_EQ(42, bam.QueryEnd()); +// EXPECT_EQ(42, bam.QueryStart()); +// EXPECT_EQ(42, bam.ReadAccuracy()); +// EXPECT_EQ(42, bam.ReferenceEnd()); +// EXPECT_EQ(42, bam.ReferenceStart()); +// EXPECT_EQ(testQVs, bam.SubstitutionQVs()); +// EXPECT_EQ(testTags, bam.SubstitutionTags()); + + // check tags + const TagCollection& fetchedTags = bam.impl_.Tags(); + EXPECT_TRUE(fetchedTags.at("HX").HasModifier(TagModifier::HEX_STRING)); + EXPECT_EQ(string("1abc75"), fetchedTags.at("HX").ToString()); + EXPECT_EQ(static_cast(-42), fetchedTags.at("XY").ToInt32()); + EXPECT_EQ(vector({34, 5, 125}), fetchedTags.at("CA").ToUInt8Array()); + + tests::CheckRawData(bam); +} + +TEST(BamRecordTest, SequenceOrientation) +{ + { + SCOPED_TRACE("Simple CIGAR Sequence"); + tests::CheckSequenceClippedAndAligned( + "13=", // CIGAR + "ATATATCCCGGCG", // input + { + "ATATATCCCGGCG", // forward strand, genomic + "ATATATCCCGGCG", // forward strand, native + "ATATATCCCGGCG", // forward strand, genomic, aligned + "ATATATCCCGGCG", // forward strand, native, aligned + "ATATATCCCGGCG", // forward strand, genomic, aligned + clipped + "ATATATCCCGGCG", 
// forward strand, native, aligned + clipped + "ATATATCCCGGCG", // reverse strand, genomic + "CGCCGGGATATAT", // reverse strand, native + "ATATATCCCGGCG", // reverse strand, genomic, aligned + "CGCCGGGATATAT", // reverse strand, native, aligned + "ATATATCCCGGCG", // reverse strand, genomic, aligned + clipped + "CGCCGGGATATAT" // reverse strand, native, aligned + clipped + } + ); + } +} + +TEST(BamRecordTest, QualitiesOrientation) +{ + { + SCOPED_TRACE("Simple CIGAR Qualities"); + tests::CheckQualitiesClippedAndAligned( + "13=", // CIGAR + "?]?]?]?]?]?]*", // input + { + "?]?]?]?]?]?]*", // forward strand, genomic + "?]?]?]?]?]?]*", // forward strand, native + "?]?]?]?]?]?]*", // forward strand, genomic, aligned + "?]?]?]?]?]?]*", // forward strand, native, aligned + "?]?]?]?]?]?]*", // forward strand, genomic, aligned + clipped + "?]?]?]?]?]?]*", // forward strand, native, aligned + clipped + "?]?]?]?]?]?]*", // reverse strand, genomic + "*]?]?]?]?]?]?", // reverse strand, native + "?]?]?]?]?]?]*", // reverse strand, genomic, aligned + "*]?]?]?]?]?]?", // reverse strand, native, aligned + "?]?]?]?]?]?]*", // reverse strand, genomic, aligned + clipped + "*]?]?]?]?]?]?" 
// reverse strand, native, aligned + clipped + } + ); + } +} + +TEST(BamRecordTest, SequenceTagsOrientation) +{ + { + SCOPED_TRACE("Simple CIGAR Base Tags"); + tests::CheckBaseTagsClippedAndAligned( + "13=", // CIGAR + "ATATATCCCGGCG", // input + { + "ATATATCCCGGCG", // forward strand, genomic + "ATATATCCCGGCG", // forward strand, native + "ATATATCCCGGCG", // forward strand, genomic, aligned + "ATATATCCCGGCG", // forward strand, native, aligned + "ATATATCCCGGCG", // forward strand, genomic, aligned, clipped + "ATATATCCCGGCG", // forward strand, native, aligned, clipped + "CGCCGGGATATAT", // reverse strand, genomic + "ATATATCCCGGCG", // reverse strand, native + "CGCCGGGATATAT", // reverse strand, genomic, aligned + "ATATATCCCGGCG", // reverse strand, native, aligned + "CGCCGGGATATAT", // reverse strand, genomic, aligned, clipped + "ATATATCCCGGCG" // reverse strand, native, aligned, clipped + } + ); + } +} + +TEST(BamRecordTest, FrameTagsOrientation) +{ + { + SCOPED_TRACE("Simple CIGAR Frames"); + tests::CheckFrameTagsClippedAndAligned( + "5=", // CIGAR + { 0, 1, 2, 3, 4 }, // input + { + { 0, 1, 2, 3, 4 }, // forward strand, genomic + { 0, 1, 2, 3, 4 }, // forward strand, native + { 0, 1, 2, 3, 4 }, // forward strand, genomic, aligned + { 0, 1, 2, 3, 4 }, // forward strand, native, aligned + { 0, 1, 2, 3, 4 }, // forward strand, genomic, aligned, clipped + { 0, 1, 2, 3, 4 }, // forward strand, native, aligned, clipped + { 4, 3, 2, 1, 0 }, // reverse strand, genomic + { 0, 1, 2, 3, 4 }, // reverse strand, native + { 4, 3, 2, 1, 0 }, // reverse strand, genomic, aligned + { 0, 1, 2, 3, 4 }, // reverse strand, native, aligned + { 4, 3, 2, 1, 0 }, // reverse strand, genomic, aligned, clipped + { 0, 1, 2, 3, 4 } // reverse strand, native, aligned, clipped + } + ); + } +} + +TEST(BamRecordTest, QualityTagsOrientation) +{ + { + SCOPED_TRACE("Simple CIGAR Quality Tags"); + tests::CheckQualityTagsClippedAndAligned( + "13=", // CIGAR + "?]?]?]?]?]?]*", // input + { + 
"?]?]?]?]?]?]*", // forward strand, genomic + "?]?]?]?]?]?]*", // forward strand, native + "?]?]?]?]?]?]*", // forward strand, genomic, aligned + "?]?]?]?]?]?]*", // forward strand, native, aligned + "?]?]?]?]?]?]*", // forward strand, genomic, aligned + clipped + "?]?]?]?]?]?]*", // forward strand, native, aligned + clipped + "*]?]?]?]?]?]?", // reverse strand, genomic + "?]?]?]?]?]?]*", // reverse strand, native + "*]?]?]?]?]?]?", // reverse strand, genomic, aligned + "?]?]?]?]?]?]*", // reverse strand, native, aligned + "*]?]?]?]?]?]?", // reverse strand, genomic, aligned + clipped + "?]?]?]?]?]?]*" // reverse strand, native, aligned + clipped + } + ); + } +} + +TEST(BamRecordTest, SequenceClippedAndAligned) +{ + { + SCOPED_TRACE("CIGAR: 10="); + tests::CheckSequenceClippedAndAligned( + "10=", // CIGAR + "ATCCGCGGTT", // input + { + "ATCCGCGGTT", // forward strand, genomic + "ATCCGCGGTT", // forward strand, native + "ATCCGCGGTT", // forward strand, genomic, aligned + "ATCCGCGGTT", // forward strand, native, aligned + "ATCCGCGGTT", // forward strand, genomic, aligned + clipped + "ATCCGCGGTT", // forward strand, native, aligned + clipped + "ATCCGCGGTT", // reverse strand, genomic + "AACCGCGGAT", // reverse strand, native + "ATCCGCGGTT", // reverse strand, genomic, aligned + "AACCGCGGAT", // reverse strand, native, aligned + "ATCCGCGGTT", // reverse strand, genomic, aligned + clipped + "AACCGCGGAT" // reverse strand, native, aligned + clipped + } + ); + } + { + SCOPED_TRACE("CIGAR: 3=4N3="); + tests::CheckSequenceClippedAndAligned( + "3=4N3=", // CIGAR + "ACGTTT", // input + { + "ACGTTT", // forward strand, genomic + "ACGTTT", // forward strand, native + "ACGTTT", // forward strand, genomic, aligned + "ACGTTT", // forward strand, native, aligned + "ACGTTT", // forward strand, genomic, aligned + clipped + "ACGTTT", // forward strand, native, aligned + clipped + "ACGTTT", // reverse strand, genomic + "AAACGT", // reverse strand, native + "ACGTTT", // reverse strand, 
genomic, aligned + "AAACGT", // reverse strand, native, aligned + "ACGTTT", // reverse strand, genomic, aligned + clipped + "AAACGT" // reverse strand, native, aligned + clipped + } + ); + } + { + SCOPED_TRACE("CIGAR: 1S8=1S"); + tests::CheckSequenceClippedAndAligned( + "1S8=1S", // CIGAR + "ACCCGCGGTT", // input + { + "ACCCGCGGTT", // forward strand, genomic + "ACCCGCGGTT", // forward strand, native + "ACCCGCGGTT", // forward strand, genomic, aligned + "ACCCGCGGTT", // forward strand, native, aligned + "CCCGCGGT", // forward strand, genomic, aligned + clipped + "CCCGCGGT", // forward strand, native, aligned + clipped + "ACCCGCGGTT", // reverse strand, genomic + "AACCGCGGGT", // reverse strand, native + "ACCCGCGGTT", // reverse strand, genomic, aligned + "AACCGCGGGT", // reverse strand, native, aligned + "CCCGCGGT", // reverse strand, genomic, aligned + clipped + "ACCGCGGG" // reverse strand, native, aligned + clipped + } + ); + } + { + SCOPED_TRACE("CIGAR: 1H8=1H"); + tests::CheckSequenceClippedAndAligned( + "1H8=1H", // CIGAR + "ATCGCGGT", // input + { + "ATCGCGGT", // forward strand, genomic + "ATCGCGGT", // forward strand, native + "ATCGCGGT", // forward strand, genomic, aligned + "ATCGCGGT", // forward strand, native, aligned + "ATCGCGGT", // forward strand, genomic, aligned + clipped + "ATCGCGGT", // forward strand, native, aligned + clipped + "ATCGCGGT", // reverse strand, genomic + "ACCGCGAT", // reverse strand, native + "ATCGCGGT", // reverse strand, genomic, aligned + "ACCGCGAT", // reverse strand, native, aligned + "ATCGCGGT", // reverse strand, genomic, aligned + clipped + "ACCGCGAT" // reverse strand, native, aligned + clipped + } + ); + } + { + SCOPED_TRACE("CIGAR: 2S6=2S"); + tests::CheckSequenceClippedAndAligned( + "2S6=2S", // CIGAR + "AGCCGCGGTT", // input + { + "AGCCGCGGTT", // forward strand, genomic + "AGCCGCGGTT", // forward strand, native + "AGCCGCGGTT", // forward strand, genomic, aligned + "AGCCGCGGTT", // forward strand, native, aligned + 
"CCGCGG", // forward strand, genomic, aligned + clipped + "CCGCGG", // forward strand, native, aligned + clipped + "AGCCGCGGTT", // reverse strand, genomic + "AACCGCGGCT", // reverse strand, native + "AGCCGCGGTT", // reverse strand, genomic, aligned + "AACCGCGGCT", // reverse strand, native, aligned + "CCGCGG", // reverse strand, genomic, aligned + clipped + "CCGCGG" // reverse strand, native, aligned + clipped + } + ); + } + { + SCOPED_TRACE("CIGAR: 2S3=2I3=2S"); + tests::CheckSequenceClippedAndAligned( + "2S3=2I3=2S", // CIGAR + "ATCCGNNCGGTT", // input + { + "ATCCGNNCGGTT", // forward strand, genomic + "ATCCGNNCGGTT", // forward strand, native + "ATCCGNNCGGTT", // forward strand, genomic, aligned + "ATCCGNNCGGTT", // forward strand, native, aligned + "CCGNNCGG", // forward strand, genomic, aligned + clipped + "CCGNNCGG", // forward strand, native, aligned + clipped + "ATCCGNNCGGTT", // reverse strand, genomic + "AACCGNNCGGAT", // reverse strand, native + "ATCCGNNCGGTT", // reverse strand, genomic, aligned + "AACCGNNCGGAT", // reverse strand, native, aligned + "CCGNNCGG", // reverse strand, genomic, aligned + clipped + "CCGNNCGG" // reverse strand, native, aligned + clipped + } + ); + } + { + SCOPED_TRACE("CIGAR: 2H6=2H"); + tests::CheckSequenceClippedAndAligned( + "2H6=2H", // CIGAR + "CAGCGG", // input + { + "CAGCGG", // forward strand, genomic + "CAGCGG", // forward strand, native + "CAGCGG", // forward strand, genomic, aligned + "CAGCGG", // forward strand, native, aligned + "CAGCGG", // forward strand, genomic, aligned + clipped + "CAGCGG", // forward strand, native, aligned + clipped + "CAGCGG", // reverse strand, genomic + "CCGCTG", // reverse strand, native + "CAGCGG", // reverse strand, genomic, aligned + "CCGCTG", // reverse strand, native, aligned + "CAGCGG", // reverse strand, genomic, aligned + clipped + "CCGCTG" // reverse strand, native, aligned + clipped + } + ); + } +} + +TEST(BamRecordTest, ClippingOrientationAndAlignment) +{ + { + 
SCOPED_TRACE("CIGAR: 4=3D4="); + tests::CheckSequenceClippedAndAligned( + "4=3D4=", // CIGAR + "AACCGTTA", // input + { + "AACCGTTA", // forward strand, genomic + "AACCGTTA", // forward strand, native + "AACC---GTTA", // forward strand, genomic, aligned + "AACC---GTTA", // forward strand, native, aligned + "AACC---GTTA", // forward strand, genomic, aligned + clipped + "AACC---GTTA", // forward strand, native, aligned + clipped + "AACCGTTA", // reverse strand, genomic + "TAACGGTT", // reverse strand, native + "AACC---GTTA", // reverse strand, genomic, aligned + "TAAC---GGTT", // reverse strand, native, aligned + "AACC---GTTA", // reverse strand, genomic, aligned + clipped + "TAAC---GGTT" // reverse strand, native, aligned + clipped + } + ); + } + { + SCOPED_TRACE("CIGAR: 4=1D2I2D4="); + tests::CheckSequenceClippedAndAligned( + "4=1D2I2D4=", // CIGAR + "ATCCTAGGTT", // input + { + "ATCCTAGGTT", // forward strand, genomic + "ATCCTAGGTT", // forward strand, native + "ATCC-TA--GGTT", // forward strand, genomic, aligned + "ATCC-TA--GGTT", // forward strand, native, aligned + "ATCC-TA--GGTT", // forward strand, genomic, aligned + clipped + "ATCC-TA--GGTT", // forward strand, native, aligned + clipped + "ATCCTAGGTT", // reverse strand, genomic + "AACCTAGGAT", // reverse strand, native + "ATCC-TA--GGTT", // reverse strand, genomic, aligned + "AACC--TA-GGAT", // reverse strand, native, aligned + "ATCC-TA--GGTT", // reverse strand, genomic, aligned + clipped + "AACC--TA-GGAT" // reverse strand, native, aligned + clipped + } + ); + } + { + SCOPED_TRACE("CIGAR: 4=1D2P2I2P2D4="); + tests::CheckSequenceClippedAndAligned( + "4=1D2P2I2P2D4=", // CIGAR + "ATCCTAGGTT", // input + { + "ATCCTAGGTT", // forward strand, genomic + "ATCCTAGGTT", // forward strand, native + "ATCC-**TA**--GGTT", // forward strand, genomic, aligned + "ATCC-**TA**--GGTT", // forward strand, native, aligned + "ATCC-**TA**--GGTT", // forward strand, genomic, aligned + clipped + "ATCC-**TA**--GGTT", // forward 
strand, native, aligned + clipped + "ATCCTAGGTT", // reverse strand, genomic + "AACCTAGGAT", // reverse strand, native + "ATCC-**TA**--GGTT", // reverse strand, genomic, aligned + "AACC--**TA**-GGAT", // reverse strand, native, aligned + "ATCC-**TA**--GGTT", // reverse strand, genomic, aligned + clipped + "AACC--**TA**-GGAT" // reverse strand, native, aligned + clipped + } + ); + } + { + SCOPED_TRACE("CIGAR: 2S4=3D4=3S"); + tests::CheckSequenceClippedAndAligned( + "2S4=3D4=3S", // CIGAR + "TTAACCGTTACCG", // input + { + "TTAACCGTTACCG", // forward strand, genomic + "TTAACCGTTACCG", // forward strand, native + "TTAACC---GTTACCG", // forward strand, genomic, aligned + "TTAACC---GTTACCG", // forward strand, native, aligned + "AACC---GTTA", // forward strand, genomic, aligned + clipped + "AACC---GTTA", // forward strand, native, aligned + clipped + "TTAACCGTTACCG", // reverse strand, genomic + "CGGTAACGGTTAA", // reverse strand, native + "TTAACC---GTTACCG", // reverse strand, genomic, aligned + "CGGTAAC---GGTTAA", // reverse strand, native, aligned + "AACC---GTTA", // reverse strand, genomic, aligned + clipped + "TAAC---GGTT" // reverse strand, native, aligned + clipped + } + ); + } + { + SCOPED_TRACE("CIGAR: 2H4=3D4=3H"); + tests::CheckSequenceClippedAndAligned( + "2H4=3D4=3H", // CIGAR + "AACCGTTA", // input + { + "AACCGTTA", // forward strand, genomic + "AACCGTTA", // forward strand, native + "AACC---GTTA", // forward strand, genomic, aligned + "AACC---GTTA", // forward strand, native, aligned + "AACC---GTTA", // forward strand, genomic, aligned + clipped + "AACC---GTTA", // forward strand, native, aligned + clipped + "AACCGTTA", // reverse strand, genomic + "TAACGGTT", // reverse strand, native + "AACC---GTTA", // reverse strand, genomic, aligned + "TAAC---GGTT", // reverse strand, native, aligned + "AACC---GTTA", // reverse strand, genomic, aligned + clipped + "TAAC---GGTT" // reverse strand, native, aligned + clipped + } + ); + } + { + SCOPED_TRACE("CIGAR: 
2H2S4=3D4=3S3H"); + tests::CheckSequenceClippedAndAligned( + "2H2S4=3D4=3S3H", // CIGAR + "TTAACCGTTACCG", // input + { + "TTAACCGTTACCG", // forward strand, genomic + "TTAACCGTTACCG", // forward strand, native + "TTAACC---GTTACCG", // forward strand, genomic, aligned + "TTAACC---GTTACCG", // forward strand, native, aligned + "AACC---GTTA", // forward strand, genomic, aligned + clipped + "AACC---GTTA", // forward strand, native, aligned + clipped + "TTAACCGTTACCG", // reverse strand, genomic + "CGGTAACGGTTAA", // reverse strand, native + "TTAACC---GTTACCG", // reverse strand, genomic, aligned + "CGGTAAC---GGTTAA", // reverse strand, native, aligned + "AACC---GTTA", // reverse strand, genomic, aligned + clipped + "TAAC---GGTT" // reverse strand, native, aligned + clipped + } + ); + } +} + +TEST(BamRecordTest, QualityTagsClippedAndAligned) +{ + { + SCOPED_TRACE("CIGAR: 4=3D4="); + tests::CheckQualityTagsClippedAndAligned( + "4=3D4=", // CIGAR + "?]?]?]?@", // input + { + "?]?]?]?@", // forward strand, genomic + "?]?]?]?@", // forward strand, native + "?]?]!!!?]?@", // forward strand, genomic, aligned + "?]?]!!!?]?@", // forward strand, native, aligned + "?]?]!!!?]?@", // forward strand, genomic, aligned + clipped + "?]?]!!!?]?@", // forward strand, native, aligned + clipped + "@?]?]?]?", // reverse strand, genomic + "?]?]?]?@", // reverse strand, native + "@?]?!!!]?]?", // reverse strand, genomic, aligned + "?]?]!!!?]?@", // reverse strand, native, aligned + "@?]?!!!]?]?", // reverse strand, genomic, aligned + clipped + "?]?]!!!?]?@" // reverse strand, native, aligned + clipped + } + ); + } + { + SCOPED_TRACE("CIGAR: 4=1D2I2D4="); + tests::CheckQualityTagsClippedAndAligned( + "4=1D2I2D4=", // CIGAR + "?]?]87?]?@", // input + { + "?]?]87?]?@", // forward strand, genomic + "?]?]87?]?@", // forward strand, native + "?]?]!87!!?]?@", // forward strand, genomic, aligned + "?]?]!87!!?]?@", // forward strand, native, aligned + "?]?]!87!!?]?@", // forward strand, genomic, 
aligned + clipped + "?]?]!87!!?]?@", // forward strand, native, aligned + clipped + "@?]?78]?]?", // reverse strand, genomic + "?]?]87?]?@", // reverse strand, native + "@?]?!78!!]?]?", // reverse strand, genomic, aligned + "?]?]!!87!?]?@", // reverse strand, native, aligned + "@?]?!78!!]?]?", // reverse strand, genomic, aligned + clipped + "?]?]!!87!?]?@" // reverse strand, native, aligned + clipped + } + ); + } + { + SCOPED_TRACE("CIGAR: 4=1D2P2I2P2D4="); + tests::CheckQualityTagsClippedAndAligned( + "4=1D2P2I2P2D4=", // CIGAR + "?]?]87?]?@", // input + { + "?]?]87?]?@", // forward strand, genomic + "?]?]87?]?@", // forward strand, native + "?]?]!!!87!!!!?]?@", // forward strand, genomic, aligned + "?]?]!!!87!!!!?]?@", // forward strand, native, aligned + "?]?]!!!87!!!!?]?@", // forward strand, genomic, aligned + clipped + "?]?]!!!87!!!!?]?@", // forward strand, native, aligned + clipped + "@?]?78]?]?", // reverse strand, genomic + "?]?]87?]?@", // reverse strand, native + "@?]?!!!78!!!!]?]?", // reverse strand, genomic, aligned + "?]?]!!!!87!!!?]?@", // reverse strand, native, aligned + "@?]?!!!78!!!!]?]?", // reverse strand, genomic, aligned + clipped + "?]?]!!!!87!!!?]?@" // reverse strand, native, aligned + clipped + } + ); + } + { + SCOPED_TRACE("CIGAR: 3S4=3D4=3S"); + tests::CheckQualityTagsClippedAndAligned( + "3S4=3D4=3S", // CIGAR + "vvv?]?]?]?@xxx", // input + { + "vvv?]?]?]?@xxx", // forward strand, genomic + "vvv?]?]?]?@xxx", // forward strand, native + "vvv?]?]!!!?]?@xxx", // forward strand, genomic, aligned + "vvv?]?]!!!?]?@xxx", // forward strand, native, aligned + "?]?]!!!?]?@", // forward strand, genomic, aligned, clipped + "?]?]!!!?]?@", // forward strand, native, aligned, clipped + "xxx@?]?]?]?vvv", // reverse strand, genomic + "vvv?]?]?]?@xxx", // reverse strand, native + "xxx@?]?!!!]?]?vvv", // reverse strand, genomic, aligned + "vvv?]?]!!!?]?@xxx", // reverse strand, native, aligned + "@?]?!!!]?]?", // reverse strand, genomic, aligned, 
clipped + "?]?]!!!?]?@" // reverse strand, native, aligned, clipped + } + ); + } + { + SCOPED_TRACE("CIGAR: 2H4=3D4=3H"); + tests::CheckQualityTagsClippedAndAligned( + "2H4=3D4=3H", // CIGAR + "?]?]?]?@", // input + { + "?]?]?]?@", // forward strand, genomic + "?]?]?]?@", // forward strand, native + "?]?]!!!?]?@", // forward strand, genomic, aligned + "?]?]!!!?]?@", // forward strand, native, aligned + "?]?]!!!?]?@", // forward strand, genomic, aligned, clipped + "?]?]!!!?]?@", // forward strand, native, aligned, clipped + "@?]?]?]?", // reverse strand, genomic + "?]?]?]?@", // reverse strand, native + "@?]?!!!]?]?", // reverse strand, genomic, aligned + "?]?]!!!?]?@", // reverse strand, native, aligned + "@?]?!!!]?]?", // reverse strand, genomic, aligned, clipped + "?]?]!!!?]?@" // reverse strand, native, aligned, clipped + } + ); + } + { + SCOPED_TRACE("CIGAR: 2H3S4=3D4=3S3H"); + tests::CheckQualityTagsClippedAndAligned( + "2H3S4=3D4=3S3H", // CIGAR + "vvv?]?]?]?@xxx", // input + { + "vvv?]?]?]?@xxx", // forward strand, genomic + "vvv?]?]?]?@xxx", // forward strand, native + "vvv?]?]!!!?]?@xxx", // forward strand, genomic, aligned + "vvv?]?]!!!?]?@xxx", // forward strand, native, aligned + "?]?]!!!?]?@", // forward strand, genomic, aligned, clipped + "?]?]!!!?]?@", // forward strand, native, aligned, clipped + "xxx@?]?]?]?vvv", // reverse strand, genomic + "vvv?]?]?]?@xxx", // reverse strand, native + "xxx@?]?!!!]?]?vvv", // reverse strand, genomic, aligned + "vvv?]?]!!!?]?@xxx", // reverse strand, native, aligned + "@?]?!!!]?]?", // reverse strand, genomic, aligned, clipped + "?]?]!!!?]?@" // reverse strand, native, aligned, clipped + } + ); + } +} + +TEST(BamRecordTest, BaseTagsClippedAndAligned) +{ + { + SCOPED_TRACE("CIGAR: 4=3D4="); + tests::CheckBaseTagsClippedAndAligned( + "4=3D4=", // CIGAR + "AACCGTTA", // input + { + "AACCGTTA", // forward strand, genomic + "AACCGTTA", // forward strand, native + "AACC---GTTA", // forward strand, genomic, aligned + 
"AACC---GTTA", // forward strand, native, aligned + "AACC---GTTA", // forward strand, genomic, aligned, clipped + "AACC---GTTA", // forward strand, native, aligned, clipped + "TAACGGTT", // reverse strand, genomic + "AACCGTTA", // reverse strand, native + "TAAC---GGTT", // reverse strand, genomic, aligned + "AACC---GTTA", // reverse strand, native, aligned + "TAAC---GGTT", // reverse strand, genomic, aligned, clipped + "AACC---GTTA" // reverse strand, native, aligned, clipped + } + ); + } + { + SCOPED_TRACE("CIGAR: 4=1D2I2D4="); + tests::CheckBaseTagsClippedAndAligned( + "4=1D2I2D4=", // CIGAR + "ATCCTAGGTT", // input + { + "ATCCTAGGTT", // forward strand, genomic + "ATCCTAGGTT", // forward strand, native + "ATCC-TA--GGTT", // forward strand, genomic, aligned + "ATCC-TA--GGTT", // forward strand, native, aligned + "ATCC-TA--GGTT", // forward strand, genomic, aligned, clipped + "ATCC-TA--GGTT", // forward strand, native, aligned, clipped + "AACCTAGGAT", // reverse strand, genomic + "ATCCTAGGTT", // reverse strand, native + "AACC-TA--GGAT", // reverse strand, genomic, aligned + "ATCC--TA-GGTT", // reverse strand, native, aligned + "AACC-TA--GGAT", // reverse strand, genomic, aligned, clipped + "ATCC--TA-GGTT" // reverse strand, native, aligned, clipped + } + ); + } + { + SCOPED_TRACE("CIGAR: 4=1D2P2I2P2D4="); + tests::CheckBaseTagsClippedAndAligned( + "4=1D2P2I2P2D4=", // CIGAR + "ATCCTAGGTT", // input + { + "ATCCTAGGTT", // forward strand, genomic + "ATCCTAGGTT", // forward strand, native + "ATCC-**TA**--GGTT", // forward strand, genomic, aligned + "ATCC-**TA**--GGTT", // forward strand, native, aligned + "ATCC-**TA**--GGTT", // forward strand, genomic, aligned, clipped + "ATCC-**TA**--GGTT", // forward strand, native, aligned, clipped + "AACCTAGGAT", // reverse strand, genomic + "ATCCTAGGTT", // reverse strand, native + "AACC-**TA**--GGAT", // reverse strand, genomic, aligned + "ATCC--**TA**-GGTT", // reverse strand, native, aligned + "AACC-**TA**--GGAT", // 
reverse strand, genomic, aligned, clipped + "ATCC--**TA**-GGTT" // reverse strand, native, aligned, clipped + } + ); + } + { + SCOPED_TRACE("CIGAR: 3S4=3D4=3S"); + tests::CheckBaseTagsClippedAndAligned( + "3S4=3D4=3S", // CIGAR + "TTTAACCGTTACCG", // input + { + "TTTAACCGTTACCG", // forward strand, genomic + "TTTAACCGTTACCG", // forward strand, native + "TTTAACC---GTTACCG", // forward strand, genomic, aligned + "TTTAACC---GTTACCG", // forward strand, native, aligned + "AACC---GTTA", // forward strand, genomic, aligned, clipped + "AACC---GTTA", // forward strand, native, aligned, clipped + "CGGTAACGGTTAAA", // reverse strand, genomic + "TTTAACCGTTACCG", // reverse strand, native + "CGGTAAC---GGTTAAA", // reverse strand, genomic, aligned + "TTTAACC---GTTACCG", // reverse strand, native, aligned + "TAAC---GGTT", // reverse strand, genomic, aligned, clipped + "AACC---GTTA" // reverse strand, native, aligned, clipped + } + ); + } + { + SCOPED_TRACE("CIGAR: 2H4=3D4=3H"); + tests::CheckBaseTagsClippedAndAligned( + "2H4=3D4=3H", // CIGAR + "AACCGTTA", // input + { + "AACCGTTA", // forward strand, genomic + "AACCGTTA", // forward strand, native + "AACC---GTTA", // forward strand, genomic, aligned + "AACC---GTTA", // forward strand, native, aligned + "AACC---GTTA", // forward strand, genomic, aligned, clipped + "AACC---GTTA", // forward strand, native, aligned, clipped + "TAACGGTT", // reverse strand, genomic + "AACCGTTA", // reverse strand, native + "TAAC---GGTT", // reverse strand, genomic, aligned + "AACC---GTTA", // reverse strand, native, aligned + "TAAC---GGTT", // reverse strand, genomic, aligned, clipped + "AACC---GTTA" // reverse strand, native, aligned, clipped + } + ); + } + { + SCOPED_TRACE("CIGAR: 2H3S4=3D4=3S3H"); + tests::CheckBaseTagsClippedAndAligned( + "2H3S4=3D4=3S3H", // CIGAR + "TTTAACCGTTACCG", // input + { + "TTTAACCGTTACCG", // forward strand, genomic + "TTTAACCGTTACCG", // forward strand, native + "TTTAACC---GTTACCG", // forward strand, genomic, 
aligned + "TTTAACC---GTTACCG", // forward strand, native, aligned + "AACC---GTTA", // forward strand, genomic, aligned, clipped + "AACC---GTTA", // forward strand, native, aligned, clipped + "CGGTAACGGTTAAA", // reverse strand, genomic + "TTTAACCGTTACCG", // reverse strand, native + "CGGTAAC---GGTTAAA", // reverse strand, genomic, aligned + "TTTAACC---GTTACCG", // reverse strand, native, aligned + "TAAC---GGTT", // reverse strand, genomic, aligned, clipped + "AACC---GTTA" // reverse strand, native, aligned, clipped + } + ); + } +} + +TEST(BamRecordTest, FrameTagsClippedAndAligned) +{ + { + SCOPED_TRACE("CIGAR: 4=3D4="); + tests::CheckFrameTagsClippedAndAligned( + "4=3D4=", // CIGAR + { 10, 20, 10, 20, 10, 20, 10, 30 }, // input + { + { 10, 20, 10, 20, 10, 20, 10, 30 }, // forward strand, genomic + { 10, 20, 10, 20, 10, 20, 10, 30 }, // forward strand, native + { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }, // forward strand, genomic, aligned + { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }, // forward strand, native, aligned + { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }, // forward strand, genomic, aligned, clipped + { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }, // forward strand, native, aligned, clipped + { 30, 10, 20, 10, 20, 10, 20, 10 }, // reverse strand, genomic + { 10, 20, 10, 20, 10, 20, 10, 30 }, // reverse strand, native + { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 }, // reverse strand, genomic, aligned + { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }, // reverse strand, native, aligned + { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 }, // reverse strand, genomic, aligned, clipped + { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 } // reverse strand, native, aligned, clipped + } + ); + } + { + SCOPED_TRACE("CIGAR: 4=1D2I2D4="); + tests::CheckFrameTagsClippedAndAligned( + "4=1D2I2D4=", // CIGAR + { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 }, // input + { + { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 }, // forward strand, genomic + { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 }, // 
forward strand, native + { 10, 20, 10, 20, 0, 80, 70, 0, 0, 10, 20, 10, 30 }, // forward strand, genomic, aligned + { 10, 20, 10, 20, 0, 80, 70, 0, 0, 10, 20, 10, 30 }, // forward strand, native, aligned + { 10, 20, 10, 20, 0, 80, 70, 0, 0, 10, 20, 10, 30 }, // forward strand, genomic, aligned, clipped + { 10, 20, 10, 20, 0, 80, 70, 0, 0, 10, 20, 10, 30 }, // forward strand, native, aligned, clipped + { 30, 10, 20, 10, 70, 80, 20, 10, 20, 10 }, // reverse strand, genomic + { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 }, // reverse strand, native + { 30, 10, 20, 10, 0, 70, 80, 0, 0, 20, 10, 20, 10 }, // reverse strand, genomic, aligned + { 10, 20, 10, 20, 0, 0, 80, 70, 0, 10, 20, 10, 30 }, // reverse strand, native, aligned + { 30, 10, 20, 10, 0, 70, 80, 0, 0, 20, 10, 20, 10 }, // reverse strand, genomic, aligned, clipped + { 10, 20, 10, 20, 0, 0, 80, 70, 0, 10, 20, 10, 30 } // reverse strand, native, aligned, clipped + } + ); + } + { + SCOPED_TRACE("CIGAR: 4=1D2P2I2P2D4="); + tests::CheckFrameTagsClippedAndAligned( + "4=1D2P2I2P2D4=", // CIGAR + { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 }, // input + { + { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 }, // forward strand, genomic + { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 }, // forward strand, native + { 10, 20, 10, 20, 0, 0, 0, 80, 70, 0, 0, 0, 0, 10, 20, 10, 30 }, // forward strand, genomic, aligned + { 10, 20, 10, 20, 0, 0, 0, 80, 70, 0, 0, 0, 0, 10, 20, 10, 30 }, // forward strand, native, aligned + { 10, 20, 10, 20, 0, 0, 0, 80, 70, 0, 0, 0, 0, 10, 20, 10, 30 }, // forward strand, genomic, aligned, clipped + { 10, 20, 10, 20, 0, 0, 0, 80, 70, 0, 0, 0, 0, 10, 20, 10, 30 }, // forward strand, native, aligned, clipped + { 30, 10, 20, 10, 70, 80, 20, 10, 20, 10 }, // reverse strand, genomic + { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 }, // reverse strand, native + { 30, 10, 20, 10, 0, 0, 0, 70, 80, 0, 0, 0, 0, 20, 10, 20, 10 }, // reverse strand, genomic, aligned + { 10, 20, 10, 20, 0, 0, 0, 0, 80, 70, 0, 0, 0, 10, 20, 10, 30 
}, // reverse strand, native, aligned + { 30, 10, 20, 10, 0, 0, 0, 70, 80, 0, 0, 0, 0, 20, 10, 20, 10 }, // reverse strand, genomic, aligned, clipped + { 10, 20, 10, 20, 0, 0, 0, 0, 80, 70, 0, 0, 0, 10, 20, 10, 30 } // reverse strand, native, aligned, clipped + } + ); + } + { + SCOPED_TRACE("CIGAR: 3S4=3D4=3S"); + tests::CheckFrameTagsClippedAndAligned( + "3S4=3D4=3S", // CIGAR + { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 }, // input + { + { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 }, // forward strand, genomic + { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 }, // forward strand, native + { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 }, // forward strand, genomic, aligned + { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 }, // forward strand, native, aligned + { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }, // forward strand, genomic, aligned, clipped + { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }, // forward strand, native, aligned, clipped + { 50, 50, 50, 30, 10, 20, 10, 20, 10, 20, 10, 40, 40, 40 }, // reverse strand, genomic + { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 }, // reverse strand, native + { 50, 50, 50, 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10, 40, 40, 40 }, // reverse strand, genomic, aligned + { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 }, // reverse strand, native, aligned + { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 }, // reverse strand, genomic, aligned, clipped + { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 } // reverse strand, native, aligned, clipped + } + ); + } + { + SCOPED_TRACE("CIGAR: 2H4=3D4=3H"); + tests::CheckFrameTagsClippedAndAligned( + "2H4=3D4=3H", // CIGAR + { 10, 20, 10, 20, 10, 20, 10, 30 }, // input + { + { 10, 20, 10, 20, 10, 20, 10, 30 }, // forward strand, genomic + { 10, 20, 10, 20, 10, 20, 10, 30 }, // forward strand, native + { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }, // forward strand, genomic, aligned + { 10, 20, 
10, 20, 0, 0, 0, 10, 20, 10, 30 }, // forward strand, native, aligned + { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }, // forward strand, genomic, aligned, clipped + { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }, // forward strand, native, aligned, clipped + { 30, 10, 20, 10, 20, 10, 20, 10 }, // reverse strand, genomic + { 10, 20, 10, 20, 10, 20, 10, 30 }, // reverse strand, native + { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 }, // reverse strand, genomic, aligned + { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }, // reverse strand, native, aligned + { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 }, // reverse strand, genomic, aligned, clipped + { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 } // reverse strand, native, aligned, clipped + } + ); + } + { + SCOPED_TRACE("CIGAR: 2H3S4=3D4=3S3H"); + tests::CheckFrameTagsClippedAndAligned( + "2H3S4=3D4=3S3H", // CIGAR + { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 }, // input + { + { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 }, // forward strand, genomic + { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 }, // forward strand, native + { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 }, // forward strand, genomic, aligned + { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 }, // forward strand, native, aligned + { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }, // forward strand, genomic, aligned, clipped + { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }, // forward strand, native, aligned, clipped + { 50, 50, 50, 30, 10, 20, 10, 20, 10, 20, 10, 40, 40, 40 }, // reverse strand, genomic + { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 }, // reverse strand, native + { 50, 50, 50, 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10, 40, 40, 40 }, // reverse strand, genomic, aligned + { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 }, // reverse strand, native, aligned + { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 }, // reverse strand, genomic, aligned, clipped + { 10, 20, 
10, 20, 0, 0, 0, 10, 20, 10, 30 } // reverse strand, native, aligned, clipped + } + ); + } +} + +TEST(BamRecordTest, PulseBaseTags) +{ + { + SCOPED_TRACE("CIGAR: 4=3D4="); + tests::CheckPulseBaseTags( + "4=3D4=", // CIGAR + "AACCGTTA", // seqBases + "AAaaCCGggTTA", // pulseCalls + "AAaaCCGggTTA", // tag data + + { // all pulses + + "AAaaCCGggTTA", // forward strand, genomic + "AAaaCCGggTTA", // forward strand, native + "", // forward strand, genomic, aligned + "", // forward strand, native, aligned + "", // forward strand, genomic, aligned, clipped + "", // forward strand, native, aligned, clipped + "TAAccCGGttTT", // reverse strand, genomic + "AAaaCCGggTTA", // reverse strand, native + "", // reverse strand, genomic, aligned + "", // reverse strand, native, aligned + "", // reverse strand, genomic, aligned, clipped + "" // reverse strand, native, aligned, clipped + }, + { // basecalls only + + "AACCGTTA", // forward strand, genomic + "AACCGTTA", // forward strand, native + "AACC---GTTA", // forward strand, genomic, aligned + "AACC---GTTA", // forward strand, native, aligned + "AACC---GTTA", // forward strand, genomic, aligned, clipped + "AACC---GTTA", // forward strand, native, aligned, clipped + "TAACGGTT", // reverse strand, genomic + "AACCGTTA", // reverse strand, native + "TAAC---GGTT", // reverse strand, genomic, aligned + "AACC---GTTA", // reverse strand, native, aligned + "TAAC---GGTT", // reverse strand, genomic, aligned, clipped + "AACC---GTTA" // reverse strand, native, aligned, clipped + } + ); + } + { + SCOPED_TRACE("CIGAR: 4=1D2I2D4="); + tests::CheckPulseBaseTags( + "4=1D2I2D4=", // CIGAR + "ATCCTAGGTT", // seqBases + "ATttCCTtAGGggTT", // pulseCalls + "ATttCCTtAGGggTT", // tag data + + { // all pulses + + "ATttCCTtAGGggTT", // forward strand, genomic + "ATttCCTtAGGggTT", // forward strand, native + "", // forward strand, genomic, aligned + "", // forward strand, native, aligned + "", // forward strand, genomic, aligned, clipped + "", // forward 
strand, native, aligned, clipped + "AAccCCTaAGGaaAT", // reverse strand, genomic + "ATttCCTtAGGggTT", // reverse strand, native + "", // reverse strand, genomic, aligned + "", // reverse strand, native, aligned + "", // reverse strand, genomic, aligned, clipped + "" // reverse strand, native, aligned, clipped + }, + { // basecalls only + + "ATCCTAGGTT", // forward strand, genomic + "ATCCTAGGTT", // forward strand, native + "ATCC-TA--GGTT", // forward strand, genomic, aligned + "ATCC-TA--GGTT", // forward strand, native, aligned + "ATCC-TA--GGTT", // forward strand, genomic, aligned, clipped + "ATCC-TA--GGTT", // forward strand, native, aligned, clipped + "AACCTAGGAT", // reverse strand, genomic + "ATCCTAGGTT", // reverse strand, native + "AACC-TA--GGAT", // reverse strand, genomic, aligned + "ATCC--TA-GGTT", // reverse strand, native, aligned + "AACC-TA--GGAT", // reverse strand, genomic, aligned, clipped + "ATCC--TA-GGTT" // reverse strand, native, aligned, clipped + } + ); + } + { + SCOPED_TRACE("CIGAR: 4=1D2P2I2P2D4="); + tests::CheckPulseBaseTags( + "4=1D2P2I2P2D4=", // CIGAR + "ATCCTAGGTT", // seqBases + "ATttCCTtAGGggTT", // pulseCalls + "ATttCCTtAGGggTT", // tag data + { + "ATttCCTtAGGggTT", // forward strand, genomic + "ATttCCTtAGGggTT", // forward strand, native + "", // forward strand, genomic, aligned + "", // forward strand, native, aligned + "", // forward strand, genomic, aligned, clipped + "", // forward strand, native, aligned, clipped + "AAccCCTaAGGaaAT", // reverse strand, genomic + "ATttCCTtAGGggTT", // reverse strand, native + "", // reverse strand, genomic, aligned + "", // reverse strand, native, aligned + "", // reverse strand, genomic, aligned, clipped + "" // reverse strand, native, aligned, clipped + }, + { + "ATCCTAGGTT", // forward strand, genomic + "ATCCTAGGTT", // forward strand, native + "ATCC-**TA**--GGTT", // forward strand, genomic, aligned + "ATCC-**TA**--GGTT", // forward strand, native, aligned + "ATCC-**TA**--GGTT", // forward 
strand, genomic, aligned, clipped + "ATCC-**TA**--GGTT", // forward strand, native, aligned, clipped + "AACCTAGGAT", // reverse strand, genomic + "ATCCTAGGTT", // reverse strand, native + "AACC-**TA**--GGAT", // reverse strand, genomic, aligned + "ATCC--**TA**-GGTT", // reverse strand, native, aligned + "AACC-**TA**--GGAT", // reverse strand, genomic, aligned, clipped + "ATCC--**TA**-GGTT" // reverse strand, native, aligned, clipped + } + ); + } + { + SCOPED_TRACE("CIGAR: 3S4=3D4=3S"); + tests::CheckPulseBaseTags( + "3S4=3D4=3S", // CIGAR + "TTTAACCGTTACCG", // seqBases + "TTTttAACCccGTTAaaCCG", // pulseCalls + "TTTttAACCccGTTAaaCCG", // tag data + + { // all pulses + + "TTTttAACCccGTTAaaCCG", // forward strand, genomic + "TTTttAACCccGTTAaaCCG", // forward strand, native + "", // forward strand, genomic, aligned + "", // forward strand, native, aligned + "", // forward strand, genomic, aligned, clipped + "", // forward strand, native, aligned, clipped + "CGGttTAACggGGTTaaAAA", // reverse strand, genomic + "TTTttAACCccGTTAaaCCG", // reverse strand, native + "", // reverse strand, genomic, aligned + "", // reverse strand, native, aligned + "", // reverse strand, genomic, aligned, clipped + "" // reverse strand, native, aligned, clipped + }, + { // basecalls only + + "TTTAACCGTTACCG", // forward strand, genomic + "TTTAACCGTTACCG", // forward strand, native + "TTTAACC---GTTACCG", // forward strand, genomic, aligned + "TTTAACC---GTTACCG", // forward strand, native, aligned + "AACC---GTTA", // forward strand, genomic, aligned, clipped + "AACC---GTTA", // forward strand, native, aligned, clipped + "CGGTAACGGTTAAA", // reverse strand, genomic + "TTTAACCGTTACCG", // reverse strand, native + "CGGTAAC---GGTTAAA", // reverse strand, genomic, aligned + "TTTAACC---GTTACCG", // reverse strand, native, aligned + "TAAC---GGTT", // reverse strand, genomic, aligned, clipped + "AACC---GTTA" // reverse strand, native, aligned, clipped + } + ); + } + { + SCOPED_TRACE("CIGAR: 
2H4=3D4=3H"); + tests::CheckPulseBaseTags( + "2H4=3D4=3H", // CIGAR + "AACCGTTA", // seqBases + "AAaaCCGggTTA", // pulseCalls + "AAaaCCGggTTA", // tag data + + { // all pulses + + "AAaaCCGggTTA", // forward strand, genomic + "AAaaCCGggTTA", // forward strand, native + "", // forward strand, genomic, aligned + "", // forward strand, native, aligned + "", // forward strand, genomic, aligned, clipped + "", // forward strand, native, aligned, clipped + "TAAccCGGttTT", // reverse strand, genomic + "AAaaCCGggTTA", // reverse strand, native + "", // reverse strand, genomic, aligned + "", // reverse strand, native, aligned + "", // reverse strand, genomic, aligned, clipped + "" // reverse strand, native, aligned, clipped + }, + { // basecalls only + + "AACCGTTA", // forward strand, genomic + "AACCGTTA", // forward strand, native + "AACC---GTTA", // forward strand, genomic, aligned + "AACC---GTTA", // forward strand, native, aligned + "AACC---GTTA", // forward strand, genomic, aligned, clipped + "AACC---GTTA", // forward strand, native, aligned, clipped + "TAACGGTT", // reverse strand, genomic + "AACCGTTA", // reverse strand, native + "TAAC---GGTT", // reverse strand, genomic, aligned + "AACC---GTTA", // reverse strand, native, aligned + "TAAC---GGTT", // reverse strand, genomic, aligned, clipped + "AACC---GTTA" // reverse strand, native, aligned, clipped + } + ); + } + { + SCOPED_TRACE("CIGAR: 2H3S4=3D4=3S3H"); + tests::CheckPulseBaseTags( + "2H3S4=3D4=3S3H", // CIGAR + "TTTAACCGTTACCG", // seqBases + "TTTttAACCccGTTAaaCCG", // pulseCalls + "TTTttAACCccGTTAaaCCG", // tag data + + { // all pulses + + "TTTttAACCccGTTAaaCCG", // forward strand, genomic + "TTTttAACCccGTTAaaCCG", // forward strand, native + "", // forward strand, genomic, aligned + "", // forward strand, native, aligned + "", // forward strand, genomic, aligned, clipped + "", // forward strand, native, aligned, clipped + "CGGttTAACggGGTTaaAAA", // reverse strand, genomic + "TTTttAACCccGTTAaaCCG", // reverse 
strand, native + "", // reverse strand, genomic, aligned + "", // reverse strand, native, aligned + "", // reverse strand, genomic, aligned, clipped + "" // reverse strand, native, aligned, clipped + }, + { // basecalls only + + "TTTAACCGTTACCG", // forward strand, genomic + "TTTAACCGTTACCG", // forward strand, native + "TTTAACC---GTTACCG", // forward strand, genomic, aligned + "TTTAACC---GTTACCG", // forward strand, native, aligned + "AACC---GTTA", // forward strand, genomic, aligned, clipped + "AACC---GTTA", // forward strand, native, aligned, clipped + "CGGTAACGGTTAAA", // reverse strand, genomic + "TTTAACCGTTACCG", // reverse strand, native + "CGGTAAC---GGTTAAA", // reverse strand, genomic, aligned + "TTTAACC---GTTACCG", // reverse strand, native, aligned + "TAAC---GGTT", // reverse strand, genomic, aligned, clipped + "AACC---GTTA" // reverse strand, native, aligned, clipped + } + ); + } +} + +TEST(BamRecordTest, PulseQualityTags) +{ + { + SCOPED_TRACE("CIGAR: 4=3D4="); + tests::CheckPulseQualityTags( + "4=3D4=", // CIGAR + "AACCGTTA", // seqBases + "AAaaCCGggTTA", // pulseCalls + "?]!!?]?!!]?@", // tag data + + { // all pulses + + "?]!!?]?!!]?@", // forward strand, genomic + "?]!!?]?!!]?@", // forward strand, native + "", // forward strand, genomic, aligned + "", // forward strand, native, aligned + "", // forward strand, genomic, aligned + clipped + "", // forward strand, native, aligned + clipped + "@?]!!?]?!!]?", // reverse strand, genomic + "?]!!?]?!!]?@", // reverse strand, native + "", // reverse strand, genomic, aligned + "", // reverse strand, native, aligned + "", // reverse strand, genomic, aligned + clipped + "" // reverse strand, native, aligned + clipped + }, + { // basecalls only + + "?]?]?]?@", // forward strand, genomic + "?]?]?]?@", // forward strand, native + "?]?]!!!?]?@", // forward strand, genomic, aligned + "?]?]!!!?]?@", // forward strand, native, aligned + "?]?]!!!?]?@", // forward strand, genomic, aligned + clipped + "?]?]!!!?]?@", // 
forward strand, native, aligned + clipped + "@?]?]?]?", // reverse strand, genomic + "?]?]?]?@", // reverse strand, native + "@?]?!!!]?]?", // reverse strand, genomic, aligned + "?]?]!!!?]?@", // reverse strand, native, aligned + "@?]?!!!]?]?", // reverse strand, genomic, aligned + clipped + "?]?]!!!?]?@" // reverse strand, native, aligned + clipped + } + ); + } + { + SCOPED_TRACE("CIGAR: 4=1D2I2D4="); + tests::CheckPulseQualityTags( + "4=1D2I2D4=", // CIGAR + "ATCCTAGGTT", // seqBases + "ATttCCTtAGGggTT", // pulseCalls + "?]!!?]8!7?]!!?@", // tag data + + { // all pulses + + "?]!!?]8!7?]!!?@", // forward strand, genomic + "?]!!?]8!7?]!!?@", // forward strand, native + "", // forward strand, genomic, aligned + "", // forward strand, native, aligned + "", // forward strand, genomic, aligned + clipped + "", // forward strand, native, aligned + clipped + "@?!!]?7!8]?!!]?", // reverse strand, genomic + "?]!!?]8!7?]!!?@", // reverse strand, native + "", // reverse strand, genomic, aligned + "", // reverse strand, native, aligned + "", // reverse strand, genomic, aligned + clipped + "" // reverse strand, native, aligned + clipped + }, + { // basecalls only + + "?]?]87?]?@", // forward strand, genomic + "?]?]87?]?@", // forward strand, native + "?]?]!87!!?]?@", // forward strand, genomic, aligned + "?]?]!87!!?]?@", // forward strand, native, aligned + "?]?]!87!!?]?@", // forward strand, genomic, aligned + clipped + "?]?]!87!!?]?@", // forward strand, native, aligned + clipped + "@?]?78]?]?", // reverse strand, genomic + "?]?]87?]?@", // reverse strand, native + "@?]?!78!!]?]?", // reverse strand, genomic, aligned + "?]?]!!87!?]?@", // reverse strand, native, aligned + "@?]?!78!!]?]?", // reverse strand, genomic, aligned + clipped + "?]?]!!87!?]?@" // reverse strand, native, aligned + clipped + } + ); + } + { + SCOPED_TRACE("CIGAR: 4=1D2P2I2P2D4="); + tests::CheckPulseQualityTags( + "4=1D2P2I2P2D4=", // CIGAR + "ATCCTAGGTT", // seqBases + "ATttCCTtAGGggTT", // pulseCalls + 
"?]!!?]8!7?]!!?@", // tag data + { + "?]!!?]8!7?]!!?@", // forward strand, genomic + "?]!!?]8!7?]!!?@", // forward strand, native + "", // forward strand, genomic, aligned + "", // forward strand, native, aligned + "", // forward strand, genomic, aligned + clipped + "", // forward strand, native, aligned + clipped + "@?!!]?7!8]?!!]?", // reverse strand, genomic + "?]!!?]8!7?]!!?@", // reverse strand, native + "", // reverse strand, genomic, aligned + "", // reverse strand, native, aligned + "", // reverse strand, genomic, aligned + clipped + "" // reverse strand, native, aligned + clipped + }, + { + "?]?]87?]?@", // forward strand, genomic + "?]?]87?]?@", // forward strand, native + "?]?]!!!87!!!!?]?@", // forward strand, genomic, aligned + "?]?]!!!87!!!!?]?@", // forward strand, native, aligned + "?]?]!!!87!!!!?]?@", // forward strand, genomic, aligned + clipped + "?]?]!!!87!!!!?]?@", // forward strand, native, aligned + clipped + "@?]?78]?]?", // reverse strand, genomic + "?]?]87?]?@", // reverse strand, native + "@?]?!!!78!!!!]?]?", // reverse strand, genomic, aligned + "?]?]!!!!87!!!?]?@", // reverse strand, native, aligned + "@?]?!!!78!!!!]?]?", // reverse strand, genomic, aligned + clipped + "?]?]!!!!87!!!?]?@" // reverse strand, native, aligned + clipped + } + ); + } + { + SCOPED_TRACE("CIGAR: 3S4=3D4=3S"); + tests::CheckPulseQualityTags( + "3S4=3D4=3S", // CIGAR + "TTTAACCGTTACCG", // seqBases + "TTTttAACCccGTTAaaCCG", // pulseCalls + "vvv!!?]?]!!?]?@!!xxx", // tag data + + { // all pulses + + "vvv!!?]?]!!?]?@!!xxx", // forward strand, genomic + "vvv!!?]?]!!?]?@!!xxx", // forward strand, native + "", // forward strand, genomic, aligned + "", // forward strand, native, aligned + "", // forward strand, genomic, aligned, clipped + "", // forward strand, native, aligned, clipped + "xxx!!@?]?!!]?]?!!vvv", // reverse strand, genomic + "vvv!!?]?]!!?]?@!!xxx", // reverse strand, native + "", // reverse strand, genomic, aligned + "", // reverse strand, native, 
aligned + "", // reverse strand, genomic, aligned, clipped + "" // reverse strand, native, aligned, clipped + }, + { // basecalls only + + "vvv?]?]?]?@xxx", // forward strand, genomic + "vvv?]?]?]?@xxx", // forward strand, native + "vvv?]?]!!!?]?@xxx", // forward strand, genomic, aligned + "vvv?]?]!!!?]?@xxx", // forward strand, native, aligned + "?]?]!!!?]?@", // forward strand, genomic, aligned, clipped + "?]?]!!!?]?@", // forward strand, native, aligned, clipped + "xxx@?]?]?]?vvv", // reverse strand, genomic + "vvv?]?]?]?@xxx", // reverse strand, native + "xxx@?]?!!!]?]?vvv", // reverse strand, genomic, aligned + "vvv?]?]!!!?]?@xxx", // reverse strand, native, aligned + "@?]?!!!]?]?", // reverse strand, genomic, aligned, clipped + "?]?]!!!?]?@" // reverse strand, native, aligned, clipped + } + ); + } + { + SCOPED_TRACE("CIGAR: 2H4=3D4=3H"); + tests::CheckPulseQualityTags( + "2H4=3D4=3H", // CIGAR + "AACCGTTA", // seqBases + "AAaaCCGggTTA", // pulseCalls + "?]!!?]?!!]?@", // tag data + + { // all pulses + + "?]!!?]?!!]?@", // forward strand, genomic + "?]!!?]?!!]?@", // forward strand, native + "", // forward strand, genomic, aligned + "", // forward strand, native, aligned + "", // forward strand, genomic, aligned, clipped + "", // forward strand, native, aligned, clipped + "@?]!!?]?!!]?", // reverse strand, genomic + "?]!!?]?!!]?@", // reverse strand, native + "", // reverse strand, genomic, aligned + "", // reverse strand, native, aligned + "", // reverse strand, genomic, aligned, clipped + "" // reverse strand, native, aligned, clipped + }, + { // basecalls only + + "?]?]?]?@", // forward strand, genomic + "?]?]?]?@", // forward strand, native + "?]?]!!!?]?@", // forward strand, genomic, aligned + "?]?]!!!?]?@", // forward strand, native, aligned + "?]?]!!!?]?@", // forward strand, genomic, aligned, clipped + "?]?]!!!?]?@", // forward strand, native, aligned, clipped + "@?]?]?]?", // reverse strand, genomic + "?]?]?]?@", // reverse strand, native + 
"@?]?!!!]?]?", // reverse strand, genomic, aligned + "?]?]!!!?]?@", // reverse strand, native, aligned + "@?]?!!!]?]?", // reverse strand, genomic, aligned, clipped + "?]?]!!!?]?@" // reverse strand, native, aligned, clipped + } + ); + } + { + SCOPED_TRACE("CIGAR: 2H3S4=3D4=3S3H"); + tests::CheckPulseQualityTags( + "2H3S4=3D4=3S3H", // CIGAR + "TTTAACCGTTACCG", // seqBases + "TTTttAACCccGTTAaaCCG", // pulseCalls + "vvv!!?]?]!!?]?@!!xxx", // tag data + + { // all pulses + + "vvv!!?]?]!!?]?@!!xxx", // forward strand, genomic + "vvv!!?]?]!!?]?@!!xxx", // forward strand, native + "", // forward strand, genomic, aligned + "", // forward strand, native, aligned + "", // forward strand, genomic, aligned, clipped + "", // forward strand, native, aligned, clipped + "xxx!!@?]?!!]?]?!!vvv", // reverse strand, genomic + "vvv!!?]?]!!?]?@!!xxx", // reverse strand, native + "", // reverse strand, genomic, aligned + "", // reverse strand, native, aligned + "", // reverse strand, genomic, aligned, clipped + "" // reverse strand, native, aligned, clipped + }, + { // basecalls only + + "vvv?]?]?]?@xxx", // forward strand, genomic + "vvv?]?]?]?@xxx", // forward strand, native + "vvv?]?]!!!?]?@xxx", // forward strand, genomic, aligned + "vvv?]?]!!!?]?@xxx", // forward strand, native, aligned + "?]?]!!!?]?@", // forward strand, genomic, aligned, clipped + "?]?]!!!?]?@", // forward strand, native, aligned, clipped + "xxx@?]?]?]?vvv", // reverse strand, genomic + "vvv?]?]?]?@xxx", // reverse strand, native + "xxx@?]?!!!]?]?vvv", // reverse strand, genomic, aligned + "vvv?]?]!!!?]?@xxx", // reverse strand, native, aligned + "@?]?!!!]?]?", // reverse strand, genomic, aligned, clipped + "?]?]!!!?]?@" // reverse strand, native, aligned, clipped + } + ); + } +} + +TEST(BamRecordTest, PulseFrameTags) +{ + { + SCOPED_TRACE("CIGAR: 4=3D4="); + tests::CheckPulseFrameTags( + "4=3D4=", // CIGAR + "AACCGTTA", // seqBases + "AAaaCCGggTTA", // pulseCalls + { 10, 20, 0,0, 10, 20, 10, 0,0, 20, 10, 30 }, 
// tag data + + { // all pulses + + { 10, 20, 0,0, 10, 20, 10, 0,0, 20, 10, 30 }, // forward strand, genomic + { 10, 20, 0,0, 10, 20, 10, 0,0, 20, 10, 30 }, // forward strand, native + { }, // forward strand, genomic, aligned + { }, // forward strand, native, aligned + { }, // forward strand, genomic, aligned, clipped + { }, // forward strand, native, aligned, clipped + { 30, 10, 20, 0,0, 10, 20, 10, 0,0, 20, 10 }, // reverse strand, genomic + { 10, 20, 0,0, 10, 20, 10, 0,0, 20, 10, 30 }, // reverse strand, native + { }, // reverse strand, genomic, aligned + { }, // reverse strand, native, aligned + { }, // reverse strand, genomic, aligned, clipped + { } // reverse strand, native, aligned, clipped + }, + { // basecalls only + + { 10, 20, 10, 20, 10, 20, 10, 30 }, // forward strand, genomic + { 10, 20, 10, 20, 10, 20, 10, 30 }, // forward strand, native + { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }, // forward strand, genomic, aligned + { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }, // forward strand, native, aligned + { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }, // forward strand, genomic, aligned, clipped + { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }, // forward strand, native, aligned, clipped + { 30, 10, 20, 10, 20, 10, 20, 10 }, // reverse strand, genomic + { 10, 20, 10, 20, 10, 20, 10, 30 }, // reverse strand, native + { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 }, // reverse strand, genomic, aligned + { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }, // reverse strand, native, aligned + { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 }, // reverse strand, genomic, aligned, clipped + { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 } // reverse strand, native, aligned, clipped + } + ); + } + { + SCOPED_TRACE("CIGAR: 4=1D2I2D4="); + tests::CheckPulseFrameTags( + "4=1D2I2D4=", // CIGAR + "ATCCTAGGTT", // seqBases + "ATttCCTtAGGggTT", // pulseCalls + { 10, 20, 0, 0, 10, 20, 80, 0, 70, 10, 20, 0, 0, 10, 30 }, // tag data + + { // all pulses + + { 10, 20, 0, 0, 10, 20, 80, 0, 70, 10, 
20, 0, 0, 10, 30 }, // forward strand, genomic + { 10, 20, 0, 0, 10, 20, 80, 0, 70, 10, 20, 0, 0, 10, 30 }, // forward strand, native + { }, // forward strand, genomic, aligned + { }, // forward strand, native, aligned + { }, // forward strand, genomic, aligned, clipped + { }, // forward strand, native, aligned, clipped + { 30, 10, 0, 0, 20, 10, 70, 0, 80, 20, 10, 0, 0, 20, 10 }, // reverse strand, genomic + { 10, 20, 0, 0, 10, 20, 80, 0, 70, 10, 20, 0, 0, 10, 30 }, // reverse strand, native + { }, // reverse strand, genomic, aligned + { }, // reverse strand, native, aligned + { }, // reverse strand, genomic, aligned, clipped + { } // reverse strand, native, aligned, clipped + }, + { // basecalls only + + { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 }, // forward strand, genomic + { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 }, // forward strand, native + { 10, 20, 10, 20, 0, 80, 70, 0, 0, 10, 20, 10, 30 }, // forward strand, genomic, aligned + { 10, 20, 10, 20, 0, 80, 70, 0, 0, 10, 20, 10, 30 }, // forward strand, native, aligned + { 10, 20, 10, 20, 0, 80, 70, 0, 0, 10, 20, 10, 30 }, // forward strand, genomic, aligned, clipped + { 10, 20, 10, 20, 0, 80, 70, 0, 0, 10, 20, 10, 30 }, // forward strand, native, aligned, clipped + { 30, 10, 20, 10, 70, 80, 20, 10, 20, 10 }, // reverse strand, genomic + { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 }, // reverse strand, native + { 30, 10, 20, 10, 0, 70, 80, 0, 0, 20, 10, 20, 10 }, // reverse strand, genomic, aligned + { 10, 20, 10, 20, 0, 0, 80, 70, 0, 10, 20, 10, 30 }, // reverse strand, native, aligned + { 30, 10, 20, 10, 0, 70, 80, 0, 0, 20, 10, 20, 10 }, // reverse strand, genomic, aligned, clipped + { 10, 20, 10, 20, 0, 0, 80, 70, 0, 10, 20, 10, 30 } // reverse strand, native, aligned, clipped + } + ); + } + { + SCOPED_TRACE("CIGAR: 4=1D2P2I2P2D4="); + tests::CheckPulseFrameTags( + "4=1D2P2I2P2D4=", // CIGAR + "ATCCTAGGTT", // seqBases + "ATttCCTtAGGggTT", // pulseCalls + { 10, 20, 0, 0, 10, 20, 80, 0, 70, 10, 20, 0, 0, 10, 30 
}, // tag data + + { // all pulses + + { 10, 20, 0, 0, 10, 20, 80, 0, 70, 10, 20, 0, 0, 10, 30 }, // forward strand, genomic + { 10, 20, 0, 0, 10, 20, 80, 0, 70, 10, 20, 0, 0, 10, 30 }, // forward strand, native + { }, // forward strand, genomic, aligned + { }, // forward strand, native, aligned + { }, // forward strand, genomic, aligned, clipped + { }, // forward strand, native, aligned, clipped + { 30, 10, 0, 0, 20, 10, 70, 0, 80, 20, 10, 0, 0, 20, 10 }, // reverse strand, genomic + { 10, 20, 0, 0, 10, 20, 80, 0, 70, 10, 20, 0, 0, 10, 30 }, // reverse strand, native + { }, // reverse strand, genomic, aligned + { }, // reverse strand, native, aligned + { }, // reverse strand, genomic, aligned, clipped + { } // reverse strand, native, aligned, clipped + }, + { // basecalls only + + { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 }, // forward strand, genomic + { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 }, // forward strand, native + { 10, 20, 10, 20, 0, 0, 0, 80, 70, 0, 0, 0, 0, 10, 20, 10, 30 }, // forward strand, genomic, aligned + { 10, 20, 10, 20, 0, 0, 0, 80, 70, 0, 0, 0, 0, 10, 20, 10, 30 }, // forward strand, native, aligned + { 10, 20, 10, 20, 0, 0, 0, 80, 70, 0, 0, 0, 0, 10, 20, 10, 30 }, // forward strand, genomic, aligned, clipped + { 10, 20, 10, 20, 0, 0, 0, 80, 70, 0, 0, 0, 0, 10, 20, 10, 30 }, // forward strand, native, aligned, clipped + { 30, 10, 20, 10, 70, 80, 20, 10, 20, 10 }, // reverse strand, genomic + { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 }, // reverse strand, native + { 30, 10, 20, 10, 0, 0, 0, 70, 80, 0, 0, 0, 0, 20, 10, 20, 10 }, // reverse strand, genomic, aligned + { 10, 20, 10, 20, 0, 0, 0, 0, 80, 70, 0, 0, 0, 10, 20, 10, 30 }, // reverse strand, native, aligned + { 30, 10, 20, 10, 0, 0, 0, 70, 80, 0, 0, 0, 0, 20, 10, 20, 10 }, // reverse strand, genomic, aligned, clipped + { 10, 20, 10, 20, 0, 0, 0, 0, 80, 70, 0, 0, 0, 10, 20, 10, 30 } // reverse strand, native, aligned, clipped + } + ); + } + { + SCOPED_TRACE("CIGAR: 3S4=3D4=3S"); + 
tests::CheckPulseFrameTags( + "3S4=3D4=3S", // CIGAR + "TTTAACCGTTACCG", // seqBases + "TTTttAACCccGTTAaaCCG", // pulseCalls + { 40, 40, 40, 0, 0, 10, 20, 10, 20, 0, 0, 10, 20, 10, 30, 0, 0, 50, 50, 50 }, // tag data + + { // all pulses + + { 40, 40, 40, 0, 0, 10, 20, 10, 20, 0, 0, 10, 20, 10, 30, 0, 0, 50, 50, 50 }, // forward strand, genomic + { 40, 40, 40, 0, 0, 10, 20, 10, 20, 0, 0, 10, 20, 10, 30, 0, 0, 50, 50, 50 }, // forward strand, native + { }, // forward strand, genomic, aligned + { }, // forward strand, native, aligned + { }, // forward strand, genomic, aligned, clipped + { }, // forward strand, native, aligned, clipped + { 50, 50, 50, 0, 0, 30, 10, 20, 10, 0, 0, 20, 10, 20, 10, 0, 0, 40, 40, 40 }, // reverse strand, genomic + { 40, 40, 40, 0, 0, 10, 20, 10, 20, 0, 0, 10, 20, 10, 30, 0, 0, 50, 50, 50 }, // reverse strand, native + { }, // reverse strand, genomic, aligned + { }, // reverse strand, native, aligned + { }, // reverse strand, genomic, aligned, clipped + { } // reverse strand, native, aligned, clipped + }, + { // basecalls only + + { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 }, // forward strand, genomic + { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 }, // forward strand, native + { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 }, // forward strand, genomic, aligned + { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 }, // forward strand, native, aligned + { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }, // forward strand, genomic, aligned, clipped + { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }, // forward strand, native, aligned, clipped + { 50, 50, 50, 30, 10, 20, 10, 20, 10, 20, 10, 40, 40, 40 }, // reverse strand, genomic + { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 }, // reverse strand, native + { 50, 50, 50, 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10, 40, 40, 40 }, // reverse strand, genomic, aligned + { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 }, // 
reverse strand, native, aligned + { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 }, // reverse strand, genomic, aligned, clipped + { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 } // reverse strand, native, aligned, clipped + } + ); + } + { + SCOPED_TRACE("CIGAR: 2H4=3D4=3H"); + tests::CheckPulseFrameTags( + "2H4=3D4=3H", // CIGAR + "AACCGTTA", // seqBases + "AAaaCCGggTTA", // pulseCalls + { 10, 20, 0, 0, 10, 20, 10, 0, 0, 20, 10, 30 }, // tag data + + { // all pulses + + { 10, 20, 0, 0, 10, 20, 10, 0, 0, 20, 10, 30 }, // forward strand, genomic + { 10, 20, 0, 0, 10, 20, 10, 0, 0, 20, 10, 30 }, // forward strand, native + { }, // forward strand, genomic, aligned + { }, // forward strand, native, aligned + { }, // forward strand, genomic, aligned, clipped + { }, // forward strand, native, aligned, clipped + { 30, 10, 20, 0, 0, 10, 20, 10, 0, 0, 20, 10 }, // reverse strand, genomic + { 10, 20, 0, 0, 10, 20, 10, 0, 0, 20, 10, 30 }, // reverse strand, native + { }, // reverse strand, genomic, aligned + { }, // reverse strand, native, aligned + { }, // reverse strand, genomic, aligned, clipped + { } // reverse strand, native, aligned, clipped + }, + { // basecalls only + + { 10, 20, 10, 20, 10, 20, 10, 30 }, // forward strand, genomic + { 10, 20, 10, 20, 10, 20, 10, 30 }, // forward strand, native + { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }, // forward strand, genomic, aligned + { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }, // forward strand, native, aligned + { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }, // forward strand, genomic, aligned, clipped + { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }, // forward strand, native, aligned, clipped + { 30, 10, 20, 10, 20, 10, 20, 10 }, // reverse strand, genomic + { 10, 20, 10, 20, 10, 20, 10, 30 }, // reverse strand, native + { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 }, // reverse strand, genomic, aligned + { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }, // reverse strand, native, aligned + { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 }, // 
reverse strand, genomic, aligned, clipped + { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 } // reverse strand, native, aligned, clipped + } + ); + } + { + SCOPED_TRACE("CIGAR: 2H3S4=3D4=3S3H"); + tests::CheckPulseFrameTags( + "2H3S4=3D4=3S3H", // CIGAR + "TTTAACCGTTACCG", // seqBases + "TTTttAACCccGTTAaaCCG", // pulseCalls + { 40, 40, 40, 0, 0, 10, 20, 10, 20, 0, 0, 10, 20, 10, 30, 0, 0, 50, 50, 50 }, // tag data + + { // all pulses + + { 40, 40, 40, 0, 0, 10, 20, 10, 20, 0, 0, 10, 20, 10, 30, 0, 0, 50, 50, 50 }, // forward strand, genomic + { 40, 40, 40, 0, 0, 10, 20, 10, 20, 0, 0, 10, 20, 10, 30, 0, 0, 50, 50, 50 }, // forward strand, native + { }, // forward strand, genomic, aligned + { }, // forward strand, native, aligned + { }, // forward strand, genomic, aligned, clipped + { }, // forward strand, native, aligned, clipped + { 50, 50, 50, 0, 0, 30, 10, 20, 10, 0, 0, 20, 10, 20, 10, 0, 0, 40, 40, 40 }, // reverse strand, genomic + { 40, 40, 40, 0, 0, 10, 20, 10, 20, 0, 0, 10, 20, 10, 30, 0, 0, 50, 50, 50 }, // reverse strand, native + { }, // reverse strand, genomic, aligned + { }, // reverse strand, native, aligned + { }, // reverse strand, genomic, aligned, clipped + { } // reverse strand, native, aligned, clipped + }, + { // basecalls only + + { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 }, // forward strand, genomic + { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 }, // forward strand, native + { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 }, // forward strand, genomic, aligned + { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 }, // forward strand, native, aligned + { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }, // forward strand, genomic, aligned, clipped + { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }, // forward strand, native, aligned, clipped + { 50, 50, 50, 30, 10, 20, 10, 20, 10, 20, 10, 40, 40, 40 }, // reverse strand, genomic + { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 }, // reverse 
strand, native + { 50, 50, 50, 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10, 40, 40, 40 }, // reverse strand, genomic, aligned + { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 }, // reverse strand, native, aligned + { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 }, // reverse strand, genomic, aligned, clipped + { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 } // reverse strand, native, aligned, clipped + } + ); + } +} + +TEST(BamRecordTest, PulseUIntTags) +{ + { + SCOPED_TRACE("CIGAR: 4=3D4="); + tests::CheckPulseUIntTags( + "4=3D4=", // CIGAR + "AACCGTTA", // seqBases + "AAaaCCGggTTA", // pulseCalls + { 10, 20, 0,0, 10, 20, 10, 0,0, 20, 10, 30 }, // tag data + + { // all pulses + + { 10, 20, 0,0, 10, 20, 10, 0,0, 20, 10, 30 }, // forward strand, genomic + { 10, 20, 0,0, 10, 20, 10, 0,0, 20, 10, 30 }, // forward strand, native + { }, // forward strand, genomic, aligned + { }, // forward strand, native, aligned + { }, // forward strand, genomic, aligned, clipped + { }, // forward strand, native, aligned, clipped + { 30, 10, 20, 0,0, 10, 20, 10, 0,0, 20, 10 }, // reverse strand, genomic + { 10, 20, 0,0, 10, 20, 10, 0,0, 20, 10, 30 }, // reverse strand, native + { }, // reverse strand, genomic, aligned + { }, // reverse strand, native, aligned + { }, // reverse strand, genomic, aligned, clipped + { } // reverse strand, native, aligned, clipped + }, + { // basecalls only + + { 10, 20, 10, 20, 10, 20, 10, 30 }, // forward strand, genomic + { 10, 20, 10, 20, 10, 20, 10, 30 }, // forward strand, native + { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }, // forward strand, genomic, aligned + { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }, // forward strand, native, aligned + { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }, // forward strand, genomic, aligned, clipped + { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }, // forward strand, native, aligned, clipped + { 30, 10, 20, 10, 20, 10, 20, 10 }, // reverse strand, genomic + { 10, 20, 10, 20, 10, 20, 10, 30 }, // reverse strand, 
native + { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 }, // reverse strand, genomic, aligned + { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }, // reverse strand, native, aligned + { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 }, // reverse strand, genomic, aligned, clipped + { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 } // reverse strand, native, aligned, clipped + } + ); + } + { + SCOPED_TRACE("CIGAR: 4=1D2I2D4="); + tests::CheckPulseUIntTags( + "4=1D2I2D4=", // CIGAR + "ATCCTAGGTT", // seqBases + "ATttCCTtAGGggTT", // pulseCalls + { 10, 20, 0, 0, 10, 20, 80, 0, 70, 10, 20, 0, 0, 10, 30 }, // tag data + + { // all pulses + + { 10, 20, 0, 0, 10, 20, 80, 0, 70, 10, 20, 0, 0, 10, 30 }, // forward strand, genomic + { 10, 20, 0, 0, 10, 20, 80, 0, 70, 10, 20, 0, 0, 10, 30 }, // forward strand, native + { }, // forward strand, genomic, aligned + { }, // forward strand, native, aligned + { }, // forward strand, genomic, aligned, clipped + { }, // forward strand, native, aligned, clipped + { 30, 10, 0, 0, 20, 10, 70, 0, 80, 20, 10, 0, 0, 20, 10 }, // reverse strand, genomic + { 10, 20, 0, 0, 10, 20, 80, 0, 70, 10, 20, 0, 0, 10, 30 }, // reverse strand, native + { }, // reverse strand, genomic, aligned + { }, // reverse strand, native, aligned + { }, // reverse strand, genomic, aligned, clipped + { } // reverse strand, native, aligned, clipped + }, + { // basecalls only + + { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 }, // forward strand, genomic + { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 }, // forward strand, native + { 10, 20, 10, 20, 0, 80, 70, 0, 0, 10, 20, 10, 30 }, // forward strand, genomic, aligned + { 10, 20, 10, 20, 0, 80, 70, 0, 0, 10, 20, 10, 30 }, // forward strand, native, aligned + { 10, 20, 10, 20, 0, 80, 70, 0, 0, 10, 20, 10, 30 }, // forward strand, genomic, aligned, clipped + { 10, 20, 10, 20, 0, 80, 70, 0, 0, 10, 20, 10, 30 }, // forward strand, native, aligned, clipped + { 30, 10, 20, 10, 70, 80, 20, 10, 20, 10 }, // reverse strand, genomic + { 10, 20, 10, 20, 80, 70, 
10, 20, 10, 30 }, // reverse strand, native + { 30, 10, 20, 10, 0, 70, 80, 0, 0, 20, 10, 20, 10 }, // reverse strand, genomic, aligned + { 10, 20, 10, 20, 0, 0, 80, 70, 0, 10, 20, 10, 30 }, // reverse strand, native, aligned + { 30, 10, 20, 10, 0, 70, 80, 0, 0, 20, 10, 20, 10 }, // reverse strand, genomic, aligned, clipped + { 10, 20, 10, 20, 0, 0, 80, 70, 0, 10, 20, 10, 30 } // reverse strand, native, aligned, clipped + } + ); + } + { + SCOPED_TRACE("CIGAR: 4=1D2P2I2P2D4="); + tests::CheckPulseUIntTags( + "4=1D2P2I2P2D4=", // CIGAR + "ATCCTAGGTT", // seqBases + "ATttCCTtAGGggTT", // pulseCalls + { 10, 20, 0, 0, 10, 20, 80, 0, 70, 10, 20, 0, 0, 10, 30 }, // tag data + + { // all pulses + + { 10, 20, 0, 0, 10, 20, 80, 0, 70, 10, 20, 0, 0, 10, 30 }, // forward strand, genomic + { 10, 20, 0, 0, 10, 20, 80, 0, 70, 10, 20, 0, 0, 10, 30 }, // forward strand, native + { }, // forward strand, genomic, aligned + { }, // forward strand, native, aligned + { }, // forward strand, genomic, aligned, clipped + { }, // forward strand, native, aligned, clipped + { 30, 10, 0, 0, 20, 10, 70, 0, 80, 20, 10, 0, 0, 20, 10 }, // reverse strand, genomic + { 10, 20, 0, 0, 10, 20, 80, 0, 70, 10, 20, 0, 0, 10, 30 }, // reverse strand, native + { }, // reverse strand, genomic, aligned + { }, // reverse strand, native, aligned + { }, // reverse strand, genomic, aligned, clipped + { } // reverse strand, native, aligned, clipped + }, + { // basecalls only + + { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 }, // forward strand, genomic + { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 }, // forward strand, native + { 10, 20, 10, 20, 0, 0, 0, 80, 70, 0, 0, 0, 0, 10, 20, 10, 30 }, // forward strand, genomic, aligned + { 10, 20, 10, 20, 0, 0, 0, 80, 70, 0, 0, 0, 0, 10, 20, 10, 30 }, // forward strand, native, aligned + { 10, 20, 10, 20, 0, 0, 0, 80, 70, 0, 0, 0, 0, 10, 20, 10, 30 }, // forward strand, genomic, aligned, clipped + { 10, 20, 10, 20, 0, 0, 0, 80, 70, 0, 0, 0, 0, 10, 20, 10, 30 }, // forward strand, 
native, aligned, clipped + { 30, 10, 20, 10, 70, 80, 20, 10, 20, 10 }, // reverse strand, genomic + { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 }, // reverse strand, native + { 30, 10, 20, 10, 0, 0, 0, 70, 80, 0, 0, 0, 0, 20, 10, 20, 10 }, // reverse strand, genomic, aligned + { 10, 20, 10, 20, 0, 0, 0, 0, 80, 70, 0, 0, 0, 10, 20, 10, 30 }, // reverse strand, native, aligned + { 30, 10, 20, 10, 0, 0, 0, 70, 80, 0, 0, 0, 0, 20, 10, 20, 10 }, // reverse strand, genomic, aligned, clipped + { 10, 20, 10, 20, 0, 0, 0, 0, 80, 70, 0, 0, 0, 10, 20, 10, 30 } // reverse strand, native, aligned, clipped + } + ); + } + { + SCOPED_TRACE("CIGAR: 3S4=3D4=3S"); + tests::CheckPulseUIntTags( + "3S4=3D4=3S", // CIGAR + "TTTAACCGTTACCG", // seqBases + "TTTttAACCccGTTAaaCCG", // pulseCalls + { 40, 40, 40, 0, 0, 10, 20, 10, 20, 0, 0, 10, 20, 10, 30, 0, 0, 50, 50, 50 }, // tag data + + { // all pulses + + { 40, 40, 40, 0, 0, 10, 20, 10, 20, 0, 0, 10, 20, 10, 30, 0, 0, 50, 50, 50 }, // forward strand, genomic + { 40, 40, 40, 0, 0, 10, 20, 10, 20, 0, 0, 10, 20, 10, 30, 0, 0, 50, 50, 50 }, // forward strand, native + { }, // forward strand, genomic, aligned + { }, // forward strand, native, aligned + { }, // forward strand, genomic, aligned, clipped + { }, // forward strand, native, aligned, clipped + { 50, 50, 50, 0, 0, 30, 10, 20, 10, 0, 0, 20, 10, 20, 10, 0, 0, 40, 40, 40 }, // reverse strand, genomic + { 40, 40, 40, 0, 0, 10, 20, 10, 20, 0, 0, 10, 20, 10, 30, 0, 0, 50, 50, 50 }, // reverse strand, native + { }, // reverse strand, genomic, aligned + { }, // reverse strand, native, aligned + { }, // reverse strand, genomic, aligned, clipped + { } // reverse strand, native, aligned, clipped + }, + { // basecalls only + + { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 }, // forward strand, genomic + { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 }, // forward strand, native + { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 }, // forward strand, genomic, 
aligned + { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 }, // forward strand, native, aligned + { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }, // forward strand, genomic, aligned, clipped + { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }, // forward strand, native, aligned, clipped + { 50, 50, 50, 30, 10, 20, 10, 20, 10, 20, 10, 40, 40, 40 }, // reverse strand, genomic + { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 }, // reverse strand, native + { 50, 50, 50, 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10, 40, 40, 40 }, // reverse strand, genomic, aligned + { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 }, // reverse strand, native, aligned + { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 }, // reverse strand, genomic, aligned, clipped + { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 } // reverse strand, native, aligned, clipped + } + ); + } + { + SCOPED_TRACE("CIGAR: 2H4=3D4=3H"); + tests::CheckPulseUIntTags( + "2H4=3D4=3H", // CIGAR + "AACCGTTA", // seqBases + "AAaaCCGggTTA", // pulseCalls + { 10, 20, 0, 0, 10, 20, 10, 0, 0, 20, 10, 30 }, // tag data + + { // all pulses + + { 10, 20, 0, 0, 10, 20, 10, 0, 0, 20, 10, 30 }, // forward strand, genomic + { 10, 20, 0, 0, 10, 20, 10, 0, 0, 20, 10, 30 }, // forward strand, native + { }, // forward strand, genomic, aligned + { }, // forward strand, native, aligned + { }, // forward strand, genomic, aligned, clipped + { }, // forward strand, native, aligned, clipped + { 30, 10, 20, 0, 0, 10, 20, 10, 0, 0, 20, 10 }, // reverse strand, genomic + { 10, 20, 0, 0, 10, 20, 10, 0, 0, 20, 10, 30 }, // reverse strand, native + { }, // reverse strand, genomic, aligned + { }, // reverse strand, native, aligned + { }, // reverse strand, genomic, aligned, clipped + { } // reverse strand, native, aligned, clipped + }, + { // basecalls only + + { 10, 20, 10, 20, 10, 20, 10, 30 }, // forward strand, genomic + { 10, 20, 10, 20, 10, 20, 10, 30 }, // forward strand, native + { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 
30 }, // forward strand, genomic, aligned + { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }, // forward strand, native, aligned + { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }, // forward strand, genomic, aligned, clipped + { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }, // forward strand, native, aligned, clipped + { 30, 10, 20, 10, 20, 10, 20, 10 }, // reverse strand, genomic + { 10, 20, 10, 20, 10, 20, 10, 30 }, // reverse strand, native + { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 }, // reverse strand, genomic, aligned + { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }, // reverse strand, native, aligned + { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 }, // reverse strand, genomic, aligned, clipped + { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 } // reverse strand, native, aligned, clipped + } + ); + } + { + SCOPED_TRACE("CIGAR: 2H3S4=3D4=3S3H"); + tests::CheckPulseUIntTags( + "2H3S4=3D4=3S3H", // CIGAR + "TTTAACCGTTACCG", // seqBases + "TTTttAACCccGTTAaaCCG", // pulseCalls + { 40, 40, 40, 0, 0, 10, 20, 10, 20, 0, 0, 10, 20, 10, 30, 0, 0, 50, 50, 50 }, // tag data + + { // all pulses + + { 40, 40, 40, 0, 0, 10, 20, 10, 20, 0, 0, 10, 20, 10, 30, 0, 0, 50, 50, 50 }, // forward strand, genomic + { 40, 40, 40, 0, 0, 10, 20, 10, 20, 0, 0, 10, 20, 10, 30, 0, 0, 50, 50, 50 }, // forward strand, native + { }, // forward strand, genomic, aligned + { }, // forward strand, native, aligned + { }, // forward strand, genomic, aligned, clipped + { }, // forward strand, native, aligned, clipped + { 50, 50, 50, 0, 0, 30, 10, 20, 10, 0, 0, 20, 10, 20, 10, 0, 0, 40, 40, 40 }, // reverse strand, genomic + { 40, 40, 40, 0, 0, 10, 20, 10, 20, 0, 0, 10, 20, 10, 30, 0, 0, 50, 50, 50 }, // reverse strand, native + { }, // reverse strand, genomic, aligned + { }, // reverse strand, native, aligned + { }, // reverse strand, genomic, aligned, clipped + { } // reverse strand, native, aligned, clipped + }, + { // basecalls only + + { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 }, // forward strand, 
genomic + { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 }, // forward strand, native + { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 }, // forward strand, genomic, aligned + { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 }, // forward strand, native, aligned + { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }, // forward strand, genomic, aligned, clipped + { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }, // forward strand, native, aligned, clipped + { 50, 50, 50, 30, 10, 20, 10, 20, 10, 20, 10, 40, 40, 40 }, // reverse strand, genomic + { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 }, // reverse strand, native + { 50, 50, 50, 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10, 40, 40, 40 }, // reverse strand, genomic, aligned + { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 }, // reverse strand, native, aligned + { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 }, // reverse strand, genomic, aligned, clipped + { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 } // reverse strand, native, aligned, clipped + } + ); + } +} diff --git a/tests/src/test_BamRecordBuilder.cpp b/tests/src/test_BamRecordBuilder.cpp new file mode 100644 index 0000000..d1287d5 --- /dev/null +++ b/tests/src/test_BamRecordBuilder.cpp @@ -0,0 +1,274 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. 
+// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. 
+ +// Author: Derek Barnett + +#ifdef PBBAM_TESTING +#define private public +#endif + +#include +#include +#include +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; + +namespace tests { + +static +void CheckRawData(const BamRecordImpl& bam) +{ + // ensure raw data (lengths at least) matches API-facing data + + const uint32_t expectedNameLength = bam.Name().size() + 1; + const uint32_t expectedNumCigarOps = bam.CigarData().size(); + const int32_t expectedSeqLength = bam.Sequence().length(); + const size_t expectedTagsLength = BamTagCodec::Encode(bam.Tags()).size(); + + // Name CIGAR Sequence Quals Tags + // l_qname + (n_cigar * 4) + (l_qseq+1)/2 + l_qseq + << TAGS >> + + const int expectedTotalDataLength = expectedNameLength + + (expectedNumCigarOps * 4) + + (expectedSeqLength+1)/2 + + expectedSeqLength + + expectedTagsLength; + + EXPECT_TRUE((bool)bam.d_); + EXPECT_EQ(expectedNameLength, bam.d_->core.l_qname); + EXPECT_EQ(expectedNumCigarOps, bam.d_->core.n_cigar); + EXPECT_EQ(expectedSeqLength, bam.d_->core.l_qseq); + EXPECT_EQ(expectedTotalDataLength, bam.d_->l_data); +} + +static +void CheckRawData(const BamRecord& bam) +{ CheckRawData(bam.impl_); } + +} // namespace tests + +TEST(BamRecordBuilderTest, DefaultValues) +{ + BamRecordBuilder builder; + BamRecord bam = builder.Build(); + + const PBBAM_SHARED_PTR rawData = bam.impl_.d_; + ASSERT_TRUE((bool)rawData); + + // fixed-length (core) data + EXPECT_EQ(0, rawData->core.tid); + EXPECT_EQ(0, rawData->core.pos); + EXPECT_EQ(0, rawData->core.bin); + EXPECT_EQ(0, rawData->core.qual); + EXPECT_EQ(1, rawData->core.l_qname); // initialized w/ NULL-term + EXPECT_EQ(0, rawData->core.flag); + EXPECT_EQ(0, rawData->core.n_cigar); + EXPECT_EQ(0, rawData->core.l_qseq); + EXPECT_EQ(0, rawData->core.mtid); + EXPECT_EQ(0, rawData->core.mpos); + EXPECT_EQ(0, rawData->core.isize); + + // variable length data + EXPECT_TRUE(rawData->data != nullptr); + EXPECT_EQ(1, rawData->l_data); + 
EXPECT_EQ((int)0x800, rawData->m_data); // check this if we change or tune later + + // ------------------------------- + // check data via API calls + // ------------------------------- + + EXPECT_EQ(0, bam.impl_.Bin()); + EXPECT_EQ(0, bam.impl_.Flag()); + EXPECT_EQ(0, bam.impl_.InsertSize()); + EXPECT_EQ(0, bam.impl_.MapQuality()); + EXPECT_EQ(0, bam.impl_.MateReferenceId()); + EXPECT_EQ(0, bam.impl_.MatePosition()); + EXPECT_EQ(0, bam.impl_.Position()); + EXPECT_EQ(0, bam.impl_.ReferenceId()); + EXPECT_EQ(0, bam.impl_.Tags().size()); + + EXPECT_FALSE(bam.impl_.IsDuplicate()); + EXPECT_FALSE(bam.impl_.IsFailedQC()); + EXPECT_FALSE(bam.impl_.IsFirstMate()); + EXPECT_TRUE(bam.impl_.IsMapped()); + EXPECT_TRUE(bam.impl_.IsMateMapped()); + EXPECT_FALSE(bam.impl_.IsMateReverseStrand()); + EXPECT_FALSE(bam.impl_.IsPaired()); + EXPECT_TRUE(bam.impl_.IsPrimaryAlignment()); + EXPECT_FALSE(bam.impl_.IsProperPair()); + EXPECT_FALSE(bam.impl_.IsReverseStrand()); + EXPECT_FALSE(bam.impl_.IsSecondMate()); + EXPECT_FALSE(bam.impl_.IsSupplementaryAlignment()); + + const std::string emptyString = ""; + EXPECT_EQ(emptyString, bam.impl_.Name()); + EXPECT_EQ(emptyString, bam.impl_.CigarData().ToStdString()); + EXPECT_EQ(emptyString, bam.impl_.Sequence()); + EXPECT_EQ(emptyString, bam.impl_.Qualities().Fastq()); + tests::CheckRawData(bam); +} + +TEST(BamRecordBuilderTest, CheckSetters) +{ + // should be 28 bytes, encoded + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector({34, 5, 125}); + tags["XY"] = static_cast(-42); + + BamRecordBuilder builder; + builder.Bin(42) + .Flag(42) + .InsertSize(42) + .MapQuality(42) + .MatePosition(42) + .MateReferenceId(42) + .Position(42) + .ReferenceId(42) + .Tags(tags); + + BamRecord bam = builder.Build(); + + // ------------------------------- + // check raw data + // ------------------------------- + + const PBBAM_SHARED_PTR rawData = bam.impl_.d_; + 
ASSERT_TRUE((bool)rawData); + + // fixed-length (core) data + EXPECT_EQ(42, rawData->core.tid); + EXPECT_EQ(42, rawData->core.pos); + EXPECT_EQ(42, rawData->core.bin); + EXPECT_EQ(42, rawData->core.qual); + EXPECT_EQ(1, rawData->core.l_qname); // initialized w/ NULL-term + EXPECT_EQ(42, rawData->core.flag); + EXPECT_EQ(0, rawData->core.n_cigar); + EXPECT_EQ(0, rawData->core.l_qseq); + EXPECT_EQ(42, rawData->core.mtid); + EXPECT_EQ(42, rawData->core.mpos); + EXPECT_EQ(42, rawData->core.isize); + + // variable length data + EXPECT_TRUE(rawData->data != nullptr); + EXPECT_EQ(29, rawData->l_data); // NULL-term qname + tags + EXPECT_EQ((int)0x800, rawData->m_data); // check this if we change or tune later + + // ------------------------------- + // check data via API calls + // ------------------------------- + + EXPECT_EQ(42, bam.impl_.Bin()); + EXPECT_EQ(42, bam.impl_.Flag()); + EXPECT_EQ(42, bam.impl_.InsertSize()); + EXPECT_EQ(42, bam.impl_.MapQuality()); + EXPECT_EQ(42, bam.impl_.MateReferenceId()); + EXPECT_EQ(42, bam.impl_.MatePosition()); + EXPECT_EQ(42, bam.impl_.Position()); + EXPECT_EQ(42, bam.impl_.ReferenceId()); + + const TagCollection& fetchedTags = bam.impl_.Tags(); + + EXPECT_TRUE(fetchedTags.at("HX").HasModifier(TagModifier::HEX_STRING)); + EXPECT_EQ(std::string("1abc75"), fetchedTags.at("HX").ToString()); + EXPECT_EQ(static_cast(-42), fetchedTags.at("XY").ToInt32()); + EXPECT_EQ(std::vector({34, 5, 125}), fetchedTags.at("CA").ToUInt8Array()); +} + +//#define SEQ_LENGTH 7000 +//#define NUM_RECORDS 1000 + +//const std::string& TEST_SEQUENCE = std::string(SEQ_LENGTH, 'G'); +//const std::string& TEST_QUALITIES = std::string(SEQ_LENGTH, '='); +//const std::string& TEST_NAME = std::string(SEQ_LENGTH, '/'); +//const std::string& TEST_TAGDATA = std::string(SEQ_LENGTH, '2'); + +//TEST(BamRecordBuilderTest, JustDoingSomeTimings_BamRecordBuilder) +//{ + +// BamRecordBuilder builder; + +// TagCollection tags; +// tags["aa"] = TEST_TAGDATA; +// tags["bb"] = 
TEST_TAGDATA; +// tags["cc"] = TEST_TAGDATA; +// tags["dd"] = TEST_TAGDATA; +// tags["ee"] = TEST_TAGDATA; +// tags["ff"] = TEST_TAGDATA; + +// auto start = std::chrono::steady_clock::now(); + +// BamRecord record; +// for (size_t i = 0; i < NUM_RECORDS; ++i) { +// builder.Sequence(TEST_SEQUENCE) +// .Qualities(TEST_QUALITIES) +// .Name(TEST_NAME) +// .Tags(tags) +// .BuildInPlace(record); +// } +// auto end = std::chrono::steady_clock::now(); +// (void)record; +// auto diff = end - start; +// std::cout << std::chrono::duration (diff).count() << " ms" << std::endl; +//} + + +//TEST(BamRecordBuilderTest, JustDoingSomeTimings_BamRecordOnly) +//{ +// TagCollection tags; +// tags["aa"] = TEST_TAGDATA; +// tags["bb"] = TEST_TAGDATA; +// tags["cc"] = TEST_TAGDATA; +// tags["dd"] = TEST_TAGDATA; +// tags["ee"] = TEST_TAGDATA; +// tags["ff"] = TEST_TAGDATA; + +// auto start = std::chrono::steady_clock::now(); + +// BamRecord record; +// for (size_t i = 0; i < NUM_RECORDS; ++i) { +// record.SetSequenceAndQualities(TEST_SEQUENCE, TEST_QUALITIES); +// record.Name(TEST_NAME); +// record.Tags(tags); +// } +// auto end = std::chrono::steady_clock::now(); +// (void)record; +// auto diff = end - start; +// std::cout << std::chrono::duration (diff).count() << " ms" << std::endl; +//} + diff --git a/tests/src/test_BamRecordClipping.cpp b/tests/src/test_BamRecordClipping.cpp new file mode 100644 index 0000000..8a5ddb2 --- /dev/null +++ b/tests/src/test_BamRecordClipping.cpp @@ -0,0 +1,1731 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. 
+ +// Author: Derek Barnett + +#ifdef PBBAM_TESTING +#define private public +#endif + +#include +#include +#include +#include +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace std; + +typedef vector f_data; + +namespace tests { + +static +BamRecord MakeRecord(const Position qStart, + const Position qEnd, + const string& seq, + const string& quals, + const string& tagBases, + const string& tagQuals, + const f_data& frames, + const string& pulseCall = "", + const string& pulseBases = "", + const string& pulseQuals = "", + const f_data& pulseFrames = f_data()) +{ + BamRecordImpl impl; + impl.SetSequenceAndQualities(seq, quals); + + TagCollection tags; + tags["qs"] = qStart; // qStart + tags["qe"] = qEnd; // qEnd + tags["dt"] = tagBases; // deletionTag + tags["st"] = tagBases; // substitutionTag + tags["dq"] = tagQuals; // deletionQV + tags["iq"] = tagQuals; // insertionQV + tags["mq"] = tagQuals; // mergeQV + tags["sq"] = tagQuals; // substitutionQV + tags["ip"] = frames; // IPD + tags["pw"] = frames; // pulseWidth + tags["pc"] = pulseCall; // pulseCall + tags["pt"] = pulseBases; // altLabelTag + tags["pq"] = pulseQuals; // labelQV + tags["pv"] = pulseQuals; // altLabelQV + tags["pg"] = pulseQuals; // pulseMergeQV + tags["pa"] = pulseFrames; // pkmean + tags["pm"] = pulseFrames; // pkmid + impl.Tags(tags); + + return BamRecord(std::move(impl)); +} + +static +BamRecord MakeCCSRecord(const string& seq, + const string& quals, + const string& tagBases, + const string& tagQuals, + const f_data& frames, + const string& pulseCall = "", + const string& pulseBases = "", + const string& pulseQuals = "", + const f_data& pulseFrames = f_data()) +{ + BamRecordImpl impl; + impl.Name("movie/42/ccs"); + impl.SetSequenceAndQualities(seq, quals); + + TagCollection tags; + tags["dt"] = tagBases; // deletionTag + tags["st"] = tagBases; // substitutionTag + tags["dq"] = tagQuals; // deletionQV + tags["iq"] = tagQuals; // insertionQV + tags["mq"] = 
tagQuals; // mergeQV + tags["sq"] = tagQuals; // substitutionQV + tags["ip"] = frames; // IPD + tags["pw"] = frames; // pulseWidth + tags["pc"] = pulseCall; // pulseCall + tags["pt"] = pulseBases; // altLabelTag + tags["pq"] = pulseQuals; // labelQV + tags["pv"] = pulseQuals; // altLabelQV + tags["pg"] = pulseQuals; // pulseMergeQV + tags["pa"] = pulseFrames; // pkmean + tags["pm"] = pulseFrames; // pkmid + impl.Tags(tags); + + return BamRecord(std::move(impl)); +} + +} // namespace tests + +TEST(BamRecordClippingTest, ClipToQuery_Basic) +{ + const Position qStart = 500; + const Position qEnd = 510; + const string seq = "AACCGTTAGC"; + const string quals = "?]?]?]?]?*"; + const string tagBases = "AACCGTTAGC"; + const string tagQuals = "?]?]?]?]?*"; + const f_data frames = { 10, 10, 20, 20, 30, 40, 40, 10, 30, 20 }; + + const string pulseCall = "ttAaAtaCCGggatTTAcatGCt"; + const string pulseBases = pulseCall; + const string pulseQuals = "==?=]==?]?====]?]===?*="; + const f_data pulseFrames = { 0,0,10,0,10,0,0,20,20,30,0,0,0,0,40,40,10,0,0,0,30,20,0 }; + + const int32_t tId = 0; + const Position tPos = 100; + const uint8_t mapQual = 80; + + const Position clipStart = 502; + const Position clipEnd = 509; + + const string seq_clipped = "CCGTTAG"; + const string quals_clipped = "?]?]?]?"; + const string tagBases_clipped = "CCGTTAG"; + const string tagQuals_clipped = "?]?]?]?"; + const f_data frames_clipped = { 20, 20, 30, 40, 40, 10, 30 }; + + const string pulseCall_clipped = "CCGggatTTAcatG"; + const string pulseQuals_clipped = "?]?====]?]===?"; + const f_data pulseFrames_clipped = { 20,20,30,0,0,0,0,40,40,10,0,0,0,30 }; + + const string seq_rev = "GCTAACGGTT"; + const string pulseCall_rev = "aGCatgTAAatccCGGtaTtTaa"; + const string quals_rev = "*?]?]?]?]?"; + const string tagQuals_rev = quals_rev; + const f_data frames_rev = { 20, 30, 10, 40, 40, 30, 20, 20, 10, 10 }; + + const string seq_rev_clipped = "CTAACGG"; + const string quals_rev_clipped = "?]?]?]?"; + const 
string tagBases_rev_clipped = seq_rev_clipped; + const string tagQuals_rev_clipped = quals_rev_clipped; + const f_data frames_rev_clipped = { 30, 10, 40, 40, 30, 20, 20 }; + + const string pulseCall_rev_clipped = "CatgTAAatccCGG"; + const string pulseQuals_rev_clipped = "?===]?]====?]?"; + const f_data pulseFrames_rev_clipped = { 30,0,0,0,10,40,40,0,0,0,0,30,20,20 }; + + const string s1_cigar = "10="; + const string s2_cigar = "5=3D5="; + const string s3_cigar = "4=1D2I2D4="; + + const string s1_cigar_clipped = "7="; + const string s2_cigar_clipped = "3=3D4="; + const string s3_cigar_clipped = "2=1D2I2D3="; + + const BamRecord prototype = tests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames, + pulseCall, pulseBases, pulseQuals, pulseFrames); + + BamRecord s0 = prototype; // unmapped record + BamRecord s1 = prototype.Mapped(tId, tPos, Strand::FORWARD, s1_cigar, mapQual); + BamRecord s2 = prototype.Mapped(tId, tPos, Strand::FORWARD, s2_cigar, mapQual); + BamRecord s3 = prototype.Mapped(tId, tPos, Strand::FORWARD, s3_cigar, mapQual); + BamRecord s1_rev = prototype.Mapped(tId, tPos, Strand::REVERSE, s1_cigar, mapQual); + BamRecord s2_rev = prototype.Mapped(tId, tPos, Strand::REVERSE, s2_cigar, mapQual); + BamRecord s3_rev = prototype.Mapped(tId, tPos, Strand::REVERSE, s3_cigar, mapQual); + + s0.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd); + s1.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd); + s2.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd); + s3.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd); + s1_rev.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd); + s2_rev.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd); + s3_rev.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd); + + { // s0 + + EXPECT_FALSE(s0.IsMapped()); + EXPECT_EQ(clipStart, s0.QueryStart()); + EXPECT_EQ(clipEnd, s0.QueryEnd()); + EXPECT_EQ(PacBio::BAM::UnmappedPosition, s0.AlignedStart()); + EXPECT_EQ(PacBio::BAM::UnmappedPosition, s0.AlignedEnd()); + 
EXPECT_EQ(PacBio::BAM::UnmappedPosition, s0.ReferenceStart()); + EXPECT_EQ(PacBio::BAM::UnmappedPosition, s0.ReferenceEnd()); + + const BamRecordView view + { + s0, + Orientation::GENOMIC, + false, + false, + PulseBehavior::ALL + }; + + EXPECT_EQ(seq_clipped, view.Sequence()); + EXPECT_EQ(quals_clipped, view.Qualities().Fastq()); + EXPECT_EQ(tagBases_clipped, view.DeletionTags()); + EXPECT_EQ(tagQuals_clipped, view.DeletionQVs().Fastq()); + EXPECT_EQ(pulseQuals_clipped, view.LabelQVs().Fastq()); + EXPECT_EQ(pulseQuals_clipped, view.AltLabelQVs().Fastq()); + EXPECT_EQ(frames_clipped, view.IPD().Data()); + EXPECT_EQ(pulseCall_clipped, view.PulseCalls()); + } + + { // s1 - FORWARD + + EXPECT_TRUE(s1.IsMapped()); + EXPECT_EQ(Strand::FORWARD, s1.AlignedStrand()); + EXPECT_EQ(clipStart, s1.QueryStart()); + EXPECT_EQ(clipEnd, s1.QueryEnd()); + EXPECT_EQ(clipStart, s1.AlignedStart()); // queryStart (no soft clips) + EXPECT_EQ(clipEnd, s1.AlignedEnd()); // alignStart + seqLength + EXPECT_EQ(102, s1.ReferenceStart()); // 100 + startOffset + EXPECT_EQ(109, s1.ReferenceEnd()); // RefStart + 7= + + EXPECT_EQ(s1_cigar_clipped, s1.CigarData().ToStdString()); + + const BamRecordView view + { + s1, + Orientation::GENOMIC, + false, + false, + PulseBehavior::ALL + }; + + EXPECT_EQ(seq_clipped, view.Sequence()); + EXPECT_EQ(quals_clipped, view.Qualities().Fastq()); + EXPECT_EQ(tagBases_clipped, view.DeletionTags()); + EXPECT_EQ(tagQuals_clipped, view.DeletionQVs().Fastq()); + EXPECT_EQ(pulseQuals_clipped, view.LabelQVs().Fastq()); + EXPECT_EQ(pulseQuals_clipped, view.AltLabelQVs().Fastq()); + EXPECT_EQ(frames_clipped, view.IPD().Data()); + EXPECT_EQ(pulseCall_clipped, view.PulseCalls()); + } + + { // s1 - REVERSE + + EXPECT_TRUE(s1_rev.IsMapped()); + EXPECT_EQ(Strand::REVERSE, s1_rev.AlignedStrand()); + EXPECT_EQ(clipStart, s1_rev.QueryStart()); + EXPECT_EQ(clipEnd, s1_rev.QueryEnd()); + EXPECT_EQ(clipStart, s1_rev.AlignedStart()); // queryStart (no soft clips) + EXPECT_EQ(clipEnd, 
s1_rev.AlignedEnd()); // alignStart + seqLength + EXPECT_EQ(102, s1_rev.ReferenceStart()); // 100 + startOffset + EXPECT_EQ(109, s1_rev.ReferenceEnd()); // RefStart + 7= + + EXPECT_EQ(s1_cigar_clipped, s1_rev.CigarData().ToStdString()); + + const BamRecordView view + { + s1_rev, + Orientation::GENOMIC, + false, + false, + PulseBehavior::ALL + }; + + EXPECT_EQ(seq_rev_clipped, view.Sequence()); + EXPECT_EQ(quals_rev_clipped, view.Qualities().Fastq()); + EXPECT_EQ(tagBases_rev_clipped, view.DeletionTags()); + EXPECT_EQ(tagQuals_rev_clipped, view.DeletionQVs().Fastq()); + EXPECT_EQ(pulseQuals_rev_clipped, view.LabelQVs().Fastq()); + EXPECT_EQ(pulseQuals_rev_clipped, view.AltLabelQVs().Fastq()); + EXPECT_EQ(frames_rev_clipped, view.IPD().Data()); + EXPECT_EQ(pulseCall_rev_clipped, view.PulseCalls()); + } + + { // s2 - FORWARD + + EXPECT_TRUE(s2.IsMapped()); + EXPECT_EQ(Strand::FORWARD, s2.AlignedStrand()); + EXPECT_EQ(clipStart, s2.QueryStart()); + EXPECT_EQ(clipEnd, s2.QueryEnd()); + EXPECT_EQ(clipStart, s2.AlignedStart()); // queryStart (no soft clips) + EXPECT_EQ(clipEnd, s2.AlignedEnd()); // alignStart + seqLength + EXPECT_EQ(102, s2.ReferenceStart()); // 100 + startOffset + EXPECT_EQ(112, s2.ReferenceEnd()); // RefStart + 7= + 3D + + EXPECT_EQ(s2_cigar_clipped, s2.CigarData().ToStdString()); + + const BamRecordView view + { + s2, + Orientation::GENOMIC, + false, + false, + PulseBehavior::ALL + }; + + EXPECT_EQ(seq_clipped, view.Sequence()); + EXPECT_EQ(quals_clipped, view.Qualities().Fastq()); + EXPECT_EQ(tagBases_clipped, view.DeletionTags()); + EXPECT_EQ(tagQuals_clipped, view.DeletionQVs().Fastq()); + EXPECT_EQ(pulseQuals_clipped, view.LabelQVs().Fastq()); + EXPECT_EQ(pulseQuals_clipped, view.AltLabelQVs().Fastq()); + EXPECT_EQ(frames_clipped, view.IPD().Data()); + } + + { // s2 - REVERSE + + EXPECT_TRUE(s2_rev.IsMapped()); + EXPECT_EQ(Strand::REVERSE, s2_rev.AlignedStrand()); + EXPECT_EQ(clipStart, s2_rev.QueryStart()); + EXPECT_EQ(clipEnd, s2_rev.QueryEnd()); 
+ EXPECT_EQ(clipStart, s2_rev.AlignedStart()); // queryStart (no soft clips) + EXPECT_EQ(clipEnd, s2_rev.AlignedEnd()); // alignStart + seqLength + EXPECT_EQ(102, s2_rev.ReferenceStart()); // 100 + startOffset + EXPECT_EQ(112, s2_rev.ReferenceEnd()); // RefStart + 7= + 3D + + EXPECT_EQ(s2_cigar_clipped, s2_rev.CigarData().ToStdString()); + + const BamRecordView view + { + s2_rev, + Orientation::GENOMIC, + false, + false, + PulseBehavior::ALL + }; + + EXPECT_EQ(seq_rev_clipped, view.Sequence()); + EXPECT_EQ(quals_rev_clipped, view.Qualities().Fastq()); + EXPECT_EQ(tagBases_rev_clipped, view.DeletionTags()); + EXPECT_EQ(tagQuals_rev_clipped, view.DeletionQVs().Fastq()); + EXPECT_EQ(pulseQuals_rev_clipped, view.LabelQVs().Fastq()); + EXPECT_EQ(pulseQuals_rev_clipped, view.AltLabelQVs().Fastq()); + EXPECT_EQ(frames_rev_clipped, view.IPD().Data()); + EXPECT_EQ(pulseCall_rev_clipped, view.PulseCalls()); + } + + { // s3 - FORWARD + + EXPECT_TRUE(s3.IsMapped()); + EXPECT_EQ(Strand::FORWARD, s3.AlignedStrand()); + EXPECT_EQ(clipStart, s3.QueryStart()); + EXPECT_EQ(clipEnd, s3.QueryEnd()); + EXPECT_EQ(clipStart, s3.AlignedStart()); // queryStart (no soft clips) + EXPECT_EQ(clipEnd, s3.AlignedEnd()); // alignStart + seqLength + EXPECT_EQ(102, s3.ReferenceStart()); // 100 + startOffset + EXPECT_EQ(110, s3.ReferenceEnd()); // RefStart + 5= + 3D + + EXPECT_EQ(s3_cigar_clipped, s3.CigarData().ToStdString()); + + const BamRecordView view + { + s3, + Orientation::GENOMIC, + false, + false, + PulseBehavior::ALL + }; + + EXPECT_EQ(seq_clipped, view.Sequence()); + EXPECT_EQ(quals_clipped, view.Qualities().Fastq()); + EXPECT_EQ(tagBases_clipped, view.DeletionTags()); + EXPECT_EQ(tagQuals_clipped, view.DeletionQVs().Fastq()); + EXPECT_EQ(pulseQuals_clipped, view.LabelQVs().Fastq()); + EXPECT_EQ(pulseQuals_clipped, view.AltLabelQVs().Fastq()); + EXPECT_EQ(frames_clipped, view.IPD().Data()); + EXPECT_EQ(pulseCall_clipped, view.PulseCalls()); + } + + { // s3 - REVERSE + + 
EXPECT_TRUE(s3_rev.IsMapped()); + EXPECT_EQ(Strand::REVERSE, s3_rev.AlignedStrand()); + EXPECT_EQ(clipStart, s3_rev.QueryStart()); + EXPECT_EQ(clipEnd, s3_rev.QueryEnd()); + EXPECT_EQ(clipStart, s3_rev.AlignedStart()); // queryStart (no soft clips) + EXPECT_EQ(clipEnd, s3_rev.AlignedEnd()); // alignStart + seqLength + EXPECT_EQ(102, s3_rev.ReferenceStart()); // 100 + startOffset + EXPECT_EQ(110, s3_rev.ReferenceEnd()); // RefStart + 5= + 3D + + EXPECT_EQ(s3_cigar_clipped, s3_rev.CigarData().ToStdString()); + + const BamRecordView view + { + s3_rev, + Orientation::GENOMIC, + false, + false, + PulseBehavior::ALL + }; + + EXPECT_EQ(seq_rev_clipped, view.Sequence()); + EXPECT_EQ(quals_rev_clipped, view.Qualities().Fastq()); + EXPECT_EQ(tagBases_rev_clipped, view.DeletionTags()); + EXPECT_EQ(tagQuals_rev_clipped, view.DeletionQVs().Fastq()); + EXPECT_EQ(pulseQuals_rev_clipped, view.LabelQVs().Fastq()); + EXPECT_EQ(pulseQuals_rev_clipped, view.AltLabelQVs().Fastq()); + EXPECT_EQ(frames_rev_clipped, view.IPD().Data()); + EXPECT_EQ(pulseCall_rev_clipped, view.PulseCalls()); + } +} + +TEST(BamRecordClippingTest, ClipToQuery_WithSoftClips) +{ + const Position qStart = 500; + const Position qEnd = 515; + const string seq = "TTAACCGTTAGCAAA"; + const string seq_rev = "TTTGCTAACGGTTAA"; + const string quals = "--?]?]?]?]?*+++"; + const string tagBases = "TTAACCGTTAGCAAA"; + const string tagQuals = "--?]?]?]?]?*+++"; + const string tagQuals_rev = "+++*?]?]?]?]?--"; + const f_data frames = { 40, 40, 10, 10, 20, 20, 30, 40, 40, 10, 30, 20, 10, 10, 10 }; + const f_data frames_rev = { 10, 10, 10, 20, 30, 10, 40, 40, 30, 20, 20, 10, 10, 40, 40 }; + + const int32_t tId = 0; + const Position tPos = 100; + const uint8_t mapQual = 80; + + const Position clipStart = 502; + const Position clipEnd = 509; + + const string s1_cigar = "2S10=3S"; + const string s1_cigar_clipped = "7="; + const string s1_seq_clipped = "AACCGTT"; + const string s1_quals_clipped = "?]?]?]?"; + const string 
s1_tagBases_clipped = s1_seq_clipped; + const string s1_tagQuals_clipped = s1_quals_clipped; + const f_data s1_frames_clipped = { 10, 10, 20, 20, 30, 40, 40 }; + const string s1_seq_rev_clipped = "AACGGTT"; + const string s1_quals_rev_clipped = "?]?]?]?"; + const string s1_tagBases_rev_clipped = s1_seq_rev_clipped; + const string s1_tagQuals_rev_clipped = s1_quals_rev_clipped; + const f_data s1_frames_rev_clipped = { 40, 40, 30, 20, 20, 10, 10 }; + + const string s2_cigar = "2S5=3D5=3S"; + const string s2_cigar_clipped = "5=3D2="; + const string s2_seq_clipped = "AACCGTT"; + const string s2_quals_clipped = "?]?]?]?"; + const string s2_tagBases_clipped = s2_seq_clipped; + const string s2_tagQuals_clipped = s2_quals_clipped; + const f_data s2_frames_clipped = { 10, 10, 20, 20, 30, 40, 40 }; + const string s2_seq_rev_clipped = "AACGGTT"; + const string s2_quals_rev_clipped = "?]?]?]?"; + const string s2_tagBases_rev_clipped = s2_seq_rev_clipped; + const string s2_tagQuals_rev_clipped = s2_quals_rev_clipped; + const f_data s2_frames_rev_clipped = { 40, 40, 30, 20, 20, 10, 10 }; + + const string s3_cigar = "2S4=1D2I2D4=3S"; + const string s3_cigar_clipped = "4=1D2I2D1="; + const string s3_seq_clipped = "AACCGTT"; + const string s3_quals_clipped = "?]?]?]?"; + const string s3_tagBases_clipped = s3_seq_clipped; + const string s3_tagQuals_clipped = s3_quals_clipped; + const f_data s3_frames_clipped = { 10, 10, 20, 20, 30, 40, 40 }; + const string s3_seq_rev_clipped = "AACGGTT"; + const string s3_quals_rev_clipped = "?]?]?]?"; + const string s3_tagBases_rev_clipped = s3_seq_rev_clipped; + const string s3_tagQuals_rev_clipped = s3_quals_rev_clipped; + const f_data s3_frames_rev_clipped = { 40, 40, 30, 20, 20, 10, 10 }; + + const BamRecord prototype = tests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames, + seq, tagBases, tagQuals, frames); + BamRecord s1 = prototype.Mapped(tId, tPos, Strand::FORWARD, s1_cigar, mapQual); + BamRecord s2 = prototype.Mapped(tId, 
tPos, Strand::FORWARD, s2_cigar, mapQual); + BamRecord s3 = prototype.Mapped(tId, tPos, Strand::FORWARD, s3_cigar, mapQual); + BamRecord s1_rev = prototype.Mapped(tId, tPos, Strand::REVERSE, s1_cigar, mapQual); + BamRecord s2_rev = prototype.Mapped(tId, tPos, Strand::REVERSE, s2_cigar, mapQual); + BamRecord s3_rev = prototype.Mapped(tId, tPos, Strand::REVERSE, s3_cigar, mapQual); + + // sanity checks before clipping + EXPECT_TRUE(s1.IsMapped()); + EXPECT_EQ(tPos, s1.ReferenceStart()); + EXPECT_EQ(tPos + 10, s1.ReferenceEnd()); // 10= + + EXPECT_TRUE(s1_rev.IsMapped()); + EXPECT_EQ(tPos, s1_rev.ReferenceStart()); + EXPECT_EQ(tPos + 10, s1_rev.ReferenceEnd()); // 10= + + EXPECT_TRUE(s2.IsMapped()); + EXPECT_EQ(tPos, s2.ReferenceStart()); + EXPECT_EQ(tPos + 13, s2.ReferenceEnd()); // 5= + 3D + 5= + + EXPECT_TRUE(s2_rev.IsMapped()); + EXPECT_EQ(tPos, s2_rev.ReferenceStart()); + EXPECT_EQ(tPos + 13, s2_rev.ReferenceEnd()); // 5= + 3D + 5= + + EXPECT_TRUE(s3.IsMapped()); + EXPECT_EQ(tPos, s3.ReferenceStart()); + EXPECT_EQ(tPos + 11, s3.ReferenceEnd()); // 4= + 1D + 2D + 4= + + EXPECT_TRUE(s3_rev.IsMapped()); + EXPECT_EQ(tPos, s3_rev.ReferenceStart()); + EXPECT_EQ(tPos + 11, s3_rev.ReferenceEnd()); // 4= + 1D + 2D + 4= + + s1.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd); + s2.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd); + s3.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd); + s1_rev.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd); + s2_rev.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd); + s3_rev.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd); + + { // s1 - FORWARD + + EXPECT_TRUE(s1.IsMapped()); + EXPECT_EQ(Strand::FORWARD, s1.AlignedStrand()); + EXPECT_EQ(clipStart, s1.QueryStart()); + EXPECT_EQ(clipEnd, s1.QueryEnd()); + EXPECT_EQ(clipStart, s1.AlignedStart()); // queryStart (no soft clips left) + EXPECT_EQ(clipEnd, s1.AlignedEnd()); // alignStart + seqLength + EXPECT_EQ(tPos, s1.ReferenceStart()); // tPos + EXPECT_EQ(tPos + 7, s1.ReferenceEnd()); // 
RefStart + 7= + + EXPECT_EQ(s1_cigar_clipped, s1.CigarData().ToStdString()); + + const BamRecordView view + { + s1, + Orientation::GENOMIC, + false, + false, + PulseBehavior::ALL + }; + + EXPECT_EQ(s1_seq_clipped, view.Sequence()); + EXPECT_EQ(s1_quals_clipped, view.Qualities().Fastq()); + EXPECT_EQ(s1_tagBases_clipped, view.DeletionTags()); + EXPECT_EQ(s1_tagQuals_clipped, view.DeletionQVs().Fastq()); + EXPECT_EQ(s1_frames_clipped, view.IPD().Data()); + } + + { // s1 - REVERSE + + EXPECT_TRUE(s1_rev.IsMapped()); + EXPECT_EQ(Strand::REVERSE, s1_rev.AlignedStrand()); + EXPECT_EQ(clipStart, s1_rev.QueryStart()); + EXPECT_EQ(clipEnd, s1_rev.QueryEnd()); + EXPECT_EQ(clipStart, s1_rev.AlignedStart()); // queryStart (no soft clips) + EXPECT_EQ(clipEnd, s1_rev.AlignedEnd()); // alignStart + seqLength + EXPECT_EQ(tPos, s1_rev.ReferenceStart()); // 100 + startOffset + EXPECT_EQ(tPos + 7, s1_rev.ReferenceEnd()); // RefStart + 7= + + EXPECT_EQ(s1_cigar_clipped, s1_rev.CigarData().ToStdString()); + + const BamRecordView view + { + s1_rev, + Orientation::GENOMIC, + false, + false, + PulseBehavior::ALL + }; + + EXPECT_EQ(s1_seq_rev_clipped, view.Sequence()); + EXPECT_EQ(s1_quals_rev_clipped, view.Qualities().Fastq()); + EXPECT_EQ(s1_tagBases_rev_clipped, view.DeletionTags()); + EXPECT_EQ(s1_tagQuals_rev_clipped, view.DeletionQVs().Fastq()); + EXPECT_EQ(s1_frames_rev_clipped, view.IPD().Data()); + } + + { // s2 - FORWARD + + EXPECT_TRUE(s2.IsMapped()); + EXPECT_EQ(Strand::FORWARD, s2.AlignedStrand()); + EXPECT_EQ(clipStart, s2.QueryStart()); + EXPECT_EQ(clipEnd, s2.QueryEnd()); + EXPECT_EQ(clipStart, s2.AlignedStart()); // queryStart (no soft clips left) + EXPECT_EQ(clipEnd, s2.AlignedEnd()); // alignStart + seqLength + EXPECT_EQ(tPos, s2.ReferenceStart()); // 100 + startOffset + EXPECT_EQ(tPos + 10, s2.ReferenceEnd()); // RefStart + 5=3D2= + + EXPECT_EQ(s2_cigar_clipped, s2.CigarData().ToStdString()); + + const BamRecordView view + { + s2, + Orientation::GENOMIC, + false, + 
false, + PulseBehavior::ALL + }; + + EXPECT_EQ(s2_seq_clipped, view.Sequence()); + EXPECT_EQ(s2_quals_clipped, view.Qualities().Fastq()); + EXPECT_EQ(s2_tagBases_clipped, view.DeletionTags()); + EXPECT_EQ(s2_tagQuals_clipped, view.DeletionQVs().Fastq()); + EXPECT_EQ(s2_frames_clipped, view.IPD().Data()); + } + + { // s2 - REVERSE + + EXPECT_TRUE(s2_rev.IsMapped()); + EXPECT_EQ(Strand::REVERSE, s2_rev.AlignedStrand()); + EXPECT_EQ(clipStart, s2_rev.QueryStart()); + EXPECT_EQ(clipEnd, s2_rev.QueryEnd()); + EXPECT_EQ(clipStart, s2_rev.AlignedStart()); // queryStart (no soft clips left) + EXPECT_EQ(clipEnd, s2_rev.AlignedEnd()); // alignStart + seqLength + EXPECT_EQ(tPos, s2_rev.ReferenceStart()); // 100 + startOffset + EXPECT_EQ(tPos + 10, s2_rev.ReferenceEnd()); // RefStart + 5=3D2= + + EXPECT_EQ(s2_cigar_clipped, s2_rev.CigarData().ToStdString()); + + const BamRecordView view + { + s2_rev, + Orientation::GENOMIC, + false, + false, + PulseBehavior::ALL + }; + + EXPECT_EQ(s2_seq_rev_clipped, view.Sequence()); + EXPECT_EQ(s2_quals_rev_clipped, view.Qualities().Fastq()); + EXPECT_EQ(s2_tagBases_rev_clipped, view.DeletionTags()); + EXPECT_EQ(s2_tagQuals_rev_clipped, view.DeletionQVs().Fastq()); + EXPECT_EQ(s2_tagQuals_rev_clipped, view.AltLabelQVs().Fastq()); + EXPECT_EQ(s2_frames_rev_clipped, view.IPD().Data()); + } + + { // s3 - FORWARD + + EXPECT_TRUE(s3.IsMapped()); + EXPECT_EQ(Strand::FORWARD, s3.AlignedStrand()); + EXPECT_EQ(clipStart, s3.QueryStart()); + EXPECT_EQ(clipEnd, s3.QueryEnd()); + EXPECT_EQ(clipStart, s3.AlignedStart()); // queryStart (no soft clips left) + EXPECT_EQ(clipEnd, s3.AlignedEnd()); // alignStart + seqLength + EXPECT_EQ(tPos, s3.ReferenceStart()); // 100 + startOffset + EXPECT_EQ(tPos + 8, s3.ReferenceEnd()); // RefStart + 4=1D2D1= + + EXPECT_EQ(s3_cigar_clipped, s3.CigarData().ToStdString()); + + const BamRecordView view + { + s3, + Orientation::GENOMIC, + false, + false, + PulseBehavior::ALL + }; + + EXPECT_EQ(s3_seq_clipped, 
view.Sequence()); + EXPECT_EQ(s3_quals_clipped, view.Qualities().Fastq()); + EXPECT_EQ(s3_tagBases_clipped, view.DeletionTags()); + EXPECT_EQ(s3_tagQuals_clipped, view.DeletionQVs().Fastq()); + EXPECT_EQ(s3_tagQuals_clipped, view.LabelQVs().Fastq()); + EXPECT_EQ(s3_tagQuals_clipped, view.AltLabelQVs().Fastq()); + EXPECT_EQ(s3_frames_clipped, view.IPD().Data()); + } + + { // s3 - REVERSE + EXPECT_TRUE(s3_rev.IsMapped()); + EXPECT_EQ(Strand::REVERSE, s3_rev.AlignedStrand()); + EXPECT_EQ(clipStart, s3_rev.QueryStart()); + EXPECT_EQ(clipEnd, s3_rev.QueryEnd()); + EXPECT_EQ(clipStart, s3_rev.AlignedStart()); // queryStart (no soft clips left) + EXPECT_EQ(clipEnd, s3_rev.AlignedEnd()); // alignStart + seqLength + EXPECT_EQ(tPos, s3_rev.ReferenceStart()); // 100 + startOffset + EXPECT_EQ(tPos + 8, s3_rev.ReferenceEnd()); // RefStart + 4=1D2D1= + + EXPECT_EQ(s3_cigar_clipped, s3_rev.CigarData().ToStdString()); + + const BamRecordView view + { + s3_rev, + Orientation::GENOMIC, + false, + false, + PulseBehavior::ALL + }; + + EXPECT_EQ(s3_seq_rev_clipped, view.Sequence()); + EXPECT_EQ(s3_quals_rev_clipped, view.Qualities().Fastq()); + EXPECT_EQ(s3_tagBases_rev_clipped, view.DeletionTags()); + EXPECT_EQ(s3_tagQuals_rev_clipped, view.DeletionQVs().Fastq()); + EXPECT_EQ(s3_tagQuals_rev_clipped, view.LabelQVs().Fastq()); + EXPECT_EQ(s3_tagQuals_rev_clipped, view.AltLabelQVs().Fastq()); + EXPECT_EQ(s3_frames_rev_clipped, view.IPD().Data()); + } +} + +TEST(BamRecordClippingTest, ClipToReference_Basic) +{ + const Position qStart = 500; + const Position qEnd = 510; + const string seq = "AACCGTTAGC"; + const string quals = "?]?]?]?]?*"; + const string tagBases = "AACCGTTAGC"; + const string tagQuals = "?]?]?]?]?*"; + const string tagQuals_rev = "*?]?]?]?]?"; + const f_data frames = { 10, 10, 20, 20, 30, 40, 40, 10, 30, 20 }; + + const int32_t tId = 0; + const Position tPos = 100; + const uint8_t mapQual = 80; + + const Position clipStart = 102; + const Position clipEnd = 107; + + 
const string s1_cigar = "10="; + const string s1_cigar_clipped = "5="; + const string s1_seq_clipped = "CCGTT"; + const string s1_quals_clipped = "?]?]?"; + const string s1_tagBases_clipped = s1_seq_clipped; + const string s1_tagQuals_clipped = s1_quals_clipped; + const f_data s1_frames_clipped = { 20, 20, 30, 40, 40 }; + const string s1_seq_rev_clipped = "TAACG"; + const string s1_quals_rev_clipped = "]?]?]"; + const string s1_tagBases_rev_clipped = s1_seq_rev_clipped; + const string s1_tagQuals_rev_clipped = s1_quals_rev_clipped; + const f_data s1_frames_rev_clipped = { 10, 40, 40, 30, 20 }; + + const string s2_cigar = "5=3D5="; + const string s2_cigar_clipped = "3=2D"; + const string s2_seq_clipped = "CCG"; + const string s2_quals_clipped = "?]?"; + const string s2_tagBases_clipped = s2_seq_clipped; + const string s2_tagQuals_clipped = s2_quals_clipped; + const f_data s2_frames_clipped = { 20, 20, 30 }; + const string s2_seq_rev_clipped = "TAA"; + const string s2_quals_rev_clipped = "]?]"; + const string s2_tagBases_rev_clipped = s2_seq_rev_clipped; + const string s2_tagQuals_rev_clipped = s2_quals_rev_clipped; + const f_data s2_frames_rev_clipped = { 10, 40, 40 }; + + const string s3_cigar = "4=1D2I2D4="; + const string s3_cigar_clipped = "2=1D2I2D"; + const string s3_seq_clipped = "CCGT"; + const string s3_quals_clipped = "?]?]"; + const string s3_tagBases_clipped = s3_seq_clipped; + const string s3_tagQuals_clipped = s3_quals_clipped; + const f_data s3_frames_clipped = { 20, 20, 30, 40 }; + const string s3_seq_rev_clipped = "TAAC"; + const string s3_quals_rev_clipped = "]?]?"; + const string s3_tagBases_rev_clipped = s3_seq_rev_clipped; + const string s3_tagQuals_rev_clipped = s3_quals_rev_clipped; + const f_data s3_frames_rev_clipped = { 10, 40, 40, 30}; + + const BamRecord prototype = tests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames, + seq, tagBases, tagQuals, frames); + BamRecord s0 = prototype; + BamRecord s1 = prototype.Mapped(tId, 
tPos, Strand::FORWARD, s1_cigar, mapQual); + BamRecord s2 = prototype.Mapped(tId, tPos, Strand::FORWARD, s2_cigar, mapQual); + BamRecord s3 = prototype.Mapped(tId, tPos, Strand::FORWARD, s3_cigar, mapQual); + BamRecord s1_rev = prototype.Mapped(tId, tPos, Strand::REVERSE, s1_cigar, mapQual); + BamRecord s2_rev = prototype.Mapped(tId, tPos, Strand::REVERSE, s2_cigar, mapQual); + BamRecord s3_rev = prototype.Mapped(tId, tPos, Strand::REVERSE, s3_cigar, mapQual); + + s0.Clip(ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd); + s1.Clip(ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd); + s2.Clip(ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd); + s3.Clip(ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd); + s1_rev.Clip(ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd); + s2_rev.Clip(ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd); + s3_rev.Clip(ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd); + + { // s0 - no clipping should have been done to unmapped record + + EXPECT_FALSE(s0.IsMapped()); + EXPECT_EQ(prototype.QueryStart(), s0.QueryStart()); + EXPECT_EQ(prototype.QueryEnd(), s0.QueryEnd()); + EXPECT_EQ(prototype.AlignedStart(), s0.AlignedStart()); + EXPECT_EQ(prototype.AlignedEnd(), s0.AlignedEnd()); + EXPECT_EQ(prototype.ReferenceStart(), s0.ReferenceStart()); + EXPECT_EQ(prototype.ReferenceEnd(), s0.ReferenceEnd()); + + const BamRecordView protoView + { + prototype, + Orientation::GENOMIC, + false, + false, + PulseBehavior::ALL + }; + + const BamRecordView view + { + s0, + Orientation::GENOMIC, + false, + false, + PulseBehavior::ALL + }; + + EXPECT_EQ(protoView.Sequence(), view.Sequence()); + EXPECT_EQ(protoView.Qualities(), view.Qualities()); + EXPECT_EQ(protoView.DeletionTags(), view.DeletionTags()); + EXPECT_EQ(protoView.DeletionQVs(), view.DeletionQVs()); + EXPECT_EQ(protoView.LabelQVs(), view.LabelQVs()); + EXPECT_EQ(protoView.AltLabelQVs(), view.AltLabelQVs()); + EXPECT_EQ(protoView.IPD(), view.IPD()); + } + + { // s1 - FORWARD + + EXPECT_TRUE(s1.IsMapped()); + 
EXPECT_EQ(Strand::FORWARD, s1.AlignedStrand()); + EXPECT_EQ(502, s1.QueryStart()); + EXPECT_EQ(507, s1.QueryEnd()); + EXPECT_EQ(502, s1.AlignedStart()); // queryStart (no soft clips) + EXPECT_EQ(507, s1.AlignedEnd()); // alignStart + seqLength + EXPECT_EQ(clipStart, s1.ReferenceStart()); // clipStart + EXPECT_EQ(clipEnd, s1.ReferenceEnd()); // clipEnd + + EXPECT_EQ(s1_cigar_clipped, s1.CigarData().ToStdString()); + + const BamRecordView view + { + s1, + Orientation::GENOMIC, + false, + false, + PulseBehavior::ALL + }; + + EXPECT_EQ(s1_seq_clipped, view.Sequence()); + EXPECT_EQ(s1_quals_clipped, view.Qualities().Fastq()); + EXPECT_EQ(s1_tagBases_clipped, view.DeletionTags()); + EXPECT_EQ(s1_tagQuals_clipped, view.DeletionQVs().Fastq()); + EXPECT_EQ(s1_tagQuals_clipped, view.LabelQVs().Fastq()); + EXPECT_EQ(s1_tagQuals_clipped, view.AltLabelQVs().Fastq()); + EXPECT_EQ(s1_frames_clipped, view.IPD().Data()); + } + + { // s1 - REVERSE + + EXPECT_TRUE(s1_rev.IsMapped()); + EXPECT_EQ(Strand::REVERSE, s1_rev.AlignedStrand()); + EXPECT_EQ(503, s1_rev.QueryStart()); + EXPECT_EQ(508, s1_rev.QueryEnd()); + EXPECT_EQ(503, s1_rev.AlignedStart()); // queryStart (no soft clips) + EXPECT_EQ(508, s1_rev.AlignedEnd()); // alignStart + seqLength + EXPECT_EQ(clipStart, s1_rev.ReferenceStart()); // clipStart + EXPECT_EQ(clipEnd, s1_rev.ReferenceEnd()); // clipEnd + + EXPECT_EQ(s1_cigar_clipped, s1_rev.CigarData().ToStdString()); + + const BamRecordView view + { + s1_rev, + Orientation::GENOMIC, + false, + false, + PulseBehavior::ALL + }; + + EXPECT_EQ(s1_seq_rev_clipped, view.Sequence()); + EXPECT_EQ(s1_quals_rev_clipped, view.Qualities().Fastq()); + EXPECT_EQ(s1_tagBases_rev_clipped, view.DeletionTags()); + EXPECT_EQ(s1_tagQuals_rev_clipped, view.DeletionQVs().Fastq()); + EXPECT_EQ(s1_tagQuals_rev_clipped, view.LabelQVs().Fastq()); + EXPECT_EQ(s1_tagQuals_rev_clipped, view.AltLabelQVs().Fastq()); + EXPECT_EQ(s1_frames_rev_clipped, view.IPD().Data()); + } + + { // s2 - FORWARD + + 
EXPECT_TRUE(s2.IsMapped()); + EXPECT_EQ(Strand::FORWARD, s2.AlignedStrand()); + EXPECT_EQ(502, s2.QueryStart()); + EXPECT_EQ(505, s2.QueryEnd()); + EXPECT_EQ(502, s2.AlignedStart()); // queryStart (no soft clips) + EXPECT_EQ(505, s2.AlignedEnd()); // alignStart + seqLength + EXPECT_EQ(clipStart, s2.ReferenceStart()); // clipStart + EXPECT_EQ(clipEnd, s2.ReferenceEnd()); // clipEnd + + EXPECT_EQ(s2_cigar_clipped, s2.CigarData().ToStdString()); + + const BamRecordView view + { + s2, + Orientation::GENOMIC, + false, + false, + PulseBehavior::ALL + }; + + EXPECT_EQ(s2_seq_clipped, view.Sequence()); + EXPECT_EQ(s2_quals_clipped, view.Qualities().Fastq()); + EXPECT_EQ(s2_tagBases_clipped, view.DeletionTags()); + EXPECT_EQ(s2_tagQuals_clipped, view.DeletionQVs().Fastq()); + EXPECT_EQ(s2_tagQuals_clipped, view.LabelQVs().Fastq()); + EXPECT_EQ(s2_tagQuals_clipped, view.AltLabelQVs().Fastq()); + EXPECT_EQ(s2_frames_clipped, view.IPD().Data()); + } + + { // s2 - REVERSE + + EXPECT_TRUE(s2_rev.IsMapped()); + EXPECT_EQ(Strand::REVERSE, s2_rev.AlignedStrand()); + EXPECT_EQ(505, s2_rev.QueryStart()); + EXPECT_EQ(508, s2_rev.QueryEnd()); + EXPECT_EQ(505, s2_rev.AlignedStart()); // queryStart (no soft clips) + EXPECT_EQ(508, s2_rev.AlignedEnd()); // alignStart + seqLength + EXPECT_EQ(clipStart, s2_rev.ReferenceStart()); // clipStart + EXPECT_EQ(clipEnd, s2_rev.ReferenceEnd()); // clipEnd + + EXPECT_EQ(s2_cigar_clipped, s2_rev.CigarData().ToStdString()); + + const BamRecordView view + { + s2_rev, + Orientation::GENOMIC, + false, + false, + PulseBehavior::ALL + }; + + EXPECT_EQ(s2_seq_rev_clipped, view.Sequence()); + EXPECT_EQ(s2_quals_rev_clipped, view.Qualities().Fastq()); + EXPECT_EQ(s2_tagBases_rev_clipped, view.DeletionTags()); + EXPECT_EQ(s2_tagQuals_rev_clipped, view.DeletionQVs().Fastq()); + EXPECT_EQ(s2_tagQuals_rev_clipped, view.LabelQVs().Fastq()); + EXPECT_EQ(s2_tagQuals_rev_clipped, view.AltLabelQVs().Fastq()); + EXPECT_EQ(s2_frames_rev_clipped, view.IPD().Data()); + } + 
+ { // s3 - FORWARD + + EXPECT_TRUE(s3.IsMapped()); + EXPECT_EQ(Strand::FORWARD, s3.AlignedStrand()); + EXPECT_EQ(502, s3.QueryStart()); + EXPECT_EQ(506, s3.QueryEnd()); + EXPECT_EQ(502, s3.AlignedStart()); // queryStart (no soft clips) + EXPECT_EQ(506, s3.AlignedEnd()); // alignStart + seqLength + EXPECT_EQ(clipStart, s3.ReferenceStart()); // clipStart + EXPECT_EQ(clipEnd, s3.ReferenceEnd()); // clipEnd + + EXPECT_EQ(s3_cigar_clipped, s3.CigarData().ToStdString()); + + const BamRecordView view + { + s3, + Orientation::GENOMIC, + false, + false, + PulseBehavior::ALL + }; + + EXPECT_EQ(s3_seq_clipped, view.Sequence()); + EXPECT_EQ(s3_quals_clipped, view.Qualities().Fastq()); + EXPECT_EQ(s3_tagBases_clipped, view.DeletionTags()); + EXPECT_EQ(s3_tagQuals_clipped, view.DeletionQVs().Fastq()); + EXPECT_EQ(s3_tagQuals_clipped, view.LabelQVs().Fastq()); + EXPECT_EQ(s3_tagQuals_clipped, view.AltLabelQVs().Fastq()); + EXPECT_EQ(s3_frames_clipped, view.IPD().Data()); + } + + { // s3 - REVERSE + + EXPECT_TRUE(s3_rev.IsMapped()); + EXPECT_EQ(Strand::REVERSE, s3_rev.AlignedStrand()); + EXPECT_EQ(504, s3_rev.QueryStart()); + EXPECT_EQ(508, s3_rev.QueryEnd()); + EXPECT_EQ(504, s3_rev.AlignedStart()); // queryStart (no soft clips) + EXPECT_EQ(508, s3_rev.AlignedEnd()); // alignStart + seqLength + EXPECT_EQ(clipStart, s3_rev.ReferenceStart()); // clipStart + EXPECT_EQ(clipEnd, s3_rev.ReferenceEnd()); // clipEnd + + EXPECT_EQ(s3_cigar_clipped, s3_rev.CigarData().ToStdString()); + + const BamRecordView view + { + s3_rev, + Orientation::GENOMIC, + false, + false, + PulseBehavior::ALL + }; + + EXPECT_EQ(s3_seq_rev_clipped, view.Sequence()); + EXPECT_EQ(s3_quals_rev_clipped, view.Qualities().Fastq()); + EXPECT_EQ(s3_tagBases_rev_clipped, view.DeletionTags()); + EXPECT_EQ(s3_tagQuals_rev_clipped, view.DeletionQVs().Fastq()); + EXPECT_EQ(s3_tagQuals_rev_clipped, view.LabelQVs().Fastq()); + EXPECT_EQ(s3_tagQuals_rev_clipped, view.AltLabelQVs().Fastq()); + EXPECT_EQ(s3_frames_rev_clipped, 
view.IPD().Data()); + } +} + +TEST(BamRecordClippingTest, ClipToReference_WithSoftClips) +{ + const Position qStart = 500; + const Position qEnd = 515; + const string seq = "TTAACCGTTAGCAAA"; + const string quals = "--?]?]?]?]?*+++"; + const string tagBases = "TTAACCGTTAGCAAA"; + const string tagQuals = "--?]?]?]?]?*+++"; + const string tagQuals_rev = "+++*?]?]?]?]?--"; + const f_data frames = { 40, 40, 10, 10, 20, 20, 30, 40, 40, 10, 30, 20, 10, 10, 10 }; + + const int32_t tId = 0; + const Position tPos = 100; + const uint8_t mapQual = 80; + + const Position clipStart = 102; + const Position clipEnd = 107; + + const string seq_rev = "TTTGCTAACGGTTAA"; + const string quals_rev = "+++*?]?]?]?]?--"; + const f_data frames_rev = { 10, 10, 10, 20, 30, 10, 40, 40, 30, 20, 20, 10, 10, 40, 40 }; + + const string s1_cigar = "2S10=3S"; + const string s1_cigar_clipped = "5="; + const string s1_seq_clipped = "CCGTT"; + const string s1_quals_clipped = "?]?]?"; + const string s1_tagBases_clipped = s1_seq_clipped; + const string s1_tagQuals_clipped = s1_quals_clipped; + const f_data s1_frames_clipped = { 20, 20, 30, 40, 40 }; + const string s1_seq_rev_clipped = "CTAAC"; + const string s1_quals_rev_clipped = "?]?]?"; + const string s1_tagBases_rev_clipped = s1_seq_rev_clipped; + const string s1_tagQuals_rev_clipped = s1_quals_rev_clipped; + const f_data s1_frames_rev_clipped = { 30, 10, 40, 40, 30 }; + + const string s2_cigar = "2S5=3D5=3S"; + const string s2_cigar_clipped = "3=2D"; + const string s2_seq_clipped = "CCG"; + const string s2_quals_clipped = "?]?"; + const string s2_tagBases_clipped = s2_seq_clipped; + const string s2_tagQuals_clipped = s2_quals_clipped; + const f_data s2_frames_clipped = { 20, 20, 30 }; + const string s2_seq_rev_clipped = "CTA"; + const string s2_quals_rev_clipped = "?]?"; + const string s2_tagBases_rev_clipped = s2_seq_rev_clipped; + const string s2_tagQuals_rev_clipped = s2_quals_rev_clipped; + const f_data s2_frames_rev_clipped = { 30, 10, 40 }; + 
+ const string s3_cigar = "2S4=1D2I2D4=3S"; + const string s3_cigar_clipped = "2=1D2I2D"; + const string s3_seq_clipped = "CCGT"; + const string s3_quals_clipped = "?]?]"; + const string s3_tagBases_clipped = s3_seq_clipped; + const string s3_tagQuals_clipped = s3_quals_clipped; + const f_data s3_frames_clipped = { 20, 20, 30, 40 }; + const string s3_seq_rev_clipped = "CTAA"; + const string s3_quals_rev_clipped = "?]?]"; + const string s3_tagBases_rev_clipped = s3_seq_rev_clipped; + const string s3_tagQuals_rev_clipped = s3_quals_rev_clipped; + const f_data s3_frames_rev_clipped = { 30, 10, 40, 40 }; + + const BamRecord prototype = tests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames, + seq, tagBases, tagQuals, frames); + BamRecord s0 = prototype; + BamRecord s1 = prototype.Mapped(tId, tPos, Strand::FORWARD, s1_cigar, mapQual); + BamRecord s2 = prototype.Mapped(tId, tPos, Strand::FORWARD, s2_cigar, mapQual); + BamRecord s3 = prototype.Mapped(tId, tPos, Strand::FORWARD, s3_cigar, mapQual); + BamRecord s1_rev = prototype.Mapped(tId, tPos, Strand::REVERSE, s1_cigar, mapQual); + BamRecord s2_rev = prototype.Mapped(tId, tPos, Strand::REVERSE, s2_cigar, mapQual); + BamRecord s3_rev = prototype.Mapped(tId, tPos, Strand::REVERSE, s3_cigar, mapQual); + + // sanity checks before clipping + EXPECT_FALSE(s0.IsMapped()); + + EXPECT_TRUE(s1.IsMapped()); + EXPECT_EQ(500, s1.QueryStart()); // queryStart + EXPECT_EQ(515, s1.QueryEnd()); // queryStart + seqLength + EXPECT_EQ(502, s1.AlignedStart()); // queryStart + 2S + EXPECT_EQ(512, s1.AlignedEnd()); // alignedStart + 10= + EXPECT_EQ(tPos, s1.ReferenceStart()); // tPos + EXPECT_EQ(tPos + 10, s1.ReferenceEnd()); // tPos + 10= + + EXPECT_TRUE(s1_rev.IsMapped()); + EXPECT_EQ(500, s1_rev.QueryStart()); // queryStart + EXPECT_EQ(515, s1_rev.QueryEnd()); // queryStart + seqLength + EXPECT_EQ(503, s1_rev.AlignedStart()); // queryStart + 3S + EXPECT_EQ(513, s1_rev.AlignedEnd()); // alignedStart + 10= + EXPECT_EQ(tPos, 
s1_rev.ReferenceStart()); // tPos + EXPECT_EQ(tPos + 10, s1_rev.ReferenceEnd()); // tPos + 10= + + EXPECT_TRUE(s2.IsMapped()); + EXPECT_EQ(500, s2.QueryStart()); // queryStart + EXPECT_EQ(515, s2.QueryEnd()); // queryStart + seqLength + EXPECT_EQ(502, s2.AlignedStart()); // queryStart + 2S + EXPECT_EQ(512, s2.AlignedEnd()); // alignedStart + 5=5= + EXPECT_EQ(tPos, s2.ReferenceStart()); // tPos + EXPECT_EQ(tPos + 13, s2.ReferenceEnd()); // tPos + 5=3D5= + + EXPECT_TRUE(s2_rev.IsMapped()); + EXPECT_EQ(500, s2_rev.QueryStart()); // queryStart + EXPECT_EQ(515, s2_rev.QueryEnd()); // queryStart + seqLength + EXPECT_EQ(503, s2_rev.AlignedStart()); // queryStart + 3S + EXPECT_EQ(513, s2_rev.AlignedEnd()); // alignedStart + 5=5= + EXPECT_EQ(tPos, s2_rev.ReferenceStart()); // tPos + EXPECT_EQ(tPos + 13, s2_rev.ReferenceEnd()); // tPos + 5=3D5= + + EXPECT_TRUE(s3.IsMapped()); + EXPECT_EQ(500, s3.QueryStart()); // queryStart + EXPECT_EQ(515, s3.QueryEnd()); // queryStart + seqLength + EXPECT_EQ(502, s3.AlignedStart()); // queryStart + 2S + EXPECT_EQ(512, s3.AlignedEnd()); // alignedStart + 4=2I4= + EXPECT_EQ(tPos, s3.ReferenceStart()); // tPos + EXPECT_EQ(tPos + 11, s3.ReferenceEnd()); // tPos + 4=1D2D4= + + EXPECT_TRUE(s3_rev.IsMapped()); + EXPECT_EQ(500, s3_rev.QueryStart()); // queryStart + EXPECT_EQ(515, s3_rev.QueryEnd()); // queryStart + seqLength + EXPECT_EQ(503, s3_rev.AlignedStart()); // queryStart + 3S + EXPECT_EQ(513, s3_rev.AlignedEnd()); // alignedStart + 4=2I4= + EXPECT_EQ(tPos, s3_rev.ReferenceStart()); // tPos + EXPECT_EQ(tPos + 11, s3_rev.ReferenceEnd()); // tPos + 4=1D2D4= + + s0.Clip(ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd); + s1.Clip(ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd); + s2.Clip(ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd); + s3.Clip(ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd); + s1_rev.Clip(ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd); + s2_rev.Clip(ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd); +
s3_rev.Clip(ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd); + + { // s0 - no clipping should have been done to unmapped record + + EXPECT_FALSE(s0.IsMapped()); + EXPECT_EQ(prototype.QueryStart(), s0.QueryStart()); + EXPECT_EQ(prototype.QueryEnd(), s0.QueryEnd()); + EXPECT_EQ(prototype.AlignedStart(), s0.AlignedStart()); + EXPECT_EQ(prototype.AlignedEnd(), s0.AlignedEnd()); + EXPECT_EQ(prototype.ReferenceStart(), s0.ReferenceStart()); + EXPECT_EQ(prototype.ReferenceEnd(), s0.ReferenceEnd()); + + const BamRecordView protoView + { + prototype, + Orientation::GENOMIC, + false, + false, + PulseBehavior::ALL + }; + + const BamRecordView view + { + s0, + Orientation::GENOMIC, + false, + false, + PulseBehavior::ALL + }; + + EXPECT_EQ(protoView.Sequence(), view.Sequence()); + EXPECT_EQ(protoView.Qualities(), view.Qualities()); + EXPECT_EQ(protoView.DeletionTags(), view.DeletionTags()); + EXPECT_EQ(protoView.DeletionQVs(), view.DeletionQVs()); + EXPECT_EQ(protoView.LabelQVs(), view.LabelQVs()); + EXPECT_EQ(protoView.AltLabelQVs(), view.AltLabelQVs()); + EXPECT_EQ(protoView.IPD(), view.IPD()); + } + + { // s1 - FORWARD + + EXPECT_TRUE(s1.IsMapped()); + EXPECT_EQ(Strand::FORWARD, s1.AlignedStrand()); + EXPECT_EQ(504, s1.QueryStart()); // new queryStart + EXPECT_EQ(509, s1.QueryEnd()); // queryStart + new seqLength + EXPECT_EQ(504, s1.AlignedStart()); // queryStart (no soft clips remaining) + EXPECT_EQ(509, s1.AlignedEnd()); // alignStart + new seqLength + EXPECT_EQ(clipStart, s1.ReferenceStart()); // clipStart + EXPECT_EQ(clipEnd, s1.ReferenceEnd()); // clipEnd + + EXPECT_EQ(s1_cigar_clipped, s1.CigarData().ToStdString()); + + const BamRecordView view + { + s1, + Orientation::GENOMIC, + false, + false, + PulseBehavior::ALL + }; + + EXPECT_EQ(s1_seq_clipped, view.Sequence()); + EXPECT_EQ(s1_quals_clipped, view.Qualities().Fastq()); + EXPECT_EQ(s1_tagBases_clipped, view.DeletionTags()); + EXPECT_EQ(s1_tagQuals_clipped, view.DeletionQVs().Fastq()); + 
EXPECT_EQ(s1_tagQuals_clipped, view.LabelQVs().Fastq()); + EXPECT_EQ(s1_tagQuals_clipped, view.AltLabelQVs().Fastq()); + EXPECT_EQ(s1_frames_clipped, view.IPD().Data()); + } + + { // s1 - REVERSE + + EXPECT_TRUE(s1_rev.IsMapped()); + EXPECT_EQ(Strand::REVERSE, s1_rev.AlignedStrand()); + EXPECT_EQ(506, s1_rev.QueryStart()); // new queryStart + EXPECT_EQ(511, s1_rev.QueryEnd()); // queryStart + new seqLength + EXPECT_EQ(506, s1_rev.AlignedStart()); // queryStart (no soft clips remaining) + EXPECT_EQ(511, s1_rev.AlignedEnd()); // alignStart + new seqLength + EXPECT_EQ(clipStart, s1_rev.ReferenceStart()); // clipStart + EXPECT_EQ(clipEnd, s1_rev.ReferenceEnd()); // clipEnd + + EXPECT_EQ(s1_cigar_clipped, s1_rev.CigarData().ToStdString()); + + const BamRecordView view + { + s1_rev, + Orientation::GENOMIC, + false, + false, + PulseBehavior::ALL + }; + + EXPECT_EQ(s1_seq_rev_clipped, view.Sequence()); + EXPECT_EQ(s1_quals_rev_clipped, view.Qualities().Fastq()); + EXPECT_EQ(s1_tagBases_rev_clipped, view.DeletionTags()); + EXPECT_EQ(s1_tagQuals_rev_clipped, view.DeletionQVs().Fastq()); + EXPECT_EQ(s1_tagQuals_rev_clipped, view.LabelQVs().Fastq()); + EXPECT_EQ(s1_tagQuals_rev_clipped, view.AltLabelQVs().Fastq()); + EXPECT_EQ(s1_frames_rev_clipped, view.IPD().Data()); + } + + { // s2 - FORWARD + + EXPECT_TRUE(s2.IsMapped()); + EXPECT_EQ(Strand::FORWARD, s2.AlignedStrand()); + EXPECT_EQ(504, s2.QueryStart()); + EXPECT_EQ(507, s2.QueryEnd()); + EXPECT_EQ(504, s2.AlignedStart()); // queryStart (no soft clips) + EXPECT_EQ(507, s2.AlignedEnd()); // alignStart + seqLength + EXPECT_EQ(clipStart, s2.ReferenceStart()); // clipStart + EXPECT_EQ(clipEnd, s2.ReferenceEnd()); // clipEnd + + EXPECT_EQ(s2_cigar_clipped, s2.CigarData().ToStdString()); + + const BamRecordView view + { + s2, + Orientation::GENOMIC, + false, + false, + PulseBehavior::ALL + }; + + EXPECT_EQ(s2_seq_clipped, view.Sequence()); + EXPECT_EQ(s2_quals_clipped, view.Qualities().Fastq()); + EXPECT_EQ(s2_tagBases_clipped, 
view.DeletionTags()); + EXPECT_EQ(s2_tagQuals_clipped, view.DeletionQVs().Fastq()); + EXPECT_EQ(s2_tagQuals_clipped, view.LabelQVs().Fastq()); + EXPECT_EQ(s2_tagQuals_clipped, view.AltLabelQVs().Fastq()); + EXPECT_EQ(s2_frames_clipped, view.IPD().Data()); + } + + { // s2 - REVERSE + + EXPECT_TRUE(s2_rev.IsMapped()); + EXPECT_EQ(Strand::REVERSE, s2_rev.AlignedStrand()); + EXPECT_EQ(508, s2_rev.QueryStart()); // new queryStart + EXPECT_EQ(511, s2_rev.QueryEnd()); // queryStart + new seqLength + EXPECT_EQ(508, s2_rev.AlignedStart()); // queryStart (no soft clips remaining) + EXPECT_EQ(511, s2_rev.AlignedEnd()); // alignStart + new seqLength + EXPECT_EQ(clipStart, s2_rev.ReferenceStart()); // clipStart + EXPECT_EQ(clipEnd, s2_rev.ReferenceEnd()); // clipEnd + + EXPECT_EQ(s2_cigar_clipped, s2_rev.CigarData().ToStdString()); + + const BamRecordView view + { + s2_rev, + Orientation::GENOMIC, + false, + false, + PulseBehavior::ALL + }; + + EXPECT_EQ(s2_seq_rev_clipped, view.Sequence()); + EXPECT_EQ(s2_quals_rev_clipped, view.Qualities().Fastq()); + EXPECT_EQ(s2_tagBases_rev_clipped, view.DeletionTags()); + EXPECT_EQ(s2_tagQuals_rev_clipped, view.DeletionQVs().Fastq()); + EXPECT_EQ(s2_tagQuals_rev_clipped, view.LabelQVs().Fastq()); + EXPECT_EQ(s2_tagQuals_rev_clipped, view.AltLabelQVs().Fastq()); + EXPECT_EQ(s2_frames_rev_clipped, view.IPD().Data()); + } + + { // s3 - FORWARD + EXPECT_TRUE(s3.IsMapped()); + EXPECT_EQ(Strand::FORWARD, s3.AlignedStrand()); + EXPECT_EQ(504, s3.QueryStart()); + EXPECT_EQ(508, s3.QueryEnd()); + EXPECT_EQ(504, s3.AlignedStart()); // queryStart (no soft clips) + EXPECT_EQ(508, s3.AlignedEnd()); // alignStart + seqLength + EXPECT_EQ(clipStart, s3.ReferenceStart()); // clipStart + EXPECT_EQ(clipEnd, s3.ReferenceEnd()); // clipEnd + + EXPECT_EQ(s3_cigar_clipped, s3.CigarData().ToStdString()); + + const BamRecordView view + { + s3, + Orientation::GENOMIC, + false, + false, + PulseBehavior::ALL + }; + + EXPECT_EQ(s3_seq_clipped, view.Sequence()); + 
EXPECT_EQ(s3_quals_clipped, view.Qualities().Fastq()); + EXPECT_EQ(s3_tagBases_clipped, view.DeletionTags()); + EXPECT_EQ(s3_tagQuals_clipped, view.DeletionQVs().Fastq()); + EXPECT_EQ(s3_tagQuals_clipped, view.LabelQVs().Fastq()); + EXPECT_EQ(s3_tagQuals_clipped, view.AltLabelQVs().Fastq()); + EXPECT_EQ(s3_frames_clipped, view.IPD().Data()); + } + + { // s3 - REVERSE + EXPECT_TRUE(s3_rev.IsMapped()); + EXPECT_EQ(Strand::REVERSE, s3_rev.AlignedStrand()); + EXPECT_EQ(507, s3_rev.QueryStart()); // new queryStart + EXPECT_EQ(511, s3_rev.QueryEnd()); // queryStart + new seqLength + EXPECT_EQ(507, s3_rev.AlignedStart()); // queryStart (no soft clips remaining) + EXPECT_EQ(511, s3_rev.AlignedEnd()); // alignStart + new seqLength + EXPECT_EQ(clipStart, s3_rev.ReferenceStart()); // clipStart + EXPECT_EQ(clipEnd, s3_rev.ReferenceEnd()); // clipEnd + + EXPECT_EQ(s3_cigar_clipped, s3_rev.CigarData().ToStdString()); + + const BamRecordView view + { + s3_rev, + Orientation::GENOMIC, + false, + false, + PulseBehavior::ALL + }; + + EXPECT_EQ(s3_seq_rev_clipped, view.Sequence()); + EXPECT_EQ(s3_quals_rev_clipped, view.Qualities().Fastq()); + EXPECT_EQ(s3_tagBases_rev_clipped, view.DeletionTags()); + EXPECT_EQ(s3_tagQuals_rev_clipped, view.DeletionQVs().Fastq()); + EXPECT_EQ(s3_tagQuals_rev_clipped, view.LabelQVs().Fastq()); + EXPECT_EQ(s3_tagQuals_rev_clipped, view.AltLabelQVs().Fastq()); + EXPECT_EQ(s3_frames_rev_clipped, view.IPD().Data()); + } +} + +TEST(BamRecordClippingTest, ClippedToQueryCopy) +{ + const Position qStart = 500; + const Position qEnd = 510; + const int32_t tId = 0; + const Position tPos = 100; + const uint8_t mapQual = 80; + const Position clipStart = 502; + const Position clipEnd = 509; + + const string seq = "AACCGTTAGC"; + const string quals = "?]?]?]?]?*"; + const string tagBases = "AACCGTTAGC"; + const string tagQuals = "?]?]?]?]?*"; + const f_data frames = { 10, 10, 20, 20, 30, 40, 40, 10, 30, 20 }; + + const string seq_clipped = "CCGTTAG"; + const string 
quals_clipped = "?]?]?]?"; + const string tagBases_clipped = "CCGTTAG"; + const string tagQuals_clipped = "?]?]?]?"; + const f_data frames_clipped = { 20, 20, 30, 40, 40, 10, 30 }; + + const string s3_cigar = "4=1D2I2D4="; + const string s3_cigar_clipped = "2=1D2I2D3="; + + BamRecord prototype = tests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames, + seq, tagBases, tagQuals, frames); + prototype.Map(tId, tPos, Strand::FORWARD, s3_cigar, mapQual); + + BamRecord s3 = prototype.Clipped(ClipType::CLIP_TO_QUERY, clipStart, clipEnd); + + EXPECT_TRUE(s3.IsMapped()); + EXPECT_EQ(Strand::FORWARD, s3.AlignedStrand()); + EXPECT_EQ(clipStart, s3.QueryStart()); + EXPECT_EQ(clipEnd, s3.QueryEnd()); + EXPECT_EQ(clipStart, s3.AlignedStart()); // queryStart (no soft clips) + EXPECT_EQ(clipEnd, s3.AlignedEnd()); // alignStart + seqLength + EXPECT_EQ(102, s3.ReferenceStart()); // 100 + startOffset + EXPECT_EQ(110, s3.ReferenceEnd()); // RefStart + 2=1D2D3= + + EXPECT_EQ(s3_cigar_clipped, s3.CigarData().ToStdString()); + + const BamRecordView view + { + s3, + Orientation::GENOMIC, + false, + false, + PulseBehavior::ALL + }; + + EXPECT_EQ(seq_clipped, view.Sequence()); + EXPECT_EQ(quals_clipped, view.Qualities().Fastq()); + EXPECT_EQ(tagBases_clipped, view.DeletionTags()); + EXPECT_EQ(tagQuals_clipped, view.DeletionQVs().Fastq()); + EXPECT_EQ(tagQuals_clipped, view.LabelQVs().Fastq()); + EXPECT_EQ(tagQuals_clipped, view.AltLabelQVs().Fastq()); + EXPECT_EQ(frames_clipped, view.IPD().Data()); +} + +TEST(BamRecordClippingTest, ClippedToReferenceCopy) +{ + const Position qStart = 500; + const Position qEnd = 510; + const string seq = "AACCGTTAGC"; + const string quals = "?]?]?]?]?*"; + const string tagBases = "AACCGTTAGC"; + const string tagQuals = "?]?]?]?]?*"; + const f_data frames = { 10, 10, 20, 20, 30, 40, 40, 10, 30, 20 }; + const int32_t tId = 0; + const Position tPos = 100; + const uint8_t mapQual = 80; + const Position clipStart = 102; + const Position clipEnd =
107; + + const string s3_cigar = "4=1D2I2D4="; + const string s3_cigar_clipped = "2=1D2I2D"; + const string s3_seq_clipped = "CCGT"; + const string s3_quals_clipped = "?]?]"; + const string s3_tagBases_clipped = s3_seq_clipped; + const string s3_tagQuals_clipped = s3_quals_clipped; + const f_data s3_frames_clipped = { 20, 20, 30, 40 }; + + BamRecord prototype = tests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames, + seq, tagBases, tagQuals, frames); + prototype.Map(tId, tPos, Strand::FORWARD, s3_cigar, mapQual); + + const BamRecord s3 = prototype.Clipped(ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd); + + // s3 - FORWARD + EXPECT_TRUE(s3.IsMapped()); + EXPECT_EQ(Strand::FORWARD, s3.AlignedStrand()); + EXPECT_EQ(502, s3.QueryStart()); + EXPECT_EQ(506, s3.QueryEnd()); + EXPECT_EQ(502, s3.AlignedStart()); // queryStart (no soft clips) + EXPECT_EQ(506, s3.AlignedEnd()); // alignStart + seqLength + EXPECT_EQ(clipStart, s3.ReferenceStart()); // clipStart + EXPECT_EQ(clipEnd, s3.ReferenceEnd()); // clipEnd + + EXPECT_EQ(s3_cigar_clipped, s3.CigarData().ToStdString()); + + const BamRecordView view + { + s3, + Orientation::GENOMIC, + false, + false, + PulseBehavior::ALL + }; + + EXPECT_EQ(s3_seq_clipped, view.Sequence()); + EXPECT_EQ(s3_quals_clipped, view.Qualities().Fastq()); + EXPECT_EQ(s3_tagBases_clipped, view.DeletionTags()); + EXPECT_EQ(s3_tagQuals_clipped, view.DeletionQVs().Fastq()); + EXPECT_EQ(s3_tagQuals_clipped, view.LabelQVs().Fastq()); + EXPECT_EQ(s3_tagQuals_clipped, view.AltLabelQVs().Fastq()); + EXPECT_EQ(s3_frames_clipped, view.IPD().Data()); +} + +TEST(BamRecordClippingTest, StaticClippedToQuery) +{ + const Position qStart = 500; + const Position qEnd = 510; + const int32_t tId = 0; + const Position tPos = 100; + const uint8_t mapQual = 80; + const Position clipStart = 502; + const Position clipEnd = 509; + + const string seq = "AACCGTTAGC"; + const string quals = "?]?]?]?]?*"; + const string tagBases = "AACCGTTAGC"; + const string 
tagQuals = "?]?]?]?]?*"; + const f_data frames = { 10, 10, 20, 20, 30, 40, 40, 10, 30, 20 }; + + const string seq_clipped = "CCGTTAG"; + const string quals_clipped = "?]?]?]?"; + const string tagBases_clipped = "CCGTTAG"; + const string tagQuals_clipped = "?]?]?]?"; + const f_data frames_clipped = { 20, 20, 30, 40, 40, 10, 30 }; + + const string s3_cigar = "4=1D2I2D4="; + const string s3_cigar_clipped = "2=1D2I2D3="; + + BamRecord prototype = tests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames, + seq, tagBases, tagQuals, frames); + prototype.Map(tId, tPos, Strand::FORWARD, s3_cigar, mapQual); + + const BamRecord s3 = BamRecord::Clipped(prototype, ClipType::CLIP_TO_QUERY, clipStart, clipEnd); + + EXPECT_TRUE(s3.IsMapped()); + EXPECT_EQ(Strand::FORWARD, s3.AlignedStrand()); + EXPECT_EQ(clipStart, s3.QueryStart()); + EXPECT_EQ(clipEnd, s3.QueryEnd()); + EXPECT_EQ(clipStart, s3.AlignedStart()); // queryStart (no soft clips) + EXPECT_EQ(clipEnd, s3.AlignedEnd()); // alignStart + seqLength + EXPECT_EQ(102, s3.ReferenceStart()); // 100 + startOffset + EXPECT_EQ(110, s3.ReferenceEnd()); // RefStart + 5= + 3D + + EXPECT_EQ(s3_cigar_clipped, s3.CigarData().ToStdString()); + + const BamRecordView view + { + s3, + Orientation::GENOMIC, + false, + false, + PulseBehavior::ALL + }; + + EXPECT_EQ(seq_clipped, view.Sequence()); + EXPECT_EQ(quals_clipped, view.Qualities().Fastq()); + EXPECT_EQ(tagBases_clipped, view.DeletionTags()); + EXPECT_EQ(tagQuals_clipped, view.DeletionQVs().Fastq()); + EXPECT_EQ(tagQuals_clipped, view.LabelQVs().Fastq()); + EXPECT_EQ(tagQuals_clipped, view.AltLabelQVs().Fastq()); + EXPECT_EQ(frames_clipped, view.IPD().Data()); +} + +TEST(BamRecordClippingTest, StaticClippedToReference) +{ + const Position qStart = 500; + const Position qEnd = 510; + const string seq = "AACCGTTAGC"; + const string quals = "?]?]?]?]?*"; + const string tagBases = "AACCGTTAGC"; + const string tagQuals = "?]?]?]?]?*"; + const f_data frames = { 10, 10, 20, 20, 
30, 40, 40, 10, 30, 20 }; + const int32_t tId = 0; + const Position tPos = 100; + const uint8_t mapQual = 80; + const Position clipStart = 102; + const Position clipEnd = 107; + + const string s3_cigar = "4=1D2I2D4="; + const string s3_cigar_clipped = "2=1D2I2D"; + const string s3_seq_clipped = "CCGT"; + const string s3_quals_clipped = "?]?]"; + const string s3_tagBases_clipped = s3_seq_clipped; + const string s3_tagQuals_clipped = s3_quals_clipped; + const f_data s3_frames_clipped = { 20, 20, 30, 40 }; + + BamRecord prototype = tests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames, + seq, tagBases, tagQuals, frames); + prototype.Map(tId, tPos, Strand::FORWARD, s3_cigar, mapQual); + + const BamRecord s3 = BamRecord::Clipped(prototype, ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd); + + // s3 - FORWARD + EXPECT_TRUE(s3.IsMapped()); + EXPECT_EQ(Strand::FORWARD, s3.AlignedStrand()); + EXPECT_EQ(502, s3.QueryStart()); + EXPECT_EQ(506, s3.QueryEnd()); + EXPECT_EQ(502, s3.AlignedStart()); // queryStart (no soft clips) + EXPECT_EQ(506, s3.AlignedEnd()); // alignStart + seqLength + EXPECT_EQ(clipStart, s3.ReferenceStart()); // clipStart + EXPECT_EQ(clipEnd, s3.ReferenceEnd()); // clipEnd + + EXPECT_EQ(s3_cigar_clipped, s3.CigarData().ToStdString()); + + const BamRecordView view + { + s3, + Orientation::GENOMIC, + false, + false, + PulseBehavior::ALL + }; + + EXPECT_EQ(s3_seq_clipped, view.Sequence()); + EXPECT_EQ(s3_quals_clipped, view.Qualities().Fastq()); + EXPECT_EQ(s3_tagBases_clipped, view.DeletionTags()); + EXPECT_EQ(s3_tagQuals_clipped, view.DeletionQVs().Fastq()); + EXPECT_EQ(s3_tagQuals_clipped, view.LabelQVs().Fastq()); + EXPECT_EQ(s3_tagQuals_clipped, view.AltLabelQVs().Fastq()); + EXPECT_EQ(s3_frames_clipped, view.IPD().Data()); +} + +TEST(BamRecordTest, ClipCigarData) +{ + const Position qStart = 500; + const Position qEnd = 515; + const string seq = "TTAACCGTTAGCAAA"; + const string quals = "--?]?]?]?]?*+++"; + const string tagBases = 
"TTAACCGTTAGCAAA"; + const string tagQuals = "--?]?]?]?]?*+++"; + const f_data frames = { 40, 40, 10, 10, 20, 20, 30, 40, 40, 10, 30, 20, 10, 10, 10 }; + const uint8_t mapQual = 80; + BamRecord s3 = tests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames, + seq, tagBases, tagQuals, frames); + BamRecord s3_rev = tests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames, + seq, tagBases, tagQuals, frames); + + const string s3_cigar = "5H2S4=1D2I2D4=3S7H"; + s3.Map(0, 100, Strand::FORWARD, s3_cigar, mapQual); + s3_rev.Map(0, 100, Strand::REVERSE, s3_cigar, mapQual); + + const Cigar s3_cigar_raw = s3.CigarData(); + const Cigar s3_cigar_clipped = s3.CigarData(true); + + EXPECT_EQ(s3_cigar, s3_cigar_raw.ToStdString()); + EXPECT_EQ(string("4=1D2I2D4="), s3_cigar_clipped.ToStdString()); +} + +TEST(BamRecordTest, CCS_ClipToQuery) +{ + const int32_t tId = 0; + const Position tPos = 100; + const uint8_t mapQual = 80; + const Position clipStart = 2; + const Position clipEnd = 9; + + const string seq = "AACCGTTAGC"; + const string quals = "?]?]?]?]?*"; + const string tagBases = "AACCGTTAGC"; + const string tagQuals = "?]?]?]?]?*"; + const f_data frames = { 10, 10, 20, 20, 30, 40, 40, 10, 30, 20 }; + + const string seq_clipped = "CCGTTAG"; + const string quals_clipped = "?]?]?]?"; + const string tagBases_clipped = "CCGTTAG"; + const string tagQuals_clipped = "?]?]?]?"; + const f_data frames_clipped = { 20, 20, 30, 40, 40, 10, 30 }; + + const string s3_cigar = "4=1D2I2D4="; + const string s3_cigar_clipped = "2=1D2I2D3="; + + BamRecord prototype = tests::MakeCCSRecord(seq, quals, tagBases, tagQuals, frames, + seq, tagBases, tagQuals, frames); + prototype.Map(tId, tPos, Strand::FORWARD, s3_cigar, mapQual); + + BamRecord s3 = prototype.Clipped(ClipType::CLIP_TO_QUERY, clipStart, clipEnd); + + EXPECT_TRUE(s3.IsMapped()); + EXPECT_EQ(Strand::FORWARD, s3.AlignedStrand()); + EXPECT_EQ(0, s3.AlignedStart()); // record start (no soft clips) + EXPECT_EQ(7, 
s3.AlignedEnd()); // alignStart + clipped seqLength + EXPECT_EQ(102, s3.ReferenceStart()); // 100 + startOffset + EXPECT_EQ(110, s3.ReferenceEnd()); // RefStart + 5= + 3D + + EXPECT_EQ(s3_cigar_clipped, s3.CigarData().ToStdString()); + + const BamRecordView view + { + s3, + Orientation::GENOMIC, + false, + false, + PulseBehavior::ALL + }; + + EXPECT_EQ(seq_clipped, view.Sequence()); + EXPECT_EQ(quals_clipped, view.Qualities().Fastq()); + EXPECT_EQ(tagBases_clipped, view.DeletionTags()); + EXPECT_EQ(tagQuals_clipped, view.DeletionQVs().Fastq()); + EXPECT_EQ(tagQuals_clipped, view.LabelQVs().Fastq()); + EXPECT_EQ(tagQuals_clipped, view.AltLabelQVs().Fastq()); + EXPECT_EQ(frames_clipped, view.IPD().Data()); +} + +TEST(BamRecordTest, CCS_ClipToReference) +{ + const string seq = "AACCGTTAGC"; + const string quals = "?]?]?]?]?*"; + const string tagBases = "AACCGTTAGC"; + const string tagQuals = "?]?]?]?]?*"; + const f_data frames = { 10, 10, 20, 20, 30, 40, 40, 10, 30, 20 }; + const int32_t tId = 0; + const Position tPos = 100; + const uint8_t mapQual = 80; + const Position clipStart = 102; + const Position clipEnd = 107; + + const string s3_cigar = "4=1D2I2D4="; + const string s3_cigar_clipped = "2=1D2I2D"; + const string s3_seq_clipped = "CCGT"; + const string s3_quals_clipped = "?]?]"; + const string s3_tagBases_clipped = s3_seq_clipped; + const string s3_tagQuals_clipped = s3_quals_clipped; + const f_data s3_frames_clipped = { 20, 20, 30, 40 }; + + BamRecord prototype = tests::MakeCCSRecord(seq, quals, tagBases, tagQuals, frames, + seq, tagBases, tagQuals, frames); + prototype.Map(tId, tPos, Strand::FORWARD, s3_cigar, mapQual); + + const BamRecord s3 = BamRecord::Clipped(prototype, ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd); + + EXPECT_TRUE(s3.IsMapped()); + EXPECT_EQ(Strand::FORWARD, s3.AlignedStrand()); + EXPECT_EQ(0, s3.AlignedStart()); // record start (no soft clips) + EXPECT_EQ(4, s3.AlignedEnd()); // alignStart + clipped seqLength (4) + 
EXPECT_EQ(clipStart, s3.ReferenceStart()); // clipStart + EXPECT_EQ(clipEnd, s3.ReferenceEnd()); // clipEnd + + EXPECT_EQ(s3_cigar_clipped, s3.CigarData().ToStdString()); + + const BamRecordView view + { + s3, + Orientation::GENOMIC, + false, + false, + PulseBehavior::ALL + }; + + EXPECT_EQ(s3_seq_clipped, view.Sequence()); + EXPECT_EQ(s3_quals_clipped, view.Qualities().Fastq()); + EXPECT_EQ(s3_tagBases_clipped, view.DeletionTags()); + EXPECT_EQ(s3_tagQuals_clipped, view.DeletionQVs().Fastq()); + EXPECT_EQ(s3_tagQuals_clipped, view.LabelQVs().Fastq()); + EXPECT_EQ(s3_tagQuals_clipped, view.AltLabelQVs().Fastq()); + EXPECT_EQ(s3_frames_clipped, view.IPD().Data()); +} diff --git a/tests/src/test_BamRecordImplCore.cpp b/tests/src/test_BamRecordImplCore.cpp new file mode 100644 index 0000000..30226c2 --- /dev/null +++ b/tests/src/test_BamRecordImplCore.cpp @@ -0,0 +1,627 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. 
THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. + +// Author: Derek Barnett + +#ifdef PBBAM_TESTING +#define private public +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; + +namespace tests { + +struct Bam1Deleter +{ + void operator()(bam1_t* b) { + if (b) + bam_destroy1(b); + b = nullptr; + } +}; + +static +BamRecordImpl CreateBamImpl(void) +{ + BamRecordImpl bam; + bam.Bin(42); + bam.Flag(42); + bam.InsertSize(42); + bam.MapQuality(42); + bam.MatePosition(42); + bam.MateReferenceId(42); + bam.Position(42); + bam.ReferenceId(42); + + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector({34, 5, 125}); + tags["XY"] = static_cast(-42); + bam.Tags(tags); + + return bam; +} + +static +void CheckRawData(const BamRecordImpl& bam) +{ + // ensure raw data (lengths at least) matches API-facing data + + const uint32_t expectedNameLength = bam.Name().size() + 1; + const uint32_t expectedNumCigarOps = bam.CigarData().size(); + const int32_t expectedSeqLength = bam.Sequence().length(); + const size_t 
expectedTagsLength = BamTagCodec::Encode(bam.Tags()).size(); + + // Name CIGAR Sequence Quals Tags + // l_qname + (n_cigar * 4) + (l_qseq+1)/2 + l_qseq + << TAGS >> + + const int expectedTotalDataLength = expectedNameLength + + (expectedNumCigarOps * 4) + + (expectedSeqLength+1)/2 + + expectedSeqLength + + expectedTagsLength; + + EXPECT_TRUE((bool)bam.d_); + EXPECT_EQ(expectedNameLength, bam.d_->core.l_qname); + EXPECT_EQ(expectedNumCigarOps, bam.d_->core.n_cigar); + EXPECT_EQ(expectedSeqLength, bam.d_->core.l_qseq); + EXPECT_EQ(expectedTotalDataLength, bam.d_->l_data); +} + +} // namespace tests + +TEST(BamRecordImplCoreTest, RawDataDefaultValues) +{ + PBBAM_SHARED_PTR rawData(bam_init1(), tests::Bam1Deleter()); + ASSERT_TRUE((bool)rawData); + + // fixed-length (core) data + EXPECT_EQ(0, rawData->core.tid); + EXPECT_EQ(0, rawData->core.pos); + EXPECT_EQ(0, rawData->core.bin); + EXPECT_EQ(0, rawData->core.qual); + EXPECT_EQ(0, rawData->core.l_qname); + EXPECT_EQ(0, rawData->core.flag); + EXPECT_EQ(0, rawData->core.n_cigar); + EXPECT_EQ(0, rawData->core.l_qseq); + EXPECT_EQ(0, rawData->core.mtid); + EXPECT_EQ(0, rawData->core.mpos); + EXPECT_EQ(0, rawData->core.isize); + + // variable length data + EXPECT_EQ(0, rawData->data); + EXPECT_EQ(0, rawData->l_data); + EXPECT_EQ(0, rawData->m_data); +} + +TEST(BamRecordImplCoreTest, DefaultValues) +{ + BamRecordImpl bam; + + // ------------------------------- + // check raw data + // ------------------------------- + + const PBBAM_SHARED_PTR rawData = bam.d_; + ASSERT_TRUE((bool)rawData); + + // fixed-length (core) data + // (forced init unmapped, with NULL-term as QNAME) + EXPECT_EQ(-1, rawData->core.tid); + EXPECT_EQ(-1, rawData->core.pos); + EXPECT_EQ(0, rawData->core.bin); + EXPECT_EQ(255, rawData->core.qual); + EXPECT_EQ(1, rawData->core.l_qname); + EXPECT_EQ(BamRecordImpl::UNMAPPED, rawData->core.flag); + EXPECT_EQ(0, rawData->core.n_cigar); + EXPECT_EQ(0, rawData->core.l_qseq); + EXPECT_EQ(-1, rawData->core.mtid); + 
EXPECT_EQ(-1, rawData->core.mpos); + EXPECT_EQ(0, rawData->core.isize); + + // variable length data + EXPECT_TRUE(rawData->data != nullptr); + EXPECT_EQ(1, rawData->l_data); + EXPECT_EQ((int)0x800, rawData->m_data); // check this if we change or tune later + + // ------------------------------- + // check data via API calls + // ------------------------------- + + EXPECT_EQ(0, bam.Bin()); + EXPECT_EQ(BamRecordImpl::UNMAPPED, bam.Flag()); + EXPECT_EQ(0, bam.InsertSize()); + EXPECT_EQ(255, bam.MapQuality()); + EXPECT_EQ(-1, bam.MateReferenceId()); + EXPECT_EQ(-1, bam.MatePosition()); + EXPECT_EQ(-1, bam.Position()); + EXPECT_EQ(-1, bam.ReferenceId()); + EXPECT_EQ(0, bam.Tags().size()); + + EXPECT_FALSE(bam.IsDuplicate()); + EXPECT_FALSE(bam.IsFailedQC()); + EXPECT_FALSE(bam.IsFirstMate()); + EXPECT_FALSE(bam.IsMapped()); + EXPECT_TRUE(bam.IsMateMapped()); + EXPECT_FALSE(bam.IsMateReverseStrand()); + EXPECT_FALSE(bam.IsPaired()); + EXPECT_TRUE(bam.IsPrimaryAlignment()); + EXPECT_FALSE(bam.IsProperPair()); + EXPECT_FALSE(bam.IsReverseStrand()); + EXPECT_FALSE(bam.IsSecondMate()); + EXPECT_FALSE(bam.IsSupplementaryAlignment()); + + const std::string emptyString = ""; + EXPECT_EQ(emptyString, bam.Name()); + EXPECT_EQ(emptyString, bam.CigarData().ToStdString()); + EXPECT_EQ(emptyString, bam.Sequence()); + EXPECT_EQ(emptyString, bam.Qualities().Fastq()); + tests::CheckRawData(bam); +} + +TEST(BamRecordImplCoreTest, CoreSetters) +{ + BamRecordImpl bam; + bam.Bin(42); + bam.Flag(42); + bam.InsertSize(42); + bam.MapQuality(42); + bam.MatePosition(42); + bam.MateReferenceId(42); + bam.Position(42); + bam.ReferenceId(42); + + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector({34, 5, 125}); + tags["XY"] = static_cast(-42); + bam.Tags(tags); // (28 bytes encoded) + + // ------------------------------- + // check raw data + // ------------------------------- + + const PBBAM_SHARED_PTR rawData = 
bam.d_; + ASSERT_TRUE((bool)rawData); + + // fixed-length (core) data + EXPECT_EQ(42, rawData->core.tid); + EXPECT_EQ(42, rawData->core.pos); + EXPECT_EQ(42, rawData->core.bin); + EXPECT_EQ(42, rawData->core.qual); + EXPECT_EQ(1, rawData->core.l_qname); // initialized w/ NULL-term + EXPECT_EQ(42, rawData->core.flag); + EXPECT_EQ(0, rawData->core.n_cigar); + EXPECT_EQ(0, rawData->core.l_qseq); + EXPECT_EQ(42, rawData->core.mtid); + EXPECT_EQ(42, rawData->core.mpos); + EXPECT_EQ(42, rawData->core.isize); + + // variable length data + EXPECT_TRUE(rawData->data != nullptr); + EXPECT_EQ(29, rawData->l_data); // NULL-term qname + tags + EXPECT_EQ((int)0x800, rawData->m_data); // check this if we change or tune later + + // ------------------------------- + // check data via API calls + // ------------------------------- + + EXPECT_EQ(42, bam.Bin()); + EXPECT_EQ(42, bam.Flag()); + EXPECT_EQ(42, bam.InsertSize()); + EXPECT_EQ(42, bam.MapQuality()); + EXPECT_EQ(42, bam.MateReferenceId()); + EXPECT_EQ(42, bam.MatePosition()); + EXPECT_EQ(42, bam.Position()); + EXPECT_EQ(42, bam.ReferenceId()); + + const TagCollection& fetchedTags = bam.Tags(); + + EXPECT_TRUE(fetchedTags.at("HX").HasModifier(TagModifier::HEX_STRING)); + EXPECT_EQ(std::string("1abc75"), fetchedTags.at("HX").ToString()); + EXPECT_EQ(static_cast(-42), fetchedTags.at("XY").ToInt32()); + EXPECT_EQ(std::vector({34, 5, 125}), fetchedTags.at("CA").ToUInt8Array()); +} + +TEST(BamRecordImplCoreTest, DeepCopyFromRawData) +{ + // init raw data + PBBAM_SHARED_PTR rawData(bam_init1(), tests::Bam1Deleter()); + ASSERT_TRUE((bool)rawData); + + rawData->core.tid = 42; + rawData->core.pos = 42; + rawData->core.bin = 42; + rawData->core.qual = 42; + rawData->core.flag = 42; + rawData->core.mtid = 42; + rawData->core.mpos = 42; + rawData->core.isize = 42; + + const int32_t x = 42; + char valueBytes[sizeof x]; + std::copy(static_cast(static_cast(&x)), + static_cast(static_cast(&x)) + sizeof x, + valueBytes); + 
bam_aux_append(rawData.get(), "XY", 'i', sizeof(x), (uint8_t*)&valueBytes[0]); + + EXPECT_EQ(42, rawData->core.tid); + EXPECT_EQ(42, rawData->core.pos); + EXPECT_EQ(42, rawData->core.bin); + EXPECT_EQ(42, rawData->core.qual); + EXPECT_EQ(0, rawData->core.l_qname); + EXPECT_EQ(42, rawData->core.flag); + EXPECT_EQ(0, rawData->core.n_cigar); + EXPECT_EQ(0, rawData->core.l_qseq); + EXPECT_EQ(42, rawData->core.mtid); + EXPECT_EQ(42, rawData->core.mpos); + EXPECT_EQ(42, rawData->core.isize); + const int32_t fetchedX = bam_aux2i( bam_aux_get(rawData.get(), "XY") ); + EXPECT_EQ(42, fetchedX); + + // static "ctor" + BamRecordImpl bam = BamRecordImpl::FromRawData(rawData); + + // make sure raw data is still valid + EXPECT_EQ(42, rawData->core.tid); + EXPECT_EQ(42, rawData->core.pos); + EXPECT_EQ(42, rawData->core.bin); + EXPECT_EQ(42, rawData->core.qual); + EXPECT_EQ(0, rawData->core.l_qname); + EXPECT_EQ(42, rawData->core.flag); + EXPECT_EQ(0, rawData->core.n_cigar); + EXPECT_EQ(0, rawData->core.l_qseq); + EXPECT_EQ(42, rawData->core.mtid); + EXPECT_EQ(42, rawData->core.mpos); + EXPECT_EQ(42, rawData->core.isize); + EXPECT_TRUE(rawData->data != nullptr); + EXPECT_TRUE(0 != rawData->l_data); + EXPECT_TRUE(0 != rawData->m_data); + + // check new record + EXPECT_EQ(42, bam.Bin()); + EXPECT_EQ(42, bam.Flag()); + EXPECT_EQ(42, bam.InsertSize()); + EXPECT_EQ(42, bam.MapQuality()); + EXPECT_EQ(42, bam.MateReferenceId()); + EXPECT_EQ(42, bam.MatePosition()); + EXPECT_EQ(42, bam.Position()); + EXPECT_EQ(42, bam.ReferenceId()); + EXPECT_EQ(x, bam.Tags()["XY"].ToInt32()); + + EXPECT_TRUE(bam.d_->data != nullptr); + EXPECT_TRUE(bam.d_->m_data >= (int)0x800); // check this if we change or tune later + + // tweak raw data, make sure we've done a deep copy (so BamRecordImpl isn't changed) + rawData->core.pos = 37; + EXPECT_EQ(37, rawData->core.pos); + EXPECT_EQ(42, bam.Position()); + EXPECT_EQ(42, bam.d_->core.pos); +} + +TEST(BamRecordImplCoreTest, CopyAssignment) +{ + BamRecordImpl 
bam1; + bam1.Bin(42); + bam1.Flag(42); + bam1.InsertSize(42); + bam1.MapQuality(42); + bam1.MatePosition(42); + bam1.MateReferenceId(42); + bam1.Position(42); + bam1.ReferenceId(42); + + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector({34, 5, 125}); + tags["XY"] = static_cast(-42); + bam1.Tags(tags); + + BamRecordImpl bam2; + bam2 = bam1; + + EXPECT_EQ(42, bam1.Bin()); + EXPECT_EQ(42, bam1.Flag()); + EXPECT_EQ(42, bam1.InsertSize()); + EXPECT_EQ(42, bam1.MapQuality()); + EXPECT_EQ(42, bam1.MateReferenceId()); + EXPECT_EQ(42, bam1.MatePosition()); + EXPECT_EQ(42, bam1.Position()); + EXPECT_EQ(42, bam1.ReferenceId()); + + const TagCollection& fetchedTags1 = bam1.Tags(); + EXPECT_TRUE(fetchedTags1.at("HX").HasModifier(TagModifier::HEX_STRING)); + EXPECT_EQ(std::string("1abc75"), fetchedTags1.at("HX").ToString()); + EXPECT_EQ(static_cast(-42), fetchedTags1.at("XY").ToInt32()); + EXPECT_EQ(std::vector({34, 5, 125}), fetchedTags1.at("CA").ToUInt8Array()); + + EXPECT_EQ(42, bam2.Bin()); + EXPECT_EQ(42, bam2.Flag()); + EXPECT_EQ(42, bam2.InsertSize()); + EXPECT_EQ(42, bam2.MapQuality()); + EXPECT_EQ(42, bam2.MateReferenceId()); + EXPECT_EQ(42, bam2.MatePosition()); + EXPECT_EQ(42, bam2.Position()); + EXPECT_EQ(42, bam2.ReferenceId()); + + const TagCollection& fetchedTags2 = bam2.Tags(); + EXPECT_TRUE(fetchedTags2.at("HX").HasModifier(TagModifier::HEX_STRING)); + EXPECT_EQ(std::string("1abc75"), fetchedTags2.at("HX").ToString()); + EXPECT_EQ(static_cast(-42), fetchedTags2.at("XY").ToInt32()); + EXPECT_EQ(std::vector({34, 5, 125}), fetchedTags2.at("CA").ToUInt8Array()); + + tests::CheckRawData(bam1); + tests::CheckRawData(bam2); +} + +TEST(BamRecordImplCoreTest, SelfAssignmentTolerated) +{ + BamRecordImpl bam1; + bam1.Bin(42); + bam1.Flag(42); + bam1.InsertSize(42); + bam1.MapQuality(42); + bam1.MatePosition(42); + bam1.MateReferenceId(42); + bam1.Position(42); + bam1.ReferenceId(42); + + 
TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector({34, 5, 125}); + tags["XY"] = static_cast(-42); + bam1.Tags(tags); + + bam1 = bam1; + + EXPECT_EQ(42, bam1.Bin()); + EXPECT_EQ(42, bam1.Flag()); + EXPECT_EQ(42, bam1.InsertSize()); + EXPECT_EQ(42, bam1.MapQuality()); + EXPECT_EQ(42, bam1.MateReferenceId()); + EXPECT_EQ(42, bam1.MatePosition()); + EXPECT_EQ(42, bam1.Position()); + EXPECT_EQ(42, bam1.ReferenceId()); + + const TagCollection& fetchedTags1 = bam1.Tags(); + EXPECT_TRUE(fetchedTags1.at("HX").HasModifier(TagModifier::HEX_STRING)); + EXPECT_EQ(std::string("1abc75"), fetchedTags1.at("HX").ToString()); + EXPECT_EQ(static_cast(-42), fetchedTags1.at("XY").ToInt32()); + EXPECT_EQ(std::vector({34, 5, 125}), fetchedTags1.at("CA").ToUInt8Array()); + + tests::CheckRawData(bam1); +} + +TEST(BamRecordImplCoreTest, CopyConstructor) +{ + BamRecordImpl bam1; + bam1.Bin(42); + bam1.Flag(42); + bam1.InsertSize(42); + bam1.MapQuality(42); + bam1.MatePosition(42); + bam1.MateReferenceId(42); + bam1.Position(42); + bam1.ReferenceId(42); + + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector({34, 5, 125}); + tags["XY"] = static_cast(-42); + bam1.Tags(tags); + + BamRecordImpl bam2(bam1); + + EXPECT_EQ(42, bam1.Bin()); + EXPECT_EQ(42, bam1.Flag()); + EXPECT_EQ(42, bam1.InsertSize()); + EXPECT_EQ(42, bam1.MapQuality()); + EXPECT_EQ(42, bam1.MateReferenceId()); + EXPECT_EQ(42, bam1.MatePosition()); + EXPECT_EQ(42, bam1.Position()); + EXPECT_EQ(42, bam1.ReferenceId()); + + const TagCollection& fetchedTags1 = bam1.Tags(); + EXPECT_TRUE(fetchedTags1.at("HX").HasModifier(TagModifier::HEX_STRING)); + EXPECT_EQ(std::string("1abc75"), fetchedTags1.at("HX").ToString()); + EXPECT_EQ(static_cast(-42), fetchedTags1.at("XY").ToInt32()); + EXPECT_EQ(std::vector({34, 5, 125}), fetchedTags1.at("CA").ToUInt8Array()); + + EXPECT_EQ(42, 
bam2.Bin()); + EXPECT_EQ(42, bam2.Flag()); + EXPECT_EQ(42, bam2.InsertSize()); + EXPECT_EQ(42, bam2.MapQuality()); + EXPECT_EQ(42, bam2.MateReferenceId()); + EXPECT_EQ(42, bam2.MatePosition()); + EXPECT_EQ(42, bam2.Position()); + EXPECT_EQ(42, bam2.ReferenceId()); + + const TagCollection& fetchedTags2 = bam2.Tags(); + EXPECT_TRUE(fetchedTags2.at("HX").HasModifier(TagModifier::HEX_STRING)); + EXPECT_EQ(std::string("1abc75"), fetchedTags2.at("HX").ToString()); + EXPECT_EQ(static_cast(-42), fetchedTags2.at("XY").ToInt32()); + EXPECT_EQ(std::vector({34, 5, 125}), fetchedTags2.at("CA").ToUInt8Array()); + + tests::CheckRawData(bam1); + tests::CheckRawData(bam2); +} + +TEST(BamRecordImplCoreTest, CreateRecord_InternalTest) +{ + BamRecordImpl bam = tests::CreateBamImpl(); + + EXPECT_EQ(42, bam.Bin()); + EXPECT_EQ(42, bam.Flag()); + EXPECT_EQ(42, bam.InsertSize()); + EXPECT_EQ(42, bam.MapQuality()); + EXPECT_EQ(42, bam.MateReferenceId()); + EXPECT_EQ(42, bam.MatePosition()); + EXPECT_EQ(42, bam.Position()); + EXPECT_EQ(42, bam.ReferenceId()); + + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector({34, 5, 125}); + tags["XY"] = static_cast(-42); + bam.Tags(tags); + + tests::CheckRawData(bam); +} + +TEST(BamRecordImplCoreTest, MoveAssignment) +{ + BamRecordImpl bam; +#ifdef __clang__ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wpessimizing-move" +#endif + bam = std::move(tests::CreateBamImpl()); +#ifdef __clang__ +#pragma clang diagnostic pop +#endif + + EXPECT_EQ(42, bam.Bin()); + EXPECT_EQ(42, bam.Flag()); + EXPECT_EQ(42, bam.InsertSize()); + EXPECT_EQ(42, bam.MapQuality()); + EXPECT_EQ(42, bam.MateReferenceId()); + EXPECT_EQ(42, bam.MatePosition()); + EXPECT_EQ(42, bam.Position()); + EXPECT_EQ(42, bam.ReferenceId()); + + const TagCollection& fetchedTags1 = bam.Tags(); + EXPECT_TRUE(fetchedTags1.at("HX").HasModifier(TagModifier::HEX_STRING)); + 
EXPECT_EQ(std::string("1abc75"), fetchedTags1.at("HX").ToString()); + EXPECT_EQ(static_cast(-42), fetchedTags1.at("XY").ToInt32()); + EXPECT_EQ(std::vector({34, 5, 125}), fetchedTags1.at("CA").ToUInt8Array()); + + tests::CheckRawData(bam); +} + +TEST(BamRecordImplCoreTest, MoveConstructor) +{ +#ifdef __clang__ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wpessimizing-move" +#endif + BamRecordImpl bam(std::move(tests::CreateBamImpl())); +#ifdef __clang__ +#pragma clang diagnostic pop +#endif + + EXPECT_EQ(42, bam.Bin()); + EXPECT_EQ(42, bam.Flag()); + EXPECT_EQ(42, bam.InsertSize()); + EXPECT_EQ(42, bam.MapQuality()); + EXPECT_EQ(42, bam.MateReferenceId()); + EXPECT_EQ(42, bam.MatePosition()); + EXPECT_EQ(42, bam.Position()); + EXPECT_EQ(42, bam.ReferenceId()); + + const TagCollection& fetchedTags1 = bam.Tags(); + EXPECT_TRUE(fetchedTags1.at("HX").HasModifier(TagModifier::HEX_STRING)); + EXPECT_EQ(std::string("1abc75"), fetchedTags1.at("HX").ToString()); + EXPECT_EQ(static_cast(-42), fetchedTags1.at("XY").ToInt32()); + EXPECT_EQ(std::vector({34, 5, 125}), fetchedTags1.at("CA").ToUInt8Array()); + + tests::CheckRawData(bam); +} + +TEST(BamRecordImplCoreTest, AlignmentFlags) +{ + // same set of flags, different ways of getting there + + // raw number + BamRecordImpl bam1; + bam1.Flag(1107); + + // enum values + BamRecordImpl bam2; + bam2.Flag(BamRecordImpl::DUPLICATE | + BamRecordImpl::MATE_1 | + BamRecordImpl::REVERSE_STRAND | + BamRecordImpl::PROPER_PAIR | + BamRecordImpl::PAIRED + ); + + // convenience calls + BamRecordImpl bam3; + bam3.SetDuplicate(true); + bam3.SetFirstMate(true); + bam3.SetReverseStrand(true); + bam3.SetMapped(true); + bam3.SetMateMapped(true); + bam3.SetPaired(true); + bam3.SetProperPair(true); + bam3.SetPrimaryAlignment(true); + + // make sure all are same + EXPECT_EQ(1107, bam1.Flag()); + EXPECT_EQ(1107, bam2.Flag()); + EXPECT_EQ(1107, bam3.Flag()); + + // check API calls + EXPECT_TRUE(bam1.IsPaired()); + 
EXPECT_TRUE(bam1.IsProperPair()); + EXPECT_TRUE(bam1.IsMapped()); + EXPECT_TRUE(bam1.IsMateMapped()); + EXPECT_TRUE(bam1.IsReverseStrand()); + EXPECT_FALSE(bam1.IsMateReverseStrand()); + EXPECT_TRUE(bam1.IsFirstMate()); + EXPECT_FALSE(bam1.IsSecondMate()); + EXPECT_TRUE(bam1.IsPrimaryAlignment()); + EXPECT_FALSE(bam1.IsFailedQC()); + EXPECT_TRUE(bam1.IsDuplicate()); + EXPECT_FALSE(bam1.IsSupplementaryAlignment()); +} diff --git a/tests/src/test_BamRecordImplTags.cpp b/tests/src/test_BamRecordImplTags.cpp new file mode 100644 index 0000000..197e11e --- /dev/null +++ b/tests/src/test_BamRecordImplTags.cpp @@ -0,0 +1,214 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. + +// Author: Derek Barnett + +#ifdef PBBAM_TESTING +#define private public +#endif + +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace std; + +// NOTE: these tests check "high-level" tag query/manipulation via BamRecordImpl. +// For raw Tag/TagCollection tests, see test_Tags.cpp +// For encoding tests, see test_BamRecordImplVariableData.cpp + +TEST(BamRecordImplTagsTest, HasTagTest) +{ + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector({34, 5, 125}); + tags["XY"] = (int32_t)-42; + + BamRecordImpl bam; + bam.Tags(tags); + + EXPECT_TRUE(bam.HasTag("HX")); + EXPECT_TRUE(bam.HasTag("CA")); + EXPECT_TRUE(bam.HasTag("XY")); + + EXPECT_FALSE(bam.HasTag("zz")); + EXPECT_FALSE(bam.HasTag("")); + EXPECT_FALSE(bam.HasTag("some_too_long_name")); + + const TagCollection& fetchedTags = bam.Tags(); + EXPECT_TRUE(fetchedTags.Contains("HX")); + EXPECT_TRUE(fetchedTags.Contains("CA")); + EXPECT_TRUE(fetchedTags.Contains("XY")); + EXPECT_FALSE(fetchedTags.Contains("zz")); + EXPECT_FALSE(fetchedTags.Contains("")); + EXPECT_FALSE(fetchedTags.Contains("some_too_long_name")); +} + +TEST(BamRecordImplTagsTest, SimpleAddTag) +{ + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector({34, 5, 125}); + + BamRecordImpl bam; + bam.Tags(tags); + + 
EXPECT_TRUE(bam.HasTag("HX")); + EXPECT_TRUE(bam.HasTag("CA")); + EXPECT_FALSE(bam.HasTag("XY")); + + bam.AddTag("XY", (int32_t)-42); + + EXPECT_TRUE(bam.HasTag("HX")); + EXPECT_TRUE(bam.HasTag("CA")); + EXPECT_TRUE(bam.HasTag("XY")); + + const TagCollection& fetchedTags = bam.Tags(); + EXPECT_TRUE(fetchedTags.Contains("HX")); + EXPECT_TRUE(fetchedTags.Contains("CA")); + EXPECT_TRUE(fetchedTags.Contains("XY")); + EXPECT_FALSE(fetchedTags.Contains("zz")); + EXPECT_FALSE(fetchedTags.Contains("")); + EXPECT_FALSE(fetchedTags.Contains("some_too_long_name")); + + EXPECT_EQ(-42, fetchedTags.at("XY").ToInt32()); + + // fail on invalid adds + EXPECT_FALSE(bam.AddTag("", (int32_t)-42)); + EXPECT_FALSE(bam.AddTag("some_too_long_name", (int32_t)-42)); + EXPECT_FALSE(bam.AddTag("XY", (int32_t)-42)); // reject duplicate +} + +TEST(BamRecordImplTagsTest, SimpleRemoveTag) +{ + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector({34, 5, 125}); + tags["XY"] = (int32_t)-42; + + BamRecordImpl bam; + bam.Tags(tags); + + EXPECT_TRUE(bam.HasTag("HX")); + EXPECT_TRUE(bam.HasTag("CA")); + EXPECT_TRUE(bam.HasTag("XY")); + + const bool removedOk = bam.RemoveTag("XY"); + EXPECT_TRUE(removedOk); + + EXPECT_TRUE(bam.HasTag("HX")); + EXPECT_TRUE(bam.HasTag("CA")); + EXPECT_FALSE(bam.HasTag("XY")); + + const TagCollection& fetchedTags = bam.Tags(); + EXPECT_TRUE(fetchedTags.Contains("HX")); + EXPECT_TRUE(fetchedTags.Contains("CA")); + EXPECT_FALSE(fetchedTags.Contains("XY")); + EXPECT_FALSE(fetchedTags.Contains("zz")); + EXPECT_FALSE(fetchedTags.Contains("")); + EXPECT_FALSE(fetchedTags.Contains("some_too_long_name")); + + // fail on invalid removes + EXPECT_FALSE(bam.RemoveTag("")); + EXPECT_FALSE(bam.RemoveTag("some_too_long_name")); + EXPECT_FALSE(bam.RemoveTag("zz")); // reject remove unknown +} + +TEST(BamRecordImplTagsTest, SimpleEditTag) +{ + TagCollection tags; + tags["HX"] = std::string("1abc75"); + 
tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector<uint8_t>({34, 5, 125}); + tags["XY"] = (int32_t)-42; + + BamRecordImpl bam; + bam.Tags(tags); + + EXPECT_TRUE(bam.HasTag("XY")); + + const TagCollection& fetchedTags = bam.Tags(); + EXPECT_TRUE(fetchedTags.Contains("HX")); + EXPECT_TRUE(fetchedTags.Contains("CA")); + EXPECT_TRUE(fetchedTags.Contains("XY")); + EXPECT_EQ(-42, fetchedTags.at("XY").ToInt32()); + + const bool editedOk = bam.EditTag("XY", (int32_t)500); + EXPECT_TRUE(editedOk); + EXPECT_TRUE(bam.HasTag("XY")); + + const TagCollection& fetchedTags2 = bam.Tags(); + EXPECT_TRUE(fetchedTags2.Contains("HX")); + EXPECT_TRUE(fetchedTags2.Contains("CA")); + EXPECT_TRUE(fetchedTags2.Contains("XY")); + EXPECT_EQ(500, fetchedTags2.at("XY").ToInt32()); + + // fail on invalid edits + EXPECT_FALSE(bam.EditTag("", 500)); + EXPECT_FALSE(bam.EditTag("some_too_long_name", 500)); + EXPECT_FALSE(bam.EditTag("zz", 500)); // reject edit unknown +} + +TEST(BamRecordImplTagsTest, SimpleQueryTag) // HasTag()/TagValue() for stored tags and for unknown or malformed names +{ + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector<uint8_t>({34, 5, 125}); + tags["XY"] = (int32_t)-42; + + BamRecordImpl bam; + bam.Tags(tags); + + EXPECT_TRUE(bam.HasTag("HX")); // each of the three stored tags is checked exactly once + EXPECT_TRUE(bam.HasTag("CA")); + EXPECT_TRUE(bam.HasTag("XY")); + + EXPECT_EQ(string("1abc75"), bam.TagValue("HX").ToString()); + EXPECT_EQ(vector<uint8_t>({34, 5, 125}), bam.TagValue("CA").ToUInt8Array()); + EXPECT_EQ((int32_t)-42, bam.TagValue("XY").ToInt32()); + + EXPECT_FALSE(bam.HasTag("zz")); + EXPECT_FALSE(bam.HasTag("")); + EXPECT_FALSE(bam.HasTag("some_too_long_name")); + + EXPECT_EQ(Tag(), bam.TagValue("zz")); // absent or invalid names are expected to yield a default-constructed Tag + EXPECT_EQ(Tag(), bam.TagValue("")); + EXPECT_EQ(Tag(), bam.TagValue("some_too_long_name")); +} + diff --git a/tests/src/test_BamRecordImplVariableData.cpp b/tests/src/test_BamRecordImplVariableData.cpp new file mode 100644 index 0000000..7a63700 --- /dev/null +++ b/tests/src/test_BamRecordImplVariableData.cpp 
@@ -0,0 +1,4542 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. 
+ +// Author: Derek Barnett + +#ifdef PBBAM_TESTING +#define private public +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; + +// NOTE: this file has a *TON* of tests. Probably overkill, but I wanted to check +// every possible combination of variable data, and then manipulate each +// element within each combo to shrink & expand. + +namespace tests { + +static +void CheckRawData(const BamRecordImpl& bam) +{ + // ensure raw data (lengths at least) matches API-facing data + + const uint32_t expectedNameLength = bam.Name().size() + 1; + const uint32_t expectedNumCigarOps = bam.CigarData().size(); + const int32_t expectedSeqLength = bam.Sequence().length(); + const size_t expectedTagsLength = BamTagCodec::Encode(bam.Tags()).size(); + + // Name CIGAR Sequence Quals Tags + // l_qname + (n_cigar * 4) + (l_qseq+1)/2 + l_qseq + + + const int expectedTotalDataLength = expectedNameLength + + (expectedNumCigarOps * 4) + + (expectedSeqLength+1)/2 + + expectedSeqLength + + expectedTagsLength; + + EXPECT_EQ(expectedNameLength, bam.d_->core.l_qname); + EXPECT_EQ(expectedNumCigarOps, bam.d_->core.n_cigar); + EXPECT_EQ(expectedSeqLength, bam.d_->core.l_qseq); + EXPECT_EQ(expectedTotalDataLength, bam.d_->l_data); +} + +} // namespace tests + +TEST(BamRecordImplVariableDataTest, InitEmpty) +{ + BamRecordImpl bam; + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, TagOnly_InitEmpty) +{ + BamRecordImpl bam; + bam.Tags(TagCollection()); + EXPECT_EQ(0, bam.Tags().size()); + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, TagOnly_InitNormal) +{ + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector({34, 5, 125}); + tags["XY"] = (int32_t)-42; + + BamRecordImpl bam; + bam.Tags(tags); + + std::string expected; + 
expected.append("CA:B:C,34,5,125"); + expected.append("\t"); + expected.append("HX:H:1abc75"); + expected.append("\t"); + expected.append("XY:i:-42"); + + const std::string sam = SamTagCodec::Encode(bam.Tags()); + EXPECT_EQ(expected, sam); +} + +TEST(BamRecordImplVariableDataTest, TagOnly_ThenOverwriteWithLongerTags) // replace a 2-tag collection with a 3-tag one +{ + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector<uint8_t>({34, 5, 125}); + // no "XY" here: the initial collection must be shorter than longerTags + + TagCollection longerTags; + longerTags["HX"] = std::string("1abc75"); + longerTags["HX"].Modifier(TagModifier::HEX_STRING); + longerTags["CA"] = std::vector<uint8_t>({34, 5, 125}); + longerTags["XY"] = (int32_t)-42; + + BamRecordImpl bam; + bam.Tags(tags); + bam.Tags(longerTags); // second call must fully replace (and grow) the stored tag data + + std::string expected; + expected.append("CA:B:C,34,5,125"); + expected.append("\t"); + expected.append("HX:H:1abc75"); + expected.append("\t"); + expected.append("XY:i:-42"); + + const std::string sam = SamTagCodec::Encode(bam.Tags()); + EXPECT_EQ(expected, sam); +} + +TEST(BamRecordImplVariableDataTest, TagOnly_ThenOverwriteWithShorterTags) // replace a 3-tag collection with a 2-tag one +{ + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector<uint8_t>({34, 5, 125}); + + TagCollection longerTags; + longerTags["HX"] = std::string("1abc75"); + longerTags["HX"].Modifier(TagModifier::HEX_STRING); + longerTags["CA"] = std::vector<uint8_t>({34, 5, 125}); + longerTags["XY"] = (int32_t)-42; + + BamRecordImpl bam; + bam.Tags(longerTags); + bam.Tags(tags); // "XY" must be gone after the shorter overwrite + + std::string expected; + expected.append("CA:B:C,34,5,125"); + expected.append("\t"); + expected.append("HX:H:1abc75"); + + const std::string sam = SamTagCodec::Encode(bam.Tags()); + EXPECT_EQ(expected, sam); +} + +TEST(BamRecordImplVariableDataTest, TagOnly_ThenOverwriteWithEmptyTags) +{ + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector<uint8_t>({34, 5, 125}); + 
tags["XY"] = (int32_t)-42; + + BamRecordImpl bam; + bam.Tags(tags); + bam.Tags(TagCollection()); + + EXPECT_EQ(0, bam.Tags().size()); + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, CigarOnly_InitEmpty) +{ + BamRecordImpl bam; + bam.CigarData(std::string()); + EXPECT_EQ(0, bam.CigarData().size()); + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, CigarOnly_InitNormal_CigarObject) +{ + Cigar cigar; + cigar.push_back(CigarOperation('=', 100)); + + BamRecordImpl bam; + bam.CigarData(cigar); + + EXPECT_EQ(cigar, bam.CigarData()); + EXPECT_TRUE("100=" == bam.CigarData().ToStdString()); + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, CigarOnly_InitNormal_StdString) +{ + const std::string cigar = "100="; + + BamRecordImpl bam; + bam.CigarData(cigar); + + EXPECT_EQ(cigar, bam.CigarData().ToStdString()); + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, CigarOnly_ThenOverwriteWithLongerCigar) +{ + const std::string cigar = "100="; + const std::string longerCigar = "100=10D100=10I100X"; + + BamRecordImpl bam; + bam.CigarData(cigar); + bam.CigarData(longerCigar); + + EXPECT_EQ(longerCigar, bam.CigarData().ToStdString()); + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, CigarOnly_ThenOverwriteWithShorterCigar) +{ + const std::string cigar = "100="; + const std::string longerCigar = "100=10D100=10I100X"; + + BamRecordImpl bam; + bam.CigarData(longerCigar); + bam.CigarData(cigar); + + EXPECT_EQ(cigar, bam.CigarData().ToStdString()); + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, CigarOnly_ThenOverwriteWithEmptyCigar) +{ + const std::string cigar = "100="; + const std::string empty = ""; + + BamRecordImpl bam; + bam.CigarData(cigar); + bam.CigarData(empty); + + EXPECT_EQ(empty, bam.CigarData().ToStdString()); + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, CigarTag_Init_Normal) +{ + const std::string cigar = "100="; + + 
TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector({34, 5, 125}); + tags["XY"] = (int32_t)-42; + + BamRecordImpl bam; + bam.CigarData(cigar); + bam.Tags(tags); + + EXPECT_EQ(cigar, bam.CigarData().ToStdString()); + + std::string expected; + expected.append("CA:B:C,34,5,125"); + expected.append("\t"); + expected.append("HX:H:1abc75"); + expected.append("\t"); + expected.append("XY:i:-42"); + + const std::string sam = SamTagCodec::Encode(bam.Tags()); + EXPECT_EQ(expected, sam); + + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, CigarTag_Init_EmptyCigar) +{ + const std::string cigar = "100="; + const std::string empty = ""; + + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector({34, 5, 125}); + tags["XY"] = (int32_t)-42; + + BamRecordImpl bam; + bam.CigarData(cigar); + bam.Tags(tags); + bam.CigarData(empty); + + EXPECT_EQ(empty, bam.CigarData().ToStdString()); + + std::string expected; + expected.append("CA:B:C,34,5,125"); + expected.append("\t"); + expected.append("HX:H:1abc75"); + expected.append("\t"); + expected.append("XY:i:-42"); + + const std::string sam = SamTagCodec::Encode(bam.Tags()); + EXPECT_EQ(expected, sam); + + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, CigarTag_Init_EmptyTag) +{ + const std::string cigar = "100="; + + BamRecordImpl bam; + bam.CigarData(cigar); + bam.Tags(TagCollection()); + + EXPECT_EQ(cigar, bam.CigarData().ToStdString()); + EXPECT_EQ(0, bam.Tags().size()); + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, CigarTag_ThenOverwriteWithLongerCigar) +{ + const std::string cigar = "100="; + const std::string longerCigar = "100=10D100=10I100X"; + + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector({34, 5, 125}); + tags["XY"] = 
(int32_t)-42; + + BamRecordImpl bam; + bam.CigarData(cigar); + bam.Tags(tags); + bam.CigarData(longerCigar); + + EXPECT_EQ(longerCigar, bam.CigarData().ToStdString()); + + std::string expected; + expected.append("CA:B:C,34,5,125"); + expected.append("\t"); + expected.append("HX:H:1abc75"); + expected.append("\t"); + expected.append("XY:i:-42"); + + const std::string sam = SamTagCodec::Encode(bam.Tags()); + EXPECT_EQ(expected, sam); + + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, CigarTag_ThenOverwriteWithShorterCigar) +{ + const std::string cigar = "100="; + const std::string longerCigar = "100=10D100=10I100X"; + + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector({34, 5, 125}); + tags["XY"] = (int32_t)-42; + + BamRecordImpl bam; + bam.CigarData(longerCigar); + bam.Tags(tags); + bam.CigarData(cigar); + + EXPECT_EQ(cigar, bam.CigarData().ToStdString()); + + std::string expected; + expected.append("CA:B:C,34,5,125"); + expected.append("\t"); + expected.append("HX:H:1abc75"); + expected.append("\t"); + expected.append("XY:i:-42"); + + const std::string sam = SamTagCodec::Encode(bam.Tags()); + EXPECT_EQ(expected, sam); + + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, CigarTag_ThenOverwriteWithEmptyCigar) +{ + const std::string cigar = "100="; + const std::string empty = ""; + + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector({34, 5, 125}); + tags["XY"] = (int32_t)-42; + + BamRecordImpl bam; + bam.CigarData(cigar); + bam.Tags(tags); + bam.CigarData(empty); + + EXPECT_EQ(empty, bam.CigarData().ToStdString()); + + std::string expected; + expected.append("CA:B:C,34,5,125"); + expected.append("\t"); + expected.append("HX:H:1abc75"); + expected.append("\t"); + expected.append("XY:i:-42"); + + const std::string sam = SamTagCodec::Encode(bam.Tags()); + 
EXPECT_EQ(expected, sam); + + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, CigarTag_ThenOverwriteWithLongerTags) +{ + const std::string cigar = "100="; + + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector({34, 5, 125}); + + TagCollection longerTags; + longerTags["HX"] = std::string("1abc75"); + longerTags["HX"].Modifier(TagModifier::HEX_STRING); + longerTags["CA"] = std::vector({34, 5, 125}); + longerTags["XY"] = (int32_t)-42; + + BamRecordImpl bam; + bam.CigarData(cigar); + bam.Tags(tags); + bam.Tags(longerTags); + + EXPECT_EQ(cigar, bam.CigarData().ToStdString()); + + std::string expected; + expected.append("CA:B:C,34,5,125"); + expected.append("\t"); + expected.append("HX:H:1abc75"); + expected.append("\t"); + expected.append("XY:i:-42"); + + const std::string sam = SamTagCodec::Encode(bam.Tags()); + EXPECT_EQ(expected, sam); + + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, CigarTag_ThenOverwriteWithShorterTags) +{ + const std::string cigar = "100="; + + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector({34, 5, 125}); + + TagCollection longerTags; + longerTags["HX"] = std::string("1abc75"); + longerTags["HX"].Modifier(TagModifier::HEX_STRING); + longerTags["CA"] = std::vector({34, 5, 125}); + longerTags["XY"] = (int32_t)-42; + + BamRecordImpl bam; + bam.CigarData(cigar); + bam.Tags(longerTags); + bam.Tags(tags); + + EXPECT_EQ(cigar, bam.CigarData().ToStdString()); + + std::string expected; + expected.append("CA:B:C,34,5,125"); + expected.append("\t"); + expected.append("HX:H:1abc75"); + + const std::string sam = SamTagCodec::Encode(bam.Tags()); + EXPECT_EQ(expected, sam); + + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, CigarTag_ThenOverwriteWithEmptyTags) +{ + const std::string cigar = "100="; + + TagCollection tags; + tags["HX"] 
= std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector<uint8_t>({34, 5, 125}); + + BamRecordImpl bam; + bam.CigarData(cigar); + bam.Tags(tags); + bam.Tags(TagCollection()); + + EXPECT_EQ(cigar, bam.CigarData().ToStdString()); + EXPECT_EQ(0, bam.Tags().size()); + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, SeqQualOnly_Init_Empty) +{ + BamRecordImpl bam; + bam.SetSequenceAndQualities(std::string(), std::string()); + EXPECT_EQ(0, bam.Sequence().size()); + EXPECT_EQ(0, bam.Qualities().Fastq().size()); + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, SeqQualOnly_Init_NormalQual) +{ + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = "?]?]?]?]?]?]"; + + BamRecordImpl bam; + bam.SetSequenceAndQualities(sequence, qualities); + + EXPECT_EQ(sequence, bam.Sequence()); + EXPECT_EQ(qualities, bam.Qualities().Fastq()); + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, SeqQualOnly_Init_EmptyQual) +{ + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = ""; + + BamRecordImpl bam; + bam.SetSequenceAndQualities(sequence, qualities); + + EXPECT_EQ(sequence, bam.Sequence()); + EXPECT_EQ(qualities, bam.Qualities().Fastq()); + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, SeqQualOnly_Init_Preencoded) { + + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = "?]?]?]?]?]?]"; + + const size_t encodedLength = static_cast<size_t>((sequence.size()+1)/2); // two bases per byte, rounded up + char* encoded = (char*)::calloc(encodedLength, sizeof(char)); + char* e = encoded; + + uint8_t nucleotideCode = 0; // defined value even if the default case below is ever hit + bool useHighWord = true; + for (size_t i = 0; i < sequence.size(); ++i) { + switch (sequence.at(i)) { // 4-bit codes used by this test: A=1, C=2, G=4, T=8 + case 'A' : nucleotideCode = 1; break; + case 'C' : nucleotideCode = 2; break; + case 'G' : nucleotideCode = 4; break; + case 'T' : nucleotideCode = 8; break; + default: + EXPECT_FALSE(true); + break; + } + + // pack the nucleotide code + 
if (useHighWord) { + *e = nucleotideCode << 4; + useHighWord = false; + } else { + *e |= nucleotideCode; + ++e; + useHighWord = true; + } + } + + BamRecordImpl bam; + bam.SetPreencodedSequenceAndQualities(encoded, sequence.size(), qualities.c_str()); + + EXPECT_EQ(sequence, bam.Sequence()); + EXPECT_EQ(qualities, bam.Qualities().Fastq()); + tests::CheckRawData(bam); + + if (encoded) + free(encoded); +} + +TEST(BamRecordImplVariableDataTest, SeqQualOnly_Init_Preencoded_EmptyQual) { + + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = ""; + + const size_t encodedLength = static_cast<size_t>((sequence.size()+1)/2); // two bases per byte, rounded up + char* encoded = (char*)::calloc(encodedLength, sizeof(char)); + char* e = encoded; + + uint8_t nucleotideCode = 0; // defined value even if the default case below is ever hit + bool useHighWord = true; + for (size_t i = 0; i < sequence.size(); ++i) { + switch (sequence.at(i)) { // 4-bit codes used by this test: A=1, C=2, G=4, T=8 + case 'A' : nucleotideCode = 1; break; + case 'C' : nucleotideCode = 2; break; + case 'G' : nucleotideCode = 4; break; + case 'T' : nucleotideCode = 8; break; + default: + EXPECT_FALSE(true); + break; + } + + // pack the nucleotide code + if (useHighWord) { + *e = nucleotideCode << 4; + useHighWord = false; + } else { + *e |= nucleotideCode; + ++e; + useHighWord = true; + } + } + + BamRecordImpl bam; + bam.SetPreencodedSequenceAndQualities(encoded, sequence.size(), qualities.c_str()); + + EXPECT_EQ(sequence, bam.Sequence()); + EXPECT_EQ(qualities, bam.Qualities().Fastq()); + tests::CheckRawData(bam); + + if (encoded) + free(encoded); +} + +TEST(BamRecordImplVariableDataTest, SeqQualOnly_ThenOverwriteWithLongerSeq_NormalQual) +{ + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = "?]?]?]?]?]?]"; + const std::string shortSeq = "ACGT"; + const std::string shortQual = "?]?]"; + + BamRecordImpl bam; + bam.SetSequenceAndQualities(shortSeq, shortQual); + bam.SetSequenceAndQualities(sequence, qualities); + + EXPECT_EQ(sequence, bam.Sequence()); + EXPECT_EQ(qualities, bam.Qualities().Fastq()); + 
tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, SeqQualOnly_ThenOverwriteWithLongerSeq_EmptyQual) +{ + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = ""; + const std::string shortSeq = "ACGT"; + const std::string shortQual = "?]?]"; + + BamRecordImpl bam; + bam.SetSequenceAndQualities(shortSeq, shortQual); + bam.SetSequenceAndQualities(sequence, qualities); + + EXPECT_EQ(sequence, bam.Sequence()); + EXPECT_EQ(qualities, bam.Qualities().Fastq()); + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, SeqQualOnly_ThenOverwriteWithShorterSeq_NormalQual) +{ + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = "?]?]?]?]?]?]"; + const std::string shortSeq = "ACGT"; + const std::string shortQual = "?]?]"; + + BamRecordImpl bam; + bam.SetSequenceAndQualities(sequence, qualities); + bam.SetSequenceAndQualities(shortSeq, shortQual); + + EXPECT_EQ(shortSeq, bam.Sequence()); + EXPECT_EQ(shortQual, bam.Qualities().Fastq()); + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, SeqQualOnly_ThenOverwriteWithShorterSeq_EmptyQual) +{ + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = "?]?]?]?]?]?]"; + const std::string shortSeq = "ACGT"; + const std::string shortQual = ""; + + BamRecordImpl bam; + bam.SetSequenceAndQualities(sequence, qualities); + bam.SetSequenceAndQualities(shortSeq, shortQual); + + EXPECT_EQ(shortSeq, bam.Sequence()); + EXPECT_EQ(shortQual, bam.Qualities().Fastq()); + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, SeqQualOnly_ThenOverwriteWithEmptySeq) +{ + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = "?]?]?]?]?]?]"; + const std::string empty = ""; + + BamRecordImpl bam; + bam.SetSequenceAndQualities(sequence, qualities); + bam.SetSequenceAndQualities(empty, empty); + + EXPECT_EQ(empty, bam.Sequence()); + EXPECT_EQ(empty, bam.Qualities().Fastq()); + tests::CheckRawData(bam); +} 
+ +TEST(BamRecordImplVariableDataTest, SeqQualTag_Init_Normal) +{ + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = "?]?]?]?]?]?]"; + + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector({34, 5, 125}); + tags["XY"] = (int32_t)-42; + + BamRecordImpl bam; + bam.SetSequenceAndQualities(sequence, qualities); + bam.Tags(tags); + + EXPECT_EQ(sequence, bam.Sequence()); + EXPECT_EQ(qualities, bam.Qualities().Fastq()); + + std::string expected; + expected.append("CA:B:C,34,5,125"); + expected.append("\t"); + expected.append("HX:H:1abc75"); + expected.append("\t"); + expected.append("XY:i:-42"); + + const std::string sam = SamTagCodec::Encode(bam.Tags()); + EXPECT_EQ(expected, sam); + + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, SeqQualTag_Init_EmptySeqQual) +{ + const std::string sequence = ""; + const std::string qualities = ""; + + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector({34, 5, 125}); + tags["XY"] = (int32_t)-42; + + BamRecordImpl bam; + bam.SetSequenceAndQualities(sequence, qualities); + bam.Tags(tags); + + EXPECT_EQ(sequence, bam.Sequence()); + EXPECT_EQ(qualities, bam.Qualities().Fastq()); + + std::string expected; + expected.append("CA:B:C,34,5,125"); + expected.append("\t"); + expected.append("HX:H:1abc75"); + expected.append("\t"); + expected.append("XY:i:-42"); + + const std::string sam = SamTagCodec::Encode(bam.Tags()); + EXPECT_EQ(expected, sam); + + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, SeqQualTag_Init_EmptyQual) +{ + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = ""; + + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector({34, 5, 125}); + tags["XY"] = (int32_t)-42; + + BamRecordImpl bam; + 
bam.SetSequenceAndQualities(sequence, qualities); + bam.Tags(tags); + + EXPECT_EQ(sequence, bam.Sequence()); + EXPECT_EQ(qualities, bam.Qualities().Fastq()); + + std::string expected; + expected.append("CA:B:C,34,5,125"); + expected.append("\t"); + expected.append("HX:H:1abc75"); + expected.append("\t"); + expected.append("XY:i:-42"); + + const std::string sam = SamTagCodec::Encode(bam.Tags()); + EXPECT_EQ(expected, sam); + + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, SeqQualTag_Init_EmptyTag) +{ + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = "?]?]?]?]?]?]"; + + BamRecordImpl bam; + bam.SetSequenceAndQualities(sequence, qualities); + bam.Tags(TagCollection()); + + EXPECT_EQ(sequence, bam.Sequence()); + EXPECT_EQ(qualities, bam.Qualities().Fastq()); + EXPECT_EQ(0, bam.Tags().size()); + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, SeqQualTag_ThenOverwriteWithLongerSeq_NormalQual) +{ + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = "?]?]?]?]?]?]"; + const std::string shortSeq = "ACGT"; + const std::string shortQual = "?]?]"; + + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector({34, 5, 125}); + tags["XY"] = (int32_t)-42; + + BamRecordImpl bam; + bam.SetSequenceAndQualities(shortSeq, shortQual); + bam.Tags(tags); + bam.SetSequenceAndQualities(sequence, qualities); + + EXPECT_EQ(sequence, bam.Sequence()); + EXPECT_EQ(qualities, bam.Qualities().Fastq()); + + std::string expected; + expected.append("CA:B:C,34,5,125"); + expected.append("\t"); + expected.append("HX:H:1abc75"); + expected.append("\t"); + expected.append("XY:i:-42"); + + const std::string sam = SamTagCodec::Encode(bam.Tags()); + EXPECT_EQ(expected, sam); + + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, SeqQualTag_ThenOverwriteWithLongerSeq_EmptyQual) +{ + const std::string sequence = 
"ACGTACGTACGT"; + const std::string qualities = ""; + const std::string shortSeq = "ACGT"; + const std::string shortQual = "?]?]"; + + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector({34, 5, 125}); + tags["XY"] = (int32_t)-42; + + BamRecordImpl bam; + bam.SetSequenceAndQualities(shortSeq, shortQual); + bam.Tags(tags); + bam.SetSequenceAndQualities(sequence, qualities); + + EXPECT_EQ(sequence, bam.Sequence()); + EXPECT_EQ(qualities, bam.Qualities().Fastq()); + + std::string expected; + expected.append("CA:B:C,34,5,125"); + expected.append("\t"); + expected.append("HX:H:1abc75"); + expected.append("\t"); + expected.append("XY:i:-42"); + + const std::string sam = SamTagCodec::Encode(bam.Tags()); + EXPECT_EQ(expected, sam); + + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, SeqQualTag_ThenOverwriteWithShorterSeq_NormalQual) +{ + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = "?]?]?]?]?]?]"; + const std::string shortSeq = "ACGT"; + const std::string shortQual = "?]?]"; + + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector({34, 5, 125}); + tags["XY"] = (int32_t)-42; + + BamRecordImpl bam; + bam.SetSequenceAndQualities(sequence, qualities); + bam.Tags(tags); + bam.SetSequenceAndQualities(shortSeq, shortQual); + + EXPECT_EQ(shortSeq, bam.Sequence()); + EXPECT_EQ(shortQual, bam.Qualities().Fastq()); + + std::string expected; + expected.append("CA:B:C,34,5,125"); + expected.append("\t"); + expected.append("HX:H:1abc75"); + expected.append("\t"); + expected.append("XY:i:-42"); + + const std::string sam = SamTagCodec::Encode(bam.Tags()); + EXPECT_EQ(expected, sam); + + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, SeqQualTag_ThenOverwriteWithShorterSeq_EmptyQual) +{ + const std::string sequence = "ACGTACGTACGT"; + const std::string 
qualities = "?]?]?]?]?]?]"; + const std::string shortSeq = "ACGT"; + const std::string shortQual = ""; + + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector({34, 5, 125}); + tags["XY"] = (int32_t)-42; + + BamRecordImpl bam; + bam.SetSequenceAndQualities(sequence, qualities); + bam.Tags(tags); + bam.SetSequenceAndQualities(shortSeq, shortQual); + + EXPECT_EQ(shortSeq, bam.Sequence()); + EXPECT_EQ(shortQual, bam.Qualities().Fastq()); + + std::string expected; + expected.append("CA:B:C,34,5,125"); + expected.append("\t"); + expected.append("HX:H:1abc75"); + expected.append("\t"); + expected.append("XY:i:-42"); + + const std::string sam = SamTagCodec::Encode(bam.Tags()); + EXPECT_EQ(expected, sam); + + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, SeqQualTag_ThenOverwriteWithEmptySeq) +{ + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = "?]?]?]?]?]?]"; + const std::string empty = ""; + + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector({34, 5, 125}); + tags["XY"] = (int32_t)-42; + + BamRecordImpl bam; + bam.SetSequenceAndQualities(sequence, qualities); + bam.Tags(tags); + bam.SetSequenceAndQualities(empty, empty); + + EXPECT_EQ(empty, bam.Sequence()); + EXPECT_EQ(empty, bam.Qualities().Fastq()); + + std::string expected; + expected.append("CA:B:C,34,5,125"); + expected.append("\t"); + expected.append("HX:H:1abc75"); + expected.append("\t"); + expected.append("XY:i:-42"); + + const std::string sam = SamTagCodec::Encode(bam.Tags()); + EXPECT_EQ(expected, sam); + + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, SeqQualTag_ThenOverwriteWithLongerTags) +{ + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = "?]?]?]?]?]?]"; + + TagCollection tags; + tags["HX"] = std::string("1abc75"); + 
tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector({34, 5, 125}); + + TagCollection longerTags; + longerTags["HX"] = std::string("1abc75"); + longerTags["HX"].Modifier(TagModifier::HEX_STRING); + longerTags["CA"] = std::vector({34, 5, 125}); + longerTags["XY"] = (int32_t)-42; + + BamRecordImpl bam; + bam.SetSequenceAndQualities(sequence, qualities); + bam.Tags(tags); + bam.Tags(longerTags); + + EXPECT_EQ(sequence, bam.Sequence()); + EXPECT_EQ(qualities, bam.Qualities().Fastq()); + + std::string expected; + expected.append("CA:B:C,34,5,125"); + expected.append("\t"); + expected.append("HX:H:1abc75"); + expected.append("\t"); + expected.append("XY:i:-42"); + + const std::string sam = SamTagCodec::Encode(bam.Tags()); + EXPECT_EQ(expected, sam); + + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, SeqQualTag_ThenOverwriteWithShorterTags) +{ + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = "?]?]?]?]?]?]"; + + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector({34, 5, 125}); + + TagCollection longerTags; + longerTags["HX"] = std::string("1abc75"); + longerTags["HX"].Modifier(TagModifier::HEX_STRING); + longerTags["CA"] = std::vector({34, 5, 125}); + longerTags["XY"] = (int32_t)-42; + + BamRecordImpl bam; + bam.SetSequenceAndQualities(sequence, qualities); + bam.Tags(longerTags); + bam.Tags(tags); + + EXPECT_EQ(sequence, bam.Sequence()); + EXPECT_EQ(qualities, bam.Qualities().Fastq()); + + std::string expected; + expected.append("CA:B:C,34,5,125"); + expected.append("\t"); + expected.append("HX:H:1abc75"); + + const std::string sam = SamTagCodec::Encode(bam.Tags()); + EXPECT_EQ(expected, sam); + + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, SeqQualTag_ThenOverwriteWithEmptyTags) +{ + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = "?]?]?]?]?]?]"; + + TagCollection tags; 
+ tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector({34, 5, 125}); + + BamRecordImpl bam; + bam.SetSequenceAndQualities(sequence, qualities); + bam.Tags(tags); + bam.Tags(TagCollection()); + + EXPECT_EQ(sequence, bam.Sequence()); + EXPECT_EQ(qualities, bam.Qualities().Fastq()); + EXPECT_EQ(0, bam.Tags().size()); + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, SeqQualCigar_Init_Normal) +{ + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = "?]?]?]?]?]?]"; + const std::string cigar = "100="; + + BamRecordImpl bam; + bam.SetSequenceAndQualities(sequence, qualities); + bam.CigarData(cigar); + + EXPECT_EQ(sequence, bam.Sequence()); + EXPECT_EQ(qualities, bam.Qualities().Fastq()); + EXPECT_EQ(cigar, bam.CigarData().ToStdString()); + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, SeqQualCigar_Init_EmptySeqQual) +{ + const std::string sequence = ""; + const std::string qualities = ""; + const std::string cigar = "100="; + + BamRecordImpl bam; + bam.SetSequenceAndQualities(sequence, qualities); + bam.CigarData(cigar); + + EXPECT_EQ(sequence, bam.Sequence()); + EXPECT_EQ(qualities, bam.Qualities().Fastq()); + EXPECT_EQ(cigar, bam.CigarData().ToStdString()); + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, SeqQualCigar_Init_EmptyQual) +{ + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = ""; + const std::string cigar = "100="; + + BamRecordImpl bam; + bam.SetSequenceAndQualities(sequence, qualities); + bam.CigarData(cigar); + + EXPECT_EQ(sequence, bam.Sequence()); + EXPECT_EQ(qualities, bam.Qualities().Fastq()); + EXPECT_EQ(cigar, bam.CigarData().ToStdString()); + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, SeqQualCigar_Init_EmptyCigar) +{ + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = "?]?]?]?]?]?]"; + const std::string cigar = ""; + + 
BamRecordImpl bam; + bam.SetSequenceAndQualities(sequence, qualities); + bam.CigarData(cigar); + + EXPECT_EQ(sequence, bam.Sequence()); + EXPECT_EQ(qualities, bam.Qualities().Fastq()); + EXPECT_EQ(cigar, bam.CigarData().ToStdString()); + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, SeqQualCigar_ThenOverwriteWithLongerSeq_NormalQual) +{ + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = "?]?]?]?]?]?]"; + const std::string cigar = "100="; + const std::string shortSeq = "ACGT"; + const std::string shortQual = "?]?]"; + + BamRecordImpl bam; + bam.SetSequenceAndQualities(shortSeq, shortQual); + bam.CigarData(cigar); + bam.SetSequenceAndQualities(sequence, qualities); + + EXPECT_EQ(cigar, bam.CigarData().ToStdString()); + EXPECT_EQ(sequence, bam.Sequence()); + EXPECT_EQ(qualities, bam.Qualities().Fastq()); + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, SeqQualCigar_ThenOverwriteWithLongerSeq_EmptyQual) +{ + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = ""; + const std::string cigar = "100="; + const std::string shortSeq = "ACGT"; + const std::string shortQual = "?]?]"; + + BamRecordImpl bam; + bam.SetSequenceAndQualities(shortSeq, shortQual); + bam.CigarData(cigar); + bam.SetSequenceAndQualities(sequence, qualities); + + EXPECT_EQ(cigar, bam.CigarData().ToStdString()); + EXPECT_EQ(sequence, bam.Sequence()); + EXPECT_EQ(qualities, bam.Qualities().Fastq()); + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, SeqQualCigar_ThenOverwriteWithShorterSeq_NormalQual) +{ + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = "?]?]?]?]?]?]"; + const std::string cigar = "100="; + const std::string shortSeq = "ACGT"; + const std::string shortQual = "?]?]"; + + BamRecordImpl bam; + bam.SetSequenceAndQualities(sequence, qualities); + bam.CigarData(cigar); + bam.SetSequenceAndQualities(shortSeq, shortQual); + + EXPECT_EQ(cigar, 
bam.CigarData().ToStdString()); + EXPECT_EQ(shortSeq, bam.Sequence()); + EXPECT_EQ(shortQual, bam.Qualities().Fastq()); + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, SeqQualCigar_ThenOverwriteWithShorterSeq_EmptyQual) +{ + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = "?]?]?]?]?]?]"; + const std::string cigar = "100="; + const std::string shortSeq = "ACGT"; + const std::string shortQual = ""; + + BamRecordImpl bam; + bam.SetSequenceAndQualities(sequence, qualities); + bam.CigarData(cigar); + bam.SetSequenceAndQualities(shortSeq, shortQual); + + EXPECT_EQ(cigar, bam.CigarData().ToStdString()); + EXPECT_EQ(shortSeq, bam.Sequence()); + EXPECT_EQ(shortQual, bam.Qualities().Fastq()); + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, SeqQualCigar_ThenOverwriteWithEmptySeq) +{ + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = "?]?]?]?]?]?]"; + const std::string cigar = "100="; + const std::string empty = ""; + + BamRecordImpl bam; + bam.SetSequenceAndQualities(sequence, qualities); + bam.CigarData(cigar); + bam.SetSequenceAndQualities(empty, empty); + + EXPECT_EQ(empty, bam.Sequence()); + EXPECT_EQ(empty, bam.Qualities().Fastq()); + EXPECT_EQ(cigar, bam.CigarData().ToStdString()); + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, SeqQualCigar_ThenOverwriteWithLongerCigar) +{ + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = "?]?]?]?]?]?]"; + const std::string cigar = "100="; + const std::string longerCigar = "100=10D100=10I100X"; + + BamRecordImpl bam; + bam.SetSequenceAndQualities(sequence, qualities); + bam.CigarData(cigar); + bam.CigarData(longerCigar); + + EXPECT_EQ(sequence, bam.Sequence()); + EXPECT_EQ(qualities, bam.Qualities().Fastq()); + EXPECT_EQ(longerCigar, bam.CigarData().ToStdString()); + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, SeqQualCigar_ThenOverwriteWithShorterCigar) +{ + 
const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = "?]?]?]?]?]?]"; + const std::string cigar = "100="; + const std::string longerCigar = "100=10D100=10I100X"; + + BamRecordImpl bam; + bam.SetSequenceAndQualities(sequence, qualities); + bam.CigarData(longerCigar); + bam.CigarData(cigar); + + EXPECT_EQ(sequence, bam.Sequence()); + EXPECT_EQ(qualities, bam.Qualities().Fastq()); + EXPECT_EQ(cigar, bam.CigarData().ToStdString()); + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, SeqQualCigar_ThenOverwriteWithEmptyCigar) +{ + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = "?]?]?]?]?]?]"; + const std::string cigar = "100="; + const std::string empty = ""; + + BamRecordImpl bam; + bam.SetSequenceAndQualities(sequence, qualities); + bam.CigarData(cigar); + bam.CigarData(empty); + + EXPECT_EQ(sequence, bam.Sequence()); + EXPECT_EQ(qualities, bam.Qualities().Fastq()); + EXPECT_EQ(empty, bam.CigarData().ToStdString()); + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, SeqQualCigarTag_Init_Normal) +{ + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = "?]?]?]?]?]?]"; + const std::string cigar = "100="; + + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector({34, 5, 125}); + tags["XY"] = (int32_t)-42; + + BamRecordImpl bam; + bam.SetSequenceAndQualities(sequence, qualities); + bam.CigarData(cigar); + bam.Tags(tags); + + EXPECT_EQ(sequence, bam.Sequence()); + EXPECT_EQ(qualities, bam.Qualities().Fastq()); + EXPECT_EQ(cigar, bam.CigarData().ToStdString()); + + std::string expected; + expected.append("CA:B:C,34,5,125"); + expected.append("\t"); + expected.append("HX:H:1abc75"); + expected.append("\t"); + expected.append("XY:i:-42"); + + const std::string sam = SamTagCodec::Encode(bam.Tags()); + EXPECT_EQ(expected, sam); + + tests::CheckRawData(bam); +} + 
+TEST(BamRecordImplVariableDataTest, SeqQualCigarTag_Init_EmptySeqQual) +{ + const std::string sequence = ""; + const std::string qualities = ""; + const std::string cigar = "100="; + + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector({34, 5, 125}); + tags["XY"] = (int32_t)-42; + + BamRecordImpl bam; + bam.SetSequenceAndQualities(sequence, qualities); + bam.CigarData(cigar); + bam.Tags(tags); + + EXPECT_EQ(sequence, bam.Sequence()); + EXPECT_EQ(qualities, bam.Qualities().Fastq()); + EXPECT_EQ(cigar, bam.CigarData().ToStdString()); + + std::string expected; + expected.append("CA:B:C,34,5,125"); + expected.append("\t"); + expected.append("HX:H:1abc75"); + expected.append("\t"); + expected.append("XY:i:-42"); + + const std::string sam = SamTagCodec::Encode(bam.Tags()); + EXPECT_EQ(expected, sam); + + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, SeqQualCigarTag_Init_EmptyQual) +{ + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = ""; + const std::string cigar = "100="; + + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector({34, 5, 125}); + tags["XY"] = (int32_t)-42; + + BamRecordImpl bam; + bam.SetSequenceAndQualities(sequence, qualities); + bam.CigarData(cigar); + bam.Tags(tags); + + EXPECT_EQ(sequence, bam.Sequence()); + EXPECT_EQ(qualities, bam.Qualities().Fastq()); + EXPECT_EQ(cigar, bam.CigarData().ToStdString()); + + std::string expected; + expected.append("CA:B:C,34,5,125"); + expected.append("\t"); + expected.append("HX:H:1abc75"); + expected.append("\t"); + expected.append("XY:i:-42"); + + const std::string sam = SamTagCodec::Encode(bam.Tags()); + EXPECT_EQ(expected, sam); + + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, SeqQualCigarTag_Init_EmptyCigar) +{ + const std::string sequence = "ACGTACGTACGT"; + const std::string 
qualities = "?]?]?]?]?]?]"; + const std::string cigar = ""; + + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector({34, 5, 125}); + tags["XY"] = (int32_t)-42; + + BamRecordImpl bam; + bam.SetSequenceAndQualities(sequence, qualities); + bam.CigarData(cigar); + bam.Tags(tags); + + EXPECT_EQ(sequence, bam.Sequence()); + EXPECT_EQ(qualities, bam.Qualities().Fastq()); + EXPECT_EQ(cigar, bam.CigarData().ToStdString()); + + std::string expected; + expected.append("CA:B:C,34,5,125"); + expected.append("\t"); + expected.append("HX:H:1abc75"); + expected.append("\t"); + expected.append("XY:i:-42"); + + const std::string sam = SamTagCodec::Encode(bam.Tags()); + EXPECT_EQ(expected, sam); + + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, SeqQualCigarTag_Init_EmptyTag) +{ + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = "?]?]?]?]?]?]"; + const std::string cigar = "100="; + + BamRecordImpl bam; + bam.SetSequenceAndQualities(sequence, qualities); + bam.CigarData(cigar); + bam.Tags(TagCollection()); + + EXPECT_EQ(sequence, bam.Sequence()); + EXPECT_EQ(qualities, bam.Qualities().Fastq()); + EXPECT_EQ(cigar, bam.CigarData().ToStdString()); + EXPECT_EQ(0, bam.Tags().size()); + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, SeqQualCigarTag_ThenOverwriteWithLongerSeq_NormalQual) +{ + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = "?]?]?]?]?]?]"; + const std::string cigar = "100="; + const std::string shortSeq = "ACGT"; + const std::string shortQual = "?]?]"; + + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector({34, 5, 125}); + tags["XY"] = (int32_t)-42; + + BamRecordImpl bam; + bam.SetSequenceAndQualities(shortSeq, shortQual); + bam.CigarData(cigar); + bam.Tags(tags); + bam.SetSequenceAndQualities(sequence, qualities); + + 
EXPECT_EQ(sequence, bam.Sequence()); + EXPECT_EQ(qualities, bam.Qualities().Fastq()); + EXPECT_EQ(cigar, bam.CigarData().ToStdString()); + + std::string expected; + expected.append("CA:B:C,34,5,125"); + expected.append("\t"); + expected.append("HX:H:1abc75"); + expected.append("\t"); + expected.append("XY:i:-42"); + + const std::string sam = SamTagCodec::Encode(bam.Tags()); + EXPECT_EQ(expected, sam); + + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, SeqQualCigarTag_ThenOverwriteWithLongerSeq_EmptyQual) +{ + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = ""; + const std::string cigar = "100="; + const std::string shortSeq = "ACGT"; + const std::string shortQual = "?]?]"; + + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector({34, 5, 125}); + tags["XY"] = (int32_t)-42; + + BamRecordImpl bam; + bam.SetSequenceAndQualities(shortSeq, shortQual); + bam.CigarData(cigar); + bam.Tags(tags); + bam.SetSequenceAndQualities(sequence, qualities); + + EXPECT_EQ(sequence, bam.Sequence()); + EXPECT_EQ(qualities, bam.Qualities().Fastq()); + EXPECT_EQ(cigar, bam.CigarData().ToStdString()); + + std::string expected; + expected.append("CA:B:C,34,5,125"); + expected.append("\t"); + expected.append("HX:H:1abc75"); + expected.append("\t"); + expected.append("XY:i:-42"); + + const std::string sam = SamTagCodec::Encode(bam.Tags()); + EXPECT_EQ(expected, sam); + + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, SeqQualCigarTag_ThenOverwriteWithShorterSeq_NormalQual) +{ + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = "?]?]?]?]?]?]"; + const std::string cigar = "100="; + const std::string shortSeq = "ACGT"; + const std::string shortQual = "?]?]"; + + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector({34, 5, 125}); + tags["XY"] = 
(int32_t)-42; + + BamRecordImpl bam; + bam.SetSequenceAndQualities(sequence, qualities); + bam.CigarData(cigar); + bam.Tags(tags); + bam.SetSequenceAndQualities(shortSeq, shortQual); + + EXPECT_EQ(shortSeq, bam.Sequence()); + EXPECT_EQ(shortQual, bam.Qualities().Fastq()); + EXPECT_EQ(cigar, bam.CigarData().ToStdString()); + + std::string expected; + expected.append("CA:B:C,34,5,125"); + expected.append("\t"); + expected.append("HX:H:1abc75"); + expected.append("\t"); + expected.append("XY:i:-42"); + + const std::string sam = SamTagCodec::Encode(bam.Tags()); + EXPECT_EQ(expected, sam); + + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, SeqQualCigarTag_ThenOverwriteWithShorterSeq_EmptyQual) +{ + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = "?]?]?]?]?]?]"; + const std::string cigar = "100="; + const std::string shortSeq = "ACGT"; + const std::string shortQual = ""; + + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector({34, 5, 125}); + tags["XY"] = (int32_t)-42; + + BamRecordImpl bam; + bam.SetSequenceAndQualities(sequence, qualities); + bam.CigarData(cigar); + bam.Tags(tags); + bam.SetSequenceAndQualities(shortSeq, shortQual); + + EXPECT_EQ(shortSeq, bam.Sequence()); + EXPECT_EQ(shortQual, bam.Qualities().Fastq()); + EXPECT_EQ(cigar, bam.CigarData().ToStdString()); + + std::string expected; + expected.append("CA:B:C,34,5,125"); + expected.append("\t"); + expected.append("HX:H:1abc75"); + expected.append("\t"); + expected.append("XY:i:-42"); + + const std::string sam = SamTagCodec::Encode(bam.Tags()); + EXPECT_EQ(expected, sam); + + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, SeqQualCigarTag_ThenOverwriteWithEmptySeq) +{ + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = "?]?]?]?]?]?]"; + const std::string cigar = "100="; + const std::string empty = ""; + + TagCollection tags; + 
tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector({34, 5, 125}); + tags["XY"] = (int32_t)-42; + + BamRecordImpl bam; + bam.SetSequenceAndQualities(sequence, qualities); + bam.CigarData(cigar); + bam.Tags(tags); + bam.SetSequenceAndQualities(empty, empty); + + EXPECT_EQ(empty, bam.Sequence()); + EXPECT_EQ(empty, bam.Qualities().Fastq()); + EXPECT_EQ(cigar, bam.CigarData().ToStdString()); + + std::string expected; + expected.append("CA:B:C,34,5,125"); + expected.append("\t"); + expected.append("HX:H:1abc75"); + expected.append("\t"); + expected.append("XY:i:-42"); + + const std::string sam = SamTagCodec::Encode(bam.Tags()); + EXPECT_EQ(expected, sam); + + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, SeqQualCigarTag_ThenOverwriteWithLongerCigar) +{ + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = "?]?]?]?]?]?]"; + const std::string cigar = "100="; + const std::string longerCigar = "100=10D100=10I100X"; + + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector({34, 5, 125}); + tags["XY"] = (int32_t)-42; + + BamRecordImpl bam; + bam.SetSequenceAndQualities(sequence, qualities); + bam.CigarData(cigar); + bam.Tags(tags); + bam.CigarData(longerCigar); + + EXPECT_EQ(sequence, bam.Sequence()); + EXPECT_EQ(qualities, bam.Qualities().Fastq()); + EXPECT_EQ(longerCigar, bam.CigarData().ToStdString()); + + std::string expected; + expected.append("CA:B:C,34,5,125"); + expected.append("\t"); + expected.append("HX:H:1abc75"); + expected.append("\t"); + expected.append("XY:i:-42"); + + const std::string sam = SamTagCodec::Encode(bam.Tags()); + EXPECT_EQ(expected, sam); + + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, SeqQualCigarTag_ThenOverwriteWithShorterCigar) +{ + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = "?]?]?]?]?]?]"; + const 
std::string cigar = "100="; + const std::string longerCigar = "100=10D100=10I100X"; + + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector({34, 5, 125}); + tags["XY"] = (int32_t)-42; + + BamRecordImpl bam; + bam.SetSequenceAndQualities(sequence, qualities); + bam.CigarData(longerCigar); + bam.Tags(tags); + bam.CigarData(cigar); + + EXPECT_EQ(sequence, bam.Sequence()); + EXPECT_EQ(qualities, bam.Qualities().Fastq()); + EXPECT_EQ(cigar, bam.CigarData().ToStdString()); + + std::string expected; + expected.append("CA:B:C,34,5,125"); + expected.append("\t"); + expected.append("HX:H:1abc75"); + expected.append("\t"); + expected.append("XY:i:-42"); + + const std::string sam = SamTagCodec::Encode(bam.Tags()); + EXPECT_EQ(expected, sam); + + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, SeqQualCigarTag_ThenOverwriteWithEmptyCigar) +{ + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = "?]?]?]?]?]?]"; + const std::string cigar = "100="; + const std::string empty = ""; + + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector({34, 5, 125}); + tags["XY"] = (int32_t)-42; + + BamRecordImpl bam; + bam.SetSequenceAndQualities(sequence, qualities); + bam.CigarData(cigar); + bam.Tags(tags); + bam.CigarData(empty); + + EXPECT_EQ(sequence, bam.Sequence()); + EXPECT_EQ(qualities, bam.Qualities().Fastq()); + EXPECT_EQ(empty, bam.CigarData().ToStdString()); + + std::string expected; + expected.append("CA:B:C,34,5,125"); + expected.append("\t"); + expected.append("HX:H:1abc75"); + expected.append("\t"); + expected.append("XY:i:-42"); + + const std::string sam = SamTagCodec::Encode(bam.Tags()); + EXPECT_EQ(expected, sam); + + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, SeqQualCigarTag_ThenOverwriteWithLongerTags) +{ + const std::string sequence = 
"ACGTACGTACGT"; + const std::string qualities = "?]?]?]?]?]?]"; + const std::string cigar = "100="; + + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector({34, 5, 125}); + + TagCollection longerTags; + longerTags["HX"] = std::string("1abc75"); + longerTags["HX"].Modifier(TagModifier::HEX_STRING); + longerTags["CA"] = std::vector({34, 5, 125}); + longerTags["XY"] = (int32_t)-42; + + BamRecordImpl bam; + bam.SetSequenceAndQualities(sequence, qualities); + bam.CigarData(cigar); + bam.Tags(tags); + bam.Tags(longerTags); + + EXPECT_EQ(sequence, bam.Sequence()); + EXPECT_EQ(qualities, bam.Qualities().Fastq()); + EXPECT_EQ(cigar, bam.CigarData().ToStdString()); + + std::string expected; + expected.append("CA:B:C,34,5,125"); + expected.append("\t"); + expected.append("HX:H:1abc75"); + expected.append("\t"); + expected.append("XY:i:-42"); + + const std::string sam = SamTagCodec::Encode(bam.Tags()); + EXPECT_EQ(expected, sam); + + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, SeqQualCigarTag_ThenOverwriteWithShorterTags) +{ + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = "?]?]?]?]?]?]"; + const std::string cigar = "100="; + + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector({34, 5, 125}); + + TagCollection longerTags; + longerTags["HX"] = std::string("1abc75"); + longerTags["HX"].Modifier(TagModifier::HEX_STRING); + longerTags["CA"] = std::vector({34, 5, 125}); + longerTags["XY"] = (int32_t)-42; + + BamRecordImpl bam; + bam.SetSequenceAndQualities(sequence, qualities); + bam.CigarData(cigar); + bam.Tags(longerTags); + bam.Tags(tags); + + EXPECT_EQ(sequence, bam.Sequence()); + EXPECT_EQ(qualities, bam.Qualities().Fastq()); + EXPECT_EQ(cigar, bam.CigarData().ToStdString()); + + std::string expected; + expected.append("CA:B:C,34,5,125"); + expected.append("\t"); 
+ expected.append("HX:H:1abc75"); + + const std::string sam = SamTagCodec::Encode(bam.Tags()); + EXPECT_EQ(expected, sam); + + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, SeqQualCigarTag_ThenOverwriteWithEmptyTags) +{ + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = "?]?]?]?]?]?]"; + const std::string cigar = "100="; + + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector({34, 5, 125}); + tags["XY"] = (int32_t)-42; + + BamRecordImpl bam; + bam.SetSequenceAndQualities(sequence, qualities); + bam.CigarData(cigar); + bam.Tags(tags); + bam.Tags(TagCollection()); + + EXPECT_EQ(sequence, bam.Sequence()); + EXPECT_EQ(qualities, bam.Qualities().Fastq()); + EXPECT_EQ(cigar, bam.CigarData().ToStdString()); + EXPECT_EQ(0, bam.Tags().size()); + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, NameOnly_InitEmpty) +{ + BamRecordImpl bam; + bam.Name(std::string()); + EXPECT_EQ(0, bam.Name().size()); + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, NameOnly_InitNormal) +{ + const std::string readName = "foo"; + + BamRecordImpl bam; + bam.Name(readName); + + EXPECT_EQ(readName, bam.Name()); + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, NameOnly_ThenOverwriteWithLongerName) +{ + const std::string readName = "foo"; + const std::string longerName = "this is a long read name"; + + BamRecordImpl bam; + bam.Name(readName); + bam.Name(longerName); + + EXPECT_EQ(longerName, bam.Name()); + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, NameOnly_ThenOverwriteWithShorterName) +{ + const std::string readName = "foo"; + const std::string longerName = "this is a long read name"; + + BamRecordImpl bam; + bam.Name(longerName); + bam.Name(readName); + + EXPECT_EQ(readName, bam.Name()); + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, 
NameOnly_ThenOverwriteWithEmptyName) +{ + const std::string readName = "foo"; + const std::string emptyName = ""; + + BamRecordImpl bam; + bam.Name(readName); + bam.Name(emptyName); + + EXPECT_EQ(emptyName, bam.Name()); + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, NameTag_Init_Normal) +{ + const std::string readName = "foo"; + + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector({34, 5, 125}); + tags["XY"] = (int32_t)-42; + + BamRecordImpl bam; + bam.Name(readName); + bam.Tags(tags); + + EXPECT_EQ(readName, bam.Name()); + + std::string expected; + expected.append("CA:B:C,34,5,125"); + expected.append("\t"); + expected.append("HX:H:1abc75"); + expected.append("\t"); + expected.append("XY:i:-42"); + + const std::string sam = SamTagCodec::Encode(bam.Tags()); + EXPECT_EQ(expected, sam); + + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, NameTag_Init_EmptyName) +{ + const std::string readName = ""; + + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector({34, 5, 125}); + tags["XY"] = (int32_t)-42; + + BamRecordImpl bam; + bam.Name(readName); + bam.Tags(tags); + + EXPECT_EQ(readName, bam.Name()); + + std::string expected; + expected.append("CA:B:C,34,5,125"); + expected.append("\t"); + expected.append("HX:H:1abc75"); + expected.append("\t"); + expected.append("XY:i:-42"); + + const std::string sam = SamTagCodec::Encode(bam.Tags()); + EXPECT_EQ(expected, sam); + + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, NameTag_Init_EmptyTag) +{ + const std::string readName = "foo"; + + BamRecordImpl bam; + bam.Name(readName); + bam.Tags(TagCollection()); + + EXPECT_EQ(readName, bam.Name()); + EXPECT_EQ(0, bam.Tags().size()); + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, NameTag_ThenOverwriteWithLongerName) +{ + const std::string 
readName = "foo"; + const std::string longerName = "this is a long read name"; + + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector({34, 5, 125}); + tags["XY"] = (int32_t)-42; + + BamRecordImpl bam; + bam.Name(readName); + bam.Tags(tags); + bam.Name(longerName); + + EXPECT_EQ(longerName, bam.Name()); + + std::string expected; + expected.append("CA:B:C,34,5,125"); + expected.append("\t"); + expected.append("HX:H:1abc75"); + expected.append("\t"); + expected.append("XY:i:-42"); + + const std::string sam = SamTagCodec::Encode(bam.Tags()); + EXPECT_EQ(expected, sam); + + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, NameTag_ThenOverwriteWithShorterName) +{ + const std::string readName = "foo"; + const std::string longerName = "this is a long read name"; + + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector({34, 5, 125}); + tags["XY"] = (int32_t)-42; + + BamRecordImpl bam; + bam.Name(longerName); + bam.Tags(tags); + bam.Name(readName); + + EXPECT_EQ(readName, bam.Name()); + + std::string expected; + expected.append("CA:B:C,34,5,125"); + expected.append("\t"); + expected.append("HX:H:1abc75"); + expected.append("\t"); + expected.append("XY:i:-42"); + + const std::string sam = SamTagCodec::Encode(bam.Tags()); + EXPECT_EQ(expected, sam); + + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, NameTag_ThenOverwriteWithEmptyName) +{ + const std::string readName = "foo"; + const std::string empty = ""; + + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector({34, 5, 125}); + tags["XY"] = (int32_t)-42; + + BamRecordImpl bam; + bam.Name(readName); + bam.Tags(tags); + bam.Name(empty); + + EXPECT_EQ(empty, bam.Name()); + + std::string expected; + expected.append("CA:B:C,34,5,125"); + 
expected.append("\t"); + expected.append("HX:H:1abc75"); + expected.append("\t"); + expected.append("XY:i:-42"); + + const std::string sam = SamTagCodec::Encode(bam.Tags()); + EXPECT_EQ(expected, sam); + + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, NameTag_ThenOverwriteWithLongerTags) +{ + const std::string readName = "foo"; + + TagCollection longerTags; + longerTags["HX"] = std::string("1abc75"); + longerTags["HX"].Modifier(TagModifier::HEX_STRING); + longerTags["CA"] = std::vector({34, 5, 125}); + longerTags["XY"] = (int32_t)-42; + + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector({34, 5, 125}); + + BamRecordImpl bam; + bam.Name(readName); + bam.Tags(tags); + bam.Tags(longerTags); + + EXPECT_EQ(readName, bam.Name()); + + std::string expected; + expected.append("CA:B:C,34,5,125"); + expected.append("\t"); + expected.append("HX:H:1abc75"); + expected.append("\t"); + expected.append("XY:i:-42"); + + const std::string sam = SamTagCodec::Encode(bam.Tags()); + EXPECT_EQ(expected, sam); + + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, NameTag_ThenOverwriteWithShorterTags) +{ + const std::string readName = "foo"; + + TagCollection longerTags; + longerTags["HX"] = std::string("1abc75"); + longerTags["HX"].Modifier(TagModifier::HEX_STRING); + longerTags["CA"] = std::vector({34, 5, 125}); + longerTags["XY"] = (int32_t)-42; + + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector({34, 5, 125}); + + BamRecordImpl bam; + bam.Name(readName); + bam.Tags(longerTags); + bam.Tags(tags); + + EXPECT_EQ(readName, bam.Name()); + + std::string expected; + expected.append("CA:B:C,34,5,125"); + expected.append("\t"); + expected.append("HX:H:1abc75"); + + const std::string sam = SamTagCodec::Encode(bam.Tags()); + EXPECT_EQ(expected, sam); + + tests::CheckRawData(bam); +} + 
+TEST(BamRecordImplVariableDataTest, NameTag_ThenOverwriteWithEmptyTags) +{ + const std::string readName = "foo"; + + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector({34, 5, 125}); + tags["XY"] = (int32_t)-42; + + BamRecordImpl bam; + bam.Name(readName); + bam.Tags(tags); + bam.Tags(TagCollection()); + + EXPECT_EQ(readName, bam.Name()); + EXPECT_EQ(0, bam.Tags().size()); + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, NameCigar_Init_Normal) +{ + const std::string readName = "foo"; + const std::string cigar = "100="; + + BamRecordImpl bam; + bam.Name(readName); + bam.CigarData(cigar); + + EXPECT_EQ(readName, bam.Name()); + EXPECT_EQ(cigar, bam.CigarData().ToStdString()); + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, NameCigar_Init_EmptyName) +{ + const std::string readName = ""; + const std::string cigar = "100="; + + BamRecordImpl bam; + bam.Name(readName); + bam.CigarData(cigar); + + EXPECT_EQ(readName, bam.Name()); + EXPECT_EQ(cigar, bam.CigarData().ToStdString()); + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, NameCigar_Init_EmptyCigar) +{ + const std::string readName = "foo"; + const std::string cigar = ""; + + BamRecordImpl bam; + bam.Name(readName); + bam.CigarData(cigar); + + EXPECT_EQ(readName, bam.Name()); + EXPECT_EQ(cigar, bam.CigarData().ToStdString()); + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, NameCigar_ThenOverwriteWithLongerName) +{ + const std::string readName = "foo"; + const std::string cigar = "100="; + const std::string longerName = "this is a long read name"; + + BamRecordImpl bam; + bam.Name(readName); + bam.CigarData(cigar); + bam.Name(longerName); + + EXPECT_EQ(longerName, bam.Name()); + EXPECT_EQ(cigar, bam.CigarData().ToStdString()); + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, NameCigar_ThenOverwriteWithShorterName) +{ + const 
std::string readName = "foo"; + const std::string cigar = "100="; + const std::string longerName = "this is a long read name"; + + BamRecordImpl bam; + bam.Name(longerName); + bam.CigarData(cigar); + bam.Name(readName); + + EXPECT_EQ(readName, bam.Name()); + EXPECT_EQ(cigar, bam.CigarData().ToStdString()); + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, NameCigar_ThenOverwriteWithEmptyName) +{ + const std::string readName = "foo"; + const std::string cigar = "100="; + const std::string empty = ""; + + BamRecordImpl bam; + bam.Name(readName); + bam.CigarData(cigar); + bam.Name(empty); + + EXPECT_EQ(empty, bam.Name()); + EXPECT_EQ(cigar, bam.CigarData().ToStdString()); + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, NameCigar_ThenOverwriteWithLongerCigar) +{ + const std::string readName = "foo"; + const std::string cigar = "100="; + const std::string longerCigar = "100=10D100=10I100X"; + + BamRecordImpl bam; + bam.Name(readName); + bam.CigarData(cigar); + bam.CigarData(longerCigar); + + EXPECT_EQ(readName, bam.Name()); + EXPECT_EQ(longerCigar, bam.CigarData().ToStdString()); + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, NameCigar_ThenOverwriteWithShorterCigar) +{ + const std::string readName = "foo"; + const std::string cigar = "100="; + const std::string longerCigar = "100=10D100=10I100X"; + + BamRecordImpl bam; + bam.Name(readName); + bam.CigarData(longerCigar); + bam.CigarData(cigar); + + EXPECT_EQ(readName, bam.Name()); + EXPECT_EQ(cigar, bam.CigarData().ToStdString()); + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, NameCigar_ThenOverwriteWithEmptyCigar) +{ + const std::string readName = "foo"; + const std::string cigar = "100="; + const std::string empty = ""; + + BamRecordImpl bam; + bam.Name(readName); + bam.CigarData(cigar); + bam.CigarData(empty); + + EXPECT_EQ(readName, bam.Name()); + EXPECT_EQ(empty, bam.CigarData().ToStdString()); + tests::CheckRawData(bam); +} + 
+// Name + CIGAR + tags: all three read back unchanged.
+// ("CA" is a uint8_t array; the expected SAM text below encodes it as B:C.)
+TEST(BamRecordImplVariableDataTest, NameCigarTag_Init_Normal)
+{
+    const std::string readName = "foo";
+    const std::string cigar = "100=";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+// Empty name + CIGAR + tags.
+TEST(BamRecordImplVariableDataTest, NameCigarTag_Init_EmptyName)
+{
+    const std::string readName = "";
+    const std::string cigar = "100=";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+// Name + empty CIGAR + tags.
+TEST(BamRecordImplVariableDataTest, NameCigarTag_Init_EmptyCigar)
+{
+    const std::string readName = "foo";
+    const std::string cigar = "";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+// Name + CIGAR + empty tag collection.
+TEST(BamRecordImplVariableDataTest, NameCigarTag_Init_EmptyTag)
+{
+    const std::string readName = "foo";
+    const std::string cigar = "100=";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.CigarData(cigar);
+    bam.Tags(TagCollection());
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+    EXPECT_EQ(0, bam.Tags().size());
+    tests::CheckRawData(bam);
+}
+
+// Name + CIGAR + tags, then a longer name: CIGAR and tags are unaffected.
+TEST(BamRecordImplVariableDataTest, NameCigarTag_ThenOverwriteWithLongerName)
+{
+    const std::string readName = "foo";
+    const std::string cigar = "100=";
+    const std::string longerName = "this is a long read name";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.Name(longerName);
+
+    EXPECT_EQ(longerName, bam.Name());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+// Long name + CIGAR + tags, then a shorter name: CIGAR and tags are unaffected.
+TEST(BamRecordImplVariableDataTest, NameCigarTag_ThenOverwriteWithShorterName)
+{
+    const std::string readName = "foo";
+    const std::string cigar = "100=";
+    const std::string longerName = "this is a long read name";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(longerName);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.Name(readName);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+// Name + CIGAR + tags, then an empty name: CIGAR and tags are unaffected.
+TEST(BamRecordImplVariableDataTest, NameCigarTag_ThenOverwriteWithEmptyName)
+{
+    const std::string readName = "foo";
+    const std::string cigar = "100=";
+    const std::string empty = "";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.Name(empty);
+
+    EXPECT_EQ(empty, bam.Name());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+// Name + CIGAR + tags, then a longer CIGAR: name and tags are unaffected.
+TEST(BamRecordImplVariableDataTest, NameCigarTag_ThenOverwriteWithLongerCigar)
+{
+    const std::string readName = "foo";
+    const std::string cigar = "100=";
+    const std::string longerCigar = "100=10D100=10I100X";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.CigarData(longerCigar);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(longerCigar, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+// Name + long CIGAR + tags, then a shorter CIGAR: name and tags are unaffected.
+TEST(BamRecordImplVariableDataTest, NameCigarTag_ThenOverwriteWithShorterCigar)
+{
+    const std::string readName = "foo";
+    const std::string cigar = "100=";
+    const std::string longerCigar = "100=10D100=10I100X";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.CigarData(longerCigar);
+    bam.Tags(tags);
+    bam.CigarData(cigar);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+// Name + CIGAR + tags, then an empty CIGAR: name and tags are unaffected.
+TEST(BamRecordImplVariableDataTest, NameCigarTag_ThenOverwriteWithEmptyCigar)
+{
+    const std::string readName = "foo";
+    const std::string cigar = "100=";
+    const std::string empty = "";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.CigarData(empty);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(empty, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+// Name + CIGAR + two tags, then replaced by a three-tag collection: all three tags are encoded.
+TEST(BamRecordImplVariableDataTest, NameCigarTag_ThenOverwriteWithLongerTags)
+{
+    const std::string readName = "foo";
+    const std::string cigar = "100=";
+
+    TagCollection longerTags;
+    longerTags["HX"] = std::string("1abc75");
+    longerTags["HX"].Modifier(TagModifier::HEX_STRING);
+    longerTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    longerTags["XY"] = (int32_t)-42;
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.Tags(longerTags);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+// Name + CIGAR + three tags, then replaced by a two-tag collection: only the two tags are encoded.
+TEST(BamRecordImplVariableDataTest, NameCigarTag_ThenOverwriteWithShorterTags)
+{
+    const std::string readName = "foo";
+    const std::string cigar = "100=";
+
+    TagCollection longerTags;
+    longerTags["HX"] = std::string("1abc75");
+    longerTags["HX"].Modifier(TagModifier::HEX_STRING);
+    longerTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    longerTags["XY"] = (int32_t)-42;
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.CigarData(cigar);
+    bam.Tags(longerTags);
+    bam.Tags(tags);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+// Name + CIGAR + tags, then tags replaced by an empty collection: name and CIGAR are unaffected.
+TEST(BamRecordImplVariableDataTest, NameCigarTag_ThenOverwriteWithEmptyTags)
+{
+    const std::string readName = "foo";
+    const std::string cigar = "100=";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.Tags(TagCollection());
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+    EXPECT_EQ(0, bam.Tags().size());
+    tests::CheckRawData(bam);
+}
+
+// Name + sequence/qualities: all values read back unchanged.
+TEST(BamRecordImplVariableDataTest, NameSeqQual_Init_Normal)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    tests::CheckRawData(bam);
+}
+
+// Name + empty sequence and empty qualities.
+TEST(BamRecordImplVariableDataTest, NameSeqQual_Init_EmptySeqQual)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "";
+    const std::string qualities = "";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    tests::CheckRawData(bam);
+}
+
+// Name + sequence with empty qualities.
+TEST(BamRecordImplVariableDataTest, NameSeqQual_Init_EmptyQual)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    tests::CheckRawData(bam);
+}
+
+// Name + sequence/qualities, then a longer name: sequence and qualities are unaffected.
+TEST(BamRecordImplVariableDataTest, NameSeqQual_ThenOverwriteWithLongerName)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string longerName = "this is a long read name";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Name(longerName);
+
+    EXPECT_EQ(longerName, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    tests::CheckRawData(bam);
+}
+
+// Long name + sequence/qualities, then a shorter name: sequence and qualities are unaffected.
+TEST(BamRecordImplVariableDataTest, NameSeqQual_ThenOverwriteWithShorterName)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string longerName = "this is a long read name";
+
+    BamRecordImpl bam;
+    bam.Name(longerName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Name(readName);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    tests::CheckRawData(bam);
+}
+
+// Name + sequence/qualities, then an empty name: sequence and qualities are unaffected.
+TEST(BamRecordImplVariableDataTest, NameSeqQual_ThenOverwriteWithEmptyName)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string empty = "";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Name(empty);
+
+    EXPECT_EQ(empty, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    tests::CheckRawData(bam);
+}
+
+// Short sequence/qualities replaced by a longer sequence with qualities.
+TEST(BamRecordImplVariableDataTest, NameSeqQual_ThenOverwriteWithLongerSeq_NormalQual)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string shortSeq = "ACGT";
+    const std::string shortQual = "?]?]";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+    bam.SetSequenceAndQualities(sequence, qualities);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    tests::CheckRawData(bam);
+}
+
+// Short sequence/qualities replaced by a longer sequence with empty qualities.
+TEST(BamRecordImplVariableDataTest, NameSeqQual_ThenOverwriteWithLongerSeq_EmptyQual)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "";
+    const std::string shortSeq = "ACGT";
+    const std::string shortQual = "?]?]";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+    bam.SetSequenceAndQualities(sequence, qualities);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    tests::CheckRawData(bam);
+}
+
+// Long sequence/qualities replaced by a shorter sequence with qualities.
+TEST(BamRecordImplVariableDataTest, NameSeqQual_ThenOverwriteWithShorterSeq_NormalQual)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string shortSeq = "ACGT";
+    const std::string shortQual = "?]?]";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(shortSeq, bam.Sequence());
+    EXPECT_EQ(shortQual, bam.Qualities().Fastq());
+    tests::CheckRawData(bam);
+}
+
+// Long sequence/qualities replaced by a shorter sequence with empty qualities.
+TEST(BamRecordImplVariableDataTest, NameSeqQual_ThenOverwriteWithShorterSeq_EmptyQual)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string shortSeq = "ACGT";
+    const std::string shortQual = "";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(shortSeq, bam.Sequence());
+    EXPECT_EQ(shortQual, bam.Qualities().Fastq());
+    tests::CheckRawData(bam);
+}
+
+// Sequence/qualities replaced by empty sequence and empty qualities.
+TEST(BamRecordImplVariableDataTest, NameSeqQual_ThenOverwriteWithEmptySeq)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string empty = "";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.SetSequenceAndQualities(empty, empty);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(empty, bam.Sequence());
+    EXPECT_EQ(empty, bam.Qualities().Fastq());
+    tests::CheckRawData(bam);
+}
+
+// Name + sequence/qualities + tags: all values read back unchanged.
+TEST(BamRecordImplVariableDataTest, NameSeqQualTag_Init_Normal)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(tags);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+// Empty name + sequence/qualities + tags.
+TEST(BamRecordImplVariableDataTest, NameSeqQualTag_Init_EmptyName)
+{
+    const std::string readName = "";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(tags);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+// Name + empty sequence/qualities + tags.
+TEST(BamRecordImplVariableDataTest, NameSeqQualTag_Init_EmptySeqQual)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "";
+    const std::string qualities = "";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(tags);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+// Name + sequence with empty qualities + tags.
+TEST(BamRecordImplVariableDataTest, NameSeqQualTag_Init_EmptyQual)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(tags);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+// Name + sequence/qualities + empty tag collection.
+TEST(BamRecordImplVariableDataTest, NameSeqQualTag_Init_EmptyTag)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(TagCollection());
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(0, bam.Tags().size());
+
+    tests::CheckRawData(bam);
+}
+
+// Name + sequence/qualities + tags, then a longer name: the other fields are unaffected.
+TEST(BamRecordImplVariableDataTest, NameSeqQualTag_ThenOverwriteWithLongerName)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string longerName = "this is a long read name";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(tags);
+    bam.Name(longerName);
+
+    EXPECT_EQ(longerName, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+// Long name + sequence/qualities + tags, then a shorter name: the other fields are unaffected.
+// ("CA" is a uint8_t array; the expected SAM text below encodes it as B:C.)
+TEST(BamRecordImplVariableDataTest, NameSeqQualTag_ThenOverwriteWithShorterName)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string longerName = "this is a long read name";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(longerName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(tags);
+    bam.Name(readName);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+// Name + sequence/qualities + tags, then an empty name: the other fields are unaffected.
+TEST(BamRecordImplVariableDataTest, NameSeqQualTag_ThenOverwriteWithEmptyName)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string empty = "";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(tags);
+    bam.Name(empty);
+
+    EXPECT_EQ(empty, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+// Short sequence/qualities + tags, then a longer sequence with qualities: name and tags are unaffected.
+TEST(BamRecordImplVariableDataTest, NameSeqQualTag_ThenOverwriteWithLongerSeq_NormalQual)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string shortSeq = "ACGT";
+    const std::string shortQual = "?]?]";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+    bam.Tags(tags);
+    bam.SetSequenceAndQualities(sequence, qualities);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+// Short sequence/qualities + tags, then a longer sequence with empty qualities.
+TEST(BamRecordImplVariableDataTest, NameSeqQualTag_ThenOverwriteWithLongerSeq_EmptyQual)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "";
+    const std::string shortSeq = "ACGT";
+    const std::string shortQual = "?]?]";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+    bam.Tags(tags);
+    bam.SetSequenceAndQualities(sequence, qualities);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+// Long sequence/qualities + tags, then a shorter sequence with qualities: name and tags are unaffected.
+TEST(BamRecordImplVariableDataTest, NameSeqQualTag_ThenOverwriteWithShorterSeq_NormalQual)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string shortSeq = "ACGT";
+    const std::string shortQual = "?]?]";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(tags);
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(shortSeq, bam.Sequence());
+    EXPECT_EQ(shortQual, bam.Qualities().Fastq());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+// Long sequence/qualities + tags, then a shorter sequence with empty qualities.
+TEST(BamRecordImplVariableDataTest, NameSeqQualTag_ThenOverwriteWithShorterSeq_EmptyQual)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string shortSeq = "ACGT";
+    const std::string shortQual = "";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(tags);
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(shortSeq, bam.Sequence());
+    EXPECT_EQ(shortQual, bam.Qualities().Fastq());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+// Sequence/qualities + tags, then empty sequence and qualities: name and tags are unaffected.
+TEST(BamRecordImplVariableDataTest, NameSeqQualTag_ThenOverwriteWithEmptySeq)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string empty = "";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(tags);
+    bam.SetSequenceAndQualities(empty, empty);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(empty, bam.Sequence());
+    EXPECT_EQ(empty, bam.Qualities().Fastq());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+// Sequence/qualities + two tags, then replaced by a three-tag collection: all three tags are encoded.
+TEST(BamRecordImplVariableDataTest, NameSeqQualTag_ThenOverwriteWithLongerTags)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+
+    TagCollection longerTags;
+    longerTags["HX"] = std::string("1abc75");
+    longerTags["HX"].Modifier(TagModifier::HEX_STRING);
+    longerTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    longerTags["XY"] = (int32_t)-42;
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(tags);
+    bam.Tags(longerTags);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+// Sequence/qualities + three tags, then replaced by a two-tag collection: only the two tags are encoded.
+TEST(BamRecordImplVariableDataTest, NameSeqQualTag_ThenOverwriteWithShorterTags)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+
+    TagCollection longerTags;
+    longerTags["HX"] = std::string("1abc75");
+    longerTags["HX"].Modifier(TagModifier::HEX_STRING);
+    longerTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    longerTags["XY"] = (int32_t)-42;
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(longerTags);
+    bam.Tags(tags);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+// Sequence/qualities + tags, then tags replaced by an empty collection: the other fields are unaffected.
+TEST(BamRecordImplVariableDataTest, NameSeqQualTag_ThenOverwriteWithEmptyTags)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(tags);
+    bam.Tags(TagCollection());
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(0, bam.Tags().size());
+    tests::CheckRawData(bam);
+}
+
+// Name + sequence/qualities + CIGAR: all values read back unchanged.
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigar_Init_Normal)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+    tests::CheckRawData(bam);
+}
+
+// Empty name + sequence/qualities + CIGAR.
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigar_Init_EmptyName)
+{
+    const std::string readName = "";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+    tests::CheckRawData(bam);
+}
+
+// Name + empty sequence/qualities + CIGAR.
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigar_Init_EmptySeqQual)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "";
+    const std::string qualities = "";
+    const std::string cigar = "100=";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+    tests::CheckRawData(bam);
+}
+
+// Name + sequence with empty qualities + CIGAR.
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigar_Init_EmptyQual)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "";
+    const std::string cigar = "100=";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+    tests::CheckRawData(bam);
+}
+
+// Name + sequence/qualities + empty CIGAR.
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigar_Init_EmptyCigar)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+    tests::CheckRawData(bam);
+}
+
+// Name + sequence/qualities + CIGAR, then a longer name: the other fields are unaffected.
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigar_ThenOverwriteWithLongerName)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+    const std::string longerName = "this is a long read name";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Name(longerName);
+
+    EXPECT_EQ(longerName, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigar_ThenOverwriteWithShorterName)
+{
+    const std::string readName = "foo";
+    const 
std::string sequence = "ACGTACGTACGT"; + const std::string qualities = "?]?]?]?]?]?]"; + const std::string cigar = "100="; + const std::string longerName = "this is a long read name"; + + BamRecordImpl bam; + bam.Name(longerName); + bam.SetSequenceAndQualities(sequence, qualities); + bam.CigarData(cigar); + bam.Name(readName); + + EXPECT_EQ(readName, bam.Name()); + EXPECT_EQ(sequence, bam.Sequence()); + EXPECT_EQ(qualities, bam.Qualities().Fastq()); + EXPECT_EQ(cigar, bam.CigarData().ToStdString()); + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, NameSeqQualCigar_ThenOverwriteWithEmptyName) +{ + const std::string readName = "foo"; + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = "?]?]?]?]?]?]"; + const std::string cigar = "100="; + const std::string empty = ""; + + BamRecordImpl bam; + bam.Name(readName); + bam.SetSequenceAndQualities(sequence, qualities); + bam.CigarData(cigar); + bam.Name(empty); + + EXPECT_EQ(empty, bam.Name()); + EXPECT_EQ(sequence, bam.Sequence()); + EXPECT_EQ(qualities, bam.Qualities().Fastq()); + EXPECT_EQ(cigar, bam.CigarData().ToStdString()); + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, NameSeqQualCigar_ThenOverwriteWithLongerSeq_NormalQual) +{ + const std::string readName = "foo"; + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = "?]?]?]?]?]?]"; + const std::string cigar = "100="; + const std::string shortSeq = "ACGT"; + const std::string shortQual = "?]?]"; + + BamRecordImpl bam; + bam.Name(readName); + bam.SetSequenceAndQualities(shortSeq, shortQual); + bam.CigarData(cigar); + bam.SetSequenceAndQualities(sequence, qualities); + + EXPECT_EQ(readName, bam.Name()); + EXPECT_EQ(sequence, bam.Sequence()); + EXPECT_EQ(qualities, bam.Qualities().Fastq()); + EXPECT_EQ(cigar, bam.CigarData().ToStdString()); + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, NameSeqQualCigar_ThenOverwriteWithLongerSeq_EmptyQual) +{ + const 
std::string readName = "foo"; + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = ""; + const std::string cigar = "100="; + const std::string shortSeq = "ACGT"; + const std::string shortQual = "?]?]"; + + BamRecordImpl bam; + bam.Name(readName); + bam.SetSequenceAndQualities(shortSeq, shortQual); + bam.CigarData(cigar); + bam.SetSequenceAndQualities(sequence, qualities); + + EXPECT_EQ(readName, bam.Name()); + EXPECT_EQ(sequence, bam.Sequence()); + EXPECT_EQ(qualities, bam.Qualities().Fastq()); + EXPECT_EQ(cigar, bam.CigarData().ToStdString()); + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, NameSeqQualCigar_ThenOverwriteWithShorterSeq_NormalQual) +{ + const std::string readName = "foo"; + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = "?]?]?]?]?]?]"; + const std::string cigar = "100="; + const std::string shortSeq = "ACGT"; + const std::string shortQual = "?]?]"; + + BamRecordImpl bam; + bam.Name(readName); + bam.SetSequenceAndQualities(sequence, qualities); + bam.CigarData(cigar); + bam.SetSequenceAndQualities(shortSeq, shortQual); + + EXPECT_EQ(readName, bam.Name()); + EXPECT_EQ(shortSeq, bam.Sequence()); + EXPECT_EQ(shortQual, bam.Qualities().Fastq()); + EXPECT_EQ(cigar, bam.CigarData().ToStdString()); + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, NameSeqQualCigar_ThenOverwriteWithShorterSeq_EmptyQual) +{ + const std::string readName = "foo"; + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = "?]?]?]?]?]?]"; + const std::string cigar = "100="; + const std::string shortSeq = "ACGT"; + const std::string shortQual = ""; + + BamRecordImpl bam; + bam.Name(readName); + bam.SetSequenceAndQualities(sequence, qualities); + bam.CigarData(cigar); + bam.SetSequenceAndQualities(shortSeq, shortQual); + + EXPECT_EQ(readName, bam.Name()); + EXPECT_EQ(shortSeq, bam.Sequence()); + EXPECT_EQ(shortQual, bam.Qualities().Fastq()); + EXPECT_EQ(cigar, 
bam.CigarData().ToStdString()); + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, NameSeqQualCigar_ThenOverwriteWithEmptySeq) +{ + const std::string readName = "foo"; + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = "?]?]?]?]?]?]"; + const std::string cigar = "100="; + const std::string empty = ""; + + BamRecordImpl bam; + bam.Name(readName); + bam.SetSequenceAndQualities(sequence, qualities); + bam.CigarData(cigar); + bam.SetSequenceAndQualities(empty, empty); + + EXPECT_EQ(readName, bam.Name()); + EXPECT_EQ(empty, bam.Sequence()); + EXPECT_EQ(empty, bam.Qualities().Fastq()); + EXPECT_EQ(cigar, bam.CigarData().ToStdString()); + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, NameSeqQualCigar_ThenOverwriteWithLongerCigar) +{ + const std::string readName = "foo"; + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = "?]?]?]?]?]?]"; + const std::string cigar = "100="; + const std::string longerCigar = "100=10D100=10I100X"; + + BamRecordImpl bam; + bam.Name(readName); + bam.SetSequenceAndQualities(sequence, qualities); + bam.CigarData(cigar); + bam.CigarData(longerCigar); + + EXPECT_EQ(readName, bam.Name()); + EXPECT_EQ(sequence, bam.Sequence()); + EXPECT_EQ(qualities, bam.Qualities().Fastq()); + EXPECT_EQ(longerCigar, bam.CigarData().ToStdString()); + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, NameSeqQualCigar_ThenOverwriteWithShorterCigar) +{ + const std::string readName = "foo"; + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = "?]?]?]?]?]?]"; + const std::string cigar = "100="; + const std::string longerCigar = "100=10D100=10I100X"; + + BamRecordImpl bam; + bam.Name(readName); + bam.SetSequenceAndQualities(sequence, qualities); + bam.CigarData(longerCigar); + bam.CigarData(cigar); + + EXPECT_EQ(readName, bam.Name()); + EXPECT_EQ(sequence, bam.Sequence()); + EXPECT_EQ(qualities, bam.Qualities().Fastq()); + 
EXPECT_EQ(cigar, bam.CigarData().ToStdString()); + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, NameSeqQualCigar_ThenOverwriteWithEmptyCigar) +{ + const std::string readName = "foo"; + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = "?]?]?]?]?]?]"; + const std::string cigar = "100="; + const std::string empty = ""; + + BamRecordImpl bam; + bam.Name(readName); + bam.SetSequenceAndQualities(sequence, qualities); + bam.CigarData(cigar); + bam.CigarData(empty); + + EXPECT_EQ(readName, bam.Name()); + EXPECT_EQ(sequence, bam.Sequence()); + EXPECT_EQ(qualities, bam.Qualities().Fastq()); + EXPECT_EQ(empty, bam.CigarData().ToStdString()); + tests::CheckRawData(bam); +} + +// @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + +TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_Init_Normal) +{ + const std::string readName = "foo"; + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = "?]?]?]?]?]?]"; + const std::string cigar = "100="; + + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector<uint8_t>({34, 5, 125}); + tags["XY"] = (int32_t)-42; + + BamRecordImpl bam; + bam.Name(readName); + bam.SetSequenceAndQualities(sequence, qualities); + bam.CigarData(cigar); + bam.Tags(tags); + + EXPECT_EQ(readName, bam.Name()); + EXPECT_EQ(sequence, bam.Sequence()); + EXPECT_EQ(qualities, bam.Qualities().Fastq()); + EXPECT_EQ(cigar, bam.CigarData().ToStdString()); + + std::string expected; + expected.append("CA:B:C,34,5,125"); + expected.append("\t"); + expected.append("HX:H:1abc75"); + expected.append("\t"); + expected.append("XY:i:-42"); + + const std::string sam = SamTagCodec::Encode(bam.Tags()); + EXPECT_EQ(expected, sam); + + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_Init_EmptyName) +{ + const std::string readName = ""; + const std::string sequence 
= "ACGTACGTACGT"; + const std::string qualities = "?]?]?]?]?]?]"; + const std::string cigar = "100="; + + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector<uint8_t>({34, 5, 125}); + tags["XY"] = (int32_t)-42; + + BamRecordImpl bam; + bam.Name(readName); + bam.SetSequenceAndQualities(sequence, qualities); + bam.CigarData(cigar); + bam.Tags(tags); + + EXPECT_EQ(readName, bam.Name()); + EXPECT_EQ(sequence, bam.Sequence()); + EXPECT_EQ(qualities, bam.Qualities().Fastq()); + EXPECT_EQ(cigar, bam.CigarData().ToStdString()); + + std::string expected; + expected.append("CA:B:C,34,5,125"); + expected.append("\t"); + expected.append("HX:H:1abc75"); + expected.append("\t"); + expected.append("XY:i:-42"); + + const std::string sam = SamTagCodec::Encode(bam.Tags()); + EXPECT_EQ(expected, sam); + + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_Init_EmptySeqQual) +{ + const std::string readName = "foo"; + const std::string sequence = ""; + const std::string qualities = ""; + const std::string cigar = "100="; + + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector<uint8_t>({34, 5, 125}); + tags["XY"] = (int32_t)-42; + + BamRecordImpl bam; + bam.Name(readName); + bam.SetSequenceAndQualities(sequence, qualities); + bam.CigarData(cigar); + bam.Tags(tags); + + EXPECT_EQ(readName, bam.Name()); + EXPECT_EQ(sequence, bam.Sequence()); + EXPECT_EQ(qualities, bam.Qualities().Fastq()); + EXPECT_EQ(cigar, bam.CigarData().ToStdString()); + + std::string expected; + expected.append("CA:B:C,34,5,125"); + expected.append("\t"); + expected.append("HX:H:1abc75"); + expected.append("\t"); + expected.append("XY:i:-42"); + + const std::string sam = SamTagCodec::Encode(bam.Tags()); + EXPECT_EQ(expected, sam); + + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_Init_EmptyQual) +{ 
+ const std::string readName = "foo"; + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = ""; + const std::string cigar = "100="; + + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector<uint8_t>({34, 5, 125}); + tags["XY"] = (int32_t)-42; + + BamRecordImpl bam; + bam.Name(readName); + bam.SetSequenceAndQualities(sequence, qualities); + bam.CigarData(cigar); + bam.Tags(tags); + + EXPECT_EQ(readName, bam.Name()); + EXPECT_EQ(sequence, bam.Sequence()); + EXPECT_EQ(qualities, bam.Qualities().Fastq()); + EXPECT_EQ(cigar, bam.CigarData().ToStdString()); + + std::string expected; + expected.append("CA:B:C,34,5,125"); + expected.append("\t"); + expected.append("HX:H:1abc75"); + expected.append("\t"); + expected.append("XY:i:-42"); + + const std::string sam = SamTagCodec::Encode(bam.Tags()); + EXPECT_EQ(expected, sam); + + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_Init_EmptyCigar) +{ + const std::string readName = "foo"; + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = "?]?]?]?]?]?]"; + const std::string cigar = ""; + + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector<uint8_t>({34, 5, 125}); + tags["XY"] = (int32_t)-42; + + BamRecordImpl bam; + bam.Name(readName); + bam.SetSequenceAndQualities(sequence, qualities); + bam.CigarData(cigar); + bam.Tags(tags); + + EXPECT_EQ(readName, bam.Name()); + EXPECT_EQ(sequence, bam.Sequence()); + EXPECT_EQ(qualities, bam.Qualities().Fastq()); + EXPECT_EQ(cigar, bam.CigarData().ToStdString()); + + std::string expected; + expected.append("CA:B:C,34,5,125"); + expected.append("\t"); + expected.append("HX:H:1abc75"); + expected.append("\t"); + expected.append("XY:i:-42"); + + const std::string sam = SamTagCodec::Encode(bam.Tags()); + EXPECT_EQ(expected, sam); + + tests::CheckRawData(bam); +} + 
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_Init_EmptyTag) +{ + const std::string readName = "foo"; + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = "?]?]?]?]?]?]"; + const std::string cigar = "100="; + + BamRecordImpl bam; + bam.Name(readName); + bam.SetSequenceAndQualities(sequence, qualities); + bam.CigarData(cigar); + bam.Tags(TagCollection()); + + EXPECT_EQ(readName, bam.Name()); + EXPECT_EQ(sequence, bam.Sequence()); + EXPECT_EQ(qualities, bam.Qualities().Fastq()); + EXPECT_EQ(cigar, bam.CigarData().ToStdString()); + EXPECT_EQ(0, bam.Tags().size()); + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_ThenOverwriteWithLongerName) +{ + const std::string readName = "foo"; + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = "?]?]?]?]?]?]"; + const std::string cigar = "100="; + const std::string longerName = "this is a long read name"; + + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector<uint8_t>({34, 5, 125}); + tags["XY"] = (int32_t)-42; + + BamRecordImpl bam; + bam.Name(readName); + bam.SetSequenceAndQualities(sequence, qualities); + bam.CigarData(cigar); + bam.Tags(tags); + bam.Name(longerName); + + EXPECT_EQ(longerName, bam.Name()); + EXPECT_EQ(sequence, bam.Sequence()); + EXPECT_EQ(qualities, bam.Qualities().Fastq()); + EXPECT_EQ(cigar, bam.CigarData().ToStdString()); + + std::string expected; + expected.append("CA:B:C,34,5,125"); + expected.append("\t"); + expected.append("HX:H:1abc75"); + expected.append("\t"); + expected.append("XY:i:-42"); + + const std::string sam = SamTagCodec::Encode(bam.Tags()); + EXPECT_EQ(expected, sam); + + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_ThenOverwriteWithShorterName) +{ + const std::string readName = "foo"; + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = 
"?]?]?]?]?]?]"; + const std::string cigar = "100="; + const std::string longerName = "this is a long read name"; + + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector<uint8_t>({34, 5, 125}); + tags["XY"] = (int32_t)-42; + + BamRecordImpl bam; + bam.Name(longerName); + bam.SetSequenceAndQualities(sequence, qualities); + bam.CigarData(cigar); + bam.Tags(tags); + bam.Name(readName); + + EXPECT_EQ(readName, bam.Name()); + EXPECT_EQ(sequence, bam.Sequence()); + EXPECT_EQ(qualities, bam.Qualities().Fastq()); + EXPECT_EQ(cigar, bam.CigarData().ToStdString()); + + std::string expected; + expected.append("CA:B:C,34,5,125"); + expected.append("\t"); + expected.append("HX:H:1abc75"); + expected.append("\t"); + expected.append("XY:i:-42"); + + const std::string sam = SamTagCodec::Encode(bam.Tags()); + EXPECT_EQ(expected, sam); + + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_ThenOverwriteWithEmptyName) +{ + const std::string readName = "foo"; + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = "?]?]?]?]?]?]"; + const std::string cigar = "100="; + const std::string empty = ""; + + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector<uint8_t>({34, 5, 125}); + tags["XY"] = (int32_t)-42; + + BamRecordImpl bam; + bam.Name(readName); + bam.SetSequenceAndQualities(sequence, qualities); + bam.CigarData(cigar); + bam.Tags(tags); + bam.Name(empty); + + EXPECT_EQ(empty, bam.Name()); + EXPECT_EQ(sequence, bam.Sequence()); + EXPECT_EQ(qualities, bam.Qualities().Fastq()); + EXPECT_EQ(cigar, bam.CigarData().ToStdString()); + + std::string expected; + expected.append("CA:B:C,34,5,125"); + expected.append("\t"); + expected.append("HX:H:1abc75"); + expected.append("\t"); + expected.append("XY:i:-42"); + + const std::string sam = SamTagCodec::Encode(bam.Tags()); + EXPECT_EQ(expected, 
sam); + + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_ThenOverwriteWithLongerSeq_NormalQual) +{ + const std::string readName = "foo"; + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = "?]?]?]?]?]?]"; + const std::string cigar = "100="; + const std::string shortSeq = "ACGT"; + const std::string shortQual = "?]?]"; + + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector<uint8_t>({34, 5, 125}); + tags["XY"] = (int32_t)-42; + + BamRecordImpl bam; + bam.Name(readName); + bam.SetSequenceAndQualities(shortSeq, shortQual); + bam.CigarData(cigar); + bam.Tags(tags); + bam.SetSequenceAndQualities(sequence, qualities); + + EXPECT_EQ(readName, bam.Name()); + EXPECT_EQ(sequence, bam.Sequence()); + EXPECT_EQ(qualities, bam.Qualities().Fastq()); + EXPECT_EQ(cigar, bam.CigarData().ToStdString()); + + std::string expected; + expected.append("CA:B:C,34,5,125"); + expected.append("\t"); + expected.append("HX:H:1abc75"); + expected.append("\t"); + expected.append("XY:i:-42"); + + const std::string sam = SamTagCodec::Encode(bam.Tags()); + EXPECT_EQ(expected, sam); + + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_ThenOverwriteWithLongerSeq_EmptyQual) +{ + const std::string readName = "foo"; + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = ""; + const std::string cigar = "100="; + const std::string shortSeq = "ACGT"; + const std::string shortQual = "?]?]"; + + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector<uint8_t>({34, 5, 125}); + tags["XY"] = (int32_t)-42; + + BamRecordImpl bam; + bam.Name(readName); + bam.SetSequenceAndQualities(shortSeq, shortQual); + bam.CigarData(cigar); + bam.Tags(tags); + bam.SetSequenceAndQualities(sequence, qualities); + + EXPECT_EQ(readName, bam.Name()); + 
EXPECT_EQ(sequence, bam.Sequence()); + EXPECT_EQ(qualities, bam.Qualities().Fastq()); + EXPECT_EQ(cigar, bam.CigarData().ToStdString()); + + std::string expected; + expected.append("CA:B:C,34,5,125"); + expected.append("\t"); + expected.append("HX:H:1abc75"); + expected.append("\t"); + expected.append("XY:i:-42"); + + const std::string sam = SamTagCodec::Encode(bam.Tags()); + EXPECT_EQ(expected, sam); + + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_ThenOverwriteWithShorterSeq_NormalQual) +{ + const std::string readName = "foo"; + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = "?]?]?]?]?]?]"; + const std::string cigar = "100="; + const std::string shortSeq = "ACGT"; + const std::string shortQual = "?]?]"; + + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector<uint8_t>({34, 5, 125}); + tags["XY"] = (int32_t)-42; + + BamRecordImpl bam; + bam.Name(readName); + bam.SetSequenceAndQualities(sequence, qualities); + bam.CigarData(cigar); + bam.Tags(tags); + bam.SetSequenceAndQualities(shortSeq, shortQual); + + EXPECT_EQ(readName, bam.Name()); + EXPECT_EQ(shortSeq, bam.Sequence()); + EXPECT_EQ(shortQual, bam.Qualities().Fastq()); + EXPECT_EQ(cigar, bam.CigarData().ToStdString()); + + std::string expected; + expected.append("CA:B:C,34,5,125"); + expected.append("\t"); + expected.append("HX:H:1abc75"); + expected.append("\t"); + expected.append("XY:i:-42"); + + const std::string sam = SamTagCodec::Encode(bam.Tags()); + EXPECT_EQ(expected, sam); + + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_ThenOverwriteWithShorterSeq_EmptyQual) +{ + const std::string readName = "foo"; + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = "?]?]?]?]?]?]"; + const std::string cigar = "100="; + const std::string shortSeq = "ACGT"; + const std::string shortQual = ""; + + TagCollection 
tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector<uint8_t>({34, 5, 125}); + tags["XY"] = (int32_t)-42; + + BamRecordImpl bam; + bam.Name(readName); + bam.SetSequenceAndQualities(sequence, qualities); + bam.CigarData(cigar); + bam.Tags(tags); + bam.SetSequenceAndQualities(shortSeq, shortQual); + + EXPECT_EQ(readName, bam.Name()); + EXPECT_EQ(shortSeq, bam.Sequence()); + EXPECT_EQ(shortQual, bam.Qualities().Fastq()); + EXPECT_EQ(cigar, bam.CigarData().ToStdString()); + + std::string expected; + expected.append("CA:B:C,34,5,125"); + expected.append("\t"); + expected.append("HX:H:1abc75"); + expected.append("\t"); + expected.append("XY:i:-42"); + + const std::string sam = SamTagCodec::Encode(bam.Tags()); + EXPECT_EQ(expected, sam); + + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_ThenOverwriteWithEmptySeq) +{ + const std::string readName = "foo"; + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = "?]?]?]?]?]?]"; + const std::string cigar = "100="; + const std::string empty = ""; + + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector<uint8_t>({34, 5, 125}); + tags["XY"] = (int32_t)-42; + + BamRecordImpl bam; + bam.Name(readName); + bam.SetSequenceAndQualities(sequence, qualities); + bam.CigarData(cigar); + bam.Tags(tags); + bam.SetSequenceAndQualities(empty, empty); + + EXPECT_EQ(readName, bam.Name()); + EXPECT_EQ(empty, bam.Sequence()); + EXPECT_EQ(empty, bam.Qualities().Fastq()); + EXPECT_EQ(cigar, bam.CigarData().ToStdString()); + + std::string expected; + expected.append("CA:B:C,34,5,125"); + expected.append("\t"); + expected.append("HX:H:1abc75"); + expected.append("\t"); + expected.append("XY:i:-42"); + + const std::string sam = SamTagCodec::Encode(bam.Tags()); + EXPECT_EQ(expected, sam); + + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, 
NameSeqQualCigarTag_ThenOverwriteWithLongerCigar) +{ + const std::string readName = "foo"; + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = "?]?]?]?]?]?]"; + const std::string cigar = "100="; + const std::string longerCigar = "100=10D100=10I100X"; + + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector<uint8_t>({34, 5, 125}); + tags["XY"] = (int32_t)-42; + + BamRecordImpl bam; + bam.Name(readName); + bam.SetSequenceAndQualities(sequence, qualities); + bam.CigarData(cigar); + bam.Tags(tags); + bam.CigarData(longerCigar); + + EXPECT_EQ(readName, bam.Name()); + EXPECT_EQ(sequence, bam.Sequence()); + EXPECT_EQ(qualities, bam.Qualities().Fastq()); + EXPECT_EQ(longerCigar, bam.CigarData().ToStdString()); + + std::string expected; + expected.append("CA:B:C,34,5,125"); + expected.append("\t"); + expected.append("HX:H:1abc75"); + expected.append("\t"); + expected.append("XY:i:-42"); + + const std::string sam = SamTagCodec::Encode(bam.Tags()); + EXPECT_EQ(expected, sam); + + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_ThenOverwriteWithShorterCigar) +{ + const std::string readName = "foo"; + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = "?]?]?]?]?]?]"; + const std::string cigar = "100="; + const std::string longerCigar = "100=10D100=10I100X"; + + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector<uint8_t>({34, 5, 125}); + tags["XY"] = (int32_t)-42; + + BamRecordImpl bam; + bam.Name(readName); + bam.SetSequenceAndQualities(sequence, qualities); + bam.CigarData(longerCigar); + bam.Tags(tags); + bam.CigarData(cigar); + + EXPECT_EQ(readName, bam.Name()); + EXPECT_EQ(sequence, bam.Sequence()); + EXPECT_EQ(qualities, bam.Qualities().Fastq()); + EXPECT_EQ(cigar, bam.CigarData().ToStdString()); + + std::string expected; + 
expected.append("CA:B:C,34,5,125"); + expected.append("\t"); + expected.append("HX:H:1abc75"); + expected.append("\t"); + expected.append("XY:i:-42"); + + const std::string sam = SamTagCodec::Encode(bam.Tags()); + EXPECT_EQ(expected, sam); + + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_ThenOverwriteWithEmptyCigar) +{ + const std::string readName = "foo"; + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = "?]?]?]?]?]?]"; + const std::string cigar = "100="; + const std::string empty = ""; + + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector<uint8_t>({34, 5, 125}); + tags["XY"] = (int32_t)-42; + + BamRecordImpl bam; + bam.Name(readName); + bam.SetSequenceAndQualities(sequence, qualities); + bam.CigarData(cigar); + bam.Tags(tags); + bam.CigarData(empty); + + EXPECT_EQ(readName, bam.Name()); + EXPECT_EQ(sequence, bam.Sequence()); + EXPECT_EQ(qualities, bam.Qualities().Fastq()); + EXPECT_EQ(empty, bam.CigarData().ToStdString()); + + std::string expected; + expected.append("CA:B:C,34,5,125"); + expected.append("\t"); + expected.append("HX:H:1abc75"); + expected.append("\t"); + expected.append("XY:i:-42"); + + const std::string sam = SamTagCodec::Encode(bam.Tags()); + EXPECT_EQ(expected, sam); + + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_ThenOverwriteWithLongerTags) +{ + const std::string readName = "foo"; + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = "?]?]?]?]?]?]"; + const std::string cigar = "100="; + + TagCollection longerTags; + longerTags["HX"] = std::string("1abc75"); + longerTags["HX"].Modifier(TagModifier::HEX_STRING); + longerTags["CA"] = std::vector<uint8_t>({34, 5, 125}); + longerTags["XY"] = (int32_t)-42; + + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector<uint8_t>({34, 
5, 125}); + + BamRecordImpl bam; + bam.Name(readName); + bam.SetSequenceAndQualities(sequence, qualities); + bam.CigarData(cigar); + bam.Tags(tags); + bam.Tags(longerTags); + + EXPECT_EQ(readName, bam.Name()); + EXPECT_EQ(sequence, bam.Sequence()); + EXPECT_EQ(qualities, bam.Qualities().Fastq()); + EXPECT_EQ(cigar, bam.CigarData().ToStdString()); + + std::string expected; + expected.append("CA:B:C,34,5,125"); + expected.append("\t"); + expected.append("HX:H:1abc75"); + expected.append("\t"); + expected.append("XY:i:-42"); + + const std::string sam = SamTagCodec::Encode(bam.Tags()); + EXPECT_EQ(expected, sam); + + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_ThenOverwriteWithShorterTags) +{ + const std::string readName = "foo"; + const std::string sequence = "ACGTACGTACGT"; + const std::string qualities = "?]?]?]?]?]?]"; + const std::string cigar = "100="; + + TagCollection longerTags; + longerTags["HX"] = std::string("1abc75"); + longerTags["HX"].Modifier(TagModifier::HEX_STRING); + longerTags["CA"] = std::vector<uint8_t>({34, 5, 125}); + longerTags["XY"] = (int32_t)-42; + + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector<uint8_t>({34, 5, 125}); + + BamRecordImpl bam; + bam.Name(readName); + bam.SetSequenceAndQualities(sequence, qualities); + bam.CigarData(cigar); + bam.Tags(longerTags); + bam.Tags(tags); + + EXPECT_EQ(readName, bam.Name()); + EXPECT_EQ(sequence, bam.Sequence()); + EXPECT_EQ(qualities, bam.Qualities().Fastq()); + EXPECT_EQ(cigar, bam.CigarData().ToStdString()); + + std::string expected; + expected.append("CA:B:C,34,5,125"); + expected.append("\t"); + expected.append("HX:H:1abc75"); + + const std::string sam = SamTagCodec::Encode(bam.Tags()); + EXPECT_EQ(expected, sam); + + tests::CheckRawData(bam); +} + +TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_ThenOverwriteWithEmptyTags) +{ + const std::string readName = "foo"; + const 
std::string sequence = "ACGTACGTACGT"; + const std::string qualities = "?]?]?]?]?]?]"; + const std::string cigar = "100="; + + TagCollection tags; + tags["HX"] = std::string("1abc75"); + tags["HX"].Modifier(TagModifier::HEX_STRING); + tags["CA"] = std::vector<uint8_t>({34, 5, 125}); + tags["XY"] = (int32_t)-42; + + BamRecordImpl bam; + bam.Name(readName); + bam.SetSequenceAndQualities(sequence, qualities); + bam.CigarData(cigar); + bam.Tags(tags); + bam.Tags(TagCollection()); + + EXPECT_EQ(readName, bam.Name()); + EXPECT_EQ(sequence, bam.Sequence()); + EXPECT_EQ(qualities, bam.Qualities().Fastq()); + EXPECT_EQ(cigar, bam.CigarData().ToStdString()); + EXPECT_EQ(0, bam.Tags().size()); + tests::CheckRawData(bam); +} diff --git a/tests/src/test_BamRecordMapping.cpp b/tests/src/test_BamRecordMapping.cpp new file mode 100644 index 0000000..d1a4af9 --- /dev/null +++ b/tests/src/test_BamRecordMapping.cpp @@ -0,0 +1,745 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. 
THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. + +// Author: Derek Barnett + +#ifdef PBBAM_TESTING +#define private public +#endif + +#include +#include +#include +#include +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace std; + +typedef vector<uint16_t> f_data; + +namespace tests { + +static +BamRecord MakeRecord(const Position qStart, + const Position qEnd, + const string& seq, + const string& quals, + const string& tagBases, + const string& tagQuals, + const f_data& frames) +{ + BamRecordImpl impl; + impl.SetSequenceAndQualities(seq, quals); + + TagCollection tags; + tags["qs"] = qStart; + tags["qe"] = qEnd; + tags["ip"] = frames; + tags["pw"] = frames; + tags["dt"] = tagBases; + tags["st"] = tagBases; + tags["dq"] = tagQuals; + tags["iq"] = tagQuals; + tags["mq"] = tagQuals; + tags["sq"] = tagQuals; + tags["pq"] = tagQuals; + tags["pv"] = tagQuals; + impl.Tags(tags); + + return BamRecord(std::move(impl)); +} + +} // namespace tests + +TEST(BamRecordMappingTest, BasicMap) +{ + const Position qStart = 500; + const Position qEnd = 510; + const string seq = "AACCGTTAGC"; + const string quals = "?]?]?]?]?*"; + const string tagBases = "AACCGTTAGC"; + const string tagQuals = "?]?]?]?]?*"; 
+ const f_data frames = { 10, 10, 20, 20, 30, 40, 40, 10, 30, 20 }; + const uint8_t mapQual = 80; + + const string seq_rev = "GCTAACGGTT"; + const string quals_rev = "*?]?]?]?]?"; + const string tagBases_rev = seq_rev; + const string tagQuals_rev = quals_rev; + const f_data frames_rev = { 20, 30, 10, 40, 40, 30, 20, 20, 10, 10 }; + + const string s1_cigar = "10="; + const string s2_cigar = "5=3D5="; + const string s3_cigar = "4=1D2I2D4="; + + BamRecord s1 = tests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames); + BamRecord s2 = tests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames); + BamRecord s3 = tests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames); + BamRecord s1_rev = tests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames); + BamRecord s2_rev = tests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames); + BamRecord s3_rev = tests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames); + + s1.Map(0, 100, Strand::FORWARD, s1_cigar, mapQual); + s2.Map(0, 100, Strand::FORWARD, s2_cigar, mapQual); + s3.Map(0, 100, Strand::FORWARD, s3_cigar, mapQual); + s1_rev.Map(0, 100, Strand::REVERSE, s1_cigar, mapQual); + s2_rev.Map(0, 100, Strand::REVERSE, s2_cigar, mapQual); + s3_rev.Map(0, 100, Strand::REVERSE, s3_cigar, mapQual); + + { // s1 - FORWARD + EXPECT_TRUE(s1.IsMapped()); + EXPECT_EQ(0, s1.ReferenceId()); + EXPECT_EQ(Strand::FORWARD, s1.AlignedStrand()); + EXPECT_EQ(mapQual, s1.MapQuality()); + + EXPECT_EQ(qStart, s1.QueryStart()); + EXPECT_EQ(qEnd, s1.QueryEnd()); + EXPECT_EQ(500, s1.AlignedStart()); + EXPECT_EQ(510, s1.AlignedEnd()); // 500 + 10= + EXPECT_EQ(100, s1.ReferenceStart()); + EXPECT_EQ(110, s1.ReferenceEnd()); // 100 + 10= + + const BamRecordView view + { + s1, + Orientation::NATIVE, + false, + false, + PulseBehavior::ALL + }; + + EXPECT_EQ(seq, view.Sequence()); + EXPECT_EQ(quals, view.Qualities().Fastq()); + EXPECT_EQ(tagBases, view.DeletionTags()); + 
EXPECT_EQ(tagQuals, view.DeletionQVs().Fastq()); + EXPECT_EQ(tagQuals, view.LabelQVs().Fastq()); + EXPECT_EQ(tagQuals, view.AltLabelQVs().Fastq()); + EXPECT_EQ(frames, view.IPD().Data()); + } + + { // s1 - REVERSE + + EXPECT_TRUE(s1_rev.IsMapped()); + EXPECT_EQ(0, s1_rev.ReferenceId()); + EXPECT_EQ(Strand::REVERSE, s1_rev.AlignedStrand()); + EXPECT_EQ(mapQual, s1_rev.MapQuality()); + + EXPECT_EQ(qStart, s1_rev.QueryStart()); + EXPECT_EQ(qEnd, s1_rev.QueryEnd()); + EXPECT_EQ(500, s1_rev.AlignedStart()); + EXPECT_EQ(510, s1_rev.AlignedEnd()); // 500 + 10= + EXPECT_EQ(100, s1_rev.ReferenceStart()); + EXPECT_EQ(110, s1_rev.ReferenceEnd()); // 100 + 10= + + // native + const BamRecordView nativeView + { + s1_rev, + Orientation::NATIVE, + false, + false, + PulseBehavior::ALL + }; + EXPECT_EQ(seq, nativeView.Sequence()); + EXPECT_EQ(quals, nativeView.Qualities().Fastq()); + EXPECT_EQ(tagBases, nativeView.DeletionTags()); + EXPECT_EQ(tagQuals, nativeView.DeletionQVs().Fastq()); + EXPECT_EQ(tagQuals, nativeView.LabelQVs().Fastq()); + EXPECT_EQ(tagQuals, nativeView.AltLabelQVs().Fastq()); + EXPECT_EQ(frames, nativeView.IPD().Data()); + + // - genomic + const BamRecordView genomicView + { + s1_rev, + Orientation::GENOMIC, + false, + false, + PulseBehavior::ALL + }; + EXPECT_EQ(seq_rev, genomicView.Sequence()); + EXPECT_EQ(quals_rev, genomicView.Qualities().Fastq()); + EXPECT_EQ(tagBases_rev, genomicView.DeletionTags()); + EXPECT_EQ(tagQuals_rev, genomicView.DeletionQVs().Fastq()); + EXPECT_EQ(tagQuals_rev, genomicView.LabelQVs().Fastq()); + EXPECT_EQ(tagQuals_rev, genomicView.AltLabelQVs().Fastq()); + EXPECT_EQ(frames_rev, genomicView.IPD().Data()); + } + + { // s2 - FORWARD + + EXPECT_TRUE(s2.IsMapped()); + EXPECT_EQ(0, s2.ReferenceId()); + EXPECT_EQ(Strand::FORWARD, s2.AlignedStrand()); + EXPECT_EQ(mapQual, s2.MapQuality()); + + EXPECT_EQ(qStart, s2.QueryStart()); + EXPECT_EQ(qEnd, s2.QueryEnd()); + EXPECT_EQ(500, s2.AlignedStart()); + EXPECT_EQ(510, s2.AlignedEnd()); // 
500 + 10= + EXPECT_EQ(100, s2.ReferenceStart()); + EXPECT_EQ(113, s2.ReferenceEnd()); // 100 + 10= + 3D + + const BamRecordView view + { + s2, + Orientation::NATIVE, + false, + false, + PulseBehavior::ALL + }; + + EXPECT_EQ(seq, view.Sequence()); + EXPECT_EQ(quals, view.Qualities().Fastq()); + EXPECT_EQ(tagBases, view.DeletionTags()); + EXPECT_EQ(tagQuals, view.DeletionQVs().Fastq()); + EXPECT_EQ(tagQuals, view.LabelQVs().Fastq()); + EXPECT_EQ(tagQuals, view.AltLabelQVs().Fastq()); + EXPECT_EQ(frames, view.IPD().Data()); + } + + { // s2 - REVERSE + + EXPECT_TRUE(s2_rev.IsMapped()); + EXPECT_EQ(0, s2_rev.ReferenceId()); + EXPECT_EQ(Strand::REVERSE, s2_rev.AlignedStrand()); + EXPECT_EQ(mapQual, s2_rev.MapQuality()); + + EXPECT_EQ(qStart, s2_rev.QueryStart()); + EXPECT_EQ(qEnd, s2_rev.QueryEnd()); + EXPECT_EQ(500, s2_rev.AlignedStart()); + EXPECT_EQ(510, s2_rev.AlignedEnd()); // 500 + 10= + EXPECT_EQ(100, s2_rev.ReferenceStart()); + EXPECT_EQ(113, s2_rev.ReferenceEnd()); // 100 + 10= + 3D + + // - native + const BamRecordView nativeView + { + s2_rev, + Orientation::NATIVE, + false, + false, + PulseBehavior::ALL + }; + EXPECT_EQ(seq, nativeView.Sequence()); + EXPECT_EQ(quals, nativeView.Qualities().Fastq()); + EXPECT_EQ(tagBases, nativeView.DeletionTags()); + EXPECT_EQ(tagQuals, nativeView.DeletionQVs().Fastq()); + EXPECT_EQ(tagQuals, nativeView.LabelQVs().Fastq()); + EXPECT_EQ(tagQuals, nativeView.AltLabelQVs().Fastq()); + EXPECT_EQ(frames, nativeView.IPD().Data()); + + // - genomic + const BamRecordView genomicView + { + s2_rev, + Orientation::GENOMIC, + false, + false, + PulseBehavior::ALL + }; + EXPECT_EQ(seq_rev, genomicView.Sequence()); + EXPECT_EQ(quals_rev, genomicView.Qualities().Fastq()); + EXPECT_EQ(tagBases_rev, genomicView.DeletionTags()); + EXPECT_EQ(tagQuals_rev, genomicView.DeletionQVs().Fastq()); + EXPECT_EQ(tagQuals_rev, genomicView.LabelQVs().Fastq()); + EXPECT_EQ(tagQuals_rev, genomicView.AltLabelQVs().Fastq()); + EXPECT_EQ(frames_rev, 
genomicView.IPD().Data()); + } + + { // s3 - FORWARD + + EXPECT_TRUE(s3.IsMapped()); + EXPECT_EQ(0, s3.ReferenceId()); + EXPECT_EQ(Strand::FORWARD, s3.AlignedStrand()); + EXPECT_EQ(mapQual, s3.MapQuality()); + + EXPECT_EQ(qStart, s3.QueryStart()); + EXPECT_EQ(qEnd, s3.QueryEnd()); + EXPECT_EQ(500, s3.AlignedStart()); + EXPECT_EQ(510, s3.AlignedEnd()); // 500 + 8= + 2I + EXPECT_EQ(100, s3.ReferenceStart()); + EXPECT_EQ(111, s3.ReferenceEnd()); // 100 + 8= + 3D + + const BamRecordView view + { + s3, + Orientation::NATIVE, + false, + false, + PulseBehavior::ALL + }; + + EXPECT_EQ(seq, view.Sequence()); + EXPECT_EQ(quals, view.Qualities().Fastq()); + EXPECT_EQ(tagBases, view.DeletionTags()); + EXPECT_EQ(tagQuals, view.DeletionQVs().Fastq()); + EXPECT_EQ(tagQuals, view.LabelQVs().Fastq()); + EXPECT_EQ(tagQuals, view.AltLabelQVs().Fastq()); + EXPECT_EQ(frames, view.IPD().Data()); + } + + { // s3 - REVERSE + + EXPECT_TRUE(s3_rev.IsMapped()); + EXPECT_EQ(0, s3_rev.ReferenceId()); + EXPECT_EQ(Strand::REVERSE, s3_rev.AlignedStrand()); + EXPECT_EQ(mapQual, s3_rev.MapQuality()); + + EXPECT_EQ(qStart, s3_rev.QueryStart()); + EXPECT_EQ(qEnd, s3_rev.QueryEnd()); + EXPECT_EQ(500, s3_rev.AlignedStart()); + EXPECT_EQ(510, s3_rev.AlignedEnd()); // 500 + 8= + 2I + EXPECT_EQ(100, s3_rev.ReferenceStart()); + EXPECT_EQ(111, s3_rev.ReferenceEnd()); // 100 + 8= + 3D + + // - native + const BamRecordView nativeView + { + s3_rev, + Orientation::NATIVE, + false, + false, + PulseBehavior::ALL + }; + EXPECT_EQ(seq, nativeView.Sequence()); + EXPECT_EQ(quals, nativeView.Qualities().Fastq()); + EXPECT_EQ(tagBases, nativeView.DeletionTags()); + EXPECT_EQ(tagQuals, nativeView.DeletionQVs().Fastq()); + EXPECT_EQ(tagQuals, nativeView.LabelQVs().Fastq()); + EXPECT_EQ(tagQuals, nativeView.AltLabelQVs().Fastq()); + EXPECT_EQ(frames, nativeView.IPD().Data()); + + // - genomic + const BamRecordView genomicView + { + s3_rev, + Orientation::GENOMIC, + false, + false, + PulseBehavior::ALL + }; + 
EXPECT_EQ(seq_rev, genomicView.Sequence()); + EXPECT_EQ(quals_rev, genomicView.Qualities().Fastq()); + EXPECT_EQ(tagBases_rev, genomicView.DeletionTags()); + EXPECT_EQ(tagQuals_rev, genomicView.DeletionQVs().Fastq()); + EXPECT_EQ(tagQuals_rev, genomicView.LabelQVs().Fastq()); + EXPECT_EQ(tagQuals_rev, genomicView.AltLabelQVs().Fastq()); + EXPECT_EQ(frames_rev, genomicView.IPD().Data()); + } +} + +TEST(BamRecordMappingTest, SoftClipMapping) +{ + const Position qStart = 500; + const Position qEnd = 515; + const string seq = "TTAACCGTTAGCAAA"; + const string quals = "--?]?]?]?]?*+++"; + const string tagBases = "TTAACCGTTAGCAAA"; + const string tagQuals = "--?]?]?]?]?*+++"; + const f_data frames = { 40, 40, 10, 10, 20, 20, 30, 40, 40, 10, 30, 20, 10, 10, 10 }; + const uint8_t mapQual = 80; + + const string clipped_seq = "AACCGTTAGC"; + const string clipped_quals = "?]?]?]?]?*"; + const string clipped_tagBases = "AACCGTTAGC"; + const string clipped_tagQuals = "?]?]?]?]?*"; + const f_data clipped_frames = { 10, 10, 20, 20, 30, 40, 40, 10, 30, 20 }; + + const string seq_rev = "TTTGCTAACGGTTAA"; + const string quals_rev = "+++*?]?]?]?]?--"; + const string tagBases_rev = seq_rev; + const string tagQuals_rev = quals_rev; + const f_data frames_rev = { 10, 10, 10, 20, 30, 10, 40, 40, 30, 20, 20, 10, 10, 40, 40 }; + + const string clipped_seq_rev = "GCTAACGGTT"; + const string clipped_quals_rev = "*?]?]?]?]?"; + const string clipped_tagBases_rev = clipped_seq_rev; + const string clipped_tagQuals_rev = clipped_quals_rev; + const f_data clipped_frames_rev = { 20, 30, 10, 40, 40, 30, 20, 20, 10, 10 }; + + const string s1_cigar = "2S10=3S"; + const string s2_cigar = "2S5=3D5=3S"; + const string s3_cigar = "2S4=1D2I2D4=3S"; + + BamRecord s1 = tests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames); + BamRecord s2 = tests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames); + BamRecord s3 = tests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, 
frames); + BamRecord s1_rev = tests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames); + BamRecord s2_rev = tests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames); + BamRecord s3_rev = tests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames); + + s1.Map(0, 100, Strand::FORWARD, s1_cigar, mapQual); + s2.Map(0, 100, Strand::FORWARD, s2_cigar, mapQual); + s3.Map(0, 100, Strand::FORWARD, s3_cigar, mapQual); + s1_rev.Map(0, 100, Strand::REVERSE, s1_cigar, mapQual); + s2_rev.Map(0, 100, Strand::REVERSE, s2_cigar, mapQual); + s3_rev.Map(0, 100, Strand::REVERSE, s3_cigar, mapQual); + + { // s1 - FORWARD + + EXPECT_TRUE(s1.IsMapped()); + EXPECT_EQ(0, s1.ReferenceId()); + EXPECT_EQ(Strand::FORWARD, s1.AlignedStrand()); + EXPECT_EQ(mapQual, s1.MapQuality()); + + EXPECT_EQ(qStart, s1.QueryStart()); // 500 + EXPECT_EQ(qEnd, s1.QueryEnd()); // QStart + seqLength + EXPECT_EQ(502, s1.AlignedStart()); // QStart + 2S + EXPECT_EQ(512, s1.AlignedEnd()); // AStart + 10= + EXPECT_EQ(100, s1.ReferenceStart()); // 100 + EXPECT_EQ(110, s1.ReferenceEnd()); // RefStart + 10= + + const BamRecordView view + { + s1, + Orientation::NATIVE, + false, + false, + PulseBehavior::ALL + }; + + EXPECT_EQ(seq, view.Sequence()); + EXPECT_EQ(quals, view.Qualities().Fastq()); + EXPECT_EQ(tagBases, view.DeletionTags()); + EXPECT_EQ(tagQuals, view.DeletionQVs().Fastq()); + EXPECT_EQ(tagQuals, view.LabelQVs().Fastq()); + EXPECT_EQ(tagQuals, view.AltLabelQVs().Fastq()); + EXPECT_EQ(frames, view.IPD().Data()); + } + + { // s1 - REVERSE + + EXPECT_TRUE(s1_rev.IsMapped()); + EXPECT_EQ(0, s1_rev.ReferenceId()); + EXPECT_EQ(Strand::REVERSE, s1_rev.AlignedStrand()); + EXPECT_EQ(mapQual, s1_rev.MapQuality()); + + EXPECT_EQ(qStart, s1_rev.QueryStart()); // 500 + EXPECT_EQ(qEnd, s1_rev.QueryEnd()); // QStart + seqLength + EXPECT_EQ(503, s1_rev.AlignedStart()); // QStart + 3S + EXPECT_EQ(513, s1_rev.AlignedEnd()); // AStart + 10= + EXPECT_EQ(100, s1_rev.ReferenceStart()); 
// 100 + EXPECT_EQ(110, s1_rev.ReferenceEnd()); // RefStart + 10= + + // - native + const BamRecordView nativeView + { + s1_rev, + Orientation::NATIVE, + false, + false, + PulseBehavior::ALL + }; + EXPECT_EQ(seq, nativeView.Sequence()); + EXPECT_EQ(quals, nativeView.Qualities().Fastq()); + EXPECT_EQ(tagBases, nativeView.DeletionTags()); + EXPECT_EQ(tagQuals, nativeView.DeletionQVs().Fastq()); + EXPECT_EQ(tagQuals, nativeView.LabelQVs().Fastq()); + EXPECT_EQ(tagQuals, nativeView.AltLabelQVs().Fastq()); + EXPECT_EQ(frames, nativeView.IPD().Data()); + + // - genomic + const BamRecordView genomicView + { + s1_rev, + Orientation::GENOMIC, + false, + false, + PulseBehavior::ALL + }; + EXPECT_EQ(seq_rev, genomicView.Sequence()); + EXPECT_EQ(quals_rev, genomicView.Qualities().Fastq()); + EXPECT_EQ(tagBases_rev, genomicView.DeletionTags()); + EXPECT_EQ(tagQuals_rev, genomicView.DeletionQVs().Fastq()); + EXPECT_EQ(tagQuals_rev, genomicView.LabelQVs().Fastq()); + EXPECT_EQ(tagQuals_rev, genomicView.AltLabelQVs().Fastq()); + EXPECT_EQ(frames_rev, genomicView.IPD().Data()); + } + + { // s2 - FORWARD + + EXPECT_TRUE(s2.IsMapped()); + EXPECT_EQ(0, s2.ReferenceId()); + EXPECT_EQ(Strand::FORWARD, s2.AlignedStrand()); + EXPECT_EQ(mapQual, s2.MapQuality()); + + EXPECT_EQ(qStart, s2.QueryStart()); // 500 + EXPECT_EQ(qEnd, s2.QueryEnd()); // QStart + seqLength + EXPECT_EQ(502, s2.AlignedStart()); // QStart + 2S + EXPECT_EQ(512, s2.AlignedEnd()); // AStart + 10= + EXPECT_EQ(100, s2.ReferenceStart()); // 100 + EXPECT_EQ(113, s2.ReferenceEnd()); // RefStart + 10= + 3D + + const BamRecordView view + { + s2, + Orientation::NATIVE, + false, + false, + PulseBehavior::ALL + }; + + EXPECT_EQ(seq, view.Sequence()); + EXPECT_EQ(quals, view.Qualities().Fastq()); + EXPECT_EQ(tagBases, view.DeletionTags()); + EXPECT_EQ(tagQuals, view.DeletionQVs().Fastq()); + EXPECT_EQ(tagQuals, view.LabelQVs().Fastq()); + EXPECT_EQ(tagQuals, view.AltLabelQVs().Fastq()); + EXPECT_EQ(frames, view.IPD().Data()); + } + 
+ { // s2 - REVERSE + + EXPECT_TRUE(s2_rev.IsMapped()); + EXPECT_EQ(0, s2_rev.ReferenceId()); + EXPECT_EQ(Strand::REVERSE, s2_rev.AlignedStrand()); + EXPECT_EQ(mapQual, s2_rev.MapQuality()); + + EXPECT_EQ(qStart, s2_rev.QueryStart()); // 500 + EXPECT_EQ(qEnd, s2_rev.QueryEnd()); // QStart + seqLength + EXPECT_EQ(503, s2_rev.AlignedStart()); // QStart + 3S + EXPECT_EQ(513, s2_rev.AlignedEnd()); // AStart + 10= + EXPECT_EQ(100, s2_rev.ReferenceStart()); // 100 + EXPECT_EQ(113, s2_rev.ReferenceEnd()); // RefStart + 10= + 3D + + // - native + const BamRecordView nativeView + { + s2_rev, + Orientation::NATIVE, + false, + false, + PulseBehavior::ALL + }; + EXPECT_EQ(seq, nativeView.Sequence()); + EXPECT_EQ(quals, nativeView.Qualities().Fastq()); + EXPECT_EQ(tagBases, nativeView.DeletionTags()); + EXPECT_EQ(tagQuals, nativeView.DeletionQVs().Fastq()); + EXPECT_EQ(tagQuals, nativeView.LabelQVs().Fastq()); + EXPECT_EQ(tagQuals, nativeView.AltLabelQVs().Fastq()); + EXPECT_EQ(frames, nativeView.IPD().Data()); + + // - genomic + const BamRecordView genomicView + { + s2_rev, + Orientation::GENOMIC, + false, + false, + PulseBehavior::ALL + }; + EXPECT_EQ(seq_rev, genomicView.Sequence()); + EXPECT_EQ(quals_rev, genomicView.Qualities().Fastq()); + EXPECT_EQ(tagBases_rev, genomicView.DeletionTags()); + EXPECT_EQ(tagQuals_rev, genomicView.DeletionQVs().Fastq()); + EXPECT_EQ(tagQuals_rev, genomicView.LabelQVs().Fastq()); + EXPECT_EQ(tagQuals_rev, genomicView.AltLabelQVs().Fastq()); + EXPECT_EQ(frames_rev, genomicView.IPD().Data()); + } + + { // s3 - FORWARD + + EXPECT_TRUE(s3.IsMapped()); + EXPECT_EQ(0, s3.ReferenceId()); + EXPECT_EQ(Strand::FORWARD, s3.AlignedStrand()); + EXPECT_EQ(mapQual, s3.MapQuality()); + + EXPECT_EQ(qStart, s3.QueryStart()); // 500 + EXPECT_EQ(qEnd, s3.QueryEnd()); // QStart + seqLength + EXPECT_EQ(502, s3.AlignedStart()); // QStart + 2S + EXPECT_EQ(512, s3.AlignedEnd()); // AStart + 8= + 2I + EXPECT_EQ(100, s3.ReferenceStart()); // 100 + EXPECT_EQ(111, 
s3.ReferenceEnd()); // RefStart + 8= + 3D + + const BamRecordView view + { + s3, + Orientation::NATIVE, + false, + false, + PulseBehavior::ALL + }; + + EXPECT_EQ(seq, view.Sequence()); + EXPECT_EQ(quals, view.Qualities().Fastq()); + EXPECT_EQ(tagBases, view.DeletionTags()); + EXPECT_EQ(tagQuals, view.DeletionQVs().Fastq()); + EXPECT_EQ(tagQuals, view.LabelQVs().Fastq()); + EXPECT_EQ(tagQuals, view.AltLabelQVs().Fastq()); + EXPECT_EQ(frames, view.IPD().Data()); + } + + { // s3 - REVERSE + + EXPECT_TRUE(s3_rev.IsMapped()); + EXPECT_EQ(0, s3_rev.ReferenceId()); + EXPECT_EQ(Strand::REVERSE, s3_rev.AlignedStrand()); + EXPECT_EQ(mapQual, s3_rev.MapQuality()); + + EXPECT_EQ(qStart, s3_rev.QueryStart()); // 500 + EXPECT_EQ(qEnd, s3_rev.QueryEnd()); // QStart + seqLength + EXPECT_EQ(503, s3_rev.AlignedStart()); // QStart + 3S + EXPECT_EQ(513, s3_rev.AlignedEnd()); // AStart + 8= + 2I + EXPECT_EQ(100, s3_rev.ReferenceStart()); // 100 + EXPECT_EQ(111, s3_rev.ReferenceEnd()); // RefStart + 8= + 3D + + // - native + const BamRecordView nativeView + { + s3_rev, + Orientation::NATIVE, + false, + false, + PulseBehavior::ALL + }; + EXPECT_EQ(seq, nativeView.Sequence()); + EXPECT_EQ(quals, nativeView.Qualities().Fastq()); + EXPECT_EQ(tagBases, nativeView.DeletionTags()); + EXPECT_EQ(tagQuals, nativeView.DeletionQVs().Fastq()); + EXPECT_EQ(tagQuals, nativeView.LabelQVs().Fastq()); + EXPECT_EQ(tagQuals, nativeView.AltLabelQVs().Fastq()); + EXPECT_EQ(frames, nativeView.IPD().Data()); + + // - genomic + const BamRecordView genomicView + { + s3_rev, + Orientation::GENOMIC, + false, + false, + PulseBehavior::ALL + }; + EXPECT_EQ(seq_rev, genomicView.Sequence()); + EXPECT_EQ(quals_rev, genomicView.Qualities().Fastq()); + EXPECT_EQ(tagBases_rev, genomicView.DeletionTags()); + EXPECT_EQ(tagQuals_rev, genomicView.DeletionQVs().Fastq()); + EXPECT_EQ(tagQuals_rev, genomicView.LabelQVs().Fastq()); + EXPECT_EQ(tagQuals_rev, genomicView.AltLabelQVs().Fastq()); + EXPECT_EQ(frames_rev, 
genomicView.IPD().Data()); + } +} + +TEST(BamRecordMappingTest, MappedCopy) +{ + const Position qStart = 500; + const Position qEnd = 510; + const string seq = "AACCGTTAGC"; + const string quals = "?]?]?]?]?*"; + const string tagBases = "AACCGTTAGC"; + const string tagQuals = "?]?]?]?]?*"; + const f_data frames = { 10, 10, 20, 20, 30, 40, 40, 10, 30, 20 }; + const uint8_t mapQual = 80; + const string cigar = "4=1D2I2D4="; + + const BamRecord orig = tests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames); + const BamRecord mapped = orig.Mapped(0, 100, Strand::FORWARD, cigar, mapQual); + + EXPECT_TRUE(mapped.IsMapped()); + EXPECT_EQ(0, mapped.ReferenceId()); + EXPECT_EQ(Strand::FORWARD, mapped.AlignedStrand()); + EXPECT_EQ(mapQual, mapped.MapQuality()); + + EXPECT_EQ(500, mapped.QueryStart()); // 500 + EXPECT_EQ(510, mapped.QueryEnd()); // QStart + seqLength + EXPECT_EQ(500, mapped.AlignedStart()); // QStart + EXPECT_EQ(510, mapped.AlignedEnd()); // QStart + 8= + 2I + EXPECT_EQ(100, mapped.ReferenceStart()); // 100 + EXPECT_EQ(111, mapped.ReferenceEnd()); // RefStart + 8= + 3D + + const BamRecordView view + { + mapped, + Orientation::NATIVE, + false, + false, + PulseBehavior::ALL + }; + + EXPECT_EQ(seq, view.Sequence()); + EXPECT_EQ(quals, view.Qualities().Fastq()); + EXPECT_EQ(tagBases, view.DeletionTags()); + EXPECT_EQ(tagQuals, view.DeletionQVs().Fastq()); + EXPECT_EQ(tagQuals, view.LabelQVs().Fastq()); + EXPECT_EQ(tagQuals, view.AltLabelQVs().Fastq()); + EXPECT_EQ(frames, view.IPD().Data()); +} + +TEST(BamRecordMappingTest, StaticMapped) +{ + const Position qStart = 500; + const Position qEnd = 510; + const string seq = "AACCGTTAGC"; + const string quals = "?]?]?]?]?*"; + const string tagBases = "AACCGTTAGC"; + const string tagQuals = "?]?]?]?]?*"; + const f_data frames = { 10, 10, 20, 20, 30, 40, 40, 10, 30, 20 }; + const uint8_t mapQual = 80; + const string cigar = "4=1D2I2D4="; + + const BamRecord orig = tests::MakeRecord(qStart, qEnd, seq, 
quals, tagBases, tagQuals, frames); + const BamRecord mapped = BamRecord::Mapped(orig, 0, 100, Strand::FORWARD, cigar, mapQual); + + EXPECT_TRUE(mapped.IsMapped()); + EXPECT_EQ(0, mapped.ReferenceId()); + EXPECT_EQ(Strand::FORWARD, mapped.AlignedStrand()); + EXPECT_EQ(mapQual, mapped.MapQuality()); + + EXPECT_EQ(500, mapped.QueryStart()); // 500 + EXPECT_EQ(510, mapped.QueryEnd()); // QStart + seqLength + EXPECT_EQ(500, mapped.AlignedStart()); // QStart + EXPECT_EQ(510, mapped.AlignedEnd()); // QStart + 8= + 2I + EXPECT_EQ(100, mapped.ReferenceStart()); // 100 + EXPECT_EQ(111, mapped.ReferenceEnd()); // RefStart + 8= + 3D + + const BamRecordView view + { + mapped, + Orientation::NATIVE, + false, + false, + PulseBehavior::ALL + }; + + EXPECT_EQ(seq, view.Sequence()); + EXPECT_EQ(quals, view.Qualities().Fastq()); + EXPECT_EQ(tagBases, view.DeletionTags()); + EXPECT_EQ(tagQuals, view.DeletionQVs().Fastq()); + EXPECT_EQ(tagQuals, view.LabelQVs().Fastq()); + EXPECT_EQ(tagQuals, view.AltLabelQVs().Fastq()); + EXPECT_EQ(frames, view.IPD().Data()); +} diff --git a/tests/src/test_BamWriter.cpp b/tests/src/test_BamWriter.cpp new file mode 100644 index 0000000..75dffe5 --- /dev/null +++ b/tests/src/test_BamWriter.cpp @@ -0,0 +1,127 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. 
+// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. 
+ +// Author: Derek Barnett + +#ifdef PBBAM_TESTING +#define private public +#endif + +#include "TestData.h" +#include +#include +#include +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace std; + +TEST(BamWriterTest, SingleWrite_UserRecord) +{ + const string fullName = "test/100/0_5"; + const string rgId = "6002b307"; + const vector expectedSnr = {0.2,0.2,0.2,0.2}; + + // setup header + const string hdrText = { + "@HD\tVN:1.1\tSO:unknown\tpb:3.0.1\n" + "@RG\tID:6002b307\tPL:PACBIO\tDS:READTYPE=SUBREAD;BINDINGKIT=100-619-300;" + "SEQUENCINGKIT=100-619-400;BASECALLERVERSION=3.0;FRAMERATEHZ=100\t" + "PU:test\tPM:SEQUEL\n" + }; + BamHeader inputHeader(hdrText); + + // setup record + BamRecord bamRecord(inputHeader); + bamRecord.impl_.Name(fullName); + bamRecord.impl_.SetSequenceAndQualities("ACGTC", 5); + bamRecord.impl_.CigarData(""); + bamRecord.impl_.Bin(0); + bamRecord.impl_.Flag(0); + bamRecord.impl_.InsertSize(0); + bamRecord.impl_.MapQuality(0); + bamRecord.impl_.MatePosition(-1); + bamRecord.impl_.MateReferenceId(-1); + bamRecord.impl_.Position(-1); + bamRecord.impl_.ReferenceId(-1); + bamRecord.impl_.SetMapped(false); + + TagCollection tags; + tags["zm"] = static_cast(100); + tags["qs"] = static_cast(0); + tags["qe"] = static_cast(5); + tags["np"] = static_cast(1); + tags["rq"] = static_cast(0.6); + tags["RG"] = rgId; + tags["sn"] = expectedSnr; + bamRecord.impl_.Tags(tags); + + // write record to file + const string generatedBamFn = tests::GeneratedData_Dir + "/bamwriter_generated.bam"; + { + BamWriter writer(generatedBamFn, inputHeader); + writer.Write(bamRecord); + } + + // check written header + BamFile file(generatedBamFn); + const auto header = file.Header(); + EXPECT_EQ(std::string("1.1"), header.Version()); + EXPECT_EQ(std::string("unknown"), header.SortOrder()); + EXPECT_EQ(std::string("3.0.1"), header.PacBioBamVersion()); + + // check written record + EntireFileQuery entireFile(file); + auto firstIter = 
entireFile.begin(); + auto record = *firstIter; + EXPECT_EQ(std::string("ACGTC"), record.Sequence()); + EXPECT_EQ(std::string("test/100/0_5"), record.FullName()); + EXPECT_TRUE(record.HasHoleNumber()); + EXPECT_TRUE(record.HasNumPasses()); + EXPECT_TRUE(record.HasQueryEnd()); + EXPECT_TRUE(record.HasQueryStart()); + EXPECT_TRUE(record.HasReadAccuracy()); + EXPECT_TRUE(record.HasSignalToNoise()); + EXPECT_EQ(100, record.HoleNumber()); + EXPECT_EQ(1, record.NumPasses()); + EXPECT_EQ(0, record.QueryStart()); + EXPECT_EQ(5, record.QueryEnd()); + EXPECT_EQ(expectedSnr, record.SignalToNoise()); + EXPECT_EQ(rgId, record.ReadGroupId()); + + // clean up + remove(generatedBamFn.c_str()); +} diff --git a/tests/src/test_BarcodeQuery.cpp b/tests/src/test_BarcodeQuery.cpp new file mode 100644 index 0000000..6ec02a8 --- /dev/null +++ b/tests/src/test_BarcodeQuery.cpp @@ -0,0 +1,53 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. 
THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. + +// Author: Derek Barnett + +#ifdef PBBAM_TESTING +#define private public +#endif + +#include "TestData.h" +#include +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace std; + +TEST(BarcodeQueryTest, QueryOk) +{ + // come back with barcoded data +} diff --git a/tests/src/test_Cigar.cpp b/tests/src/test_Cigar.cpp new file mode 100644 index 0000000..796720a --- /dev/null +++ b/tests/src/test_Cigar.cpp @@ -0,0 +1,198 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. 
+// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. 
+ +// Author: Derek Barnett + +#ifdef PBBAM_TESTING +#define private public +#endif + +#include +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace std; + +TEST(CigarTest, TypeToCar) +{ + EXPECT_EQ('M', CigarOperation::TypeToChar(CigarOperationType::ALIGNMENT_MATCH) ); + EXPECT_EQ('I', CigarOperation::TypeToChar(CigarOperationType::INSERTION) ); + EXPECT_EQ('D', CigarOperation::TypeToChar(CigarOperationType::DELETION) ); + EXPECT_EQ('N', CigarOperation::TypeToChar(CigarOperationType::REFERENCE_SKIP) ); + EXPECT_EQ('S', CigarOperation::TypeToChar(CigarOperationType::SOFT_CLIP) ); + EXPECT_EQ('H', CigarOperation::TypeToChar(CigarOperationType::HARD_CLIP) ); + EXPECT_EQ('P', CigarOperation::TypeToChar(CigarOperationType::PADDING) ); + EXPECT_EQ('=', CigarOperation::TypeToChar(CigarOperationType::SEQUENCE_MATCH) ); + EXPECT_EQ('X', CigarOperation::TypeToChar(CigarOperationType::SEQUENCE_MISMATCH) ); +} + +TEST(CigarTest, CharToType) +{ + EXPECT_EQ(CigarOperationType::ALIGNMENT_MATCH, CigarOperation::CharToType('M')); + EXPECT_EQ(CigarOperationType::INSERTION, CigarOperation::CharToType('I')); + EXPECT_EQ(CigarOperationType::DELETION, CigarOperation::CharToType('D')); + EXPECT_EQ(CigarOperationType::REFERENCE_SKIP, CigarOperation::CharToType('N')); + EXPECT_EQ(CigarOperationType::SOFT_CLIP, CigarOperation::CharToType('S')); + EXPECT_EQ(CigarOperationType::HARD_CLIP, CigarOperation::CharToType('H')); + EXPECT_EQ(CigarOperationType::PADDING, CigarOperation::CharToType('P')); + EXPECT_EQ(CigarOperationType::SEQUENCE_MATCH, CigarOperation::CharToType('=')); + EXPECT_EQ(CigarOperationType::SEQUENCE_MISMATCH, CigarOperation::CharToType('X')); +} + +TEST(CigarTest, SetOperationYieldsCorrectType) +{ + CigarOperation c1; c1.Type(CigarOperationType::ALIGNMENT_MATCH); + CigarOperation c2; c2.Type(CigarOperationType::INSERTION); + CigarOperation c3; c3.Type(CigarOperationType::DELETION); + CigarOperation c4; 
c4.Type(CigarOperationType::REFERENCE_SKIP); + CigarOperation c5; c5.Type(CigarOperationType::SOFT_CLIP); + CigarOperation c6; c6.Type(CigarOperationType::HARD_CLIP); + CigarOperation c7; c7.Type(CigarOperationType::PADDING); + CigarOperation c8; c8.Type(CigarOperationType::SEQUENCE_MATCH); + CigarOperation c9; c9.Type(CigarOperationType::SEQUENCE_MISMATCH); + + EXPECT_EQ('M', c1.Char()); + EXPECT_EQ('I', c2.Char()); + EXPECT_EQ('D', c3.Char()); + EXPECT_EQ('N', c4.Char()); + EXPECT_EQ('S', c5.Char()); + EXPECT_EQ('H', c6.Char()); + EXPECT_EQ('P', c7.Char()); + EXPECT_EQ('=', c8.Char()); + EXPECT_EQ('X', c9.Char()); +} + +TEST(CigarTest, SetTypeYieldsCorrectOperation) +{ + CigarOperation c1; c1.Char('M'); + CigarOperation c2; c2.Char('I'); + CigarOperation c3; c3.Char('D'); + CigarOperation c4; c4.Char('N'); + CigarOperation c5; c5.Char('S'); + CigarOperation c6; c6.Char('H'); + CigarOperation c7; c7.Char('P'); + CigarOperation c8; c8.Char('='); + CigarOperation c9; c9.Char('X'); + + EXPECT_EQ(CigarOperationType::ALIGNMENT_MATCH, c1.Type()); + EXPECT_EQ(CigarOperationType::INSERTION, c2.Type()); + EXPECT_EQ(CigarOperationType::DELETION, c3.Type()); + EXPECT_EQ(CigarOperationType::REFERENCE_SKIP, c4.Type()); + EXPECT_EQ(CigarOperationType::SOFT_CLIP, c5.Type()); + EXPECT_EQ(CigarOperationType::HARD_CLIP, c6.Type()); + EXPECT_EQ(CigarOperationType::PADDING, c7.Type()); + EXPECT_EQ(CigarOperationType::SEQUENCE_MATCH, c8.Type()); + EXPECT_EQ(CigarOperationType::SEQUENCE_MISMATCH, c9.Type()); +} + +TEST(CigarStringTest, FromStdString_Empty) +{ + const string emptyCigar = ""; + Cigar cigar = Cigar::FromStdString(emptyCigar); + EXPECT_TRUE(cigar.empty()); +} + +TEST(CigarStringTest, FromStdString_SingleOp) +{ + const string singleCigar = "100="; + + Cigar cigar = Cigar::FromStdString(singleCigar); + ASSERT_EQ(1u, cigar.size()); + + const CigarOperation& op = cigar.front(); + EXPECT_EQ('=', op.Char()); + EXPECT_EQ(100u, op.Length()); +} + +TEST(CigarStringTest, 
FromStdString_MultipleOps) +{ + const string multiCigar = "100=2D34I6=6X6="; + + Cigar cigar = Cigar::FromStdString(multiCigar); + ASSERT_EQ(6u, cigar.size()); + + CigarOperation op0 = cigar.at(0); + CigarOperation op1 = cigar.at(1); + CigarOperation op2 = cigar.at(2); + CigarOperation op3 = cigar.at(3); + CigarOperation op4 = cigar.at(4); + CigarOperation op5 = cigar.at(5); + + EXPECT_EQ('=', op0.Char()); + EXPECT_EQ(100u, op0.Length()); + EXPECT_EQ('D', op1.Char()); + EXPECT_EQ(2u, op1.Length()); + EXPECT_EQ('I', op2.Char()); + EXPECT_EQ(34u, op2.Length()); + EXPECT_EQ('=', op3.Char()); + EXPECT_EQ(6u, op3.Length()); + EXPECT_EQ('X', op4.Char()); + EXPECT_EQ(6u, op4.Length()); + EXPECT_EQ('=', op5.Char()); + EXPECT_EQ(6u, op5.Length()); +} + +TEST(CigarStringTest, ToStdString_Empty) +{ + const string empty; + Cigar cigar; + EXPECT_EQ(empty, cigar.ToStdString()); +} + +TEST(CigarStringTest, ToStdString_SingleOp) +{ + const string singleCigar = "100="; + + Cigar cigar; + cigar.push_back( CigarOperation(CigarOperationType::SEQUENCE_MATCH, 100) ); + + EXPECT_EQ(singleCigar, cigar.ToStdString()); +} + +TEST(CigarStringTest, ToStdString_MultipleOps) +{ + const string multiCigar = "100=2D34I6=6X6="; + + Cigar cigar; + cigar.push_back(CigarOperation(CigarOperationType::SEQUENCE_MATCH, 100)); + cigar.push_back(CigarOperation(CigarOperationType::DELETION, 2)); + cigar.push_back(CigarOperation(CigarOperationType::INSERTION, 34)); + cigar.push_back(CigarOperation(CigarOperationType::SEQUENCE_MATCH, 6)); + cigar.push_back(CigarOperation(CigarOperationType::SEQUENCE_MISMATCH, 6)); + cigar.push_back(CigarOperation(CigarOperationType::SEQUENCE_MATCH, 6)); + + EXPECT_EQ(multiCigar, cigar.ToStdString()); +} diff --git a/tests/src/test_Compare.cpp b/tests/src/test_Compare.cpp new file mode 100644 index 0000000..e5ed933 --- /dev/null +++ b/tests/src/test_Compare.cpp @@ -0,0 +1,739 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of 
California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. 
+
+// Author: Derek Barnett
+
+#ifdef PBBAM_TESTING
+#define private public
+#endif
+
+#include <gtest/gtest.h>  // NOTE(review): include targets were lost in extraction; these six are reconstructed from usage - confirm against upstream
+#include <pbbam/Compare.h>
+#include <algorithm>
+#include <string>
+#include <utility>
+#include <vector>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+namespace tests {  // fixture builders shared by the tests below
+
+static inline  // default-constructed record carrying exactly one tag
+BamRecord makeRecordWithTag(const string& tagName,
+                            const Tag& tag)
+{
+    auto r = BamRecord{ };
+    r.Impl().AddTag(tagName, tag);
+    return r;
+}
+
+static  // unmapped record with sequence/qualities and a full set of per-base tags
+BamRecord makeRecord(const Position qStart,
+                     const Position qEnd,
+                     const string& seq,
+                     const string& quals,
+                     const string& tagBases,
+                     const string& tagQuals,
+                     const vector<uint16_t>& frames)  // NOTE(review): element type was stripped; uint16_t assumed - confirm
+{
+    BamRecordImpl impl;
+    impl.SetSequenceAndQualities(seq, quals);
+
+    TagCollection tags;
+    tags["qs"] = qStart;
+    tags["qe"] = qEnd;
+    tags["ip"] = frames;
+    tags["pw"] = frames;
+    tags["dt"] = tagBases;
+    tags["st"] = tagBases;
+    tags["dq"] = tagQuals;
+    tags["iq"] = tagQuals;
+    tags["mq"] = tagQuals;
+    tags["sq"] = tagQuals;
+    tags["pq"] = tagQuals;
+    tags["pv"] = tagQuals;
+    impl.Tags(tags);
+
+    return BamRecord(std::move(impl));
+}
+
+static  // six records: three CIGARs, each mapped forward and then reverse at (ref 0, pos 100)
+std::vector<BamRecord> makeMappedRecords(void)
+{
+    const Position qStart = 500;
+    const Position qEnd = 510;
+    const string seq = "AACCGTTAGC";
+    const string quals = "?]?]?]?]?*";
+    const string tagBases = "AACCGTTAGC";
+    const string tagQuals = "?]?]?]?]?*";
+    const vector<uint16_t> frames = { 10, 10, 20, 20, 30, 40, 40, 10, 30, 20 };
+    const uint8_t mapQual = 80;
+
+    const string s1_cigar = "10=";
+    const string s2_cigar = "5=3D5=";
+    const string s3_cigar = "4=1D2I2D2X2=";
+
+    BamRecord s1 = tests::makeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames);
+    BamRecord s2 = tests::makeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames);
+    BamRecord s3 = tests::makeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames);
+    BamRecord s1_rev = tests::makeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames);
+    BamRecord s2_rev = tests::makeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames);
+    BamRecord s3_rev = tests::makeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames);
+
+    s1.Map(0, 100, Strand::FORWARD, s1_cigar, mapQual);
+    s2.Map(0, 100, Strand::FORWARD, s2_cigar, mapQual);
+    s3.Map(0, 100, Strand::FORWARD, s3_cigar, mapQual);
+    s1_rev.Map(0, 100, Strand::REVERSE, s1_cigar, mapQual);
+    s2_rev.Map(0, 100, Strand::REVERSE, s2_cigar, mapQual);
+    s3_rev.Map(0, 100, Strand::REVERSE, s3_cigar, mapQual);
+
+    return std::vector<BamRecord> { s1, s2, s3, s1_rev, s2_rev, s3_rev };
+}
+
+} // namespace tests
+
+TEST(CompareTest, TypeToNameOk)  // each comparison type has a printable name; unknown values throw
+{
+    EXPECT_EQ(string{"Compare::EQUAL"}, Compare::TypeToName(Compare::EQUAL));
+    EXPECT_EQ(string{"Compare::NOT_EQUAL"}, Compare::TypeToName(Compare::NOT_EQUAL));
+    EXPECT_EQ(string{"Compare::LESS_THAN"}, Compare::TypeToName(Compare::LESS_THAN));
+    EXPECT_EQ(string{"Compare::LESS_THAN_EQUAL"}, Compare::TypeToName(Compare::LESS_THAN_EQUAL));
+    EXPECT_EQ(string{"Compare::GREATER_THAN"}, Compare::TypeToName(Compare::GREATER_THAN));
+    EXPECT_EQ(string{"Compare::GREATER_THAN_EQUAL"}, Compare::TypeToName(Compare::GREATER_THAN_EQUAL));
+    EXPECT_EQ(string{"Compare::CONTAINS"}, Compare::TypeToName(Compare::CONTAINS));
+    EXPECT_EQ(string{"Compare::NOT_CONTAINS"}, Compare::TypeToName(Compare::NOT_CONTAINS));
+
+    // invalid type throws
+    EXPECT_THROW(Compare::TypeToName(static_cast<Compare::Type>(42)), std::runtime_error);
+}
+
+TEST(CompareTest, TypeToOperatorOk)  // symbolic and alphabetic operator spellings per type
+{
+    { // normal
+        EXPECT_EQ(Compare::TypeToOperator(Compare::EQUAL), string{"=="});
+        EXPECT_EQ(Compare::TypeToOperator(Compare::NOT_EQUAL), string{"!="});
+        EXPECT_EQ(Compare::TypeToOperator(Compare::LESS_THAN), string{"<"});
+        EXPECT_EQ(Compare::TypeToOperator(Compare::LESS_THAN_EQUAL), string{"<="});
+        EXPECT_EQ(Compare::TypeToOperator(Compare::GREATER_THAN), string{">"});
+        EXPECT_EQ(Compare::TypeToOperator(Compare::GREATER_THAN_EQUAL), string{">="});
+        EXPECT_EQ(Compare::TypeToOperator(Compare::CONTAINS), string{"&"});
+        EXPECT_EQ(Compare::TypeToOperator(Compare::NOT_CONTAINS), string{"~"});
+    }
+
+    { // alpha
+        EXPECT_EQ(Compare::TypeToOperator(Compare::EQUAL, true), string{"eq"});
+        EXPECT_EQ(Compare::TypeToOperator(Compare::NOT_EQUAL, true), string{"ne"});
+        EXPECT_EQ(Compare::TypeToOperator(Compare::LESS_THAN, true), string{"lt"});
+        EXPECT_EQ(Compare::TypeToOperator(Compare::LESS_THAN_EQUAL, true), string{"lte"});
+        EXPECT_EQ(Compare::TypeToOperator(Compare::GREATER_THAN, true), string{"gt"});
+        EXPECT_EQ(Compare::TypeToOperator(Compare::GREATER_THAN_EQUAL, true), string{"gte"});
+        EXPECT_EQ(Compare::TypeToOperator(Compare::CONTAINS, true), string{"and"});
+        EXPECT_EQ(Compare::TypeToOperator(Compare::NOT_CONTAINS, true), string{"not"});
+    }
+
+    // invalid type throws
+    EXPECT_THROW(Compare::TypeToOperator(static_cast<Compare::Type>(42)), std::runtime_error);
+}
+
+TEST(CompareTest, FromOperatorOk)  // every accepted operator spelling resolves to its type
+{
+    EXPECT_EQ(Compare::EQUAL, Compare::TypeFromOperator("=="));
+    EXPECT_EQ(Compare::EQUAL, Compare::TypeFromOperator("="));
+    EXPECT_EQ(Compare::EQUAL, Compare::TypeFromOperator("eq"));
+    EXPECT_EQ(Compare::NOT_EQUAL, Compare::TypeFromOperator("!="));
+    EXPECT_EQ(Compare::NOT_EQUAL, Compare::TypeFromOperator("ne"));
+    EXPECT_EQ(Compare::LESS_THAN, Compare::TypeFromOperator("<"));
+    EXPECT_EQ(Compare::LESS_THAN, Compare::TypeFromOperator("lt"));
+    EXPECT_EQ(Compare::LESS_THAN, Compare::TypeFromOperator("&lt;"));  // NOTE(review): XML-escaped spellings were entity-decoded into duplicates by extraction; restored - confirm against upstream
+    EXPECT_EQ(Compare::LESS_THAN_EQUAL, Compare::TypeFromOperator("<="));
+    EXPECT_EQ(Compare::LESS_THAN_EQUAL, Compare::TypeFromOperator("lte"));
+    EXPECT_EQ(Compare::LESS_THAN_EQUAL, Compare::TypeFromOperator("&lt;="));
+    EXPECT_EQ(Compare::GREATER_THAN, Compare::TypeFromOperator(">"));
+    EXPECT_EQ(Compare::GREATER_THAN, Compare::TypeFromOperator("gt"));
+    EXPECT_EQ(Compare::GREATER_THAN, Compare::TypeFromOperator("&gt;"));
+    EXPECT_EQ(Compare::GREATER_THAN_EQUAL, Compare::TypeFromOperator(">="));
+    EXPECT_EQ(Compare::GREATER_THAN_EQUAL, Compare::TypeFromOperator("gte"));
+    EXPECT_EQ(Compare::GREATER_THAN_EQUAL, Compare::TypeFromOperator("&gt;="));
+    EXPECT_EQ(Compare::CONTAINS, Compare::TypeFromOperator("&"));
+    EXPECT_EQ(Compare::NOT_CONTAINS, Compare::TypeFromOperator("~"));
+
+    // invalid operator strings throw
+    EXPECT_THROW(Compare::TypeFromOperator(""), std::runtime_error);
+    EXPECT_THROW(Compare::TypeFromOperator("invalid"), std::runtime_error);
+}
+
+TEST(CompareTest, AlignedEndOk)  // sorts ascending by aligned end
+{
+    BamRecord r1; r1.alignedEnd_ = 300;
+    BamRecord r2; r2.alignedEnd_ = 200;
+    BamRecord r3; r3.alignedEnd_ = 400;
+    BamRecord r4; r4.alignedEnd_ = 100;
+
+    auto records = vector<BamRecord>{ r1, r2, r3, r4 };
+    std::sort(records.begin(), records.end(), Compare::AlignedEnd());
+
+    EXPECT_EQ(r4.alignedEnd_, records.at(0).AlignedEnd());
+    EXPECT_EQ(r2.alignedEnd_, records.at(1).AlignedEnd());
+    EXPECT_EQ(r1.alignedEnd_, records.at(2).AlignedEnd());
+    EXPECT_EQ(r3.alignedEnd_, records.at(3).AlignedEnd());
+}
+
+TEST(CompareTest, AlignedStartOk)  // sorts ascending by aligned start
+{
+    BamRecord r1; r1.alignedStart_ = 300;
+    BamRecord r2; r2.alignedStart_ = 200;
+    BamRecord r3; r3.alignedStart_ = 400;
+    BamRecord r4; r4.alignedStart_ = 100;
+
+    auto records = vector<BamRecord>{ r1, r2, r3, r4 };
+    std::sort(records.begin(), records.end(), Compare::AlignedStart());
+
+    EXPECT_EQ(r4.alignedStart_, records.at(0).AlignedStart());
+    EXPECT_EQ(r2.alignedStart_, records.at(1).AlignedStart());
+    EXPECT_EQ(r1.alignedStart_, records.at(2).AlignedStart());
+    EXPECT_EQ(r3.alignedStart_, records.at(3).AlignedStart());
+}
+
+TEST(CompareTest, AlignedStrandOk)  // forward-strand records sort before reverse-strand ones
+{
+    BamRecord r1; r1.Impl().SetReverseStrand(true);
+    BamRecord r2; r2.Impl().SetReverseStrand(false);
+    BamRecord r3; r3.Impl().SetReverseStrand(true);
+    BamRecord r4; r4.Impl().SetReverseStrand(false);
+
+    auto records = vector<BamRecord>{ r1, r2, r3, r4 };
+    std::sort(records.begin(), records.end(), Compare::AlignedStrand());
+
+    EXPECT_EQ(Strand::FORWARD, records.at(0).AlignedStrand());
+    EXPECT_EQ(Strand::FORWARD, records.at(1).AlignedStrand());
+    EXPECT_EQ(Strand::REVERSE, records.at(2).AlignedStrand());
+    EXPECT_EQ(Strand::REVERSE, records.at(3).AlignedStrand());
+}
+
+TEST(CompareTest, BarcodeForwardOk)  // sorts ascending by the first barcode of the pair
+{
+    BamRecord r1; r1.Barcodes(std::make_pair(30,20));
+    BamRecord r2; r2.Barcodes(std::make_pair(20,30));
+    BamRecord r3; r3.Barcodes(std::make_pair(40,10));
+    BamRecord r4; r4.Barcodes(std::make_pair(10,40));
+
+    auto records = vector<BamRecord>{ r1, r2, r3, r4 };
+    std::sort(records.begin(), records.end(), Compare::BarcodeForward());
+
+    EXPECT_EQ(r4.BarcodeForward(), records.at(0).BarcodeForward());
+    EXPECT_EQ(r2.BarcodeForward(), records.at(1).BarcodeForward());
+    EXPECT_EQ(r1.BarcodeForward(), records.at(2).BarcodeForward());
+    EXPECT_EQ(r3.BarcodeForward(), records.at(3).BarcodeForward());
+}
+
+TEST(CompareTest, BarcodeReverseOk)  // sorts ascending by the second barcode of the pair
+{
+    BamRecord r1; r1.Barcodes(std::make_pair(30,20));
+    BamRecord r2; r2.Barcodes(std::make_pair(20,30));
+    BamRecord r3; r3.Barcodes(std::make_pair(40,10));
+    BamRecord r4; r4.Barcodes(std::make_pair(10,40));
+
+    auto records = vector<BamRecord>{ r1, r2, r3, r4 };
+    std::sort(records.begin(), records.end(), Compare::BarcodeReverse());
+
+    EXPECT_EQ(r3.BarcodeReverse(), records.at(0).BarcodeReverse());
+    EXPECT_EQ(r1.BarcodeReverse(), records.at(1).BarcodeReverse());
+    EXPECT_EQ(r2.BarcodeReverse(), records.at(2).BarcodeReverse());
+    EXPECT_EQ(r4.BarcodeReverse(), records.at(3).BarcodeReverse());
+}
+
+TEST(CompareTest, BarcodeQualityOk)  // sorts ascending by the "bq" tag value
+{
+    uint8_t q1 = 30;
+    uint8_t q2 = 20;
+    uint8_t q3 = 40;
+    uint8_t q4 = 10;
+
+    auto records = vector<BamRecord>
+    {
+        tests::makeRecordWithTag("bq", Tag(q1)),
+        tests::makeRecordWithTag("bq", Tag(q2)),
+        tests::makeRecordWithTag("bq", Tag(q3)),
+        tests::makeRecordWithTag("bq", Tag(q4))
+    };
+    std::sort(records.begin(), records.end(), Compare::BarcodeQuality());
+
+    EXPECT_EQ(q4, records.at(0).BarcodeQuality());
+    EXPECT_EQ(q2, records.at(1).BarcodeQuality());
+    EXPECT_EQ(q1, records.at(2).BarcodeQuality());
+    EXPECT_EQ(q3, records.at(3).BarcodeQuality());
+}
+
+TEST(CompareTest, CustomCompareOk)  // user-defined comparator: records without a "dt" tag sort first
+{
+    struct CustomCompare : public Compare::MemberFunctionBase<bool, &BamRecord::HasDeletionTag> { };  // NOTE(review): template arguments were stripped; reconstructed from the assertions below - confirm
+
+    auto records = vector<BamRecord>
+    {
+        tests::makeRecordWithTag("dt", Tag(string("foo"))),
+        tests::makeRecordWithTag("dt", Tag(string("foo"))),
+        tests::makeRecordWithTag("dt", Tag(string("foo"))),
+        tests::makeRecordWithTag("dt", Tag(string("foo")))
+    };
+    records.push_back(BamRecord());
+    records.push_back(BamRecord());
+    records.push_back(BamRecord());
+    records.push_back(BamRecord());
+    EXPECT_EQ(8u, records.size());
+
+    std::sort(records.begin(), records.end(), CustomCompare());
+
+    EXPECT_FALSE(records.at(0).HasDeletionTag());
+    EXPECT_FALSE(records.at(1).HasDeletionTag());
+    EXPECT_FALSE(records.at(2).HasDeletionTag());
+    EXPECT_FALSE(records.at(3).HasDeletionTag());
+    EXPECT_TRUE(records.at(4).HasDeletionTag());
+    EXPECT_TRUE(records.at(5).HasDeletionTag());
+    EXPECT_TRUE(records.at(6).HasDeletionTag());
+    EXPECT_TRUE(records.at(7).HasDeletionTag());
+}
+
+TEST(CompareTest, FullNameOk)  // sorts ascending by record name
+{
+    BamRecord r1; r1.Impl().Name("c");
+    BamRecord r2; r2.Impl().Name("b");
+    BamRecord r3; r3.Impl().Name("d");
+    BamRecord r4; r4.Impl().Name("a");
+
+    auto records = vector<BamRecord>{ r1, r2, r3, r4 };
+    std::sort(records.begin(), records.end(), Compare::FullName());
+
+    EXPECT_EQ(r4.FullName(), records.at(0).FullName());
+    EXPECT_EQ(r2.FullName(), records.at(1).FullName());
+    EXPECT_EQ(r1.FullName(), records.at(2).FullName());
+    EXPECT_EQ(r3.FullName(), records.at(3).FullName());
+}
+
+TEST(CompareTest, LocalContextFlagOk)  // expected order: NO_LOCAL_CONTEXT, ADAPTER_AFTER, BARCODE_AFTER, REVERSE_PASS
+{
+    BamRecord r1; r1.LocalContextFlags(LocalContextFlags::BARCODE_AFTER);
+    BamRecord r2; r2.LocalContextFlags(LocalContextFlags::ADAPTER_AFTER);
+    BamRecord r3; r3.LocalContextFlags(LocalContextFlags::REVERSE_PASS);
+    BamRecord r4; r4.LocalContextFlags(LocalContextFlags::NO_LOCAL_CONTEXT);
+
+    auto records = vector<BamRecord>{ r1, r2, r3, r4 };
+    std::sort(records.begin(), records.end(), Compare::LocalContextFlag());
+
+    EXPECT_EQ(r4.LocalContextFlags(), records.at(0).LocalContextFlags());
+    EXPECT_EQ(r2.LocalContextFlags(), records.at(1).LocalContextFlags());
+    EXPECT_EQ(r1.LocalContextFlags(), records.at(2).LocalContextFlags());
+    EXPECT_EQ(r3.LocalContextFlags(), records.at(3).LocalContextFlags());
+}
+
+TEST(CompareTest, MapQualityOk)  // sorts ascending by mapping quality
+{
+    BamRecord r1; r1.Impl().MapQuality(30);
+    BamRecord r2; r2.Impl().MapQuality(20);
+    BamRecord r3; r3.Impl().MapQuality(40);
+    BamRecord r4; r4.Impl().MapQuality(10);
+
+    auto records = vector<BamRecord>{ r1, r2, r3, r4 };
+    std::sort(records.begin(), records.end(), Compare::MapQuality());
+
+    EXPECT_EQ(r4.MapQuality(), records.at(0).MapQuality());
+    EXPECT_EQ(r2.MapQuality(), records.at(1).MapQuality());
+    EXPECT_EQ(r1.MapQuality(), records.at(2).MapQuality());
+    EXPECT_EQ(r3.MapQuality(), records.at(3).MapQuality());
+}
+
+TEST(CompareTest, MovieNameOk)  // sorts ascending by the movie name taken from each record's read group
+{
+    auto rg1 = ReadGroupInfo { "a", "SUBREAD" };
+    auto rg2 = ReadGroupInfo { "b", "SUBREAD" };
+    auto rg3 = ReadGroupInfo { "c", "SUBREAD" };
+    auto rg4 = ReadGroupInfo { "d", "SUBREAD" };
+
+    BamHeader header;
+    header.AddReadGroup(rg1)
+          .AddReadGroup(rg2)
+          .AddReadGroup(rg3)
+          .AddReadGroup(rg4);
+
+    BamRecord r1(header); r1.ReadGroup(rg3);
+    BamRecord r2(header); r2.ReadGroup(rg2);
+    BamRecord r3(header); r3.ReadGroup(rg4);
+    BamRecord r4(header); r4.ReadGroup(rg1);
+
+    auto records = vector<BamRecord>{ r1, r2, r3, r4 };
+    std::sort(records.begin(), records.end(), Compare::MovieName());
+
+    EXPECT_EQ(r4.MovieName(), records.at(0).MovieName());
+    EXPECT_EQ(r2.MovieName(), records.at(1).MovieName());
+    EXPECT_EQ(r1.MovieName(), records.at(2).MovieName());
+    EXPECT_EQ(r3.MovieName(), records.at(3).MovieName());
+}
+
+TEST(CompareTest, NoneOk)  // Compare::None leaves the input order unchanged
+{
+    BamRecord r1; r1.Impl().Name("c");
+    BamRecord r2; r2.Impl().Name("b");
+    BamRecord r3; r3.Impl().Name("d");
+    BamRecord r4; r4.Impl().Name("a");
+
+    auto records = vector<BamRecord>{ r1, r2, r3, r4 };
+    std::sort(records.begin(), records.end(), Compare::None());
+
+    EXPECT_EQ(r1.FullName(), records.at(0).FullName());
+    EXPECT_EQ(r2.FullName(), records.at(1).FullName());
+    EXPECT_EQ(r3.FullName(), records.at(2).FullName());
+    EXPECT_EQ(r4.FullName(), records.at(3).FullName());
+}
+
+TEST(CompareTest, NumDeletedBasesOk)  // sorts the mapped fixtures ascending by deleted-base count
+{
+    // create test data
+    auto records = tests::makeMappedRecords();
+
+    // sanity checks on initial conditions
+    EXPECT_EQ(6u, records.size());
+    EXPECT_EQ(0, records.at(0).NumDeletedBases());
+    EXPECT_EQ(3, records.at(1).NumDeletedBases());
+    EXPECT_EQ(3, records.at(2).NumDeletedBases());
+    EXPECT_EQ(0, records.at(3).NumDeletedBases());
+    EXPECT_EQ(3, records.at(4).NumDeletedBases());
+    EXPECT_EQ(3, records.at(5).NumDeletedBases());
+
+    // sort & check
+    std::sort(records.begin(), records.end(), Compare::NumDeletedBases());
+    EXPECT_EQ(0, records.at(0).NumDeletedBases());
+    EXPECT_EQ(0, records.at(1).NumDeletedBases());
+    EXPECT_EQ(3, records.at(2).NumDeletedBases());
+    EXPECT_EQ(3, records.at(3).NumDeletedBases());
+    EXPECT_EQ(3, records.at(4).NumDeletedBases());
+    EXPECT_EQ(3, records.at(5).NumDeletedBases());
+}
+
+TEST(CompareTest, NumInsertedBasesOk)  // sorts the mapped fixtures ascending by inserted-base count
+{
+    // create test data
+    auto records = tests::makeMappedRecords();
+
+    // sanity checks on initial conditions
+    EXPECT_EQ(6u, records.size());
+    EXPECT_EQ(0, records.at(0).NumInsertedBases());
+    EXPECT_EQ(0, records.at(1).NumInsertedBases());
+    EXPECT_EQ(2, records.at(2).NumInsertedBases());
+    EXPECT_EQ(0, records.at(3).NumInsertedBases());
+    EXPECT_EQ(0, records.at(4).NumInsertedBases());
+    EXPECT_EQ(2, records.at(5).NumInsertedBases());
+
+    // sort & check
+    std::sort(records.begin(), records.end(), Compare::NumInsertedBases());
+    EXPECT_EQ(0, records.at(0).NumInsertedBases());
+    EXPECT_EQ(0, records.at(1).NumInsertedBases());
+    EXPECT_EQ(0, records.at(2).NumInsertedBases());
+    EXPECT_EQ(0, records.at(3).NumInsertedBases());
+    EXPECT_EQ(2, records.at(4).NumInsertedBases());
+    EXPECT_EQ(2, records.at(5).NumInsertedBases());
+}
+
+TEST(CompareTest, NumMatchesOk)  // sorts the mapped fixtures ascending by match count
+{
+    // create test data
+    auto records = tests::makeMappedRecords();
+
+    // sanity checks on initial conditions
+    EXPECT_EQ(6u, records.size());
+    EXPECT_EQ(10, records.at(0).NumMatches());
+    EXPECT_EQ(10, records.at(1).NumMatches());
+    EXPECT_EQ(6, records.at(2).NumMatches());
+    EXPECT_EQ(10, records.at(3).NumMatches());
+    EXPECT_EQ(10, records.at(4).NumMatches());
+    EXPECT_EQ(6, records.at(5).NumMatches());
+
+    // sort & check
+    std::sort(records.begin(), records.end(), Compare::NumMatches());
+    EXPECT_EQ(6, records.at(0).NumMatches());
+    EXPECT_EQ(6, records.at(1).NumMatches());
+    EXPECT_EQ(10, records.at(2).NumMatches());
+    EXPECT_EQ(10, records.at(3).NumMatches());
+    EXPECT_EQ(10, records.at(4).NumMatches());
+    EXPECT_EQ(10, records.at(5).NumMatches());
+}
+
+TEST(CompareTest, NumMismatchesOk)  // sorts the mapped fixtures ascending by mismatch count
+{
+    // create test data
+    auto records = tests::makeMappedRecords();
+
+    // sanity checks on initial conditions
+    EXPECT_EQ(6u, records.size());
+    EXPECT_EQ(0, records.at(0).NumMismatches());
+    EXPECT_EQ(0, records.at(1).NumMismatches());
+    EXPECT_EQ(2, records.at(2).NumMismatches());
+    EXPECT_EQ(0, records.at(3).NumMismatches());
+    EXPECT_EQ(0, records.at(4).NumMismatches());
+    EXPECT_EQ(2, records.at(5).NumMismatches());
+
+    // sort & check
+    std::sort(records.begin(), records.end(), Compare::NumMismatches());
+    EXPECT_EQ(0, records.at(0).NumMismatches());
+    EXPECT_EQ(0, records.at(1).NumMismatches());
+    EXPECT_EQ(0, records.at(2).NumMismatches());
+    EXPECT_EQ(0, records.at(3).NumMismatches());
+    EXPECT_EQ(2, records.at(4).NumMismatches());
+    EXPECT_EQ(2, records.at(5).NumMismatches());
+}
+
+TEST(CompareTest, QueryEndOk)  // sorts ascending by the "qe" tag value
+{
+    Position q1 = 30;
+    Position q2 = 20;
+    Position q3 = 40;
+    Position q4 = 10;
+
+    auto records = vector<BamRecord>
+    {
+        tests::makeRecordWithTag("qe", Tag(q1)),
+        tests::makeRecordWithTag("qe", Tag(q2)),
+        tests::makeRecordWithTag("qe", Tag(q3)),
+        tests::makeRecordWithTag("qe", Tag(q4))
+    };
+    std::sort(records.begin(), records.end(), Compare::QueryEnd());
+
+    EXPECT_EQ(q4, records.at(0).QueryEnd());
+    EXPECT_EQ(q2, records.at(1).QueryEnd());
+    EXPECT_EQ(q1, records.at(2).QueryEnd());
+    EXPECT_EQ(q3, records.at(3).QueryEnd());
+}
+
+TEST(CompareTest, QueryStartOk)  // sorts ascending by the "qs" tag value
+{
+    Position q1 = 30;
+    Position q2 = 20;
+    Position q3 = 40;
+    Position q4 = 10;
+
+    auto records = vector<BamRecord>
+    {
+        tests::makeRecordWithTag("qs", Tag(q1)),
+        tests::makeRecordWithTag("qs", Tag(q2)),
+        tests::makeRecordWithTag("qs", Tag(q3)),
+        tests::makeRecordWithTag("qs", Tag(q4))
+    };
+    std::sort(records.begin(), records.end(), Compare::QueryStart());
+
+    EXPECT_EQ(q4, records.at(0).QueryStart());
+    EXPECT_EQ(q2, records.at(1).QueryStart());
+    EXPECT_EQ(q1, records.at(2).QueryStart());
+    EXPECT_EQ(q3, records.at(3).QueryStart());
+}
+
+TEST(CompareTest, ReadGroupIdOk)  // sorts by the read-group ID string
+{
+    auto rg1 = ReadGroupInfo { "foo", "SUBREAD" };
+    auto rg2 = ReadGroupInfo { "bar", "SUBREAD" };
+    auto rg3 = ReadGroupInfo { "c", "SUBREAD" };
+    auto rg4 = ReadGroupInfo { "d", "SUBREAD" };
+
+    BamHeader header;
+    header.AddReadGroup(rg1)
+          .AddReadGroup(rg2)
+          .AddReadGroup(rg3)
+          .AddReadGroup(rg4);
+
+    BamRecord r1(header); r1.ReadGroup(rg3); // -> 99365356
+    BamRecord r2(header); r2.ReadGroup(rg2); // -> d9f305e4
+    BamRecord r3(header); r3.ReadGroup(rg4); // -> 54397cd6
+    BamRecord r4(header); r4.ReadGroup(rg1); // -> a60ddc69
+
+    auto records = vector<BamRecord>{ r1, r2, r3, r4 };
+    std::sort(records.begin(), records.end(), Compare::ReadGroupId()); // lexical, NOT numeric ordering
+
+    EXPECT_EQ(r3.ReadGroupId(), records.at(0).ReadGroupId());
+    EXPECT_EQ(r1.ReadGroupId(), records.at(1).ReadGroupId());
+    EXPECT_EQ(r4.ReadGroupId(), records.at(2).ReadGroupId());
+    EXPECT_EQ(r2.ReadGroupId(), records.at(3).ReadGroupId());
+}
+
+TEST(CompareTest, ReadGroupNumericIdOk)  // sorts by the read-group ID interpreted as a signed integer
+{
+    auto rg1 = ReadGroupInfo { "a", "SUBREAD" };
+    auto rg2 = ReadGroupInfo { "b", "SUBREAD" };
+    auto rg3 = ReadGroupInfo { "c", "SUBREAD" };
+    auto rg4 = ReadGroupInfo { "d", "SUBREAD" };
+
+    BamHeader header;
+    header.AddReadGroup(rg1)
+          .AddReadGroup(rg2)
+          .AddReadGroup(rg3)
+          .AddReadGroup(rg4);
+
+    BamRecord r1(header); r1.ReadGroup(rg3); // -> -1724492970
+    BamRecord r2(header); r2.ReadGroup(rg2); // -> 235381373
+    BamRecord r3(header); r3.ReadGroup(rg4); // -> 1413053654
+    BamRecord r4(header); r4.ReadGroup(rg1); // -> 1153643386
+
+    auto records = vector<BamRecord>{ r1, r2, r3, r4 };
+    std::sort(records.begin(), records.end(), Compare::ReadGroupNumericId()); // numeric ordering
+
+    EXPECT_EQ(r1.ReadGroupNumericId(), records.at(0).ReadGroupNumericId());
+    EXPECT_EQ(r2.ReadGroupNumericId(), records.at(1).ReadGroupNumericId());
+    EXPECT_EQ(r4.ReadGroupNumericId(), records.at(2).ReadGroupNumericId());
+    EXPECT_EQ(r3.ReadGroupNumericId(), records.at(3).ReadGroupNumericId());
+}
+
+TEST(CompareTest, ReadAccuracyOk)  // sorts ascending by the "rq" tag value
+{
+    Accuracy a1 = 30;
+    Accuracy a2 = 20;
+    Accuracy a3 = 40;
+    Accuracy a4 = 10;
+
+    auto records = vector<BamRecord>
+    {
+        tests::makeRecordWithTag("rq", Tag(a1)),
+        tests::makeRecordWithTag("rq", Tag(a2)),
+        tests::makeRecordWithTag("rq", Tag(a3)),
+        tests::makeRecordWithTag("rq", Tag(a4))
+    };
+    std::sort(records.begin(), records.end(), Compare::ReadAccuracy());
+
+    EXPECT_EQ(a4, records.at(0).ReadAccuracy());
+    EXPECT_EQ(a2, records.at(1).ReadAccuracy());
+    EXPECT_EQ(a1, records.at(2).ReadAccuracy());
+    EXPECT_EQ(a3, records.at(3).ReadAccuracy());
+}
+
+TEST(CompareTest, ReferenceEndOk)  // sorts the mapped fixtures ascending by reference end
+{
+    // create test data
+    auto records = tests::makeMappedRecords();
+
+    // sanity checks on initial conditions
+    EXPECT_EQ(6u, records.size());
+    EXPECT_EQ(110, records.at(0).ReferenceEnd());
+    EXPECT_EQ(113, records.at(1).ReferenceEnd());
+    EXPECT_EQ(111, records.at(2).ReferenceEnd());
+    EXPECT_EQ(110, records.at(3).ReferenceEnd());
+    EXPECT_EQ(113, records.at(4).ReferenceEnd());
+    EXPECT_EQ(111, records.at(5).ReferenceEnd());
+
+    // sort & check
+    std::sort(records.begin(), records.end(), Compare::ReferenceEnd());
+    EXPECT_EQ(110, records.at(0).ReferenceEnd());
+    EXPECT_EQ(110, records.at(1).ReferenceEnd());
+    EXPECT_EQ(111, records.at(2).ReferenceEnd());
+    EXPECT_EQ(111, records.at(3).ReferenceEnd());
+    EXPECT_EQ(113, records.at(4).ReferenceEnd());
+    EXPECT_EQ(113, records.at(5).ReferenceEnd());
+}
+
+TEST(CompareTest, ReferenceIdOk)  // sorts ascending by reference ID
+{
+    BamRecord r1; r1.Impl().ReferenceId(30);
+    BamRecord r2; r2.Impl().ReferenceId(20);
+    BamRecord r3; r3.Impl().ReferenceId(40);
+    BamRecord r4; r4.Impl().ReferenceId(10);
+
+    auto records = vector<BamRecord>{ r1, r2, r3, r4 };
+    std::sort(records.begin(), records.end(), Compare::ReferenceId());
+
+    EXPECT_EQ(r4.ReferenceId(), records.at(0).ReferenceId());
+    EXPECT_EQ(r2.ReferenceId(), records.at(1).ReferenceId());
+    EXPECT_EQ(r1.ReferenceId(), records.at(2).ReferenceId());
+    EXPECT_EQ(r3.ReferenceId(), records.at(3).ReferenceId());
+}
+
+TEST(CompareTest, ReferenceNameOk)  // sorts ascending by the header sequence name each record maps to
+{
+    auto seq1 = SequenceInfo { "seq1" };
+    auto seq2 = SequenceInfo { "seq2" };
+    auto seq3 = SequenceInfo { "seq3" };
+    auto seq4 = SequenceInfo { "seq4" };
+
+    BamHeader header;
+    header.AddSequence(seq1) // -> 0
+          .AddSequence(seq2) // -> 1
+          .AddSequence(seq3) // -> 2
+          .AddSequence(seq4); // -> 3
+
+    BamRecord r1(header); r1.Impl().SetMapped(true); r1.Impl().ReferenceId(2);
+    BamRecord r2(header); r2.Impl().SetMapped(true); r2.Impl().ReferenceId(1);
+    BamRecord r3(header); r3.Impl().SetMapped(true); r3.Impl().ReferenceId(3);
+    BamRecord r4(header); r4.Impl().SetMapped(true); r4.Impl().ReferenceId(0);
+
+    auto records = vector<BamRecord>{ r1, r2, r3, r4 };
+    std::sort(records.begin(), records.end(), Compare::ReferenceName());
+
+    EXPECT_EQ(seq1.Name(), records.at(0).ReferenceName());
+    EXPECT_EQ(seq2.Name(), records.at(1).ReferenceName());
+    EXPECT_EQ(seq3.Name(), records.at(2).ReferenceName());
+    EXPECT_EQ(seq4.Name(), records.at(3).ReferenceName());
+}
+
+TEST(CompareTest, ReferenceStartOk)  // sorts ascending by reference start position
+{
+    BamRecord r1; r1.Impl().Position(30);
+    BamRecord r2; r2.Impl().Position(20);
+    BamRecord r3; r3.Impl().Position(40);
+    BamRecord r4; r4.Impl().Position(10);
+
+    auto records = vector<BamRecord>{ r1, r2, r3, r4 };
+    std::sort(records.begin(), records.end(), Compare::ReferenceStart());
+
+    EXPECT_EQ(r4.ReferenceStart(), records.at(0).ReferenceStart());
+    EXPECT_EQ(r2.ReferenceStart(), records.at(1).ReferenceStart());
+    EXPECT_EQ(r1.ReferenceStart(), records.at(2).ReferenceStart());
+    EXPECT_EQ(r3.ReferenceStart(), records.at(3).ReferenceStart());
+}
+
+TEST(CompareTest, ZmwOk)  // sorts ascending by the "zm" tag, read back through HoleNumber()
+{
+    int32_t z1 = 30;
+    int32_t z2 = 20;
+    int32_t z3 = 40;
+    int32_t z4 = 10;
+
+    auto records = vector<BamRecord>
+    {
+        tests::makeRecordWithTag("zm", Tag(z1)),
+        tests::makeRecordWithTag("zm", Tag(z2)),
+        tests::makeRecordWithTag("zm", Tag(z3)),
+        tests::makeRecordWithTag("zm", Tag(z4))
+    };
+    std::sort(records.begin(), records.end(), Compare::Zmw());
+
+    EXPECT_EQ(z4, records.at(0).HoleNumber());
+    EXPECT_EQ(z2, records.at(1).HoleNumber());
+    EXPECT_EQ(z1, records.at(2).HoleNumber());
+    EXPECT_EQ(z3, records.at(3).HoleNumber());
+}
diff --git a/tests/src/test_DataSetCore.cpp b/tests/src/test_DataSetCore.cpp
new file mode 100644
index 0000000..df9eae2
--- /dev/null
+++ b/tests/src/test_DataSetCore.cpp
@@ -0,0 +1,537 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. + +// Author: Derek Barnett + +#ifdef PBBAM_TESTING +#define private public +#endif + +#include +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace std; + +namespace tests { + +static inline +DataSet CreateDataSet(void) +{ + DataSet d; + d.Name("foo"); + return d; +} + +} // namespace tests + +TEST(DataSetCoreTest, XmlNameParts) +{ + internal::XmlName name("ns:node_name"); + EXPECT_EQ(boost::string_ref("ns"), name.Prefix()); + EXPECT_EQ(boost::string_ref("node_name"), name.LocalName()); + EXPECT_EQ(boost::string_ref("ns:node_name"), name.QualifiedName()); + + internal::XmlName bareName("node_name"); + EXPECT_EQ(boost::string_ref(""), bareName.Prefix()); + EXPECT_EQ(boost::string_ref("node_name"), bareName.LocalName()); + EXPECT_EQ(boost::string_ref("node_name"), bareName.QualifiedName()); + + internal::XmlName leadingColon(":node_name"); + EXPECT_EQ(boost::string_ref(""), leadingColon.Prefix()); + EXPECT_EQ(boost::string_ref(":node_name"), leadingColon.LocalName()); + EXPECT_EQ(boost::string_ref(":node_name"), 
leadingColon.QualifiedName()); +} + +TEST(DataSetCoreTest, DefaultsOk) +{ + DataSet dataset; + EXPECT_EQ(DataSet::GENERIC, dataset.Type()); + EXPECT_FALSE(dataset.CreatedAt().empty()); + EXPECT_FALSE(dataset.MetaType().empty()); + EXPECT_FALSE(dataset.TimeStampedName().empty()); + EXPECT_FALSE(dataset.UniqueId().empty()); + EXPECT_FALSE(dataset.Version().empty()); + + EXPECT_EQ(0, dataset.TimeStampedName().find("pacbio_dataset_")); + + EXPECT_TRUE(dataset.Format().empty()); + EXPECT_TRUE(dataset.ModifiedAt().empty()); + EXPECT_TRUE(dataset.Name().empty()); + EXPECT_TRUE(dataset.ResourceId().empty()); + EXPECT_TRUE(dataset.Tags().empty()); + EXPECT_EQ(0, dataset.ExternalResources().Size()); + EXPECT_EQ(0, dataset.Filters().Size()); + EXPECT_EQ(0, dataset.SubDataSets().Size()); + + EXPECT_EQ(string{"3.0.1"}, dataset.Version()); +} + +TEST(DataSetCoreTest, TimeStampedNamesOk) +{ + DataSet dataset; + AlignmentSet alignmentSet; + BarcodeSet barcodeSet; + ContigSet contigSet; + ConsensusAlignmentSet consensusAlignmentSet; + ConsensusReadSet consensusReadSet; + HdfSubreadSet hdfSubreadSet; + ReferenceSet referenceSet; + SubreadSet subreadSet; + + EXPECT_EQ(0, dataset.TimeStampedName().find("pacbio_dataset_dataset-")); + EXPECT_EQ(0, alignmentSet.TimeStampedName().find("pacbio_dataset_alignmentset-")); + EXPECT_EQ(0, barcodeSet.TimeStampedName().find("pacbio_dataset_barcodeset-")); + EXPECT_EQ(0, contigSet.TimeStampedName().find("pacbio_dataset_contigset-")); + EXPECT_EQ(0, consensusAlignmentSet.TimeStampedName().find("pacbio_dataset_consensusalignmentset-")); + EXPECT_EQ(0, consensusReadSet.TimeStampedName().find("pacbio_dataset_consensusreadset-")); + EXPECT_EQ(0, hdfSubreadSet.TimeStampedName().find("pacbio_dataset_hdfsubreadset-")); + EXPECT_EQ(0, referenceSet.TimeStampedName().find("pacbio_dataset_referenceset-")); + EXPECT_EQ(0, subreadSet.TimeStampedName().find("pacbio_dataset_subreadset-")); +} + +TEST(DataSetCoreTest, BasicGettersSettersOk) +{ + DataSet dataset; + 
dataset.CreatedAt("now"); + dataset.Format("format"); + dataset.MetaType("meta"); + dataset.ModifiedAt("later"); + dataset.Name("foo"); + dataset.ResourceId("path/to/file"); + dataset.Tags("tag tag"); + dataset.TimeStampedName("now:30"); + dataset.UniqueId("uuid"); + dataset.Version("0.0.0"); + + EXPECT_EQ(string("now"), dataset.CreatedAt()); + EXPECT_EQ(string("format"), dataset.Format()); + EXPECT_EQ(string("meta"), dataset.MetaType()); + EXPECT_EQ(string("later"), dataset.ModifiedAt()); + EXPECT_EQ(string("foo"), dataset.Name()); + EXPECT_EQ(string("path/to/file"), dataset.ResourceId()); + EXPECT_EQ(string("tag tag"), dataset.Tags()); + EXPECT_EQ(string("now:30"), dataset.TimeStampedName()); + EXPECT_EQ(string("uuid"), dataset.UniqueId()); + EXPECT_EQ(string("0.0.0"), dataset.Version()); +} + +TEST(DataSetCoreTest, CopyOk) +{ + DataSet d1; + d1.Name("foo"); + + // copy ctor + DataSet d2(d1); + EXPECT_EQ(string("foo"), d2.Name()); + + // copy assignment + DataSet d3; + d3 = d1; + EXPECT_EQ(string("foo"), d3.Name()); +} + +TEST(DataSetCoreTest, MoveOk) +{ + DataSet d1; + d1.Name("foo"); + + // move ctor +#ifdef __clang__ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wpessimizing-move" +#endif + DataSet d2(std::move(tests::CreateDataSet())); +#ifdef __clang__ +#pragma clang diagnostic pop +#endif + EXPECT_EQ(string("foo"), d2.Name()); + + // move assignment + DataSet d3; +#ifdef __clang__ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wpessimizing-move" +#endif + d3 = std::move(tests::CreateDataSet()); +#ifdef __clang__ +#pragma clang diagnostic pop +#endif + EXPECT_EQ(string("foo"), d3.Name()); +} + +TEST(DataSetCoreTest, AddExternalResources) +{ + DataSet dataset; + EXPECT_EQ(0, dataset.ExternalResources().Size()); + + ExternalResource resource1("metatype", "id"); + resource1.Name("file1"); + + ExternalResource resource2("metatype", "id2"); + resource2.Name("file2"); + + dataset.ExternalResources().Add(resource1); + 
dataset.ExternalResources().Add(resource2); + EXPECT_EQ(2, dataset.ExternalResources().Size()); + + // disallow duplicates (checking on ResourceId) + ExternalResource duplicateResource("metatype", "id"); + dataset.ExternalResources().Add(duplicateResource); + EXPECT_EQ(2, dataset.ExternalResources().Size()); + + // direct access + const ExternalResources& resources = dataset.ExternalResources(); + EXPECT_EQ(string("file1"), resources[0].Name()); + EXPECT_EQ(string("file2"), resources[1].Name()); + + // iterable (bind by const reference so no element is copied per iteration) + size_t i = 0; + for (const auto& r : resources) { + if (i == 0) + EXPECT_EQ(string("file1"), r.Name()); + else + EXPECT_EQ(string("file2"), r.Name()); + ++i; + } +} + +TEST(DataSetCoreTest, EditExternalResources) +{ + DataSet dataset; + + ExternalResource resource("metatype", "id"); + resource.Name("file1"); + dataset.ExternalResources().Add(resource); + + resource.Name("file2").ResourceId("id2"); + dataset.ExternalResources().Add(resource); + EXPECT_EQ(2, dataset.ExternalResources().Size()); + + // edit + dataset.ExternalResources()[0].Name("some new name"); + EXPECT_EQ(string("some new name"), dataset.ExternalResources()[0].Name()); + EXPECT_EQ(string("file2"), dataset.ExternalResources()[1].Name()); +} + +TEST(DataSetCoreTest, NestedExternalResources) +{ + ExternalResource resource("metatype", "filename"); + resource.ExternalResources().Add(ExternalResource("metatype.child", "filename.child")); + resource.ExternalResources().Add(ExternalResource("metatype.child2", "filename.child2")); + + const ExternalResources& childResources = resource.ExternalResources(); + EXPECT_EQ(2, childResources.Size()); + EXPECT_EQ(string("metatype.child"), childResources[0].MetaType()); + EXPECT_EQ(string("metatype.child2"), childResources[1].MetaType()); + EXPECT_EQ(string("filename.child"), childResources[0].ResourceId()); + EXPECT_EQ(string("filename.child2"), childResources[1].ResourceId()); +} + +TEST(DataSetCoreTest, AddFilters) +{ + DataSet dataset; + EXPECT_EQ(0, 
dataset.Filters().Size()); + + Filter filter; + filter.Properties().Add(Property("rq", "0.85", ">")); + filter.Properties().Add(Property("RNAME", "chr1", "==")); + EXPECT_EQ(2, filter.Properties().Size()); + + Filter filter2; + filter2.Properties().Add(Property("rq", "0.50", ">=")); + filter2.Properties().Add(Property("RNAME", "chr2", "!=")); + EXPECT_EQ(2, filter2.Properties().Size()); + + dataset.Filters().Add(filter); + dataset.Filters().Add(filter2); + + const Filters& filters = dataset.Filters(); + EXPECT_EQ(2, filters.Size()); + EXPECT_EQ(2, filters[0].Properties().Size()); + EXPECT_EQ(2, filters[1].Properties().Size()); + + // direct access + const Property& p0 = filters[0].Properties()[0]; + EXPECT_EQ(string("rq"), p0.Name()); + EXPECT_EQ(string("0.85"), p0.Value()); + EXPECT_EQ(string(">"), p0.Operator()); + + const Property& p1 = filters[0].Properties()[1]; + EXPECT_EQ(string("RNAME"), p1.Name()); + EXPECT_EQ(string("chr1"), p1.Value()); + EXPECT_EQ(string("=="), p1.Operator()); + + const Property& p2 = filters[1].Properties()[0]; + EXPECT_EQ(string("rq"), p2.Name()); + EXPECT_EQ(string("0.50"), p2.Value()); + EXPECT_EQ(string(">="), p2.Operator()); + + const Property& p3 = filters[1].Properties()[1]; + EXPECT_EQ(string("RNAME"), p3.Name()); + EXPECT_EQ(string("chr2"), p3.Value()); + EXPECT_EQ(string("!="), p3.Operator()); + + // iterable (i indexes the filter, j the property within the current filter) + size_t i = 0; + size_t j = 0; + for (const Filter& f : filters) { + if (i == 0) { + const Properties& properties = f.Properties(); + for (const Property& p : properties) { + if (j == 0) { + EXPECT_EQ(string("rq"), p.Name()); + EXPECT_EQ(string("0.85"), p.Value()); + EXPECT_EQ(string(">"), p.Operator()); + } else { + EXPECT_EQ(string("RNAME"), p.Name()); + EXPECT_EQ(string("chr1"), p.Value()); + EXPECT_EQ(string("=="), p.Operator()); + } + ++j; + } + } else { + const Properties& properties = f.Properties(); + for (const Property& p : properties) { + if (j == 0) { + EXPECT_EQ(string("rq"), p.Name()); + 
EXPECT_EQ(string("0.50"), p.Value()); + EXPECT_EQ(string(">="), p.Operator()); + } else { + EXPECT_EQ(string("RNAME"), p.Name()); + EXPECT_EQ(string("chr2"), p.Value()); + EXPECT_EQ(string("!="), p.Operator()); + } + ++j; + } + } + ++i; + j = 0; + } + +} + +TEST(DataSetCoreTest, EditFilters) +{ + DataSet dataset; + EXPECT_EQ(0, dataset.Filters().Size()); + + Filter filter; + filter.Properties().Add(Property("rq", "0.85", ">")); + filter.Properties().Add(Property("RNAME", "chr1", "==")); + EXPECT_EQ(2, filter.Properties().Size()); + + Filter filter2; + filter2.Properties().Add(Property("rq", "0.50", ">=")); + filter2.Properties().Add(Property("RNAME", "chr2", "!=")); + EXPECT_EQ(2, filter2.Properties().Size()); + + dataset.Filters().Add(filter); + dataset.Filters().Add(filter2); + EXPECT_EQ(2, dataset.Filters().Size()); + EXPECT_EQ(2, dataset.Filters()[0].Properties().Size()); + EXPECT_EQ(2, dataset.Filters()[1].Properties().Size()); + + // edit property in-place + Property& p = dataset.Filters()[0].Properties()[0]; + p.Name("someNewName"); + p.Value("someNewValue"); + p.Operator("=="); + + const Property& p0 = dataset.Filters()[0].Properties()[0]; + EXPECT_EQ(string("someNewName"), p0.Name()); + EXPECT_EQ(string("someNewValue"), p0.Value()); + EXPECT_EQ(string("=="), p0.Operator()); + + const Property& p1 = dataset.Filters()[0].Properties()[1]; + EXPECT_EQ(string("RNAME"), p1.Name()); + EXPECT_EQ(string("chr1"), p1.Value()); + EXPECT_EQ(string("=="), p1.Operator()); + + const Property& p2 = dataset.Filters()[1].Properties()[0]; + EXPECT_EQ(string("rq"), p2.Name()); + EXPECT_EQ(string("0.50"), p2.Value()); + EXPECT_EQ(string(">="), p2.Operator()); + + const Property& p3 = dataset.Filters()[1].Properties()[1]; + EXPECT_EQ(string("RNAME"), p3.Name()); + EXPECT_EQ(string("chr2"), p3.Value()); + EXPECT_EQ(string("!="), p3.Operator()); +} + +TEST(DataSetCoreTest, AddSubDataSets) +{ + DataSet dataset; + EXPECT_EQ(0, dataset.SubDataSets().Size()); + + DataSetBase sub1; + 
sub1.Name("subset_1"); + + DataSetBase sub2; + sub2.Name("subset_2"); + + dataset.SubDataSets().Add(sub1); + dataset.SubDataSets().Add(sub2); + EXPECT_EQ(2, dataset.SubDataSets().Size()); + + // direct access + const SubDataSets& subdatasets = dataset.SubDataSets(); + EXPECT_EQ(string("subset_1"), subdatasets[0].Name()); + EXPECT_EQ(string("subset_2"), subdatasets[1].Name()); + + // iterable + size_t i = 0; + for (const DataSetBase& ds : subdatasets) { + if (i == 0) + EXPECT_EQ(string("subset_1"), ds.Name()); + else + EXPECT_EQ(string("subset_2"), ds.Name()); + ++i; + } +} + +TEST(DataSetCoreTest, EditSubDataSets) +{ + DataSet dataset; + EXPECT_EQ(0, dataset.SubDataSets().Size()); + + DataSetBase sub1; + sub1.Name("subset_1"); + + DataSetBase sub2; + sub2.Name("subset_2"); + + dataset.SubDataSets().Add(sub1); + dataset.SubDataSets().Add(sub2); + EXPECT_EQ(2, dataset.SubDataSets().Size()); + + // edit + dataset.SubDataSets()[0].Name("subset_1_edited"); + + // direct access + const SubDataSets& subdatasets = dataset.SubDataSets(); + EXPECT_EQ(string("subset_1_edited"), subdatasets[0].Name()); + EXPECT_EQ(string("subset_2"), subdatasets[1].Name()); + + // iterable + size_t i = 0; + for (const DataSetBase& ds : subdatasets) { + if (i == 0) + EXPECT_EQ(string("subset_1_edited"), ds.Name()); + else + EXPECT_EQ(string("subset_2"), ds.Name()); + ++i; + } +} + +TEST(DataSetCoreTest, RemoveExternalResources) +{ + DataSet dataset; + EXPECT_EQ(0, dataset.ExternalResources().Size()); + + ExternalResource resource1("metatype", "id"); + resource1.Name("file1"); + + ExternalResource resource2("metatype", "id2"); + resource2.Name("file2"); + + dataset.ExternalResources().Add(resource1); + dataset.ExternalResources().Add(resource2); + EXPECT_EQ(2, dataset.ExternalResources().Size()); + + // remove + dataset.ExternalResources().Remove(resource1); + EXPECT_EQ(1, dataset.ExternalResources().Size()); + + // direct access + const ExternalResources& resources = 
dataset.ExternalResources(); + EXPECT_EQ(string("file2"), resources[0].Name()); + + // iterable (bind by const reference so no element is copied per iteration) + size_t i = 0; + for (const auto& r : resources) { + if (i == 0) + EXPECT_EQ(string("file2"), r.Name()); + ++i; + } +} + +TEST(DataSetCoreTest, RemoveFilters) +{ + DataSet dataset; + EXPECT_EQ(0, dataset.Filters().Size()); + + Filter filter; + filter.Properties().Add(Property("rq", "0.85", ">")); + filter.Properties().Add(Property("RNAME", "chr1", "==")); + EXPECT_EQ(2, filter.Properties().Size()); + + Filter filter2; + filter2.Properties().Add(Property("rq", "0.50", ">=")); + filter2.Properties().Add(Property("RNAME", "chr2", "!=")); + EXPECT_EQ(2, filter2.Properties().Size()); + + dataset.Filters().Add(filter); + dataset.Filters().Add(filter2); + EXPECT_EQ(2, dataset.Filters().Size()); + + // remove + dataset.Filters().Remove(filter); + EXPECT_EQ(1, dataset.Filters().Size()); + + const Filters& filters = dataset.Filters(); + EXPECT_EQ(2, filters[0].Properties().Size()); +} + +TEST(DataSetCoreTest, RemoveSubDataSets) +{ + DataSet dataset; + EXPECT_EQ(0, dataset.SubDataSets().Size()); + + DataSetBase sub1; + sub1.Name("subset_1"); + + DataSetBase sub2; + sub2.Name("subset_2"); + + dataset.SubDataSets().Add(sub1); + dataset.SubDataSets().Add(sub2); + EXPECT_EQ(2, dataset.SubDataSets().Size()); + + // remove + dataset.SubDataSets().Remove(sub2); + EXPECT_EQ(1, dataset.SubDataSets().Size()); +} diff --git a/tests/src/test_DataSetIO.cpp b/tests/src/test_DataSetIO.cpp new file mode 100644 index 0000000..8f2adb5 --- /dev/null +++ b/tests/src/test_DataSetIO.cpp @@ -0,0 +1,1516 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. 
+ +// Author: Derek Barnett + +#ifdef PBBAM_TESTING +#define private public +#endif + +#include "TestData.h" +#include "../src/FileUtils.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace std; + +const string alignedBamFn = tests::Data_Dir + "/aligned.bam"; +const string bamGroupFofn = tests::Generated_Dir + "/group.fofn"; + +const string ali1XmlFn = tests::Data_Dir + "/dataset/ali1.xml"; +const string ali2XmlFn = tests::Data_Dir + "/dataset/ali2.xml"; +const string ali3XmlFn = tests::Data_Dir + "/dataset/ali3.xml"; +const string ali4XmlFn = tests::Data_Dir + "/dataset/ali4.xml"; +const string mappingStaggeredXmlFn = tests::Data_Dir + "/dataset/bam_mapping_staggered.xml"; +const string barcodeXmlFn = tests::Data_Dir + "/dataset/barcode.dataset.xml"; +const string ccsReadXmlFn = tests::Data_Dir + "/dataset/ccsread.dataset.xml"; +const string lambdaContigsXmlFn = tests::Data_Dir + "/dataset/lambda_contigs.xml"; +const string pbalchemyXmlFn = tests::Data_Dir + "/dataset/pbalchemy10kbp.xml"; +const string referenceXmlFn = tests::Data_Dir + "/dataset/reference.dataset.xml"; +const string subread1XmlFn = tests::Data_Dir + "/dataset/subread_dataset1.xml"; +const string subread2XmlFn = tests::Data_Dir + "/dataset/subread_dataset2.xml"; +const string subread3XmlFn = tests::Data_Dir + "/dataset/subread_dataset3.xml"; +const string transformedXmlFn = tests::Data_Dir + "/dataset/transformed_rs_subread_dataset.xml"; + +static void TestFromXmlString(void); +static void TestAli1Xml(void); +static void TestAli2Xml(void); +static void TestAli3Xml(void); +static void TestAli4Xml(void); +static void TestMappingStaggeredXml(void); +static void TestBarcodeXml(void); +static void TestCcsReadXml(void); +static void TestLambdaContigsXml(void); +static void TestPbalchemyXml(void); +static void TestReferenceXml(void); +static void TestSubread1Xml(void); +static void 
TestSubread2Xml(void); +static void TestSubread3Xml(void); +static void TestTransformedXml(void); + +static inline +void changeCurrentDirectory(const std::string& dir) +{ ASSERT_EQ(0, chdir(dir.c_str())); } + +TEST(DataSetIOTest, FromBamFilename) +{ + DataSet dataset(alignedBamFn); + + EXPECT_EQ(1, dataset.ExternalResources().Size()); + const ExternalResource& bamRef = dataset.ExternalResources()[0]; + + EXPECT_EQ(alignedBamFn, bamRef.ResourceId()); +} + +TEST(DataSetIOTest, FromBamFilenames) +{ + std::ifstream fofn(bamGroupFofn); + std::vector<std::string> files; // one filename per non-empty line of the FOFN + std::string file; + while (std::getline(fofn, file)) if (!file.empty()) files.emplace_back(file); + DataSet dataset(files); + EXPECT_EQ(3, dataset.ExternalResources().Size()); +} + +TEST(DataSetIOTest, FromBamFileObject) +{ + BamFile bamFile(alignedBamFn); + DataSet dataset(bamFile.Filename()); + + EXPECT_EQ(1, dataset.ExternalResources().Size()); + const ExternalResource& bamRef = dataset.ExternalResources()[0]; + + EXPECT_EQ(alignedBamFn, bamRef.ResourceId()); +} + +TEST(DataSetIOTest, FromFofn) +{ + DataSet dataset(bamGroupFofn); + EXPECT_EQ(3, dataset.ExternalResources().Size()); +} + +TEST(DataSetIOTest, FromXml) +{ + EXPECT_NO_THROW(TestFromXmlString()); +} + +TEST(DataSetIOTest, FromXmlFile) +{ + EXPECT_NO_THROW(TestAli1Xml()); + EXPECT_NO_THROW(TestAli2Xml()); + EXPECT_NO_THROW(TestAli3Xml()); + EXPECT_NO_THROW(TestAli4Xml()); + EXPECT_NO_THROW(TestMappingStaggeredXml()); + EXPECT_NO_THROW(TestBarcodeXml()); + EXPECT_NO_THROW(TestCcsReadXml()); + EXPECT_NO_THROW(TestLambdaContigsXml()); + EXPECT_NO_THROW(TestPbalchemyXml()); + EXPECT_NO_THROW(TestReferenceXml()); + EXPECT_NO_THROW(TestSubread1Xml()); + EXPECT_NO_THROW(TestSubread2Xml()); + EXPECT_NO_THROW(TestSubread3Xml()); + EXPECT_NO_THROW(TestTransformedXml()); +} + +TEST(DataSetIOTest, ThrowsOnNonexistentFofnFile) +{ + EXPECT_THROW(DataSet{"does/not/exist.fofn"}, std::runtime_error); +} + +TEST(DataSetIOTest, ThrowsOnNonexistentXmlFile) +{ + 
EXPECT_THROW(DataSet{"does/not/exist.xml"}, std::runtime_error); +} + +TEST(DataSetIOTest, ToXml) +{ + // top-level data + DataSet dataset(DataSet::ALIGNMENT); + dataset.CreatedAt("2015-01-27T09:00:01"); + dataset.MetaType("PacBio.DataSet.AlignmentSet"); + dataset.Name("DataSet_AlignmentSet"); + dataset.Tags("barcode moreTags mapping mytags"); + dataset.TimeStampedName("my_tsn"); + dataset.UniqueId("b095d0a3-94b8-4918-b3af-a3f81bbe519c"); + dataset.Attribute("xmlns", "http://pacificbiosciences.com/PacBioDatasets.xsd") + .Attribute("xmlns:xsi", "http://www.w3.org/2001/XMLSchema-instance") + .Attribute("xsi:schemaLocation", "http://pacificbiosciences.com/PacBioDatasets.xsd"); + + // external resources + ExternalResource resource1("AlignmentFile.AlignmentBamFile", "file:/mnt/path/to/alignments2.bam"); + resource1.Name("Third Alignments BAM"); + resource1.Description("Points to an example Alignments BAM file."); + resource1.Tags("Example"); + resource1.TimeStampedName("my_tsn"); + resource1.UniqueId("my_uuid"); + FileIndex pbi1("PacBio.Index.PacBioIndex", "file:/mnt/path/to/alignments2.pbi"); + pbi1.TimeStampedName("my_tsn"); + pbi1.UniqueId("my_uuid"); + resource1.FileIndices().Add(pbi1); + dataset.ExternalResources().Add(resource1); + + ExternalResource resource2("AlignmentFile.AlignmentBamFile", "file:./alignments3.bam"); + resource2.Name("Fourth Alignments BAM"); + resource2.Description("Points to another example Alignments BAM file, by relative path."); + resource2.Tags("Example"); + resource2.TimeStampedName("my_tsn"); + resource2.UniqueId("my_uuid"); + FileIndex pbi2("PacBio.Index.PacBioIndex", "file:/mnt/path/to/alignments3.pbi"); + pbi2.TimeStampedName("my_tsn"); + pbi2.UniqueId("my_uuid"); + + resource2.FileIndices().Add(pbi2); + dataset.ExternalResources().Add(resource2); + + // sub-datasets with filters + DataSetBase subDataSet1; + subDataSet1.Name("HighQuality Read Alignments"); + subDataSet1.TimeStampedName("my_tsn"); + 
subDataSet1.UniqueId("ab95d0a3-94b8-4918-b3af-a3f81bbe519c"); + Filter filter1; + filter1.Properties().Add(Property("rq", "0.85", ">")); + subDataSet1.Filters().Add(filter1); + dataset.SubDataSets().Add(subDataSet1); + + DataSetBase subDataSet2; + subDataSet2.Name("Alignments to chromosome 1"); + subDataSet2.TimeStampedName("my_tsn"); + subDataSet2.UniqueId("ac95d0a3-94b8-4918-b3af-a3f81bbe519c"); + Filter filter2; + filter2.Properties().Add(Property("RNAME", "chr1", "==")); + subDataSet2.Filters().Add(filter2); + dataset.SubDataSets().Add(subDataSet2); + + // write dataset + const string expectedXml = + "\n" + "\n" + "\t\n" + "\t\t\n" + "\t\t\t\n" + "\t\t\t\t\n" + "\t\t\t\n" + "\t\t\n" + "\t\t\n" + "\t\t\t\n" + "\t\t\t\t\n" + "\t\t\t\n" + "\t\t\n" + "\t\n" + "\t\n" + "\t\t\n" + "\t\t\t\n" + "\t\t\t\t\n" + "\t\t\t\t\t\n" + "\t\t\t\t\t\t\" Value=\"0.85\" />\n" + "\t\t\t\t\t\n" + "\t\t\t\t\n" + "\t\t\t\n" + "\t\t\n" + "\t\t\n" + "\t\t\t\n" + "\t\t\t\t\n" + "\t\t\t\t\t\n" + "\t\t\t\t\t\t\n" + "\t\t\t\t\t\n" + "\t\t\t\t\n" + "\t\t\t\n" + "\t\t\n" + "\t\n" + "\n"; + + stringstream s; + dataset.SaveToStream(s); + EXPECT_EQ(expectedXml, s.str()); +} + +static void TestFromXmlString(void) +{ + const string inputXml = + "\n" + "\n" + "\t\n" + "\t\t\n" + "\t\t\t\n" + "\t\t\t\t\n" + "\t\t\t\n" + "\t\t\n" + "\t\t\n" + "\t\t\t\n" + "\t\t\t\t\n" + "\t\t\t\n" + "\t\t\n" + "\t\n" + "\t\n" + "\t\t\n" + "\t\t\t\n" + "\t\t\t\t\n" + "\t\t\t\t\t\n" + "\t\t\t\t\t\t\" Value=\"0.85\" />\n" + "\t\t\t\t\t\n" + "\t\t\t\t\n" + "\t\t\t\n" + "\t\t\n" + "\t\t\n" + "\t\t\t\n" + "\t\t\t\t\n" + "\t\t\t\t\t\n" + "\t\t\t\t\t\t\n" + "\t\t\t\t\t\n" + "\t\t\t\t\n" + "\t\t\t\n" + "\t\t\n" + "\t\n" + "\n"; + + const DataSet dataset = DataSet::FromXml(inputXml); + + EXPECT_EQ(DataSet::ALIGNMENT, dataset.Type()); + EXPECT_EQ("2015-01-27T09:00:01", dataset.CreatedAt()); + EXPECT_EQ("PacBio.DataSet.AlignmentSet", dataset.MetaType()); + EXPECT_EQ("DataSet_AlignmentSet", dataset.Name()); + EXPECT_EQ("barcode 
moreTags mapping mytags", dataset.Tags()); + EXPECT_EQ("b095d0a3-94b8-4918-b3af-a3f81bbe519c", dataset.UniqueId()); + EXPECT_EQ("2.3.0", dataset.Version()); + EXPECT_EQ("http://pacificbiosciences.com/PacBioDataModel.xsd", dataset.Attribute("xmlns")); + EXPECT_EQ("http://www.w3.org/2001/XMLSchema-instance", dataset.Attribute("xmlns:xsi")); + + const ExternalResources& resources = dataset.ExternalResources(); + EXPECT_EQ(2, resources.Size()); + + const ExternalResource& resource1 = resources[0]; + EXPECT_EQ("Third Alignments BAM", resource1.Name()); + EXPECT_EQ("Points to an example Alignments BAM file.", resource1.Description()); + EXPECT_EQ("AlignmentFile.AlignmentBamFile", resource1.MetaType()); + EXPECT_EQ("file:/mnt/path/to/alignments2.bam", resource1.ResourceId()); + EXPECT_EQ("Example", resource1.Tags()); + const FileIndices& fileIndices1 = resource1.FileIndices(); + EXPECT_EQ(1, fileIndices1.Size()); + const FileIndex& pbi1 = fileIndices1[0]; + EXPECT_EQ("PacBio.Index.PacBioIndex", pbi1.MetaType()); + EXPECT_EQ("file:/mnt/path/to/alignments2.pbi", pbi1.ResourceId()); + + const ExternalResource& resource2 = resources[1]; + EXPECT_EQ("Fourth Alignments BAM", resource2.Name()); + EXPECT_EQ("Points to another example Alignments BAM file, by relative path.", resource2.Description()); + EXPECT_EQ("AlignmentFile.AlignmentBamFile", resource2.MetaType()); + EXPECT_EQ("file:./alignments3.bam", resource2.ResourceId()); + EXPECT_EQ("Example", resource2.Tags()); + const FileIndices& fileIndices2 = resource2.FileIndices(); + EXPECT_EQ(1, fileIndices2.Size()); + const FileIndex& pbi2 = fileIndices2[0]; + EXPECT_EQ("PacBio.Index.PacBioIndex", pbi2.MetaType()); + EXPECT_EQ("file:/mnt/path/to/alignments3.pbi", pbi2.ResourceId()); + + const SubDataSets& subDatasets = dataset.SubDataSets(); + EXPECT_EQ(2, subDatasets.Size()); + + const DataSetBase& sub1 = subDatasets[0]; + EXPECT_EQ("HighQuality Read Alignments", sub1.Name()); + EXPECT_EQ("ab95d0a3-94b8-4918-b3af-a3f81bbe519c", 
sub1.UniqueId()); + EXPECT_EQ("2.3.0", sub1.Version()); + const Filters& sub1Filters = sub1.Filters(); + EXPECT_EQ(1, sub1Filters.Size()); + const Filter& sub1Filter = sub1Filters[0]; + EXPECT_EQ(1, sub1Filter.Properties().Size()); + const Property& property1 = sub1Filter.Properties()[0]; + EXPECT_EQ("rq", property1.Name()); + EXPECT_EQ(">", property1.Operator()); + EXPECT_EQ("0.85", property1.Value()); + + const DataSetBase& sub2 = subDatasets[1]; + EXPECT_EQ("Alignments to chromosome 1", sub2.Name()); + EXPECT_EQ("ac95d0a3-94b8-4918-b3af-a3f81bbe519c", sub2.UniqueId()); + EXPECT_EQ("2.3.0", sub2.Version()); + const Filters& sub2Filters = sub2.Filters(); + EXPECT_EQ(1, sub2Filters.Size()); + const Filter& sub2Filter = sub2Filters[0]; + EXPECT_EQ(1, sub2Filter.Properties().Size()); + const Property& property2 = sub2Filter.Properties()[0]; + EXPECT_EQ("RNAME", property2.Name()); + EXPECT_EQ("==", property2.Operator()); + EXPECT_EQ("chr1", property2.Value()); +} + +static void TestAli1Xml(void) +{ + const DataSet dataset(ali1XmlFn); + EXPECT_EQ(DataSet::ALIGNMENT, dataset.Type()); + EXPECT_EQ(string("2015-01-27T09:00:01"), dataset.CreatedAt()); + EXPECT_EQ(string("PacBio.DataSet.AlignmentSet"), dataset.MetaType()); + EXPECT_EQ(string("DataSet_AlignmentSet"), dataset.Name()); + EXPECT_EQ(string("barcode moreTags mapping mytags"), dataset.Tags()); + EXPECT_EQ(string("b095d0a3-94b8-4918-b3af-a3f81bbe519c"), dataset.UniqueId()); + EXPECT_EQ(string("2.3.0"), dataset.Version()); + EXPECT_EQ(string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xmlns")); + EXPECT_EQ(string("http://www.w3.org/2001/XMLSchema-instance"), dataset.Attribute("xmlns:xsi")); + EXPECT_EQ(string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xsi:schemaLocation")); + + EXPECT_EQ(0, dataset.Filters().Size()); + + const ExternalResources& resources = dataset.ExternalResources(); + ASSERT_EQ(2, resources.Size()); + for (size_t i = 0; i < resources.Size(); 
++i) { + const ExternalResource& resource = resources[i]; + if (i == 0) { + EXPECT_EQ(string("First Alignments BAM"), resource.Name()); + EXPECT_EQ(string("Points to an example Alignments BAM file."), resource.Description()); + EXPECT_EQ(string("AlignmentFile.AlignmentBamFile"), resource.MetaType()); + EXPECT_EQ(string("file:///mnt/path/to/alignments0.bam"), resource.ResourceId()); + EXPECT_EQ(string("Example"), resource.Tags()); + + const FileIndices& fileIndices = resource.FileIndices(); + ASSERT_EQ(1, fileIndices.Size()); + const FileIndex& index = fileIndices[0]; + EXPECT_EQ(string("file:///mnt/path/to/alignments0.pbi"), index.ResourceId()); + } + else { + EXPECT_EQ(string("Second Alignments BAM"), resource.Name()); + EXPECT_EQ(string("Points to another example Alignments BAM file, by relative path."), resource.Description()); + EXPECT_EQ(string("AlignmentFile.AlignmentBamFile"), resource.MetaType()); + EXPECT_EQ(string("file:./alignments1.bam"), resource.ResourceId()); + EXPECT_EQ(string("Example"), resource.Tags()); + + const FileIndices& fileIndices = resource.FileIndices(); + ASSERT_EQ(1, fileIndices.Size()); + const FileIndex& index = fileIndices[0]; + EXPECT_EQ(string("file:///mnt/path/to/alignments1.pbi"), index.ResourceId()); + } + } + + const SubDataSets& subdatasets = dataset.SubDataSets(); + ASSERT_EQ(2, subdatasets.Size()); + for (size_t i = 0; i < subdatasets.Size(); ++i) { + const DataSetBase& subdataset = subdatasets[i]; + if (i == 0) { + EXPECT_EQ(string(""), subdataset.CreatedAt()); + EXPECT_EQ(string(""), subdataset.MetaType()); + EXPECT_EQ(string("HighQuality Read Alignments"), subdataset.Name()); + EXPECT_EQ(string(""), subdataset.Tags()); + EXPECT_EQ(string("ab95d0a3-94b8-4918-b3af-a3f81bbe519c"), subdataset.UniqueId()); + EXPECT_EQ(string("2.3.0"), subdataset.Version()); + + const Filters& filters = subdataset.Filters(); + ASSERT_EQ(1, filters.Size()); + const Filter& filter = filters[0]; + const Properties& properties = 
filter.Properties(); + ASSERT_EQ(1, properties.Size()); + const Property& property = properties[0]; + EXPECT_EQ(string("rq"), property.Name()); + EXPECT_EQ(string("0.85"), property.Value()); + EXPECT_EQ(string(">"), property.Operator()); + } + else { + EXPECT_EQ(string(""), subdataset.CreatedAt()); + EXPECT_EQ(string(""), subdataset.MetaType()); + EXPECT_EQ(string("Alignments to chromosome 1"), subdataset.Name()); + EXPECT_EQ(string(""), subdataset.Tags()); + EXPECT_EQ(string("ac95d0a3-94b8-4918-b3af-a3f81bbe519c"), subdataset.UniqueId()); + EXPECT_EQ(string("2.3.0"), subdataset.Version()); + + const Filters& filters = subdataset.Filters(); + ASSERT_EQ(1, filters.Size()); + const Filter& filter = filters[0]; + const Properties& properties = filter.Properties(); + ASSERT_EQ(1, properties.Size()); + const Property& property = properties[0]; + EXPECT_EQ(string("RNAME"), property.Name()); + EXPECT_EQ(string("chr1"), property.Value()); + EXPECT_EQ(string("=="), property.Operator()); + } + } +} + +static void TestAli2Xml(void) +{ + const DataSet dataset(ali2XmlFn); + EXPECT_EQ(DataSet::ALIGNMENT, dataset.Type()); + EXPECT_EQ(string("2015-01-27T09:00:01"), dataset.CreatedAt()); + EXPECT_EQ(string("PacBio.DataSet.AlignmentSet"), dataset.MetaType()); + EXPECT_EQ(string("DataSet_AlignmentSet"), dataset.Name()); + EXPECT_EQ(string("barcode moreTags mapping mytags"), dataset.Tags()); + EXPECT_EQ(string("b095d0a3-94b8-4918-b3af-a3f81bbe519c"), dataset.UniqueId()); + EXPECT_EQ(string("2.3.0"), dataset.Version()); + EXPECT_EQ(string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xmlns")); + EXPECT_EQ(string("http://www.w3.org/2001/XMLSchema-instance"), dataset.Attribute("xmlns:xsi")); + EXPECT_EQ(string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xsi:schemaLocation")); + + EXPECT_EQ(0, dataset.Filters().Size()); + + const ExternalResources& resources = dataset.ExternalResources(); + ASSERT_EQ(2, resources.Size()); + for (size_t 
i = 0; i < resources.Size(); ++i) { + const ExternalResource& resource = resources[i]; + if (i == 0) { + EXPECT_EQ(string("First Alignments BAM"), resource.Name()); + EXPECT_EQ(string("Points to an example Alignments BAM file."), resource.Description()); + EXPECT_EQ(string("AlignmentFile.AlignmentBamFile"), resource.MetaType()); + EXPECT_EQ(string("file:///mnt/path/to/alignments2.bam"), resource.ResourceId()); + EXPECT_EQ(string("Example"), resource.Tags()); + + const FileIndices& fileIndices = resource.FileIndices(); + ASSERT_EQ(1, fileIndices.Size()); + const FileIndex& index = fileIndices[0]; + EXPECT_EQ(string("file:///mnt/path/to/alignments2.pbi"), index.ResourceId()); + } + else { + EXPECT_EQ(string("Second Alignments BAM"), resource.Name()); + EXPECT_EQ(string("Points to another example Alignments BAM file, by relative path."), resource.Description()); + EXPECT_EQ(string("AlignmentFile.AlignmentBamFile"), resource.MetaType()); + EXPECT_EQ(string("file:./alignments3.bam"), resource.ResourceId()); + EXPECT_EQ(string("Example"), resource.Tags()); + + const FileIndices& fileIndices = resource.FileIndices(); + ASSERT_EQ(1, fileIndices.Size()); + const FileIndex& index = fileIndices[0]; + EXPECT_EQ(string("file:///mnt/path/to/alignments3.pbi"), index.ResourceId()); + } + } + + const SubDataSets& subdatasets = dataset.SubDataSets(); + ASSERT_EQ(2, subdatasets.Size()); + for (size_t i = 0; i < subdatasets.Size(); ++i) { + const DataSetBase& subdataset = subdatasets[i]; + if (i == 0) { + EXPECT_EQ(string(""), subdataset.CreatedAt()); + EXPECT_EQ(string(""), subdataset.MetaType()); + EXPECT_EQ(string("HighQuality Read Alignments"), subdataset.Name()); + EXPECT_EQ(string(""), subdataset.Tags()); + EXPECT_EQ(string("ab95d0a3-94b8-4918-b3af-a3f81bbe519c"), subdataset.UniqueId()); + EXPECT_EQ(string("2.3.0"), subdataset.Version()); + + const Filters& filters = subdataset.Filters(); + ASSERT_EQ(1, filters.Size()); + const Filter& filter = filters[0]; + const Properties& 
properties = filter.Properties(); + ASSERT_EQ(1, properties.Size()); + const Property& property = properties[0]; + EXPECT_EQ(string("rq"), property.Name()); + EXPECT_EQ(string("0.85"), property.Value()); + EXPECT_EQ(string(">"), property.Operator()); + } + else { + EXPECT_EQ(string(""), subdataset.CreatedAt()); + EXPECT_EQ(string(""), subdataset.MetaType()); + EXPECT_EQ(string("Alignments to chromosome 1"), subdataset.Name()); + EXPECT_EQ(string(""), subdataset.Tags()); + EXPECT_EQ(string("ac95d0a3-94b8-4918-b3af-a3f81bbe519c"), subdataset.UniqueId()); + EXPECT_EQ(string("2.3.0"), subdataset.Version()); + + const Filters& filters = subdataset.Filters(); + ASSERT_EQ(1, filters.Size()); + const Filter& filter = filters[0]; + const Properties& properties = filter.Properties(); + ASSERT_EQ(1, properties.Size()); + const Property& property = properties[0]; + EXPECT_EQ(string("RNAME"), property.Name()); + EXPECT_EQ(string("chr1"), property.Value()); + EXPECT_EQ(string("=="), property.Operator()); + } + } +} + +static void TestAli3Xml(void) +{ + const DataSet dataset(ali3XmlFn); + EXPECT_EQ(DataSet::ALIGNMENT, dataset.Type()); + EXPECT_EQ(string("2015-01-27T09:00:01"), dataset.CreatedAt()); + EXPECT_EQ(string("PacBio.DataSet.AlignmentSet"), dataset.MetaType()); + EXPECT_EQ(string("DataSet_AlignmentSet"), dataset.Name()); + EXPECT_EQ(string("barcode moreTags mapping mytags"), dataset.Tags()); + EXPECT_EQ(string("b095d0a3-94b8-4918-b3af-a3f81bbe519c"), dataset.UniqueId()); + EXPECT_EQ(string("2.3.0"), dataset.Version()); + EXPECT_EQ(string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xmlns")); + EXPECT_EQ(string("http://www.w3.org/2001/XMLSchema-instance"), dataset.Attribute("xmlns:xsi")); + EXPECT_EQ(string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xsi:schemaLocation")); + + EXPECT_EQ(0, dataset.Filters().Size()); + + const ExternalResources& resources = dataset.ExternalResources(); + ASSERT_EQ(2, resources.Size()); 
+ for (size_t i = 0; i < resources.Size(); ++i) { + const ExternalResource& resource = resources[i]; + if (i == 0) { + EXPECT_EQ(string("First Alignments BAM"), resource.Name()); + EXPECT_EQ(string("Points to an example Alignments BAM file."), resource.Description()); + EXPECT_EQ(string("AlignmentFile.AlignmentBamFile"), resource.MetaType()); + EXPECT_EQ(string("file:///mnt/path/to/alignments2.bam"), resource.ResourceId()); + EXPECT_EQ(string("Example"), resource.Tags()); + + const FileIndices& fileIndices = resource.FileIndices(); + ASSERT_EQ(1, fileIndices.Size()); + const FileIndex& index = fileIndices[0]; + EXPECT_EQ(string("file:///mnt/path/to/alignments2.pbi"), index.ResourceId()); + } + else { + EXPECT_EQ(string("Second Alignments BAM"), resource.Name()); + EXPECT_EQ(string("Points to another example Alignments BAM file, by relative path."), resource.Description()); + EXPECT_EQ(string("AlignmentFile.AlignmentBamFile"), resource.MetaType()); + EXPECT_EQ(string("file:./alignments3.bam"), resource.ResourceId()); + EXPECT_EQ(string("Example"), resource.Tags()); + + const FileIndices& fileIndices = resource.FileIndices(); + ASSERT_EQ(1, fileIndices.Size()); + const FileIndex& index = fileIndices[0]; + EXPECT_EQ(string("file:///mnt/path/to/alignments3.pbi"), index.ResourceId()); + } + } + + const SubDataSets& subdatasets = dataset.SubDataSets(); + ASSERT_EQ(2, subdatasets.Size()); + for (size_t i = 0; i < subdatasets.Size(); ++i) { + const DataSetBase& subdataset = subdatasets[i]; + if (i == 0) { + EXPECT_EQ(string(""), subdataset.CreatedAt()); + EXPECT_EQ(string(""), subdataset.MetaType()); + EXPECT_EQ(string("HighQuality Read Alignments"), subdataset.Name()); + EXPECT_EQ(string(""), subdataset.Tags()); + EXPECT_EQ(string("ab95d0a3-94b8-4918-b3af-a3f81bbe519c"), subdataset.UniqueId()); + EXPECT_EQ(string("2.3.0"), subdataset.Version()); + + const Filters& filters = subdataset.Filters(); + ASSERT_EQ(1, filters.Size()); + const Filter& filter = filters[0]; + const 
Properties& properties = filter.Properties(); + ASSERT_EQ(1, properties.Size()); + const Property& property = properties[0]; + EXPECT_EQ(string("rq"), property.Name()); + EXPECT_EQ(string("0.75"), property.Value()); + EXPECT_EQ(string(">"), property.Operator()); + } + else { + EXPECT_EQ(string(""), subdataset.CreatedAt()); + EXPECT_EQ(string(""), subdataset.MetaType()); + EXPECT_EQ(string("Alignments to chromosome 1"), subdataset.Name()); + EXPECT_EQ(string(""), subdataset.Tags()); + EXPECT_EQ(string("ac95d0a3-94b8-4918-b3af-a3f81bbe519c"), subdataset.UniqueId()); + EXPECT_EQ(string("2.3.0"), subdataset.Version()); + + const Filters& filters = subdataset.Filters(); + ASSERT_EQ(1, filters.Size()); + const Filter& filter = filters[0]; + const Properties& properties = filter.Properties(); + ASSERT_EQ(1, properties.Size()); + const Property& property = properties[0]; + EXPECT_EQ(string("RNAME"), property.Name()); + EXPECT_EQ(string("chr1"), property.Value()); + EXPECT_EQ(string("=="), property.Operator()); + } + } +} + +static void TestAli4Xml(void) +{ + const DataSet dataset(ali4XmlFn); + EXPECT_EQ(DataSet::ALIGNMENT, dataset.Type()); + EXPECT_EQ(string("2015-01-27T09:00:01"), dataset.CreatedAt()); + EXPECT_EQ(string("PacBio.DataSet.AlignmentSet"), dataset.MetaType()); + EXPECT_EQ(string("DataSet_AlignmentSet"), dataset.Name()); + EXPECT_EQ(string("barcode moreTags mapping mytags"), dataset.Tags()); + EXPECT_EQ(string("b095d0a3-94b8-4918-b3af-a3f81bbe519c"), dataset.UniqueId()); + EXPECT_EQ(string("2.3.0"), dataset.Version()); + EXPECT_EQ(string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xmlns")); + EXPECT_EQ(string("http://www.w3.org/2001/XMLSchema-instance"), dataset.Attribute("xmlns:xsi")); + EXPECT_EQ(string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xsi:schemaLocation")); + + EXPECT_EQ(0, dataset.Filters().Size()); + + const ExternalResources& resources = dataset.ExternalResources(); + ASSERT_EQ(2, 
resources.Size()); + for (size_t i = 0; i < resources.Size(); ++i) { + const ExternalResource& resource = resources[i]; + if (i == 0) { + EXPECT_EQ(string("First Alignments BAM"), resource.Name()); + EXPECT_EQ(string("Points to an example Alignments BAM file."), resource.Description()); + EXPECT_EQ(string("AlignmentFile.AlignmentBamFile"), resource.MetaType()); + EXPECT_EQ(string("file:///mnt/path/to/alignments0.bam"), resource.ResourceId()); + EXPECT_EQ(string("Example"), resource.Tags()); + + const FileIndices& fileIndices = resource.FileIndices(); + ASSERT_EQ(1, fileIndices.Size()); + const FileIndex& index = fileIndices[0]; + EXPECT_EQ(string("file:///mnt/path/to/alignments0.pbi"), index.ResourceId()); + } + else { + EXPECT_EQ(string("Second Alignments BAM"), resource.Name()); + EXPECT_EQ(string("Points to another example Alignments BAM file, by relative path."), resource.Description()); + EXPECT_EQ(string("AlignmentFile.AlignmentBamFile"), resource.MetaType()); + EXPECT_EQ(string("file:./alignments1.bam"), resource.ResourceId()); + EXPECT_EQ(string("Example"), resource.Tags()); + + const FileIndices& fileIndices = resource.FileIndices(); + ASSERT_EQ(1, fileIndices.Size()); + const FileIndex& index = fileIndices[0]; + EXPECT_EQ(string("file:///mnt/path/to/alignments1.pbi"), index.ResourceId()); + } + } + + const SubDataSets& subdatasets = dataset.SubDataSets(); + ASSERT_EQ(2, subdatasets.Size()); + for (size_t i = 0; i < subdatasets.Size(); ++i) { + const DataSetBase& subdataset = subdatasets[i]; + if (i == 0) { + EXPECT_EQ(string(""), subdataset.CreatedAt()); + EXPECT_EQ(string(""), subdataset.MetaType()); + EXPECT_EQ(string("HighQuality Read Alignments"), subdataset.Name()); + EXPECT_EQ(string(""), subdataset.Tags()); + EXPECT_EQ(string("ab95d0a3-94b8-4918-b3af-a3f81bbe519c"), subdataset.UniqueId()); + EXPECT_EQ(string("2.3.0"), subdataset.Version()); + + const Filters& filters = subdataset.Filters(); + ASSERT_EQ(1, filters.Size()); + const Filter& filter = 
filters[0]; + const Properties& properties = filter.Properties(); + ASSERT_EQ(1, properties.Size()); + const Property& property = properties[0]; + EXPECT_EQ(string("rq"), property.Name()); + EXPECT_EQ(string("0.85"), property.Value()); + EXPECT_EQ(string(">"), property.Operator()); + } + else { + EXPECT_EQ(string(""), subdataset.CreatedAt()); + EXPECT_EQ(string(""), subdataset.MetaType()); + EXPECT_EQ(string("Alignments to chromosome 1"), subdataset.Name()); + EXPECT_EQ(string(""), subdataset.Tags()); + EXPECT_EQ(string("ac95d0a3-94b8-4918-b3af-a3f81bbe519c"), subdataset.UniqueId()); + EXPECT_EQ(string("2.3.0"), subdataset.Version()); + + const Filters& filters = subdataset.Filters(); + ASSERT_EQ(1, filters.Size()); + const Filter& filter = filters[0]; + const Properties& properties = filter.Properties(); + ASSERT_EQ(1, properties.Size()); + const Property& property = properties[0]; + EXPECT_EQ(string("RNAME"), property.Name()); + EXPECT_EQ(string("chr1"), property.Value()); + EXPECT_EQ(string("=="), property.Operator()); + } + } +} + +static void TestMappingStaggeredXml(void) +{ + const DataSet dataset(mappingStaggeredXmlFn); + EXPECT_EQ(DataSet::GENERIC, dataset.Type()); + EXPECT_EQ(string("2015-05-13T10:58:26"), dataset.CreatedAt()); + EXPECT_EQ(string("PacBio.DataSet.DataSet"), dataset.MetaType()); + EXPECT_EQ(string(""), dataset.Name()); + EXPECT_EQ(string(""), dataset.Tags()); + EXPECT_EQ(string("30f72098-bc5b-e06b-566c-8b28dda909a8"), dataset.UniqueId()); + EXPECT_EQ(string("2.3.0"), dataset.Version()); + EXPECT_EQ(string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xmlns")); + EXPECT_EQ(string("http://www.w3.org/2001/XMLSchema-instance"), dataset.Attribute("xmlns:xsi")); + EXPECT_EQ(string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xsi:schemaLocation")); + + EXPECT_EQ(0, dataset.Filters().Size()); + + const ExternalResources& resources = dataset.ExternalResources(); + ASSERT_EQ(2, resources.Size()); + 
for (size_t i = 0; i < resources.Size(); ++i) { + const ExternalResource& resource = resources[i]; + if (i == 0) { + EXPECT_EQ(string(""), resource.Name()); + EXPECT_EQ(string(""), resource.Description()); + EXPECT_EQ(string(""), resource.MetaType()); + EXPECT_EQ(string("file:tests/data/bam_mapping_1.bam"), resource.ResourceId()); + EXPECT_EQ(string(""), resource.Tags()); + + const FileIndices& fileIndices = resource.FileIndices(); + ASSERT_EQ(1, fileIndices.Size()); + const FileIndex& index = fileIndices[0]; + EXPECT_EQ(string("file:tests/data/bam_mapping_1.bam.bai"), index.ResourceId()); + } + else { + EXPECT_EQ(string(""), resource.Name()); + EXPECT_EQ(string(""), resource.Description()); + EXPECT_EQ(string(""), resource.MetaType()); + EXPECT_EQ(string("file:tests/data/bam_mapping_2.bam"), resource.ResourceId()); + EXPECT_EQ(string(""), resource.Tags()); + + const FileIndices& fileIndices = resource.FileIndices(); + ASSERT_EQ(1, fileIndices.Size()); + const FileIndex& index = fileIndices[0]; + EXPECT_EQ(string("file:tests/data/bam_mapping_2.bam.bai"), index.ResourceId()); + } + } + + const SubDataSets& subdatasets = dataset.SubDataSets(); + ASSERT_EQ(2, subdatasets.Size()); + for (size_t i = 0; i < subdatasets.Size(); ++i) { + const DataSetBase& subdataset = subdatasets[i]; + if (i == 0) { + EXPECT_EQ(string("2015-05-13T10:58:26"), subdataset.CreatedAt()); + EXPECT_EQ(string(""), subdataset.MetaType()); + EXPECT_EQ(string(""), subdataset.Name()); + EXPECT_EQ(string(""), subdataset.Tags()); + EXPECT_EQ(string("c5402d06-4643-057c-e300-fe229b4e8909"), subdataset.UniqueId()); + EXPECT_EQ(string("2.3.0"), subdataset.Version()); + + const ExternalResources& resources = subdataset.ExternalResources(); + ASSERT_EQ(1, resources.Size()); + const ExternalResource& resource = resources[0]; + EXPECT_EQ(string("file:tests/data/bam_mapping_2.bam"), resource.ResourceId()); + const FileIndices& fileIndices = resource.FileIndices(); + ASSERT_EQ(1, fileIndices.Size()); + const 
FileIndex& index = fileIndices[0]; + EXPECT_EQ(string("file:tests/data/bam_mapping_2.bam.bai"), index.ResourceId()); + } + else { + EXPECT_EQ(string("2015-05-13T10:58:26"), subdataset.CreatedAt()); + EXPECT_EQ(string(""), subdataset.MetaType()); + EXPECT_EQ(string(""), subdataset.Name()); + EXPECT_EQ(string(""), subdataset.Tags()); + EXPECT_EQ(string("f8b54a55-5fb7-706f-ab35-39afc9c86924"), subdataset.UniqueId()); + EXPECT_EQ(string("2.3.0"), subdataset.Version()); + + const ExternalResources& resources = subdataset.ExternalResources(); + ASSERT_EQ(1, resources.Size()); + const ExternalResource& resource = resources[0]; + EXPECT_EQ(string("file:tests/data/bam_mapping_1.bam"), resource.ResourceId()); + const FileIndices& fileIndices = resource.FileIndices(); + ASSERT_EQ(1, fileIndices.Size()); + const FileIndex& index = fileIndices[0]; + EXPECT_EQ(string("file:tests/data/bam_mapping_1.bam.bai"), index.ResourceId()); + } + } +} + +static void TestBarcodeXml(void) +{ + const DataSet dataset(barcodeXmlFn); + EXPECT_EQ(DataSet::BARCODE, dataset.Type()); + EXPECT_EQ(string("2015-01-27T09:00:01"), dataset.CreatedAt()); + EXPECT_EQ(string("PacBio.DataSet.BarcodeSet"), dataset.MetaType()); + EXPECT_EQ(string("DataSet_BarcodeSet"), dataset.Name()); + EXPECT_EQ(string("barcode moreTags mapping mytags"), dataset.Tags()); + EXPECT_EQ(string("b095d0a3-94b8-4918-b3af-a3f81bbe519c"), dataset.UniqueId()); + EXPECT_EQ(string("2.3.0"), dataset.Version()); + EXPECT_EQ(string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xmlns")); + EXPECT_EQ(string("http://www.w3.org/2001/XMLSchema-instance"), dataset.Attribute("xmlns:xsi")); + EXPECT_EQ(string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xsi:schemaLocation")); + + EXPECT_EQ(0, dataset.Filters().Size()); + EXPECT_EQ(0, dataset.SubDataSets().Size()); + + const ExternalResources& resources = dataset.ExternalResources(); + ASSERT_EQ(1, resources.Size()); + const ExternalResource& 
resource = resources[0]; + EXPECT_EQ(string("First Barcodes FASTA"), resource.Name()); + EXPECT_EQ(string("Points to an example Barcodes FASTA file."), resource.Description()); + EXPECT_EQ(string("BarcodeFile.BarcodeFastaFile"), resource.MetaType()); + EXPECT_EQ(string("file:///mnt/path/to/barcode.fasta"), resource.ResourceId()); + EXPECT_EQ(string("Example"), resource.Tags()); + + const DataSetMetadata& metadata = dataset.Metadata(); + EXPECT_EQ(string("30"), metadata.NumRecords()); + EXPECT_EQ(string("400"), metadata.TotalLength()); + + // access metadata extensions directly for now + EXPECT_EQ(string("paired"), metadata.ChildText("BarcodeConstruction")); +} + +static void TestCcsReadXml(void) +{ + const DataSet dataset(ccsReadXmlFn); + EXPECT_EQ(DataSet::CONSENSUS_READ, dataset.Type()); + EXPECT_EQ(string("2015-01-27T09:00:01"), dataset.CreatedAt()); + EXPECT_EQ(string("PacBio.DataSet.ConsensusReadSet"), dataset.MetaType()); + EXPECT_EQ(string("DataSet_ConsensusReadSet"), dataset.Name()); + EXPECT_EQ(string("barcode moreTags mapping mytags"), dataset.Tags()); + EXPECT_EQ(string("b095d0a3-94b8-4918-b3af-a3f81bbe519c"), dataset.UniqueId()); + EXPECT_EQ(string("2.3.0"), dataset.Version()); + EXPECT_EQ(string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xmlns")); + EXPECT_EQ(string("http://www.w3.org/2001/XMLSchema-instance"), dataset.Attribute("xmlns:xsi")); + EXPECT_EQ(string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xsi:schemaLocation")); + + EXPECT_EQ(0, dataset.Filters().Size()); + EXPECT_EQ(0, dataset.SubDataSets().Size()); + + const ExternalResources& resources = dataset.ExternalResources(); + ASSERT_EQ(2, resources.Size()); + for (size_t i = 0; i < resources.Size(); ++i) { + const ExternalResource& resource = resources[i]; + if (i == 0) { + EXPECT_EQ(string("First ConsensusRead BAM"), resource.Name()); + EXPECT_EQ(string("Points to an example ConsensusRead BAM file."), resource.Description()); + 
EXPECT_EQ(string("PacBio.ConsensusReadFile.ConsensusReadBamFile"), resource.MetaType()); + EXPECT_EQ(string("file:///mnt/path/to/ccsreads0.bam"), resource.ResourceId()); + EXPECT_EQ(string("Example"), resource.Tags()); + + const FileIndices& fileIndices = resource.FileIndices(); + ASSERT_EQ(1, fileIndices.Size()); + const FileIndex& index = fileIndices[0]; + EXPECT_EQ(string("PacBio.Index.PacBioIndex"), index.MetaType()); + EXPECT_EQ(string("file:///mnt/path/to/ccsreads0.pbi"), index.ResourceId()); + } + else { + EXPECT_EQ(string("Second ConsensusRead BAM"), resource.Name()); + EXPECT_EQ(string("Points to another example ConsensusRead BAM file."), resource.Description()); + EXPECT_EQ(string("PacBio.ConsensusReadFile.ConsensusReadBamFile"), resource.MetaType()); + EXPECT_EQ(string("file:///mnt/path/to/ccsreads1.bam"), resource.ResourceId()); + EXPECT_EQ(string("Example"), resource.Tags()); + + const FileIndices& fileIndices = resource.FileIndices(); + ASSERT_EQ(1, fileIndices.Size()); + const FileIndex& index = fileIndices[0]; + EXPECT_EQ(string("PacBio.Index.PacBioIndex"), index.MetaType()); + EXPECT_EQ(string("file:///mnt/path/to/ccsreads0.pbi"), index.ResourceId()); + } + } +} + +static void TestLambdaContigsXml(void) +{ + const DataSet dataset(lambdaContigsXmlFn); + EXPECT_EQ(DataSet::REFERENCE, dataset.Type()); + EXPECT_EQ(string("2015-05-28T10:56:36"), dataset.CreatedAt()); + EXPECT_EQ(string("PacBio.DataSet.ReferenceSet"), dataset.MetaType()); + EXPECT_EQ(string(""), dataset.Name()); + EXPECT_EQ(string(""), dataset.Tags()); + EXPECT_EQ(string("596e87db-34f9-d2fd-c905-b017543170e1"), dataset.UniqueId()); + EXPECT_EQ(string("2.3.0"), dataset.Version()); + EXPECT_EQ(string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xmlns")); + EXPECT_EQ(string("http://www.w3.org/2001/XMLSchema-instance"), dataset.Attribute("xmlns:xsi")); + EXPECT_EQ(string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xsi:schemaLocation")); 
+ + EXPECT_EQ(0, dataset.Filters().Size()); + EXPECT_EQ(0, dataset.SubDataSets().Size()); + + const ExternalResources& resources = dataset.ExternalResources(); + ASSERT_EQ(1, resources.Size()); + const ExternalResource& resource = resources[0]; + EXPECT_EQ(string("file:tests/data/lambda_contigs.fasta"), resource.ResourceId()); +} + +static void TestPbalchemyXml(void) +{ + const DataSet dataset(pbalchemyXmlFn); + EXPECT_EQ(DataSet::GENERIC, dataset.Type()); + EXPECT_EQ(string("2015-05-22T16:56:16"), dataset.CreatedAt()); + EXPECT_EQ(string("PacBio.DataSet.DataSet"), dataset.MetaType()); + EXPECT_EQ(string(""), dataset.Name()); + EXPECT_EQ(string(""), dataset.Tags()); + EXPECT_EQ(string("58e3f7c5-24c1-b58b-fbd5-37de268cc2f0"), dataset.UniqueId()); + EXPECT_EQ(string("2.3.0"), dataset.Version()); + EXPECT_EQ(string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xmlns")); + EXPECT_EQ(string("http://www.w3.org/2001/XMLSchema-instance"), dataset.Attribute("xmlns:xsi")); + EXPECT_EQ(string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xsi:schemaLocation")); + + EXPECT_EQ(0, dataset.SubDataSets().Size()); + + const ExternalResources& resources = dataset.ExternalResources(); + ASSERT_EQ(1, resources.Size()); + const ExternalResource& resource = resources[0]; + EXPECT_EQ(string("file:tests/data/pbalchemy10kbp.pbalign.sorted.pbver1.bam"), resource.ResourceId()); + const FileIndices& fileIndices = resource.FileIndices(); + ASSERT_EQ(1, fileIndices.Size()); + const FileIndex& index = fileIndices[0]; + EXPECT_EQ(string("file:tests/data/pbalchemy10kbp.pbalign.sorted.pbver1.bam.bai"), index.ResourceId()); + + // TYPOs: Should be Filter Properties/Property not Parameter(s) + +} + +static void TestReferenceXml(void) +{ + const DataSet dataset(referenceXmlFn); + EXPECT_EQ(DataSet::REFERENCE, dataset.Type()); + EXPECT_EQ(string("2015-01-27T09:00:01"), dataset.CreatedAt()); + EXPECT_EQ(string("PacBio.DataSet.ReferenceSet"), 
dataset.MetaType()); + EXPECT_EQ(string("DataSet_ReferenceSet"), dataset.Name()); + EXPECT_EQ(string("barcode moreTags mapping mytags"), dataset.Tags()); + EXPECT_EQ(string("b095d0a3-94b8-4918-b3af-a3f81bbe519c"), dataset.UniqueId()); + EXPECT_EQ(string("2.3.0"), dataset.Version()); + EXPECT_EQ(string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xmlns")); + EXPECT_EQ(string("http://www.w3.org/2001/XMLSchema-instance"), dataset.Attribute("xmlns:xsi")); + EXPECT_EQ(string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xsi:schemaLocation")); + + EXPECT_EQ(0, dataset.Filters().Size()); + EXPECT_EQ(0, dataset.SubDataSets().Size()); + + const ExternalResources& resources = dataset.ExternalResources(); + ASSERT_EQ(1, resources.Size()); + const ExternalResource& resource = resources[0]; + EXPECT_EQ(string("First References FASTA"), resource.Name()); + EXPECT_EQ(string("Points to an example references FASTA file."), resource.Description()); + EXPECT_EQ(string("PacBio.ReferenceFile.ReferenceFastaFile"), resource.MetaType()); + EXPECT_EQ(string("file:///mnt/path/to/reference.fasta"), resource.ResourceId()); + EXPECT_EQ(string("Example"), resource.Tags()); + const FileIndices& fileIndices = resource.FileIndices(); + ASSERT_EQ(2, fileIndices.Size()); + for (size_t i = 0; i < fileIndices.Size(); ++i) { + const FileIndex& index = fileIndices[i]; + if (i == 0) { + EXPECT_EQ(string("PacBio.Index.SaWriterIndex"), index.MetaType()); + EXPECT_EQ(string("file:///mnt/path/to/reference.fasta.sa"), index.ResourceId()); + } + else { + EXPECT_EQ(string("PacBio.Index.SamIndex"), index.MetaType()); + EXPECT_EQ(string("file:///mnt/path/to/reference.fasta.fai"), index.ResourceId()); + } + } + + const DataSetMetadata& metadata = dataset.Metadata(); + EXPECT_EQ(string("500"), metadata.NumRecords()); + EXPECT_EQ(string("5000000"), metadata.TotalLength()); + + // access metadata extensions directly for now + EXPECT_EQ(string("Tribble"), 
metadata.ChildText("Organism")); + EXPECT_EQ(string("Diploid"), metadata.ChildText("Ploidy")); + + const internal::DataSetListElement<internal::DataSetElement>& contigs = + metadata.Child<internal::DataSetListElement<internal::DataSetElement> >("Contigs"); + ASSERT_EQ(1, contigs.NumChildren()); + const internal::DataSetElement& contig = contigs[0]; + EXPECT_EQ(string("gi|229359445|emb|AM181176.4|"), contig.Attribute("Name")); + EXPECT_EQ(string("Pseudomonas fluorescens SBW25 complete genome|quiver"), contig.Attribute("Description")); + EXPECT_EQ(string("6722109"), contig.Attribute("Length")); + EXPECT_EQ(string("f627c795efad7ce0050ed42b942d408e"), contig.Attribute("Digest")); +} + +static void TestSubread1Xml(void) +{ + const DataSet dataset(subread1XmlFn); + EXPECT_EQ(DataSet::SUBREAD, dataset.Type()); + EXPECT_EQ(string("2015-01-27T09:00:01"), dataset.CreatedAt()); + EXPECT_EQ(string("PacBio.DataSet.SubreadSet"), dataset.MetaType()); + EXPECT_EQ(string("DataSet_SubreadSet"), dataset.Name()); + EXPECT_EQ(string("barcode moreTags mapping mytags"), dataset.Tags()); + EXPECT_EQ(string("b095d0a3-94b8-4918-b3af-a3f81bbe519c"), dataset.UniqueId()); + EXPECT_EQ(string("2.3.0"), dataset.Version()); + EXPECT_EQ(string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xmlns")); + EXPECT_EQ(string("http://www.w3.org/2001/XMLSchema-instance"), dataset.Attribute("xmlns:xsi")); + EXPECT_EQ(string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xsi:schemaLocation")); + + const ExternalResources& resources = dataset.ExternalResources(); + ASSERT_EQ(2, resources.Size()); + for (size_t i = 0; i < resources.Size(); ++i) { + const ExternalResource& resource = resources[i]; + if (i == 0) { + EXPECT_EQ(string("First Subreads BAM"), resource.Name()); + EXPECT_EQ(string("Points to an example Subreads BAM file."), resource.Description()); + EXPECT_EQ(string("SubreadFile.SubreadBamFile"), resource.MetaType()); + EXPECT_EQ(string("file:///mnt/path/to/subreads0.bam"), resource.ResourceId()); + EXPECT_EQ(string("Example"), 
resource.Tags()); + + const FileIndices& fileIndices = resource.FileIndices(); + ASSERT_EQ(1, fileIndices.Size()); + const FileIndex& index = fileIndices[0]; + EXPECT_EQ(string("file:///mnt/path/to/subreads0.pbi"), index.ResourceId()); + } + else { + EXPECT_EQ(string("Second Subreads BAM"), resource.Name()); + EXPECT_EQ(string("Points to another example Subreads BAM file."), resource.Description()); + EXPECT_EQ(string("SubreadFile.SubreadBamFile"), resource.MetaType()); + EXPECT_EQ(string("file:///mnt/path/to/subreads1.bam"), resource.ResourceId()); + EXPECT_EQ(string("Example"), resource.Tags()); + + const FileIndices& fileIndices = resource.FileIndices(); + ASSERT_EQ(1, fileIndices.Size()); + const FileIndex& index = fileIndices[0]; + EXPECT_EQ(string("file:///mnt/path/to/subreads0.pbi"), index.ResourceId()); + } + } + + const Filters& filters = dataset.Filters(); + ASSERT_EQ(2, filters.Size()); + for (size_t i = 0; i < filters.Size(); ++i) { + const Filter& filter = filters[i]; + if (i == 0) { + const Properties& properties = filter.Properties(); + ASSERT_EQ(1, properties.Size()); + const Property& property = properties[0]; + EXPECT_EQ(string("rq"), property.Name()); + EXPECT_EQ(string("0.75"), property.Value()); + EXPECT_EQ(string(">"), property.Operator()); + } else { + const Properties& properties = filter.Properties(); + ASSERT_EQ(1, properties.Size()); + const Property& property = properties[0]; + EXPECT_EQ(string("QNAME"), property.Name()); + EXPECT_EQ(string("100/0/0_100"), property.Value()); + EXPECT_EQ(string("=="), property.Operator()); + } + } + + const DataSetMetadata& metadata = dataset.Metadata(); + EXPECT_EQ(string("500"), metadata.NumRecords()); + EXPECT_EQ(string("500000"), metadata.TotalLength()); +} + +static void TestSubread2Xml(void) +{ + const DataSet dataset(subread2XmlFn); + EXPECT_EQ(DataSet::SUBREAD, dataset.Type()); + EXPECT_EQ(string("2015-01-27T09:00:01"), dataset.CreatedAt()); + EXPECT_EQ(string("PacBio.DataSet.SubreadSet"), 
dataset.MetaType()); + EXPECT_EQ(string("DataSet_SubreadSet"), dataset.Name()); + EXPECT_EQ(string("barcode moreTags mapping mytags"), dataset.Tags()); + EXPECT_EQ(string("b095d0a3-94b8-4918-b3af-a3f81bbe519c"), dataset.UniqueId()); + EXPECT_EQ(string("2.3.0"), dataset.Version()); + EXPECT_EQ(string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xmlns")); + EXPECT_EQ(string("http://www.w3.org/2001/XMLSchema-instance"), dataset.Attribute("xmlns:xsi")); + EXPECT_EQ(string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xsi:schemaLocation")); + + const ExternalResources& resources = dataset.ExternalResources(); + ASSERT_EQ(2, resources.Size()); + for (size_t i = 0; i < resources.Size(); ++i) { + const ExternalResource& resource = resources[i]; + if (i == 0) { + EXPECT_EQ(string("First Subreads BAM"), resource.Name()); + EXPECT_EQ(string("Points to an example Subreads BAM file."), resource.Description()); + EXPECT_EQ(string("SubreadFile.SubreadBamFile"), resource.MetaType()); + EXPECT_EQ(string("file:///mnt/path/to/subreads2.bam"), resource.ResourceId()); + EXPECT_EQ(string("Example"), resource.Tags()); + + const FileIndices& fileIndices = resource.FileIndices(); + ASSERT_EQ(1, fileIndices.Size()); + const FileIndex& index = fileIndices[0]; + EXPECT_EQ(string("file:///mnt/path/to/subreads2.pbi"), index.ResourceId()); + } + else { + EXPECT_EQ(string("Second Subreads BAM"), resource.Name()); + EXPECT_EQ(string("Points to another example Subreads BAM file."), resource.Description()); + EXPECT_EQ(string("SubreadFile.SubreadBamFile"), resource.MetaType()); + EXPECT_EQ(string("file:///mnt/path/to/subreads3.bam"), resource.ResourceId()); + EXPECT_EQ(string("Example"), resource.Tags()); + + const FileIndices& fileIndices = resource.FileIndices(); + ASSERT_EQ(1, fileIndices.Size()); + const FileIndex& index = fileIndices[0]; + EXPECT_EQ(string("file:///mnt/path/to/subreads3.pbi"), index.ResourceId()); + } + } + + const Filters& 
filters = dataset.Filters(); + ASSERT_EQ(2, filters.Size()); + for (size_t i = 0; i < filters.Size(); ++i) { + const Filter& filter = filters[i]; + if (i == 0) { + const Properties& properties = filter.Properties(); + ASSERT_EQ(1, properties.Size()); + const Property& property = properties[0]; + EXPECT_EQ(string("rq"), property.Name()); + EXPECT_EQ(string("0.75"), property.Value()); + EXPECT_EQ(string(">"), property.Operator()); + } else { + const Properties& properties = filter.Properties(); + ASSERT_EQ(1, properties.Size()); + const Property& property = properties[0]; + EXPECT_EQ(string("QNAME"), property.Name()); + EXPECT_EQ(string("100/0/0_100"), property.Value()); + EXPECT_EQ(string("=="), property.Operator()); + } + } + + const DataSetMetadata& metadata = dataset.Metadata(); + EXPECT_EQ(string("500"), metadata.NumRecords()); + EXPECT_EQ(string("500000"), metadata.TotalLength()); +} + +static void TestSubread3Xml(void) +{ + const DataSet dataset(subread3XmlFn); + EXPECT_EQ(DataSet::SUBREAD, dataset.Type()); + EXPECT_EQ(string("2015-01-27T09:00:01"), dataset.CreatedAt()); + EXPECT_EQ(string("PacBio.DataSet.SubreadSet"), dataset.MetaType()); + EXPECT_EQ(string("DataSet_SubreadSet"), dataset.Name()); + EXPECT_EQ(string("barcode moreTags mapping mytags"), dataset.Tags()); + EXPECT_EQ(string("b095d0a3-94b8-4918-b3af-a3f81bbe519c"), dataset.UniqueId()); + EXPECT_EQ(string("2.3.0"), dataset.Version()); + EXPECT_EQ(string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xmlns")); + EXPECT_EQ(string("http://www.w3.org/2001/XMLSchema-instance"), dataset.Attribute("xmlns:xsi")); + EXPECT_EQ(string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xsi:schemaLocation")); + + const ExternalResources& resources = dataset.ExternalResources(); + ASSERT_EQ(2, resources.Size()); + for (size_t i = 0; i < resources.Size(); ++i) { + const ExternalResource& resource = resources[i]; + if (i == 0) { + EXPECT_EQ(string("First Subreads BAM"), 
resource.Name()); + EXPECT_EQ(string("Points to an example Subreads BAM file."), resource.Description()); + EXPECT_EQ(string("SubreadFile.SubreadBamFile"), resource.MetaType()); + EXPECT_EQ(string("file:///mnt/path/to/subreads2.bam"), resource.ResourceId()); + EXPECT_EQ(string("Example"), resource.Tags()); + + const FileIndices& fileIndices = resource.FileIndices(); + ASSERT_EQ(1, fileIndices.Size()); + const FileIndex& index = fileIndices[0]; + EXPECT_EQ(string("file:///mnt/path/to/subreads2.pbi"), index.ResourceId()); + } + else { + EXPECT_EQ(string("Second Subreads BAM"), resource.Name()); + EXPECT_EQ(string("Points to another example Subreads BAM file."), resource.Description()); + EXPECT_EQ(string("SubreadFile.SubreadBamFile"), resource.MetaType()); + EXPECT_EQ(string("file:///mnt/path/to/subreads3.bam"), resource.ResourceId()); + EXPECT_EQ(string("Example"), resource.Tags()); + + const FileIndices& fileIndices = resource.FileIndices(); + ASSERT_EQ(1, fileIndices.Size()); + const FileIndex& index = fileIndices[0]; + EXPECT_EQ(string("file:///mnt/path/to/subreads3.pbi"), index.ResourceId()); + } + } + + const Filters& filters = dataset.Filters(); + ASSERT_EQ(2, filters.Size()); + for (size_t i = 0; i < filters.Size(); ++i) { + const Filter& filter = filters[i]; + if (i == 0) { + const Properties& properties = filter.Properties(); + ASSERT_EQ(1, properties.Size()); + const Property& property = properties[0]; + EXPECT_EQ(string("rq"), property.Name()); + EXPECT_EQ(string("0.85"), property.Value()); + EXPECT_EQ(string(">"), property.Operator()); + } else { + const Properties& properties = filter.Properties(); + ASSERT_EQ(1, properties.Size()); + const Property& property = properties[0]; + EXPECT_EQ(string("QNAME"), property.Name()); + EXPECT_EQ(string("100/0/0_100"), property.Value()); + EXPECT_EQ(string("=="), property.Operator()); + } + } + + const DataSetMetadata& metadata = dataset.Metadata(); + EXPECT_EQ(string("500"), metadata.NumRecords()); + 
EXPECT_EQ(string("500000"), metadata.TotalLength()); +} + +static void TestTransformedXml(void) +{ + const DataSet dataset(transformedXmlFn); + EXPECT_EQ(DataSet::HDF_SUBREAD, dataset.Type()); + EXPECT_EQ(string("PacBio.DataSet.SubreadSet"), dataset.MetaType()); + EXPECT_EQ(string("Subreads from run r001173_42129_130607"), dataset.Name()); + EXPECT_EQ(string("pacbio.secondary.instrument=RS"), dataset.Tags()); + EXPECT_EQ(string("abbc9183-b01e-4671-8c12-19efee534647"), dataset.UniqueId()); + EXPECT_EQ(string("0.5"), dataset.Version()); + EXPECT_EQ(string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xmlns")); + EXPECT_EQ(string("http://www.w3.org/2001/XMLSchema"), dataset.Attribute("xmlns:xs")); + EXPECT_EQ(string("http://www.w3.org/2005/xpath-functions"), dataset.Attribute("xmlns:fn")); + EXPECT_EQ(string("java:java.util.UUID"), dataset.Attribute("xmlns:uuid")); + EXPECT_EQ(string("http://whatever"), dataset.Attribute("xmlns:bax")); + + EXPECT_EQ(0, dataset.Filters().Size()); + EXPECT_EQ(0, dataset.SubDataSets().Size()); + + const ExternalResources& resources = dataset.ExternalResources(); + ASSERT_EQ(3, resources.Size()); + for (size_t i = 0; i < resources.Size(); ++i) { + const ExternalResource& resource = resources[i]; + if (i == 0) { + EXPECT_EQ(string("PacBio.SubreadFile.BaxFile"), resource.MetaType()); + EXPECT_EQ(string("file:///mnt/secondary-siv/testdata/LIMS/2590727/0001/Analysis_Results/m130608_033634_42129_c100515232550000001823076608221351_s1_p0.0.bax.h5"), + resource.ResourceId()); + } + else if (i == 1) { + EXPECT_EQ(string("PacBio.SubreadFile.BaxFile"), resource.MetaType()); + EXPECT_EQ(string("file:///mnt/secondary-siv/testdata/LIMS/2590727/0001/Analysis_Results/m130608_033634_42129_c100515232550000001823076608221351_s1_p0.1.bax.h5"), + resource.ResourceId()); + } + else { + EXPECT_EQ(string("PacBio.SubreadFile.BaxFile"), resource.MetaType()); + 
EXPECT_EQ(string("file:///mnt/secondary-siv/testdata/LIMS/2590727/0001/Analysis_Results/m130608_033634_42129_c100515232550000001823076608221351_s1_p0.2.bax.h5"), + resource.ResourceId()); + } + } + + const DataSetMetadata& metadata = dataset.Metadata(); + EXPECT_EQ(string("150000"), metadata.NumRecords()); + EXPECT_EQ(string("50000000"), metadata.TotalLength()); +} + +TEST(DataSetIOTest, InspectMalformedXml) +{ + const string xmlFn = tests::Data_Dir + "/dataset/malformed.xml"; + + DataSet ds(xmlFn); + stringstream s; + ds.SaveToStream(s); + + const string expected = + "\n" + "\n" + "\t\n" + "\t\t\n" + "\t\n" + "\t\n" + "\t\t50000000\n" + "\t\t150000\n" + "\t\t\n" + "\t\t\t\n" + "\t\t\t\t2.3.0.1.142990\n" + "\t\t\t\tNRT@172.31.128.10:8082, SwVer=2301.142990, HwVer=1.0\n" + "\t\t\t\t\n" + "\t\t\t\t\tr000013_42267_150403\n" + "\t\t\t\t\tInst42267-040315-SAT-100pM-2kb-P6C4\n" + "\t\t\t\t\n" + "\t\t\t\t\n" + "\t\t\t\t\tInst42267-040315-SAT-100pM-2kb-P6C4\n" + "\t\t\t\t\tInst42267-040315-SAT-100pM-2kb-P6C4\n" + "\t\t\t\t\t0.0\n" + "\t\t\t\t\tfalse\n" + "\t\t\t\t\tfalse\n" + "\t\t\t\t\tfalse\n" + "\t\t\t\t\t1\n" + "\t\t\t\t\t\n" + "\t\t\t\t\t\t251acf71-9eb0-489e-9dd1-cdbd11432752\n" + "\t\t\t\t\t\n" + "\t\t\t\t\n" + "\t\t\t\t\n" + "\t\t\t\t\t\n" + "\t\t\t\t\t\t\n" + "\t\t\t\t\t\n" + "\t\t\t\t\n" + "\t\t\t\t7\n" + "\t\t\t\t4\n" + "\t\t\t\t\n" + "\t\t\t\t\n" + "\t\t\t\t\tBasecallerV1\n" + "\t\t\t\t\t2-3-0_P6-C4.xml\n" + "\t\t\t\t\t\n" + "\t\t\t\t\t\n" + "\t\t\t\t\t\tAnalysis_Results\n" + "\t\t\t\t\t\trsy://mp-rsync/vol55//RS_DATA_STAGING/42267/Inst42267-040315-SAT-100pM-2kb-P6C4_13/A04_7/\n" + "\t\t\t\t\t\t\n" + "\t\t\t\t\t\t\tFasta\n" + "\t\t\t\t\t\t\n" + "\t\t\t\t\t\tBases\n" + "\t\t\t\t\t\tMinimal\n" + "\t\t\t\t\t\n" + "\t\t\t\t\n" + "\t\t\t\n" + "\t\t\n" + "\t\t\n" + "\t\t\t\n" + "\t\t\n" + "\t\n" + "\n"; + + EXPECT_EQ(expected, s.str()); +} + +TEST(DataSetIOTest, RelativePathCarriedThroughOk_FromString) +{ + const string inputXml = + "\n" + "\n" + "\t\n" + "\t\t\n" + 
"\t\t\t\n" + "\t\t\t\t\n" + "\t\t\t\n" + "\t\t\n" + "\t\t\n" + "\t\t\t\n" + "\t\t\t\t\n" + "\t\t\t\n" + "\t\t\n" + "\t\n" + "\n"; + + auto dataset = DataSet::FromXml(inputXml); + + stringstream stream; + dataset.SaveToStream(stream); + auto outputXml = stream.str(); + + EXPECT_EQ(inputXml, outputXml); +} + +TEST(DataSetIOTest, RelativePathCarriedThroughOk_FromFile) +{ + DataSet dataset(tests::Data_Dir + "/relative/relative.xml"); + auto resources = dataset.ExternalResources(); + EXPECT_EQ("./a/test.bam", resources[0].ResourceId()); + EXPECT_EQ("./b/test1.bam", resources[1].ResourceId()); + EXPECT_EQ("./b/test2.bam", resources[2].ResourceId()); + + stringstream out; + dataset.SaveToStream(out); + + auto newDataset = DataSet::FromXml(out.str()); + auto newResources = newDataset.ExternalResources(); + EXPECT_EQ("./a/test.bam", newResources[0].ResourceId()); + EXPECT_EQ("./b/test1.bam", newResources[1].ResourceId()); + EXPECT_EQ("./b/test2.bam", newResources[2].ResourceId()); +} + +TEST(DataSetIOTest, DataSetFromRelativeBamFilename) +{ + // cache initial directory and move to location so we can test relative filename ok + const string startingDirectory = internal::FileUtils::CurrentWorkingDirectory(); + + const string targetDirectory = tests::Data_Dir + "/dataset"; + changeCurrentDirectory(targetDirectory); + ASSERT_EQ(targetDirectory, internal::FileUtils::CurrentWorkingDirectory()); + + EXPECT_NO_THROW( + { + const string relativeBamFn = "../phi29.bam"; + const DataSet ds(relativeBamFn); + const auto& files = ds.BamFiles(); + EXPECT_EQ(1, files.size()); + }); + + // restore working directory + changeCurrentDirectory(startingDirectory); +} + diff --git a/tests/src/test_DataSetMetadata.cpp b/tests/src/test_DataSetMetadata.cpp new file mode 100644 index 0000000..ac5d469 --- /dev/null +++ b/tests/src/test_DataSetMetadata.cpp @@ -0,0 +1,63 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. 
+ +// Author: Derek Barnett + +#ifdef PBBAM_TESTING +#define private public +#endif + +#include +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace std; + +namespace tests { + +//static inline +//DataSet CreateDataSet(void) +//{ +// DataSet d; +// d.Name("foo"); +// return d; +//} + +} // namespace tests + +TEST(DataSetMetadataTest, DummyTest) { + EXPECT_TRUE(true); +} diff --git a/tests/src/test_DataSetQuery.cpp b/tests/src/test_DataSetQuery.cpp new file mode 100644 index 0000000..996fbfe --- /dev/null +++ b/tests/src/test_DataSetQuery.cpp @@ -0,0 +1,500 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. + +// Author: Derek Barnett + +#include "TestData.h" +#include +#include +#include +#include +#include +#include +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace std; + +const string alignedBamFn = tests::Data_Dir + "/aligned.bam"; +const string aligned2BamFn = tests::Data_Dir + "/aligned2.bam"; +const string alignedCopyBamFn = tests::GeneratedData_Dir + "/aligned.bam"; +const string aligned2CopyBamFn = tests::GeneratedData_Dir + "/aligned2.bam"; + +const string group_fofn = tests::Generated_Dir + "/group.fofn"; +const string group_file1 = tests::Data_Dir + "/group/test1.bam"; +const string group_file2 = tests::Data_Dir + "/group/test2.bam"; +const string group_file3 = tests::Data_Dir + "/group/test3.bam"; + +const vector group_file1_names = +{ + "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/24962/0_427" +}; + +const vector group_file2_names = +{ + "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/14743/2114_2531", + "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/14743/2579_4055", + "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/14743/4101_5571", + "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/14743/5615_6237" +}; + +const vector group_file3_names = +{ + "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/45203/0_893", + 
"m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/45203/0_893", + "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/46835/3759_4005", + "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/46835/4052_4686", + "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/46835/4732_4869", + "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/47698/9482_9628", + "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/47698/9675_10333", + "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/47698/10378_10609", + "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/49050/48_1132", + "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/49050/48_1132", + "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/49194/0_798", + "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/49194/845_1541", + "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/49521/0_134" +}; + +static inline +bool InGroup(const string& name, const vector& group) +{ + for (const string& s : group) { + if (s == name) + return true; + } + return false; +} + +TEST(DataSetQueryTest, EntireFileQueryTest) +{ + // single file + EXPECT_NO_THROW( + { + BamFile bamFile(alignedBamFn); + + DataSet dataset; + dataset.ExternalResources().Add(bamFile); + + int count =0; + EntireFileQuery query(dataset); // from DataSet object + for (const BamRecord& record : query) { + (void)record; + ++count; + } + EXPECT_EQ(4, count); + + count = 0; + EntireFileQuery query2(alignedBamFn); // from BAM filename + for (const BamRecord& record : query2) { + (void)record; + ++count; + } + EXPECT_EQ(4, count); + + count = 0; + EntireFileQuery query3(bamFile); // from BamFile object + for (const BamRecord& record : query3) { + (void)record; + ++count; + } + EXPECT_EQ(4, count); + }); + + // duplicate file attempt + EXPECT_NO_THROW( + { + BamFile bamFile(alignedBamFn); + + DataSet dataset; + 
dataset.ExternalResources().Add(bamFile); + dataset.ExternalResources().Add(bamFile); + + int count =0; + EntireFileQuery query(dataset); + for (const BamRecord& record : query) { + (void)record; + ++count; + } + EXPECT_EQ(4, count); // same as single + }); + + // true multi-file dataset + EXPECT_NO_THROW( + { + BamFile file1(group_file1); // 1 read + BamFile file2(group_file2); // 4 reads + BamFile file3(group_file3); // 13 reads + + DataSet dataset; + dataset.ExternalResources().Add(file1); + dataset.ExternalResources().Add(file2); + dataset.ExternalResources().Add(file3); + + int count = 0; + EntireFileQuery query(dataset); + for (const BamRecord& record : query) { + + // ensure sequential merge of files + if (count == 0) EXPECT_TRUE(InGroup(record.FullName(), group_file1_names)); + else if (count < 5) EXPECT_TRUE(InGroup(record.FullName(), group_file2_names)); + else EXPECT_TRUE(InGroup(record.FullName(), group_file3_names)); + + ++count; + } + EXPECT_EQ(18, count); + }); + + // same as above, from FOFN + EXPECT_NO_THROW( + { + int count = 0; + + DataSet dataset(group_fofn); + EntireFileQuery query(dataset); + for (const BamRecord& record : query) { + + // ensure sequential merge of files + if (count == 0) EXPECT_TRUE(InGroup(record.FullName(), group_file1_names)); + else if (count < 5) EXPECT_TRUE(InGroup(record.FullName(), group_file2_names)); + else EXPECT_TRUE(InGroup(record.FullName(), group_file3_names)); + + ++count; + } + EXPECT_EQ(18, count); + }); +} + +TEST(DataSetQueryTest, GenomicIntervalQueryTest) +{ + const string rname = "lambda_NEB3011"; + + // single file + EXPECT_NO_THROW( + { + DataSet dataset(alignedBamFn); // from BAM filename + + // count records + int count = 0; + GenomicInterval interval(rname, 5000, 6000); + GenomicIntervalQuery query(interval, dataset); + for (const BamRecord& record : query) { + (void)record; + ++count; + } + EXPECT_EQ(2, count); + + // adjust interval and pass back in + count = 0; + interval.Start(9000); + 
interval.Stop(9500); + query.Interval(interval); + for (const BamRecord& record : query) { + (void)record; + ++count; + } + EXPECT_EQ(2, count); + + // unknown ref + count = 0; + interval.Name("does not exist"); + interval.Start(0); + interval.Stop(100); + EXPECT_THROW(query.Interval(interval), std::exception); + for (const BamRecord& record : query) { // iteration is still safe, just returns no data + (void)record; + ++count; + } + EXPECT_EQ(0, count); + + // adjust again - make sure we can read a real region after an invalid one + interval.Name(rname); + interval.Start(5000); + interval.Stop(6000); + query.Interval(interval); + count = 0; + for (const BamRecord& record : query) { + (void)record; + ++count; + } + EXPECT_EQ(2, count); + }); + + // duplicate file + EXPECT_NO_THROW( + { + BamFile bamFile(alignedBamFn); + + DataSet dataset; + dataset.ExternalResources().Add(bamFile); + dataset.ExternalResources().Add(bamFile); + + // count records & also ensure sorted merge + int count = 0; + int prevId = 0; + int prevPos = 0; + + GenomicInterval interval(rname, 5000, 6000); + GenomicIntervalQuery query(interval, dataset); + for (const BamRecord& record : query) { + + EXPECT_TRUE(record.ReferenceId() >= prevId); + EXPECT_TRUE(record.ReferenceStart() >= prevPos); + + prevId = record.ReferenceId(); + prevPos = record.ReferenceStart(); + ++count; + } + EXPECT_EQ(2, count); // same as single file + + // adjust interval and pass back in + count = 0; + interval.Start(9000); + interval.Stop(10000); + query.Interval(interval); + for (const BamRecord& record : query) { + (void)record; + ++count; + } + EXPECT_EQ(2, count); // same as single file + + // unknown ref + count = 0; + interval.Name("does not exist"); + interval.Start(0); + interval.Stop(100); + EXPECT_THROW(query.Interval(interval), std::exception); + for (const BamRecord& record : query) { // iteration is still safe, just returns no data + (void)record; + ++count; + } + EXPECT_EQ(0, count); // same as single file + 
+ // adjust again - make sure we can read a real region after an invalid one + interval.Name(rname); + interval.Start(5000); + interval.Stop(5300); + query.Interval(interval); + count = 0; + for (const BamRecord& record : query) { + (void)record; + ++count; + } + EXPECT_EQ(2, count); // same as single file + }); + + // multi file BAM (same record content for easy testing, but different filename(ResourceId) + EXPECT_NO_THROW( + { + BamFile bamFile(alignedBamFn); + BamFile copyFile(alignedCopyBamFn); + + DataSet dataset; + dataset.ExternalResources().Add(bamFile); + dataset.ExternalResources().Add(copyFile); + + // count records & also ensure sorted merge + int count = 0; + int prevId = 0; + int prevPos = 0; + + GenomicInterval interval(rname, 5000, 6000); + GenomicIntervalQuery query(interval, dataset); + for (const BamRecord& record : query) { + + EXPECT_TRUE(record.ReferenceId() >= prevId); + EXPECT_TRUE(record.ReferenceStart() >= prevPos); + + prevId = record.ReferenceId(); + prevPos = record.ReferenceStart(); + ++count; + } + EXPECT_EQ(4, count); // single file * 2 + + // adjust interval and pass back in + count = 0; + interval.Start(9000); + interval.Stop(10000); + query.Interval(interval); + for (const BamRecord& record : query) { + (void)record; + ++count; + } + EXPECT_EQ(4, count); // single file * 2 + + // unknown ref + count = 0; + interval.Name("does not exist"); + interval.Start(0); + interval.Stop(100); + EXPECT_THROW(query.Interval(interval), std::exception); + for (const BamRecord& record : query) { // iteration is still safe, just returns no data + (void)record; + ++count; + } + EXPECT_EQ(0, count); // single file * 2 + + // adjust again - make sure we can read a real region after an invalid one + interval.Name(rname); + interval.Start(5000); + interval.Stop(5300); + query.Interval(interval); + count = 0; + for (const BamRecord& record : query) { + (void)record; + ++count; + } + EXPECT_EQ(4, count); // single file * 2 + }); +} + +// TODO: implement 
me +TEST(DataSetQueryTest, QNameQueryTest) +{ + EXPECT_TRUE(true); +} + +TEST(DataSetQueryTest, ZmwQueryTest) +{ + const std::vector whitelist = { 13473, 30983 }; + + // single file + EXPECT_NO_THROW( + { + BamFile bamFile(aligned2BamFn); + ASSERT_TRUE(bamFile.PacBioIndexExists()); + DataSet dataset(bamFile); + + int count = 0; + ZmwQuery query(whitelist, dataset); + for (const BamRecord& record: query) { + const int32_t holeNumber = record.HoleNumber(); + EXPECT_TRUE(holeNumber == 13473 || holeNumber == 30983); + ++count; + } + EXPECT_EQ(4, count); + }); + + // multi-file + EXPECT_NO_THROW( + { + BamFile bamFile(aligned2BamFn); + BamFile bamFile2(aligned2CopyBamFn); + ASSERT_TRUE(bamFile.PacBioIndexExists()); + ASSERT_TRUE(bamFile2.PacBioIndexExists()); + + DataSet dataset; + dataset.ExternalResources().Add(ExternalResource(bamFile)); + dataset.ExternalResources().Add(ExternalResource(bamFile2)); + + int count = 0; + ZmwQuery query(whitelist, dataset); + for (const BamRecord& r : query) { + const auto holeNumber = r.HoleNumber(); + EXPECT_TRUE(holeNumber == 13473 || holeNumber == 30983); + ++count; + } + EXPECT_EQ(8, count); + }); +} + +TEST(DataSetQueryTest, ZmwGroupQueryTest) +{ + const std::vector whitelist = { 13473, 30983 }; + + // single-file + EXPECT_NO_THROW( + { + BamFile bamFile(aligned2BamFn); + ASSERT_TRUE(bamFile.PacBioIndexExists()); + DataSet dataset(bamFile); + + int count = 0; + int32_t groupZmw = -1; + ZmwGroupQuery query(whitelist, dataset); + for (const vector& group : query) { + for (const BamRecord& record: group) { + const auto holeNumber = record.HoleNumber(); + if (groupZmw == -1) + groupZmw = holeNumber; + EXPECT_TRUE(holeNumber == 13473 || holeNumber == 30983); + EXPECT_EQ(groupZmw, holeNumber); + ++count; + } + groupZmw = -1; + } + EXPECT_EQ(4, count); + }); + + // multi-file + EXPECT_NO_THROW( + { + BamFile bamFile(aligned2BamFn); + BamFile bamFile2(aligned2CopyBamFn); + ASSERT_TRUE(bamFile.PacBioIndexExists()); + 
ASSERT_TRUE(bamFile2.PacBioIndexExists()); + + DataSet dataset; + dataset.ExternalResources().Add(ExternalResource(bamFile)); + dataset.ExternalResources().Add(ExternalResource(bamFile2)); + + int totalCount = 0; + int numRecordsInGroup = 0; + int groupCount = 0; + int32_t groupZmw = -1; + ZmwGroupQuery query(whitelist, dataset); + for (const vector& group : query) { + for (const BamRecord& record: group) { + const auto holeNumber = record.HoleNumber(); + ++numRecordsInGroup; + if (groupZmw == -1) + groupZmw = holeNumber; + EXPECT_TRUE(holeNumber == 13473 || holeNumber == 30983); + EXPECT_EQ(groupZmw, holeNumber); + ++totalCount; + } + if (groupCount == 0) + EXPECT_EQ(4, numRecordsInGroup); + else if (groupCount == 1) + EXPECT_EQ(4, numRecordsInGroup); + else + EXPECT_TRUE(false); // should not get here + numRecordsInGroup = 0; + ++groupCount; + groupZmw = -1; + } + EXPECT_EQ(8, totalCount); + }); +} diff --git a/tests/src/test_DataSetXsd.cpp b/tests/src/test_DataSetXsd.cpp new file mode 100644 index 0000000..1238122 --- /dev/null +++ b/tests/src/test_DataSetXsd.cpp @@ -0,0 +1,182 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. 
+// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. + +// Author: Derek Barnett + +#include "TestData.h" +#include +#include +#include +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace std; + +TEST(DataSetXsdTest, DefaultsOk) +{ + NamespaceRegistry registry; + + const NamespaceInfo& baseInfo = registry.Namespace(XsdType::BASE_DATA_MODEL); + const NamespaceInfo& dsInfo = registry.Namespace(XsdType::DATASETS); + const NamespaceInfo& defaultInfo = registry.DefaultNamespace(); + + EXPECT_EQ(XsdType::DATASETS, registry.DefaultXsd()); + + EXPECT_EQ(string("pbds"), dsInfo.Name()); + EXPECT_EQ(string("pbbase"), baseInfo.Name()); + EXPECT_EQ(string("pbds"), defaultInfo.Name()); + + EXPECT_EQ(string("http://pacificbiosciences.com/PacBioBaseDataModel.xsd"), baseInfo.Uri()); + EXPECT_EQ(string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dsInfo.Uri()); + EXPECT_EQ(string("http://pacificbiosciences.com/PacBioDatasets.xsd"), defaultInfo.Uri()); +} + +TEST(DataSetXsdTest, EditDefaultOk) +{ + NamespaceRegistry registry; + registry.SetDefaultXsd(XsdType::DATASETS); + + const NamespaceInfo& 
defaultInfo = registry.DefaultNamespace(); + + EXPECT_EQ(XsdType::DATASETS, registry.DefaultXsd()); + EXPECT_EQ(string("pbds"), defaultInfo.Name()); + EXPECT_EQ(string("http://pacificbiosciences.com/PacBioDatasets.xsd"), defaultInfo.Uri()); +} + +TEST(DataSetXsdTest, EditRegistryOk) +{ + NamespaceRegistry registry; + registry.Register(XsdType::DATASETS, NamespaceInfo("custom", "http://custom/uri.xsd")); + + const NamespaceInfo& dsInfo = registry.Namespace(XsdType::DATASETS); + + EXPECT_EQ(string("custom"), dsInfo.Name()); + EXPECT_EQ(string("http://custom/uri.xsd"), dsInfo.Uri()); +} + +TEST(DataSetXsdTest, EditDatasetRegistry) +{ + DataSet dataset(DataSet::ALIGNMENT); + dataset.CreatedAt("2015-01-27T09:00:01"); + dataset.MetaType("PacBio.DataSet.AlignmentSet"); + dataset.Name("DataSet_AlignmentSet"); + dataset.Tags("barcode moreTags mapping mytags"); + dataset.TimeStampedName("my_time_stamped_name"); + dataset.UniqueId("b095d0a3-94b8-4918-b3af-a3f81bbe519c"); + dataset.Attribute("xmlns", "http://pacificbiosciences.com/PacBioDatasets.xsd") + .Attribute("xmlns:xsi", "http://www.w3.org/2001/XMLSchema-instance") + .Attribute("xsi:schemaLocation", "http://pacificbiosciences.com/PacBioDatasets.xsd"); + + ExternalResource ext("Fake.MetaType", "filename"); + ext.TimeStampedName("custom_tsn") + .UniqueId("my_uuid"); + dataset.ExternalResources().Add(ext); + + dataset.Namespaces().Register(XsdType::BASE_DATA_MODEL, NamespaceInfo("custom", "http://custom/uri.xsd")); + + const string expectedXml = + "\n" + "\n" + "\t\n" + "\t\t\n" + "\t\n" + "\n"; + + stringstream s; + dataset.SaveToStream(s); + EXPECT_EQ(expectedXml, s.str()); +} + +TEST(DataSetXsdTest, ElementRegistryOk) +{ + { // default namespaces + + DataSet ds; + + // append child elements that do not have a C++ built-in, nor namespace prefix with addition + DataSetMetadata& metadata = ds.Metadata(); + metadata.AddChild(internal::DataSetElement("SummaryStats")); + 
metadata.AddChild(internal::DataSetElement("CopyFiles")); + metadata.AddChild(internal::DataSetElement("BioSamples")); + metadata.AddChild(internal::DataSetElement("AutomationParameters")); + + stringstream s; + ds.SaveToStream(s); + const string output = s.str(); + + // check that default namespace is propagated properly + EXPECT_TRUE(output.find("pbds:SummaryStats") != string::npos); + EXPECT_TRUE(output.find("pbmeta:CopyFiles") != string::npos); + EXPECT_TRUE(output.find("pbsample:BioSamples") != string::npos); + EXPECT_TRUE(output.find("pbbase:AutomationParameters") != string::npos); + } + + { // custom namespaces + + DataSet ds; + + // setup custom namespaces + ds.Namespaces().Register(XsdType::BASE_DATA_MODEL, NamespaceInfo("custom_base", "http://custom/base.xsd")); + ds.Namespaces().Register(XsdType::COLLECTION_METADATA, NamespaceInfo("custom_meta", "http://custom/meta.xsd")); + ds.Namespaces().Register(XsdType::DATASETS, NamespaceInfo("custom_ds", "http://custom/datasets.xsd")); + ds.Namespaces().Register(XsdType::SAMPLE_INFO, NamespaceInfo("custom_sample", "http://custom/base.xsd")); + + // append child elements that do not have a C++ built-in, nor namespace prefix with addition + DataSetMetadata& metadata = ds.Metadata(); + metadata.AddChild(internal::DataSetElement("SummaryStats")); + metadata.AddChild(internal::DataSetElement("CopyFiles")); + metadata.AddChild(internal::DataSetElement("BioSamples")); + metadata.AddChild(internal::DataSetElement("AutomationParameters")); + + stringstream s; + ds.SaveToStream(s); + const string output = s.str(); + + // check that custom namespace is propagated properly + EXPECT_TRUE(output.find("custom_ds:SummaryStats") != string::npos); + EXPECT_TRUE(output.find("custom_meta:CopyFiles") != string::npos); + EXPECT_TRUE(output.find("custom_sample:BioSamples") != string::npos); + EXPECT_TRUE(output.find("custom_base:AutomationParameters") != string::npos); + } +} diff --git a/tests/src/test_EndToEnd.cpp 
b/tests/src/test_EndToEnd.cpp new file mode 100644 index 0000000..9675914 --- /dev/null +++ b/tests/src/test_EndToEnd.cpp @@ -0,0 +1,254 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. 
+ +// Author: Derek Barnett + +#ifdef PBBAM_TESTING +#define private public +#define protected public +#endif + +#include "TestData.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace PacBio::BAM::tests; +using namespace std; + +namespace PacBio { +namespace BAM { +namespace tests { + +struct Bam1Deleter +{ + void operator()(bam1_t* b) { + if (b) + bam_destroy1(b); + b = nullptr; + } +}; + +struct SamFileDeleter +{ + void operator()(samFile* file) { + if (file) + sam_close(file); + file = nullptr; + } +}; + +struct BamHdrDeleter +{ + void operator()(bam_hdr_t* hdr) { + if (hdr) + bam_hdr_destroy(hdr); + hdr = nullptr; + } +}; + +const string inputBamFn = tests::Data_Dir + "/aligned.bam"; +const string goldStandardSamFn = tests::Data_Dir + "/aligned.sam"; +const string generatedBamFn = tests::GeneratedData_Dir + "/generated.bam"; +const string generatedSamFn = tests::GeneratedData_Dir + "/generated.sam"; +const vector generatedFiles = { generatedBamFn, generatedSamFn }; + +static inline +int RunBam2Sam(const string& bamFn, + const string& samFn, + const string& args = string()) +{ + stringstream s; + s << tests::Bam2Sam << " " << args << " " << bamFn << " > " << samFn; + return system(s.str().c_str()); +} + +static inline +int RunDiff(const string& fn1, const string& fn2) +{ + stringstream s; + s << "diff " << fn1 << " " << fn2; + return system(s.str().c_str()); +} + +static inline +void Remove(const vector& files) +{ + for (const auto& fn : files) + remove(fn.c_str()); +} + +static inline +void CheckGeneratedOutput(void) +{ + // convert to sam & diff against gold standard + const int convertRet = RunBam2Sam(generatedBamFn, generatedSamFn); + const int diffRet = RunDiff(goldStandardSamFn, generatedSamFn); + EXPECT_EQ(0, convertRet); + EXPECT_EQ(0, diffRet); + + // clean up + Remove(generatedFiles); +} + +} // namespace tests +} // 
namespace BAM +} // namespace PacBio + +// sanity check for rest of tests below +TEST(EndToEndTest, ReadAndWrite_PureHtslib) +{ + { // scoped to force flush & close before conversion/diff + + // open files + + unique_ptr inWrapper(sam_open(inputBamFn.c_str(), "r")); + samFile* in = inWrapper.get(); + ASSERT_TRUE(in); + + unique_ptr outWrapper(sam_open(generatedBamFn.c_str(), "wb")); + samFile* out = outWrapper.get(); + ASSERT_TRUE(out); + + // fetch & write header + + unique_ptr headerWrapper(sam_hdr_read(in)); + bam_hdr_t* hdr = headerWrapper.get(); + ASSERT_TRUE(hdr); + ASSERT_EQ(0, sam_hdr_write(out, hdr)); + + // fetch & write records + + unique_ptr record(bam_init1()); + bam1_t* b = record.get(); + ASSERT_TRUE(b); + + while (sam_read1(in, hdr, b) >= 0) + sam_write1(out, hdr, b); + } + + CheckGeneratedOutput(); +} + +TEST(EndToEndTest, ReadAndWrite_SingleThread) +{ + EXPECT_NO_THROW( + { + // open input BAM file + BamFile bamFile(tests::inputBamFn); + + // open output BAM file + BamWriter writer(tests::generatedBamFn, bamFile.Header(), BamWriter::DefaultCompression, 1); + + // copy BAM file + EntireFileQuery entireFile(bamFile); + for (const BamRecord& record : entireFile) + writer.Write(record); + }); + + CheckGeneratedOutput(); +} + +TEST(EndToEndTest, ReadAndWrite_APIDefaultThreadCount) +{ + EXPECT_NO_THROW( + { + // open input BAM file + BamFile bamFile(inputBamFn); + + // open output BAM file + BamWriter writer(generatedBamFn, bamFile.Header()); + + // copy BAM file + EntireFileQuery entireFile(bamFile); + for (const BamRecord& record : entireFile) + writer.Write(record); + }); + + CheckGeneratedOutput(); +} + +TEST(EndToEndTest, ReadAndWrite_SystemDefaultThreadCount) +{ + EXPECT_NO_THROW( + { + // open input BAM file + BamFile bamFile(inputBamFn); + + // open output BAM file + BamWriter writer(generatedBamFn, + bamFile.Header(), + BamWriter::DefaultCompression, + 0); + + // copy BAM file + EntireFileQuery entireFile(bamFile); + for (const BamRecord& 
record : entireFile) + writer.Write(record); + }); + + CheckGeneratedOutput(); +} + +TEST(EndToEndTest, ReadAndWrite_UserThreadCount) +{ + EXPECT_NO_THROW( + { + // open input BAM file + BamFile bamFile(inputBamFn); + + // open output BAM file + BamWriter writer(generatedBamFn, + bamFile.Header(), + BamWriter::DefaultCompression, + 3); + + // copy BAM file + EntireFileQuery entireFile(bamFile); + for (const BamRecord& record : entireFile) + writer.Write(record); + }); + + CheckGeneratedOutput(); +} diff --git a/tests/src/test_EntireFileQuery.cpp b/tests/src/test_EntireFileQuery.cpp new file mode 100644 index 0000000..47c25db --- /dev/null +++ b/tests/src/test_EntireFileQuery.cpp @@ -0,0 +1,137 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. + +// Author: Derek Barnett + +#ifdef PBBAM_TESTING +#define private public +#endif + +#include "TestData.h" +#include +#include +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace std; + +const string inputBamFn = tests::Data_Dir + "/aligned.bam"; + +TEST(EntireFileQueryTest, CountRecords) +{ + EXPECT_NO_THROW( + { + BamFile bamFile(inputBamFn); + int count = 0; + EntireFileQuery entireFile(bamFile); + for (const BamRecord& record : entireFile) { + (void)record; + ++count; + } + + EXPECT_EQ(4, count); + }); +} + +TEST(EntireFileQueryTest, NonConstBamRecord) +{ + EXPECT_NO_THROW( + { + BamFile bamFile(inputBamFn); + int count = 0; + EntireFileQuery entireFile(bamFile); + for (BamRecord& record : entireFile) { + (void)record; + ++count; + } + + EXPECT_EQ(4, count); + }); +} + +TEST(BamRecordTest, HandlesDeletionOK) +{ + // this file raised no error in Debug mode, but segfaulted when + // trying to access the aligned qualities in Release mode + + const string problemBamFn = tests::Data_Dir + "/segfault.bam"; + BamFile bamFile(problemBamFn); + int count = 0; + EntireFileQuery entireFile(bamFile); + for (const BamRecord& record : entireFile) { + + const auto rawQualities = record.Qualities(Orientation::GENOMIC, false); + const auto alignedQualities = record.Qualities(Orientation::GENOMIC, true); + + const string rawExpected = + 
"IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII"; + + // 1=1D98= + const string alignedExpected = + "I!IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII"; + + EXPECT_EQ(rawExpected, rawQualities.Fastq()); + EXPECT_EQ(alignedExpected, alignedQualities.Fastq()); + + ++count; + } + + EXPECT_EQ(1, count); +} + + +TEST(BamRecordTest, ReferenceName) +{ + { // check reference name of first record + const string exampleBam = tests::Data_Dir + "/aligned.bam"; + BamFile bamFile(exampleBam); + EntireFileQuery records(bamFile); + auto firstIter = records.begin(); + auto& firstRecord = *firstIter; + ASSERT_TRUE(firstRecord.IsMapped()); + EXPECT_EQ("lambda_NEB3011", firstRecord.ReferenceName()); + } + + { // unmapped records have no reference name, should throw + const string exampleBam = tests::Data_Dir + "/unmap1.bam"; + BamFile bamFile(exampleBam); + EntireFileQuery records(bamFile); + auto firstIter = records.begin(); + auto& firstRecord = *firstIter; + ASSERT_FALSE(firstRecord.IsMapped()); + EXPECT_THROW(firstRecord.ReferenceName(), std::runtime_error); + } +} diff --git a/tests/src/test_Fasta.cpp b/tests/src/test_Fasta.cpp new file mode 100644 index 0000000..25b0390 --- /dev/null +++ b/tests/src/test_Fasta.cpp @@ -0,0 +1,105 @@ +// Copyright (c) 2016, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. 
+ +// Author: Derek Barnett + +#ifdef PBBAM_TESTING +#define private public +#endif + +#include "TestData.h" +#include +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace std; + +static void CheckSequence(const size_t index, const FastaSequence& seq) +{ + SCOPED_TRACE("checking FASTA seq:" + std::to_string(index)); + switch (index) { + case 0 : + EXPECT_EQ("1", seq.Name()); + EXPECT_EQ("TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACAACGCAGCTCCGCCCTCGCGGTGCTCTCCGGGTCTGTGCTGAGGAGAACGCAACTCCGCCGGCGCAGGCG", seq.Bases()); + break; + + case 1 : + EXPECT_EQ("2", seq.Name()); + EXPECT_EQ("TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACAACGCAGCTCCGCCCTCGCGGTGCTCTCCGGGTCTGTGCTGAGGAGAACGCAAC", seq.Bases()); + break; + + case 2 : + EXPECT_EQ("3", seq.Name()); + EXPECT_EQ("TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACACCCTAACCCCAACCCCAACCCCAACCCCAACCCCAACCCCAACCCTAACCCCTAACCCTAACCCT", seq.Bases()); + break; + + default: + ASSERT_TRUE(false); // invalid index + } +} + +TEST(FastaSequenceTest, BasicConstructorOk) +{ + FastaSequence seq{ "1", "GATTACA" }; + EXPECT_EQ("1", seq.Name()); + EXPECT_EQ("GATTACA", seq.Bases()); +} + +TEST(FastaReaderTest, IterableOk) +{ + const string fn = tests::GeneratedData_Dir + "/normal.fa"; + FastaReader reader{ fn }; + + size_t count = 0; + FastaSequence seq; + while (reader.GetNext(seq)) { + CheckSequence(count, seq); + ++count; + } + EXPECT_EQ(3, count); +} + +TEST(FastaReaderTest, ReadAllOk) +{ + const string fn = tests::GeneratedData_Dir + "/normal.fa"; + + size_t count = 0; + for (const auto& seq : FastaReader::ReadAll(fn)) { + CheckSequence(count, seq); + ++count; + } + EXPECT_EQ(3, count); +} diff --git a/tests/src/test_FileUtils.cpp b/tests/src/test_FileUtils.cpp new file mode 100644 index 0000000..c1b4beb --- /dev/null +++ b/tests/src/test_FileUtils.cpp @@ -0,0 +1,326 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of 
California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. 
+ +// Author: Derek Barnett + +#ifdef PBBAM_TESTING +#define private public +#endif + +#include "TestData.h" +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include + +using namespace PacBio; +using namespace PacBio::BAM; +using namespace PacBio::BAM::internal; +using namespace std; + +TEST(FileUtilsTest, ExistsOk) +{ + EXPECT_FALSE(FileUtils::Exists("does_not_exist.txt")); + + const string tmp = tests::GeneratedData_Dir + "/pbbam_exists_check.tmp"; + const string cmd = string("touch ") + tmp; + ASSERT_EQ(0, system(cmd.c_str())); + EXPECT_TRUE(FileUtils::Exists(tmp)); +} + +TEST(FileUtilsTest, LastModifiedOk) +{ + // a little tricky to check without going a full 'mock' filesystem route, but we can approximate + // + // also, I can't seem to get better than second resolution (on OSX 10.9/clang at least, st_mtimespec.tv_nsec is always zero) + + const auto now = CurrentTime(); + const auto nowDuration = now.time_since_epoch(); + const auto nowSeconds = chrono::duration_cast(nowDuration).count(); + + const string tmp = tests::GeneratedData_Dir + "/pbbam_lastmod_check.tmp"; + const string rmCmd = string("rm ") + tmp; + const string touchCmd = string("touch ") + tmp; + int ret = system(rmCmd.c_str()); + (void)ret; // unused + ASSERT_EQ(0, system(touchCmd.c_str())); + + const auto stamp = FileUtils::LastModified(tmp); + const auto stampDuration = stamp.time_since_epoch(); + const auto stampSeconds = chrono::duration_cast(stampDuration).count(); + + EXPECT_LE(nowSeconds, stampSeconds); +} + +TEST(FileUtilsTest, ResolvedFilePathOk) +{ + const string testFrom = "/path/to/myDir"; + + // "raw" filenames - no URI scheme + + const string absolutePath = "/absolute/path/to/file.txt"; + const string relativePath = "../relative/path/to/file.txt"; + const string noPathFn = "file.txt"; + + const string resolvedAbsolutePath = FileUtils::ResolvedFilePath(absolutePath, testFrom); + const string resolvedRelativePath = 
FileUtils::ResolvedFilePath(relativePath, testFrom); + const string resolvedNoPath = FileUtils::ResolvedFilePath(noPathFn, testFrom); + const string resolvedAbsolutePath_defaultFrom = FileUtils::ResolvedFilePath(absolutePath); + const string resolvedRelativePath_defaultFrom = FileUtils::ResolvedFilePath(relativePath); + const string resolvedNoPath_defaultFrom = FileUtils::ResolvedFilePath(noPathFn); + + EXPECT_EQ("/absolute/path/to/file.txt", resolvedAbsolutePath); + EXPECT_EQ("/path/to/myDir/../relative/path/to/file.txt", resolvedRelativePath); + EXPECT_EQ("/path/to/myDir/file.txt", resolvedNoPath); + + EXPECT_EQ("/absolute/path/to/file.txt", resolvedAbsolutePath_defaultFrom); + EXPECT_EQ("./../relative/path/to/file.txt", resolvedRelativePath_defaultFrom); + EXPECT_EQ("./file.txt", resolvedNoPath_defaultFrom); + + // filenames with URI scheme ("file://") + + const string absoluteSchemeFn = "file:///absolute/path/to/file.txt"; + const string relativeSchemeFn = "file://../relative/path/to/file.txt"; + const string noPathSchemeFn = "file://file.txt"; + + const string resolvedAbsoluteSchemePath = FileUtils::ResolvedFilePath(absoluteSchemeFn, testFrom); + const string resolvedRelativeSchemePath = FileUtils::ResolvedFilePath(relativeSchemeFn, testFrom); + const string resolvedNoPathSchemeFn = FileUtils::ResolvedFilePath(noPathSchemeFn, testFrom); + const string resolvedAbsoluteSchemePath_defaultFrom = FileUtils::ResolvedFilePath(absoluteSchemeFn); + const string resolvedRelativeSchemePath_defaultFrom = FileUtils::ResolvedFilePath(relativeSchemeFn); + const string resolvedNoPathSchemeFn_defaultFrom = FileUtils::ResolvedFilePath(noPathSchemeFn); + + EXPECT_EQ("/absolute/path/to/file.txt", resolvedAbsoluteSchemePath); + EXPECT_EQ("/path/to/myDir/../relative/path/to/file.txt", resolvedRelativeSchemePath); + EXPECT_EQ("/path/to/myDir/file.txt", resolvedNoPathSchemeFn); + + EXPECT_EQ("/absolute/path/to/file.txt", resolvedAbsoluteSchemePath_defaultFrom); + 
EXPECT_EQ("./../relative/path/to/file.txt", resolvedRelativeSchemePath_defaultFrom); + EXPECT_EQ("./file.txt", resolvedNoPathSchemeFn_defaultFrom); +} + +TEST(FileUtilsTest, SizeOk) +{ + const string tmp = tests::GeneratedData_Dir + "/pbbam_empty_file.tmp"; + const string cmd = string("touch ") + tmp; + ASSERT_EQ(0, system(cmd.c_str())); + EXPECT_EQ(0, FileUtils::Size(tmp)); + + EXPECT_THROW(FileUtils::Size("does_not_exist.txt"), std::runtime_error); +} + +// #################################################################################################### +// The code below is part of a simple check whether or not a (Windows-only) file path is absolute. +// +// NOTE: (and this is admittedly brittle for maintenance, but) the internal methods used are literally +// copied here for direct driving. There's likely a better way going forward, than the manual copy/paste. +// But in the absence of a similar runtime environment to build in & test against, while +// the motivating behavior is blocking other work, this lets me get the fix in their hands ASAP and still +// have some test code poking it beforehand. 
// -DB
//
namespace test_windows {

// Separator used when joining the 'from' directory and a relative path.
static const char native_pathSeparator = '\\';

// True when filePath begins with a drive designator: one letter followed by ':'.
static bool hasDiskName(const std::string& filePath)
{
    // The <cctype> classifiers require a value representable as unsigned char;
    // passing a negative plain char is undefined behaviour, so convert first.
    return filePath.size() >= 2 &&
           std::isalpha(static_cast<unsigned char>(filePath[0])) != 0 &&
           filePath[1] == ':';
}

// Strips a leading "file://" scheme.
//
// Returns the input unchanged when no scheme is present.
// Throws std::runtime_error when "file://" occurs anywhere but the beginning.
// Precondition (asserted): uri is not empty.
static std::string removeFileUriScheme(const std::string& uri)
{
    assert(!uri.empty());

    const std::string fileScheme{"file://"};
    const auto schemeFound = uri.find(fileScheme);
    if (schemeFound == std::string::npos)
        return uri;
    if (schemeFound != 0)
        throw std::runtime_error("Malformed URI: scheme not at beginning");
    return uri.substr(fileScheme.size());
}

// Drops a leading drive designator ("C:"), if any; everything else is returned as-is.
static
std::string removeDiskName(const std::string& filePath)
{
    return hasDiskName(filePath) ? filePath.substr(2) : filePath;
}

// Decides whether a Windows-style path is absolute.
//
//   - leading backslash (rooted path or UNC "\\server\...")  -> absolute
//   - leading '.' ("." or "..")                              -> relative
//   - drive designator                                       -> decided by what follows it
//   - anything else                                          -> relative
//
// Precondition (asserted): filePath is not empty.
static bool native_pathIsAbsolute(const std::string& filePath)
{
    assert(!filePath.empty());

    const auto startsWith = [&filePath](const char c) {
        return !filePath.empty() && filePath.front() == c;
    };

    // if starts with single slash or double slash
    if (startsWith('\\'))
        return true;

    // if starts with single or double-dots -> not absolute
    if (startsWith('.'))
        return false;

    // if starts with drive name and colon ("C:\foo\bar.txt")
    if (hasDiskName(filePath)) {
        // A bare "C:" names the current directory on that drive: treat it as
        // relative instead of recursing with an empty string (which would
        // violate the precondition above).
        const std::string remainder = filePath.substr(2);
        return !remainder.empty() && native_pathIsAbsolute(remainder);
    }

    // otherwise, likely relative
    return false;
}

// Resolves filePath against the directory 'from'.
//
// The "file://" scheme is stripped first. An empty or absolute result is
// returned untouched. Otherwise any drive designator and a single leading
// ".\" are removed and the remainder is appended to 'from' + separator.
// A leading "..\" is kept.
static std::string native_resolvedFilePath(const std::string& filePath,
                                           const std::string& from)
{
    // strip file:// scheme if present
    auto schemeLess = removeFileUriScheme(filePath);

    // if empty or already absolute path, just return it
    // upfront empty check simplifies further parsing logic
    if (schemeLess.empty() || native_pathIsAbsolute(schemeLess))
        return schemeLess;

    // else make relative from the provided 'from' directory:
    // first pop disk name, then any leading single-dot '.'
    //
    // since we're prepending the 'from' directory, we can remove any leading
    // '.\' from our file path. this may just mean that we pop it off to add it
    // right back (when from == '.'), but this keeps it consistent with other
    // 'from' parent directories
    schemeLess = removeDiskName(schemeLess);

    const bool thisDirAtStart = schemeLess.size() >= 2 &&
                                schemeLess[0] == '.' &&
                                schemeLess[1] == native_pathSeparator;
    if (thisDirAtStart)
        schemeLess.erase(0, 2);

    return from + native_pathSeparator + schemeLess;
}

} // namespace test_windows
+ + // "\\server\path\to\tmp.txt" + EXPECT_TRUE(test_windows::native_pathIsAbsolute("\\\\server\\path\\to\tmp.txt")); + + // "..\tmp.txt" + EXPECT_FALSE(test_windows::native_pathIsAbsolute("..\\tmp.txt")); + + // ".\tmp.txt" + EXPECT_FALSE(test_windows::native_pathIsAbsolute(".\\tmp.txt")); + + // "C:\path\to\tmp.txt" + EXPECT_TRUE(test_windows::native_pathIsAbsolute("C:\\path\\to\\tmp.txt")); + + // "C:..\path\to\tmp.txt" + EXPECT_FALSE(test_windows::native_pathIsAbsolute("C:..\\path\\to\\tmp.txt")); + } + + { // resolve file path + + const string myRootDir = "C:\\path\\to\\myRootDir"; + + // "\\server\path\to\tmp.txt" + const string fn1 = "\\\\server\\path\\to\tmp.txt"; + const string fn1_expected = fn1; + EXPECT_EQ(fn1_expected, test_windows::native_resolvedFilePath(fn1, myRootDir)); + + // "..\tmp.txt" + const string fn2 = "..\\tmp.txt"; + const string fn2_expected = "C:\\path\\to\\myRootDir\\..\\tmp.txt"; + EXPECT_EQ(fn2_expected, test_windows::native_resolvedFilePath(fn2, myRootDir)); + + // ".\tmp.txt" + const string fn3 = ".\\tmp.txt"; + const string fn3_expected = "C:\\path\\to\\myRootDir\\tmp.txt"; + EXPECT_EQ(fn3_expected, test_windows::native_resolvedFilePath(fn3, myRootDir)); + + // "C:\path\to\tmp.txt" + const string fn4 = "C:\\path\\to\\tmp.txt"; + const string fn4_expected = fn4; + EXPECT_EQ(fn4_expected, test_windows::native_resolvedFilePath(fn4, myRootDir)); + + // "C:..\path\to\tmp.txt" + const string fn5 = "C:..\\path\\to\\tmp.txt"; + const string fn5_expected = "C:\\path\\to\\myRootDir\\..\\path\\to\\tmp.txt"; + EXPECT_EQ(fn5_expected, test_windows::native_resolvedFilePath(fn5, myRootDir)); + + // "C:tmp.txt" + const string fn6 = "C:tmp.txt"; + const string fn6_expected = "C:\\path\\to\\myRootDir\\tmp.txt"; + EXPECT_EQ(fn6_expected, test_windows::native_resolvedFilePath(fn6, myRootDir)); + EXPECT_EQ(fn3_expected, test_windows::native_resolvedFilePath(fn6, myRootDir)); // our path is equivalent to fn3's "./temp.txt" + } +} +// +// 
#################################################################################################### + + diff --git a/tests/src/test_Frames.cpp b/tests/src/test_Frames.cpp new file mode 100644 index 0000000..797eb6a --- /dev/null +++ b/tests/src/test_Frames.cpp @@ -0,0 +1,97 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. + +// Author: Derek Barnett + +#ifdef PBBAM_TESTING +#define private public +#endif + +#include +#include +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace std; + +namespace tests { + +static const vector testFrames = +{ + 0, 8, 140, 0, 0, 7, 4, 0, 85, 2, + 1, 3, 2, 10, 1, 20, 47, 10, 9, 60, + 20, 3, 12, 5, 13, 165, 6, 14, 22, 12, + 2, 4, 9, 218, 27, 3, 15, 2, 17, 2, + 45, 24, 89, 10, 7, 1, 11, 15, 0, 7, + 0, 28, 17, 12, 6, 10, 37, 0, 12, 52, + 0, 7, 1, 14, 3, 26, 12, 0, 20, 17, + 2, 13, 2, 9, 13, 7, 15, 29, 3, 6, + 2, 1, 28, 10, 3, 14, 7, 1, 22, 1, + 6, 6, 0, 19, 31, 6, 2, 14, 0, 0, + 1000, 947, 948 +}; + +static const vector encodedFrames = +{ + 0, 8, 102, 0, 0, 7, 4, 0, 75, 2, 1, 3, 2, + 10, 1, 20, 47, 10, 9, 60, 20, 3, 12, 5, 13, 115, + 6, 14, 22, 12, 2, 4, 9, 135, 27, 3, 15, 2, 17, + 2, 45, 24, 77, 10, 7, 1, 11, 15, 0, 7, 0, 28, + 17, 12, 6, 10, 37, 0, 12, 52, 0, 7, 1, 14, 3, + 26, 12, 0, 20, 17, 2, 13, 2, 9, 13, 7, 15, 29, + 3, 6, 2, 1, 28, 10, 3, 14, 7, 1, 22, 1, 6, + 6, 0, 19, 31, 6, 2, 14, 0, 0, + 255, 254, 255 +}; + +} // namespace tests + +TEST(FramesTest, Constructors) +{ + const Frames f; + ASSERT_TRUE(f.Data().empty()); + + const Frames f2(tests::testFrames); + const auto d = f2.Data(); + ASSERT_EQ(tests::testFrames, d); +} + +TEST(FramesTest, Encoded) +{ + const Frames f(tests::testFrames); + const auto e = f.Encode(); + ASSERT_EQ(tests::encodedFrames, e); +} diff --git 
a/tests/src/test_GenomicIntervalQuery.cpp b/tests/src/test_GenomicIntervalQuery.cpp new file mode 100644 index 0000000..96727d7 --- /dev/null +++ b/tests/src/test_GenomicIntervalQuery.cpp @@ -0,0 +1,157 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. 
+ +// Author: Derek Barnett + +#ifdef PBBAM_TESTING +#define private public +#endif + +#include "TestData.h" +#include +#include +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace std; + +const string inputBamFn = tests::Data_Dir + "/aligned.bam"; + +TEST(GenomicIntervalQueryTest, ReuseQueryAndCountRecords) +{ + const string rname = "lambda_NEB3011"; + + BamFile bamFile(inputBamFn); + + // setup with normal interval + int count = 0; + GenomicInterval interval(rname, 5000, 6000); + GenomicIntervalQuery query(interval, bamFile); + for (const BamRecord& record : query) { + (void)record; + ++count; + } + EXPECT_EQ(2, count); + + // adjust interval and pass back in + count = 0; + interval.Start(9300); + interval.Stop(9400); + query.Interval(interval); + for (const BamRecord& record : query) { + (void)record; + ++count; + } + EXPECT_EQ(2, count); + + // adjust again (empty region) + count = 0; + interval.Name(rname); + interval.Start(1000); + interval.Stop(2000); + query.Interval(interval); + for (const BamRecord& record : query) { + (void)record; + ++count; + } + EXPECT_EQ(0, count); + + // unknown ref + count = 0; + interval.Name("does not exist"); + interval.Start(0); + interval.Stop(100); + EXPECT_THROW(query.Interval(interval), std::runtime_error); + for (const BamRecord& record : query) { // iteration is still safe, just returns no data + (void)record; + ++count; + } + EXPECT_EQ(0, count); + + // adjust again - make sure we can read a real region after an invalid one + interval.Name(rname); + interval.Start(5000); + interval.Stop(6000); + query.Interval(interval); + count = 0; + for (const BamRecord& record : query) { + (void)record; + ++count; + } + EXPECT_EQ(2, count); +} + +TEST(GenomicIntervalQueryTest, NonConstBamRecord) +{ + EXPECT_NO_THROW( + { + BamFile bamFile(inputBamFn); + int count = 0; + + GenomicInterval interval("lambda_NEB3011", 8000, 10000); + GenomicIntervalQuery query(interval, bamFile); + for (BamRecord& 
record : query) { + (void)record; + ++count; + } + EXPECT_EQ(2, count); + }); +} + +TEST(GenomicIntervalQueryTest, MissingBaiShouldThrow) +{ + GenomicInterval interval("lambda_NEB3011", 0, 100); + const string phi29Bam = tests::Data_Dir + "/phi29.bam"; + const string hasBaiBam = tests::Data_Dir + "/aligned.bam"; + + { // single file, missing BAI + EXPECT_THROW(GenomicIntervalQuery query(interval, phi29Bam), std::runtime_error); + } + + { // from dataset, all missing BAI + DataSet ds; + ds.ExternalResources().Add(ExternalResource("PacBio.SubreadFile.SubreadBamFile", phi29Bam)); + ds.ExternalResources().Add(ExternalResource("PacBio.SubreadFile.SubreadBamFile", phi29Bam)); + EXPECT_THROW(GenomicIntervalQuery query(interval, ds), std::runtime_error); + } + + { // from dataset, mixed BAI presence + DataSet ds; + ds.ExternalResources().Add(ExternalResource("PacBio.SubreadFile.SubreadBamFile", phi29Bam)); + ds.ExternalResources().Add(ExternalResource("PacBio.AlignmentFile.AlignmentBamFile", hasBaiBam)); + EXPECT_THROW(GenomicIntervalQuery query(interval, ds), std::runtime_error); + } +} diff --git a/tests/src/test_IndexedFastaReader.cpp b/tests/src/test_IndexedFastaReader.cpp new file mode 100644 index 0000000..68a0331 --- /dev/null +++ b/tests/src/test_IndexedFastaReader.cpp @@ -0,0 +1,212 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. 
+// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. 
+ +// Author: Derek Barnett + +#ifdef PBBAM_TESTING +#define private public +#endif + +#include "TestData.h" +#include "pbbam/IndexedFastaReader.h" +#include "pbbam/BamRecord.h" +#include "pbbam/BamFile.h" +#include "pbbam/EntireFileQuery.h" +#include +#include +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace std; + +const string lambdaFasta = tests::Data_Dir + "/lambdaNEB.fa"; +const string singleInsertionBam = tests::Data_Dir + "/aligned.bam"; + +TEST(IndexedFastaReaderTests, PrintSingleInsertion) +{ + IndexedFastaReader r(lambdaFasta); + + // Open BAM file + BamFile bamFile(singleInsertionBam); + EntireFileQuery bamQuery(bamFile); + + auto it = bamQuery.begin(); + auto record = *it++; + EXPECT_EQ("GGCTGCAGTGTACAGCGGTCAGGAGGCC-ATTGATGCCGGACTGGCTGAT", + r.ReferenceSubsequence(record, Orientation::NATIVE, true)); + EXPECT_EQ("GGCTGCAGTGTACAGCGGTCAGGAGGCC-ATTGATGCCGGACTGGCTGAT", + r.ReferenceSubsequence(record, Orientation::NATIVE, true, true)); + EXPECT_EQ("GGCTGCAGTGTACAGCGGTCAGGAGGCC-ATTGATGCCGGACTGGCTGAT", + r.ReferenceSubsequence(record, Orientation::GENOMIC, true)); + EXPECT_EQ("GGCTGCAGTGTACAGCGGTCAGGAGGCC-ATTGATGCCGGACTGGCTGAT", + r.ReferenceSubsequence(record, Orientation::GENOMIC, true, true)); + record = *it++; + EXPECT_EQ("GGCTGCAGTGTACAGCGGTCAGGAGGCC-ATTGATGCCGGACTGGCTGAT", + r.ReferenceSubsequence(record, Orientation::NATIVE, true)); + EXPECT_EQ("GGCTGCAGTGTACAGCGGTCAGGAGGCC-ATTGATGCCGGACTGGCTGAT", + r.ReferenceSubsequence(record, Orientation::NATIVE, true, true)); + EXPECT_EQ("GGCTGCAGTGTACAGCGGTCAGGAGGCC-ATTGATGCCGGACTGGCTGAT", + r.ReferenceSubsequence(record, Orientation::GENOMIC, true)); + EXPECT_EQ("GGCTGCAGTGTACAGCGGTCAGGAGGCC-ATTGATGCCGGACTGGCTGAT", + r.ReferenceSubsequence(record, Orientation::GENOMIC, true, true)); + record = *it++; + EXPECT_EQ("----------------------------------------------------AAGTCACCAATGTGGGACGTCCGTCGATGGCAGAAGATCGCAGCACGGT-AACAGCGGCAA", + r.ReferenceSubsequence(record, 
Orientation::NATIVE, true)); + EXPECT_EQ("AAGTCACCAATGTGGGACGTCCGTCGATGGCAGAAGATCGCAGCACGGT-AACAGCGGCAA", + r.ReferenceSubsequence(record, Orientation::NATIVE, true, true)); + EXPECT_EQ("----------------------------------------------------AAGTCACCAATGTGGGACGTCCGTCGATGGCAGAAGATCGCAGCACGGT-AACAGCGGCAA", + r.ReferenceSubsequence(record, Orientation::GENOMIC, true)); + EXPECT_EQ("AAGTCACCAATGTGGGACGTCCGTCGATGGCAGAAGATCGCAGCACGGT-AACAGCGGCAA", + r.ReferenceSubsequence(record, Orientation::GENOMIC, true, true)); + record = *it++; + EXPECT_EQ("AAGTCACCAATGTGGGACGTCCGTCGATGGCAGAAGATCGCAGCACGGT-AACAGCGGCAA----------------------------------------------------", + r.ReferenceSubsequence(record, Orientation::GENOMIC, true)); + EXPECT_EQ("----------------------------------------------------TTGCCGCTGTT-ACCGTGCTGCGATCTTCTGCCATCGACGGACGTCCCACATTGGTGACTT", + r.ReferenceSubsequence(record, Orientation::NATIVE, true)); + EXPECT_EQ("AAGTCACCAATGTGGGACGTCCGTCGATGGCAGAAGATCGCAGCACGGT-AACAGCGGCAA", + r.ReferenceSubsequence(record, Orientation::GENOMIC, true, true)); + EXPECT_EQ("TTGCCGCTGTT-ACCGTGCTGCGATCTTCTGCCATCGACGGACGTCCCACATTGGTGACTT", + r.ReferenceSubsequence(record, Orientation::NATIVE, true, true)); + + // { + // std::stringstream output; + // auto itSS = bamQuery.begin(); + // { + // const auto recordSS = *itSS; + // output << r.ReferenceSubsequence(recordSS, Orientation::NATIVE, true) << std::endl; + // output << recordSS.Sequence(Orientation::NATIVE, true) << std::endl; + // output << std::endl; + // output << r.ReferenceSubsequence(recordSS, Orientation::NATIVE, true, true) << std::endl; + // output << recordSS.Sequence(Orientation::NATIVE, true, true) << std::endl; + // output << std::endl; + // output << r.ReferenceSubsequence(recordSS, Orientation::GENOMIC, true) << std::endl; + // output << recordSS.Sequence(Orientation::GENOMIC, true) << std::endl; + // output << std::endl; + // output << r.ReferenceSubsequence(recordSS, Orientation::GENOMIC, true, true) << std::endl; + 
// output << recordSS.Sequence(Orientation::GENOMIC, true, true) << std::endl; + // output << std::endl; + // } + // ++itSS; + // { + // const auto recordSS = *itSS; + // output << r.ReferenceSubsequence(recordSS, Orientation::NATIVE, true) << std::endl; + // output << recordSS.Sequence(Orientation::NATIVE, true) << std::endl; + // output << std::endl; + // output << r.ReferenceSubsequence(recordSS, Orientation::NATIVE, true, true) << std::endl; + // output << recordSS.Sequence(Orientation::NATIVE, true, true) << std::endl; + // output << std::endl; + // output << r.ReferenceSubsequence(recordSS, Orientation::GENOMIC, true) << std::endl; + // output << recordSS.Sequence(Orientation::GENOMIC, true) << std::endl; + // output << std::endl; + // output << r.ReferenceSubsequence(recordSS, Orientation::GENOMIC, true, true) << std::endl; + // output << recordSS.Sequence(Orientation::GENOMIC, true, true) << std::endl; + // output << std::endl; + // } + // ++itSS; + // { + // const auto recordSS = *itSS; + // output << r.ReferenceSubsequence(recordSS, Orientation::NATIVE, true) << std::endl; + // output << recordSS.Sequence(Orientation::NATIVE, true) << std::endl; + // output << std::endl; + // output << r.ReferenceSubsequence(recordSS, Orientation::NATIVE, true, true) << std::endl; + // output << recordSS.Sequence(Orientation::NATIVE, true, true) << std::endl; + // output << std::endl; + // output << r.ReferenceSubsequence(recordSS, Orientation::GENOMIC, true) << std::endl; + // output << recordSS.Sequence(Orientation::GENOMIC, true) << std::endl; + // output << std::endl; + // output << r.ReferenceSubsequence(recordSS, Orientation::GENOMIC, true, true) << std::endl; + // output << recordSS.Sequence(Orientation::GENOMIC, true, true) << std::endl; + // output << std::endl; + // } + // ++itSS; + // { + // const auto recordSS = *itSS; + // output << r.ReferenceSubsequence(recordSS, Orientation::GENOMIC, true) << std::endl; + // output << 
recordSS.Sequence(Orientation::GENOMIC, true) << std::endl; + // output << std::endl; + // output << r.ReferenceSubsequence(recordSS, Orientation::NATIVE, true) << std::endl; + // output << recordSS.Sequence(Orientation::NATIVE, true) << std::endl; + // output << std::endl; + // output << r.ReferenceSubsequence(recordSS, Orientation::GENOMIC, true, true) << std::endl; + // output << recordSS.Sequence(Orientation::GENOMIC, true, true) << std::endl; + // output << std::endl; + // output << r.ReferenceSubsequence(recordSS, Orientation::NATIVE, true, true) << std::endl; + // output << recordSS.Sequence(Orientation::NATIVE, true, true) << std::endl; + // } + // std::cerr << output.str(); + // } +} + +TEST(IndexedFastaReaderTests, ReadLambda) +{ + IndexedFastaReader r(lambdaFasta); + + EXPECT_TRUE(r.HasSequence("lambda_NEB3011")); + EXPECT_FALSE(r.HasSequence("dog")); + EXPECT_EQ(1, r.NumSequences()); + EXPECT_EQ(48502, r.SequenceLength("lambda_NEB3011")); + + string seq = r.Subsequence("lambda_NEB3011:0-10"); + EXPECT_EQ("GGGCGGCGAC", seq); + + string seq2 = r.Subsequence("lambda_NEB3011", 0, 10); + EXPECT_EQ("GGGCGGCGAC", seq2); + + // subsequence extending beyond bounds returns clipped + string seq3 = r.Subsequence("lambda_NEB3011", 48400, 48600); + EXPECT_EQ(102, seq3.length()); + + // bad subsequence +} + +TEST(IndexedFastaReaderTests, Errors) +{ + IndexedFastaReader r(lambdaFasta); + + // + // attempt access without "opening" + // + // EXPECT_THROW(r.NumSequences(), std::exception); + // EXPECT_THROW(r.HasSequence("lambda_NEB3011"), std::exception); + // EXPECT_THROW(r.SequenceLength("lambda_NEB3011"), std::exception); + // EXPECT_THROW(r.Subsequence("lambda_NEB3011:0-10"), std::exception); + + // + // invalid accesses after opening + // + EXPECT_THROW(r.SequenceLength("dog"), std::exception); + EXPECT_THROW(r.Subsequence("dog:0-10"), std::exception); +} diff --git a/tests/src/test_Intervals.cpp b/tests/src/test_Intervals.cpp new file mode 100644 index 
0000000..14231b1 --- /dev/null +++ b/tests/src/test_Intervals.cpp @@ -0,0 +1,330 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. 
+ +// Author: Derek Barnett + +#ifdef PBBAM_TESTING +#define private public +#endif + +#include +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace std; + +TEST(IntervalTest, Constructors) +{ + Interval empty; + Interval singleton(4); + Interval normal(5, 8); + + EXPECT_EQ(0, empty.Start()); + EXPECT_EQ(0, empty.Stop()); + + EXPECT_EQ(4, singleton.Start()); + EXPECT_EQ(5, singleton.Stop()); + + EXPECT_EQ(5, normal.Start()); + EXPECT_EQ(8, normal.Stop()); + + // TODO: check out-of-order intervals, etc +} + +TEST(IntervalTest, EqualityTest) +{ + Interval empty; + Interval empty2; + + Interval singleton(4); + Interval sameAsSingleton(4,5); + + Interval normal(5, 8); + Interval sameAsNormal(5, 8); + + Interval different(20, 40); + + // self-equality + EXPECT_TRUE(empty == empty); + EXPECT_TRUE(singleton == singleton); + EXPECT_TRUE(normal == normal); + EXPECT_TRUE(different == different); + + // same values equality + EXPECT_TRUE(empty == empty2); + EXPECT_TRUE(singleton == sameAsSingleton); + EXPECT_TRUE(normal == sameAsNormal); + + // different values + EXPECT_FALSE(empty == singleton); + EXPECT_FALSE(empty == normal); + EXPECT_FALSE(empty == different); + EXPECT_FALSE(singleton == normal); + EXPECT_FALSE(normal == different); +} + +TEST(IntervalTest, Copy) +{ + Interval interval1(5, 8); + Interval interval2(interval1); + Interval interval3 = interval1; + + EXPECT_TRUE(interval1 == interval1); + EXPECT_TRUE(interval1 == interval2); + EXPECT_TRUE(interval1 == interval3); +} + +TEST(IntervalTest, Modifier) +{ + Interval interval1(5, 8); + Interval interval2(interval1); + interval2.Start(2); + interval2.Stop(10); + + EXPECT_FALSE(interval1 == interval2); + EXPECT_EQ(2, interval2.Start()); + EXPECT_EQ(10, interval2.Stop()); +} + +TEST(IntervalTest, CoverTest) +{ + Interval interval1(2, 4); + Interval interval2(3, 5); + Interval interval3(6, 8); + Interval interval4(1, 7); + Interval interval5(5, 8); + + 
EXPECT_TRUE(interval1.Covers(interval1)); // self-cover: a.covers(a) + EXPECT_TRUE(interval1.CoveredBy(interval1)); // self-cover: a.coveredBy(a) + + EXPECT_TRUE(interval2.CoveredBy(interval4)); // a.coveredBy(b) + EXPECT_TRUE(interval4.Covers(interval2)); // thus b.covers(a) + EXPECT_FALSE(interval2 == interval4); // if a != b + EXPECT_FALSE(interval2.Covers(interval4)); // then !a.covers(b) + + EXPECT_FALSE(interval2.Covers(interval3)); // completely disjoint + EXPECT_FALSE(interval3.Covers(interval2)); + EXPECT_FALSE(interval2.CoveredBy(interval3)); + EXPECT_FALSE(interval3.CoveredBy(interval2)); + + EXPECT_FALSE(interval2.Covers(interval5)); // a.stop == b.start + EXPECT_FALSE(interval2.CoveredBy(interval5)); + + EXPECT_TRUE(interval5.Covers(interval3)); // shared endpoint, start contained, thus a.covers(b) + EXPECT_TRUE(interval3.CoveredBy(interval5)); // and b.coveredBy(a) +} + +TEST(IntervalTest, IntersectTest) +{ + Interval interval1(2, 4); + Interval interval2(3, 5); + Interval interval3(6, 8); + Interval interval4(1, 7); + Interval interval5(5, 8); + + EXPECT_TRUE(interval1.Intersects(interval1)); // self-intersection: a.intersects(a) + + EXPECT_TRUE(interval1.Intersects(interval2)); // if a.intersects(b) + EXPECT_TRUE(interval2.Intersects(interval1)); // then b.intersects(a) + + EXPECT_TRUE(interval4.Covers(interval1)); // if b.covers(a), + EXPECT_TRUE(interval1.Intersects(interval4)); // then a.intersects(b) + EXPECT_TRUE(interval4.Intersects(interval1)); // and b.intersects(a) + + EXPECT_FALSE(interval2.Intersects(interval3)); // b.start > a.stop (obvious disjoint) + EXPECT_FALSE(interval2.Intersects(interval5)); // b.start == a.stop (intervals are right open, so disjoint) +} + +TEST(IntervalTest, ValidityTest) +{ + Interval interval1; // default ctor + Interval interval2(0,0); // start == stop (zero) + Interval interval3(4,4); // start == stop (nonzero) + Interval interval4(0,1); // start < stop (start is zero) + Interval interval5(4,5); // start < 
stop (start is nonzero) + Interval interval6(5,4); // start > stop + + EXPECT_FALSE(interval1.IsValid()); + EXPECT_FALSE(interval2.IsValid()); + EXPECT_FALSE(interval3.IsValid()); + EXPECT_TRUE(interval4.IsValid()); + EXPECT_TRUE(interval5.IsValid()); + EXPECT_FALSE(interval6.IsValid()); +} + +TEST(IntervalTest, LengthTest) +{ + Interval interval1(2, 4); + Interval interval2(3, 5); + Interval interval3(6, 8); + Interval interval4(1, 7); + Interval interval5(5, 8); + + EXPECT_EQ(2, interval1.Length()); + EXPECT_EQ(2, interval2.Length()); + EXPECT_EQ(2, interval3.Length()); + EXPECT_EQ(6, interval4.Length()); + EXPECT_EQ(3, interval5.Length()); + + // TODO: check out-of-order intervals, etc +} + +TEST(GenomicIntervalTest, DefaultConstructor) +{ + GenomicInterval gi; + EXPECT_EQ("", gi.Name()); + EXPECT_EQ(0, gi.Start()); + EXPECT_EQ(0, gi.Stop()); +} + +TEST(GenomicIntervalTest, ExplicitConstructor) +{ + GenomicInterval gi("foo", 100, 200); + EXPECT_EQ("foo", gi.Name()); + EXPECT_EQ(100, gi.Start()); + EXPECT_EQ(200, gi.Stop()); +} + +TEST(GenomicIntervalTest, RegionStringConstructor) +{ + GenomicInterval gi("foo:100-200"); + EXPECT_EQ("foo", gi.Name()); + EXPECT_EQ(100, gi.Start()); + EXPECT_EQ(200, gi.Stop()); + + GenomicInterval refOnly("foo"); + EXPECT_EQ("foo", refOnly.Name()); + EXPECT_EQ(0, refOnly.Start()); + EXPECT_EQ(1<<29, refOnly.Stop()); // htslib's default, "read-to-end" interval stop +} + +TEST(GenomicIntervalTest, Copy) +{ + GenomicInterval interval1("foo", 10, 20); + GenomicInterval interval2(interval1); + GenomicInterval interval3 = interval1; + + EXPECT_TRUE(interval1 == interval1); + EXPECT_TRUE(interval1 == interval2); + EXPECT_TRUE(interval1 == interval3); +} + +TEST(GenomicIntervalTest, Modifiers) +{ + GenomicInterval interval1("foo", 10, 20); + + // modify individual properties + GenomicInterval interval2(interval1); + interval2.Name("bar"); + interval2.Start(2); + interval2.Stop(10); + + // modify interval as a whole + GenomicInterval 
interval3(interval1); + interval3.Interval(interval2.Interval()); + + EXPECT_FALSE(interval1 == interval2); + EXPECT_EQ("bar", interval2.Name()); + EXPECT_EQ(2, interval2.Start()); + EXPECT_EQ(10, interval2.Stop()); + + EXPECT_EQ(interval1.Name(), interval3.Name()); + EXPECT_EQ(interval2.Interval(), interval3.Interval()); +} + +TEST(GenomicIntervalTest, CoverTest) +{ + GenomicInterval interval1("foo", 2, 4); + GenomicInterval interval2("foo", 3, 5); + GenomicInterval interval3("foo", 6, 8); + GenomicInterval interval4("foo", 1, 7); + GenomicInterval interval5("foo", 5, 8); + + // same as interval2, but different ref + GenomicInterval interval6(interval2); + interval6.Name("bar"); + + EXPECT_TRUE(interval1.Covers(interval1)); // self-cover: a.covers(a) + EXPECT_TRUE(interval1.CoveredBy(interval1)); // self-cover: a.coveredBy(a) + + EXPECT_TRUE(interval2.CoveredBy(interval4)); // a.coveredBy(b) + EXPECT_TRUE(interval4.Covers(interval2)); // thus b.covers(a) + EXPECT_FALSE(interval2 == interval4); // if a != b + EXPECT_FALSE(interval2.Covers(interval4)); // then !a.covers(b) + + EXPECT_FALSE(interval6.CoveredBy(interval4)); // interval 6 has same start/stop as 2, w/ different ref + EXPECT_FALSE(interval4.Covers(interval6)); // + EXPECT_FALSE(interval6 == interval4); // + EXPECT_FALSE(interval6.Covers(interval4)); // + + EXPECT_FALSE(interval2.Covers(interval3)); // completely disjoint + EXPECT_FALSE(interval3.Covers(interval2)); + EXPECT_FALSE(interval2.CoveredBy(interval3)); + EXPECT_FALSE(interval3.CoveredBy(interval2)); + + EXPECT_FALSE(interval2.Covers(interval5)); // a.stop == b.start + EXPECT_FALSE(interval2.CoveredBy(interval5)); + + EXPECT_TRUE(interval5.Covers(interval3)); // shared endpoint, start contained, thus a.covers(b) + EXPECT_TRUE(interval3.CoveredBy(interval5)); // and b.coveredBy(a) +} + +TEST(GenomicIntervalTest, ValidityTest) +{ + GenomicInterval interval1; // default ctor + GenomicInterval interval2("foo",0,0); // valid id, start == stop (zero) 
+ GenomicInterval interval3("foo",4,4); // valid id, start == stop (nonzero) + GenomicInterval interval4("foo",0,1); // valid id, start < stop (start is zero) + GenomicInterval interval5("foo",4,5); // valid id, start < stop (start is nonzero) + GenomicInterval interval6("foo",5,4); // valid id, start > stop + GenomicInterval interval7("",0,0); // invalid id, start == stop (zero) + GenomicInterval interval8("",4,4); // invalid id, start == stop (nonzero) + GenomicInterval interval9("",0,1); // invalid id, start < stop (start is zero) + GenomicInterval interval10("",4,5); // invalid id, start < stop (start is nonzero) + GenomicInterval interval11("",5,4); // invalid id, start > stop + + EXPECT_FALSE(interval1.IsValid()); + EXPECT_FALSE(interval2.IsValid()); + EXPECT_FALSE(interval3.IsValid()); + EXPECT_TRUE(interval4.IsValid()); + EXPECT_TRUE(interval5.IsValid()); + EXPECT_FALSE(interval6.IsValid()); + EXPECT_FALSE(interval7.IsValid()); + EXPECT_FALSE(interval8.IsValid()); + EXPECT_FALSE(interval9.IsValid()); + EXPECT_FALSE(interval10.IsValid()); + EXPECT_FALSE(interval11.IsValid()); +} diff --git a/tests/src/test_PacBioIndex.cpp b/tests/src/test_PacBioIndex.cpp new file mode 100644 index 0000000..fa17dc7 --- /dev/null +++ b/tests/src/test_PacBioIndex.cpp @@ -0,0 +1,1012 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. 
+// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. 
+ +// Author: Derek Barnett + +#ifdef PBBAM_TESTING +#define private public +#endif + +#include "TestData.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace PacBio; +using namespace PacBio::BAM; +using namespace std; + +const string test2BamFn = tests::Data_Dir + "/aligned2.bam"; +const string phi29BamFn = tests::Data_Dir + "/phi29.bam"; + +namespace PacBio { +namespace BAM { +namespace tests { + +static +PbiRawData Test2Bam_CoreIndexData(void) + +{ + PbiRawData rawData; + rawData.Version(PbiFile::Version_3_0_1); + rawData.FileSections(PbiFile::BASIC | PbiFile::MAPPED | PbiFile::REFERENCE); + rawData.NumReads(10); + + PbiRawBasicData& basicData = rawData.BasicData(); + basicData.rgId_ = { -1197849594, -1197849594, -1197849594, -1197849594, -1197849594, -1197849594, -1197849594, -1197849594, -1197849594, -1197849594 }; + basicData.qStart_ = {48,387,0,9936,10232,7468,5557,7285,426,7064}; + basicData.qEnd_ = {1132,1134,344,10187,10394,8906,7235,8657,1045,7421}; + basicData.holeNumber_ = {49050,32328,32328,6469,6469,30983,13473,13473,19915,30983}; + basicData.readQual_ = {0.6,0.6,0.6,0.6,0.6,0.6,0.6,0.6,0.6,0.6}; + basicData.ctxtFlag_ = {0,0,0,0,0,0,0,0,0,0}; + basicData.fileOffset_ = { 33816576, 33825163, 33831333, 33834264, 33836542, 33838065, 33849818, 33863499, 33874621, 1392836608 }; + + PbiRawMappedData& mappedData = rawData.MappedData(); + mappedData.tId_ = {0,0,0,0,0,0,0,0,0,0}; + mappedData.tStart_ = {0,302,675,2170,2203,3572,4506,4507,4592,4669}; + mappedData.tEnd_ = {471,1019,1026,2397,2326,5015,6125,5850,5203,5011}; + mappedData.aStart_ = {653,395,1,9960,10271,7468,5574,7285,441,7075}; + mappedData.aEnd_ = {1129,1134,344,10185,10394,8906,7235,8647,1040,7418}; + mappedData.revStrand_ = {0,1,0,1,0,1,1,0,1,0}; + mappedData.nM_ = {460,704,339,216,118,1394,1581,1313,583,333}; + mappedData.nMM_ = {0,0,0,0,0,0,0,0,0,0}; + mappedData.mapQV_ = 
{254,254,254,254,254,254,254,254,254,254}; + + PbiRawReferenceData& referenceData = rawData.ReferenceData(); + referenceData.entries_ = { + PbiReferenceEntry{0,0,10}, + PbiReferenceEntry{4294967295,4294967295,4294967295} + }; + + return rawData; +} + +// NOTE: We have 2 different sets of offsets because the copied, new file differs in size than the existing one. +// +// Unsure which combination of write parameters were used on the original. Things like thread count, +// compression level, etc. can effect compression ratio, BGZF block sizes, etc. even though the BAM record +// content itself is equal. So we'll just track these index values separately, for now at least. +// +static +PbiRawData Test2Bam_ExistingIndex(void) +{ + PbiRawData index = Test2Bam_CoreIndexData(); + index.BasicData().fileOffset_ = { 33816576, 33825163, 33831333, 33834264, 33836542, 33838065, 33849818, 33863499, 33874621, 1392836608 }; + return index; +} + +static +PbiRawData Test2Bam_NewIndex(void) +{ + PbiRawData index = Test2Bam_CoreIndexData(); + index.BasicData().fileOffset_ = { 33816576, 236126208, 391315456, 469106688, 537067520, 587792384, 867303424, 1182793728, 1449787392, 1582628864 }; + return index; +} + +static +void ExpectRawIndicesEqual(const PbiRawData& expected, const PbiRawData& actual) +{ + // header data + EXPECT_EQ(expected.Version(), actual.Version()); + EXPECT_EQ(expected.FileSections(), actual.FileSections()); + EXPECT_EQ(expected.NumReads(), actual.NumReads()); + + // subread data + const PbiRawBasicData& e = expected.BasicData(); + const PbiRawBasicData& a = actual.BasicData(); + EXPECT_EQ(e.rgId_, a.rgId_); + EXPECT_EQ(e.qStart_, a.qStart_); + EXPECT_EQ(e.qEnd_, a.qEnd_); + EXPECT_EQ(e.holeNumber_, a.holeNumber_); + EXPECT_EQ(e.readQual_, a.readQual_); + EXPECT_EQ(e.ctxtFlag_, a.ctxtFlag_); + EXPECT_EQ(e.fileOffset_, a.fileOffset_); + + // mapped data + EXPECT_EQ(expected.HasMappedData(), actual.HasMappedData()); + if (expected.HasMappedData() && 
actual.HasMappedData()) { + const PbiRawMappedData& e = expected.MappedData(); + const PbiRawMappedData& a = actual.MappedData(); + EXPECT_EQ(e.tId_, a.tId_); + EXPECT_EQ(e.tStart_, a.tStart_); + EXPECT_EQ(e.tEnd_, a.tEnd_); + EXPECT_EQ(e.aStart_, a.aStart_); + EXPECT_EQ(e.aEnd_, a.aEnd_); + EXPECT_EQ(e.revStrand_, a.revStrand_); + EXPECT_EQ(e.nM_, a.nM_); + EXPECT_EQ(e.nMM_, a.nMM_); + EXPECT_EQ(e.mapQV_, a.mapQV_); + } + + // reference data + EXPECT_EQ(expected.HasReferenceData(), actual.HasReferenceData()); + if (expected.HasReferenceData() && actual.HasReferenceData()) { + const PbiRawReferenceData& e = expected.ReferenceData(); + const PbiRawReferenceData& a = actual.ReferenceData(); + EXPECT_EQ(e.entries_, a.entries_); + } + + // barcode data + EXPECT_EQ(expected.HasBarcodeData(), actual.HasBarcodeData()); + if (expected.HasBarcodeData() && actual.HasBarcodeData()) { + const PbiRawBarcodeData& e = expected.BarcodeData(); + const PbiRawBarcodeData& a = actual.BarcodeData(); + EXPECT_EQ(e.bcForward_, a.bcForward_); + EXPECT_EQ(e.bcReverse_, a.bcReverse_); + EXPECT_EQ(e.bcQual_, a.bcQual_); + } +} + +static +bool BasicLookupsEqual(const BasicLookupData& lhs, + const BasicLookupData& rhs) +{ + return (lhs.rgId_ == rhs.rgId_ && + lhs.qStart_ == rhs.qStart_ && + lhs.qEnd_ == rhs.qEnd_ && + lhs.holeNumber_ == rhs.holeNumber_ && + lhs.readQual_ == rhs.readQual_ && + lhs.ctxtFlag_ == rhs.ctxtFlag_ && + lhs.fileOffset_ == rhs.fileOffset_); +} + +static +bool MappedLookupsEqual(const MappedLookupData& lhs, + const MappedLookupData& rhs) +{ + return (lhs.tId_ == rhs.tId_ && + lhs.tStart_ == rhs.tStart_ && + lhs.tEnd_ == rhs.tEnd_ && + lhs.aStart_ == rhs.aStart_ && + lhs.aEnd_ == rhs.aEnd_ && + lhs.nM_ == rhs.nM_ && + lhs.nMM_ == rhs.nMM_ && + lhs.mapQV_ == rhs.mapQV_ && + lhs.forwardStrand_ == rhs.forwardStrand_ && + lhs.reverseStrand_ == rhs.reverseStrand_); +} + +static +bool ReferenceLookupsEqual(const ReferenceLookupData& lhs, + const ReferenceLookupData& rhs) +{ + 
return lhs.references_ == rhs.references_; +} + +static +bool BarcodeLookupsEqual(const BarcodeLookupData& lhs, + const BarcodeLookupData& rhs) +{ + return (lhs.bcForward_ == rhs.bcForward_ && + lhs.bcReverse_ == rhs.bcReverse_ && + lhs.bcQual_ == rhs.bcQual_); +} + +static +bool PbiIndicesEqual(const PbiIndex& lhs, const PbiIndex& rhs) +{ + using namespace ::PacBio::BAM; + const unique_ptr& lhsImpl = lhs.d_; + const unique_ptr& rhsImpl = rhs.d_; + if (lhsImpl == rhsImpl) + return true; + if (lhsImpl == nullptr || rhsImpl == nullptr) + return false; + + // metadata compare + if (lhsImpl->version_ != rhsImpl->version_ || + lhsImpl->sections_ != rhsImpl->sections_ || + lhsImpl->numReads_ != rhsImpl->numReads_) + { return false; } + + // component compare + if ( !BasicLookupsEqual(lhsImpl->basicData_, rhsImpl->basicData_) || + !MappedLookupsEqual(lhsImpl->mappedData_, rhsImpl->mappedData_) || + !ReferenceLookupsEqual(lhsImpl->referenceData_, rhsImpl->referenceData_) || + !BarcodeLookupsEqual(lhsImpl->barcodeData_, rhsImpl->barcodeData_)) + { return false; } + + // if we get here, OK + return true; +} + +} // namespace tests +} // namespace BAM +} // namespace PacBio + +TEST(PacBioIndexTest, CreateFromExistingBam) +{ + // do this in temp directory, so we can ensure write access + const string tempDir = tests::GeneratedData_Dir + "/"; + const string tempBamFn = tempDir + "aligned_copy.bam"; + const string tempPbiFn = tempBamFn + ".pbi"; + string cmd("cp "); + cmd += test2BamFn; + cmd += " "; + cmd += tempBamFn; + int cmdResult = system(cmd.c_str()); + (void)cmdResult; + + BamFile bamFile(tempBamFn); + PbiFile::CreateFrom(bamFile); + EXPECT_EQ(tempPbiFn, bamFile.PacBioIndexFilename()); + + PbiRawData index(bamFile.PacBioIndexFilename()); + EXPECT_EQ(PbiFile::Version_3_0_1, index.Version()); + EXPECT_EQ(10, index.NumReads()); + EXPECT_TRUE(index.HasMappedData()); + + const PbiRawData& expectedIndex = tests::Test2Bam_ExistingIndex(); + 
tests::ExpectRawIndicesEqual(expectedIndex, index); + + // clean up temp file(s) + remove(tempBamFn.c_str()); + remove(tempPbiFn.c_str()); +} + +::testing::AssertionResult CanRead(BamReader& reader, BamRecord& record, int i) +{ + if (reader.GetNext(record)) + return ::testing::AssertionSuccess() << "i: " << i; + else + return ::testing::AssertionFailure() << "i: " << i; +} + +TEST(PacBioIndexTest, CreateOnTheFly) +{ + // do this in temp directory, so we can ensure write access + const string tempDir = tests::GeneratedData_Dir + "/"; + const string tempBamFn = tempDir + "temp.bam"; + const string tempPbiFn = tempBamFn + ".pbi"; + + // NOTE: new file differs in size than existing (different write parameters may yield different file sizes, even though content is same) + const vector expectedNewOffsets = { 33816576, 236126208, 391315456, 469106688, 537067520, 587792384, 867303424, 1182793728, 1449787392, 1582628864 }; + vector observedOffsets; + + // create PBI on the fly from input BAM while we write to new file + { + BamFile bamFile(test2BamFn); + BamHeader header = bamFile.Header(); + + BamWriter writer(tempBamFn, header); // default compression, default thread count + PbiBuilder builder(tempPbiFn, header.Sequences().size()); + + int64_t vOffset = 0; + EntireFileQuery entireFile(bamFile); + for (const BamRecord& record : entireFile) { + writer.Write(record, &vOffset); + builder.AddRecord(record, vOffset); + observedOffsets.push_back(vOffset); + } + } + + EXPECT_EQ(expectedNewOffsets, observedOffsets); + + // sanity check on original file + { + const vector originalFileOffsets = { 33816576, 33825163, 33831333, 33834264, 33836542, 33838065, 33849818, 33863499, 33874621, 1392836608 }; + BamRecord r; + BamReader reader(test2BamFn); + for (int i = 0; i < originalFileOffsets.size(); ++i) { + reader.VirtualSeek(originalFileOffsets.at(i)); + EXPECT_TRUE(CanRead(reader, r, i)); + } + } + + // attempt to seek in our new file using both expected & observed offsets + { + 
BamRecord r; + BamReader reader(tempBamFn); + for (int i = 0; i < expectedNewOffsets.size(); ++i) { + reader.VirtualSeek(expectedNewOffsets.at(i)); + EXPECT_TRUE(CanRead(reader, r, i)); + } + for (int i = 0; i < observedOffsets.size(); ++i) { + reader.VirtualSeek(observedOffsets.at(i)); + EXPECT_TRUE(CanRead(reader, r, i)); + } + } + + // compare data in new PBI file, to expected data + const PbiRawData& expectedIndex = tests::Test2Bam_NewIndex(); + const PbiRawData& fromBuilt = PbiRawData(tempPbiFn); + tests::ExpectRawIndicesEqual(expectedIndex, fromBuilt); + + // straight diff of newly-generated PBI file to existing PBI + // TODO: Come back to this once pbindexump is in place. + // We can't exactly do this since file offsets may differ between 2 BAMs of differing compression levels. + // Should add some sort of BAM checksum based on contents, not just size, for this reason. +// const string pbiDiffCmd = string("diff -q ") + test2BamFn + ".pbi " + tempPbiFn; +// EXPECT_EQ(0, system(pbiDiffCmd.c_str())); + + // clean up temp file(s) + remove(tempBamFn.c_str()); + remove(tempPbiFn.c_str()); +} + +TEST(PacBioIndexTest, RawLoadFromPbiFile) +{ + const BamFile bamFile(test2BamFn); + const string& pbiFilename = bamFile.PacBioIndexFilename(); + const PbiRawData loadedIndex(pbiFilename); + + const PbiRawData& expectedIndex = tests::Test2Bam_ExistingIndex(); + tests::ExpectRawIndicesEqual(expectedIndex, loadedIndex); +} + +TEST(PacBioIndexTest, BasicAndBarodeSectionsOnly) +{ + // do this in temp directory, so we can ensure write access + const string tempDir = tests::GeneratedData_Dir + "/"; + const string tempBamFn = tempDir + "phi29.bam"; + const string tempPbiFn = tempBamFn + ".pbi"; + string cmd("cp "); + cmd += phi29BamFn; + cmd += " "; + cmd += tempDir; + int cmdResult = system(cmd.c_str()); + (void)cmdResult; + + BamFile bamFile(tempBamFn); + PbiFile::CreateFrom(bamFile); + EXPECT_EQ(tempPbiFn, bamFile.PacBioIndexFilename()); + + PbiRawData 
index(bamFile.PacBioIndexFilename()); + EXPECT_EQ(PbiFile::Version_3_0_1, index.Version()); + EXPECT_EQ(120, index.NumReads()); + EXPECT_FALSE(index.HasMappedData()); + EXPECT_TRUE(index.HasBarcodeData()); + + const vector expectedBcForward = {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2}; + const vector expectedBcReverse = {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2}; + const vector expectedBcQuality = {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1}; + + const PbiRawBarcodeData& barcodeData = index.BarcodeData(); + EXPECT_EQ(expectedBcForward, barcodeData.bcForward_); + EXPECT_EQ(expectedBcReverse, barcodeData.bcReverse_); + EXPECT_EQ(expectedBcQuality, barcodeData.bcQual_); + + + // clean up temp file(s) + remove(tempBamFn.c_str()); + remove(tempPbiFn.c_str()); + +} + + +TEST(PacBioIndexTest, ReferenceDataNotLoadedOnUnsortedBam) +{ + BamFile bamFile(test2BamFn); + PbiRawData raw(bamFile.PacBioIndexFilename()); + EXPECT_TRUE(raw.HasReferenceData()); +} + +TEST(PacBioIndexTest, LookupLoadFromFileOk) +{ + BamFile bamFile(test2BamFn); + EXPECT_NO_THROW( + { + PbiIndex index(bamFile.PacBioIndexFilename()); + EXPECT_EQ(10, index.NumReads()); + EXPECT_EQ(vector({ 33816576, 33825163, 33831333, 33834264, 33836542, 33838065, 33849818, 33863499, 33874621, 1392836608 }), index.BasicData().VirtualFileOffsets()); + }); +} + +TEST(PacBioIndexTest, ThrowOnNonExistentPbiFile) +{ 
+ EXPECT_THROW(PbiRawData raw("does_not_exist.pbi"), std::exception); + EXPECT_THROW(PbiIndex idx("does_not_exist.pbi"), std::exception); +} + +TEST(PacBioIndexTest, ThrowOnNonPbiFile) +{ + // completely wrong format + const std::string fastaFn = tests::Data_Dir + "/lambdaNEB.fa"; + EXPECT_THROW(PbiRawData raw(fastaFn), std::exception); + EXPECT_THROW(PbiIndex idx(fastaFn), std::exception); + + // BGZF file, but not PBI + const std::string& bamFn = tests::Data_Dir + "/ex2.bam"; + EXPECT_THROW(PbiRawData raw(bamFn), std::exception); + EXPECT_THROW(PbiIndex idx(bamFn), std::exception); +} + +TEST(PacBioIndexTest, Copy_and_Move) +{ + const PbiIndex lookup(test2BamFn + ".pbi"); + + const PbiIndex copyConstructed(lookup); + +#ifdef __clang__ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wpessimizing-move" +#endif + const PbiIndex moveConstructed(std::move(PbiIndex(test2BamFn + ".pbi"))); +#ifdef __clang__ +#pragma clang diagnostic pop +#endif + + PbiIndex copyAssigned; + copyAssigned = lookup; + + PbiIndex moveAssigned; + +#ifdef __clang__ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wpessimizing-move" +#endif + moveAssigned = std::move(PbiIndex(test2BamFn + ".pbi")); +#ifdef __clang__ +#pragma clang diagnostic pop +#endif + + EXPECT_TRUE(tests::PbiIndicesEqual(lookup, copyConstructed)); + EXPECT_TRUE(tests::PbiIndicesEqual(lookup, moveConstructed)); + EXPECT_TRUE(tests::PbiIndicesEqual(lookup, copyAssigned)); + EXPECT_TRUE(tests::PbiIndicesEqual(lookup, moveAssigned)); +} + +TEST(PacBioIndexTest, OrderedLookup) +{ + using PacBio::BAM::IndexList; + using PacBio::BAM::OrderedLookup; + + OrderedLookup::container_type oRawData; + oRawData[11] = { 0, 3, 4 }; + oRawData[20] = { 1 }; + oRawData[42] = { 2, 7, 8 }; + oRawData[10] = { 5 }; + oRawData[12] = { 6 }; + oRawData[99] = { 9 }; + + OrderedLookup oLookup(oRawData); + + // EQUAL + EXPECT_EQ(IndexList({5}), oLookup.LookupIndices(10, Compare::EQUAL)); + EXPECT_EQ(IndexList({0, 3, 
4}), oLookup.LookupIndices(11, Compare::EQUAL)); + EXPECT_EQ(IndexList({6}), oLookup.LookupIndices(12, Compare::EQUAL)); + EXPECT_EQ(IndexList({1}), oLookup.LookupIndices(20, Compare::EQUAL)); + EXPECT_EQ(IndexList({2, 7, 8}), oLookup.LookupIndices(42, Compare::EQUAL)); + EXPECT_EQ(IndexList({9}), oLookup.LookupIndices(99, Compare::EQUAL)); + EXPECT_EQ(IndexList(), oLookup.LookupIndices(66, Compare::EQUAL)); // does not exist + + // NOT_EQUAL + EXPECT_EQ(IndexList({0, 1, 2, 3, 4, 6, 7, 8, 9}), oLookup.LookupIndices(10, Compare::NOT_EQUAL)); + EXPECT_EQ(IndexList({1, 2, 5, 6, 7, 8, 9}), oLookup.LookupIndices(11, Compare::NOT_EQUAL)); + EXPECT_EQ(IndexList({0, 1, 2, 3, 4, 5, 7, 8, 9}), oLookup.LookupIndices(12, Compare::NOT_EQUAL)); + EXPECT_EQ(IndexList({0, 2, 3, 4, 5, 6, 7, 8, 9}), oLookup.LookupIndices(20, Compare::NOT_EQUAL)); + EXPECT_EQ(IndexList({0, 1, 3, 4, 5, 6, 9}), oLookup.LookupIndices(42, Compare::NOT_EQUAL)); + EXPECT_EQ(IndexList({0, 1, 2, 3, 4, 5, 6, 7, 8}), oLookup.LookupIndices(99, Compare::NOT_EQUAL)); + EXPECT_EQ(IndexList({0, 1, 2, 3, 4, 5, 6, 7, 8, 9}), oLookup.LookupIndices(66, Compare::NOT_EQUAL)); // does not exist + + // LESS_THAN + EXPECT_EQ(IndexList({0, 3, 4, 5, 6}), oLookup.LookupIndices(13, Compare::LESS_THAN)); + EXPECT_EQ(IndexList({0, 3, 4, 5}), oLookup.LookupIndices(12, Compare::LESS_THAN)); + // do more checks + + // LESS_THAN_EQUAL + EXPECT_EQ(IndexList({0, 3, 4, 5, 6}), oLookup.LookupIndices(13, Compare::LESS_THAN_EQUAL)); + EXPECT_EQ(IndexList({0, 3, 4, 5, 6}), oLookup.LookupIndices(12, Compare::LESS_THAN_EQUAL)); + // more checks? + + // GREATER_THAN + EXPECT_EQ(IndexList({2,7,8,9}), oLookup.LookupIndices(41, Compare::GREATER_THAN)); + EXPECT_EQ(IndexList({9}), oLookup.LookupIndices(42, Compare::GREATER_THAN)); + // more checks? 
+ + // GREATER_THAN_EQUAL + EXPECT_EQ(IndexList({2,7,8,9}), oLookup.LookupIndices(41, Compare::GREATER_THAN_EQUAL)); + EXPECT_EQ(IndexList({2,7,8,9}), oLookup.LookupIndices(42, Compare::GREATER_THAN_EQUAL)); + // more checks? +} + +TEST(PacBioIndexTest, UnorderedLookup) +{ + using PacBio::BAM::IndexList; + using PacBio::BAM::UnorderedLookup; + + UnorderedLookup::container_type uRawData; + uRawData[11] = { 0, 3, 4 }; + uRawData[20] = { 1 }; + uRawData[42] = { 2, 7, 8 }; + uRawData[10] = { 5 }; + uRawData[12] = { 6 }; + uRawData[99] = { 9 }; + + UnorderedLookup uLookup(uRawData); + + // EQUAL + EXPECT_EQ(IndexList({5}), uLookup.LookupIndices(10, Compare::EQUAL)); + EXPECT_EQ(IndexList({0, 3, 4}), uLookup.LookupIndices(11, Compare::EQUAL)); + EXPECT_EQ(IndexList({6}), uLookup.LookupIndices(12, Compare::EQUAL)); + EXPECT_EQ(IndexList({1}), uLookup.LookupIndices(20, Compare::EQUAL)); + EXPECT_EQ(IndexList({2, 7, 8}), uLookup.LookupIndices(42, Compare::EQUAL)); + EXPECT_EQ(IndexList({9}), uLookup.LookupIndices(99, Compare::EQUAL)); + EXPECT_EQ(IndexList(), uLookup.LookupIndices(66, Compare::EQUAL)); // does not exist + + // NOT_EQUAL + EXPECT_EQ(IndexList({0, 1, 2, 3, 4, 6, 7, 8, 9}), uLookup.LookupIndices(10, Compare::NOT_EQUAL)); + EXPECT_EQ(IndexList({1, 2, 5, 6, 7, 8, 9}), uLookup.LookupIndices(11, Compare::NOT_EQUAL)); + EXPECT_EQ(IndexList({0, 1, 2, 3, 4, 5, 7, 8, 9}), uLookup.LookupIndices(12, Compare::NOT_EQUAL)); + EXPECT_EQ(IndexList({0, 2, 3, 4, 5, 6, 7, 8, 9}), uLookup.LookupIndices(20, Compare::NOT_EQUAL)); + EXPECT_EQ(IndexList({0, 1, 3, 4, 5, 6, 9}), uLookup.LookupIndices(42, Compare::NOT_EQUAL)); + EXPECT_EQ(IndexList({0, 1, 2, 3, 4, 5, 6, 7, 8}), uLookup.LookupIndices(99, Compare::NOT_EQUAL)); + EXPECT_EQ(IndexList({0, 1, 2, 3, 4, 5, 6, 7, 8, 9}), uLookup.LookupIndices(66, Compare::NOT_EQUAL)); // does not exist + + // LESS_THAN + EXPECT_EQ(IndexList({0, 3, 4, 5, 6}), uLookup.LookupIndices(13, Compare::LESS_THAN)); + EXPECT_EQ(IndexList({0, 3, 4, 5}), 
uLookup.LookupIndices(12, Compare::LESS_THAN)); + // more checks? + + // LESS_THAN_EQUAL + EXPECT_EQ(IndexList({0, 3, 4, 5, 6}), uLookup.LookupIndices(13, Compare::LESS_THAN_EQUAL)); + EXPECT_EQ(IndexList({0, 3, 4, 5, 6}), uLookup.LookupIndices(12, Compare::LESS_THAN_EQUAL)); + // more checks? + + // GREATER_THAN + EXPECT_EQ(IndexList({2,7,8,9}), uLookup.LookupIndices(41, Compare::GREATER_THAN)); + EXPECT_EQ(IndexList({9}), uLookup.LookupIndices(42, Compare::GREATER_THAN)); + // more checks? + + // GREATER_THAN_EQUAL + EXPECT_EQ(uLookup.LookupIndices(41, Compare::GREATER_THAN_EQUAL), IndexList({2,7,8,9})); + EXPECT_EQ(uLookup.LookupIndices(42, Compare::GREATER_THAN_EQUAL), IndexList({2,7,8,9})); + // more checks? +} + +TEST(PacBioIndexTest, MergeBlocks) +{ + using PacBio::BAM::IndexList; + using PacBio::BAM::IndexResultBlock; + using PacBio::BAM::IndexResultBlocks; + using PacBio::BAM::mergedIndexBlocks; + using PacBio::BAM::OrderedLookup; + + OrderedLookup::container_type oRawData; + oRawData[11] = { 0, 3, 4 }; + oRawData[20] = { 1 }; + oRawData[42] = { 2, 7, 8 }; + oRawData[10] = { 5 }; + oRawData[12] = { 6 }; + oRawData[99] = { 9 }; + + OrderedLookup oLookup(oRawData); + + // EQUAL + auto mergedBlocks = mergedIndexBlocks(oLookup.LookupIndices(10, Compare::EQUAL)); + EXPECT_EQ(1, mergedBlocks.size()); + EXPECT_EQ(IndexResultBlock(5, 1), mergedBlocks.at(0)); + + mergedBlocks = mergedIndexBlocks(oLookup.LookupIndices(11, Compare::EQUAL)); + EXPECT_EQ(2, mergedBlocks.size()); + EXPECT_EQ(IndexResultBlock(0, 1), mergedBlocks.at(0)); + EXPECT_EQ(IndexResultBlock(3, 2), mergedBlocks.at(1)); + + mergedBlocks = mergedIndexBlocks(oLookup.LookupIndices(12, Compare::EQUAL)); + EXPECT_EQ(1, mergedBlocks.size()); + EXPECT_EQ(IndexResultBlock(6, 1), mergedBlocks.at(0)); + + mergedBlocks = mergedIndexBlocks(oLookup.LookupIndices(20, Compare::EQUAL)); + EXPECT_EQ(1, mergedBlocks.size()); + EXPECT_EQ(IndexResultBlock(1, 1), mergedBlocks.at(0)); + + mergedBlocks = 
mergedIndexBlocks(oLookup.LookupIndices(42, Compare::EQUAL)); + EXPECT_EQ(2, mergedBlocks.size()); + EXPECT_EQ(IndexResultBlock(2, 1), mergedBlocks.at(0)); + EXPECT_EQ(IndexResultBlock(7, 2), mergedBlocks.at(1)); + + mergedBlocks = mergedIndexBlocks(oLookup.LookupIndices(99, Compare::EQUAL)); + EXPECT_EQ(1, mergedBlocks.size()); + EXPECT_EQ(IndexResultBlock(9, 1), mergedBlocks.at(0)); + + mergedBlocks = mergedIndexBlocks(oLookup.LookupIndices(66, Compare::EQUAL)); + EXPECT_TRUE(mergedBlocks.empty()); + + // NOT_EQUAL + mergedBlocks = mergedIndexBlocks(oLookup.LookupIndices(10, Compare::NOT_EQUAL)); + EXPECT_EQ(2, mergedBlocks.size()); + EXPECT_EQ(IndexResultBlock(0, 5), mergedBlocks.at(0)); + EXPECT_EQ(IndexResultBlock(6, 4), mergedBlocks.at(1)); + + mergedBlocks = mergedIndexBlocks(oLookup.LookupIndices(11, Compare::NOT_EQUAL)); + EXPECT_EQ(2, mergedBlocks.size()); + EXPECT_EQ(IndexResultBlock(1, 2), mergedBlocks.at(0)); + EXPECT_EQ(IndexResultBlock(5, 5), mergedBlocks.at(1)); + + mergedBlocks = mergedIndexBlocks(oLookup.LookupIndices(12, Compare::NOT_EQUAL)); + EXPECT_EQ(2, mergedBlocks.size()); + EXPECT_EQ(IndexResultBlock(0, 6), mergedBlocks.at(0)); + EXPECT_EQ(IndexResultBlock(7, 3), mergedBlocks.at(1)); + + mergedBlocks = mergedIndexBlocks(oLookup.LookupIndices(20, Compare::NOT_EQUAL)); + EXPECT_EQ(2, mergedBlocks.size()); + EXPECT_EQ(IndexResultBlock(0, 1), mergedBlocks.at(0)); + EXPECT_EQ(IndexResultBlock(2, 8), mergedBlocks.at(1)); + + mergedBlocks = mergedIndexBlocks(oLookup.LookupIndices(42, Compare::NOT_EQUAL)); + EXPECT_EQ(3, mergedBlocks.size()); + EXPECT_EQ(IndexResultBlock(0, 2), mergedBlocks.at(0)); + EXPECT_EQ(IndexResultBlock(3, 4), mergedBlocks.at(1)); + EXPECT_EQ(IndexResultBlock(9, 1), mergedBlocks.at(2)); + + mergedBlocks = mergedIndexBlocks(oLookup.LookupIndices(99, Compare::NOT_EQUAL)); + EXPECT_EQ(1, mergedBlocks.size()); + EXPECT_EQ(IndexResultBlock(0, 9), mergedBlocks.at(0)); + + mergedBlocks = mergedIndexBlocks(oLookup.LookupIndices(66, 
Compare::NOT_EQUAL)); + EXPECT_EQ(1, mergedBlocks.size()); + EXPECT_EQ(IndexResultBlock(0, 10), mergedBlocks.at(0)); +} + +TEST(PacBioIndexTest, ApplyOffsetsToBlocks) +{ + using PacBio::BAM::BasicLookupData; + using PacBio::BAM::IndexList; + using PacBio::BAM::IndexResultBlock; + using PacBio::BAM::IndexResultBlocks; + using PacBio::BAM::mergedIndexBlocks; + using PacBio::BAM::OrderedLookup; + + OrderedLookup::container_type oRawData; + oRawData[11] = { 0, 3, 4 }; + oRawData[20] = { 1 }; + oRawData[42] = { 2, 7, 8 }; + oRawData[10] = { 5 }; + oRawData[12] = { 6 }; + oRawData[99] = { 9 }; + + OrderedLookup oLookup(oRawData); + auto mergedBlocks = mergedIndexBlocks(oLookup.LookupIndices(10, Compare::NOT_EQUAL)); + + EXPECT_EQ(2, mergedBlocks.size()); + EXPECT_EQ(IndexResultBlock(0, 5), mergedBlocks.at(0)); + EXPECT_EQ(IndexResultBlock(6, 4), mergedBlocks.at(1)); + + BasicLookupData basicLookupData; + basicLookupData.fileOffset_ = { 0, 10, 20, 30, 40, 50, 60, 70, 80, 90 }; + basicLookupData.ApplyOffsets(mergedBlocks); + + EXPECT_EQ(2, mergedBlocks.size()); + EXPECT_EQ(0, mergedBlocks.at(0).virtualOffset_); + EXPECT_EQ(5, mergedBlocks.at(0).numReads_); + EXPECT_EQ(60, mergedBlocks.at(1).virtualOffset_); + EXPECT_EQ(4, mergedBlocks.at(1).numReads_); +} + +TEST(PacBioIndexTest, LookupMulti) +{ + using PacBio::BAM::BasicLookupData; + using PacBio::BAM::IndexList; + using PacBio::BAM::IndexResultBlock; + using PacBio::BAM::IndexResultBlocks; + using PacBio::BAM::mergedIndexBlocks; + using PacBio::BAM::UnorderedLookup; + + UnorderedLookup::container_type uRawData; + uRawData[11] = { 0, 3, 4 }; + uRawData[20] = { 1 }; + uRawData[42] = { 2, 7, 8 }; + uRawData[10] = { 5 }; + uRawData[12] = { 6 }; + uRawData[99] = { 9 }; + + BasicLookupData basicLookup; + basicLookup.rgId_ = UnorderedLookup(uRawData); + basicLookup.fileOffset_ = { 0, 10, 20, 30, 40, 50, 60, 70, 80, 90 }; + + const std::vector whitelist = { 11, 42, 20 }; + const auto indices = 
basicLookup.IndicesMulti(BasicLookupData::RG_ID, whitelist); + + IndexResultBlocks mergedBlocks = mergedIndexBlocks(indices); + basicLookup.ApplyOffsets(mergedBlocks); + + EXPECT_EQ(IndexList({0, 3, 4, 2, 7, 8, 1}), indices); + EXPECT_EQ(2, mergedBlocks.size()); + + const IndexResultBlock& block0 = mergedBlocks.at(0); + EXPECT_EQ(0, block0.firstIndex_); + EXPECT_EQ(5, block0.numReads_); + EXPECT_EQ(0, block0.virtualOffset_); + + const IndexResultBlock& block1 = mergedBlocks.at(1); + EXPECT_EQ(7, block1.firstIndex_); + EXPECT_EQ(2, block1.numReads_); + EXPECT_EQ(70, block1.virtualOffset_); +} + +TEST(PacBioIndexTest, LookupAPI) +{ + const PbiIndex index(test2BamFn + ".pbi"); + const BasicLookupData& basicData = index.BasicData(); + const MappedLookupData& mappedData = index.MappedData(); + const BarcodeLookupData& barcodeData = index.BarcodeData(); + + // rgId == x + IndexResultBlocks rgResult = mergedIndexBlocks(basicData.Indices(BasicLookupData::RG_ID, -1197849594)); + basicData.ApplyOffsets(rgResult); + EXPECT_EQ(1, rgResult.size()); + EXPECT_EQ(0, rgResult.at(0).firstIndex_); + EXPECT_EQ(10, rgResult.at(0).numReads_); + EXPECT_EQ(33816576, rgResult.at(0).virtualOffset_); + + // rg != x + IndexResultBlocks notRgResult = mergedIndexBlocks(basicData.Indices(BasicLookupData::RG_ID, + -1197849594, + Compare::NOT_EQUAL)); + basicData.ApplyOffsets(notRgResult); + EXPECT_TRUE(notRgResult.empty()); + + // tEnd <= x + IndexResultBlocks tEndLteResult = mergedIndexBlocks(mappedData.Indices(MappedLookupData::T_END, + 4500, + Compare::LESS_THAN_EQUAL)); + basicData.ApplyOffsets(tEndLteResult); + EXPECT_EQ(1, tEndLteResult.size()); + EXPECT_EQ(0, tEndLteResult.at(0).firstIndex_); + EXPECT_EQ(5, tEndLteResult.at(0).numReads_); + EXPECT_EQ(33816576, tEndLteResult.at(0).virtualOffset_); + + // tEnd >= x + IndexResultBlocks tEndGteResult = mergedIndexBlocks(mappedData.Indices(MappedLookupData::T_START, + 4500, + Compare::GREATER_THAN_EQUAL)); + 
basicData.ApplyOffsets(tEndGteResult); + EXPECT_EQ(1, tEndGteResult.size()); + EXPECT_EQ(6, tEndGteResult.at(0).firstIndex_); + EXPECT_EQ(4, tEndGteResult.at(0).numReads_); + EXPECT_EQ(33849818, tEndGteResult.at(0).virtualOffset_); + + // strand query + IndexResultBlocks forward = mergedIndexBlocks(mappedData.Indices(MappedLookupData::STRAND, + Strand::FORWARD)); + basicData.ApplyOffsets(forward); + EXPECT_EQ(5, forward.size()); + EXPECT_EQ(0, forward.at(0).firstIndex_); + EXPECT_EQ(1, forward.at(0).numReads_); + EXPECT_EQ(33816576, forward.at(0).virtualOffset_); + + EXPECT_EQ(2, forward.at(1).firstIndex_); + EXPECT_EQ(1, forward.at(1).numReads_); + EXPECT_EQ(33831333, forward.at(1).virtualOffset_); + + EXPECT_EQ(4, forward.at(2).firstIndex_); + EXPECT_EQ(1, forward.at(2).numReads_); + EXPECT_EQ(33836542, forward.at(2).virtualOffset_); + + EXPECT_EQ(7, forward.at(3).firstIndex_); + EXPECT_EQ(1, forward.at(3).numReads_); + EXPECT_EQ(33863499, forward.at(3).virtualOffset_); + + EXPECT_EQ(9, forward.at(4).firstIndex_); + EXPECT_EQ(1, forward.at(4).numReads_); + EXPECT_EQ(1392836608, forward.at(4).virtualOffset_); + + // 0,1,0,1,0,1,1,0,1,0 + IndexResultBlocks reverse = mergedIndexBlocks(mappedData.Indices(MappedLookupData::STRAND, + Strand::REVERSE)); + basicData.ApplyOffsets(reverse); + EXPECT_EQ(4, reverse.size()); + EXPECT_EQ(1, reverse.at(0).firstIndex_); + EXPECT_EQ(1, reverse.at(0).numReads_); + EXPECT_EQ(33825163, reverse.at(0).virtualOffset_); + + EXPECT_EQ(3, reverse.at(1).firstIndex_); + EXPECT_EQ(1, reverse.at(1).numReads_); + EXPECT_EQ(33834264, reverse.at(1).virtualOffset_); + + EXPECT_EQ(5, reverse.at(2).firstIndex_); + EXPECT_EQ(2, reverse.at(2).numReads_); + EXPECT_EQ(33838065, reverse.at(2).virtualOffset_); + + EXPECT_EQ(8, reverse.at(3).firstIndex_); + EXPECT_EQ(1, reverse.at(3).numReads_); + EXPECT_EQ(33874621, reverse.at(3).virtualOffset_); + + // query data field that is not in the PBI + IndexResultBlocks missing = 
mergedIndexBlocks(barcodeData.Indices(BarcodeLookupData::BC_QUALITY, + 77, + Compare::GREATER_THAN)); + basicData.ApplyOffsets(missing); + EXPECT_TRUE(missing.empty()); +} + +TEST(PacBioIndexTest, LookupByZmw) +{ + BamFile f(tests::Data_Dir + "/dataset/bam_mapping.bam"); + f.EnsurePacBioIndexExists(); + + const PbiIndex index(f.PacBioIndexFilename()); + const BasicLookupData& basicData = index.BasicData(); + + IndexResultBlocks blocks = mergedIndexBlocks(basicData.Indices(BasicLookupData::ZMW, + 20000, + Compare::LESS_THAN)); + basicData.ApplyOffsets(blocks); + EXPECT_EQ(14, blocks.size()); + + // + // we'll take a look at first 5 contiguous blocks of reads with ZMW < 20000 + // + // skipped: { 49050, 32328, 32328 } + // block0: { 6469, 6469 } + // skipped: { 30983 } + // block1: { 13473, 13473, 19915 } + // skipped: { 30983 } + // block2: { 19915, 7247, 7247 } + // skipped: { 38025 } + // block3: { 13473 } + // skipped: { 36363, 36363, 31174, 31174, 38025, 50257, 50257 } + // block4: { 14743, 14743 } + // + + const IndexResultBlock& block0 = blocks.at(0); + EXPECT_EQ(3, block0.firstIndex_); + EXPECT_EQ(2, block0.numReads_); + EXPECT_EQ(32654529, block0.virtualOffset_); + + const IndexResultBlock& block1 = blocks.at(1); + EXPECT_EQ(6, block1.firstIndex_); + EXPECT_EQ(3, block1.numReads_); + EXPECT_EQ(32669996, block1.virtualOffset_); + + const IndexResultBlock& block2 = blocks.at(2); + EXPECT_EQ(10, block2.firstIndex_); + EXPECT_EQ(3, block2.numReads_); + EXPECT_EQ(1388841957, block2.virtualOffset_); + + const IndexResultBlock& block3 = blocks.at(3); + EXPECT_EQ(14, block3.firstIndex_); + EXPECT_EQ(1, block3.numReads_); + EXPECT_EQ(1388864866, block3.virtualOffset_); + + const IndexResultBlock& block4 = blocks.at(4); + EXPECT_EQ(22, block4.firstIndex_); + EXPECT_EQ(2, block4.numReads_); + EXPECT_EQ(1388892121, block4.virtualOffset_); +} + +TEST(PacBioIndexTest, LookupMultiZmw) +{ + BamFile f(tests::Data_Dir + "/dataset/bam_mapping.bam"); + 
f.EnsurePacBioIndexExists(); + + const PbiIndex index(f.PacBioIndexFilename()); + const BasicLookupData& basicData = index.BasicData(); + + const std::vector whitelist = { 13473, 38025 }; + IndexResultBlocks blocks = mergedIndexBlocks(basicData.IndicesMulti(BasicLookupData::ZMW, whitelist)); + basicData.ApplyOffsets(blocks); + + EXPECT_EQ(3, blocks.size()); + + const IndexResultBlock& block0 = blocks.at(0); + EXPECT_EQ(6, block0.firstIndex_); + EXPECT_EQ(2, block0.numReads_); + EXPECT_EQ(32669996, block0.virtualOffset_); + + const IndexResultBlock& block1 = blocks.at(1); + EXPECT_EQ(13, block1.firstIndex_); + EXPECT_EQ(2, block1.numReads_); + EXPECT_EQ(1388851626, block1.virtualOffset_); + + const IndexResultBlock& block2 = blocks.at(2); + EXPECT_EQ(19, block2.firstIndex_); + EXPECT_EQ(1, block2.numReads_); + EXPECT_EQ(1388881468, block2.virtualOffset_); +} + +TEST(PacBioIndexTest, AggregatePBI) +{ + + DataSet ds; + ExternalResources& resources = ds.ExternalResources(); + resources.Add(BamFile{tests::Data_Dir + "/aligned.bam"}); // 4 reads, BASIC | MAPPED | REFERENCE + resources.Add(BamFile{tests::Data_Dir + "/polymerase/production.subreads.bam"}); // 8 reads, BASIC | BARCODE + resources.Add(BamFile{tests::Data_Dir + "/polymerase/production_hq.hqregion.bam"}); // 1 read, BASIC only + + const PbiRawData index{ds}; + const PbiRawBasicData& mergedBasicData = index.BasicData(); + const PbiRawBarcodeData& mergedBarcodeData = index.BarcodeData(); + const PbiRawMappedData& mergedMappedData = index.MappedData(); + + const uint32_t expectedTotal = 13; // 4 + 8 + 1 + + // 'meta' info + EXPECT_EQ(expectedTotal, index.NumReads()); + EXPECT_EQ(PbiFile::BASIC | PbiFile::MAPPED | PbiFile::BARCODE, index.FileSections()); + EXPECT_TRUE(index.HasBarcodeData()); + EXPECT_TRUE(index.HasMappedData()); + EXPECT_FALSE(index.HasReferenceData()); + + // file numbers + EXPECT_EQ(0, mergedBasicData.fileNumber_.at(0)); + EXPECT_EQ(0, mergedBasicData.fileNumber_.at(1)); + EXPECT_EQ(0, 
mergedBasicData.fileNumber_.at(2)); + EXPECT_EQ(0, mergedBasicData.fileNumber_.at(3)); + EXPECT_EQ(1, mergedBasicData.fileNumber_.at(4)); + EXPECT_EQ(1, mergedBasicData.fileNumber_.at(5)); + EXPECT_EQ(1, mergedBasicData.fileNumber_.at(6)); + EXPECT_EQ(1, mergedBasicData.fileNumber_.at(7)); + EXPECT_EQ(1, mergedBasicData.fileNumber_.at(8)); + EXPECT_EQ(1, mergedBasicData.fileNumber_.at(9)); + EXPECT_EQ(1, mergedBasicData.fileNumber_.at(10)); + EXPECT_EQ(1, mergedBasicData.fileNumber_.at(11)); + EXPECT_EQ(2, mergedBasicData.fileNumber_.at(12)); + + // basic data + EXPECT_EQ(0, mergedBasicData.qStart_.at(0)); // file 1 + EXPECT_EQ(0, mergedBasicData.qStart_.at(1)); + EXPECT_EQ(2659, mergedBasicData.qStart_.at(4)); // file 2 + EXPECT_EQ(3116, mergedBasicData.qStart_.at(5)); + EXPECT_EQ(2659, mergedBasicData.qStart_.at(12)); // file 3 + + EXPECT_EQ(21102592, mergedBasicData.fileOffset_.at(0)); // file 1 + EXPECT_EQ(21102883, mergedBasicData.fileOffset_.at(1)); + EXPECT_EQ(19857408, mergedBasicData.fileOffset_.at(4)); // file 2 + EXPECT_EQ(19860696, mergedBasicData.fileOffset_.at(5)); + EXPECT_EQ(20054016, mergedBasicData.fileOffset_.at(12)); // file 3 + + // mapped data + EXPECT_EQ(60, mergedMappedData.mapQV_.at(0)); // file 1 + EXPECT_EQ(60, mergedMappedData.mapQV_.at(1)); + EXPECT_EQ(255, mergedMappedData.mapQV_.at(4)); // file 2 + EXPECT_EQ(255, mergedMappedData.mapQV_.at(5)); + EXPECT_EQ(255, mergedMappedData.mapQV_.at(12)); // file 3 + + // barcode data + EXPECT_EQ(-1, mergedBarcodeData.bcForward_.at(0)); // file 1 + EXPECT_EQ(-1, mergedBarcodeData.bcForward_.at(1)); + EXPECT_EQ(92, mergedBarcodeData.bcForward_.at(4)); // file 2 + EXPECT_EQ(92, mergedBarcodeData.bcForward_.at(5)); + EXPECT_EQ(-1, mergedBarcodeData.bcForward_.at(12)); // file 3 +} diff --git a/tests/src/test_PbiFilter.cpp b/tests/src/test_PbiFilter.cpp new file mode 100644 index 0000000..449c83d --- /dev/null +++ b/tests/src/test_PbiFilter.cpp @@ -0,0 +1,1359 @@ +// Copyright (c) 2014-2015, Pacific 
Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. 
+ +// Author: Derek Barnett + +#ifdef PBBAM_TESTING +#define private public +#endif + +#include "TestData.h" +#include +#include +#include +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace std; + +namespace PacBio { +namespace BAM { +namespace tests { + +// helper structs & methods + +static +PbiRawData test2Bam_RawIndex(void) +{ + PbiRawData index; + index.NumReads(4); + + PbiRawBasicData& subreadData = index.BasicData(); + subreadData.rgId_ = { -1197849594, -1197849594, -1197849594, -1197849594 }; + subreadData.qStart_ = { 2114, 2579, 4101, 5615 }; + subreadData.qEnd_ = { 2531, 4055, 5571, 6237 }; + subreadData.holeNumber_ = { 14743, 14743, 14743, 14743 }; + subreadData.readQual_ = { 0.901, 0.601, 0.901, 0.601 }; + subreadData.ctxtFlag_ = { 0, 1, 2, 3 }; + subreadData.fileOffset_ = { 35651584, 35655125, 35667128, 35679170 }; + + PbiRawMappedData& mappedData = index.mappedData_; + mappedData.tId_ = { 0, 0, 0, 0 }; + mappedData.tStart_ = { 9507, 8453, 8455, 9291 }; + mappedData.tEnd_ = { 9903, 9902, 9893, 9900 }; + mappedData.aStart_ = { 2130, 2581, 4102, 5619 }; + mappedData.aEnd_ = { 2531, 4055, 5560, 6237 }; + mappedData.revStrand_ = { 0, 1, 0, 1 }; + mappedData.mapQV_ = { 254, 254, 254, 254 }; + mappedData.nM_ = { 384, 1411, 1393, 598 }; + mappedData.nMM_ = { 0, 0, 0, 0 }; + + PbiRawBarcodeData& barcodeData = index.barcodeData_; + barcodeData.bcForward_ = { 0, 17, 256, 17 }; + barcodeData.bcReverse_ = { 1, 18, 257, 18 }; + barcodeData.bcQual_ = { 42, 80, 42, 110 }; + + PbiRawReferenceData& referenceData = index.referenceData_; + referenceData.entries_.emplace_back( 0, 0, 3 ); + referenceData.entries_.emplace_back( 1 ); + referenceData.entries_.emplace_back( PbiReferenceEntry::UNMAPPED_ID ); + + return index; +} + +static const PbiRawData shared_index = test2Bam_RawIndex(); + +static +void checkFilterRows(const PbiFilter& filter, const std::vector expectedRows) +{ + for (size_t row : expectedRows) + 
EXPECT_TRUE(filter.Accepts(shared_index, row)); +} + +static +void checkFilterInternals(const PbiFilter& filter, + const PbiFilter::CompositionType expectedType, + const size_t expectedNumChildren, + const std::vector expectedRows) +{ + EXPECT_EQ(expectedType, filter.d_->type_); + EXPECT_EQ(expectedNumChildren, filter.d_->filters_.size()); + checkFilterRows(filter, expectedRows); +} + +struct SimpleFilter +{ + bool Accepts(const PbiRawData& idx, const size_t row) const + { (void)idx; (void)row; return true; } +}; + +struct NoncompliantFilter { }; + +struct SortUniqueTestFilter +{ + bool Accepts(const PbiRawData& idx, const size_t row) const + { + (void)idx; + switch(row) { + case 0: // fall through + case 1: // . + case 2: // . + case 3: // . + case 4: // . + case 7: // . + case 8: return true; + default: + return false; + } + } +}; + +struct SortUniqueTestFilter2 +{ + bool Accepts(const PbiRawData& idx, const size_t row) const + { + (void)idx; + switch(row) { + case 3: // fall through + case 7: // . 
+ case 5: return true; + default: + return false; + } + } +}; + +static inline +PbiFilter emptyFilter(void) +{ return PbiFilter{ }; } + +static inline +PbiFilter simpleFilter(void) +{ return PbiFilter{ SimpleFilter{ } }; } + +} // namespace tests +} // namespace BAM +} // namespace PacBio + +TEST(PbiFilterTest, DefaultCtorOk) +{ + auto filter = PbiFilter{ }; + tests::checkFilterInternals(filter, PbiFilter::INTERSECT, 0, std::vector{0,1,2,3}); +} + +TEST(PbiFilterTest, CompositionOk) +{ + auto filter = PbiFilter{ }; + filter.Add(PbiFilter{ }); + tests::checkFilterInternals(filter, PbiFilter::INTERSECT, 1, std::vector{0,1,2,3}); +} + +TEST(PbiFilterTest, CustomFilterOk) +{ + { // ctor + auto filter = PbiFilter{ tests::SimpleFilter{ } }; + tests::checkFilterInternals(filter, PbiFilter::INTERSECT, 1, std::vector{}); + } + { // Add + auto filter = PbiFilter{ }; + filter.Add(tests::SimpleFilter{ }); + tests::checkFilterInternals(filter, PbiFilter::INTERSECT, 1, std::vector{}); + } + +// PbiFilter shouldNotCompile = PbiFilter{ tests::NoncompliantFilter{ } }; // <-- when uncommented, should not compile +// PbiFilter shouldNotCompileEither; shouldNotCompileEither.Add(tests::NoncompliantFilter{ }); // <-- when uncommented, should not compile +} + +TEST(PbiFilterTest, CopyOk) +{ + { // empty + const auto original = PbiFilter{ }; + + PbiFilter copyCtor(original); + PbiFilter copyAssign; + copyAssign = original; + + tests::checkFilterInternals(original, PbiFilter::INTERSECT, 0, std::vector{0,1,2,3}); + tests::checkFilterInternals(copyCtor, PbiFilter::INTERSECT, 0, std::vector{0,1,2,3}); + tests::checkFilterInternals(copyAssign, PbiFilter::INTERSECT, 0, std::vector{0,1,2,3}); + } + { // with children + const auto original = PbiFilter{ tests::SimpleFilter{ } }; + + PbiFilter copyCtor(original); + PbiFilter copyAssign; + copyAssign = original; + + tests::checkFilterInternals(original, PbiFilter::INTERSECT, 1, std::vector{}); + tests::checkFilterInternals(copyCtor, 
PbiFilter::INTERSECT, 1, std::vector{}); + tests::checkFilterInternals(copyAssign, PbiFilter::INTERSECT, 1, std::vector{}); + } +} + +TEST(PbiFilterTest, MoveOk) +{ + { // empty + const auto original = tests::emptyFilter(); + + PbiFilter moveCtor(tests::emptyFilter()); + PbiFilter moveAssign; + moveAssign = tests::emptyFilter(); + + tests::checkFilterInternals(original, PbiFilter::INTERSECT, 0, std::vector{0,1,2,3}); + tests::checkFilterInternals(moveCtor, PbiFilter::INTERSECT, 0, std::vector{0,1,2,3}); + tests::checkFilterInternals(moveAssign, PbiFilter::INTERSECT, 0, std::vector{0,1,2,3}); + } + { // with children + const auto original = tests::simpleFilter(); + + PbiFilter moveCtor(tests::simpleFilter()); + PbiFilter moveAssign; + moveAssign = tests::simpleFilter(); + + tests::checkFilterInternals(original, PbiFilter::INTERSECT, 1, std::vector{0,1,2,3}); + tests::checkFilterInternals(moveCtor, PbiFilter::INTERSECT, 1, std::vector{0,1,2,3}); + tests::checkFilterInternals(moveAssign, PbiFilter::INTERSECT, 1, std::vector{0,1,2,3}); + } +} + +TEST(PbiFilterTest, SortsAndUniquesChildFilterResultsOk) +{ + const auto childFilter = tests::SortUniqueTestFilter{ }; + const auto filter = PbiFilter{ childFilter }; + tests::checkFilterRows(childFilter, std::vector{2, 7, 0, 3, 4, 1, 8}); + tests::checkFilterRows(filter, std::vector{0, 1, 2, 3, 4, 7, 8}); +} + +TEST(PbiFilterTest, UnionOk) +{ + { // empty + { // copy + const auto emptyFilter = tests::emptyFilter(); + const auto emptyFilter2 = tests::emptyFilter(); + const auto u = PbiFilter::Union({ emptyFilter, emptyFilter2 }); + tests::checkFilterInternals(u, PbiFilter::UNION, 2, std::vector{0,1,2,3}); + } + { // move + const auto u = PbiFilter::Union({ PbiFilter{ }, PbiFilter{ } }); + tests::checkFilterInternals(u, PbiFilter::UNION, 2, std::vector{0,1,2,3}); + } + } + + { // with (no-data) children - just checking composition + { // copy + const auto simpleFilter = tests::SimpleFilter{ }; + const auto simpleFilter2 = 
tests::SimpleFilter{ }; + const auto u = PbiFilter::Union({ simpleFilter, simpleFilter2 }); + tests::checkFilterInternals(u, PbiFilter::UNION, 2, std::vector{}); + } + { // move + const auto u = PbiFilter::Union({ tests::SimpleFilter{ }, tests::SimpleFilter{ } }); + tests::checkFilterInternals(u, PbiFilter::UNION, 2, std::vector{}); + } + } + + { // 2-child union, results sorted & unique-d by PbiFilter + + const auto child1 = tests::SortUniqueTestFilter{ }; + const auto child2 = tests::SortUniqueTestFilter2{ }; + const auto u = PbiFilter::Union({ child1, child2 }); + + tests::checkFilterRows(child1, std::vector{2, 7, 0, 3, 4, 1, 8}); + tests::checkFilterRows(child2, std::vector{3, 7, 5}); + tests::checkFilterRows(u, std::vector{0, 1, 2, 3, 4, 5, 7, 8}); + } +} + +TEST(PbiFilterTest, IntersectOk) +{ + { // empty + { // copy + const auto emptyFilter = tests::emptyFilter(); + const auto emptyFilter2 = tests::emptyFilter(); + const auto i = PbiFilter::Intersection({ emptyFilter, emptyFilter2 }); + tests::checkFilterInternals(i, PbiFilter::INTERSECT, 2, std::vector{0,1,2,3}); + } + { // move + const auto i = PbiFilter::Intersection({ PbiFilter{ }, PbiFilter{ } }); + tests::checkFilterInternals(i, PbiFilter::INTERSECT, 2, std::vector{0,1,2,3}); + } + } + + { // with (no-data) children - just checking composition + { // copy + const auto simpleFilter = tests::SimpleFilter{ }; + const auto simpleFilter2 = tests::SimpleFilter{ }; + const auto i = PbiFilter::Intersection({ simpleFilter, simpleFilter2 }); + tests::checkFilterInternals(i, PbiFilter::INTERSECT, 2, std::vector{}); + } + { // move + const auto i = PbiFilter::Intersection({ tests::SimpleFilter{ }, tests::SimpleFilter{ } }); + tests::checkFilterInternals(i, PbiFilter::INTERSECT, 2, std::vector{}); + } + } + + { // 2-child intersect, sorted & unique-d by PbiFilter + + const auto child1 = tests::SortUniqueTestFilter{ }; + const auto child2 = tests::SortUniqueTestFilter2{ }; + const auto i = PbiFilter::Intersection({ 
child1, child2 }); + + tests::checkFilterRows(child1, std::vector{2, 7, 0, 3, 4, 1, 8}); + tests::checkFilterRows(child2, std::vector{3, 7, 5 }); + tests::checkFilterRows(i, std::vector{3, 7}); + } +} + +TEST(PbiFilterTest, AlignedEndFilterOk) +{ + { + const auto filter = PbiFilter{ PbiAlignedEndFilter{ 4055 } }; + tests::checkFilterRows(filter, std::vector{1}); + } + { + const auto filter = PbiFilter{ PbiAlignedEndFilter{ 4055, Compare::NOT_EQUAL } }; + tests::checkFilterRows(filter, std::vector{0,2,3}); + } + { + const auto filter = PbiFilter{ PbiAlignedEndFilter{ 4000, Compare::LESS_THAN } }; + tests::checkFilterRows(filter, std::vector{0}); + } + { + const auto filter = PbiFilter{ PbiAlignedEndFilter{ 5560, Compare::GREATER_THAN } }; + tests::checkFilterRows(filter, std::vector{3}); + } + { + const auto filter = PbiFilter{ PbiAlignedEndFilter{ 5560, Compare::GREATER_THAN_EQUAL } }; + tests::checkFilterRows(filter, std::vector{2,3}); + } + + { + const auto filter = PbiFilter{ PbiAlignedEndFilter{ 7000, Compare::GREATER_THAN } }; + tests::checkFilterRows(filter, std::vector{}); + } +} + +TEST(PbiFilterTest, AlignedLengthFilterOk) +{ + { + const auto filter = PbiFilter{ PbiAlignedLengthFilter{ 500, Compare::GREATER_THAN_EQUAL } }; + tests::checkFilterRows(filter, std::vector{1,2,3}); + } + { + const auto filter = PbiFilter{ PbiAlignedLengthFilter{ 1000, Compare::GREATER_THAN_EQUAL } }; + tests::checkFilterRows(filter, std::vector{1,2}); + } +} + +TEST(PbiFilterTest, AlignedStartFilterOk) +{ + { + const auto filter = PbiFilter{ PbiAlignedStartFilter{ 2600, Compare::LESS_THAN } }; + tests::checkFilterRows(filter, std::vector{0,1}); + } + { + const auto filter = PbiFilter{ PbiAlignedStartFilter{ 4102, Compare::GREATER_THAN } }; + tests::checkFilterRows(filter, std::vector{3}); + } + { + const auto filter = PbiFilter{ PbiAlignedStartFilter{ 4102, Compare::GREATER_THAN_EQUAL } }; + tests::checkFilterRows(filter, std::vector{2,3}); + } + { + const auto filter = 
PbiFilter{ PbiAlignedStartFilter{ 6000, Compare::GREATER_THAN } }; + tests::checkFilterRows(filter, std::vector{ }); + } +} + +TEST(PbiFilterTest, AlignedStrandFilterOk) +{ + { + const auto filter = PbiFilter{ PbiAlignedStrandFilter{ Strand::FORWARD } }; + tests::checkFilterRows(filter, std::vector{0,2}); + } + { + const auto filter = PbiFilter{ PbiAlignedStrandFilter{ Strand::REVERSE } }; + tests::checkFilterRows(filter, std::vector{1,3}); + } + { + const auto filter = PbiFilter{ PbiAlignedStrandFilter{ Strand::FORWARD, Compare::NOT_EQUAL } }; // same as Strand::REVERSE + tests::checkFilterRows(filter, std::vector{1,3}); + } + + // unsupported compare types throw + EXPECT_THROW(PbiAlignedStrandFilter(Strand::FORWARD, Compare::LESS_THAN), std::runtime_error); + EXPECT_THROW(PbiAlignedStrandFilter(Strand::FORWARD, Compare::LESS_THAN_EQUAL), std::runtime_error); + EXPECT_THROW(PbiAlignedStrandFilter(Strand::FORWARD, Compare::GREATER_THAN), std::runtime_error); + EXPECT_THROW(PbiAlignedStrandFilter(Strand::FORWARD, Compare::GREATER_THAN_EQUAL), std::runtime_error); +} + +TEST(PbiFilterTest, BarcodeFilterOk) +{ + { + const auto filter = PbiFilter{ PbiBarcodeFilter{ 17 } }; + tests::checkFilterRows(filter, std::vector{1,3}); + } + { + const auto filter = PbiFilter{ PbiBarcodeFilter{ 18 } }; + tests::checkFilterRows(filter, std::vector{1,3}); + } + { + const auto filter = PbiFilter{ PbiBarcodeFilter{ 0 } }; + tests::checkFilterRows(filter, std::vector{0}); + } +} + +TEST(PbiFilterTest, BarcodeForwardFilterOk) +{ + { + const auto filter = PbiFilter{ PbiBarcodeForwardFilter{ 17 } }; + tests::checkFilterRows(filter, std::vector{1,3}); + } + { + const auto filter = PbiFilter{ PbiBarcodeForwardFilter{ 400 } }; + tests::checkFilterRows(filter, std::vector{}); + } + { + const auto filter = PbiFilter{ PbiBarcodeForwardFilter{ {0, 256} } }; + tests::checkFilterRows(filter, std::vector{0,2}); + } +} + +TEST(PbiFilterTest, BarcodeQualityFilterOk) +{ + { + const auto filter = 
PbiFilter{ PbiBarcodeQualityFilter{ 80, Compare::GREATER_THAN_EQUAL } }; + tests::checkFilterRows(filter, std::vector{1,3}); + } + { + const auto filter = PbiFilter{ PbiBarcodeQualityFilter{ 40, Compare::LESS_THAN } }; + tests::checkFilterRows(filter, std::vector{}); + } +} + +TEST(PbiFilterTest, BarcodeReverseFilterOk) +{ + { + const auto filter = PbiFilter{ PbiBarcodeReverseFilter{ 18 } }; + tests::checkFilterRows(filter, std::vector{1,3}); + } + { + const auto filter = PbiFilter{ PbiBarcodeReverseFilter{ 400 } }; + tests::checkFilterRows(filter, std::vector{ }); + } + { + const auto filter = PbiFilter{ PbiBarcodeReverseFilter{ {1, 257} } }; + tests::checkFilterRows(filter, std::vector{0,2}); + } +} + +TEST(PbiFilterTest, BarcodesFilterOk) +{ + { + const auto filter = PbiFilter{ PbiBarcodesFilter{ 17, 18 } }; + tests::checkFilterRows(filter, std::vector{1,3}); + } + { + const auto filter = PbiFilter{ PbiBarcodesFilter{ 17, 19 } }; + tests::checkFilterRows(filter, std::vector{ }); + } + { + const auto filter = PbiFilter{ PbiBarcodesFilter{ std::make_pair(17,18) } }; + tests::checkFilterRows(filter, std::vector{1,3}); + } +} + +TEST(PbiFilterTest, IdentityFilterOk) +{ + { + const auto filter = PbiFilter{ PbiIdentityFilter{ 0.95, Compare::GREATER_THAN_EQUAL } }; + tests::checkFilterRows(filter, std::vector{3}); + } +} + +TEST(PbiFilterTest, LocalContextFilterOk) +{ + { // == NO_LOCAL_CONTEXT + const auto filter = PbiFilter { PbiLocalContextFilter{ LocalContextFlags::NO_LOCAL_CONTEXT } }; + tests::checkFilterRows(filter, std::vector{0}); + } + { // != ADAPTER_BEFORE (exact match) + const auto filter = PbiFilter { PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE, Compare::NOT_EQUAL } }; + tests::checkFilterRows(filter, std::vector{0,2,3}); + } + { // contains ADAPTER_BEFORE + const auto filter = PbiFilter { PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE, Compare::CONTAINS } }; + tests::checkFilterRows(filter, std::vector{1,3}); + } + { // does not 
contain ADAPTER_BEFORE + const auto filter = PbiFilter { PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE, Compare::NOT_CONTAINS } }; + tests::checkFilterRows(filter, std::vector{0,2}); + } + { // include both ADAPTER_BEFORE and ADAPTER_AFTER + const auto filter = PbiFilter::Intersection( + { + PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE, Compare::CONTAINS }, + PbiLocalContextFilter{ LocalContextFlags::ADAPTER_AFTER, Compare::CONTAINS } + }); + tests::checkFilterRows(filter, std::vector{3}); + } + { // exclude both ADAPTER_BEFORE and ADAPTER_AFTER + const auto filter = PbiFilter::Intersection( + { + PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE, Compare::NOT_CONTAINS }, + PbiLocalContextFilter{ LocalContextFlags::ADAPTER_AFTER, Compare::NOT_CONTAINS } + }); + tests::checkFilterRows(filter, std::vector{0}); + } + { // include everything with either ADAPTER_BEFORE or ADAPTER_AFTER + const auto filter = PbiFilter::Union( + { + PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE, Compare::CONTAINS }, + PbiLocalContextFilter{ LocalContextFlags::ADAPTER_AFTER, Compare::CONTAINS } + }); + tests::checkFilterRows(filter, std::vector{1,2,3}); + } + { // include everything with either ADAPTER_BEFORE or ADAPTER_AFTER, but not both + const auto filter = PbiFilter::Intersection( + { + PbiLocalContextFilter{ LocalContextFlags::NO_LOCAL_CONTEXT, Compare::NOT_EQUAL }, + PbiFilter::Union( + { + PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE, Compare::NOT_CONTAINS }, + PbiLocalContextFilter{ LocalContextFlags::ADAPTER_AFTER, Compare::NOT_CONTAINS } + }) + }); + tests::checkFilterRows(filter, std::vector{1,2}); + } +} + +TEST(PbiFilterTest, MapQualityFilterOk) +{ + { + const auto filter = PbiFilter{ PbiMapQualityFilter{ 254 } }; + tests::checkFilterRows(filter, std::vector{0,1,2,3}); + } + { + const auto filter = PbiFilter{ PbiMapQualityFilter{ 254, Compare::NOT_EQUAL } }; + tests::checkFilterRows(filter, std::vector{}); + } +} + 
+TEST(PbiFilterTest, MovieNameFilterOk) +{ + const auto bamFile = BamFile{ tests::Data_Dir + string{ "/group/test2.bam" } }; + const auto index = PbiRawData{ bamFile.PacBioIndexFilename() }; + + { + const auto filter = PbiFilter{ PbiMovieNameFilter{ "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0" } }; + const auto expectedRows = std::vector{0,1,2,3}; + for (size_t row : expectedRows) + EXPECT_TRUE(filter.Accepts(index, row)); + } + { + const auto filter = PbiFilter{ PbiMovieNameFilter{ "does_not_exist" } }; + const auto expectedRows = std::vector{}; + for (size_t row : expectedRows) + EXPECT_TRUE(filter.Accepts(index, row)); + } + { + const auto names = vector{"does_not_exist", + "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0"}; + const auto filter = PbiFilter{ PbiMovieNameFilter{ names } }; + const auto expectedRows = std::vector{0,1,2,3}; + for (size_t row : expectedRows) + EXPECT_TRUE(filter.Accepts(index, row)); + } +} + +TEST(PbiFilterTest, NumDeletedBasesFilterOk) +{ + // del: { 12, 38, 45, 11} - calculated from raw data, not stored directly in testing object or read from PBI file + + { + const auto filter = PbiFilter{ PbiNumDeletedBasesFilter{ 12, Compare::LESS_THAN_EQUAL } }; + tests::checkFilterRows(filter, std::vector{0,3}); + } + { + const auto filter = PbiFilter{ PbiNumDeletedBasesFilter{ 45, Compare::EQUAL } }; + tests::checkFilterRows(filter, std::vector{2}); + } +} + +TEST(PbiFilterTest, NumInsertedBasesFilterOk) +{ + // ins: { 17, 63, 65, 20 } - calculated from raw data, not stored directly testing object or read from PBI file + + { + const auto filter = PbiFilter{ PbiNumInsertedBasesFilter{ 63, Compare::GREATER_THAN_EQUAL } }; + tests::checkFilterRows(filter, std::vector{1,2}); + } + { + const auto filter = PbiFilter{ PbiNumInsertedBasesFilter{ 17, Compare::NOT_EQUAL } }; + tests::checkFilterRows(filter, std::vector{1,2,3}); + } +} + +TEST(PbiFilterTest, NumMatchesFilterOk) +{ + { + const auto filter = PbiFilter{ 
PbiNumMatchesFilter{ 1000, Compare::GREATER_THAN_EQUAL } }; + tests::checkFilterRows(filter, std::vector{1,2}); + } + { + const auto filter = PbiFilter{ PbiNumMatchesFilter{ 400, Compare::LESS_THAN } }; + tests::checkFilterRows(filter, std::vector{0}); + } +} + +TEST(PbiFilterTest, NumMismatchesFilterOk) +{ + { + const auto filter = PbiFilter{ PbiNumMismatchesFilter{ 0, Compare::EQUAL } }; + tests::checkFilterRows(filter, std::vector{0,1,2,3}); + } + { + const auto filter = PbiFilter{ PbiNumMismatchesFilter{ 0, Compare::NOT_EQUAL } }; + tests::checkFilterRows(filter, std::vector{}); + } +} + +TEST(PbiFilterTest, QueryEndFilterOk) +{ + { + const auto filter = PbiFilter{ PbiQueryEndFilter{ 4055 } }; + tests::checkFilterRows(filter, std::vector{1}); + } + { + const auto filter = PbiFilter{ PbiQueryEndFilter{ 6200, Compare::GREATER_THAN_EQUAL } }; + tests::checkFilterRows(filter, std::vector{3}); + } +} + +TEST(PbiFilterTest, QueryLengthFilterOk) +{ + { + const auto filter = PbiFilter{ PbiQueryLengthFilter{ 500, Compare::GREATER_THAN_EQUAL } }; + tests::checkFilterRows(filter, std::vector{1,2,3}); + } + { + const auto filter = PbiFilter{ PbiQueryLengthFilter{ 1000, Compare::GREATER_THAN_EQUAL } }; + tests::checkFilterRows(filter, std::vector{1,2}); + } +} + +TEST(PbiFilterTest, QueryNameFilterOk) +{ + const auto bamFile = BamFile{ tests::Data_Dir + string{ "/group/test2.bam" } }; + const auto index = PbiIndex{ bamFile.PacBioIndexFilename() }; + + { + const auto filter = PbiFilter{ PbiQueryNameFilter{ "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/14743/2579_4055" } }; + tests::checkFilterRows(filter, std::vector{1}); + } + { + const auto filter = PbiFilter{ PbiQueryNameFilter{ "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/14743/5615_6237" } }; + tests::checkFilterRows(filter, std::vector{3}); + } + + { + const auto filter = PbiFilter{ PbiQueryNameFilter{ "does_not_exist/0/0_0" } }; + tests::checkFilterRows(filter, std::vector{}); + } 
+ { + const auto names = vector{"m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/14743/2579_4055", + "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/14743/5615_6237"}; + const auto filter = PbiFilter{ PbiQueryNameFilter{ names } }; + tests::checkFilterRows(filter, std::vector{1,3}); + } + + // invalid QNAME syntax throws + EXPECT_THROW( + { + const auto filter = PbiFilter{ PbiQueryNameFilter{ "" } }; + tests::checkFilterRows(filter, std::vector{}); + }, + std::runtime_error); + EXPECT_THROW( + { + const auto filter = PbiFilter{ PbiQueryNameFilter{ "foo" } }; + tests::checkFilterRows(filter, std::vector{}); + }, + std::runtime_error); + EXPECT_THROW( + { + const auto filter = PbiFilter{ PbiQueryNameFilter{ "foo/bar" } }; + tests::checkFilterRows(filter, std::vector{}); + }, + std::runtime_error); + EXPECT_THROW( + { + const auto filter = PbiFilter{ PbiQueryNameFilter{ "foo/bar/baz_bam" } }; + tests::checkFilterRows(filter, std::vector{}); + }, + std::exception); // come back to see why this is not runtime_error but something else +} + +TEST(PbiFilterTest, QueryStartFilterOk) +{ + { + const auto filter = PbiFilter{ PbiQueryStartFilter{ 4101 } }; + tests::checkFilterRows(filter, std::vector{2}); + } + { + const auto filter = PbiFilter{ PbiQueryStartFilter{ 5000 } }; + tests::checkFilterRows(filter, std::vector{}); + } + { + const auto filter = PbiFilter{ PbiQueryStartFilter{ 5000, Compare::GREATER_THAN } }; + tests::checkFilterRows(filter, std::vector{3}); + } +} + +TEST(PbiFilterTest, ReadAccuracyFilterOk) +{ + { + const auto filter = PbiFilter{ PbiReadAccuracyFilter{ 0.9 } }; + tests::checkFilterRows(filter, std::vector{}); + } + { + const auto filter = PbiFilter{ PbiReadAccuracyFilter{ 0.9, Compare::GREATER_THAN } }; + tests::checkFilterRows(filter, std::vector{0,2}); + } +} + +TEST(PbiFilterTest, ReadGroupFilterOk) +{ + { // numeric ID + const auto filter = PbiReadGroupFilter{ -1197849594 }; + tests::checkFilterRows(filter, 
std::vector{0,1,2,3}); + + const auto filter2 = PbiReadGroupFilter{ 200 }; + tests::checkFilterRows(filter2, std::vector{}); + } + { // string ID + const auto filter = PbiReadGroupFilter{ "b89a4406" }; + tests::checkFilterRows(filter, std::vector{0,1,2,3}); + + const auto filter2 = PbiReadGroupFilter{ "b89a4406" }; + tests::checkFilterRows(filter2, std::vector{0,1,2,3}); + } + { // ReadGroupInfo object + const auto rg = ReadGroupInfo{ "b89a4406" }; + const auto filter = PbiReadGroupFilter{ rg }; + tests::checkFilterRows(filter, std::vector{0,1,2,3}); + } + { // multi-ID + const auto ids = vector({-1197849594, 200}); + const auto filter = PbiReadGroupFilter{ ids }; + tests::checkFilterRows(filter, std::vector{0,1,2,3}); + } + { // multi-string + const auto ids = vector({"b89a4406", "deadbeef"}); + const auto filter = PbiReadGroupFilter{ ids }; + tests::checkFilterRows(filter, std::vector{0,1,2,3}); + } + { // multi-ReadGroupInfo + const auto ids = vector({ ReadGroupInfo("b89a4406"), ReadGroupInfo("deadbeef")}); + const auto filter = PbiReadGroupFilter{ ids }; + tests::checkFilterRows(filter, std::vector{0,1,2,3}); + } +} + +TEST(PbiFilterTest, ReferenceEndFilterOk) +{ + { + const auto filter = PbiFilter{ PbiReferenceEndFilter{ 9900 } }; + tests::checkFilterRows(filter, std::vector{3}); + } + { + const auto filter = PbiFilter{ PbiReferenceEndFilter{ 9900, Compare::GREATER_THAN_EQUAL } }; + tests::checkFilterRows(filter, std::vector{0,1,3}); + } +} + +TEST(PbiFilterTest, ReferenceIdFilterOk) +{ + { + const auto filter = PbiFilter{ PbiReferenceIdFilter{ 0 } }; + tests::checkFilterRows(filter, std::vector{0,1,2,3}); + } + { + const auto filter = PbiFilter{ PbiReferenceIdFilter{ 0, Compare::NOT_EQUAL } }; + tests::checkFilterRows(filter, std::vector{}); + } + { + const auto ids = vector({0, 42}); + const auto filter = PbiFilter{ PbiReferenceIdFilter{ ids } }; + tests::checkFilterRows(filter, std::vector{0,1,2,3}); + } +} + +TEST(PbiFilterTest, ReferenceNameFilterOk) +{ + 
const auto bamFile = BamFile{ tests::Data_Dir + string{ "/group/test2.bam" } }; + const auto index = PbiRawData{ bamFile.PacBioIndexFilename() }; + + { + const auto filter = PbiFilter{ PbiReferenceNameFilter{ "lambda_NEB3011" } }; + const auto expectedRows = std::vector{0,1,2,3}; + for (size_t row : expectedRows) + EXPECT_TRUE(filter.Accepts(index, row)); + + } + { + const auto filter = PbiFilter{ PbiReferenceNameFilter{ "lambda_NEB3011", Compare::NOT_EQUAL } }; + const auto expectedRows = std::vector{}; + for (size_t row : expectedRows) + EXPECT_TRUE(filter.Accepts(index, row)); + } + { + const auto names = vector({ "lambda_NEB3011" }); // this file only has 1 :( + const auto filter = PbiFilter{ PbiReferenceNameFilter{ names } }; + const auto expectedRows = std::vector{0,1,2,3}; + for (size_t row : expectedRows) + EXPECT_TRUE(filter.Accepts(index, row)); + } + + // unsupported compare types throw + EXPECT_THROW(PbiReferenceNameFilter("foo", Compare::LESS_THAN), std::runtime_error); + EXPECT_THROW(PbiReferenceNameFilter("foo", Compare::LESS_THAN_EQUAL), std::runtime_error); + EXPECT_THROW(PbiReferenceNameFilter("foo", Compare::GREATER_THAN), std::runtime_error); + EXPECT_THROW(PbiReferenceNameFilter("foo", Compare::GREATER_THAN_EQUAL), std::runtime_error); +} + +TEST(PbiFilterTest, ReferenceStartFilterOk) +{ + { + const auto filter = PbiFilter{ PbiReferenceStartFilter{ 8453 } }; + tests::checkFilterRows(filter, std::vector{1}); + } + { + const auto filter = PbiFilter{ PbiReferenceStartFilter{ 9200, Compare::GREATER_THAN_EQUAL } }; + tests::checkFilterRows(filter, std::vector{0,3}); + } +} + +TEST(PbiFilterTest, ZmwFilterOk) +{ + { + const auto filter = PbiFilter{ PbiZmwFilter{ 14743 } }; + tests::checkFilterRows(filter, std::vector{0,1,2,3}); + } + { + const auto filter = PbiFilter{ PbiZmwFilter{ 14743, Compare::NOT_EQUAL } }; + tests::checkFilterRows(filter, std::vector{}); + } + { + const auto zmws = vector({14743,42,200}); + const auto filter = PbiFilter{ 
PbiZmwFilter{ zmws } }; + tests::checkFilterRows(filter, std::vector{0,1,2,3}); + } +} + +TEST(PbiFilterTest, FromDataSetOk) +{ + const auto expectedFilter = + PbiFilter::Union( + { + PbiFilter::Intersection( + { + PbiZmwFilter{ 14743 }, + PbiReadAccuracyFilter { 0.9, Compare::GREATER_THAN_EQUAL } + }), + + PbiReferenceStartFilter { 9200, Compare::GREATER_THAN_EQUAL } + }); + + + auto properties1 = Properties{ }; + properties1.Add(Property{ "zm", "14743", "==" }); + properties1.Add(Property{ "rq", "0.9", ">=" }); + + auto datasetFilter1 = Filter{ }; + datasetFilter1.Properties(properties1); + + auto properties2 = Properties{ }; + properties2.Add(Property{ "pos", "9200", ">=" }); + + auto datasetFilter2 = Filter{ }; + datasetFilter2.Properties(properties2); + + auto datasetFilters = Filters{ }; + datasetFilters.Add(datasetFilter1); + datasetFilters.Add(datasetFilter2); + auto dataset = DataSet{ }; + dataset.Filters(datasetFilters); + + const auto generatedFilter = PbiFilter::FromDataSet(dataset); + + for (size_t i = 0; i < tests::shared_index.NumReads(); ++i) { + EXPECT_EQ(expectedFilter.Accepts(tests::shared_index, i), + generatedFilter.Accepts(tests::shared_index, i)); + } +} + +TEST(PbiFilterTest, BarcodeListFromDataSetXmlOk) +{ + auto runner = [](const Property& property, + const PbiFilter& expectedFilter, + const std::vector& expectedResults) + { + auto filter = Filter{ }; + filter.Properties().Add(property); + DataSet dataset = DataSet{ }; + dataset.Filters().Add(filter); + + const auto generatedFilter = PbiFilter::FromDataSet(dataset); + tests::checkFilterRows(expectedFilter, expectedResults); + tests::checkFilterRows(generatedFilter, expectedResults); + }; + + // single barcode + runner(Property{ "bc", "18", "==" }, + PbiBarcodeFilter{ 18, Compare::EQUAL }, + std::vector{1,3}); + + // single barcode (bracketed) + runner(Property{ "bc", "[18]", "==" }, + PbiBarcodeFilter{ 18, Compare::EQUAL }, + std::vector{1,3}); + + // barcode pair (square brackets) + 
runner(Property{ "bc", "[17,18]", "==" }, + PbiBarcodesFilter{ {17, 18}, Compare::EQUAL }, + std::vector{1,3}); + + // barcode pair (parens) + runner(Property{ "bc", "(17,18)", "==" }, + PbiBarcodesFilter{ {17, 18}, Compare::EQUAL }, + std::vector{1,3}); + + // barcode pair (curly brackets) + runner(Property{ "bc", "{17,18}", "==" }, + PbiBarcodesFilter{ {17, 18}, Compare::EQUAL }, + std::vector{1,3}); + + // barcode pair (list, but no brackets) + runner(Property{ "bc", "17,18", "==" }, + PbiBarcodesFilter{ {17, 18}, Compare::EQUAL }, + std::vector{1,3}); + + // barcode pair - same value + runner(Property{ "bc", "[18,18]", "==" }, + PbiBarcodesFilter{ {18, 18}, Compare::EQUAL }, + std::vector{}); // none share forward & reverse + + auto expectFail = [](const Property& property) + { + auto filter = Filter{ }; + filter.Properties().Add(property); + DataSet dataset = DataSet{ }; + dataset.Filters().Add(filter); + + EXPECT_THROW(PbiFilter::FromDataSet(dataset), std::runtime_error); + }; + + // list-ish, but only one value + expectFail(Property{ "bc", "[18,]", "==" }); + + // too many barcodes + expectFail(Property{ "bc", "[18,18,18]", "==" }); +} + +TEST(PbiFilterTest, LocalContextFiltersFromDataSetXmlOk) +{ + { // no adapters or barcodes + + const auto expectedFilter = + PbiLocalContextFilter{ LocalContextFlags::NO_LOCAL_CONTEXT, Compare::EQUAL }; + + // XML: + Property property("cx", "0", "=="); + + auto filter = Filter{ }; + filter.Properties().Add(property); + DataSet dataset = DataSet{ }; + dataset.Filters().Add(filter); + + const auto generatedFilter = PbiFilter::FromDataSet(dataset); + tests::checkFilterRows(expectedFilter, std::vector{0}); + tests::checkFilterRows(generatedFilter, std::vector{0}); + } + { // any adapters or barcodes + + const auto expectedFilter = + PbiLocalContextFilter{ LocalContextFlags::NO_LOCAL_CONTEXT, Compare::NOT_EQUAL }; + + // XML: + Property property("cx", "0", "!="); + + auto filter = Filter{ }; + filter.Properties().Add(property); 
+ DataSet dataset = DataSet{ }; + dataset.Filters().Add(filter); + + const auto generatedFilter = PbiFilter::FromDataSet(dataset); + tests::checkFilterRows(expectedFilter, std::vector{1,2,3}); + tests::checkFilterRows(generatedFilter, std::vector{1,2,3}); + } + { // contains adapter_before + + const auto expectedFilter = + PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE, Compare::CONTAINS }; + + // XML: + Property property("cx", "1", "&"); + + auto filter = Filter{ }; + filter.Properties().Add(property); + DataSet dataset = DataSet{ }; + dataset.Filters().Add(filter); + + const auto generatedFilter = PbiFilter::FromDataSet(dataset); + tests::checkFilterRows(expectedFilter, std::vector{1,3}); + tests::checkFilterRows(generatedFilter, std::vector{1,3}); + } + { // contains adapter_before + + const auto expectedFilter = + PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE, Compare::CONTAINS }; + + // XML: + Property property("cx", "ADAPTER_BEFORE", "&"); + + auto filter = Filter{ }; + filter.Properties().Add(property); + DataSet dataset = DataSet{ }; + dataset.Filters().Add(filter); + + const auto generatedFilter = PbiFilter::FromDataSet(dataset); + tests::checkFilterRows(expectedFilter, std::vector{1,3}); + tests::checkFilterRows(generatedFilter, std::vector{1,3}); + } + { // contains adapter_after + + const auto expectedFilter = + PbiLocalContextFilter{ LocalContextFlags::ADAPTER_AFTER, Compare::CONTAINS }; + + // XML: + Property property("cx", "2", "&"); + + auto filter = Filter{ }; + filter.Properties().Add(property); + DataSet dataset = DataSet{ }; + dataset.Filters().Add(filter); + + const auto generatedFilter = PbiFilter::FromDataSet(dataset); + tests::checkFilterRows(expectedFilter, std::vector{2,3}); + tests::checkFilterRows(generatedFilter, std::vector{2,3}); + } + { // contains adapter_before or adapter_after + + const auto expectedFilter = + PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE | LocalContextFlags::ADAPTER_AFTER, + 
Compare::CONTAINS }; + + // XML: + Property property("cx", "3", "&"); + + auto filter = Filter{ }; + filter.Properties().Add(property); + DataSet dataset = DataSet{ }; + dataset.Filters().Add(filter); + + const auto generatedFilter = PbiFilter::FromDataSet(dataset); + tests::checkFilterRows(expectedFilter, std::vector{1,2,3}); + tests::checkFilterRows(generatedFilter, std::vector{1,2,3}); + } + { // contains adapter_before or adapter_after + + const auto expectedFilter = + PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE | LocalContextFlags::ADAPTER_AFTER, + Compare::CONTAINS }; + + // XML: + Property property("cx", "ADAPTER_BEFORE | ADAPTER_AFTER", "&"); + + auto filter = Filter{ }; + filter.Properties().Add(property); + DataSet dataset = DataSet{ }; + dataset.Filters().Add(filter); + + const auto generatedFilter = PbiFilter::FromDataSet(dataset); + tests::checkFilterRows(expectedFilter, std::vector{1,2,3}); + tests::checkFilterRows(generatedFilter, std::vector{1,2,3}); + } + { // contains adapter_before or adapter_after - no whitespace separation + + const auto expectedFilter = + PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE | LocalContextFlags::ADAPTER_AFTER, + Compare::CONTAINS }; + + // XML: + Property property("cx", "ADAPTER_BEFORE|ADAPTER_AFTER", "&"); + + auto filter = Filter{ }; + filter.Properties().Add(property); + DataSet dataset = DataSet{ }; + dataset.Filters().Add(filter); + + const auto generatedFilter = PbiFilter::FromDataSet(dataset); + tests::checkFilterRows(expectedFilter, std::vector{1,2,3}); + tests::checkFilterRows(generatedFilter, std::vector{1,2,3}); + } + { // contains adapter_before or adapter_after - a lot of whitespace separation + + const auto expectedFilter = + PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE | LocalContextFlags::ADAPTER_AFTER, + Compare::CONTAINS }; + + // XML: + Property property("cx", "ADAPTER_BEFORE | ADAPTER_AFTER", "&"); + + auto filter = Filter{ }; + filter.Properties().Add(property); 
+ DataSet dataset = DataSet{ }; + dataset.Filters().Add(filter); + + const auto generatedFilter = PbiFilter::FromDataSet(dataset); + tests::checkFilterRows(expectedFilter, std::vector{1,2,3}); + tests::checkFilterRows(generatedFilter, std::vector{1,2,3}); + } + { // contains adapter_before or adapter_after, but not both + + const auto expectedFilter = PbiFilter::Union( + { + PbiFilter::Intersection( + { + PbiLocalContextFilter{ LocalContextFlags::NO_LOCAL_CONTEXT, Compare::NOT_EQUAL }, + PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE, Compare::NOT_CONTAINS } + }), + PbiFilter::Intersection( + { + PbiLocalContextFilter{ LocalContextFlags::NO_LOCAL_CONTEXT, Compare::NOT_EQUAL }, + PbiLocalContextFilter{ LocalContextFlags::ADAPTER_AFTER, Compare::NOT_CONTAINS } + }) + }); + + // XML: + // + // + // + // + // + // + // + // + // + // + // + // + // + // + + auto filter1 = Filter{ }; + filter1.Properties().Add(Property("cx", "0", "!=")); + filter1.Properties().Add(Property("cx", "1", "~")); + + auto filter2 = Filter{ }; + filter2.Properties().Add(Property("cx", "0", "!=")); + filter2.Properties().Add(Property("cx", "2", "~")); + + DataSet dataset = DataSet{ }; + dataset.Filters().Add(filter1); + dataset.Filters().Add(filter2); + + const auto generatedFilter = PbiFilter::FromDataSet(dataset); + tests::checkFilterRows(expectedFilter, std::vector{1,2}); + tests::checkFilterRows(generatedFilter, std::vector{1,2}); + + } + { // contains adapter_before or adapter_after + + const auto expectedFilter = PbiFilter::Union( + { + PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE, Compare::CONTAINS }, + PbiLocalContextFilter{ LocalContextFlags::ADAPTER_AFTER, Compare::CONTAINS } + }); + + // XML: + // + // + // + // + // + // + // + // + // + // + // + // + + auto filter1 = Filter{ }; + filter1.Properties().Add(Property("cx", "1", "&")); + + auto filter2 = Filter{ }; + filter2.Properties().Add(Property("cx", "2", "&")); + + DataSet dataset = DataSet{ }; + 
dataset.Filters().Add(filter1); + dataset.Filters().Add(filter2); + + const auto generatedFilter = PbiFilter::FromDataSet(dataset); + tests::checkFilterRows(expectedFilter, std::vector{1,2,3}); + tests::checkFilterRows(generatedFilter, std::vector{1,2,3}); + } + { // adapter_before and adapter_after + + const auto expectedFilter = PbiFilter::Intersection( + { + PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE, Compare::CONTAINS }, + PbiLocalContextFilter{ LocalContextFlags::ADAPTER_AFTER, Compare::CONTAINS } + }); + + // XML: + // + // + Property property1("cx", "1", "&"); + Property property2("cx", "2", "&"); + + auto filter = Filter{ }; + filter.Properties().Add(property1); + filter.Properties().Add(property2); + DataSet dataset = DataSet{ }; + dataset.Filters().Add(filter); + + const auto generatedFilter = PbiFilter::FromDataSet(dataset); + tests::checkFilterRows(expectedFilter, std::vector{3}); + tests::checkFilterRows(generatedFilter, std::vector{3}); + } + { // adapter_before, but no adapter_after + + const auto expectedFilter = PbiFilter::Intersection( + { + PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE, Compare::CONTAINS }, + PbiLocalContextFilter{ LocalContextFlags::ADAPTER_AFTER, Compare::NOT_CONTAINS } + }); + + // XML: + // + // + Property property1("cx", "1", "&"); + Property property2("cx", "2", "~"); + + auto filter = Filter{ }; + filter.Properties().Add(property1); + filter.Properties().Add(property2); + DataSet dataset = DataSet{ }; + dataset.Filters().Add(filter); + + const auto generatedFilter = PbiFilter::FromDataSet(dataset); + tests::checkFilterRows(expectedFilter, std::vector{1}); + tests::checkFilterRows(generatedFilter, std::vector{1}); + } + { // contains no adapter_before + + const auto expectedFilter = + PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE, Compare::NOT_CONTAINS }; + + // XML: + Property property("cx", "1", "~"); + + auto filter = Filter{ }; + filter.Properties().Add(property); + DataSet dataset = 
DataSet{ }; + dataset.Filters().Add(filter); + + const auto generatedFilter = PbiFilter::FromDataSet(dataset); + tests::checkFilterRows(expectedFilter, std::vector{0,2}); + tests::checkFilterRows(generatedFilter, std::vector{0,2}); + } + { // contains no adapter_before or adapter_after + + const auto expectedFilter = PbiFilter::Intersection( + { + PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE, Compare::NOT_CONTAINS }, + PbiLocalContextFilter{ LocalContextFlags::ADAPTER_AFTER, Compare::NOT_CONTAINS } + }); + + // XML: + // + // + Property property1("cx", "1", "~"); + Property property2("cx", "2", "~"); + + auto filter = Filter{ }; + filter.Properties().Add(property1); + filter.Properties().Add(property2); + DataSet dataset = DataSet{ }; + dataset.Filters().Add(filter); + + const auto generatedFilter = PbiFilter::FromDataSet(dataset); + tests::checkFilterRows(expectedFilter, std::vector{0}); + tests::checkFilterRows(generatedFilter, std::vector{0}); + } + { // contains no adapter_before or adapter_after + + const auto expectedFilter = + PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE | LocalContextFlags::ADAPTER_AFTER, + Compare::NOT_CONTAINS }; + + // XML: + Property property("cx", "3", "~"); + + auto filter = Filter{ }; + filter.Properties().Add(property); + DataSet dataset = DataSet{ }; + dataset.Filters().Add(filter); + + const auto generatedFilter = PbiFilter::FromDataSet(dataset); + tests::checkFilterRows(expectedFilter, std::vector{0}); + tests::checkFilterRows(generatedFilter, std::vector{0}); + } + { // throws on invalid enum name + + Property property("cx", "DOES_NOT_EXIST", "~"); + + auto filter = Filter{ }; + filter.Properties().Add(property); + DataSet dataset = DataSet{ }; + dataset.Filters().Add(filter); + + EXPECT_THROW(PbiFilter::FromDataSet(dataset), std::runtime_error); + } +} diff --git a/tests/src/test_PbiFilterQuery.cpp b/tests/src/test_PbiFilterQuery.cpp new file mode 100644 index 0000000..cacd7cd --- /dev/null +++ 
b/tests/src/test_PbiFilterQuery.cpp @@ -0,0 +1,444 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. 
+ +// Author: Derek Barnett + +#ifdef PBBAM_TESTING +#define private public +#endif + +#include "TestData.h" +#include +#include +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace std; + +TEST(PbiFilterQueryTest, QueryOk) +{ + const auto bamFile = BamFile{ tests::Data_Dir + string{ "/group/test2.bam" } }; + + { + int count = 0; + PbiFilterQuery query( PbiQueryLengthFilter{ 500, Compare::GREATER_THAN_EQUAL}, bamFile); + for (const auto& r: query) { + ++count; + EXPECT_GE((r.QueryEnd() - r.QueryStart()), 500); + } + EXPECT_EQ(3, count); + } + { + // all records aligned to reverse strand && pos >= 9200 + const auto filter = PbiFilter::Intersection( + { + PbiAlignedStrandFilter{Strand::REVERSE}, + PbiReferenceStartFilter{9200, Compare::GREATER_THAN_EQUAL} + }); + + int count = 0; + PbiFilterQuery query(filter, bamFile); + for (const auto& r: query) { + ++count; + EXPECT_EQ(Strand::REVERSE, r.AlignedStrand()); + EXPECT_GE((r.ReferenceStart()), 9200); + EXPECT_EQ(string("m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/14743/5615_6237"), r.FullName()); + } + EXPECT_EQ(1, count); + } + { + // all records aligned to forward strand && pos >= 9200 + const auto filter = PbiFilter::Intersection( + { + PbiAlignedStrandFilter{Strand::FORWARD}, + PbiReferenceStartFilter{9200, Compare::GREATER_THAN_EQUAL} + }); + + int count = 0; + PbiFilterQuery query(filter, bamFile); + for (const auto& r: query) { + ++count; + EXPECT_EQ(Strand::FORWARD, r.AlignedStrand()); + EXPECT_GE((r.ReferenceStart()), 9200); + EXPECT_EQ(string("m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/14743/2114_2531"), r.FullName()); + } + EXPECT_EQ(1, count); + } + { + // all records from RG ("b89a4406") with numMatches >= 1200 + const auto filter = PbiFilter::Intersection( + { + PbiReadGroupFilter{"b89a4406"}, + PbiNumMatchesFilter{1200, Compare::GREATER_THAN_EQUAL} + }); + + int count = 0; + PbiFilterQuery query(filter, bamFile); + for (const 
auto& r: query) { + ++count; + EXPECT_EQ(string("b89a4406"), r.ReadGroupId()); + EXPECT_GE((r.NumMatches()), 1200); + if (count == 1) + EXPECT_EQ(string("m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/14743/2579_4055"), r.FullName()); + else if (count == 2) + EXPECT_EQ(string("m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/14743/4101_5571"), r.FullName()); + } + EXPECT_EQ(2, count); + } +} + +TEST(PbiFilterQueryTest, ZmwRangeFromDatasetOk) +{ + const auto expectedMovieName = string{ "m150404_101626_42267_c100807920800000001823174110291514_s1_p0" }; + + const DataSet ds(tests::Data_Dir + "/chunking/chunking.subreadset.xml"); + EXPECT_EQ(3, ds.BamFiles().size()); + + { // movie name + + int count = 0; + PbiFilterQuery query{ PbiMovieNameFilter{expectedMovieName}, ds }; + for (const BamRecord& r : query) { + EXPECT_EQ(expectedMovieName, r.MovieName()); + ++count; + } + EXPECT_EQ(1220, count); + } + + { // sequencing chemistries + set chems{ ds.SequencingChemistries() }; + set expected{ "P6-C4" }; + EXPECT_EQ(expected, chems); // whole-set comparison: an empty or larger result set must fail, not pass vacuously + } + + { // min ZMW + + int count = 0; + PbiFilterQuery query{ PbiZmwFilter{54, Compare::GREATER_THAN}, ds }; + for (const BamRecord& r : query) { + EXPECT_GT(r.HoleNumber(), 54); + ++count; + } + EXPECT_EQ(1220, count); + } + + { // max ZMW + + int count = 0; + PbiFilterQuery query{ PbiZmwFilter{1816, Compare::LESS_THAN}, ds }; + for (const BamRecord& r : query) { + EXPECT_LT(r.HoleNumber(),1816); + ++count; + } + EXPECT_EQ(150, count); + } + + { // put all together, from DataSet XML + + const PbiFilter filter = PbiFilter::FromDataSet(ds); + PbiFilterQuery query(filter, ds); + int count = 0; + for (const BamRecord& r : query) { + EXPECT_EQ(expectedMovieName, r.MovieName()); + const auto zmw = r.HoleNumber(); + EXPECT_GT(zmw, 54); + EXPECT_LT(zmw, 1816); + ++count; + } + EXPECT_EQ(150, count); + } + { // empty filter object - should return all records from the same dataset + + 
PbiFilterQuery query(PbiFilter{ }, ds); + int count = 0; + for (const BamRecord& r : query) { + (void)r; + ++count; + } + EXPECT_EQ(1220, count); + } + { // no Filters element present at all + + const DataSet ds(tests::GeneratedData_Dir + "/chunking_missingfilters.subreadset.xml"); + const PbiFilter filter = PbiFilter::FromDataSet(ds); + PbiFilterQuery query(filter, ds); + int count = 0; + for (const BamRecord& r : query) { + (void)r; + ++count; + } + EXPECT_EQ(1220, count); + } + { // Filters element contains no child elements + + const DataSet ds(tests::GeneratedData_Dir + "/chunking_emptyfilters.subreadset.xml"); + const PbiFilter filter = PbiFilter::FromDataSet(ds); + PbiFilterQuery query(filter, ds); + int count = 0; + for (const BamRecord& r : query) { + (void)r; + ++count; + } + EXPECT_EQ(1220, count); + } +} + +TEST(PbiFilterQueryTest, MissingPbiShouldThrow) +{ + const PbiFilter filter{ PbiZmwFilter{31883} }; + const string phi29Bam = tests::GeneratedData_Dir + "/missing_pbi.bam"; + const string hasPbiBam = tests::Data_Dir + "/polymerase/production.scraps.bam"; + + { // single file, missing PBI + + EXPECT_THROW(PbiFilterQuery(filter, phi29Bam), std::runtime_error); + } + + { // from dataset, all missing PBI + + DataSet ds; + ds.ExternalResources().Add(ExternalResource("PacBio.SubreadFile.SubreadBamFile", phi29Bam)); + ds.ExternalResources().Add(ExternalResource("PacBio.SubreadFile.SubreadBamFile", phi29Bam)); + EXPECT_THROW(PbiFilterQuery(filter, ds), std::runtime_error); + } + + { // from dataset, mixed PBI presence + + DataSet ds; + ds.ExternalResources().Add(ExternalResource("PacBio.SubreadFile.SubreadBamFile", phi29Bam)); + ds.ExternalResources().Add(ExternalResource("PacBio.SubreadFile.ScrapsBamFile", hasPbiBam)); + EXPECT_THROW(PbiFilterQuery(filter, ds), std::runtime_error); + } +} + +TEST(PbiFilterQueryTest, QNameWhitelistFile) +{ + const DataSet ds(tests::Data_Dir + "/polymerase/qnameFiltered.subreads.dataset.xml"); + const PbiFilter filter = 
PbiFilter::FromDataSet(ds); + PbiFilterQuery query(filter, ds); + int count = 0; + for (const BamRecord& r : query) { + (void)r; + ++count; + } + EXPECT_EQ(3, count); +} + +TEST(PbiFilterQueryTest, EmptyFiles) +{ + const BamFile file{ tests::Data_Dir + "/empty.bam" }; + PbiFilterQuery query{ PbiFilter{}, file }; + size_t count = 0; + for (const auto& r : query) { + (void)r; + ++count; + } + EXPECT_EQ(0, count); +} + +TEST(PbiFilterQueryTest, BarcodeData) +{ + const BamFile file{ tests::Data_Dir + "/phi29.bam" }; + + // bc_quality == 1 + { + size_t count = 0; + PbiFilterQuery query{ PbiBarcodeQualityFilter{1}, file }; + for (const auto& r : query) { + (void)r; + ++count; + } + EXPECT_EQ(120, count); + } + + // bc_quality != 1 + { + size_t count = 0; + PbiFilterQuery query{ PbiBarcodeQualityFilter{1, Compare::NOT_EQUAL}, file }; + for (const auto& r : query) { + (void)r; + ++count; + } + EXPECT_EQ(0, count); + } + + // bc_forward == 0 + { + size_t count = 0; + PbiFilterQuery query{ PbiBarcodeForwardFilter{0}, file }; + for (const auto& r : query) { + (void)r; + ++count; + } + EXPECT_EQ(40, count); + } + + // bc_forward == [0,2] + { + size_t count = 0; + const auto ids = vector{ 0, 2 }; + PbiFilterQuery query{ PbiBarcodeForwardFilter{ ids }, file }; + for (const auto& r : query) { + (void)r; + ++count; + } + EXPECT_EQ(80, count); + } + + // bc_reverse != 0 + { + size_t count = 0; + PbiFilterQuery query{ PbiBarcodeReverseFilter{0, Compare::NOT_EQUAL}, file }; + for (const auto& r : query) { + (void)r; + ++count; + } + EXPECT_EQ(80, count); + } +} + +TEST(PbiFilterQueryTest, BarcodeQualityFromXml) +{ + +const string xml_all = R"_XML_( + + + + + + + + + + + + + + + + + +)_XML_"; + +const string xml_none = R"_XML_( + + + + + + + + + + + + + + + + + +)_XML_"; + + const BamFile file{ tests::Data_Dir + "/phi29.bam" }; + + { // filter allows all records + const DataSet ds = DataSet::FromXml(xml_all); + const PbiFilterQuery query { PbiFilter::FromDataSet(ds), file }; + size_t 
count = 0; + for (const auto& r : query) { + (void)r; + ++count; + } + EXPECT_EQ(120, count); + } + { // filter allows no records + const DataSet ds = DataSet::FromXml(xml_none); + const PbiFilterQuery query { PbiFilter::FromDataSet(ds), file }; + size_t count = 0; + for (const auto& r : query) { + (void)r; + ++count; + } + EXPECT_EQ(0, count); + } +} + + diff --git a/tests/src/test_Pulse2BaseCache.cpp b/tests/src/test_Pulse2BaseCache.cpp new file mode 100644 index 0000000..e93fa73 --- /dev/null +++ b/tests/src/test_Pulse2BaseCache.cpp @@ -0,0 +1,84 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. + +// Author: Derek Barnett + +#ifdef PBBAM_TESTING +#define private public +#endif + +#include "TestData.h" +#include +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace PacBio::BAM::internal; +using namespace std; + +TEST(Pulse2BaseCacheTest, CountsDetectedInConstructor) +{ + const string pulseCalls = "ACccTTAGtTCAtG"; // 14 pulse calls; the 4 lowercase ones are absent from trimmedPC + const string trimmedPC = "ACTTAGTCAG"; // the 10 uppercase calls, in order + + const Pulse2BaseCache cache{ pulseCalls }; + + EXPECT_EQ(pulseCalls.size(), cache.NumPulses()); + EXPECT_EQ(trimmedPC.size(), cache.NumBases()); +} + +TEST(Pulse2BaseCacheTest, RemovesSquashedPulsesFromString) +{ + const string pulseCalls = "ACccTTAGtTCAtG"; + const string trimmedPC = "ACTTAGTCAG"; + const string altLabel = "-G--A--T--AC--"; // same length as pulseCalls + const string trimmedAlt = "-GA--T-AC-"; // altLabel minus indices 2,3,8,12 (the lowercase positions of pulseCalls) + + const Pulse2BaseCache cache{ pulseCalls }; + + EXPECT_EQ(trimmedPC, cache.RemoveSquashedPulses(pulseCalls)); + EXPECT_EQ(trimmedAlt, cache.RemoveSquashedPulses(altLabel)); +} + +TEST(Pulse2BaseCacheTest, RemovesSquashedPulsesFromVector) +{ + const string pulseCalls = "ACccTTAGtTCAtG"; + const vector pkMean = {5,4,2,2,3,8,8,8,4,7,7,7,3,4}; // one value per pulse call + const vector trimmedPkmean = {5,4,3,8,8,8,7,7,7,4}; // pkMean minus indices 2,3,8,12 + + const Pulse2BaseCache cache{ pulseCalls }; + + EXPECT_EQ(trimmedPkmean, cache.RemoveSquashedPulses(pkMean)); +} diff --git a/tests/src/test_QNameQuery.cpp b/tests/src/test_QNameQuery.cpp new file mode 100644 index 0000000..1a2dcd1 --- /dev/null +++ 
b/tests/src/test_QNameQuery.cpp @@ -0,0 +1,96 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. 
+ +// Author: Yuan Li + +#include "TestData.h" +#include +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace std; + +static const string dataDir = tests::Data_Dir + "/group/"; +static const string test1fn = string(dataDir) + "test1.bam"; +static const string test2fn = string(dataDir) + "test2.bam"; +static const string test3fn = string(dataDir) + "test3.bam"; + +static // helper: iterates QNameQuery by const reference and compares the size of each yielded record group with 'expected' +void TestQNameQuery(const string& fn, const vector& expected) +{ + EXPECT_NO_THROW( + { + vector counts; + QNameQuery qQuery(fn); + for (const vector& records : qQuery) + counts.push_back(records.size()); + EXPECT_EQ(expected, counts); + }); +} + +static // helper: same check as TestQNameQuery, but iterating by non-const reference +void TestNoneConstQNameQuery(const string& fn, const vector& expected) +{ + EXPECT_NO_THROW( + { + vector counts; + QNameQuery qQuery(fn); + for (vector& records : qQuery) + counts.push_back(records.size()); + EXPECT_EQ(expected, counts); + }); +} + +TEST(QNameQueryTest, CountQSizes) +{ + // test case 1 has exactly one bamRecord. + string fn = test1fn; + vector expected({1}); + TestQNameQuery(fn, expected); + TestNoneConstQNameQuery(fn, expected); + + // test case 2 has bamRecords of four subreads. + fn = test2fn; + expected = {1, 1, 1, 1}; + TestQNameQuery(fn, expected); + TestNoneConstQNameQuery(fn, expected); + + fn = test3fn; // test case 3: eleven groups expected, two of which hold 2 records + expected = {2,1,1,1,1,1,1,2,1,1,1}; + TestQNameQuery(fn, expected); + TestNoneConstQNameQuery(fn, expected); +} + diff --git a/tests/src/test_QualityValues.cpp b/tests/src/test_QualityValues.cpp new file mode 100644 index 0000000..98795a3 --- /dev/null +++ b/tests/src/test_QualityValues.cpp @@ -0,0 +1,120 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. 
+ +// Author: Derek Barnett + +#ifdef PBBAM_TESTING +#define private public +#endif + +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace std; + +TEST(QualityValueTest, DefaultsOk) +{ + const QualityValue value; + EXPECT_EQ(0, value); + EXPECT_EQ('!', value.Fastq()); +} + +TEST(QualityValueTest, FromNumber) +{ + const QualityValue zero(0); + const QualityValue thirtyThree(33); + const QualityValue valid(42); + const QualityValue max(93); + const QualityValue tooHigh(94); // expected below to compare equal to 93 + const QualityValue wayTooHigh(INT8_MAX); // likewise expected to compare equal to 93 + + EXPECT_EQ(0, zero); + EXPECT_EQ(33, thirtyThree); + EXPECT_EQ(42, valid); + EXPECT_EQ(93, max); + EXPECT_EQ(93, tooHigh); + EXPECT_EQ(93, wayTooHigh); + + EXPECT_EQ('!', zero.Fastq()); // each expected FASTQ char below is the numeric value + 33 ('!' is ASCII 33) + EXPECT_EQ('B', thirtyThree.Fastq()); + EXPECT_EQ('K', valid.Fastq()); + EXPECT_EQ('~', max.Fastq()); + EXPECT_EQ('~', tooHigh.Fastq()); + EXPECT_EQ('~', wayTooHigh.Fastq()); +} + +TEST(QualityValueTest, FromFastq) +{ + const QualityValue zero = QualityValue::FromFastq('!'); + const QualityValue thirtyThree = QualityValue::FromFastq('B'); + const QualityValue valid = QualityValue::FromFastq('K'); + const QualityValue max = QualityValue::FromFastq('~'); + + EXPECT_EQ(0, zero); + EXPECT_EQ(33, thirtyThree); + EXPECT_EQ(42, valid); + EXPECT_EQ(93, max); +} + +TEST(QualityValuesTest, Default) +{ + const QualityValues qvs; + EXPECT_TRUE(qvs.empty()); + EXPECT_EQ(string(), qvs.Fastq()); +} + +TEST(QualityValuesTest, FromNumbers) +{ + const string fastqString = "~~~KKBB!!"; + const vector values = { 93, 93, 93, 42, 42, 33, 33, 0, 0 }; // numeric counterparts of fastqString, position by position + + QualityValues qvs; + for (auto qv : values) + qvs.push_back(qv); + EXPECT_EQ(fastqString, qvs.Fastq()); +} + +TEST(QualityValuesTest, FromFastq) +{ + const string fastqString = "~~~KKBB!!"; + const vector values = { 93, 93, 93, 42, 42, 33, 33, 0, 0 }; + + const QualityValues& qvs = QualityValues::FromFastq(fastqString); + EXPECT_EQ(fastqString.size(), qvs.size()); + EXPECT_EQ(values.size(), qvs.size()); 
+ for (size_t i = 0; i < fastqString.size(); ++i) + EXPECT_EQ(values.at(i), qvs.at(i)); +} diff --git a/tests/src/test_ReadAccuracyQuery.cpp b/tests/src/test_ReadAccuracyQuery.cpp new file mode 100644 index 0000000..05d8bfc --- /dev/null +++ b/tests/src/test_ReadAccuracyQuery.cpp @@ -0,0 +1,72 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. + +// Author: Derek Barnett + +#ifdef PBBAM_TESTING +#define private public +#endif + +#include "TestData.h" +#include +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace std; + +TEST(ReadAccuracyQueryTest, QueryOk) +{ + const auto bamFile = BamFile{ tests::Data_Dir + string{ "/group/test2.bam" } }; + + { // threshold 0.901: four records expected, each meeting the threshold + int count = 0; + ReadAccuracyQuery query(0.901, Compare::GREATER_THAN_EQUAL, bamFile); + for (const auto& r: query) { + ++count; + EXPECT_GE(r.ReadAccuracy(), 0.901); + } + EXPECT_EQ(4, count); + } + { // threshold 0.95: no records expected + int count = 0; + ReadAccuracyQuery query(0.95, Compare::GREATER_THAN_EQUAL, bamFile); + for (const auto& r: query) { + ++count; + EXPECT_GE(r.ReadAccuracy(), 0.95); // must match the threshold passed to the query above + } + EXPECT_EQ(0, count); + } +} diff --git a/tests/src/test_ReadGroupInfo.cpp b/tests/src/test_ReadGroupInfo.cpp new file mode 100644 index 0000000..8b9f23a --- /dev/null +++ b/tests/src/test_ReadGroupInfo.cpp @@ -0,0 +1,230 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. 
+ +// Author: Derek Barnett, Lance Hepler + +#ifdef PBBAM_TESTING +#define private public +#endif + +#include +#include +#include +#include +using namespace PacBio::BAM; +using namespace std; + +TEST(ReadGroupInfoTest, IdFromMovieNameAndReadType) +{ + ReadGroupInfo rg("m140905_042212_sidney_c100564852550000001823085912221377_s1_X0", "HQREGION"); + EXPECT_EQ("00082ba1", rg.Id()); +} + +TEST(ReadGroupInfoTest, FrameCodecSetOk) +{ + ReadGroupInfo rg("test"); + rg.IpdCodec(FrameCodec::V1); + EXPECT_TRUE(rg.HasBaseFeature(BaseFeature::IPD)); + EXPECT_EQ("ip", rg.BaseFeatureTag(BaseFeature::IPD)); + EXPECT_EQ(FrameCodec::V1, rg.IpdCodec()); +} + +TEST(ReadGroupInfoTest, SequencingChemistryOk) +{ + { // P6-C4 + const string& chem = "P6-C4"; + EXPECT_EQ(chem, ReadGroupInfo::SequencingChemistryFromTriple("100356300","100356200","2.1")); + EXPECT_EQ(chem, ReadGroupInfo::SequencingChemistryFromTriple("100356300","100356200","2.3")); + EXPECT_EQ(chem, ReadGroupInfo::SequencingChemistryFromTriple("100356300","100612400","2.1")); + EXPECT_EQ(chem, ReadGroupInfo::SequencingChemistryFromTriple("100356300","100612400","2.3")); + EXPECT_EQ(chem, ReadGroupInfo::SequencingChemistryFromTriple("100372700","100356200","2.1")); + EXPECT_EQ(chem, ReadGroupInfo::SequencingChemistryFromTriple("100372700","100356200","2.3")); + EXPECT_EQ(chem, ReadGroupInfo::SequencingChemistryFromTriple("100372700","100612400","2.1")); + EXPECT_EQ(chem, ReadGroupInfo::SequencingChemistryFromTriple("100372700","100612400","2.3")); + + ReadGroupInfo rg("dummy"); + rg.BindingKit("100356300") + .SequencingKit("100356200") + .BasecallerVersion("2.1"); + EXPECT_EQ(chem, rg.SequencingChemistry()); + } + + { // S/P1-C1/beta + const string& chem = "S/P1-C1/beta"; + EXPECT_EQ(chem, ReadGroupInfo::SequencingChemistryFromTriple("100-619-300","100-620-000","3.0")); + EXPECT_EQ(chem, ReadGroupInfo::SequencingChemistryFromTriple("100-619-300","100-620-000","3.1")); + + ReadGroupInfo rg("dummy"); + 
rg.BindingKit("100-619-300") + .SequencingKit("100-620-000") + .BasecallerVersion("3.0"); + EXPECT_EQ(chem, rg.SequencingChemistry()); + } + + { // S/P1-C1.1 (Echidna) + const string& chem = "S/P1-C1.1"; + EXPECT_EQ(chem, ReadGroupInfo::SequencingChemistryFromTriple("100-619-300","100-867-300","3.1")); + EXPECT_EQ(chem, ReadGroupInfo::SequencingChemistryFromTriple("100-619-300","100-867-300","3.2")); + EXPECT_EQ(chem, ReadGroupInfo::SequencingChemistryFromTriple("100-619-300","100-867-300","3.3")); + + ReadGroupInfo rg("dummy"); + rg.BindingKit("100-619-300") + .SequencingKit("100-867-300") + .BasecallerVersion("3.1"); + EXPECT_EQ(chem, rg.SequencingChemistry()); + } + + { // S/P1-C1.2 (Flea) + const string& chem = "S/P1-C1.2"; + EXPECT_EQ(chem, ReadGroupInfo::SequencingChemistryFromTriple("100-619-300","100-902-100","3.1")); + EXPECT_EQ(chem, ReadGroupInfo::SequencingChemistryFromTriple("100-619-300","100-902-100","3.2")); + EXPECT_EQ(chem, ReadGroupInfo::SequencingChemistryFromTriple("100-619-300","100-902-100","3.3")); + + ReadGroupInfo rg("dummy"); + rg.BindingKit("100-619-300") + .SequencingKit("100-902-100") + .BasecallerVersion("3.1"); + EXPECT_EQ(chem, rg.SequencingChemistry()); + } + { // S/P1-C1.3 (Goat) + const string& chem = "S/P1-C1.3"; + EXPECT_EQ(chem, ReadGroupInfo::SequencingChemistryFromTriple("100-619-300","100-972-200","3.2")); + EXPECT_EQ(chem, ReadGroupInfo::SequencingChemistryFromTriple("100-619-300","100-972-200","3.3")); + + ReadGroupInfo rg("dummy"); + rg.BindingKit("100-619-300") + .SequencingKit("100-972-200") + .BasecallerVersion("3.3"); + EXPECT_EQ(chem, rg.SequencingChemistry()); + } +} + +TEST(ReadGroupInfoTest, SequencingChemistryThrowsOnBadTriple) +{ + // check that we actually throw + ReadGroupInfo rg("BAD"); + rg.BindingKit("100372700") + .SequencingKit("100-619-400") + .BasecallerVersion("2.0"); + EXPECT_THROW(rg.SequencingChemistry(), InvalidSequencingChemistryException); + + // now check thrown contents + try { + ReadGroupInfo 
rg("BAD"); + rg.BindingKit("100372700") + .SequencingKit("100-619-400") + .BasecallerVersion("2.0"); (void)rg.SequencingChemistry(); // trigger the lookup; without this call nothing throws and the catch below never runs + } catch (InvalidSequencingChemistryException& e) { + EXPECT_EQ(string("100372700"), e.BindingKit()); + EXPECT_EQ(string("100-619-400"), e.SequencingKit()); + EXPECT_EQ(string("2.0"), e.BasecallerVersion()); + } +} + +TEST(ReadGroupInfoTest, BasecallerVersion) +{ + // too short + try { + ReadGroupInfo rg("dummy"); + rg.BindingKit("100-619-300") + .SequencingKit("100-867-300") + .BasecallerVersion("3"); + const string chem = rg.SequencingChemistry(); + (void)chem; + + } catch (std::runtime_error& e) { + EXPECT_EQ(string("basecaller version too short: 3"), string(e.what())); + } + + // initial implementation assumed single digit version numbers: + // const string ver{ basecallerVersion.substr(0, 3) }; + // So '3.299.dummy' would incorrectly be interpreted as (OK) '3.2'. + // 3. + + try { + ReadGroupInfo rg("dummy"); + rg.BindingKit("100-619-300") + .SequencingKit("100-867-300") + .BasecallerVersion("3.199.dummy"); + const string chem = rg.SequencingChemistry(); + (void)chem; + + } catch (InvalidSequencingChemistryException& e) { + EXPECT_EQ("100-619-300", e.BindingKit()); + EXPECT_EQ("100-867-300", e.SequencingKit()); + EXPECT_EQ("3.199.dummy", e.BasecallerVersion()); + } + //EXPECT_THROW(rg.SequencingChemistry(), InvalidSequencingChemistryException); +} + +TEST(ReadGroupInfoTest, ClearBaseFeatures) +{ + ReadGroupInfo rg("test"); + rg.BaseFeatureTag(BaseFeature::DELETION_QV, "dq"); + rg.BaseFeatureTag(BaseFeature::DELETION_TAG, "dt"); + rg.BaseFeatureTag(BaseFeature::INSERTION_QV, "iq"); + rg.BaseFeatureTag(BaseFeature::MERGE_QV, "mq"); + rg.BaseFeatureTag(BaseFeature::SUBSTITUTION_QV, "sq"); + EXPECT_TRUE(rg.HasBaseFeature(BaseFeature::DELETION_QV)); + EXPECT_EQ("dq", rg.BaseFeatureTag(BaseFeature::DELETION_QV)); + + rg.ClearBaseFeatures(); + EXPECT_FALSE(rg.HasBaseFeature(BaseFeature::DELETION_QV)); + EXPECT_FALSE(rg.HasBaseFeature(BaseFeature::DELETION_TAG)); + 
EXPECT_FALSE(rg.HasBaseFeature(BaseFeature::INSERTION_QV)); + EXPECT_FALSE(rg.HasBaseFeature(BaseFeature::MERGE_QV)); + EXPECT_FALSE(rg.HasBaseFeature(BaseFeature::SUBSTITUTION_QV)); +} + +TEST(ReadGroupInfoTest, RemoveBaseFeature) +{ + ReadGroupInfo rg("test"); + rg.BaseFeatureTag(BaseFeature::DELETION_QV, "dq"); + rg.BaseFeatureTag(BaseFeature::DELETION_TAG, "dt"); + rg.BaseFeatureTag(BaseFeature::INSERTION_QV, "iq"); + rg.BaseFeatureTag(BaseFeature::MERGE_QV, "mq"); + rg.BaseFeatureTag(BaseFeature::SUBSTITUTION_QV, "sq"); + EXPECT_TRUE(rg.HasBaseFeature(BaseFeature::DELETION_QV)); + EXPECT_EQ("dq", rg.BaseFeatureTag(BaseFeature::DELETION_QV)); + + rg.RemoveBaseFeature(BaseFeature::DELETION_QV); // only DELETION_QV should disappear; the other four must remain + EXPECT_FALSE(rg.HasBaseFeature(BaseFeature::DELETION_QV)); + + EXPECT_TRUE(rg.HasBaseFeature(BaseFeature::DELETION_TAG)); + EXPECT_TRUE(rg.HasBaseFeature(BaseFeature::INSERTION_QV)); + EXPECT_TRUE(rg.HasBaseFeature(BaseFeature::MERGE_QV)); + EXPECT_TRUE(rg.HasBaseFeature(BaseFeature::SUBSTITUTION_QV)); +} + diff --git a/tests/src/test_SamWriter.cpp b/tests/src/test_SamWriter.cpp new file mode 100644 index 0000000..f13b5df --- /dev/null +++ b/tests/src/test_SamWriter.cpp @@ -0,0 +1,150 @@ +// Copyright (c) 2016, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. 
+// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. 
+ +// Author: Derek Barnett + +#include "TestData.h" +#include +#include +#include +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace std; + +TEST(SamWriterTest, HeaderOk) +{ + // setup header + const string hdrText = { + "@HD\tVN:1.1\tSO:unknown\tpb:3.0.3\n" + "@RG\tID:6002b307\tPL:PACBIO\tDS:READTYPE=SUBREAD;BINDINGKIT=100-619-300;" + "SEQUENCINGKIT=100-619-400;BASECALLERVERSION=3.0;FRAMERATEHZ=100\t" + "PU:test\tPM:SEQUEL\n" + }; + + EXPECT_NO_THROW( + { + // write header to file + const string generatedFn = tests::GeneratedData_Dir + "/samwriter_hdr_only.sam"; + { + const BamHeader inputHeader(hdrText); + SamWriter writer(generatedFn, inputHeader); + (void)writer; + }; + + // check header + { + ifstream f(generatedFn); + const string text((istreambuf_iterator(f)), + istreambuf_iterator()); + EXPECT_EQ(hdrText, text); + } + + // clean up + remove(generatedFn.c_str()); + }); +} + +TEST(SamWriterTest, SingleRecordOk) +{ + + // setup header + const string hdrLine1 = { "@HD\tVN:1.1\tSO:unknown\tpb:3.0.3" }; + const string hdrLine2 = { + "@RG\tID:6002b307\tPL:PACBIO\tDS:READTYPE=SUBREAD;BINDINGKIT=100-619-300;" + "SEQUENCINGKIT=100-619-400;BASECALLERVERSION=3.0;FRAMERATEHZ=100\t" + "PU:test\tPM:SEQUEL" + }; + const string hdrText = hdrLine1 + "\n" + hdrLine2 + "\n"; + const BamHeader inputHeader(hdrText); + + // setup record + BamRecord record(inputHeader); + record.Impl().Name("test/100/0_5"); + record.Impl().SetSequenceAndQualities("ACGTC", 5, "@@@@@"); + record.Impl().CigarData(""); + record.Impl().Bin(0); + record.Impl().Flag(0); + record.Impl().InsertSize(0); + record.Impl().MapQuality(0); + record.Impl().MatePosition(-1); + record.Impl().MateReferenceId(-1); + record.Impl().Position(-1); + record.Impl().ReferenceId(-1); + record.Impl().SetMapped(false); + + TagCollection tags; + tags["zm"] = static_cast(100); + tags["qs"] = static_cast(0); + tags["qe"] = static_cast(5); + tags["np"] = static_cast(1); + tags["rq"] = 
static_cast(0.6); + tags["RG"] = std::string{ "6002b307" }; + tags["sn"] = vector{0.2f,0.2f,0.2f,0.2f}; + record.Impl().Tags(tags); + + const string expectedSamRecord = { + "test/100/0_5\t4\t*\t0\t0\t*\t*\t0\t0\tACGTC\t@@@@@\tRG:Z:6002b307\t" + "np:i:1\tqe:i:5\tqs:i:0\trq:f:0.6\tsn:B:f,0.2,0.2,0.2,0.2\tzm:i:100" + }; + + EXPECT_NO_THROW( + { + // write data to file + const string generatedFn = tests::GeneratedData_Dir + "/samwriter_hdr_and_record.sam"; + { + SamWriter writer(generatedFn, inputHeader); + writer.Write(record); + }; + + // check header & record + { + ifstream f(generatedFn); + string line1; + string line2; + string line3; + std::getline(f, line1); + std::getline(f, line2); + std::getline(f, line3); + EXPECT_EQ(hdrLine1, line1); + EXPECT_EQ(hdrLine2, line2); + EXPECT_EQ(expectedSamRecord, line3); + } + + // cleanup + remove(generatedFn.c_str()); + }); +} diff --git a/tests/src/test_SequenceUtils.cpp b/tests/src/test_SequenceUtils.cpp new file mode 100644 index 0000000..20bf5e6 --- /dev/null +++ b/tests/src/test_SequenceUtils.cpp @@ -0,0 +1,117 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. 
+// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. + +// Author: Derek Barnett + +#ifdef PBBAM_TESTING +#define private public +#endif + +#include +#include +#include +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace PacBio::BAM::internal; +using namespace std; + +TEST(SequenceUtilsTest, ComplementChar) +{ + // complement + const char A = 'A'; // T + const char B = 'B'; // V + const char C = 'C'; // G + const char D = 'D'; // H + const char E = 'E'; // null + const char F = 'F'; // null + const char G = 'G'; // C + const char H = 'H'; // D + const char I = 'I'; // null + const char J = 'J'; // null + const char K = 'K'; // M + const char L = 'L'; // null + const char M = 'M'; // K + const char N = 'N'; // N + const char O = 'O'; // null + const char P = 'P'; // null + const char Q = 'Q'; // null + const char R = 'R'; // Y + const char S = 'S'; // S + const char T = 'T'; // A + const char U = 'U'; // A + const char V = 'V'; // B + const char W = 'W'; // W + const char X = 'X'; // null + const char Y = 'Y'; // R + const char Z = 'Z'; // null + + EXPECT_EQ(T, Complement(A)); + 
EXPECT_EQ(V, Complement(B)); + EXPECT_EQ(G, Complement(C)); + EXPECT_EQ(H, Complement(D)); + EXPECT_EQ(0, Complement(E)); + EXPECT_EQ(0, Complement(F)); + EXPECT_EQ(C, Complement(G)); + EXPECT_EQ(D, Complement(H)); + EXPECT_EQ(0, Complement(I)); + EXPECT_EQ(0, Complement(J)); + EXPECT_EQ(M, Complement(K)); + EXPECT_EQ(0, Complement(L)); + EXPECT_EQ(K, Complement(M)); + EXPECT_EQ(N, Complement(N)); + EXPECT_EQ(0, Complement(O)); + EXPECT_EQ(0, Complement(P)); + EXPECT_EQ(0, Complement(Q)); + EXPECT_EQ(Y, Complement(R)); + EXPECT_EQ(S, Complement(S)); + EXPECT_EQ(A, Complement(T)); + EXPECT_EQ(A, Complement(U)); + EXPECT_EQ(B, Complement(V)); + EXPECT_EQ(W, Complement(W)); + EXPECT_EQ(0, Complement(X)); + EXPECT_EQ(R, Complement(Y)); + EXPECT_EQ(0, Complement(Z)); +} + +TEST(SequenceUtilsTest, ReverseComplement) +{ + string input1 = "ATATATCCCGGCG"; + const string rc1 = "CGCCGGGATATAT"; + + ReverseComplement(input1); + EXPECT_EQ(rc1, input1); +} diff --git a/tests/src/test_StringUtils.cpp b/tests/src/test_StringUtils.cpp new file mode 100644 index 0000000..d335246 --- /dev/null +++ b/tests/src/test_StringUtils.cpp @@ -0,0 +1,70 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. 
+// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. 
+ +// Author: Derek Barnett + +#ifdef PBBAM_TESTING +#define private public +#endif + +#include +#include +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace PacBio::BAM::internal; +using namespace std; + +TEST(StringUtilsTest, BasicSplit) +{ + const string test = "foo\tbar\tbaz"; + const vector tokens = internal::Split(test, '\t'); + EXPECT_EQ(3, tokens.size()); + EXPECT_TRUE(tokens.at(0) == "foo"); + EXPECT_TRUE(tokens.at(1) == "bar"); + EXPECT_TRUE(tokens.at(2) == "baz"); +} + +TEST(StringUtilsTest, SplitKeepsEmptyTokens) +{ + const string test = "foo\tbar\t\tbaz"; + const vector tokens = internal::Split(test, '\t'); + EXPECT_EQ(4, tokens.size()); + EXPECT_TRUE(tokens.at(0) == "foo"); + EXPECT_TRUE(tokens.at(1) == "bar"); + EXPECT_TRUE(tokens.at(2) == ""); + EXPECT_TRUE(tokens.at(3) == "baz"); +} diff --git a/tests/src/test_SubreadLengthQuery.cpp b/tests/src/test_SubreadLengthQuery.cpp new file mode 100644 index 0000000..9d55e21 --- /dev/null +++ b/tests/src/test_SubreadLengthQuery.cpp @@ -0,0 +1,81 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. 
+// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. + +// Author: Derek Barnett + +#ifdef PBBAM_TESTING +#define private public +#endif + +#include "TestData.h" +#include +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace std; + +TEST(SubreadLengthQueryTest, QueryOk) +{ + const auto bamFile = BamFile{ tests::Data_Dir + string{ "/group/test2.bam" } }; + + { + int count = 0; + SubreadLengthQuery query(500, Compare::GREATER_THAN_EQUAL, bamFile); + for (const auto& r: query) { + ++count; + EXPECT_GE((r.QueryEnd() - r.QueryStart()), 500); + } + EXPECT_EQ(3, count); + } + { + int count = 0; + SubreadLengthQuery query(1000, Compare::GREATER_THAN_EQUAL, bamFile); + for (const auto& r: query) { + ++count; + EXPECT_GE((r.QueryEnd() - r.QueryStart()), 1000); + } + EXPECT_EQ(2, count); + } + { + int count = 0; + SubreadLengthQuery query(5000, Compare::GREATER_THAN_EQUAL, bamFile); + for (const auto& r: query) { + ++count; + EXPECT_GE((r.QueryEnd() - r.QueryStart()), 5000); + } + EXPECT_EQ(0, count); + } +} diff --git a/tests/src/test_Tags.cpp b/tests/src/test_Tags.cpp new file mode 100644 index 
0000000..6755204 --- /dev/null +++ b/tests/src/test_Tags.cpp @@ -0,0 +1,1144 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. 
+ +// Author: Derek Barnett + +#ifdef PBBAM_TESTING +#define private public +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +using namespace PacBio; +using namespace PacBio::BAM; +using namespace std; + +TEST(TagTest, TagConstruction) +{ + int8_t i8 = 0; + uint8_t u8 = 0; + int16_t i16 = 0; + uint16_t u16 = 0; + int32_t i32 = 0; + uint32_t u32 = 0; + float f = 0.0; + string str = ""; + vector i8_array; + vector u8_array; + vector i16_array; + vector u16_array; + vector i32_array; + vector u32_Array; + vector float_array; + + signed char c = 'A'; + unsigned char uc = 'A'; + + Tag i8Tag(i8); + Tag u8Tag(u8); + Tag i16Tag(i16); + Tag u16Tag(u16); + Tag i32Tag(i32); + Tag u32Tag(u32); + Tag floatTag(f); + Tag stringTag(str); + Tag i8_array_Tag(i8_array); + Tag u8_array_Tag(u8_array); + Tag i16_array_Tag(i16_array); + Tag u16_array_Tag(u16_array); + Tag i32_array_Tag(i32_array); + Tag u32_array_Tag(u32_Array); + Tag float_array_Tag(float_array); + + Tag charTag(c, TagModifier::ASCII_CHAR); + Tag ucharTag(uc, TagModifier::ASCII_CHAR); + + EXPECT_TRUE(i8Tag.Type() == TagDataType::INT8); + EXPECT_TRUE(u8Tag.Type() == TagDataType::UINT8); + EXPECT_TRUE(i16Tag.Type() == TagDataType::INT16); + EXPECT_TRUE(u16Tag.Type() == TagDataType::UINT16); + EXPECT_TRUE(i32Tag.Type() == TagDataType::INT32); + EXPECT_TRUE(u32Tag.Type() == TagDataType::UINT32); + EXPECT_TRUE(floatTag.Type() == TagDataType::FLOAT); + EXPECT_TRUE(stringTag.Type() == TagDataType::STRING); + EXPECT_TRUE(i8_array_Tag.Type() == TagDataType::INT8_ARRAY); + EXPECT_TRUE(u8_array_Tag.Type() == TagDataType::UINT8_ARRAY); + EXPECT_TRUE(i16_array_Tag.Type() == TagDataType::INT16_ARRAY); + EXPECT_TRUE(u16_array_Tag.Type() == TagDataType::UINT16_ARRAY); + EXPECT_TRUE(i32_array_Tag.Type() == TagDataType::INT32_ARRAY); + EXPECT_TRUE(u32_array_Tag.Type() == TagDataType::UINT32_ARRAY); + EXPECT_TRUE(float_array_Tag.Type() == TagDataType::FLOAT_ARRAY); + + 
EXPECT_TRUE(charTag.ToAscii() == 'A'); + EXPECT_TRUE(ucharTag.ToAscii() == 'A'); +} + +TEST(TagTest, CopyAndCompare) +{ + int8_t i8 = 0; + uint8_t u8 = 0; + int16_t i16 = 0; + uint16_t u16 = 0; + int32_t i32 = 0; + uint32_t u32 = 0; + float f = 0.0; + string str = ""; + vector i8_array; + vector u8_array; + vector i16_array; + vector u16_array; + vector i32_array; + vector u32_Array; + vector float_array; + + Tag i8Tag(i8); + Tag u8Tag(u8); + Tag i16Tag(i16); + Tag u16Tag(u16); + Tag i32Tag(i32); + Tag u32Tag(u32); + Tag floatTag(f); + Tag stringTag(str); + Tag i8_array_Tag(i8_array); + Tag u8_array_Tag(u8_array); + Tag i16_array_Tag(i16_array); + Tag u16_array_Tag(u16_array); + Tag i32_array_Tag(i32_array); + Tag u32_array_Tag(u32_Array); + Tag float_array_Tag(float_array); + + Tag i8Tag2 = i8Tag; + Tag u8Tag2 = u8Tag; + Tag i16Tag2 = i16Tag; + Tag u16Tag2 = u16Tag; + Tag i32Tag2 = i32Tag; + Tag u32Tag2 = u32Tag; + Tag floatTag2 = floatTag; + Tag stringTag2 = stringTag; + Tag i8_array_Tag2 = i8_array_Tag; + Tag u8_array_Tag2 = u8_array_Tag; + Tag i16_array_Tag2 = i16_array_Tag; + Tag u16_array_Tag2 = u16_array_Tag; + Tag i32_array_Tag2 = i32_array_Tag; + Tag u32_array_Tag2 = u32_array_Tag; + Tag float_array_Tag2 = float_array_Tag; + + EXPECT_EQ(i8Tag, i8Tag2); + EXPECT_EQ(u8Tag, u8Tag2); + EXPECT_EQ(i16Tag, i16Tag2); + EXPECT_EQ(u16Tag, u16Tag2); + EXPECT_EQ(i32Tag, i32Tag2); + EXPECT_EQ(u32Tag, u32Tag2); + EXPECT_EQ(floatTag, floatTag2); + EXPECT_EQ(stringTag, stringTag2); + EXPECT_EQ(i8_array_Tag, i8_array_Tag2); + EXPECT_EQ(u8_array_Tag, u8_array_Tag2); + EXPECT_EQ(i16_array_Tag, i16_array_Tag2); + EXPECT_EQ(u16_array_Tag, u16_array_Tag2); + EXPECT_EQ(i32_array_Tag, i32_array_Tag2); + EXPECT_EQ(u32_array_Tag, u32_array_Tag2); + EXPECT_EQ(float_array_Tag, float_array_Tag2); +} + +TEST(TagTest, Type_None) +{ + Tag tag; + + EXPECT_TRUE(tag.Type() == TagDataType::INVALID); + EXPECT_TRUE(tag.IsNull()); + EXPECT_TRUE(tag.Typename() == "none"); + + 
EXPECT_FALSE(tag.IsNumeric()); + EXPECT_FALSE(tag.IsString()); + EXPECT_FALSE(tag.IsArray()); +} + +TEST(TagTest, Type_Int8) +{ + const int8_t v = -42; + const Tag tag(v); + + int8_t v2; + EXPECT_NO_THROW(v2 = tag.ToInt8()); + + EXPECT_TRUE(tag.Type() == TagDataType::INT8); + EXPECT_TRUE(tag.Typename() == "int8_t"); + EXPECT_TRUE(tag.IsInt8()); + + EXPECT_TRUE(tag.IsSignedInt()); + EXPECT_TRUE(tag.IsIntegral()); + EXPECT_TRUE(tag.IsNumeric()); + + EXPECT_FALSE(tag.IsUnsignedInt()); + EXPECT_FALSE(tag.IsNull()); + EXPECT_FALSE(tag.IsFloat()); + EXPECT_FALSE(tag.IsString()); + EXPECT_FALSE(tag.IsArray()); + + EXPECT_EQ(v, v2); +} + +TEST(TagTest, Type_UInt8) +{ + const uint8_t v = 42; + const Tag tag(v); + + uint8_t v2; + EXPECT_NO_THROW(v2 = tag.ToUInt8()); + + EXPECT_TRUE(tag.Type() == TagDataType::UINT8); + EXPECT_TRUE(tag.Typename() == "uint8_t"); + EXPECT_TRUE(tag.IsUInt8()); + + EXPECT_TRUE(tag.IsUnsignedInt()); + EXPECT_TRUE(tag.IsIntegral()); + EXPECT_TRUE(tag.IsNumeric()); + + EXPECT_FALSE(tag.IsSignedInt()); + EXPECT_FALSE(tag.IsNull()); + EXPECT_FALSE(tag.IsFloat()); + EXPECT_FALSE(tag.IsString()); + EXPECT_FALSE(tag.IsArray()); + + EXPECT_EQ(v, v2); +} + +TEST(TagTest, Type_Ascii) +{ + const char c = '$'; + const signed char sc = '$'; + const unsigned char uc = '$'; + const uint8_t u8 = 65; + const int8_t i8 = 66; + + { // old style: construct-then-modify + + Tag fromPlainChar = Tag(c); + Tag fromSignedChar = Tag(sc); + Tag fromUnsignedChar = Tag(uc); + Tag fromUint8 = Tag(u8); + Tag fromInt8 = Tag(i8); + fromPlainChar.Modifier(TagModifier::ASCII_CHAR); + fromSignedChar.Modifier(TagModifier::ASCII_CHAR); + fromUnsignedChar.Modifier(TagModifier::ASCII_CHAR); + fromUint8.Modifier(TagModifier::ASCII_CHAR); + fromInt8.Modifier(TagModifier::ASCII_CHAR); + + EXPECT_TRUE(fromPlainChar.HasModifier(TagModifier::ASCII_CHAR)); + EXPECT_TRUE(fromPlainChar.IsIntegral()); + EXPECT_TRUE(fromPlainChar.IsNumeric()); + EXPECT_EQ('$', fromPlainChar.ToAscii()); + + 
EXPECT_TRUE(fromSignedChar.HasModifier(TagModifier::ASCII_CHAR)); + EXPECT_TRUE(fromSignedChar.IsIntegral()); + EXPECT_TRUE(fromSignedChar.IsNumeric()); + EXPECT_EQ('$', fromSignedChar.ToAscii()); + + EXPECT_TRUE(fromUnsignedChar.HasModifier(TagModifier::ASCII_CHAR)); + EXPECT_TRUE(fromUnsignedChar.IsIntegral()); + EXPECT_TRUE(fromUnsignedChar.IsNumeric()); + EXPECT_EQ('$', fromUnsignedChar.ToAscii()); + + EXPECT_TRUE(fromUint8.HasModifier(TagModifier::ASCII_CHAR)); + EXPECT_TRUE(fromUint8.IsIntegral()); + EXPECT_TRUE(fromUint8.IsNumeric()); + EXPECT_EQ('A', fromUint8.ToAscii()); + + EXPECT_TRUE(fromInt8.HasModifier(TagModifier::ASCII_CHAR)); + EXPECT_TRUE(fromInt8.IsIntegral()); + EXPECT_TRUE(fromInt8.IsNumeric()); + EXPECT_EQ('B', fromInt8.ToAscii()); + } + + { // new style: construct directly as ASCII + + const Tag fromPlainChar = Tag(c, TagModifier::ASCII_CHAR); + const Tag fromSignedChar = Tag(sc, TagModifier::ASCII_CHAR); + const Tag fromUnsignedChar = Tag(uc, TagModifier::ASCII_CHAR); + const Tag fromUint8 = Tag(u8, TagModifier::ASCII_CHAR); + const Tag fromInt8 = Tag(i8, TagModifier::ASCII_CHAR); + + EXPECT_TRUE(fromPlainChar.HasModifier(TagModifier::ASCII_CHAR)); + EXPECT_TRUE(fromPlainChar.IsIntegral()); + EXPECT_TRUE(fromPlainChar.IsNumeric()); + EXPECT_EQ('$', fromPlainChar.ToAscii()); + + EXPECT_TRUE(fromSignedChar.HasModifier(TagModifier::ASCII_CHAR)); + EXPECT_TRUE(fromSignedChar.IsIntegral()); + EXPECT_TRUE(fromSignedChar.IsNumeric()); + EXPECT_EQ('$', fromSignedChar.ToAscii()); + + EXPECT_TRUE(fromUnsignedChar.HasModifier(TagModifier::ASCII_CHAR)); + EXPECT_TRUE(fromUnsignedChar.IsIntegral()); + EXPECT_TRUE(fromUnsignedChar.IsNumeric()); + EXPECT_EQ('$', fromUnsignedChar.ToAscii()); + + EXPECT_TRUE(fromUint8.HasModifier(TagModifier::ASCII_CHAR)); + EXPECT_TRUE(fromUint8.IsIntegral()); + EXPECT_TRUE(fromUint8.IsNumeric()); + EXPECT_EQ('A', fromUint8.ToAscii()); + + EXPECT_TRUE(fromInt8.HasModifier(TagModifier::ASCII_CHAR)); + 
EXPECT_TRUE(fromInt8.IsIntegral()); + EXPECT_TRUE(fromInt8.IsNumeric()); + EXPECT_EQ('B', fromInt8.ToAscii()); + } + + // check invalid constructs + EXPECT_THROW(Tag('A', TagModifier::HEX_STRING), std::runtime_error); +} + +TEST(TagTest, Type_Int16) +{ + const int16_t v = -42; + const Tag tag(v); + + int16_t v2; + EXPECT_NO_THROW(v2 = tag.ToInt16()); + + EXPECT_TRUE(tag.Type() == TagDataType::INT16); + EXPECT_TRUE(tag.Typename() == "int16_t"); + EXPECT_TRUE(tag.IsInt16()); + EXPECT_TRUE(tag.IsSignedInt()); + EXPECT_TRUE(tag.IsIntegral()); + EXPECT_TRUE(tag.IsNumeric()); + + EXPECT_FALSE(tag.IsUnsignedInt()); + EXPECT_FALSE(tag.IsNull()); + EXPECT_FALSE(tag.IsFloat()); + EXPECT_FALSE(tag.IsString()); + EXPECT_FALSE(tag.IsArray()); + + EXPECT_EQ(v, v2); +} + +TEST(TagTest, Type_UInt16) +{ + const uint16_t v = 42; + const Tag tag(v); + + uint16_t v2; + EXPECT_NO_THROW(v2 = tag.ToUInt16()); + + EXPECT_TRUE(tag.Type() == TagDataType::UINT16); + EXPECT_TRUE(tag.Typename() == "uint16_t"); + EXPECT_TRUE(tag.IsUInt16()); + EXPECT_TRUE(tag.IsUnsignedInt()); + EXPECT_TRUE(tag.IsIntegral()); + EXPECT_TRUE(tag.IsNumeric()); + + EXPECT_FALSE(tag.IsSignedInt()); + EXPECT_FALSE(tag.IsNull()); + EXPECT_FALSE(tag.IsFloat()); + EXPECT_FALSE(tag.IsString()); + EXPECT_FALSE(tag.IsArray()); + + EXPECT_EQ(v, v2); +} + +TEST(TagTest, Type_Int32) +{ + const int32_t v = -42; + const Tag tag(v); + + int32_t v2; + EXPECT_NO_THROW(v2 = tag.ToInt32()); + + EXPECT_TRUE(tag.Type() == TagDataType::INT32); + EXPECT_TRUE(tag.Typename() == "int32_t"); + EXPECT_TRUE(tag.IsInt32()); + EXPECT_TRUE(tag.IsSignedInt()); + EXPECT_TRUE(tag.IsIntegral()); + EXPECT_TRUE(tag.IsNumeric()); + + EXPECT_FALSE(tag.IsUnsignedInt()); + EXPECT_FALSE(tag.IsNull()); + EXPECT_FALSE(tag.IsFloat()); + EXPECT_FALSE(tag.IsString()); + EXPECT_FALSE(tag.IsArray()); + + EXPECT_EQ(v, v2); +} + +TEST(TagTest, Type_UInt32) +{ + const uint32_t v = 42; + const Tag tag(v); + + uint32_t v2; + EXPECT_NO_THROW(v2 = tag.ToUInt32()); + + 
EXPECT_TRUE(tag.Type() == TagDataType::UINT32); + EXPECT_TRUE(tag.Typename() == "uint32_t"); + EXPECT_TRUE(tag.IsUInt32()); + EXPECT_TRUE(tag.IsUnsignedInt()); + EXPECT_TRUE(tag.IsIntegral()); + EXPECT_TRUE(tag.IsNumeric()); + + EXPECT_FALSE(tag.IsSignedInt()); + EXPECT_FALSE(tag.IsNull()); + EXPECT_FALSE(tag.IsFloat()); + EXPECT_FALSE(tag.IsString()); + EXPECT_FALSE(tag.IsArray()); + + EXPECT_EQ(v, v2); +} + +TEST(TagTest, Type_Float) +{ + const float v = 3.141; + const Tag tag(v); + + float v2; + EXPECT_NO_THROW(v2 = tag.ToFloat()); + + EXPECT_TRUE(tag.Type() == TagDataType::FLOAT); + EXPECT_TRUE(tag.Typename() == "float"); + EXPECT_TRUE(tag.IsFloat()); + EXPECT_TRUE(tag.IsNumeric()); + + EXPECT_FALSE(tag.IsNull()); + EXPECT_FALSE(tag.IsIntegral()); + EXPECT_FALSE(tag.IsString()); + EXPECT_FALSE(tag.IsArray()); + + EXPECT_EQ(v, v2); +} + +TEST(TagTest, Type_String) +{ + const string v = "foo_who"; + const Tag tag(v); + + string v2; + EXPECT_NO_THROW(v2 = tag.ToString()); + + EXPECT_TRUE(tag.Type() == TagDataType::STRING); + EXPECT_TRUE(tag.Typename() == "string"); + EXPECT_TRUE(tag.IsString()); + + EXPECT_FALSE(tag.IsNull()); + EXPECT_FALSE(tag.IsNumeric()); + EXPECT_FALSE(tag.IsArray()); + + EXPECT_EQ(v, v2); + + // "Hex format" string + const Tag hex("DEADBEEF", TagModifier::HEX_STRING); + EXPECT_TRUE(hex.Type() == TagDataType::STRING); + EXPECT_TRUE(hex.Typename() == "string"); + EXPECT_TRUE(hex.IsString()); + EXPECT_TRUE(hex.HasModifier(TagModifier::HEX_STRING)); + EXPECT_FALSE(hex.IsNull()); + EXPECT_FALSE(hex.IsNumeric()); + EXPECT_FALSE(hex.IsArray()); + + // check invalid constructs + EXPECT_THROW(Tag("DEADBEEF", TagModifier::ASCII_CHAR), std::runtime_error); +} + +TEST(TagTest, Type_Int8Array) +{ + const vector v = { -42, 100, 0 }; + const Tag tag(v); + + vector v2; + EXPECT_NO_THROW(v2 = tag.ToInt8Array()); + + EXPECT_TRUE(tag.Type() == TagDataType::INT8_ARRAY); + EXPECT_TRUE(tag.Typename() == "vector"); + EXPECT_TRUE(tag.IsInt8Array()); + 
EXPECT_TRUE(tag.IsSignedArray()); + EXPECT_TRUE(tag.IsIntegralArray()); + EXPECT_TRUE(tag.IsArray()); + + EXPECT_FALSE(tag.IsFloat()); + EXPECT_FALSE(tag.IsString()); + EXPECT_FALSE(tag.IsNull()); + EXPECT_FALSE(tag.IsNumeric()); + + EXPECT_EQ(v, v2); +} + +TEST(TagTest, Type_UInt8Array) +{ + const vector v = { 42, 200, 0 }; + const Tag tag(v); + + vector v2; + EXPECT_NO_THROW(v2 = tag.ToUInt8Array()); + + EXPECT_TRUE(tag.Type() == TagDataType::UINT8_ARRAY); + EXPECT_TRUE(tag.Typename() == "vector"); + EXPECT_TRUE(tag.IsUInt8Array()); + EXPECT_TRUE(tag.IsUnsignedArray()); + EXPECT_TRUE(tag.IsIntegralArray()); + EXPECT_TRUE(tag.IsArray()); + + EXPECT_FALSE(tag.IsFloat()); + EXPECT_FALSE(tag.IsString()); + EXPECT_FALSE(tag.IsNull()); + EXPECT_FALSE(tag.IsNumeric()); + + EXPECT_EQ(v, v2); +} + +TEST(TagTest, Type_Int16Array) +{ + const vector v = { 42, -300, 0 }; + const Tag tag(v); + + vector v2; + EXPECT_NO_THROW(v2 = tag.ToInt16Array()); + + EXPECT_TRUE(tag.Type() == TagDataType::INT16_ARRAY); + EXPECT_TRUE(tag.Typename() == "vector"); + EXPECT_TRUE(tag.IsInt16Array()); + EXPECT_TRUE(tag.IsSignedArray()); + EXPECT_TRUE(tag.IsIntegralArray()); + EXPECT_TRUE(tag.IsArray()); + + EXPECT_FALSE(tag.IsFloat()); + EXPECT_FALSE(tag.IsString()); + EXPECT_FALSE(tag.IsNull()); + EXPECT_FALSE(tag.IsNumeric()); + + EXPECT_EQ(v, v2); +} + +TEST(TagTest, Type_UInt16Array) +{ + const vector v = { 42, 300, 0 }; + const Tag tag(v); + + vector v2; + EXPECT_NO_THROW(v2 = tag.ToUInt16Array()); + + EXPECT_TRUE(tag.Type() == TagDataType::UINT16_ARRAY); + EXPECT_TRUE(tag.Typename() == "vector"); + EXPECT_TRUE(tag.IsUInt16Array()); + EXPECT_TRUE(tag.IsUnsignedArray()); + EXPECT_TRUE(tag.IsIntegralArray()); + EXPECT_TRUE(tag.IsArray()); + + EXPECT_FALSE(tag.IsFloat()); + EXPECT_FALSE(tag.IsString()); + EXPECT_FALSE(tag.IsNull()); + EXPECT_FALSE(tag.IsNumeric()); + + EXPECT_EQ(v, v2);; +} + +TEST(TagTest, Type_Int32Array) +{ + const vector v = { 42, -300, 0 }; + const Tag tag(v); + + vector 
v2; + EXPECT_NO_THROW(v2 = tag.ToInt32Array()); + + EXPECT_TRUE(tag.Type() == TagDataType::INT32_ARRAY); + EXPECT_TRUE(tag.Typename() == "vector"); + EXPECT_TRUE(tag.IsInt32Array()); + EXPECT_TRUE(tag.IsSignedArray()); + EXPECT_TRUE(tag.IsIntegralArray()); + EXPECT_TRUE(tag.IsArray()); + + EXPECT_FALSE(tag.IsFloat()); + EXPECT_FALSE(tag.IsString()); + EXPECT_FALSE(tag.IsNull()); + EXPECT_FALSE(tag.IsNumeric()); + + EXPECT_EQ(v, v2); +} + +TEST(TagTest, Type_UInt32Array) +{ + const vector v = { 42, 300, 0 }; + const Tag tag(v); + + vector v2; + EXPECT_NO_THROW(v2 = tag.ToUInt32Array()); + + EXPECT_TRUE(tag.Type() == TagDataType::UINT32_ARRAY); + EXPECT_TRUE(tag.Typename() == "vector"); + EXPECT_TRUE(tag.IsUInt32Array()); + EXPECT_TRUE(tag.IsUnsignedArray()); + EXPECT_TRUE(tag.IsIntegralArray()); + EXPECT_TRUE(tag.IsArray()); + + EXPECT_FALSE(tag.IsFloat()); + EXPECT_FALSE(tag.IsString()); + EXPECT_FALSE(tag.IsNull()); + EXPECT_FALSE(tag.IsNumeric()); + + EXPECT_EQ(v, v2); +} + +TEST(TagTest, Type_FloatArray) +{ + const vector v = { 1.1f, 1.2f, 1.3f }; + const Tag tag(v); + + vector v2; + EXPECT_NO_THROW(v2 = tag.ToFloatArray()); + + EXPECT_TRUE(tag.Type() == TagDataType::FLOAT_ARRAY); + EXPECT_TRUE(tag.Typename() == "vector"); + EXPECT_TRUE(tag.IsFloatArray()); + EXPECT_TRUE(tag.IsArray()); + + EXPECT_FALSE(tag.IsIntegralArray()); + EXPECT_FALSE(tag.IsFloat()); + EXPECT_FALSE(tag.IsString()); + EXPECT_FALSE(tag.IsNull()); + EXPECT_FALSE(tag.IsNumeric()); + + EXPECT_EQ(v, v2); +} + +TEST(TagTest, CastBackToOriginalOk) +{ + int8_t i8 = 0; + uint8_t u8 = 0; + int16_t i16 = 0; + uint16_t u16 = 0; + int32_t i32 = 0; + uint32_t u32 = 0; + float f = 0.0; + string str = ""; + vector i8_array; + vector u8_array; + vector i16_array; + vector u16_array; + vector i32_array; + vector u32_array; + vector float_array; + + Tag i8Tag(i8); + Tag u8Tag(u8); + Tag i16Tag(i16); + Tag u16Tag(u16); + Tag i32Tag(i32); + Tag u32Tag(u32); + Tag floatTag(f); + Tag stringTag(str); + Tag 
i8_array_Tag(i8_array); + Tag u8_array_Tag(u8_array); + Tag i16_array_Tag(i16_array); + Tag u16_array_Tag(u16_array); + Tag i32_array_Tag(i32_array); + Tag u32_array_Tag(u32_array); + Tag float_array_Tag(float_array); + + EXPECT_NO_THROW({ + i8 = i8Tag.ToInt8(); + u8 = u8Tag.ToUInt8(); + i16 = i16Tag.ToInt16(); + u16 = u16Tag.ToUInt16(); + i32 = i32Tag.ToInt32(); + u32 = u32Tag.ToUInt32(); + f = floatTag.ToFloat(); + str = stringTag.ToString(); + i8_array = i8_array_Tag.ToInt8Array(); + u8_array = u8_array_Tag.ToUInt8Array(); + i16_array = i16_array_Tag.ToInt16Array(); + u16_array = u16_array_Tag.ToUInt16Array(); + i32_array = i32_array_Tag.ToInt32Array(); + u32_array = u32_array_Tag.ToUInt32Array(); + float_array = float_array_Tag.ToFloatArray(); + }); +} + +TEST(TagTest, ConvertToInt8) +{ + Tag zero(int32_t(0)); + Tag min(int32_t(INT8_MIN)); + Tag normal(int32_t(42)); + Tag max(int32_t(INT8_MAX)); + Tag underflow(int32_t(INT8_MIN-1)); + Tag overflow(int32_t(INT8_MAX+1)); + Tag floatTag(float(3.14)); + Tag stringTag(string("foo")); + Tag arrayTag(vector({1, 2, 3})); + + // allowed + EXPECT_NO_THROW( + { + zero.ToInt8(); + min.ToInt8(); + normal.ToInt8(); + max.ToInt8(); + }); + + // not allowed + EXPECT_THROW(underflow.ToInt8(), std::exception); + EXPECT_THROW(overflow.ToInt8(), std::exception); + EXPECT_THROW(floatTag.ToInt8(), std::exception); + EXPECT_THROW(stringTag.ToInt8(), std::exception); + EXPECT_THROW(arrayTag.ToInt8(), std::exception); +} + +TEST(TagTest, ConvertToUInt8) +{ + Tag zero(int32_t(0)); + Tag neg(int32_t(-1)); + Tag normal(int32_t(42)); + Tag max(int32_t(UINT8_MAX)); + Tag overflow(int32_t(UINT8_MAX+1)); + Tag floatTag(float(3.14)); + Tag stringTag(string("foo")); + Tag arrayTag(vector({1, 2, 3})); + + // allowed + EXPECT_NO_THROW( + { + zero.ToUInt8(); + normal.ToUInt8(); + max.ToUInt8(); + }); + + // not allowed + EXPECT_THROW(neg.ToUInt8(), std::exception); + EXPECT_THROW(overflow.ToUInt8(), std::exception); + 
EXPECT_THROW(floatTag.ToUInt8(), std::exception); + EXPECT_THROW(stringTag.ToUInt8(), std::exception); + EXPECT_THROW(arrayTag.ToUInt8(), std::exception); +} + +TEST(TagTest, ConvertToInt16) +{ + Tag zero(int32_t(0)); + Tag min(int32_t(INT16_MIN)); + Tag normal(int32_t(42)); + Tag max(int32_t(INT16_MAX)); + Tag underflow(int32_t(INT16_MIN-1)); + Tag overflow(int32_t(INT16_MAX+1)); + Tag floatTag(float(3.14)); + Tag stringTag(string("foo")); + Tag arrayTag(vector({1, 2, 3})); + + // allowed + EXPECT_NO_THROW( + { + zero.ToInt16(); + min.ToInt16(); + normal.ToInt16(); + max.ToInt16(); + }); + + // not allowed + EXPECT_THROW(underflow.ToInt16(), std::exception); + EXPECT_THROW(overflow.ToInt16(), std::exception); + EXPECT_THROW(floatTag.ToInt16(), std::exception); + EXPECT_THROW(stringTag.ToInt16(), std::exception); + EXPECT_THROW(arrayTag.ToInt16(), std::exception); +} + +TEST(TagTest, ConvertToUInt16) +{ + Tag zero(int32_t(0)); + Tag neg(int32_t(-1)); + Tag normal(int32_t(42)); + Tag max(int32_t(UINT16_MAX)); + Tag overflow(int32_t(UINT16_MAX+1)); + Tag floatTag(float(3.14)); + Tag stringTag(string("foo")); + Tag arrayTag(vector({1, 2, 3})); + + // allowed + EXPECT_NO_THROW( + { + zero.ToUInt16(); + normal.ToUInt16(); + max.ToUInt16(); + }); + + // not allowed + EXPECT_THROW(neg.ToUInt16(), std::exception); + EXPECT_THROW(overflow.ToUInt16(), std::exception); + EXPECT_THROW(floatTag.ToUInt16(), std::exception); + EXPECT_THROW(stringTag.ToUInt16(), std::exception); + EXPECT_THROW(arrayTag.ToUInt16(), std::exception); +} + +TEST(TagTest, ConvertToInt32) +{ + Tag zero(int32_t(0)); + Tag min(int32_t(INT32_MIN)); + Tag normal(int32_t(42)); + Tag max(int32_t(INT32_MAX)); + Tag floatTag(float(3.14)); + Tag stringTag(string("foo")); + Tag arrayTag(vector({1, 2, 3})); + + // no 64-bit ctors - will not compile + // + // Tag underflow(int64_t(INT32_MIN-1)); + // Tag overflow(int64_t(INT32_MAX+1)); + + // allowed + EXPECT_NO_THROW( + { + zero.ToInt32(); + min.ToInt32(); + 
normal.ToInt32(); + max.ToInt32(); + }); + + // not allowed + EXPECT_THROW(floatTag.ToInt32(), std::exception); + EXPECT_THROW(stringTag.ToInt32(), std::exception); + EXPECT_THROW(arrayTag.ToInt32(), std::exception); +} + +TEST(TagTest, ConvertToUInt32) +{ + Tag zero(int32_t(0)); + Tag neg(int32_t(-1)); + Tag normal(int32_t(42)); + Tag max(uint32_t(UINT32_MAX)); + Tag floatTag(float(3.14)); + Tag stringTag(string("foo")); + Tag arrayTag(vector({1, 2, 3})); + + // no 64-bit ctors - will not compile + // + // Tag overflow(int64_t(UINT32_MAX+1)); + + // allowed + EXPECT_NO_THROW( + { + zero.ToUInt32(); + normal.ToUInt32(); + max.ToUInt32(); + }); + + // not allowed + EXPECT_THROW(neg.ToUInt32(), std::exception); + EXPECT_THROW(floatTag.ToUInt32(), std::exception); + EXPECT_THROW(stringTag.ToUInt32(), std::exception); + EXPECT_THROW(arrayTag.ToUInt32(), std::exception); +} + +TEST(TagCollectionTest, DefaultConstruction) +{ + TagCollection tags; + EXPECT_TRUE(tags.empty()); + EXPECT_FALSE(tags.Contains("XY")); +} + +TEST(TagCollectionTest, AddSimpleTags) +{ + const int32_t intValue = -42; + const string strValue = "foo"; + const string hexStrValue = "1abc75"; + + TagCollection tags; + tags["ST"] = strValue; + tags["XY"] = intValue; + tags["HX"] = hexStrValue; + tags["HX"].Modifier(TagModifier::HEX_STRING); + + EXPECT_EQ(3, tags.size()); + EXPECT_TRUE(tags.Contains("XY")); + EXPECT_TRUE(tags.Contains("ST")); + EXPECT_TRUE(tags.Contains("HX")); + EXPECT_FALSE(tags.Contains("ZZ")); + + EXPECT_TRUE( tags["XY"].ToInt32() == intValue ); + EXPECT_TRUE( tags["ST"].ToString() == strValue ); + EXPECT_TRUE( tags["HX"].ToString() == hexStrValue ); + EXPECT_TRUE( tags["HX"].HasModifier(TagModifier::HEX_STRING) ); +} + +TEST(SamTagCodecTest, DecodeTest) +{ + string tagString; + tagString.append("HX:H:1abc75"); + tagString.append("\t"); + tagString.append("ST:Z:foo"); + tagString.append("\t"); + tagString.append("VC:B:i,42,-100,37,2048"); + tagString.append("\t"); + 
tagString.append("XY:i:-42"); + + TagCollection expected; + expected["ST"] = string("foo"); + expected["XY"] = int32_t(-42); + expected["HX"] = Tag("1abc75", TagModifier::HEX_STRING); + expected["VC"] = vector( { 42, -100, 37, 2048 } ); + + TagCollection tags = SamTagCodec::Decode(tagString); + + EXPECT_TRUE(tags.Contains("ST")); + EXPECT_TRUE(tags.Contains("HX")); + EXPECT_TRUE(tags.Contains("XY")); + EXPECT_TRUE(tags.Contains("VC")); + + EXPECT_EQ(string("foo"), tags["ST"].ToString()); + EXPECT_TRUE(tags["HX"].HasModifier(TagModifier::HEX_STRING)); + EXPECT_EQ(string("1abc75"), tags["HX"].ToString()); + EXPECT_EQ((int8_t)-42, tags["XY"].ToInt8()); + EXPECT_EQ(vector( { 42, -100, 37, 2048 } ), tags["VC"].ToInt32Array()); +} + +TEST(SamTagCodecTest, EncodeTest) +{ + TagCollection tags; + tags["ST"] = string("foo"); + tags["XY"] = int32_t(-42); + tags["HX"] = Tag("1abc75", TagModifier::HEX_STRING); + tags["VC"] = vector( { 42, -100, 37, 2048 } ); + + // "HX:H:1abc75\tST:Z:foo\0\tVC:B:i,42,-100,37,2048\tXY:i:-42" + string expected; + expected.append("HX:H:1abc75"); + expected.append("\t"); + expected.append("ST:Z:foo"); + expected.append("\t"); + expected.append("VC:B:i,42,-100,37,2048"); + expected.append("\t"); + expected.append("XY:i:-42"); + + const string sam = SamTagCodec::Encode(tags); + EXPECT_EQ(expected, sam); +} + +TEST(BamTagCodecTest, DecodeTest) +{ + vector data; + data.push_back(uint8_t('H')); + data.push_back(uint8_t('X')); + data.push_back(uint8_t('H')); + data.push_back(uint8_t('1')); + data.push_back(uint8_t('a')); + data.push_back(uint8_t('b')); + data.push_back(uint8_t('c')); + data.push_back(uint8_t('7')); + data.push_back(uint8_t('5')); + data.push_back(uint8_t(0)); + + data.push_back(uint8_t('X')); + data.push_back(uint8_t('Y')); + data.push_back(uint8_t('i')); + const int32_t x = -42; + char valueBytes[sizeof x]; + std::copy(static_cast(static_cast(&x)), + static_cast(static_cast(&x)) + sizeof x, + valueBytes); + 
data.push_back(valueBytes[0]); + data.push_back(valueBytes[1]); + data.push_back(valueBytes[2]); + data.push_back(valueBytes[3]); + + data.push_back('C'); + data.push_back('A'); + data.push_back('B'); + data.push_back('C'); + const uint32_t numChars = 3; + char numCharsValueBytes[sizeof numChars]; + std::copy(static_cast(static_cast(&numChars)), + static_cast(static_cast(&numChars)) + sizeof numChars, + numCharsValueBytes); + data.push_back(numCharsValueBytes[0]); + data.push_back(numCharsValueBytes[1]); + data.push_back(numCharsValueBytes[2]); + data.push_back(numCharsValueBytes[3]); + + const vector charArray = vector({34, 5, 125}); + data.push_back(charArray.at(0)); + data.push_back(charArray.at(1)); + data.push_back(charArray.at(2)); + + TagCollection tags = BamTagCodec::Decode(data); + + EXPECT_TRUE(tags["HX"].HasModifier(TagModifier::HEX_STRING)); + EXPECT_EQ(string("1abc75"), tags["HX"].ToString()); + EXPECT_EQ(x, tags["XY"].ToInt32()); + EXPECT_EQ(charArray, tags["CA"].ToUInt8Array()); + + // sanity check - convert tags back to SAM + string expected; + expected.append("CA:B:C,34,5,125"); + expected.append("\t"); + expected.append("HX:H:1abc75"); + expected.append("\t"); + expected.append("XY:i:-42"); + + const string sam = SamTagCodec::Encode(tags); + EXPECT_EQ(expected, sam); +} + +TEST(BamTagCodecTest, EncodeTest) +{ + vector expected; + + expected.push_back('C'); + expected.push_back('A'); + expected.push_back('B'); + expected.push_back('C'); + const uint32_t numChars = 3; + char numCharsValueBytes[sizeof numChars]; + std::copy(static_cast(static_cast(&numChars)), + static_cast(static_cast(&numChars)) + sizeof numChars, + numCharsValueBytes); + expected.push_back(numCharsValueBytes[0]); + expected.push_back(numCharsValueBytes[1]); + expected.push_back(numCharsValueBytes[2]); + expected.push_back(numCharsValueBytes[3]); + + const vector charArray = vector({34, 5, 125}); + expected.push_back(charArray.at(0)); + expected.push_back(charArray.at(1)); + 
expected.push_back(charArray.at(2)); + + expected.push_back(uint8_t('H')); + expected.push_back(uint8_t('X')); + expected.push_back(uint8_t('H')); + expected.push_back(uint8_t('1')); + expected.push_back(uint8_t('a')); + expected.push_back(uint8_t('b')); + expected.push_back(uint8_t('c')); + expected.push_back(uint8_t('7')); + expected.push_back(uint8_t('5')); + expected.push_back(uint8_t(0)); + + expected.push_back(uint8_t('X')); + expected.push_back(uint8_t('Y')); + expected.push_back(uint8_t('i')); + const int32_t x = -42; + char valueBytes[sizeof x]; + std::copy(static_cast(static_cast(&x)), + static_cast(static_cast(&x)) + sizeof x, + valueBytes); + expected.push_back(valueBytes[0]); + expected.push_back(valueBytes[1]); + expected.push_back(valueBytes[2]); + expected.push_back(valueBytes[3]); + + TagCollection tags; + tags["HX"] = Tag("1abc75", TagModifier::HEX_STRING); + tags["CA"] = charArray; + tags["XY"] = x; + + const vector& data = BamTagCodec::Encode(tags); + EXPECT_EQ(expected, data); +} + +TEST(BamTagCodecTest, AsciiTagsTest) +{ + vector expected; + expected.reserve(20); + expected.push_back('I'); // I8:A:B + expected.push_back('8'); + expected.push_back('A'); + expected.push_back('B'); + expected.push_back('P'); // PC:A:$ + expected.push_back('C'); + expected.push_back('A'); + expected.push_back('$'); + expected.push_back('S'); // SC:A:$ + expected.push_back('C'); + expected.push_back('A'); + expected.push_back('$'); + expected.push_back('U'); // U8:A:A + expected.push_back('8'); + expected.push_back('A'); + expected.push_back('A'); + expected.push_back('U'); // UC:A:$ + expected.push_back('C'); + expected.push_back('A'); + expected.push_back('$'); + + const char c = '$'; + const signed char sc = '$'; + const unsigned char uc = '$'; + const uint8_t u8 = 65; + const int8_t i8 = 66; + + { // old style: construct-then-modify + + Tag fromPlainChar = Tag(c); + Tag fromSignedChar = Tag(sc); + Tag fromUnsignedChar = Tag(uc); + Tag fromUint8 = Tag(u8); + Tag 
fromInt8 = Tag(i8); + fromPlainChar.Modifier(TagModifier::ASCII_CHAR); + fromSignedChar.Modifier(TagModifier::ASCII_CHAR); + fromUnsignedChar.Modifier(TagModifier::ASCII_CHAR); + fromUint8.Modifier(TagModifier::ASCII_CHAR); + fromInt8.Modifier(TagModifier::ASCII_CHAR); + + TagCollection tags; + tags["PC"] = fromPlainChar; + tags["SC"] = fromSignedChar; + tags["UC"] = fromUnsignedChar; + tags["U8"] = fromUint8; + tags["I8"] = fromInt8; + + const vector& data = BamTagCodec::Encode(tags); + EXPECT_EQ(expected, data); + } + + { // new style: construct directly as ASCII + + const Tag fromPlainChar = Tag(c, TagModifier::ASCII_CHAR); + const Tag fromSignedChar = Tag(sc, TagModifier::ASCII_CHAR); + const Tag fromUnsignedChar = Tag(uc, TagModifier::ASCII_CHAR); + const Tag fromUint8 = Tag(u8, TagModifier::ASCII_CHAR); + const Tag fromInt8 = Tag(i8, TagModifier::ASCII_CHAR); + + TagCollection tags; + tags["PC"] = fromPlainChar; + tags["SC"] = fromSignedChar; + tags["UC"] = fromUnsignedChar; + tags["U8"] = fromUint8; + tags["I8"] = fromInt8; + + const vector& data = BamTagCodec::Encode(tags); + EXPECT_EQ(expected, data); + } +} diff --git a/tests/src/test_TimeUtils.cpp b/tests/src/test_TimeUtils.cpp new file mode 100644 index 0000000..90f1489 --- /dev/null +++ b/tests/src/test_TimeUtils.cpp @@ -0,0 +1,67 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. 
+// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. 
+ +// Author: Derek Barnett + +#ifdef PBBAM_TESTING +#define private public +#endif + +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace PacBio::BAM::internal; +using namespace std; + +TEST(TimeUtilsTest, ToIso8601) +{ + const time_t rawTime = 436428750L; + const auto timestamp = std::chrono::system_clock::from_time_t(rawTime); + + const auto expected = string{ "1983-10-31T06:12:30Z" }; // no ms in test case + const auto actual = internal::ToIso8601(timestamp); + EXPECT_EQ(expected, actual); +} + +TEST(TimeUtilsTest, ToDataSetFormat) +{ + const time_t rawTime = 436428750L; + const auto timestamp = std::chrono::system_clock::from_time_t(rawTime); + + const auto expected = string{ "831031_061230" }; // no ms in test case + const std::string& actual = internal::ToDataSetFormat(timestamp); + EXPECT_EQ(expected, actual); +} diff --git a/tests/src/test_UnmappedReadsQuery.cpp b/tests/src/test_UnmappedReadsQuery.cpp new file mode 100644 index 0000000..cf5f46a --- /dev/null +++ b/tests/src/test_UnmappedReadsQuery.cpp @@ -0,0 +1,117 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. 
+// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. + +// Author: Derek Barnett + +//#ifdef PBBAM_TESTING +//#define private public +//#endif + +//#include "TestData.h" +//#include +//#include + +//#include +//#include +//using namespace PacBio; +//using namespace PacBio::BAM; +//using namespace std; + +//const string inputBamFn1 = tests::Data_Dir + "/unmap1.bam"; +//const string inputBamFn2 = tests::Data_Dir + "/unmap2.bam"; + +//TEST(UnmappedReadsQueryTest, UnmappedOnlyFile) +//{ +// // open input BAM file +// BamFile bamFile(inputBamFn1); +// EXPECT_TRUE(bamFile); + +// // check all records, and save unmapped count +// int count = 0; +// int unmappedExpected = 0; +// EntireFileQuery entireFile(bamFile); +// EXPECT_TRUE(entireFile); +// for ( const BamRecord& record : entireFile ) { +// ++count; +// if (!record.IsMapped()) +// ++unmappedExpected; +// } +// EXPECT_EQ(10, count); +// EXPECT_EQ(10, unmappedExpected); + +// // query unmapped records only +// int unmappedObserved = 0; +// UnmappedReadsQuery unmappedReads(bamFile); +// EXPECT_TRUE(unmappedReads); +// for ( const BamRecord& record : unmappedReads ) { +// 
EXPECT_FALSE(record.IsMapped()); +// ++unmappedObserved; +// } +// EXPECT_EQ(unmappedExpected, unmappedObserved); +//} + +//TEST(UnmappedReadsQueryTest, MixedFile) +//{ +// // open input BAM file +// BamFile bamFile(inputBamFn2); +// EXPECT_TRUE(bamFile); + +// // check all records, and save unmapped count +// int count = 0; +// int unmappedExpected = 0; +// EntireFileQuery entireFile(bamFile); +// EXPECT_TRUE(entireFile); +// for ( const BamRecord& record : entireFile ) { +// ++count; +// if (!record.IsMapped()) +// ++unmappedExpected; +// } +// EXPECT_EQ(19, count); +// EXPECT_EQ(9, unmappedExpected); + +// // query unmapped records only +// int unmappedObserved = 0; +// UnmappedReadsQuery unmappedReads(bamFile); +// EXPECT_TRUE(unmappedReads); +// for ( const BamRecord& record : unmappedReads ) { +// EXPECT_FALSE(record.IsMapped()); +// ++unmappedObserved; +// } +// EXPECT_EQ(unmappedExpected, unmappedObserved); +//} + +// TODO: handle no index case + +// TODO: additional special cases as needed diff --git a/tests/src/test_Validator.cpp b/tests/src/test_Validator.cpp new file mode 100644 index 0000000..b3d0638 --- /dev/null +++ b/tests/src/test_Validator.cpp @@ -0,0 +1,615 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. 
+// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. 
+ +// Author: Derek Barnett + +#ifdef PBBAM_TESTING +#define private public +#endif + +#include +#include +#include +#include +#include +#include +#include + +#include "../src/StringUtils.h" +#include "../src/ValidationErrors.h" + +using namespace PacBio; +using namespace PacBio::BAM; +using namespace std; + +namespace PacBio { +namespace BAM { +namespace tests { + +static BamRecord makeValidMappedRecord(void) +{ + BamRecordImpl impl; + impl.Bin(4680); + impl.Flag(2); + impl.InsertSize(0); + impl.MapQuality(10); + impl.MatePosition(-1); + impl.MateReferenceId(-1); + impl.Name("movie1/54130/0_10"); + impl.Position(1); + impl.ReferenceId(0); + impl.SetMapped(true); + impl.SetSequenceAndQualities("AATGAGGAGA"); + impl.CigarData(Cigar{ "10=" }); + + TagCollection tags; + tags["RG"] = string{ "3f58e5b8" }; + tags["dq"] = string{ "2222'$22'2" }; + tags["dt"] = string{ "NNNNAGNNGN" }; + tags["iq"] = string{ "(+#1'$#*1&" }; + tags["mq"] = string{ "&1~51*5&~2" }; + tags["sq"] = string{ "<32<4<<<<3" }; + tags["ip"] = vector{ 2,0,10,22,34,0,2,3,0,16 }; + tags["np"] = static_cast(1); + tags["qe"] = static_cast(10); + tags["qs"] = static_cast(0); + tags["zm"] = static_cast(54130); + tags["cx"] = static_cast(2); + tags["AS"] = static_cast(-3020); + tags["NM"] = static_cast(134); + tags["rq"] = static_cast(0.854); + tags["sn"] = vector{ 2.0,2.0,2.0,2.0 }; + impl.Tags(tags); + + return BamRecord(impl); +} + +static BamRecord makeValidUnmappedRecord(void) +{ + BamRecordImpl impl; + impl.Bin(4680); + impl.Flag(4); + impl.InsertSize(0); + impl.MapQuality(10); + impl.MatePosition(-1); + impl.MateReferenceId(-1); + impl.Name("m140906_231018_42161_c100676332550000001823129611271486_s1_p0/8/0_10"); + impl.Position(-1); + impl.ReferenceId(-1); + impl.SetSequenceAndQualities("AATGAGGAGA"); + + TagCollection tags; + tags["RG"] = string{ "b5482b33" }; + tags["dq"] = string{ "2222222222" }; + tags["dt"] = string{ "NNNNNNNNNN" }; + tags["iq"] = string{ ",*11111001" }; + tags["mq"] = string{ 
"&47088')34" }; + tags["sq"] = string{ "8<4<:<6<0<" }; + tags["ip"] = vector{ 255,9,20,43,38,12,9,30,39,22 }; + tags["np"] = static_cast(1); + tags["qe"] = static_cast(10); + tags["qs"] = static_cast(0); + tags["zm"] = static_cast(8); + tags["cx"] = static_cast(2); + tags["AS"] = static_cast(-3020); + tags["NM"] = static_cast(134); + tags["rq"] = static_cast(0.811); + tags["sn"] = vector{ 2.0,2.0,2.0,2.0 }; + impl.Tags(tags); + + return BamRecord(impl); +} + +static ReadGroupInfo makeValidReadGroup(void) +{ + ReadGroupInfo rg("f5b4ffb6"); + rg.MovieName("movie32"); + rg.ReadType("CCS"); + rg.BindingKit("100372700"); + rg.SequencingKit("100612400"); + rg.BasecallerVersion("2.3"); + rg.FrameRateHz("100"); + rg.Control("TRUE"); + return rg; +} + +// valid, 'starter' objects +static const ReadGroupInfo validReadGroup = makeValidReadGroup(); +static const BamRecord validMappedRecord = makeValidMappedRecord(); +static const BamRecord validUnmappedRecord = makeValidUnmappedRecord(); + +} // namespace tests +} // namespace BAM +} // namespace PacBio + +TEST(ValidatorErrorsTest, SetMaxNumErrors) +{ + { // default - use "no max" + internal::ValidationErrors errors; + EXPECT_EQ(internal::ValidationErrors::MAX, errors.maxNumErrors_); + } + { // max of zero doesn't make sense... 
make equivalent to "no max" + internal::ValidationErrors errors(0); + EXPECT_EQ(internal::ValidationErrors::MAX, errors.maxNumErrors_); + } + { // max = 1 + internal::ValidationErrors errors(1); + EXPECT_EQ(1, errors.maxNumErrors_); + } + { // max = 10 + internal::ValidationErrors errors(10); + EXPECT_EQ(10, errors.maxNumErrors_); + } +} + +TEST(ValidatorErrorsTest, ThrowOnMaxReached) +{ + { + internal::ValidationErrors errors(1); + EXPECT_THROW(errors.AddFileError("foo", "you"), ValidationException); + } + { + internal::ValidationErrors errors(2); + errors.AddFileError("foo", "you"); + EXPECT_THROW(errors.AddFileError("foo", "me"), ValidationException); + } +} + +TEST(ValidatorErrorsTest, ExceptionFromResults) +{ + const string error1 = "error1"; + const string error2 = "error2"; + + try { + + internal::ValidationErrors errors(4); + errors.AddFileError("path/to/foo.bam", error1); + errors.AddFileError("path/to/foo.bam", error2); + errors.AddReadGroupError("deadbeef", "invalid sequencing chemistry combination detected"); + errors.AddRecordError("m140906_231018_42161_c100676332550000001823129611271486_s1_p0/8/0_10", + "MergeQV does not match expected length"); + + } catch (ValidationException& e) { + + EXPECT_EQ(1, e.FileErrors().size()); // only 1 file + EXPECT_EQ(2, e.FileErrors().at("path/to/foo.bam").size()); // 2 errors for this file + EXPECT_EQ(1, e.ReadGroupErrors().size()); + EXPECT_EQ(1, e.RecordErrors().size()); + } +} + +TEST(ValidatorTest, ValidReadGroup) +{ + ASSERT_NO_THROW(Validator::Validate(tests::validReadGroup)); +} + +TEST(ValidatorTest, ReadGroupRequiredComponents) +{ + { // missing ID + ReadGroupInfo rg = tests::validReadGroup; + rg.Id(""); + EXPECT_THROW(Validator::Validate(rg), ValidationException); + EXPECT_FALSE(Validator::IsValid(rg)); + } + { // missing movie name + ReadGroupInfo rg = tests::validReadGroup; + rg.MovieName(""); + EXPECT_THROW(Validator::Validate(rg), ValidationException); + EXPECT_FALSE(Validator::IsValid(rg)); + } + { // 
missing read type + ReadGroupInfo rg = tests::validReadGroup; + rg.ReadType(""); + EXPECT_THROW(Validator::Validate(rg), ValidationException); + EXPECT_FALSE(Validator::IsValid(rg)); + } + { // missing binding kit + ReadGroupInfo rg = tests::validReadGroup; + rg.BindingKit(""); + EXPECT_THROW(Validator::Validate(rg), ValidationException); + EXPECT_FALSE(Validator::IsValid(rg)); + } + { // missing sequencing kit + ReadGroupInfo rg = tests::validReadGroup; + rg.SequencingKit(""); + EXPECT_THROW(Validator::Validate(rg), ValidationException); + EXPECT_FALSE(Validator::IsValid(rg)); + } + { // missing basecaller version + ReadGroupInfo rg = tests::validReadGroup; + rg.BasecallerVersion(""); + EXPECT_THROW(Validator::Validate(rg), ValidationException); + EXPECT_FALSE(Validator::IsValid(rg)); + } + { // missing frame rate + ReadGroupInfo rg = tests::validReadGroup; + rg.FrameRateHz(""); + EXPECT_THROW(Validator::Validate(rg), ValidationException); + EXPECT_FALSE(Validator::IsValid(rg)); + } +} + +TEST(ValidatorTest, ReadGroupValues) +{ + { // mismatch expected ID vs stored ID - change ID + ReadGroupInfo rg = tests::validReadGroup; + rg.Id("deadbeef"); + EXPECT_THROW(Validator::Validate(rg), ValidationException); + EXPECT_FALSE(Validator::IsValid(rg)); + } + { // mismatch expected ID vs stored ID - change read type + ReadGroupInfo rg = tests::validReadGroup; + rg.ReadType("SUBREAD"); + EXPECT_THROW(Validator::Validate(rg), ValidationException); + EXPECT_FALSE(Validator::IsValid(rg)); + } + { // mismatch expected ID vs stored ID - change movie name + ReadGroupInfo rg = tests::validReadGroup; + rg.MovieName("foo"); + EXPECT_THROW(Validator::Validate(rg), ValidationException); + EXPECT_FALSE(Validator::IsValid(rg)); + } + { // unknown read type + ReadGroupInfo rg = tests::validReadGroup; + rg.ReadType("FOO"); + + // recompute ID so we're only checking the new read type, not read ID + rg.Id( MakeReadGroupId(rg.MovieName(), rg.ReadType()) ); + + 
EXPECT_THROW(Validator::Validate(rg), ValidationException); + EXPECT_FALSE(Validator::IsValid(rg)); + } + { // invalid chemistry triple - change binding kit + ReadGroupInfo rg = tests::validReadGroup; + rg.BindingKit("foo"); + EXPECT_THROW(Validator::Validate(rg), ValidationException); + EXPECT_FALSE(Validator::IsValid(rg)); + } + { // invalid chemistry triple - change sequencing kit + ReadGroupInfo rg = tests::validReadGroup; + rg.SequencingKit("foo"); + EXPECT_THROW(Validator::Validate(rg), ValidationException); + EXPECT_FALSE(Validator::IsValid(rg)); + } + { // invalid chemistry triple - change basecaller version + ReadGroupInfo rg = tests::validReadGroup; + rg.BasecallerVersion("0.42"); + EXPECT_THROW(Validator::Validate(rg), ValidationException); + EXPECT_FALSE(Validator::IsValid(rg)); + } + { // non-numeric frame rate + ReadGroupInfo rg = tests::validReadGroup; + rg.FrameRateHz("foo"); + EXPECT_THROW(Validator::Validate(rg), ValidationException); + EXPECT_FALSE(Validator::IsValid(rg)); + } +} + +TEST(ValidatorTest, ValidHeader) +{ + const BamHeader validMappedHeader { + "@HD\tVN:1.5\tSO:coordinate\tpb:3.0.1\n" + "@SQ\tSN:ecoliK12_pbi_March2013_2955000_to_2980000\tLN:25000\tM5:734d5f3b2859595f4bd87a2fe6b7389b\n" + "@RG\tID:3f58e5b8\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;" + "MergeQV=mq;SubstitutionQV=sq;Ipd:CodecV1=ip;BASECALLERVERSION=2.1;" + "FRAMERATEHZ=75.000000;BINDINGKIT=100356300;SEQUENCINGKIT=100356200" + "\tPU:movie1\n" + }; + + const BamHeader validUnmappedHeader { + "@HD\tVN:1.5\tSO:unknown\tpb:3.0.1\n" + "@RG\tID:b5482b33\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;" + "InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;Ipd:CodecV1=ip;BINDINGKIT=100356300;" + "SEQUENCINGKIT=100356200;BASECALLERVERSION=2.1;FRAMERATEHZ=75.000000\t" + "PU:m140906_231018_42161_c100676332550000001823129611271486_s1_p0\n" + }; + + ASSERT_NO_THROW(Validator::Validate(validMappedHeader)); + 
ASSERT_NO_THROW(Validator::Validate(validUnmappedHeader)); +} + +TEST(ValidatorTest, ValidateHeader) +{ + const BamHeader validMappedHeader { + "@HD\tVN:1.5\tSO:coordinate\tpb:3.0.1\n" + "@SQ\tSN:ecoliK12_pbi_March2013_2955000_to_2980000\tLN:25000\tM5:734d5f3b2859595f4bd87a2fe6b7389b\n" + "@RG\tID:3f58e5b8\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;" + "MergeQV=mq;SubstitutionQV=sq;Ipd:CodecV1=ip;BASECALLERVERSION=2.1;" + "FRAMERATEHZ=75.000000;BINDINGKIT=100356300;SEQUENCINGKIT=100356200" + "\tPU:movie1\n" + }; + + { // invalid SAM version - non-numeric + BamHeader header = validMappedHeader.DeepCopy(); + header.Version("foo"); + EXPECT_THROW(Validator::Validate(header), ValidationException); + EXPECT_FALSE(Validator::IsValid(header)); + } + { // invalid SAM version - negative version numbers + BamHeader header = validMappedHeader.DeepCopy(); + header.Version("-1.4.0"); + EXPECT_THROW(Validator::Validate(header), ValidationException); + EXPECT_FALSE(Validator::IsValid(header)); + } + { // invalid sort order + BamHeader header = validMappedHeader.DeepCopy(); + header.SortOrder("not_a_valid_sort_order"); + EXPECT_THROW(Validator::Validate(header), ValidationException); + EXPECT_FALSE(Validator::IsValid(header)); + } + + // invalid PacBioBamVersion numbers (non-numeric, negative, earlier than min) + // already throw when you try to set them... so we have to catch & ignore + // initial exception to get to validator + + { // invalid PacBioBAM version - non-numeric + BamHeader header = validMappedHeader.DeepCopy(); + try { + header.PacBioBamVersion("foo"); + } catch (...) { } + EXPECT_THROW(Validator::Validate(header), ValidationException); + EXPECT_FALSE(Validator::IsValid(header)); + } + { // invalid PacBioBAM version - negative version numbers + BamHeader header = validMappedHeader.DeepCopy(); + try { + header.PacBioBamVersion("-1.4.0"); + } catch (...) 
{ } + EXPECT_THROW(Validator::Validate(header), ValidationException); + EXPECT_FALSE(Validator::IsValid(header)); + } + { // invalid PacBioBAM version - earlier than minimum allowed + BamHeader header = validMappedHeader.DeepCopy(); + try { + header.PacBioBamVersion("3.0.0"); + } catch (...) { } + EXPECT_THROW(Validator::Validate(header), ValidationException); + EXPECT_FALSE(Validator::IsValid(header)); + } +} + +TEST(ValidatorTest, ValidRecord) +{ + const BamHeader validMappedHeader { + "@HD\tVN:1.5\tSO:coordinate\tpb:3.0.1\n" + "@SQ\tSN:ecoliK12_pbi_March2013_2955000_to_2980000\tLN:25000\tM5:734d5f3b2859595f4bd87a2fe6b7389b\n" + "@RG\tID:3f58e5b8\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;" + "MergeQV=mq;SubstitutionQV=sq;Ipd:CodecV1=ip;BASECALLERVERSION=2.1;" + "FRAMERATEHZ=75.000000;BINDINGKIT=100356300;SEQUENCINGKIT=100356200" + "\tPU:movie1\n" + }; + BamRecord record(tests::validMappedRecord); + record.header_ = validMappedHeader; + ASSERT_NO_THROW(Validator::Validate(record)); +} + +static inline +void ModifyTag(BamRecord* record, + const std::string& tagName, + const Tag& tag) +{ + if (record->Impl().HasTag(tagName)) + record->Impl().EditTag(tagName, tag); + else + record->Impl().AddTag(tagName, tag); +} + +static inline +void CheckInvalidTagLength(const std::string& tagName, const Tag& tag) +{ + static const BamHeader validUnmappedHeader { + "@HD\tVN:1.5\tSO:unknown\tpb:3.0.1\n" + "@RG\tID:b5482b33\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;" + "InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;Ipd:CodecV1=ip;BINDINGKIT=100356300;" + "SEQUENCINGKIT=100356200;BASECALLERVERSION=2.1;FRAMERATEHZ=75.000000\t" + "PU:m140906_231018_42161_c100676332550000001823129611271486_s1_p0\n" + }; + BamRecord record(tests::validUnmappedRecord); + record.header_ = validUnmappedHeader; + + ModifyTag(&record, tagName, tag); + + EXPECT_THROW(Validator::Validate(record), ValidationException); + EXPECT_FALSE(Validator::IsValid(record)); 
+} + +TEST(ValidatorTest, TagDataLengths) +{ + const BamHeader validUnmappedHeader { + "@HD\tVN:1.5\tSO:unknown\tpb:3.0.1\n" + "@RG\tID:b5482b33\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;" + "InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;Ipd:CodecV1=ip;BINDINGKIT=100356300;" + "SEQUENCINGKIT=100356200;BASECALLERVERSION=2.1;FRAMERATEHZ=75.000000\t" + "PU:m140906_231018_42161_c100676332550000001823129611271486_s1_p0\n" + }; + + // make these "variable-length" SEQ/tags too short for the read's stated + // queryStart/queryEnd + + { // SEQ + BamRecord record(tests::validUnmappedRecord); + record.header_ = validUnmappedHeader; + record.Impl().SetSequenceAndQualities("AA"); + EXPECT_THROW(Validator::Validate(record), ValidationException); + EXPECT_FALSE(Validator::IsValid(record)); + } + + CheckInvalidTagLength("dq", QualityValues("@@").Fastq()); // DeletionQV + CheckInvalidTagLength("iq", QualityValues("@@").Fastq()); // InsertionQV + CheckInvalidTagLength("mq", QualityValues("@@").Fastq()); // MergeQV + CheckInvalidTagLength("sq", QualityValues("@@").Fastq()); // SubstitutionQV + CheckInvalidTagLength("dt", string("AA")); // DeletionTag + CheckInvalidTagLength("st", string("AA")); // SubstitutionTag + + const auto& f = Frames{ vector{42, 42, 42} }; + const auto& frames = f.Data(); + CheckInvalidTagLength("ip", frames); // IPD + + // NOTE: disabling "internal" tag checks for now, only checking "standard" + // PacBioBAM tags + +// const auto& pulses = vector{42, 42, 42}; +// CheckInvalidTagLength("pv", QualityValues("@@").Fastq()); // AltLabelQV +// CheckInvalidTagLength("pq", QualityValues("@@").Fastq()); // LabelQV +// CheckInvalidTagLength("pg", QualityValues("@@").Fastq()); // PulseMergeQv +// CheckInvalidTagLength("pt", string("AA")); // AltLabelTag +// CheckInvalidTagLength("pc", string("AA")); // PulseCall +// CheckInvalidTagLength("pd", frames); // PrePulseFrames +// CheckInvalidTagLength("px", frames); // PulseCallWidth +// 
CheckInvalidTagLength("pw", frames); // PulseWidth +// CheckInvalidTagLength("pa", pulses); // Pkmean +// CheckInvalidTagLength("ps", pulses); // Pkmean2 +// CheckInvalidTagLength("pm", pulses); // Pkmid +// CheckInvalidTagLength("pi", pulses); // Pkmid2 +} + +TEST(ValidatorTest, TagDataValues) +{ + const BamHeader validMappedHeader { + "@HD\tVN:1.5\tSO:coordinate\tpb:3.0.1\n" + "@SQ\tSN:ecoliK12_pbi_March2013_2955000_to_2980000\tLN:25000\tM5:734d5f3b2859595f4bd87a2fe6b7389b\n" + "@RG\tID:3f58e5b8\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;" + "MergeQV=mq;SubstitutionQV=sq;Ipd:CodecV1=ip;BASECALLERVERSION=2.1;" + "FRAMERATEHZ=75.000000;BINDINGKIT=100356300;SEQUENCINGKIT=100356200" + "\tPU:movie1\n" + }; + + { // missing qe + BamRecord record(tests::validMappedRecord); + record.header_ = validMappedHeader; + record.Impl().RemoveTag("qe"); + EXPECT_THROW(Validator::Validate(record), ValidationException); + EXPECT_FALSE(Validator::IsValid(record)); + } + { // missing qs + BamRecord record(tests::validMappedRecord); + record.header_ = validMappedHeader; + record.Impl().RemoveTag("qs"); + EXPECT_THROW(Validator::Validate(record), ValidationException); + EXPECT_FALSE(Validator::IsValid(record)); + } + { // queryStart should be < queryEnd + BamRecord record(tests::validMappedRecord); + record.header_ = validMappedHeader; + record.QueryStart(10); + EXPECT_THROW(Validator::Validate(record), ValidationException); + EXPECT_FALSE(Validator::IsValid(record)); + } + { // missing zm + BamRecord record(tests::validMappedRecord); + record.header_ = validMappedHeader; + record.Impl().RemoveTag("zm"); + EXPECT_THROW(Validator::Validate(record), ValidationException); + EXPECT_FALSE(Validator::IsValid(record)); + } + { // missing np + BamRecord record(tests::validMappedRecord); + record.header_ = validMappedHeader; + record.Impl().RemoveTag("np"); + EXPECT_THROW(Validator::Validate(record), ValidationException); + 
EXPECT_FALSE(Validator::IsValid(record)); + } + { // numPasses for SUBREAD type records should be 1 + BamRecord record(tests::validMappedRecord); + record.header_ = validMappedHeader; + record.NumPasses(42); + EXPECT_THROW(Validator::Validate(record), ValidationException); + EXPECT_FALSE(Validator::IsValid(record)); + } + { // missing sn + BamRecord record(tests::validMappedRecord); + record.header_ = validMappedHeader; + record.Impl().RemoveTag("sn"); + EXPECT_THROW(Validator::Validate(record), ValidationException); + EXPECT_FALSE(Validator::IsValid(record)); + } +} + +TEST(ValidatorTest, MappedRecords) +{ + const BamHeader validMappedHeader { + "@HD\tVN:1.5\tSO:coordinate\tpb:3.0.1\n" + "@RG\tID:b5482b33\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;" + "InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;Ipd:CodecV1=ip;BINDINGKIT=100356300;" + "SEQUENCINGKIT=100356200;BASECALLERVERSION=2.1;FRAMERATEHZ=75.000000\t" + "PU:m140906_231018_42161_c100676332550000001823129611271486_s1_p0\n" + }; + + { // mapped record should have valid refID + BamRecord record(tests::validMappedRecord); + record.header_ = validMappedHeader; + record.Impl().ReferenceId(-1); + + EXPECT_THROW(Validator::Validate(record), ValidationException); + EXPECT_FALSE(Validator::IsValid(record)); + } + { // mapped record should have valid position + BamRecord record(tests::validMappedRecord); + record.header_ = validMappedHeader; + record.Impl().Position(-1); + + EXPECT_THROW(Validator::Validate(record), ValidationException); + EXPECT_FALSE(Validator::IsValid(record)); + } + +} + +TEST(ValidatorTest, UnmappedRecords) +{ + const BamHeader validUnmappedHeader { + "@HD\tVN:1.5\tSO:unknown\tpb:3.0.1\n" + "@RG\tID:b5482b33\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;" + "InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;Ipd:CodecV1=ip;BINDINGKIT=100356300;" + "SEQUENCINGKIT=100356200;BASECALLERVERSION=2.1;FRAMERATEHZ=75.000000\t" + 
"PU:m140906_231018_42161_c100676332550000001823129611271486_s1_p0\n" + }; + + { // unmapped should have no refID + BamRecord record(tests::validUnmappedRecord); + record.header_ = validUnmappedHeader; + record.Impl().ReferenceId(0); + + EXPECT_THROW(Validator::Validate(record), ValidationException); + EXPECT_FALSE(Validator::IsValid(record)); + } + { // unmapped should have no position + BamRecord record(tests::validUnmappedRecord); + record.header_ = validUnmappedHeader; + record.Impl().Position(42); + + EXPECT_THROW(Validator::Validate(record), ValidationException); + EXPECT_FALSE(Validator::IsValid(record)); + } +} diff --git a/tests/src/test_Version.cpp b/tests/src/test_Version.cpp new file mode 100644 index 0000000..08bc7fe --- /dev/null +++ b/tests/src/test_Version.cpp @@ -0,0 +1,335 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. 
THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. + +// Author: Derek Barnett + +#include "../src/Version.h" + +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace PacBio::BAM::internal; +using namespace std; + +namespace PacBio { +namespace BAM { +namespace tests { + +static inline Version MakeVersion(int x, int y, int z) +{ return Version(x, y, z); } + +} // namespace tests +} // namespace BAM +} // namespace PacBio + +TEST(VersionTest, DefaultOk) +{ + Version v; + EXPECT_EQ(0, v.Major()); + EXPECT_EQ(0, v.Minor()); + EXPECT_EQ(0, v.Revision()); +} + +TEST(VersionTest, CopyAndMoveOk) +{ + { // copy ctor + Version v1(3,1,1); + EXPECT_EQ(3, v1.Major()); + EXPECT_EQ(1, v1.Minor()); + EXPECT_EQ(1, v1.Revision()); + + Version v2(v1); + EXPECT_EQ(3, v2.Major()); + EXPECT_EQ(1, v2.Minor()); + EXPECT_EQ(1, v2.Revision()); + } + { // copy assign + Version v1(3,1,1); + EXPECT_EQ(3, v1.Major()); + EXPECT_EQ(1, v1.Minor()); + EXPECT_EQ(1, v1.Revision()); + + Version v2; + v2 = v1; + EXPECT_EQ(3, v2.Major()); + EXPECT_EQ(1, v2.Minor()); + EXPECT_EQ(1, v2.Revision()); + + } + { // move ctor + Version v(tests::MakeVersion(3,1,1)); + EXPECT_EQ(3, v.Major()); + EXPECT_EQ(1, v.Minor()); + 
EXPECT_EQ(1, v.Revision()); + + } + { // move assign + Version v1(3,1,1); + EXPECT_EQ(3, v1.Major()); + EXPECT_EQ(1, v1.Minor()); + EXPECT_EQ(1, v1.Revision()); + + Version v2; + v2 = std::move(v1); + EXPECT_EQ(3, v2.Major()); + EXPECT_EQ(1, v2.Minor()); + EXPECT_EQ(1, v2.Revision()); + } +} + +TEST(VersionTest, FromIntsOk) +{ + { // normal + Version v(3,1,1); + EXPECT_EQ(3, v.Major()); + EXPECT_EQ(1, v.Minor()); + EXPECT_EQ(1, v.Revision()); + } + + // negatives + EXPECT_THROW(Version(-3, 1, 1), std::runtime_error); +} + +TEST(VersionTest, FromStringOk) +{ + { // normal + Version v("3.1.1"); + EXPECT_EQ(3, v.Major()); + EXPECT_EQ(1, v.Minor()); + EXPECT_EQ(1, v.Revision()); + } + + // negatives + EXPECT_THROW(Version("-3.1.1"), std::runtime_error); + + // non-numeric + EXPECT_THROW(Version("foo.bar.baz"), std::runtime_error); + + // empty + EXPECT_THROW(Version(""), std::runtime_error); +} + +TEST(VersionTest, SettersOk) +{ + Version v(3,1,1); + + v.Major(4); + + EXPECT_EQ(4, v.Major()); + EXPECT_EQ(1, v.Minor()); + EXPECT_EQ(1, v.Revision()); + + v.Minor(7); + + EXPECT_EQ(4, v.Major()); + EXPECT_EQ(7, v.Minor()); + EXPECT_EQ(1, v.Revision()); + + v.Revision(23); + + EXPECT_EQ(4, v.Major()); + EXPECT_EQ(7, v.Minor()); + EXPECT_EQ(23, v.Revision()); + + { // invalid + Version v1(3,1,1); + Version v2(3,1,1); + Version v3(3,1,1); + EXPECT_THROW(v1.Major(-1), std::runtime_error); + EXPECT_THROW(v2.Minor(-1), std::runtime_error); + EXPECT_THROW(v3.Revision(-1), std::runtime_error); + } +} + +TEST(VersionTest, ComparisonsOk) +{ + const Version v0_0_0 = Version(0,0,0); + const Version v0_0_4 = Version(0,0,4); + const Version v0_1_0 = Version(0,1,0); + const Version v0_1_4 = Version(0,1,4); + const Version v3_0_0 = Version(3,0,0); + const Version v3_0_4 = Version(3,0,4); + const Version v3_1_0 = Version(3,1,0); + const Version v3_1_4 = Version(3,1,4); + const Version v3_1_5 = Version(3,1,5); + + // operator== + EXPECT_TRUE(v0_0_0 == v0_0_0); + EXPECT_TRUE(v3_0_0 == 
v3_0_0); + EXPECT_TRUE(v0_1_0 == v0_1_0); + EXPECT_TRUE(v0_0_4 == v0_0_4); + EXPECT_TRUE(v3_1_0 == v3_1_0); + EXPECT_TRUE(v3_1_4 == v3_1_4); + + EXPECT_FALSE(v3_1_4 == v0_0_0); + EXPECT_FALSE(v3_1_4 == v3_0_0); + EXPECT_FALSE(v3_1_4 == v0_1_0); + EXPECT_FALSE(v3_1_4 == v0_0_4); + EXPECT_FALSE(v3_1_4 == v3_1_0); + EXPECT_FALSE(v3_1_4 == v3_1_5); + + // operator!= + EXPECT_FALSE(v0_0_0 != v0_0_0); + EXPECT_FALSE(v3_0_0 != v3_0_0); + EXPECT_FALSE(v0_1_0 != v0_1_0); + EXPECT_FALSE(v0_0_4 != v0_0_4); + EXPECT_FALSE(v3_1_0 != v3_1_0); + EXPECT_FALSE(v3_1_4 != v3_1_4); + + EXPECT_TRUE(v3_1_4 != v0_0_0); + EXPECT_TRUE(v3_1_4 != v3_0_0); + EXPECT_TRUE(v3_1_4 != v0_1_0); + EXPECT_TRUE(v3_1_4 != v0_0_4); + EXPECT_TRUE(v3_1_4 != v3_1_0); + EXPECT_TRUE(v3_1_4 != v3_1_5); + + // operator< + EXPECT_FALSE(v0_0_0 < v0_0_0); + EXPECT_TRUE(v0_0_0 < v0_0_4); + EXPECT_TRUE(v0_0_0 < v0_1_0); + EXPECT_TRUE(v0_0_0 < v3_0_0); + EXPECT_TRUE(v0_0_0 < v0_1_4); + EXPECT_TRUE(v0_0_0 < v3_0_4); + EXPECT_TRUE(v0_0_0 < v3_1_0); + EXPECT_TRUE(v0_0_0 < v3_1_4); + + EXPECT_TRUE(v0_0_4 < v3_1_4); + EXPECT_TRUE(v0_1_0 < v3_1_4); + EXPECT_TRUE(v0_1_4 < v3_1_4); + EXPECT_TRUE(v3_0_0 < v3_1_4); + EXPECT_TRUE(v3_0_4 < v3_1_4); + EXPECT_TRUE(v3_1_0 < v3_1_4); + EXPECT_FALSE(v3_1_4 < v3_1_4); + EXPECT_FALSE(v3_1_5 < v3_1_4); + + EXPECT_FALSE(v3_1_4 < v0_0_0); + + // operator<= + EXPECT_TRUE(v0_0_0 <= v0_0_0); + EXPECT_TRUE(v0_0_0 <= v0_0_4); + EXPECT_TRUE(v0_0_0 <= v0_1_0); + EXPECT_TRUE(v0_0_0 <= v3_0_0); + EXPECT_TRUE(v0_0_0 <= v0_1_4); + EXPECT_TRUE(v0_0_0 <= v3_0_4); + EXPECT_TRUE(v0_0_0 <= v3_1_0); + EXPECT_TRUE(v0_0_0 <= v3_1_4); + + EXPECT_TRUE(v0_0_4 <= v3_1_4); + EXPECT_TRUE(v0_1_0 <= v3_1_4); + EXPECT_TRUE(v0_1_4 <= v3_1_4); + EXPECT_TRUE(v3_0_0 <= v3_1_4); + EXPECT_TRUE(v3_0_4 <= v3_1_4); + EXPECT_TRUE(v3_1_0 <= v3_1_4); + EXPECT_TRUE(v3_1_4 <= v3_1_4); + EXPECT_FALSE(v3_1_5 <= v3_1_4); + + EXPECT_FALSE(v3_1_4 <= v0_0_0); + + // operator> + EXPECT_FALSE(v0_0_0 > v0_0_0); + EXPECT_FALSE(v0_0_0 > 
v0_0_4); + EXPECT_FALSE(v0_0_0 > v0_1_0); + EXPECT_FALSE(v0_0_0 > v3_0_0); + EXPECT_FALSE(v0_0_0 > v0_1_4); + EXPECT_FALSE(v0_0_0 > v3_0_4); + EXPECT_FALSE(v0_0_0 > v3_1_0); + EXPECT_FALSE(v0_0_0 > v3_1_4); + + EXPECT_FALSE(v0_0_4 > v3_1_4); + EXPECT_FALSE(v0_1_0 > v3_1_4); + EXPECT_FALSE(v0_1_4 > v3_1_4); + EXPECT_FALSE(v3_0_0 > v3_1_4); + EXPECT_FALSE(v3_0_4 > v3_1_4); + EXPECT_FALSE(v3_1_0 > v3_1_4); + EXPECT_FALSE(v3_1_4 > v3_1_4); + EXPECT_TRUE(v3_1_5 > v3_1_4); + + EXPECT_TRUE(v3_1_4 > v0_0_0); + + // operator>= + EXPECT_TRUE(v0_0_0 >= v0_0_0); + EXPECT_FALSE(v0_0_0 >= v0_0_4); + EXPECT_FALSE(v0_0_0 >= v0_1_0); + EXPECT_FALSE(v0_0_0 >= v3_0_0); + EXPECT_FALSE(v0_0_0 >= v0_1_4); + EXPECT_FALSE(v0_0_0 >= v3_0_4); + EXPECT_FALSE(v0_0_0 >= v3_1_0); + EXPECT_FALSE(v0_0_0 >= v3_1_4); + + EXPECT_FALSE(v0_0_4 >= v3_1_4); + EXPECT_FALSE(v0_1_0 >= v3_1_4); + EXPECT_FALSE(v0_1_4 >= v3_1_4); + EXPECT_FALSE(v3_0_0 >= v3_1_4); + EXPECT_FALSE(v3_0_4 >= v3_1_4); + EXPECT_FALSE(v3_1_0 >= v3_1_4); + EXPECT_TRUE(v3_1_4 >= v3_1_4); + EXPECT_TRUE(v3_1_5 >= v3_1_4); + + EXPECT_TRUE(v3_1_4 >= v0_0_0); +} + +TEST(VersionTest, ToStringOk) +{ + { + Version v(0,0,0); + EXPECT_EQ(string("0.0.0"), v.ToString()); + } + { + Version v(3,1,4); + EXPECT_EQ(string("3.1.4"), v.ToString()); + } + { + Version v; + v.Major(4); + EXPECT_EQ(string("4.0.0"), v.ToString()); + } + { + const string s = "1.2.3"; + Version v(s); + EXPECT_EQ(s, v.ToString()); + } +} + +TEST(VersionTest, OutputStreamOk) +{ + Version v(3,1,4); + Version v2(4,10,0); + + stringstream s; + s << v << ", " << v2 << ", " << v << endl; + + EXPECT_EQ(string("3.1.4, 4.10.0, 3.1.4\n"), s.str()); +} diff --git a/tests/src/test_WhitelistedZmwReadStitcher.cpp b/tests/src/test_WhitelistedZmwReadStitcher.cpp new file mode 100644 index 0000000..9c4ba7f --- /dev/null +++ b/tests/src/test_WhitelistedZmwReadStitcher.cpp @@ -0,0 +1,260 @@ +// Copyright (c) 2014-2016, Pacific Biosciences of California, Inc. +// +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. 
+ +// Author: Derek Barnett + +#ifdef PBBAM_TESTING +#define private public +#endif + +#include "TestData.h" +#include +#include +#include +#include +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace std; + +namespace PacBio { +namespace BAM { +namespace tests { + +static +void Compare(const BamRecord& b1, const BamRecord& b2) +{ + EXPECT_TRUE(b1.HasDeletionQV()); + EXPECT_TRUE(b1.HasDeletionTag()); + EXPECT_TRUE(b1.HasInsertionQV()); + EXPECT_TRUE(b1.HasMergeQV()); + EXPECT_TRUE(b1.HasSubstitutionQV()); + EXPECT_TRUE(b1.HasSubstitutionTag()); + EXPECT_TRUE(b1.HasLabelQV()); + EXPECT_TRUE(b1.HasAltLabelQV()); + EXPECT_TRUE(b1.HasAltLabelTag()); + EXPECT_TRUE(b1.HasPkmean()); + EXPECT_TRUE(b1.HasPkmid()); + EXPECT_TRUE(b1.HasPulseCall()); + EXPECT_TRUE(b1.HasIPD()); + EXPECT_TRUE(b1.HasPulseWidth()); + EXPECT_TRUE(b1.HasPrePulseFrames()); + EXPECT_TRUE(b1.HasPulseCallWidth()); + EXPECT_TRUE(b1.HasPulseMergeQV()); + + EXPECT_TRUE(b2.HasDeletionQV()); + EXPECT_TRUE(b2.HasDeletionTag()); + EXPECT_TRUE(b2.HasInsertionQV()); + EXPECT_TRUE(b2.HasMergeQV()); + EXPECT_TRUE(b2.HasSubstitutionQV()); + EXPECT_TRUE(b2.HasSubstitutionTag()); + EXPECT_TRUE(b2.HasLabelQV()); + EXPECT_TRUE(b2.HasAltLabelQV()); + EXPECT_TRUE(b2.HasAltLabelTag()); + EXPECT_TRUE(b2.HasPkmean()); + EXPECT_TRUE(b2.HasPkmid()); + EXPECT_TRUE(b2.HasPulseCall()); + EXPECT_TRUE(b2.HasIPD()); + EXPECT_TRUE(b2.HasPulseWidth()); + EXPECT_TRUE(b2.HasPrePulseFrames()); + EXPECT_TRUE(b2.HasPulseCallWidth()); + EXPECT_TRUE(b2.HasPulseMergeQV()); + + EXPECT_EQ(b1.FullName(), b2.FullName()); + EXPECT_EQ(b1.HoleNumber(), b2.HoleNumber()); + EXPECT_EQ(b1.NumPasses(), b2.NumPasses()); + EXPECT_EQ(b1.Sequence(), b2.Sequence()); + EXPECT_EQ(b1.Qualities(), b2.Qualities()); + EXPECT_EQ(b1.DeletionQV(), b2.DeletionQV()); + EXPECT_EQ(b1.DeletionTag(), b2.DeletionTag()); + EXPECT_EQ(b1.InsertionQV(), b2.InsertionQV()); + EXPECT_EQ(b1.MergeQV(), b2.MergeQV()); + 
EXPECT_EQ(b1.SubstitutionQV(), b2.SubstitutionQV()); + EXPECT_EQ(b1.SubstitutionTag(), b2.SubstitutionTag()); + EXPECT_EQ(b1.LabelQV(), b2.LabelQV()); + EXPECT_EQ(b1.AltLabelQV(), b2.AltLabelQV()); + EXPECT_EQ(b1.AltLabelTag(), b2.AltLabelTag()); + EXPECT_EQ(b1.Pkmean(), b2.Pkmean()); + EXPECT_EQ(b1.Pkmid(), b2.Pkmid()); + EXPECT_EQ(b1.PulseCall(), b2.PulseCall()); + EXPECT_EQ(b1.IPD(), b2.IPD()); + EXPECT_EQ(b1.PulseWidth(), b2.PulseWidth()); + EXPECT_EQ(b1.PrePulseFrames(), b2.PrePulseFrames()); + EXPECT_EQ(b1.PulseCallWidth(), b2.PulseCallWidth()); + EXPECT_EQ(b1.ReadGroup(), b2.ReadGroup()); + EXPECT_EQ(b1.PulseMergeQV(), b2.PulseMergeQV()); +} + +} // namespace tests +} // namespace BAM +} // namespace PacBio + +TEST(WhitelistedZmwReadStitching, EmptyList) +{ + const std::vector whitelist = { }; + WhitelistedZmwReadStitcher stitcher(whitelist, + tests::Data_Dir + "/polymerase/internal.subreads.bam", + tests::Data_Dir + "/polymerase/internal.scraps.bam"); + EXPECT_FALSE(stitcher.HasNext()); + EXPECT_TRUE(stitcher.NextRaw().empty()); +} + +TEST(WhitelistedZmwReadStitching, SingleValue) +{ + const std::vector whitelist = { 200000 }; + WhitelistedZmwReadStitcher stitcher(whitelist, + tests::Data_Dir + "/polymerase/internal.subreads.bam", + tests::Data_Dir + "/polymerase/internal.scraps.bam"); + + // create virtual record + EXPECT_TRUE(stitcher.HasNext()); + auto virtualRecord = stitcher.Next(); + EXPECT_FALSE(stitcher.HasNext()); + + // fetch original polymerase read (2nd record) + BamFile polyBam(tests::Data_Dir + "/polymerase/internal.polymerase.bam"); + EntireFileQuery polyQuery(polyBam); + auto begin = polyQuery.begin(); + auto end = polyQuery.end(); + EXPECT_TRUE(begin != end); + ++begin; + EXPECT_TRUE(begin != end); + auto polyRecord = *begin++; + + EXPECT_EQ(200000, virtualRecord.HoleNumber()); + + tests::Compare(polyRecord, virtualRecord); +} + +TEST(WhitelistedZmwReadStitching, UnknownZmw) +{ + const std::vector whitelist { 42 }; // ZMW not in our files + 
WhitelistedZmwReadStitcher stitcher(whitelist, + tests::Data_Dir + "/polymerase/internal.subreads.bam", + tests::Data_Dir + "/polymerase/internal.scraps.bam"); + + EXPECT_FALSE(stitcher.HasNext()); + EXPECT_TRUE(stitcher.NextRaw().empty()); +} + +TEST(WhitelistedZmwReadStitching, MultiValue) +{ + const std::vector whitelist = { 100000, 300000 }; + WhitelistedZmwReadStitcher stitcher(whitelist, + tests::Data_Dir + "/polymerase/internal.subreads.bam", + tests::Data_Dir + "/polymerase/internal.scraps.bam"); + + + // create virtual records + EXPECT_TRUE(stitcher.HasNext()); + auto virtualRecord1 = stitcher.Next(); + EXPECT_TRUE(stitcher.HasNext()); + auto virtualRecord2 = stitcher.Next(); + EXPECT_FALSE(stitcher.HasNext()); + + // fetch original polymerase reads (2nd record) + BamFile polyBam(tests::Data_Dir + "/polymerase/internal.polymerase.bam"); + EntireFileQuery polyQuery(polyBam); + auto begin = polyQuery.begin(); + auto end = polyQuery.end(); + + EXPECT_TRUE(begin != end); + auto polyRecord1 = *begin++; + EXPECT_TRUE(begin != end); + ++begin; + EXPECT_TRUE(begin != end); + auto polyRecord2 = *begin++; + EXPECT_TRUE(begin == end); + + EXPECT_EQ(100000, virtualRecord1.HoleNumber()); + EXPECT_EQ(300000, virtualRecord2.HoleNumber()); + + tests::Compare(polyRecord1, virtualRecord1); + tests::Compare(polyRecord2, virtualRecord2); +} + +TEST(WhitelistedZmwReadStitching, MultiValue_MixedKnownAndUnknown) +{ + const std::vector whitelist { 42, 200000, 24 }; + WhitelistedZmwReadStitcher stitcher(whitelist, + tests::Data_Dir + "/polymerase/internal.subreads.bam", + tests::Data_Dir + "/polymerase/internal.scraps.bam"); + + // everything below should behave exactly as 'SingleValueOk' test, + // as the unknown ZMWs will have been removed during construction + + // create virtual record + EXPECT_TRUE(stitcher.HasNext()); + auto virtualRecord = stitcher.Next(); + EXPECT_FALSE(stitcher.HasNext()); + + // fetch original polymerase read (2nd record) + BamFile 
polyBam(tests::Data_Dir + "/polymerase/internal.polymerase.bam"); + EntireFileQuery polyQuery(polyBam); + auto begin = polyQuery.begin(); + auto end = polyQuery.end(); + EXPECT_TRUE(begin != end); + ++begin; + EXPECT_TRUE(begin != end); + auto polyRecord = *begin++; + + EXPECT_EQ(200000, virtualRecord.HoleNumber()); + + tests::Compare(polyRecord, virtualRecord); +} + +TEST(WhitelistedZmwReadStitching, EmptyScrapsFileOk) +{ + const std::vector whitelist = { 10944689, 10944690 }; + const std::string primaryBamFn = tests::Data_Dir + "/polymerase/scrapless.subreads.bam" ; + const std::string scrapsBamFn = tests::Data_Dir + "/polymerase/scrapless.scraps.bam" ; + + int count = 0; + WhitelistedZmwReadStitcher stitcher(whitelist, primaryBamFn, scrapsBamFn); + while (stitcher.HasNext()) { + auto record = stitcher.Next(); + (void)record; + ++count; + } + EXPECT_EQ(2, count); + + const BamFile primaryBam(primaryBamFn); + const BamFile scrapsBam(scrapsBamFn); + const PbiRawData primaryIdx(primaryBam.PacBioIndexFilename()); + const PbiRawData scrapsIdx(scrapsBam.PacBioIndexFilename()); + EXPECT_EQ(3, primaryIdx.NumReads()); + EXPECT_EQ(0, scrapsIdx.NumReads()); +} diff --git a/tests/src/test_ZmwQuery.cpp b/tests/src/test_ZmwQuery.cpp new file mode 100644 index 0000000..287a4eb --- /dev/null +++ b/tests/src/test_ZmwQuery.cpp @@ -0,0 +1,67 @@ +// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. 
+ +// Author: Derek Barnett + +#ifdef PBBAM_TESTING +#define private public +#endif + +#include "TestData.h" +#include +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace std; + +//TEST(EntireFileQueryTest, CountRecords) +//{ +// EXPECT_NO_THROW( +// { +// // open input BAM file +// BamFile bamFile(inputBamFn); + +// // count records +// int count = 0; +// EntireFileQuery entireFile(bamFile); +// for (const BamRecord& record : entireFile) { +// (void)record; +// ++count; +// } + +// EXPECT_EQ(3307, count); +// }); +//} diff --git a/tests/src/test_ZmwReadStitcher.cpp b/tests/src/test_ZmwReadStitcher.cpp new file mode 100644 index 0000000..3554c74 --- /dev/null +++ b/tests/src/test_ZmwReadStitcher.cpp @@ -0,0 +1,512 @@ +// Copyright (c) 2014-2016, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. 
THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. + +// Author: Derek Barnett + +#ifdef PBBAM_TESTING +#define private public // test-only: makes `private` read as `public` in the headers included below +#endif + +#include "TestData.h" +#include +#include +#include +#include +#include +#include +#include +using namespace PacBio; +using namespace PacBio::BAM; +using namespace std; + +namespace PacBio { +namespace BAM { +namespace tests { + +static +void Compare(const BamRecord& b1, const BamRecord& b2) // expects every Has*() check to pass on both records, then equality of each compared field +{ + EXPECT_TRUE(b1.HasDeletionQV()); + EXPECT_TRUE(b1.HasDeletionTag()); + EXPECT_TRUE(b1.HasInsertionQV()); + EXPECT_TRUE(b1.HasMergeQV()); + EXPECT_TRUE(b1.HasSubstitutionQV()); + EXPECT_TRUE(b1.HasSubstitutionTag()); + EXPECT_TRUE(b1.HasLabelQV()); + EXPECT_TRUE(b1.HasAltLabelQV()); + EXPECT_TRUE(b1.HasAltLabelTag()); + EXPECT_TRUE(b1.HasPkmean()); + EXPECT_TRUE(b1.HasPkmid()); + EXPECT_TRUE(b1.HasPulseCall()); + EXPECT_TRUE(b1.HasIPD()); + EXPECT_TRUE(b1.HasPulseWidth()); + EXPECT_TRUE(b1.HasPrePulseFrames()); + EXPECT_TRUE(b1.HasPulseCallWidth()); + EXPECT_TRUE(b1.HasPulseMergeQV()); + + EXPECT_TRUE(b2.HasDeletionQV()); + EXPECT_TRUE(b2.HasDeletionTag()); + EXPECT_TRUE(b2.HasInsertionQV()); + EXPECT_TRUE(b2.HasMergeQV()); + EXPECT_TRUE(b2.HasSubstitutionQV()); + 
EXPECT_TRUE(b2.HasSubstitutionTag()); + EXPECT_TRUE(b2.HasLabelQV()); + EXPECT_TRUE(b2.HasAltLabelQV()); + EXPECT_TRUE(b2.HasAltLabelTag()); + EXPECT_TRUE(b2.HasPkmean()); + EXPECT_TRUE(b2.HasPkmid()); + EXPECT_TRUE(b2.HasPulseCall()); + EXPECT_TRUE(b2.HasIPD()); + EXPECT_TRUE(b2.HasPulseWidth()); + EXPECT_TRUE(b2.HasPrePulseFrames()); + EXPECT_TRUE(b2.HasPulseCallWidth()); + EXPECT_TRUE(b2.HasPulseMergeQV()); + + EXPECT_EQ(b1.FullName(), b2.FullName()); + EXPECT_EQ(b1.HoleNumber(), b2.HoleNumber()); + EXPECT_EQ(b1.NumPasses(), b2.NumPasses()); + EXPECT_EQ(b1.Sequence(), b2.Sequence()); + EXPECT_EQ(b1.Qualities(), b2.Qualities()); + EXPECT_EQ(b1.DeletionQV(), b2.DeletionQV()); + EXPECT_EQ(b1.DeletionTag(), b2.DeletionTag()); + EXPECT_EQ(b1.InsertionQV(), b2.InsertionQV()); + EXPECT_EQ(b1.MergeQV(), b2.MergeQV()); + EXPECT_EQ(b1.SubstitutionQV(), b2.SubstitutionQV()); + EXPECT_EQ(b1.SubstitutionTag(), b2.SubstitutionTag()); + EXPECT_EQ(b1.LabelQV(), b2.LabelQV()); + EXPECT_EQ(b1.AltLabelQV(), b2.AltLabelQV()); + EXPECT_EQ(b1.AltLabelTag(), b2.AltLabelTag()); + EXPECT_EQ(b1.Pkmean(), b2.Pkmean()); + EXPECT_EQ(b1.Pkmid(), b2.Pkmid()); + EXPECT_EQ(b1.PulseCall(), b2.PulseCall()); + EXPECT_EQ(b1.IPD(), b2.IPD()); + EXPECT_EQ(b1.PulseWidth(), b2.PulseWidth()); + EXPECT_EQ(b1.PrePulseFrames(), b2.PrePulseFrames()); + EXPECT_EQ(b1.PulseCallWidth(), b2.PulseCallWidth()); + EXPECT_EQ(b1.ReadGroup(), b2.ReadGroup()); + EXPECT_EQ(b1.PulseMergeQV(), b2.PulseMergeQV()); +} + +static +size_t NumVirtualRecords(const string& primaryBamFn, + const string& scrapsBamFn) // number of records a ZmwReadStitcher yields for this primary/scraps pair +{ + ZmwReadStitcher stitcher(primaryBamFn, scrapsBamFn); + size_t count = 0; + while (stitcher.HasNext()) { + const auto record = stitcher.Next(); + (void)record; + ++count; + } + return count; +} + +} // namespace tests +} // namespace BAM +} // namespace PacBio + +TEST(ZmwReadStitching, FromBams_NoFilter) +{ + ZmwReadStitcher stitcher(tests::Data_Dir + "/polymerase/internal.subreads.bam", + tests::Data_Dir + 
"/polymerase/internal.scraps.bam"); + size_t count = 0; + while (stitcher.HasNext()) { + const auto record = stitcher.Next(); + (void)record; + ++count; + } + EXPECT_EQ(3, count); +} + +TEST(ZmwReadStitching, FromBams_Filtered) +{ + PbiFilter filter { PbiZmwFilter{100000} }; // setup to match DataSet w/ filter + ZmwReadStitcher stitcher(tests::Data_Dir + "/polymerase/internal.subreads.bam", + tests::Data_Dir + "/polymerase/internal.scraps.bam", + filter); + size_t count = 0; + while (stitcher.HasNext()) { + const auto record = stitcher.Next(); + EXPECT_EQ(100000, record.HoleNumber()); + ++count; + } + EXPECT_EQ(1, count); +} + +TEST(ZmwReadStitching, FromDataSet_NoFilter) +{ + // dataset contains these resources (subreads/scraps + hqregion/scraps BAMs) + const string primaryFn1 = tests::Data_Dir + "/polymerase/production.subreads.bam"; + const string scrapsFn1 = tests::Data_Dir + "/polymerase/production.scraps.bam"; + const string primaryFn2 = tests::Data_Dir + "/polymerase/production_hq.hqregion.bam"; + const string scrapsFn2 = tests::Data_Dir + "/polymerase/production_hq.scraps.bam"; + const size_t numExpectedRecords = + tests::NumVirtualRecords(primaryFn1, scrapsFn1) + + tests::NumVirtualRecords(primaryFn2, scrapsFn2); + + const string datasetFn = tests::Data_Dir + + "/polymerase/multiple_resources.subread.dataset.xml"; + + DataSet ds{ datasetFn }; + ZmwReadStitcher stitcher{ ds }; + size_t numObservedRecords = 0; + while (stitcher.HasNext()) { + const auto record = stitcher.Next(); + (void)record; + ++numObservedRecords; + } + EXPECT_EQ(numExpectedRecords, numObservedRecords); +} + +TEST(ZmwReadStitching, FromDataSet_Filtered) +{ + // dataset contains these resources (subreads/scraps + hqregion/scraps BAMs) + const string primaryFn1 = tests::Data_Dir + "/polymerase/production.subreads.bam"; + const string scrapsFn1 = tests::Data_Dir + "/polymerase/production.scraps.bam"; + const string primaryFn2 = tests::Data_Dir + "/polymerase/internal.subreads.bam"; + const 
string scrapsFn2 = tests::Data_Dir + "/polymerase/internal.scraps.bam"; + const string primaryFn3 = tests::Data_Dir + "/polymerase/production_hq.hqregion.bam"; + const string scrapsFn3 = tests::Data_Dir + "/polymerase/production_hq.scraps.bam"; + const size_t totalRecords = + tests::NumVirtualRecords(primaryFn1, scrapsFn1) + + tests::NumVirtualRecords(primaryFn2, scrapsFn2) + + tests::NumVirtualRecords(primaryFn3, scrapsFn3); + EXPECT_EQ(5, totalRecords); + + // our filter will remove the 2 "production" BAM pairs + // using a ZMW filter that only the "internal" pair should pass + const string datasetFn = tests::Data_Dir + + "/polymerase/filtered_resources.subread.dataset.xml"; + + DataSet ds{ datasetFn }; + ZmwReadStitcher stitcher{ ds }; + size_t numObservedRecords = 0; + while (stitcher.HasNext()) { + const auto record = stitcher.Next(); + (void)record; + ++numObservedRecords; + } + EXPECT_EQ(1, numObservedRecords); +} + +TEST(ZmwReadStitching, FromDataSet_EmptyDataSet) +{ + ZmwReadStitcher stitcher{ DataSet{} }; + EXPECT_FALSE(stitcher.HasNext()); +} + +TEST(ZmwReadStitching, EmptyScrapsFile) +{ + const std::string primaryBamFn = tests::Data_Dir + "/polymerase/scrapless.subreads.bam" ; + const std::string scrapsBamFn = tests::Data_Dir + "/polymerase/scrapless.scraps.bam" ; + + const BamFile primaryBam(primaryBamFn); + const BamFile scrapsBam(scrapsBamFn); + const PbiRawData primaryIdx(primaryBam.PacBioIndexFilename()); + const PbiRawData scrapsIdx(scrapsBam.PacBioIndexFilename()); + EXPECT_EQ(3, primaryIdx.NumReads()); + EXPECT_EQ(0, scrapsIdx.NumReads()); + + int count = 0; + ZmwReadStitcher stitcher(primaryBamFn, scrapsBamFn); + while (stitcher.HasNext()) { + auto record = stitcher.Next(); + (void)record; + ++count; + } + EXPECT_EQ(3, count); +} + +TEST(ZmwReadStitching, VirtualRegions) +{ + // Create virtual polymerase read + ZmwReadStitcher stitcher(tests::Data_Dir + "/polymerase/internal.subreads.bam", + tests::Data_Dir + "/polymerase/internal.scraps.bam"); 
+ auto virtualRecord = stitcher.Next(); // NOTE(review): no HasNext() check before Next() here, unlike the *ToOriginal tests + + auto regionMap = virtualRecord.VirtualRegionsMap(); + auto adapter = virtualRecord.VirtualRegionsTable(VirtualRegionType::ADAPTER); + + // Compare different accessors to same source + EXPECT_EQ(regionMap[VirtualRegionType::ADAPTER], adapter); + + // Compare to truth + EXPECT_EQ(3047,adapter[0].beginPos); + EXPECT_EQ(3095,adapter[0].endPos); + EXPECT_EQ(3650,adapter[1].beginPos); + EXPECT_EQ(3700,adapter[1].endPos); + EXPECT_EQ(4289,adapter[2].beginPos); + EXPECT_EQ(4335,adapter[2].endPos); + EXPECT_EQ(4888,adapter[3].beginPos); + EXPECT_EQ(4939,adapter[3].endPos); + EXPECT_EQ(5498,adapter[4].beginPos); + EXPECT_EQ(5546,adapter[4].endPos); + EXPECT_EQ(6116,adapter[5].beginPos); + EXPECT_EQ(6173,adapter[5].endPos); + EXPECT_EQ(6740,adapter[6].beginPos); + EXPECT_EQ(6790,adapter[6].endPos); + + auto barcode = virtualRecord.VirtualRegionsTable(VirtualRegionType::BARCODE); + EXPECT_EQ(regionMap[VirtualRegionType::BARCODE], barcode); + EXPECT_EQ(3025,barcode[0].beginPos); + EXPECT_EQ(3047,barcode[0].endPos); + EXPECT_EQ(3095,barcode[1].beginPos); + EXPECT_EQ(3116,barcode[1].endPos); + EXPECT_EQ(3628,barcode[2].beginPos); + EXPECT_EQ(3650,barcode[2].endPos); + EXPECT_EQ(3700,barcode[3].beginPos); + EXPECT_EQ(3722,barcode[3].endPos); + EXPECT_EQ(4267,barcode[4].beginPos); + EXPECT_EQ(4289,barcode[4].endPos); + EXPECT_EQ(4335,barcode[5].beginPos); + EXPECT_EQ(4356,barcode[5].endPos); + EXPECT_EQ(4864,barcode[6].beginPos); + EXPECT_EQ(4888,barcode[6].endPos); + EXPECT_EQ(4939,barcode[7].beginPos); + EXPECT_EQ(4960,barcode[7].endPos); + EXPECT_EQ(5477,barcode[8].beginPos); + EXPECT_EQ(5498,barcode[8].endPos); + EXPECT_EQ(5546,barcode[9].beginPos); + EXPECT_EQ(5571,barcode[9].endPos); + EXPECT_EQ(6087,barcode[10].beginPos); + EXPECT_EQ(6116,barcode[10].endPos); + EXPECT_EQ(6173,barcode[11].beginPos); + EXPECT_EQ(6199,barcode[11].endPos); + EXPECT_EQ(6719,barcode[12].beginPos); + EXPECT_EQ(6740,barcode[12].endPos); + 
EXPECT_EQ(6790,barcode[13].beginPos); + EXPECT_EQ(6812,barcode[13].endPos); + + auto lqregion = virtualRecord.VirtualRegionsTable(VirtualRegionType::LQREGION); + EXPECT_EQ(regionMap[VirtualRegionType::LQREGION], lqregion); + EXPECT_EQ(0,lqregion[0].beginPos); + EXPECT_EQ(2659,lqregion[0].endPos); + EXPECT_EQ(7034,lqregion[1].beginPos); + EXPECT_EQ(7035,lqregion[1].endPos); + + + auto hqregion = virtualRecord.VirtualRegionsTable(VirtualRegionType::HQREGION); + EXPECT_EQ(regionMap[VirtualRegionType::HQREGION], hqregion); + EXPECT_EQ(2659,hqregion[0].beginPos); + EXPECT_EQ(7034,hqregion[0].endPos); +} + +TEST(ZmwReadStitching, InternalSubreadsToOriginal) +{ + // Create virtual polymerase read + ZmwReadStitcher stitcher(tests::Data_Dir + "/polymerase/internal.subreads.bam", + tests::Data_Dir + "/polymerase/internal.scraps.bam"); + EXPECT_TRUE(stitcher.HasNext()); + auto virtualRecord = stitcher.Next(); + + // Read original polymerase read + BamFile polyBam(tests::Data_Dir + "/polymerase/internal.polymerase.bam"); + EntireFileQuery polyQuery(polyBam); + auto begin = polyQuery.begin(); + auto end = polyQuery.end(); + EXPECT_TRUE(begin != end); + auto polyRecord = *begin; + + // check + tests::Compare(polyRecord, virtualRecord); +} + +TEST(ZmwReadStitching, InternalHQToOriginal) +{ + // Create virtual polymerase read + ZmwReadStitcher stitcher(tests::Data_Dir + "/polymerase/internal.hqregions.bam", + tests::Data_Dir + "/polymerase/internal.lqregions.bam"); + EXPECT_TRUE(stitcher.HasNext()); + auto virtualRecord = stitcher.Next(); + + // Read original polymerase read + BamFile polyBam(tests::Data_Dir + "/polymerase/internal.polymerase.bam"); + EntireFileQuery polyQuery(polyBam); + auto begin = polyQuery.begin(); + auto end = polyQuery.end(); + EXPECT_TRUE(begin != end); + auto polyRecord = *begin; + + // check + tests::Compare(polyRecord, virtualRecord); +} + +TEST(ZmwReadStitching, ProductionSubreadsToOriginal) +{ + // Create virtual polymerase read + ZmwReadStitcher 
stitcher(tests::Data_Dir + "/polymerase/production.subreads.bam", + tests::Data_Dir + "/polymerase/production.scraps.bam"); + + EXPECT_TRUE(stitcher.HasNext()); + auto virtualRecord = stitcher.Next(); + EXPECT_FALSE(stitcher.HasNext()); + + // Read original polymerase read + BamFile polyBam(tests::Data_Dir + "/polymerase/production.polymerase.bam"); + EntireFileQuery polyQuery(polyBam); + + auto begin = polyQuery.begin(); + auto end = polyQuery.end(); + EXPECT_TRUE(begin != end); + auto polyRecord = *begin; + + EXPECT_EQ(polyRecord.FullName(), virtualRecord.FullName()); + EXPECT_EQ(polyRecord.HoleNumber(), virtualRecord.HoleNumber()); + EXPECT_FLOAT_EQ(polyRecord.ReadAccuracy(), virtualRecord.ReadAccuracy()); + EXPECT_EQ(polyRecord.NumPasses(), virtualRecord.NumPasses()); + EXPECT_EQ(polyRecord.Sequence(), virtualRecord.Sequence()); + EXPECT_EQ(polyRecord.Qualities(), virtualRecord.Qualities()); + EXPECT_EQ(polyRecord.DeletionQV(), virtualRecord.DeletionQV()); + EXPECT_EQ(polyRecord.DeletionTag(), virtualRecord.DeletionTag()); + EXPECT_EQ(polyRecord.InsertionQV(), virtualRecord.InsertionQV()); + EXPECT_EQ(polyRecord.MergeQV(), virtualRecord.MergeQV()); + EXPECT_EQ(polyRecord.SubstitutionQV(), virtualRecord.SubstitutionQV()); + EXPECT_EQ(polyRecord.SubstitutionTag(), virtualRecord.SubstitutionTag()); + EXPECT_EQ(polyRecord.IPD(), virtualRecord.IPDV1Frames()); + EXPECT_EQ(polyRecord.ReadGroup(), virtualRecord.ReadGroup()); +} + +TEST(ZmwReadStitching, ProductionHQToOriginal) +{ + // Create virtual polymerase read + ZmwReadStitcher stitcher(tests::Data_Dir + "/polymerase/production_hq.hqregion.bam", + tests::Data_Dir + "/polymerase/production_hq.scraps.bam"); + EXPECT_TRUE(stitcher.HasNext()); + auto virtualRecord = stitcher.Next(); + EXPECT_FALSE(stitcher.HasNext()); + + // Read original polymerase read + BamFile polyBam(tests::Data_Dir + "/polymerase/production.polymerase.bam"); + EntireFileQuery polyQuery(polyBam); + + auto begin = polyQuery.begin(); + auto end = 
polyQuery.end(); + EXPECT_TRUE(begin != end); + auto polyRecord = *begin; + + EXPECT_FALSE(polyRecord.HasPulseCall()); + EXPECT_FALSE(virtualRecord.HasPulseCall()); + EXPECT_EQ(polyRecord.FullName(), virtualRecord.FullName()); + EXPECT_EQ(polyRecord.HoleNumber(), virtualRecord.HoleNumber()); + EXPECT_EQ(polyRecord.ReadAccuracy(), virtualRecord.ReadAccuracy()); // NOTE(review): ProductionSubreadsToOriginal compares this with EXPECT_FLOAT_EQ + EXPECT_EQ(polyRecord.NumPasses(), virtualRecord.NumPasses()); + EXPECT_EQ(polyRecord.Sequence(), virtualRecord.Sequence()); + EXPECT_EQ(polyRecord.Qualities(), virtualRecord.Qualities()); + EXPECT_EQ(polyRecord.DeletionQV(), virtualRecord.DeletionQV()); + EXPECT_EQ(polyRecord.DeletionTag(), virtualRecord.DeletionTag()); + EXPECT_EQ(polyRecord.InsertionQV(), virtualRecord.InsertionQV()); + EXPECT_EQ(polyRecord.MergeQV(), virtualRecord.MergeQV()); + EXPECT_EQ(polyRecord.SubstitutionQV(), virtualRecord.SubstitutionQV()); + EXPECT_EQ(polyRecord.SubstitutionTag(), virtualRecord.SubstitutionTag()); + EXPECT_EQ(polyRecord.IPD(), virtualRecord.IPDV1Frames()); + EXPECT_EQ(polyRecord.ReadGroup(), virtualRecord.ReadGroup()); + + EXPECT_TRUE(polyRecord.HasDeletionQV()); + EXPECT_TRUE(polyRecord.HasDeletionTag()); + EXPECT_TRUE(polyRecord.HasInsertionQV()); + EXPECT_TRUE(polyRecord.HasMergeQV()); + EXPECT_TRUE(polyRecord.HasSubstitutionQV()); + EXPECT_TRUE(polyRecord.HasSubstitutionTag()); + EXPECT_TRUE(polyRecord.HasIPD()); + EXPECT_FALSE(polyRecord.HasLabelQV()); + EXPECT_FALSE(polyRecord.HasAltLabelQV()); + EXPECT_FALSE(polyRecord.HasAltLabelTag()); + EXPECT_FALSE(polyRecord.HasPkmean()); + EXPECT_FALSE(polyRecord.HasPkmid()); + EXPECT_FALSE(polyRecord.HasPulseCall()); + EXPECT_FALSE(polyRecord.HasPulseWidth()); + EXPECT_FALSE(polyRecord.HasPrePulseFrames()); + EXPECT_FALSE(polyRecord.HasPulseCallWidth()); + + EXPECT_TRUE(virtualRecord.HasDeletionQV()); + EXPECT_TRUE(virtualRecord.HasDeletionTag()); + EXPECT_TRUE(virtualRecord.HasInsertionQV()); + EXPECT_TRUE(virtualRecord.HasMergeQV()); + 
EXPECT_TRUE(virtualRecord.HasSubstitutionQV()); + EXPECT_TRUE(virtualRecord.HasSubstitutionTag()); + EXPECT_TRUE(virtualRecord.HasIPD()); + EXPECT_FALSE(virtualRecord.HasLabelQV()); + EXPECT_FALSE(virtualRecord.HasAltLabelQV()); + EXPECT_FALSE(virtualRecord.HasAltLabelTag()); + EXPECT_FALSE(virtualRecord.HasPkmean()); + EXPECT_FALSE(virtualRecord.HasPkmid()); + EXPECT_FALSE(virtualRecord.HasPulseCall()); + EXPECT_FALSE(virtualRecord.HasPulseWidth()); + EXPECT_FALSE(virtualRecord.HasPrePulseFrames()); + EXPECT_FALSE(virtualRecord.HasPulseCallWidth()); +} + +TEST(ZmwReadStitching, VirtualRecord_VirtualRegionsTable) +{ + ZmwReadStitcher stitcher(tests::Data_Dir + "/polymerase/production.subreads.bam", + tests::Data_Dir + "/polymerase/production.scraps.bam"); + EXPECT_TRUE(stitcher.HasNext()); + const auto virtualRecord = stitcher.Next(); + + const auto subreads = virtualRecord.VirtualRegionsTable(VirtualRegionType::SUBREAD); + const auto adapters = virtualRecord.VirtualRegionsTable(VirtualRegionType::ADAPTER); + const auto hqRegions = virtualRecord.VirtualRegionsTable(VirtualRegionType::HQREGION); + const auto lqRegions = virtualRecord.VirtualRegionsTable(VirtualRegionType::LQREGION); + const auto barcodes = virtualRecord.VirtualRegionsTable(VirtualRegionType::BARCODE); + const auto filtered = virtualRecord.VirtualRegionsTable(VirtualRegionType::FILTERED); + + EXPECT_FALSE(subreads.empty()); + EXPECT_FALSE(adapters.empty()); + EXPECT_FALSE(hqRegions.empty()); + EXPECT_FALSE(lqRegions.empty()); + EXPECT_FALSE(barcodes.empty()); + EXPECT_TRUE(filtered.empty()); // this type not present in this data +} + +TEST(ZmwReadStitching, LegacyTypedefsOk) +{ + { + VirtualPolymeraseReader reader(tests::Data_Dir + "/polymerase/internal.subreads.bam", + tests::Data_Dir + "/polymerase/internal.scraps.bam"); + size_t count = 0; + while (reader.HasNext()) { + const auto record = reader.Next(); + (void)record; + ++count; + } + EXPECT_EQ(3, count); + } + + { + 
VirtualPolymeraseCompositeReader reader{ DataSet{} }; + EXPECT_FALSE(reader.HasNext()); + } +} + diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt new file mode 100644 index 0000000..5c589c1 --- /dev/null +++ b/tools/CMakeLists.txt @@ -0,0 +1,47 @@ + + +if(DEFINED PacBioBAM_build_pbindex) + + # Deprecating the "PacBioBAM_build_pbindex" command line option in favor of more + # general "PacBioBAM_build_tools", as we're starting to add new utilities. + # + # That said, I don't want to break current auto tests/builds, so I'm providing a + # warning message so devs are aware. + # + # construct warning message + set(pbindex_warning "\nDeprecated:\n-DPacBioBAM_build_pbindex\n") + if (PacBioBAM_build_pbindex) + set(pbindex_warning "${pbindex_warning} Building as requested,") + else() + set(pbindex_warning "${pbindex_warning} Skipping as requested,") + endif() + set(pbindex_warning "${pbindex_warning} but support for this option will be removed at some point in the future.\n") + message(AUTHOR_WARNING "${pbindex_warning} ** Use -DPacBioBAM_build_tools instead. **\n") + + # force PacBioBAM_build_tools option + set(PacBioBAM_build_tools + ${PacBioBAM_build_pbindex} CACHE BOOL + "Build PacBioBAM with add'l utilities (e.g. pbindex, pbindexdump)." 
FORCE) +endif() + +if (PacBioBAM_build_tools) + + # tools directory + set(ToolsCommonDir ${PacBioBAM_ToolsDir}/common) + set(PacBioBAM_CramTestsDir ${PacBioBAM_TestsDir}/src/cram) + + # quash warning with OptionParser + include(CheckCXXCompilerFlag) + check_cxx_compiler_flag("-Wno-unused-private-field" HAS_NO_UNUSED_PRIVATE_FIELD) + if(HAS_NO_UNUSED_PRIVATE_FIELD) + set(PacBioBAM_CXX_FLAGS "${PacBioBAM_CXX_FLAGS} -Wno-unused-private-field") + endif() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${PacBioBAM_CXX_FLAGS}") + + # tools + add_subdirectory(bam2sam) + add_subdirectory(pbindex) + add_subdirectory(pbindexdump) + add_subdirectory(pbmerge) + +endif() diff --git a/tools/bam2sam/CMakeLists.txt b/tools/bam2sam/CMakeLists.txt new file mode 100644 index 0000000..5554970 --- /dev/null +++ b/tools/bam2sam/CMakeLists.txt @@ -0,0 +1,39 @@ + +set(Bam2SamSrcDir ${PacBioBAM_ToolsDir}/bam2sam/src) + +# create version header +set(Bam2Sam_VERSION ${PacBioBAM_VERSION}) +configure_file( + ${Bam2SamSrcDir}/Bam2SamVersion.h.in ${GeneratedDir}/Bam2SamVersion.h @ONLY +) + +# list source files +set(BAM2SAM_SOURCES + ${ToolsCommonDir}/OptionParser.cpp + ${Bam2SamSrcDir}/main.cpp + ${Bam2SamSrcDir}/Bam2Sam.cpp +) + +# build bam2sam executable +include(PbbamTool) +create_pbbam_tool( + TARGET bam2sam + SOURCES ${BAM2SAM_SOURCES} +) + +# cram tests +if (PacBioBAM_build_tests) + + configure_file( + ${PacBioBAM_CramTestsDir}/bam2sam.t.in + ${GeneratedDir}/bam2sam.t + ) + + add_test( + NAME bam2sam_CramTests + WORKING_DIRECTORY ${PacBioBAM_TestsDir}/scripts + COMMAND "python" cram.py + ${GeneratedDir}/bam2sam.t + ) + +endif() diff --git a/tools/bam2sam/src/Bam2Sam.cpp b/tools/bam2sam/src/Bam2Sam.cpp new file mode 100644 index 0000000..5fde774 --- /dev/null +++ b/tools/bam2sam/src/Bam2Sam.cpp @@ -0,0 +1,121 @@ +// Copyright (c) 2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. 
+ +// Author: Derek Barnett + +#include "Bam2Sam.h" +#include +#include +#include +#include +using namespace bam2sam; +using namespace std; + +namespace bam2sam { + +struct HtslibFileDeleter // functor: closes an htslib file handle with sam_close() +{ + void operator()(samFile* file) + { + if (file) + sam_close(file); + file = nullptr; // only clears the by-value parameter; the caller's pointer is untouched + } +}; + +struct HtslibHeaderDeleter // functor: frees a header with bam_hdr_destroy() +{ + void operator()(bam_hdr_t* hdr) + { + if (hdr) + bam_hdr_destroy(hdr); + hdr = nullptr; + } +}; + +struct HtslibRecordDeleter // functor: frees a record with bam_destroy1() +{ + void operator()(bam1_t* b) + { + if (b) + bam_destroy1(b); + b = nullptr; + } +}; + +} // namespace bam2sam + +void PbBam2Sam::Run(const Settings &settings) // converts settings.inputFilename_ (BAM) to SAM on stdout; throws std::runtime_error on failure +{ + int htslibResult = 0; + + // open files + + unique_ptr inFileWrapper(sam_open(settings.inputFilename_.c_str(), "rb")); + samFile* in = inFileWrapper.get(); + if (!in || !in->fp.bgzf) + throw std::runtime_error("could not read from stdin"); // NOTE(review): message says stdin, but inputFilename_ may be a user-supplied path + + unique_ptr outFileWrapper(sam_open("-", "w")); + samFile* out = outFileWrapper.get(); + if (!out) + throw std::runtime_error("could not write to stdout"); + + // fetch & write header + + unique_ptr headerWrapper(bam_hdr_read(in->fp.bgzf)); + bam_hdr_t* hdr = headerWrapper.get(); + if (!hdr) + throw std::runtime_error("could not read header"); + + if (!settings.noHeader_) { + htslibResult = sam_hdr_write(out, hdr); + if (htslibResult != 0) + throw std::runtime_error("could not write header"); + if (settings.printHeaderOnly_) + return; + } + + // fetch & write records + + unique_ptr recordWrapper(bam_init1()); + bam1_t* b = recordWrapper.get(); + + while ((htslibResult = sam_read1(in, hdr, b)) >= 0) { // NOTE(review): any negative return ends the loop silently; confirm htslib's EOF vs. error codes + htslibResult = sam_write1(out, hdr, b); + if (htslibResult < 0) + throw std::runtime_error("error writing record to stdout"); + } +} diff --git a/tools/bam2sam/src/Bam2Sam.h b/tools/bam2sam/src/Bam2Sam.h new file mode 100644 index 0000000..4a7ffbb --- /dev/null +++ b/tools/bam2sam/src/Bam2Sam.h @@ -0,0 +1,53 @@ +// Copyright (c) 2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. 
+ +// Author: Derek Barnett + +#ifndef BAM2SAM_H +#define BAM2SAM_H + +#include "Settings.h" + +namespace bam2sam { + +class PbBam2Sam +{ +public: + static void Run(const Settings& settings); // runs the BAM-to-SAM conversion described by settings; implemented in Bam2Sam.cpp, throws std::runtime_error on failure +}; + +} // namespace bam2sam + +#endif // BAM2SAM_H diff --git a/tools/bam2sam/src/Bam2SamVersion.h.in b/tools/bam2sam/src/Bam2SamVersion.h.in new file mode 100644 index 0000000..10319b7 --- /dev/null +++ b/tools/bam2sam/src/Bam2SamVersion.h.in @@ -0,0 +1,49 @@ +// Copyright (c) 2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. + +// Author: Derek Barnett + +#ifndef BAM2SAMVERSION_H +#define BAM2SAMVERSION_H + +#include + +namespace bam2sam { + +const std::string Version = std::string("@Bam2Sam_VERSION@"); // placeholder is replaced by configure_file() in tools/bam2sam/CMakeLists.txt + +} // namespace bam2sam + +#endif // BAM2SAMVERSION_H diff --git a/tools/bam2sam/src/Settings.h b/tools/bam2sam/src/Settings.h new file mode 100644 index 0000000..d570dc9 --- /dev/null +++ b/tools/bam2sam/src/Settings.h @@ -0,0 +1,63 @@ +// Copyright (c) 2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. 
THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. + +// Author: Derek Barnett + +#ifndef SETTINGS_H +#define SETTINGS_H + +#include +#include + +namespace bam2sam { + +class Settings // option values for one bam2sam run, filled in by main.cpp +{ +public: + Settings(void) + : noHeader_(false) + , printHeaderOnly_(false) + { } + +public: + std::string inputFilename_; // main.cpp sets this to "-" (stdin) when no positional argument is given + bool noHeader_; // set from --no-header + bool printHeaderOnly_; // set from --header-only + std::vector errors_; // command-line problems; main() prints them and exits when non-empty +}; + +} // namespace bam2sam + +#endif // SETTINGS_H diff --git a/tools/bam2sam/src/main.cpp b/tools/bam2sam/src/main.cpp new file mode 100644 index 0000000..d27b42f --- /dev/null +++ b/tools/bam2sam/src/main.cpp @@ -0,0 +1,127 @@ +// Copyright (c) 2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. 
+ +// Author: Derek Barnett + +#include "../common/OptionParser.h" +#include "Bam2Sam.h" +#include "Bam2SamVersion.h" +#include +#include +#include +#include + +static +bam2sam::Settings fromCommandLine(optparse::OptionParser& parser, + int argc, char* argv[]) // builds Settings from argv; problems found here are appended to settings.errors_ +{ + bam2sam::Settings settings; + + const optparse::Values options = parser.parse_args(argc, argv); + + // input + const std::vector positionalArgs = parser.args(); + const size_t numPositionalArgs = positionalArgs.size(); + if (numPositionalArgs == 0) + settings.inputFilename_ = "-"; // stdin + else if (numPositionalArgs == 1) + settings.inputFilename_ = parser.args().front(); + else { + assert(numPositionalArgs > 1); + settings.errors_.push_back("bam2sam does not support more than one input file per run"); + } + + // header options + if (options.is_set("no_header")) + settings.noHeader_ = options.get("no_header"); + if (options.is_set("header_only")) + settings.printHeaderOnly_ = options.get("header_only"); + + if (settings.noHeader_ && settings.printHeaderOnly_) + settings.errors_.push_back("conflicting arguments requested: --no-header and --header-only"); + + return settings; +} + +int main(int argc, char* argv[]) +{ + // setup help & options + optparse::OptionParser parser; + parser.description("bam2sam converts a BAM file to SAM. It is essentially a stripped-down " + "'samtools view', mostly useful for testing/debugging without requiring samtools. " + "Input BAM file is read from a file or stdin, and SAM output is written to stdout." + ); + parser.prog("bam2sam"); + parser.usage("bam2sam [options] [input]"); + parser.version(bam2sam::Version); + parser.add_version_option(true); + parser.add_help_option(true); + + auto optionGroup = optparse::OptionGroup(parser, "Options"); + optionGroup.add_option("") + .dest("input") + .metavar("input") + .help("Input BAM file. 
If not provided, stdin will be used as input."); + optionGroup.add_option("--no-header") + .dest("no_header") + .action("store_true") + .help("Omit header from output."); + optionGroup.add_option("--header-only") + .dest("header_only") + .action("store_true") + .help("Print only the header (no records)."); + parser.add_option_group(optionGroup); + + // parse command line for settings + const bam2sam::Settings settings = fromCommandLine(parser, argc, argv); + if (!settings.errors_.empty()) { + std::cerr << std::endl; + for (const auto e : settings.errors_) // NOTE(review): copies each element; const auto& would avoid that + std::cerr << "ERROR: " << e << std::endl; + std::cerr << std::endl; + parser.print_help(); + return EXIT_FAILURE; + } + + // run tool + try { + bam2sam::PbBam2Sam::Run(settings); + return EXIT_SUCCESS; + } + catch (std::exception& e) { + std::cerr << "ERROR: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/tools/common/BamFileMerger.h b/tools/common/BamFileMerger.h new file mode 100644 index 0000000..d2a6bb2 --- /dev/null +++ b/tools/common/BamFileMerger.h @@ -0,0 +1,78 @@ +// Copyright (c) 2015, Pacific Biosciences of California, Inc. +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted (subject to the limitations in the +// disclaimer below) provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of Pacific Biosciences nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. 
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifndef BAMFILEMERGER_H
+#define BAMFILEMERGER_H
+
+#include
+#include
+#include
+#include
+#include
+
+namespace PacBio {
+namespace BAM {
+namespace common {
+
+/// \brief Utility class whose single static function merges the BAM files
+///        of a DataSet into one output BAM.
+///
+/// The implementation is header-only; it is pulled in from BamFileMerger.inl
+/// at the bottom of this file.
+///
+class BamFileMerger
+{
+public:
+    /// \brief Runs merger on a dataset, applying any supplied filters.
+    ///
+    /// When this function exits, a merged BAM (and optional PBI) will have been
+    /// written and closed.
+    ///
+    /// All input BAM headers must report the same sort order. Inputs sorted by
+    /// "coordinate" are merged by position; any other sort order is merged by
+    /// query name.
+    ///
+    /// \param[in] dataset          provides input filenames & filters
+    /// \param[in] outputFilename   resulting BAM output
+    /// \param[in] mergeProgram     info about the calling program. Adds a @PG entry to merged header
+    ///                             (only when the ProgramInfo is valid).
+    /// \param[in] createPbi        if true, creates a PBI alongside output BAM
+    ///                             (no PBI is written when \p outputFilename is "-")
+    ///
+    /// \throws std::runtime_error if the dataset yields no input files, if
+    ///         \p outputFilename is empty, if the inputs' sort orders differ,
+    ///         or if any errors are encountered while reading or writing
+    ///
+    static void Merge(const PacBio::BAM::DataSet& dataset,
+                      const std::string& outputFilename,
+                      const PacBio::BAM::ProgramInfo& mergeProgram = PacBio::BAM::ProgramInfo(),
+                      bool createPbi = true);
+};
+
+} // namespace common
+} // namespace BAM
+} // namespace PacBio
+
+#include "BamFileMerger.inl"
+
+#endif // BAMFILEMERGER_H
diff --git a/tools/common/BamFileMerger.inl b/tools/common/BamFileMerger.inl
new file mode 100644
index 0000000..18dfbca
--- /dev/null
+++ b/tools/common/BamFileMerger.inl
@@ -0,0 +1,262 @@
+// Copyright (c) 2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED.
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#include "BamFileMerger.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+namespace PacBio {
+namespace BAM {
+namespace common {
+
+// ICollator
+
+/// \brief Base class for the record collators used by BamFileMerger.
+///
+/// Holds one (reader, current record) item per input file in mergeItems_ and
+/// hands records out from the front of that container. Derived classes supply
+/// the ordering by implementing UpdateSort().
+///
+class ICollator
+{
+public:
+    // Collators are owned and destroyed through a pointer to this base class
+    // (see BamFileMerger::Merge), so the destructor must be virtual.
+    virtual ~ICollator(void) { }
+
+    /// \brief Fetches the next record in merge order.
+    ///
+    /// \param[out] record  receives the record taken from the front item
+    ///
+    /// \returns true if a record was stored in \p record, false if all
+    ///          readers are exhausted
+    ///
+    bool GetNext(BamRecord& record)
+    {
+        // nothing left to read
+        if (mergeItems_.empty())
+            return false;
+
+        // move the first item's reader & record out of the queue, then drop
+        // the (now moved-from) entry
+        auto firstIter = mergeItems_.begin();
+        auto firstItem = PacBio::BAM::internal::CompositeMergeItem{ std::move(firstIter->reader),
+                                                                   std::move(firstIter->record)
+                                                                 };
+        mergeItems_.pop_front();
+
+        // store its record in our output record
+        std::swap(record, firstItem.record);
+
+        // try fetch 'next' from first item's reader
+        // if successful, re-insert it into container & re-sort on our new values
+        // otherwise, this item will go out of scope & reader destroyed
+        if (firstItem.reader->GetNext(firstItem.record)) {
+            mergeItems_.push_front(std::move(firstItem));
+            UpdateSort();
+        }
+
+        // return success
+        return true;
+    }
+
+protected:
+    /// One entry per reader that still has a record available.
+    std::deque<internal::CompositeMergeItem> mergeItems_;
+
+protected:
+    /// \brief Takes ownership of the readers and primes each with its first
+    ///        record. Readers that yield no record are discarded.
+    ///
+    /// The items are NOT sorted here; derived constructors call UpdateSort().
+    ///
+    explicit ICollator(std::vector<std::unique_ptr<BamReader> >&& readers)
+    {
+        for (auto&& reader : readers) {
+            auto item = internal::CompositeMergeItem{std::move(reader)};
+            if (item.reader->GetNext(item.record))
+                mergeItems_.push_back(std::move(item));
+        }
+    }
+
+    /// \brief Re-establishes the merge order of mergeItems_.
+    virtual void UpdateSort(void) =0;
+};
+
+// QNameCollator
+
+struct
QNameSorter : std::binary_function<internal::CompositeMergeItem, internal::CompositeMergeItem, bool>
+{
+    /// \brief Orders merge items by movie name, then hole number, then
+    ///        (non-CCS before CCS) query start.
+    ///
+    /// \returns true if \p lhs should be emitted before \p rhs
+    ///
+    bool operator()(const internal::CompositeMergeItem& lhs,
+                    const internal::CompositeMergeItem& rhs) const
+    {
+        const BamRecord& l = lhs.record;
+        const BamRecord& r = rhs.record;
+
+        // movie name
+        const int cmp = l.MovieName().compare(r.MovieName());
+        if (cmp != 0)
+            return cmp < 0;
+
+        // hole number
+        const auto lhsZmw = l.HoleNumber();
+        const auto rhsZmw = r.HoleNumber();
+        if (lhsZmw != rhsZmw)
+            return lhsZmw < rhsZmw;
+
+        // shuffle CCS reads after all others
+        const auto lhsReadType = l.Type();
+        const auto rhsReadType = r.Type();
+        if (lhsReadType == RecordType::CCS)
+            return false;
+        if (rhsReadType == RecordType::CCS)
+            return true;
+
+        // sort on qStart
+        // NOTE(review): an earlier comment here also promised a tie-break on
+        // qEnd, but only qStart is compared - confirm which is intended.
+        const auto lhsQStart = l.QueryStart();
+        const auto rhsQStart = r.QueryStart();
+        return lhsQStart < rhsQStart;
+    }
+};
+
+/// \brief Collator that keeps its merge items in QNameSorter order.
+class QNameCollator : public ICollator
+{
+public:
+    explicit QNameCollator(std::vector<std::unique_ptr<BamReader>>&& readers)
+        : ICollator(std::move(readers))
+    { UpdateSort(); }
+
+    void UpdateSort(void) override
+    { std::sort(mergeItems_.begin(), mergeItems_.end(), QNameSorter{ }); }
+};
+
+// AlignedCollator
+
+/// \brief Collator that keeps its merge items in PositionSorter order.
+class AlignedCollator : public ICollator
+{
+public:
+    explicit AlignedCollator(std::vector<std::unique_ptr<BamReader>>&& readers)
+        : ICollator(std::move(readers))
+    { UpdateSort(); }
+
+    void UpdateSort(void) override
+    { std::sort(mergeItems_.begin(), mergeItems_.end(), PacBio::BAM::PositionSorter{ }); }
+};
+
+// BamFileMerger
+
+/// \brief Merges the dataset's BAM files into \p outputFilename.
+///
+/// Steps: collect input filenames, open one reader per file (PBI-indexed when
+/// the dataset carries a filter), merge the headers, then stream records
+/// through a collator chosen from the shared sort order.
+///
+/// \throws std::runtime_error if there are no input files, if
+///         \p outputFilename is empty, or if the inputs' sort orders differ
+///
+inline
+void BamFileMerger::Merge(const DataSet& dataset,
+                          const std::string& outputFilename,
+                          const ProgramInfo& mergeProgram,
+                          bool createPbi)
+{
+    const PbiFilter filter = PbiFilter::FromDataSet(dataset);
+
+    std::vector<std::string> inputFilenames_;
+    const auto& bamFiles = dataset.BamFiles();
+    inputFilenames_.reserve(bamFiles.size());
+    for (const auto& file : bamFiles)
+        inputFilenames_.push_back(file.Filename());
+
+    if (inputFilenames_.empty())
+        throw std::runtime_error("no input filenames provided to BamFileMerger");
+
+    if (outputFilename.empty())
+        throw std::runtime_error("no output filename provided to BamFileMerger");
+
+
+    // attempt open input files
+    std::vector<std::unique_ptr<BamReader> > readers;
+    readers.reserve(inputFilenames_.size());
+    for (const auto& fn : inputFilenames_) {
+        if (filter.IsEmpty())
+            readers.emplace_back(new BamReader(fn));
+        else
+            readers.emplace_back(new PbiIndexedBamReader(filter, fn));
+    }
+
+    // read headers
+    std::vector<BamHeader> headers;
+    headers.reserve(readers.size());
+    for (auto&& reader : readers)
+        headers.push_back(reader->Header());
+
+    assert(!readers.empty());
+    assert(!headers.empty());
+
+    // merge headers
+    BamHeader mergedHeader = headers.front();
+    const std::string& usingSortOrder = mergedHeader.SortOrder();
+    const bool isCoordinateSorted = (usingSortOrder == "coordinate");
+    for (size_t i = 1; i < headers.size(); ++i) {
+        const BamHeader& header = headers.at(i);
+        if (header.SortOrder() != usingSortOrder)
+            throw std::runtime_error("BAM file sort orders do not match, aborting merge");
+        mergedHeader += header;
+    }
+    if (mergeProgram.IsValid())
+        mergedHeader.AddProgram(mergeProgram);
+
+    // setup collator, based on sort order
+    std::unique_ptr<ICollator> collator;
+    if (isCoordinateSorted)
+        collator.reset(new AlignedCollator(std::move(readers)));
+    else
+        collator.reset(new QNameCollator(std::move(readers)));
+    // NOTE: readers *moved*, so no longer accessible here
+
+    // do merge, creating PBI on-the-fly
+    if (createPbi && (outputFilename != "-")) {
+
+        // TODO: this implementation recalculates all PBI values, when we really
+        //       only need to collate entries and update offsets
+
+        BamWriter writer(outputFilename, mergedHeader);
+        PbiBuilder builder{ (outputFilename + ".pbi"),
+                            mergedHeader.NumSequences(),
+                            isCoordinateSorted
+                          };
+        BamRecord record;
+        int64_t vOffset = 0;
+        while (collator->GetNext(record)) {
+            writer.Write(record, &vOffset);
+            builder.AddRecord(record, vOffset);
+        }
+    }
+
+    // otherwise just merge BAM
+    else {
+        BamWriter writer(outputFilename, mergedHeader);
+        BamRecord record;
+        while (collator->GetNext(record))
+            writer.Write(record);
+    }
+}
+
+} // namespace common
+} // namespace BAM
+} // namespace PacBio
diff --git a/tools/common/OptionParser.cpp b/tools/common/OptionParser.cpp
new file mode 100644
index 0000000..fc73176
--- /dev/null
+++ b/tools/common/OptionParser.cpp
@@ -0,0 +1,562 @@
+/**
+ * Copyright (C) 2010 Johannes Weißl
+ * License: your favourite BSD-style license
+ *
+ * See OptionParser.h for help.
+ */
+
+#include "OptionParser.h"
+
+#include
+#include
+#include
+#include
+
+#if defined(ENABLE_NLS) && ENABLE_NLS
+# include
+# define _(s) gettext(s)
+#else
+# define _(s) ((const char *) (s))
+#endif
+
+using namespace std;
+
+namespace optparse {
+
+////////// auxiliary (string) functions { //////////
+// Functor that surrounds a string with a left and a right wrapper.
+class str_wrap {
+public:
+  str_wrap(const string& l, const string& r) : lwrap(l), rwrap(r) {}
+  str_wrap(const string& w) : lwrap(w), rwrap(w) {}
+  string operator() (const string& s) { return lwrap + s + rwrap; }
+  const string lwrap, rwrap;
+};
+// Joins op(*it) for every element of [begin, end), separated by sep.
+template<typename InputIterator, typename UnaryOperator>
+static string str_join_trans(const string& sep, InputIterator begin, InputIterator end, UnaryOperator op) {
+  string buf;
+  for (InputIterator it = begin; it != end; ++it) {
+    if (it != begin)
+      buf += sep;
+    buf += op(*it);
+  }
+  return buf;
+}
+// Joins the elements of [begin, end) unchanged, separated by sep.
+template<class InputIterator>
+static string str_join(const string& sep, InputIterator begin, InputIterator end) {
+  return str_join_trans(sep, begin, end, str_wrap(""));
+}
+// Replaces every occurrence of patt in s with repl, in place.
+static string& str_replace(string& s, const string& patt, const string& repl) {
+  size_t pos = 0, n = patt.length();
+  while (true) {
+    pos = s.find(patt, pos);
+    if (pos == string::npos)
+      break;
+    s.replace(pos, n, repl);
+    pos += repl.size();
+  }
+  return s;
+}
+// Same as above, but works on (and returns) a copy of s.
+static string str_replace(const string& s, const string& patt, const string& repl) {
+  string tmp = s;
+  str_replace(tmp, patt, repl);
+  return tmp;
+}
+static string str_format(const string& s, size_t pre, size_t len, bool indent_first = true) {
+  stringstream ss;
+  string p;
+  if (indent_first)
+    p =
string(pre, ' ');
+
+  size_t pos = 0, linestart = 0;
+  size_t line = 0;
+  while (true) {
+    bool wrap = false;
+
+    size_t new_pos = s.find_first_of(" \n\t", pos);
+    if (new_pos == string::npos)
+      break;
+    if (s[new_pos] == '\n') {
+      pos = new_pos + 1;
+      wrap = true;
+    }
+    if (line == 1)
+      p = string(pre, ' ');
+    if (wrap || new_pos + pre > linestart + len) {
+      ss << p << s.substr(linestart, pos - linestart - 1) << endl;
+      linestart = pos;
+      line++;
+    }
+    pos = new_pos + 1;
+  }
+  ss << p << s.substr(linestart) << endl;
+  return ss.str();
+}
+// Parses s as an integer (an empty string counts as "0") and returns the
+// value plus one, formatted as a string.
+static string str_inc(const string& s) {
+  stringstream ss;
+  string v = (s != "") ? s : "0";
+  long i = 0;  // defined value even if the extraction below fails
+  istringstream(v) >> i;
+  ss << i+1;
+  return ss.str();
+}
+// Returns the terminal width: 80 by default, or the value of the COLUMNS
+// environment variable (non-Windows only) when it holds a positive number.
+static unsigned int cols() {
+  unsigned int n = 80;
+#ifndef _WIN32
+  const char *s = getenv("COLUMNS");
+  if (s) {
+    // Parse into a scratch value: since C++11 a failed extraction stores 0,
+    // which would otherwise silently replace the default width.
+    unsigned int parsed = 0;
+    if ((istringstream(s) >> parsed) && parsed > 0)
+      n = parsed;
+  }
+#endif
+  return n;
+}
+// Returns the last path component of s, ignoring trailing '/' characters.
+// A string made up only of '/' characters yields "/".
+static string basename(const string& s) {
+  string b = s;
+  size_t i = b.find_last_not_of('/');
+  if (i == string::npos) {
+    if (b[0] == '/')
+      b.erase(1);
+    return b;
+  }
+  b.erase(i+1, b.length()-i-1);
+  i = b.find_last_of('/');
+  if (i != string::npos)
+    b.erase(0, i+1);
+  return b;
+}
+////////// } auxiliary (string) functions //////////
+
+
+////////// class OptionParser { //////////
+OptionParser::OptionParser() :
+  _usage(_("%prog [options]")),
+  _add_help_option(true),
+  _add_version_option(true),
+  _interspersed_args(true) {}
+
+// Convenience overloads: forward one, two or three option strings to the
+// vector-based add_option() below.
+Option& OptionParser::add_option(const string& opt) {
+  const string tmp[1] = { opt };
+  return add_option(vector<string>(&tmp[0], &tmp[1]));
+}
+Option& OptionParser::add_option(const string& opt1, const string& opt2) {
+  const string tmp[2] = { opt1, opt2 };
+  return add_option(vector<string>(&tmp[0], &tmp[2]));
+}
+Option& OptionParser::add_option(const string& opt1, const string& opt2, const string& opt3) {
+  const string tmp[3] = { opt1, opt2, opt3 };
+  return add_option(vector<string>(&tmp[0], &tmp[3]));
+}
+// Appends a new Option and registers each given spelling for it.
+//  - "--name" is stored as a long option; the first one also provides the
+//    default dest (with '-' replaced by '_').
+//  - an empty string is skipped.
+//  - anything else is treated as a short option, keyed by its second
+//    character; the first one is the dest fallback when no long option
+//    set a dest.
+Option& OptionParser::add_option(const vector<string>& v) {
+  _opts.resize(_opts.size()+1);
+  Option& option = _opts.back();
+  string dest_fallback;
+  for (vector<string>::const_iterator it = v.begin(); it != v.end(); ++it) {
+    if (it->substr(0,2) == "--") {
+      const string s = it->substr(2);
+      if (option.dest() == "")
+        option.dest(str_replace(s, "-", "_"));
+      option._long_opts.insert(s);
+      _optmap_l[s] = &option;
+    } else if ( it->empty() ) {
+      continue;
+    } else {
+      const string s = it->substr(1,1);
+      if (dest_fallback == "")
+        dest_fallback = s;
+      option._short_opts.insert(s);
+      _optmap_s[s] = &option;
+    }
+  }
+  if (option.dest() == "")
+    option.dest(dest_fallback);
+  return option;
+}
+
+OptionParser& OptionParser::add_option_group(const OptionGroup& group) {
+  for (list